{ "best_metric": 1.89783895, "best_model_checkpoint": "D:\\_____NEW_NN\\LLM\\MiniCPM-V\\finetune\\output\\phi3-vision-128k-instruct\\v0-20240531-071942\\checkpoint-1476", "epoch": 4.0, "eval_steps": 50, "global_step": 1476, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc": 0.47843874, "epoch": 0.0027100271002710027, "grad_norm": 0.921875, "learning_rate": 2.027027027027027e-06, "loss": 2.52092218, "memory(GiB)": 12.33, "step": 1, "train_speed(iter/s)": 0.066334 }, { "acc": 0.51126236, "epoch": 0.013550135501355014, "grad_norm": 0.796875, "learning_rate": 1.0135135135135135e-05, "loss": 2.42214346, "memory(GiB)": 13.38, "step": 5, "train_speed(iter/s)": 0.127981 }, { "acc": 0.48636103, "epoch": 0.02710027100271003, "grad_norm": 1.09375, "learning_rate": 2.027027027027027e-05, "loss": 2.53938828, "memory(GiB)": 13.38, "step": 10, "train_speed(iter/s)": 0.144622 }, { "acc": 0.47292571, "epoch": 0.04065040650406504, "grad_norm": 1.1328125, "learning_rate": 3.0405405405405404e-05, "loss": 2.50403137, "memory(GiB)": 13.38, "step": 15, "train_speed(iter/s)": 0.150329 }, { "acc": 0.52069569, "epoch": 0.05420054200542006, "grad_norm": 0.83984375, "learning_rate": 4.054054054054054e-05, "loss": 2.28312988, "memory(GiB)": 13.38, "step": 20, "train_speed(iter/s)": 0.152832 }, { "acc": 0.53072515, "epoch": 0.06775067750677506, "grad_norm": 0.59765625, "learning_rate": 5.067567567567567e-05, "loss": 2.20206394, "memory(GiB)": 13.38, "step": 25, "train_speed(iter/s)": 0.154186 }, { "acc": 0.52423077, "epoch": 0.08130081300813008, "grad_norm": 0.89453125, "learning_rate": 6.081081081081081e-05, "loss": 2.31712227, "memory(GiB)": 14.44, "step": 30, "train_speed(iter/s)": 0.155403 }, { "acc": 0.54341574, "epoch": 0.0948509485094851, "grad_norm": 0.7109375, "learning_rate": 7.094594594594594e-05, "loss": 2.03825722, "memory(GiB)": 14.44, "step": 35, "train_speed(iter/s)": 0.156383 }, { "acc": 0.56128917, "epoch": 0.10840108401084012, "grad_norm": 0.734375, "learning_rate": 8.108108108108108e-05, "loss": 1.98176575, "memory(GiB)": 14.44, "step": 40, "train_speed(iter/s)": 0.156973 }, { "acc": 0.53511982, "epoch": 0.12195121951219512, "grad_norm": 1.1484375, "learning_rate": 9.121621621621621e-05, "loss": 2.07999401, "memory(GiB)": 14.44, "step": 45, "train_speed(iter/s)": 0.157395 }, { "acc": 0.56112757, "epoch": 0.13550135501355012, "grad_norm": 0.83984375, "learning_rate": 0.00010135135135135135, "loss": 1.92188244, "memory(GiB)": 14.44, "step": 50, "train_speed(iter/s)": 0.157924 }, { "epoch": 0.13550135501355012, "eval_acc": 0.5618983279851376, "eval_loss": 1.901513695716858, "eval_runtime": 44.6741, "eval_samples_per_second": 0.851, "eval_steps_per_second": 0.851, "step": 50 }, { "acc": 0.57467785, "epoch": 0.14905149051490515, "grad_norm": 0.9921875, "learning_rate": 0.00011148648648648647, "loss": 1.92710114, "memory(GiB)": 15.21, "step": 55, "train_speed(iter/s)": 0.140365 }, { "acc": 0.55041471, "epoch": 0.16260162601626016, "grad_norm": 1.1484375, "learning_rate": 0.00012162162162162162, "loss": 2.09731407, "memory(GiB)": 15.21, "step": 60, "train_speed(iter/s)": 0.141894 }, { "acc": 0.53853202, "epoch": 0.17615176151761516, "grad_norm": 1.03125, "learning_rate": 0.00013175675675675675, "loss": 2.01167774, "memory(GiB)": 15.21, "step": 65, "train_speed(iter/s)": 0.143162 }, { "acc": 0.57625084, "epoch": 0.1897018970189702, "grad_norm": 0.81640625, "learning_rate": 0.00014189189189189188, "loss": 1.77697067, "memory(GiB)": 15.21, "step": 70, "train_speed(iter/s)": 0.144318 }, { "acc": 0.57468171, "epoch": 0.2032520325203252, "grad_norm": 0.96484375, "learning_rate": 0.00014989300998573466, "loss": 1.92596684, "memory(GiB)": 15.21, "step": 75, "train_speed(iter/s)": 0.145344 }, { "acc": 0.58551345, "epoch": 0.21680216802168023, "grad_norm": 1.0546875, "learning_rate": 0.00014935805991440798, "loss": 1.85387783, "memory(GiB)": 15.21, "step": 80, "train_speed(iter/s)": 0.146049 }, { "acc": 0.54782548, "epoch": 0.23035230352303523, "grad_norm": 1.0078125, "learning_rate": 0.0001488231098430813, "loss": 2.04980812, "memory(GiB)": 15.21, "step": 85, "train_speed(iter/s)": 0.146726 }, { "acc": 0.56594706, "epoch": 0.24390243902439024, "grad_norm": 1.0, "learning_rate": 0.0001482881597717546, "loss": 1.93099308, "memory(GiB)": 15.21, "step": 90, "train_speed(iter/s)": 0.147437 }, { "acc": 0.57289362, "epoch": 0.25745257452574527, "grad_norm": 0.8046875, "learning_rate": 0.00014775320970042795, "loss": 1.84038963, "memory(GiB)": 15.58, "step": 95, "train_speed(iter/s)": 0.147976 }, { "acc": 0.5938961, "epoch": 0.27100271002710025, "grad_norm": 1.015625, "learning_rate": 0.00014721825962910127, "loss": 1.71151619, "memory(GiB)": 15.58, "step": 100, "train_speed(iter/s)": 0.148573 }, { "epoch": 0.27100271002710025, "eval_acc": 0.587907448066205, "eval_loss": 1.7437644004821777, "eval_runtime": 44.775, "eval_samples_per_second": 0.849, "eval_steps_per_second": 0.849, "step": 100 }, { "acc": 0.5828393, "epoch": 0.2845528455284553, "grad_norm": 1.078125, "learning_rate": 0.00014668330955777461, "loss": 1.90272522, "memory(GiB)": 15.58, "step": 105, "train_speed(iter/s)": 0.140241 }, { "acc": 0.57981925, "epoch": 0.2981029810298103, "grad_norm": 0.859375, "learning_rate": 0.00014614835948644793, "loss": 1.88997154, "memory(GiB)": 15.58, "step": 110, "train_speed(iter/s)": 0.141057 }, { "acc": 0.56191244, "epoch": 0.3116531165311653, "grad_norm": 0.94140625, "learning_rate": 0.00014561340941512125, "loss": 1.92102909, "memory(GiB)": 15.58, "step": 115, "train_speed(iter/s)": 0.141824 }, { "acc": 0.57420607, "epoch": 0.3252032520325203, "grad_norm": 1.171875, "learning_rate": 0.00014507845934379456, "loss": 1.80924416, "memory(GiB)": 15.58, "step": 120, "train_speed(iter/s)": 0.14256 }, { "acc": 0.57703466, "epoch": 0.33875338753387535, "grad_norm": 0.84765625, "learning_rate": 0.00014454350927246788, "loss": 1.85804043, "memory(GiB)": 15.58, "step": 125, "train_speed(iter/s)": 0.143173 }, { "acc": 0.58581357, "epoch": 0.3523035230352303, "grad_norm": 0.96484375, "learning_rate": 0.0001440085592011412, "loss": 1.75103779, "memory(GiB)": 15.58, "step": 130, "train_speed(iter/s)": 0.143759 }, { "acc": 0.59179254, "epoch": 0.36585365853658536, "grad_norm": 0.93359375, "learning_rate": 0.00014347360912981454, "loss": 1.67790794, "memory(GiB)": 15.58, "step": 135, "train_speed(iter/s)": 0.14428 }, { "acc": 0.59449296, "epoch": 0.3794037940379404, "grad_norm": 0.74609375, "learning_rate": 0.00014293865905848786, "loss": 1.72561359, "memory(GiB)": 15.58, "step": 140, "train_speed(iter/s)": 0.144799 }, { "acc": 0.57711525, "epoch": 0.39295392953929537, "grad_norm": 1.109375, "learning_rate": 0.0001424037089871612, "loss": 1.79506092, "memory(GiB)": 15.58, "step": 145, "train_speed(iter/s)": 0.14532 }, { "acc": 0.57541366, "epoch": 0.4065040650406504, "grad_norm": 5.46875, "learning_rate": 0.00014186875891583452, "loss": 1.80377541, "memory(GiB)": 15.58, "step": 150, "train_speed(iter/s)": 0.145757 }, { "epoch": 0.4065040650406504, "eval_acc": 0.5983786522546867, "eval_loss": 1.676965594291687, "eval_runtime": 44.3227, "eval_samples_per_second": 0.857, "eval_steps_per_second": 0.857, "step": 150 }, { "acc": 0.58702893, "epoch": 0.42005420054200543, "grad_norm": 1.0625, "learning_rate": 0.00014133380884450783, "loss": 1.86865826, "memory(GiB)": 15.58, "step": 155, "train_speed(iter/s)": 0.140358 }, { "acc": 0.58541198, "epoch": 0.43360433604336046, "grad_norm": 1.015625, "learning_rate": 0.00014079885877318115, "loss": 1.82644196, "memory(GiB)": 15.58, "step": 160, "train_speed(iter/s)": 0.140936 }, { "acc": 0.58856125, "epoch": 0.44715447154471544, "grad_norm": 0.9921875, "learning_rate": 0.00014026390870185447, "loss": 1.68714104, "memory(GiB)": 15.58, "step": 165, "train_speed(iter/s)": 0.141495 }, { "acc": 0.59458299, "epoch": 0.46070460704607047, "grad_norm": 0.91015625, "learning_rate": 0.0001397289586305278, "loss": 1.72063637, "memory(GiB)": 15.58, "step": 170, "train_speed(iter/s)": 0.142005 }, { "acc": 0.57354069, "epoch": 0.4742547425474255, "grad_norm": 0.7734375, "learning_rate": 0.00013919400855920113, "loss": 1.83291931, "memory(GiB)": 15.58, "step": 175, "train_speed(iter/s)": 0.142509 }, { "acc": 0.58851786, "epoch": 0.4878048780487805, "grad_norm": 1.234375, "learning_rate": 0.00013865905848787447, "loss": 1.70540199, "memory(GiB)": 15.58, "step": 180, "train_speed(iter/s)": 0.143002 }, { "acc": 0.59666262, "epoch": 0.5013550135501355, "grad_norm": 0.87890625, "learning_rate": 0.0001381241084165478, "loss": 1.70772285, "memory(GiB)": 15.58, "step": 185, "train_speed(iter/s)": 0.143463 }, { "acc": 0.58001013, "epoch": 0.5149051490514905, "grad_norm": 0.890625, "learning_rate": 0.0001375891583452211, "loss": 1.73730106, "memory(GiB)": 15.58, "step": 190, "train_speed(iter/s)": 0.143883 }, { "acc": 0.5934463, "epoch": 0.5284552845528455, "grad_norm": 1.421875, "learning_rate": 0.00013705420827389442, "loss": 1.68941402, "memory(GiB)": 15.58, "step": 195, "train_speed(iter/s)": 0.144297 }, { "acc": 0.58476119, "epoch": 0.5420054200542005, "grad_norm": 0.91015625, "learning_rate": 0.00013651925820256774, "loss": 1.74896088, "memory(GiB)": 15.58, "step": 200, "train_speed(iter/s)": 0.144655 }, { "epoch": 0.5420054200542005, "eval_acc": 0.6105387603445364, "eval_loss": 1.6297248601913452, "eval_runtime": 44.3331, "eval_samples_per_second": 0.857, "eval_steps_per_second": 0.857, "step": 200 }, { "acc": 0.58323898, "epoch": 0.5555555555555556, "grad_norm": 0.79296875, "learning_rate": 0.00013598430813124105, "loss": 1.74196358, "memory(GiB)": 15.58, "step": 205, "train_speed(iter/s)": 0.140612 }, { "acc": 0.59854908, "epoch": 0.5691056910569106, "grad_norm": 1.109375, "learning_rate": 0.0001354493580599144, "loss": 1.6279623, "memory(GiB)": 15.58, "step": 210, "train_speed(iter/s)": 0.141053 }, { "acc": 0.58306313, "epoch": 0.5826558265582655, "grad_norm": 0.98828125, "learning_rate": 0.00013491440798858771, "loss": 1.85492191, "memory(GiB)": 15.58, "step": 215, "train_speed(iter/s)": 0.141461 }, { "acc": 0.58454275, "epoch": 0.5962059620596206, "grad_norm": 0.890625, "learning_rate": 0.00013437945791726106, "loss": 1.75104046, "memory(GiB)": 15.58, "step": 220, "train_speed(iter/s)": 0.141869 }, { "acc": 0.60898943, "epoch": 0.6097560975609756, "grad_norm": 0.828125, "learning_rate": 0.00013384450784593437, "loss": 1.57786808, "memory(GiB)": 15.58, "step": 225, "train_speed(iter/s)": 0.142237 }, { "acc": 0.58954048, "epoch": 0.6233062330623306, "grad_norm": 1.1640625, "learning_rate": 0.0001333095577746077, "loss": 1.72581081, "memory(GiB)": 15.58, "step": 230, "train_speed(iter/s)": 0.142622 }, { "acc": 0.59608021, "epoch": 0.6368563685636857, "grad_norm": 1.2265625, "learning_rate": 0.000132774607703281, "loss": 1.70160866, "memory(GiB)": 15.58, "step": 235, "train_speed(iter/s)": 0.142983 }, { "acc": 0.57084417, "epoch": 0.6504065040650406, "grad_norm": 1.1640625, "learning_rate": 0.00013223965763195432, "loss": 1.81941319, "memory(GiB)": 15.58, "step": 240, "train_speed(iter/s)": 0.143329 }, { "acc": 0.61476159, "epoch": 0.6639566395663956, "grad_norm": 0.9453125, "learning_rate": 0.00013170470756062767, "loss": 1.68505306, "memory(GiB)": 15.58, "step": 245, "train_speed(iter/s)": 0.143652 }, { "acc": 0.5995626, "epoch": 0.6775067750677507, "grad_norm": 0.9140625, "learning_rate": 0.00013116975748930098, "loss": 1.77789631, "memory(GiB)": 15.58, "step": 250, "train_speed(iter/s)": 0.144002 }, { "epoch": 0.6775067750677507, "eval_acc": 0.61087654112481, "eval_loss": 1.6108067035675049, "eval_runtime": 44.1094, "eval_samples_per_second": 0.861, "eval_steps_per_second": 0.861, "step": 250 }, { "acc": 0.62165804, "epoch": 0.6910569105691057, "grad_norm": 1.6171875, "learning_rate": 0.0001306348074179743, "loss": 1.57510653, "memory(GiB)": 15.58, "step": 255, "train_speed(iter/s)": 0.140805 }, { "acc": 0.59346585, "epoch": 0.7046070460704607, "grad_norm": 1.046875, "learning_rate": 0.00013009985734664764, "loss": 1.76646061, "memory(GiB)": 15.58, "step": 260, "train_speed(iter/s)": 0.141165 }, { "acc": 0.60632119, "epoch": 0.7181571815718157, "grad_norm": 1.078125, "learning_rate": 0.00012956490727532096, "loss": 1.59940271, "memory(GiB)": 15.95, "step": 265, "train_speed(iter/s)": 0.141502 }, { "acc": 0.59642992, "epoch": 0.7317073170731707, "grad_norm": 1.203125, "learning_rate": 0.00012902995720399428, "loss": 1.6549921, "memory(GiB)": 15.95, "step": 270, "train_speed(iter/s)": 0.141855 }, { "acc": 0.60280037, "epoch": 0.7452574525745257, "grad_norm": 1.0625, "learning_rate": 0.0001284950071326676, "loss": 1.63179569, "memory(GiB)": 15.95, "step": 275, "train_speed(iter/s)": 0.142164 }, { "acc": 0.594034, "epoch": 0.7588075880758808, "grad_norm": 1.1796875, "learning_rate": 0.00012796005706134094, "loss": 1.61152973, "memory(GiB)": 15.95, "step": 280, "train_speed(iter/s)": 0.142474 }, { "acc": 0.59763761, "epoch": 0.7723577235772358, "grad_norm": 1.0859375, "learning_rate": 0.00012742510699001425, "loss": 1.66443863, "memory(GiB)": 15.95, "step": 285, "train_speed(iter/s)": 0.142791 }, { "acc": 0.58316121, "epoch": 0.7859078590785907, "grad_norm": 0.73828125, "learning_rate": 0.00012689015691868757, "loss": 1.77028389, "memory(GiB)": 15.95, "step": 290, "train_speed(iter/s)": 0.143067 }, { "acc": 0.60665183, "epoch": 0.7994579945799458, "grad_norm": 1.25, "learning_rate": 0.00012635520684736091, "loss": 1.64849663, "memory(GiB)": 15.95, "step": 295, "train_speed(iter/s)": 0.143366 }, { "acc": 0.55526123, "epoch": 0.8130081300813008, "grad_norm": 1.203125, "learning_rate": 0.00012582025677603423, "loss": 1.86367321, "memory(GiB)": 15.95, "step": 300, "train_speed(iter/s)": 0.143649 }, { "epoch": 0.8130081300813008, "eval_acc": 0.6129032258064516, "eval_loss": 1.5865528583526611, "eval_runtime": 44.0975, "eval_samples_per_second": 0.862, "eval_steps_per_second": 0.862, "step": 300 }, { "acc": 0.6019598, "epoch": 0.8265582655826558, "grad_norm": 1.1328125, "learning_rate": 0.00012528530670470755, "loss": 1.71853142, "memory(GiB)": 15.95, "step": 305, "train_speed(iter/s)": 0.14098 }, { "acc": 0.6009192, "epoch": 0.8401084010840109, "grad_norm": 1.359375, "learning_rate": 0.00012475035663338086, "loss": 1.72751312, "memory(GiB)": 16.34, "step": 310, "train_speed(iter/s)": 0.141284 }, { "acc": 0.58907671, "epoch": 0.8536585365853658, "grad_norm": 0.98046875, "learning_rate": 0.0001242154065620542, "loss": 1.71860523, "memory(GiB)": 16.34, "step": 315, "train_speed(iter/s)": 0.141562 }, { "acc": 0.62246222, "epoch": 0.8672086720867209, "grad_norm": 1.0234375, "learning_rate": 0.00012368045649072752, "loss": 1.58847914, "memory(GiB)": 16.34, "step": 320, "train_speed(iter/s)": 0.141848 }, { "acc": 0.61927052, "epoch": 0.8807588075880759, "grad_norm": 1.59375, "learning_rate": 0.00012314550641940084, "loss": 1.54207745, "memory(GiB)": 16.34, "step": 325, "train_speed(iter/s)": 0.142133 }, { "acc": 0.60672359, "epoch": 0.8943089430894309, "grad_norm": 2.140625, "learning_rate": 0.00012261055634807416, "loss": 1.71789646, "memory(GiB)": 16.34, "step": 330, "train_speed(iter/s)": 0.142392 }, { "acc": 0.63249903, "epoch": 0.907859078590786, "grad_norm": 1.1328125, "learning_rate": 0.0001220756062767475, "loss": 1.46934195, "memory(GiB)": 16.34, "step": 335, "train_speed(iter/s)": 0.142658 }, { "acc": 0.60973873, "epoch": 0.9214092140921409, "grad_norm": 0.84375, "learning_rate": 0.00012154065620542082, "loss": 1.67951736, "memory(GiB)": 16.34, "step": 340, "train_speed(iter/s)": 0.142914 }, { "acc": 0.58657136, "epoch": 0.9349593495934959, "grad_norm": 1.2734375, "learning_rate": 0.00012100570613409413, "loss": 1.68692303, "memory(GiB)": 16.34, "step": 345, "train_speed(iter/s)": 0.143167 }, { "acc": 0.59965162, "epoch": 0.948509485094851, "grad_norm": 1.421875, "learning_rate": 0.00012047075606276746, "loss": 1.71949959, "memory(GiB)": 16.34, "step": 350, "train_speed(iter/s)": 0.143387 }, { "epoch": 0.948509485094851, "eval_acc": 0.6171254855598717, "eval_loss": 1.562721610069275, "eval_runtime": 44.0566, "eval_samples_per_second": 0.863, "eval_steps_per_second": 0.863, "step": 350 }, { "acc": 0.59322553, "epoch": 0.962059620596206, "grad_norm": 1.3046875, "learning_rate": 0.0001199358059914408, "loss": 1.66712799, "memory(GiB)": 16.34, "step": 355, "train_speed(iter/s)": 0.14111 }, { "acc": 0.59898105, "epoch": 0.975609756097561, "grad_norm": 1.265625, "learning_rate": 0.00011940085592011411, "loss": 1.66754684, "memory(GiB)": 16.34, "step": 360, "train_speed(iter/s)": 0.141373 }, { "acc": 0.60211391, "epoch": 0.989159891598916, "grad_norm": 1.359375, "learning_rate": 0.00011886590584878744, "loss": 1.67542057, "memory(GiB)": 16.34, "step": 365, "train_speed(iter/s)": 0.141638 }, { "acc": 0.63076615, "epoch": 1.002710027100271, "grad_norm": 0.8515625, "learning_rate": 0.00011833095577746076, "loss": 1.51196938, "memory(GiB)": 16.34, "step": 370, "train_speed(iter/s)": 0.141355 }, { "acc": 0.63542643, "epoch": 1.016260162601626, "grad_norm": 0.828125, "learning_rate": 0.00011779600570613407, "loss": 1.38132334, "memory(GiB)": 16.34, "step": 375, "train_speed(iter/s)": 0.141603 }, { "acc": 0.66358423, "epoch": 1.029810298102981, "grad_norm": 0.859375, "learning_rate": 0.00011726105563480742, "loss": 1.41512909, "memory(GiB)": 16.34, "step": 380, "train_speed(iter/s)": 0.141845 }, { "acc": 0.63692493, "epoch": 1.043360433604336, "grad_norm": 1.0859375, "learning_rate": 0.00011672610556348073, "loss": 1.38078823, "memory(GiB)": 16.34, "step": 385, "train_speed(iter/s)": 0.142082 }, { "acc": 0.6641499, "epoch": 1.056910569105691, "grad_norm": 1.125, "learning_rate": 0.00011619115549215406, "loss": 1.3458046, "memory(GiB)": 16.34, "step": 390, "train_speed(iter/s)": 0.142305 }, { "acc": 0.67276783, "epoch": 1.070460704607046, "grad_norm": 0.98828125, "learning_rate": 0.00011565620542082738, "loss": 1.2076004, "memory(GiB)": 16.34, "step": 395, "train_speed(iter/s)": 0.142533 }, { "acc": 0.65821433, "epoch": 1.084010840108401, "grad_norm": 1.15625, "learning_rate": 0.0001151212553495007, "loss": 1.38641891, "memory(GiB)": 16.34, "step": 400, "train_speed(iter/s)": 0.142742 }, { "epoch": 1.084010840108401, "eval_acc": 0.6162810336091876, "eval_loss": 1.5887514352798462, "eval_runtime": 44.1624, "eval_samples_per_second": 0.86, "eval_steps_per_second": 0.86, "step": 400 }, { "acc": 0.63352804, "epoch": 1.0975609756097562, "grad_norm": 1.2421875, "learning_rate": 0.00011458630527817403, "loss": 1.44696302, "memory(GiB)": 16.34, "step": 405, "train_speed(iter/s)": 0.140759 }, { "acc": 0.61955671, "epoch": 1.1111111111111112, "grad_norm": 1.5546875, "learning_rate": 0.00011405135520684734, "loss": 1.48915672, "memory(GiB)": 16.34, "step": 410, "train_speed(iter/s)": 0.140987 }, { "acc": 0.69259667, "epoch": 1.1246612466124661, "grad_norm": 0.796875, "learning_rate": 0.00011351640513552069, "loss": 1.22069292, "memory(GiB)": 16.34, "step": 415, "train_speed(iter/s)": 0.141191 }, { "acc": 0.67346158, "epoch": 1.1382113821138211, "grad_norm": 1.171875, "learning_rate": 0.000112981455064194, "loss": 1.36795778, "memory(GiB)": 16.34, "step": 420, "train_speed(iter/s)": 0.141417 }, { "acc": 0.67346702, "epoch": 1.151761517615176, "grad_norm": 1.0390625, "learning_rate": 0.00011244650499286732, "loss": 1.27111759, "memory(GiB)": 16.34, "step": 425, "train_speed(iter/s)": 0.141634 }, { "acc": 0.61625972, "epoch": 1.165311653116531, "grad_norm": 1.2578125, "learning_rate": 0.00011191155492154065, "loss": 1.42474003, "memory(GiB)": 16.34, "step": 430, "train_speed(iter/s)": 0.141853 }, { "acc": 0.65880041, "epoch": 1.1788617886178863, "grad_norm": 1.2578125, "learning_rate": 0.00011137660485021397, "loss": 1.33981524, "memory(GiB)": 16.34, "step": 435, "train_speed(iter/s)": 0.142048 }, { "acc": 0.63503499, "epoch": 1.1924119241192412, "grad_norm": 1.2265625, "learning_rate": 0.00011084165477888728, "loss": 1.44122829, "memory(GiB)": 16.34, "step": 440, "train_speed(iter/s)": 0.142254 }, { "acc": 0.61518383, "epoch": 1.2059620596205962, "grad_norm": 1.4765625, "learning_rate": 0.00011030670470756061, "loss": 1.54818697, "memory(GiB)": 16.34, "step": 445, "train_speed(iter/s)": 0.142446 }, { "acc": 0.68010144, "epoch": 1.2195121951219512, "grad_norm": 1.3828125, "learning_rate": 0.00010977175463623394, "loss": 1.28560524, "memory(GiB)": 16.34, "step": 450, "train_speed(iter/s)": 0.142639 }, { "epoch": 1.2195121951219512, "eval_acc": 0.6230366492146597, "eval_loss": 1.5483555793762207, "eval_runtime": 44.0588, "eval_samples_per_second": 0.862, "eval_steps_per_second": 0.862, "step": 450 }, { "acc": 0.64718337, "epoch": 1.2330623306233062, "grad_norm": 1.4375, "learning_rate": 0.00010923680456490727, "loss": 1.3778326, "memory(GiB)": 16.34, "step": 455, "train_speed(iter/s)": 0.140879 }, { "acc": 0.68637676, "epoch": 1.2466124661246614, "grad_norm": 1.453125, "learning_rate": 0.00010870185449358059, "loss": 1.21860657, "memory(GiB)": 16.34, "step": 460, "train_speed(iter/s)": 0.141097 }, { "acc": 0.63144946, "epoch": 1.2601626016260163, "grad_norm": 1.0703125, "learning_rate": 0.00010816690442225392, "loss": 1.38577023, "memory(GiB)": 16.34, "step": 465, "train_speed(iter/s)": 0.141294 }, { "acc": 0.66895781, "epoch": 1.2737127371273713, "grad_norm": 1.296875, "learning_rate": 0.00010763195435092724, "loss": 1.29167385, "memory(GiB)": 16.34, "step": 470, "train_speed(iter/s)": 0.141488 }, { "acc": 0.6553885, "epoch": 1.2872628726287263, "grad_norm": 1.1484375, "learning_rate": 0.00010709700427960055, "loss": 1.35011473, "memory(GiB)": 16.34, "step": 475, "train_speed(iter/s)": 0.141687 }, { "acc": 0.65576148, "epoch": 1.3008130081300813, "grad_norm": 1.1796875, "learning_rate": 0.00010656205420827388, "loss": 1.39116755, "memory(GiB)": 16.34, "step": 480, "train_speed(iter/s)": 0.141863 }, { "acc": 0.62025633, "epoch": 1.3143631436314362, "grad_norm": 1.375, "learning_rate": 0.00010602710413694721, "loss": 1.55190315, "memory(GiB)": 16.34, "step": 485, "train_speed(iter/s)": 0.142058 }, { "acc": 0.65667233, "epoch": 1.3279132791327912, "grad_norm": 1.1484375, "learning_rate": 0.00010549215406562054, "loss": 1.30497828, "memory(GiB)": 16.34, "step": 490, "train_speed(iter/s)": 0.142237 }, { "acc": 0.6458045, "epoch": 1.3414634146341464, "grad_norm": 2.421875, "learning_rate": 0.00010495720399429386, "loss": 1.40175285, "memory(GiB)": 16.34, "step": 495, "train_speed(iter/s)": 0.14241 }, { "acc": 0.65004997, "epoch": 1.3550135501355014, "grad_norm": 1.265625, "learning_rate": 0.00010442225392296718, "loss": 1.31625471, "memory(GiB)": 16.34, "step": 500, "train_speed(iter/s)": 0.142583 }, { "epoch": 1.3550135501355014, "eval_acc": 0.61932106063165, "eval_loss": 1.535814642906189, "eval_runtime": 44.07, "eval_samples_per_second": 0.862, "eval_steps_per_second": 0.862, "step": 500 }, { "acc": 0.65512385, "epoch": 1.3685636856368564, "grad_norm": 0.9375, "learning_rate": 0.00010388730385164051, "loss": 1.2989893, "memory(GiB)": 16.34, "step": 505, "train_speed(iter/s)": 0.140995 }, { "acc": 0.65444102, "epoch": 1.3821138211382114, "grad_norm": 1.5625, "learning_rate": 0.00010335235378031382, "loss": 1.32140932, "memory(GiB)": 16.34, "step": 510, "train_speed(iter/s)": 0.141172 }, { "acc": 0.66242504, "epoch": 1.3956639566395663, "grad_norm": 0.9140625, "learning_rate": 0.00010281740370898714, "loss": 1.34259834, "memory(GiB)": 16.34, "step": 515, "train_speed(iter/s)": 0.141336 }, { "acc": 0.62716827, "epoch": 1.4092140921409215, "grad_norm": 1.4296875, "learning_rate": 0.00010228245363766048, "loss": 1.46508284, "memory(GiB)": 16.34, "step": 520, "train_speed(iter/s)": 0.141508 }, { "acc": 0.61702657, "epoch": 1.4227642276422765, "grad_norm": 1.5546875, "learning_rate": 0.0001017475035663338, "loss": 1.43188276, "memory(GiB)": 16.34, "step": 525, "train_speed(iter/s)": 0.141681 }, { "acc": 0.63238263, "epoch": 1.4363143631436315, "grad_norm": 1.3671875, "learning_rate": 0.00010121255349500713, "loss": 1.52214499, "memory(GiB)": 16.34, "step": 530, "train_speed(iter/s)": 0.141854 }, { "acc": 0.6323854, "epoch": 1.4498644986449865, "grad_norm": 1.375, "learning_rate": 0.00010067760342368045, "loss": 1.38910236, "memory(GiB)": 16.34, "step": 535, "train_speed(iter/s)": 0.142019 }, { "acc": 0.60772176, "epoch": 1.4634146341463414, "grad_norm": 1.21875, "learning_rate": 0.00010014265335235376, "loss": 1.46768923, "memory(GiB)": 16.34, "step": 540, "train_speed(iter/s)": 0.14218 }, { "acc": 0.67315965, "epoch": 1.4769647696476964, "grad_norm": 1.34375, "learning_rate": 9.96077032810271e-05, "loss": 1.25136938, "memory(GiB)": 16.34, "step": 545, "train_speed(iter/s)": 0.142338 }, { "acc": 0.66408758, "epoch": 1.4905149051490514, "grad_norm": 2.046875, "learning_rate": 9.907275320970041e-05, "loss": 1.25655928, "memory(GiB)": 16.34, "step": 550, "train_speed(iter/s)": 0.142495 }, { "epoch": 1.4905149051490514, "eval_acc": 0.6254011146765749, "eval_loss": 1.5346177816390991, "eval_runtime": 44.1805, "eval_samples_per_second": 0.86, "eval_steps_per_second": 0.86, "step": 550 }, { "acc": 0.68655195, "epoch": 1.5040650406504064, "grad_norm": 1.125, "learning_rate": 9.853780313837375e-05, "loss": 1.08906736, "memory(GiB)": 16.34, "step": 555, "train_speed(iter/s)": 0.141037 }, { "acc": 0.65898876, "epoch": 1.5176151761517616, "grad_norm": 1.4296875, "learning_rate": 9.800285306704707e-05, "loss": 1.26696377, "memory(GiB)": 16.34, "step": 560, "train_speed(iter/s)": 0.14119 }, { "acc": 0.63674688, "epoch": 1.5311653116531165, "grad_norm": 1.5390625, "learning_rate": 9.746790299572039e-05, "loss": 1.38277016, "memory(GiB)": 16.34, "step": 565, "train_speed(iter/s)": 0.141349 }, { "acc": 0.67204466, "epoch": 1.5447154471544715, "grad_norm": 1.234375, "learning_rate": 9.693295292439372e-05, "loss": 1.25243311, "memory(GiB)": 16.34, "step": 570, "train_speed(iter/s)": 0.141502 }, { "acc": 0.67878027, "epoch": 1.5582655826558267, "grad_norm": 1.3828125, "learning_rate": 9.639800285306703e-05, "loss": 1.18253031, "memory(GiB)": 16.34, "step": 575, "train_speed(iter/s)": 0.141663 }, { "acc": 0.64645357, "epoch": 1.5718157181571817, "grad_norm": 1.671875, "learning_rate": 9.586305278174036e-05, "loss": 1.39789114, "memory(GiB)": 16.34, "step": 580, "train_speed(iter/s)": 0.141812 }, { "acc": 0.63896599, "epoch": 1.5853658536585367, "grad_norm": 1.5546875, "learning_rate": 9.532810271041368e-05, "loss": 1.37434454, "memory(GiB)": 16.34, "step": 585, "train_speed(iter/s)": 0.141961 }, { "acc": 0.67930498, "epoch": 1.5989159891598916, "grad_norm": 1.3671875, "learning_rate": 9.479315263908701e-05, "loss": 1.24675446, "memory(GiB)": 16.34, "step": 590, "train_speed(iter/s)": 0.142113 }, { "acc": 0.66582651, "epoch": 1.6124661246612466, "grad_norm": 1.1171875, "learning_rate": 9.425820256776034e-05, "loss": 1.33937092, "memory(GiB)": 16.34, "step": 595, "train_speed(iter/s)": 0.142256 }, { "acc": 0.65553112, "epoch": 1.6260162601626016, "grad_norm": 1.5859375, "learning_rate": 9.372325249643366e-05, "loss": 1.25127707, "memory(GiB)": 16.34, "step": 600, "train_speed(iter/s)": 0.142394 }, { "epoch": 1.6260162601626016, "eval_acc": 0.6233744299949333, "eval_loss": 1.5426762104034424, "eval_runtime": 44.3564, "eval_samples_per_second": 0.857, "eval_steps_per_second": 0.857, "step": 600 }, { "acc": 0.64805899, "epoch": 1.6395663956639566, "grad_norm": 1.078125, "learning_rate": 9.318830242510699e-05, "loss": 1.30654621, "memory(GiB)": 16.34, "step": 605, "train_speed(iter/s)": 0.141073 }, { "acc": 0.67441335, "epoch": 1.6531165311653115, "grad_norm": 1.34375, "learning_rate": 9.26533523537803e-05, "loss": 1.20785751, "memory(GiB)": 16.34, "step": 610, "train_speed(iter/s)": 0.141227 }, { "acc": 0.64989614, "epoch": 1.6666666666666665, "grad_norm": 2.25, "learning_rate": 9.211840228245362e-05, "loss": 1.41231976, "memory(GiB)": 16.34, "step": 615, "train_speed(iter/s)": 0.141374 }, { "acc": 0.65118213, "epoch": 1.6802168021680217, "grad_norm": 1.34375, "learning_rate": 9.158345221112695e-05, "loss": 1.33156195, "memory(GiB)": 16.34, "step": 620, "train_speed(iter/s)": 0.141527 }, { "acc": 0.68491917, "epoch": 1.6937669376693767, "grad_norm": 2.078125, "learning_rate": 9.104850213980028e-05, "loss": 1.15608921, "memory(GiB)": 16.34, "step": 625, "train_speed(iter/s)": 0.141656 }, { "acc": 0.6636075, "epoch": 1.7073170731707317, "grad_norm": 1.2890625, "learning_rate": 9.051355206847361e-05, "loss": 1.3377409, "memory(GiB)": 16.34, "step": 630, "train_speed(iter/s)": 0.141795 }, { "acc": 0.66115265, "epoch": 1.7208672086720869, "grad_norm": 1.796875, "learning_rate": 8.997860199714693e-05, "loss": 1.41666918, "memory(GiB)": 16.34, "step": 635, "train_speed(iter/s)": 0.141928 }, { "acc": 0.65316691, "epoch": 1.7344173441734418, "grad_norm": 1.875, "learning_rate": 8.944365192582024e-05, "loss": 1.37885714, "memory(GiB)": 16.34, "step": 640, "train_speed(iter/s)": 0.14206 }, { "acc": 0.63365035, "epoch": 1.7479674796747968, "grad_norm": 2.03125, "learning_rate": 8.890870185449357e-05, "loss": 1.43597651, "memory(GiB)": 16.34, "step": 645, "train_speed(iter/s)": 0.142195 }, { "acc": 0.6461009, "epoch": 1.7615176151761518, "grad_norm": 1.390625, "learning_rate": 8.837375178316689e-05, "loss": 1.40720987, "memory(GiB)": 16.34, "step": 650, "train_speed(iter/s)": 0.142326 }, { "epoch": 1.7615176151761518, "eval_acc": 0.6223610876541125, "eval_loss": 1.5279418230056763, "eval_runtime": 44.3195, "eval_samples_per_second": 0.857, "eval_steps_per_second": 0.857, "step": 650 }, { "acc": 0.68952079, "epoch": 1.7750677506775068, "grad_norm": 1.5859375, "learning_rate": 8.783880171184023e-05, "loss": 1.1804883, "memory(GiB)": 16.34, "step": 655, "train_speed(iter/s)": 0.141093 }, { "acc": 0.63278737, "epoch": 1.7886178861788617, "grad_norm": 1.3125, "learning_rate": 8.730385164051355e-05, "loss": 1.45864544, "memory(GiB)": 16.34, "step": 660, "train_speed(iter/s)": 0.141227 }, { "acc": 0.6499536, "epoch": 1.8021680216802167, "grad_norm": 1.0859375, "learning_rate": 8.676890156918687e-05, "loss": 1.36629667, "memory(GiB)": 16.34, "step": 665, "train_speed(iter/s)": 0.14135 }, { "acc": 0.66262636, "epoch": 1.8157181571815717, "grad_norm": 2.8125, "learning_rate": 8.62339514978602e-05, "loss": 1.28030796, "memory(GiB)": 16.34, "step": 670, "train_speed(iter/s)": 0.141486 }, { "acc": 0.6478013, "epoch": 1.8292682926829267, "grad_norm": 1.3203125, "learning_rate": 8.569900142653351e-05, "loss": 1.37910089, "memory(GiB)": 16.34, "step": 675, "train_speed(iter/s)": 0.141617 }, { "acc": 0.65061078, "epoch": 1.8428184281842819, "grad_norm": 1.859375, "learning_rate": 8.516405135520683e-05, "loss": 1.24110394, "memory(GiB)": 16.34, "step": 680, "train_speed(iter/s)": 0.141749 }, { "acc": 0.66720443, "epoch": 1.8563685636856369, "grad_norm": 1.2734375, "learning_rate": 8.462910128388016e-05, "loss": 1.36949673, "memory(GiB)": 16.34, "step": 685, "train_speed(iter/s)": 0.141882 }, { "acc": 0.65051932, "epoch": 1.8699186991869918, "grad_norm": 1.796875, "learning_rate": 8.409415121255349e-05, "loss": 1.3470686, "memory(GiB)": 16.34, "step": 690, "train_speed(iter/s)": 0.142007 }, { "acc": 0.64647999, "epoch": 1.883468834688347, "grad_norm": 1.0546875, "learning_rate": 8.355920114122682e-05, "loss": 1.27561255, "memory(GiB)": 16.34, "step": 695, "train_speed(iter/s)": 0.142124 }, { "acc": 0.64771528, "epoch": 1.897018970189702, "grad_norm": 2.5, "learning_rate": 8.302425106990014e-05, "loss": 1.3874403, "memory(GiB)": 16.34, "step": 700, "train_speed(iter/s)": 0.142242 }, { "epoch": 1.897018970189702, "eval_acc": 0.6274277993582165, "eval_loss": 1.5236802101135254, "eval_runtime": 44.4112, "eval_samples_per_second": 0.856, "eval_steps_per_second": 0.856, "step": 700 }, { "acc": 0.65817127, "epoch": 1.910569105691057, "grad_norm": 1.3203125, "learning_rate": 8.248930099857345e-05, "loss": 1.29632025, "memory(GiB)": 16.34, "step": 705, "train_speed(iter/s)": 0.141099 }, { "acc": 0.64647436, "epoch": 1.924119241192412, "grad_norm": 1.234375, "learning_rate": 8.195435092724678e-05, "loss": 1.39382238, "memory(GiB)": 16.34, "step": 710, "train_speed(iter/s)": 0.141217 }, { "acc": 0.65741391, "epoch": 1.937669376693767, "grad_norm": 0.98828125, "learning_rate": 8.14194008559201e-05, "loss": 1.32606802, "memory(GiB)": 16.34, "step": 715, "train_speed(iter/s)": 0.141337 }, { "acc": 0.65078535, "epoch": 1.951219512195122, "grad_norm": 1.6171875, "learning_rate": 8.088445078459343e-05, "loss": 1.28092451, "memory(GiB)": 16.34, "step": 720, "train_speed(iter/s)": 0.141457 }, { "acc": 0.64983764, "epoch": 1.9647696476964769, "grad_norm": 1.7890625, "learning_rate": 8.034950071326676e-05, "loss": 1.35295801, "memory(GiB)": 16.34, "step": 725, "train_speed(iter/s)": 0.141585 }, { "acc": 0.64880919, "epoch": 1.9783197831978319, "grad_norm": 1.6640625, "learning_rate": 7.981455064194009e-05, "loss": 1.38945732, "memory(GiB)": 16.34, "step": 730, "train_speed(iter/s)": 0.141703 }, { "acc": 0.64617214, "epoch": 1.9918699186991868, "grad_norm": 1.3671875, "learning_rate": 7.927960057061341e-05, "loss": 1.31189098, "memory(GiB)": 16.34, "step": 735, "train_speed(iter/s)": 0.141817 }, { "acc": 0.67255039, "epoch": 2.005420054200542, "grad_norm": 1.3359375, "learning_rate": 7.874465049928672e-05, "loss": 1.17746153, "memory(GiB)": 16.34, "step": 740, "train_speed(iter/s)": 0.141669 }, { "acc": 0.69897633, "epoch": 2.0189701897018972, "grad_norm": 1.1875, "learning_rate": 7.820970042796005e-05, "loss": 1.03704672, "memory(GiB)": 16.34, "step": 745, "train_speed(iter/s)": 0.141785 }, { "acc": 0.71377811, "epoch": 2.032520325203252, "grad_norm": 1.640625, "learning_rate": 7.767475035663337e-05, "loss": 1.04516726, "memory(GiB)": 16.34, "step": 750, "train_speed(iter/s)": 0.141905 }, { "epoch": 2.032520325203252, "eval_acc": 0.6208410741428813, "eval_loss": 1.603255271911621, "eval_runtime": 44.4359, "eval_samples_per_second": 0.855, "eval_steps_per_second": 0.855, "step": 750 }, { "acc": 0.73705945, "epoch": 2.046070460704607, "grad_norm": 1.8203125, "learning_rate": 7.713980028530669e-05, "loss": 0.9383255, "memory(GiB)": 16.34, "step": 755, "train_speed(iter/s)": 0.14084 }, { "acc": 0.73054934, "epoch": 2.059620596205962, "grad_norm": 1.8203125, "learning_rate": 7.660485021398003e-05, "loss": 0.91379232, "memory(GiB)": 16.34, "step": 760, "train_speed(iter/s)": 0.140958 }, { "acc": 0.73154573, "epoch": 2.073170731707317, "grad_norm": 1.6796875, "learning_rate": 7.606990014265335e-05, "loss": 0.97700481, "memory(GiB)": 16.34, "step": 765, "train_speed(iter/s)": 0.141076 }, { "acc": 0.72998781, "epoch": 2.086720867208672, "grad_norm": 1.4140625, "learning_rate": 7.553495007132668e-05, "loss": 0.93853807, "memory(GiB)": 16.34, "step": 770, "train_speed(iter/s)": 0.141186 }, { "acc": 0.73213534, "epoch": 2.100271002710027, "grad_norm": 1.78125, "learning_rate": 7.5e-05, "loss": 0.94589176, "memory(GiB)": 16.34, "step": 775, "train_speed(iter/s)": 0.141303 }, { "acc": 0.73035493, "epoch": 2.113821138211382, "grad_norm": 1.6953125, "learning_rate": 7.446504992867331e-05, "loss": 0.95597754, "memory(GiB)": 16.34, "step": 780, "train_speed(iter/s)": 0.141422 }, { "acc": 0.73428354, "epoch": 2.127371273712737, "grad_norm": 1.484375, "learning_rate": 7.393009985734664e-05, "loss": 0.98254423, "memory(GiB)": 16.34, "step": 785, "train_speed(iter/s)": 0.141543 }, { "acc": 0.69750729, "epoch": 2.140921409214092, "grad_norm": 1.828125, "learning_rate": 7.339514978601997e-05, "loss": 1.10206041, "memory(GiB)": 16.34, "step": 790, "train_speed(iter/s)": 0.141651 }, { "acc": 0.73764381, "epoch": 2.154471544715447, "grad_norm": 1.296875, "learning_rate": 7.286019971469329e-05, "loss": 0.84075432, "memory(GiB)": 16.34, "step": 795, "train_speed(iter/s)": 0.141765 }, { "acc": 0.72124152, "epoch": 2.168021680216802, "grad_norm": 1.4375, "learning_rate": 7.23252496433666e-05, "loss": 1.05863771, "memory(GiB)": 16.34, "step": 800, "train_speed(iter/s)": 0.141874 }, { "epoch": 2.168021680216802, "eval_acc": 0.6240499915554805, "eval_loss": 1.61227285861969, "eval_runtime": 44.3145, "eval_samples_per_second": 0.858, "eval_steps_per_second": 0.858, "step": 800 }, { "acc": 0.73518519, "epoch": 2.181571815718157, "grad_norm": 1.546875, "learning_rate": 7.179029957203993e-05, "loss": 0.96204119, "memory(GiB)": 16.34, "step": 805, "train_speed(iter/s)": 0.140881 }, { "acc": 0.74290161, "epoch": 2.1951219512195124, "grad_norm": 2.015625, "learning_rate": 7.125534950071326e-05, "loss": 0.92387733, "memory(GiB)": 16.34, "step": 810, "train_speed(iter/s)": 0.14021 }, { "acc": 0.73101602, "epoch": 2.2086720867208673, "grad_norm": 1.6015625, "learning_rate": 7.072039942938658e-05, "loss": 0.96248655, "memory(GiB)": 16.34, "step": 815, "train_speed(iter/s)": 0.140323 }, { "acc": 0.73887796, "epoch": 2.2222222222222223, "grad_norm": 1.796875, "learning_rate": 7.018544935805991e-05, "loss": 0.94751673, "memory(GiB)": 16.34, "step": 820, "train_speed(iter/s)": 0.140441 }, { "acc": 0.75793715, "epoch": 2.2357723577235773, "grad_norm": 2.015625, "learning_rate": 6.965049928673323e-05, "loss": 0.84263477, "memory(GiB)": 16.34, "step": 825, "train_speed(iter/s)": 0.140555 }, { "acc": 0.75873909, "epoch": 2.2493224932249323, "grad_norm": 1.953125, "learning_rate": 6.911554921540656e-05, "loss": 0.89048252, "memory(GiB)": 16.34, "step": 830, "train_speed(iter/s)": 0.14067 }, { "acc": 0.74456077, "epoch": 2.2628726287262872, "grad_norm": 1.7890625, "learning_rate": 6.858059914407987e-05, "loss": 0.90777779, "memory(GiB)": 16.34, "step": 835, "train_speed(iter/s)": 0.140775 }, { "acc": 0.75809846, "epoch": 2.2764227642276422, "grad_norm": 2.0625, "learning_rate": 6.80456490727532e-05, "loss": 0.87556753, "memory(GiB)": 16.34, "step": 840, "train_speed(iter/s)": 0.14088 }, { "acc": 0.73194971, "epoch": 2.289972899728997, "grad_norm": 2.765625, "learning_rate": 6.751069900142653e-05, "loss": 0.9232769, "memory(GiB)": 16.34, "step": 845, "train_speed(iter/s)": 0.140986 }, { "acc": 0.74470835, "epoch": 2.303523035230352, "grad_norm": 1.8515625, "learning_rate": 6.697574893009985e-05, "loss": 0.88600664, "memory(GiB)": 16.34, "step": 850, "train_speed(iter/s)": 0.141086 }, { "epoch": 2.303523035230352, "eval_acc": 0.6174632663401453, "eval_loss": 1.660492181777954, "eval_runtime": 44.1393, "eval_samples_per_second": 0.861, "eval_steps_per_second": 0.861, "step": 850 }, { "acc": 0.76063514, "epoch": 2.317073170731707, "grad_norm": 2.40625, "learning_rate": 6.644079885877318e-05, "loss": 0.80236473, "memory(GiB)": 16.34, "step": 855, "train_speed(iter/s)": 0.140168 }, { "acc": 0.71505499, "epoch": 2.330623306233062, "grad_norm": 2.671875, "learning_rate": 6.59058487874465e-05, "loss": 1.01317081, "memory(GiB)": 16.34, "step": 860, "train_speed(iter/s)": 0.140281 }, { "acc": 0.73396034, "epoch": 2.3441734417344176, "grad_norm": 1.703125, "learning_rate": 6.537089871611983e-05, "loss": 0.96496754, "memory(GiB)": 16.34, "step": 865, "train_speed(iter/s)": 0.140387 }, { "acc": 0.74145699, "epoch": 2.3577235772357725, "grad_norm": 1.9453125, "learning_rate": 6.483594864479316e-05, "loss": 1.00490999, "memory(GiB)": 16.34, "step": 870, "train_speed(iter/s)": 0.140492 }, { "acc": 0.76035104, "epoch": 2.3712737127371275, "grad_norm": 1.7265625, "learning_rate": 6.430099857346647e-05, "loss": 0.93969469, "memory(GiB)": 16.34, "step": 875, "train_speed(iter/s)": 0.140599 }, { "acc": 0.75214877, "epoch": 2.3848238482384825, "grad_norm": 1.9296875, "learning_rate": 6.376604850213979e-05, "loss": 0.9385232, "memory(GiB)": 16.34, "step": 880, "train_speed(iter/s)": 0.140705 }, { "acc": 0.73846526, "epoch": 2.3983739837398375, "grad_norm": 2.421875, "learning_rate": 6.323109843081312e-05, "loss": 0.95887814, "memory(GiB)": 16.34, "step": 885, "train_speed(iter/s)": 0.140812 }, { "acc": 0.72576594, "epoch": 2.4119241192411924, "grad_norm": 1.46875, "learning_rate": 6.269614835948645e-05, "loss": 0.95252094, "memory(GiB)": 16.34, "step": 890, "train_speed(iter/s)": 0.140912 }, { "acc": 0.74866586, "epoch": 2.4254742547425474, "grad_norm": 1.5546875, "learning_rate": 6.216119828815977e-05, "loss": 0.91533632, "memory(GiB)": 16.34, "step": 895, "train_speed(iter/s)": 0.141011 }, { "acc": 0.72289066, "epoch": 2.4390243902439024, "grad_norm": 1.375, "learning_rate": 6.162624821683308e-05, "loss": 0.996418, "memory(GiB)": 16.34, "step": 900, "train_speed(iter/s)": 0.13966 }, { "epoch": 2.4390243902439024, "eval_acc": 0.6152676912683668, "eval_loss": 1.6503233909606934, "eval_runtime": 44.1939, "eval_samples_per_second": 0.86, "eval_steps_per_second": 0.86, "step": 900 }, { "acc": 0.73535914, "epoch": 2.4525745257452574, "grad_norm": 1.3125, "learning_rate": 6.109129814550641e-05, "loss": 1.00612879, "memory(GiB)": 16.34, "step": 905, "train_speed(iter/s)": 0.138805 }, { "acc": 0.7277998, "epoch": 2.4661246612466123, "grad_norm": 2.328125, "learning_rate": 6.0556348074179737e-05, "loss": 1.00636635, "memory(GiB)": 16.34, "step": 910, "train_speed(iter/s)": 0.138904 }, { "acc": 0.70795035, "epoch": 2.4796747967479673, "grad_norm": 1.6484375, "learning_rate": 6.002139800285306e-05, "loss": 0.98440151, "memory(GiB)": 16.34, "step": 915, "train_speed(iter/s)": 0.139013 }, { "acc": 0.7650095, "epoch": 2.4932249322493227, "grad_norm": 2.25, "learning_rate": 5.948644793152638e-05, "loss": 0.77239523, "memory(GiB)": 16.34, "step": 920, "train_speed(iter/s)": 0.139126 }, { "acc": 0.71504927, "epoch": 2.5067750677506773, "grad_norm": 1.6328125, "learning_rate": 5.895149786019971e-05, "loss": 1.01518288, "memory(GiB)": 16.34, "step": 925, "train_speed(iter/s)": 0.139233 }, { "acc": 0.71582847, "epoch": 2.5203252032520327, "grad_norm": 2.28125, "learning_rate": 5.841654778887303e-05, "loss": 0.97862291, "memory(GiB)": 16.34, "step": 930, "train_speed(iter/s)": 0.13934 }, { "acc": 0.74984369, "epoch": 2.5338753387533877, "grad_norm": 1.8515625, "learning_rate": 5.788159771754635e-05, "loss": 0.85021992, "memory(GiB)": 16.34, "step": 935, "train_speed(iter/s)": 0.139448 }, { "acc": 0.74142261, "epoch": 2.5474254742547426, "grad_norm": 1.796875, "learning_rate": 5.734664764621968e-05, "loss": 0.91843338, "memory(GiB)": 16.34, "step": 940, "train_speed(iter/s)": 0.139564 }, { "acc": 0.71266222, "epoch": 2.5609756097560976, "grad_norm": 1.875, "learning_rate": 5.6811697574893007e-05, "loss": 1.14017658, "memory(GiB)": 16.34, "step": 945, "train_speed(iter/s)": 0.139667 }, { "acc": 0.73374839, "epoch": 2.5745257452574526, "grad_norm": 1.671875, "learning_rate": 5.627674750356633e-05, "loss": 0.99979248, "memory(GiB)": 16.34, "step": 950, "train_speed(iter/s)": 0.139769 }, { "epoch": 2.5745257452574526, "eval_acc": 0.6161121432190508, "eval_loss": 1.6511973142623901, "eval_runtime": 44.1881, "eval_samples_per_second": 0.86, "eval_steps_per_second": 0.86, "step": 950 }, { "acc": 0.74381332, "epoch": 2.5880758807588076, "grad_norm": 1.7734375, "learning_rate": 5.5741797432239646e-05, "loss": 0.93189411, "memory(GiB)": 16.34, "step": 955, "train_speed(iter/s)": 0.138972 }, { "acc": 0.73305674, "epoch": 2.6016260162601625, "grad_norm": 2.4375, "learning_rate": 5.5206847360912977e-05, "loss": 0.93946905, "memory(GiB)": 16.34, "step": 960, "train_speed(iter/s)": 0.139079 }, { "acc": 0.72961435, "epoch": 2.6151761517615175, "grad_norm": 1.84375, "learning_rate": 5.46718972895863e-05, "loss": 0.99850683, "memory(GiB)": 16.34, "step": 965, "train_speed(iter/s)": 0.139175 }, { "acc": 0.7133184, "epoch": 2.6287262872628725, "grad_norm": 1.96875, "learning_rate": 5.413694721825962e-05, "loss": 1.03378878, "memory(GiB)": 16.34, "step": 970, "train_speed(iter/s)": 0.139282 }, { "acc": 0.71164918, "epoch": 2.642276422764228, "grad_norm": 1.3828125, "learning_rate": 5.360199714693295e-05, "loss": 1.02268944, "memory(GiB)": 16.34, "step": 975, "train_speed(iter/s)": 0.139382 }, { "acc": 0.71824646, "epoch": 2.6558265582655824, "grad_norm": 1.84375, "learning_rate": 5.306704707560627e-05, "loss": 1.06014738, "memory(GiB)": 16.34, "step": 980, "train_speed(iter/s)": 0.139479 }, { "acc": 0.75523286, "epoch": 2.669376693766938, "grad_norm": 1.7734375, "learning_rate": 5.253209700427959e-05, "loss": 0.80533791, "memory(GiB)": 16.34, "step": 985, "train_speed(iter/s)": 0.139581 }, { "acc": 0.72265592, "epoch": 2.682926829268293, "grad_norm": 1.5546875, "learning_rate": 5.199714693295292e-05, "loss": 1.0223958, "memory(GiB)": 16.34, "step": 990, "train_speed(iter/s)": 0.139675 }, { "acc": 0.74640193, "epoch": 2.696476964769648, "grad_norm": 1.5546875, "learning_rate": 5.1462196861626247e-05, "loss": 0.8896265, "memory(GiB)": 16.34, "step": 995, "train_speed(iter/s)": 0.139767 }, { "acc": 0.75235152, "epoch": 2.710027100271003, "grad_norm": 1.6484375, "learning_rate": 5.092724679029957e-05, "loss": 0.87937946, "memory(GiB)": 16.34, "step": 1000, "train_speed(iter/s)": 0.139867 }, { "epoch": 2.710027100271003, "eval_acc": 0.6123965546360413, "eval_loss": 1.6541699171066284, "eval_runtime": 44.2136, "eval_samples_per_second": 0.859, "eval_steps_per_second": 0.859, "step": 1000 }, { "acc": 0.73264089, "epoch": 2.7235772357723578, "grad_norm": 1.96875, "learning_rate": 5.0392296718972886e-05, "loss": 0.95834293, "memory(GiB)": 16.34, "step": 1005, "train_speed(iter/s)": 0.139102 }, { "acc": 0.76931543, "epoch": 2.7371273712737128, "grad_norm": 1.7890625, "learning_rate": 4.9857346647646217e-05, "loss": 0.82889891, "memory(GiB)": 16.34, "step": 1010, "train_speed(iter/s)": 0.139199 }, { "acc": 0.74891815, "epoch": 2.7506775067750677, "grad_norm": 2.65625, "learning_rate": 4.932239657631954e-05, "loss": 0.93503094, "memory(GiB)": 16.34, "step": 1015, "train_speed(iter/s)": 0.139298 }, { "acc": 0.72551074, "epoch": 2.7642276422764227, "grad_norm": 1.5703125, "learning_rate": 4.878744650499286e-05, "loss": 1.06086788, "memory(GiB)": 16.34, "step": 1020, "train_speed(iter/s)": 0.139383 }, { "acc": 0.75321589, "epoch": 2.7777777777777777, "grad_norm": 1.7734375, "learning_rate": 4.825249643366619e-05, "loss": 0.83553734, "memory(GiB)": 16.34, "step": 1025, "train_speed(iter/s)": 0.139478 }, { "acc": 0.73832054, "epoch": 2.7913279132791327, "grad_norm": 1.53125, "learning_rate": 4.771754636233951e-05, "loss": 0.94093418, "memory(GiB)": 16.34, "step": 1030, "train_speed(iter/s)": 0.139572 }, { "acc": 0.76351671, "epoch": 2.8048780487804876, "grad_norm": 1.2890625, "learning_rate": 4.718259629101283e-05, "loss": 0.84423561, "memory(GiB)": 16.34, "step": 1035, "train_speed(iter/s)": 0.139664 }, { "acc": 0.75100279, "epoch": 2.818428184281843, "grad_norm": 1.890625, "learning_rate": 4.6647646219686156e-05, "loss": 0.91757078, "memory(GiB)": 16.34, "step": 1040, "train_speed(iter/s)": 0.139754 }, { "acc": 0.73302913, "epoch": 2.8319783197831976, "grad_norm": 1.3671875, "learning_rate": 4.6112696148359487e-05, "loss": 0.94331446, "memory(GiB)": 16.34, "step": 1045, "train_speed(iter/s)": 0.139841 }, { "acc": 0.73944206, "epoch": 2.845528455284553, "grad_norm": 1.59375, "learning_rate": 4.55777460770328e-05, "loss": 0.98726377, "memory(GiB)": 16.34, "step": 1050, "train_speed(iter/s)": 0.139931 }, { "epoch": 2.845528455284553, "eval_acc": 0.6145921297078196, "eval_loss": 1.6571751832962036, "eval_runtime": 44.3705, "eval_samples_per_second": 0.856, "eval_steps_per_second": 0.856, "step": 1050 }, { "acc": 0.73031135, "epoch": 2.859078590785908, "grad_norm": 2.25, "learning_rate": 4.5042796005706126e-05, "loss": 0.95681438, "memory(GiB)": 16.34, "step": 1055, "train_speed(iter/s)": 0.139201 }, { "acc": 0.71694627, "epoch": 2.872628726287263, "grad_norm": 1.6328125, "learning_rate": 4.4507845934379456e-05, "loss": 1.02266083, "memory(GiB)": 16.34, "step": 1060, "train_speed(iter/s)": 0.139294 }, { "acc": 0.72791638, "epoch": 2.886178861788618, "grad_norm": 1.7578125, "learning_rate": 4.397289586305278e-05, "loss": 0.99341927, "memory(GiB)": 16.34, "step": 1065, "train_speed(iter/s)": 0.139385 }, { "acc": 0.74939594, "epoch": 2.899728997289973, "grad_norm": 1.921875, "learning_rate": 4.34379457917261e-05, "loss": 0.91077061, "memory(GiB)": 16.34, "step": 1070, "train_speed(iter/s)": 0.139478 }, { "acc": 0.72694654, "epoch": 2.913279132791328, "grad_norm": 1.7265625, "learning_rate": 4.290299572039942e-05, "loss": 0.98774853, "memory(GiB)": 16.34, "step": 1075, "train_speed(iter/s)": 0.139564 }, { "acc": 0.70588508, "epoch": 2.926829268292683, "grad_norm": 2.15625, "learning_rate": 4.236804564907275e-05, "loss": 1.07887812, "memory(GiB)": 16.34, "step": 1080, "train_speed(iter/s)": 0.139657 }, { "acc": 0.74654303, "epoch": 2.940379403794038, "grad_norm": 1.7109375, "learning_rate": 4.183309557774607e-05, "loss": 0.91062069, "memory(GiB)": 16.34, "step": 1085, "train_speed(iter/s)": 0.139743 }, { "acc": 0.73493595, "epoch": 2.953929539295393, "grad_norm": 2.03125, "learning_rate": 4.1298145506419396e-05, "loss": 0.92819033, "memory(GiB)": 16.34, "step": 1090, "train_speed(iter/s)": 0.139829 }, { "acc": 0.71466756, "epoch": 2.9674796747967482, "grad_norm": 2.109375, "learning_rate": 4.0763195435092727e-05, "loss": 1.01220913, "memory(GiB)": 16.34, "step": 1095, "train_speed(iter/s)": 0.139916 }, { "acc": 0.7607831, "epoch": 2.9810298102981028, "grad_norm": 2.375, "learning_rate": 4.022824536376604e-05, "loss": 0.8505785, "memory(GiB)": 16.34, "step": 1100, "train_speed(iter/s)": 0.140004 }, { "epoch": 2.9810298102981028, "eval_acc": 0.6206721837527445, "eval_loss": 1.6525288820266724, "eval_runtime": 44.2124, "eval_samples_per_second": 0.859, "eval_steps_per_second": 0.859, "step": 1100 }, { "acc": 0.74116454, "epoch": 2.994579945799458, "grad_norm": 2.65625, "learning_rate": 3.9693295292439366e-05, "loss": 0.92264805, "memory(GiB)": 16.34, "step": 1105, "train_speed(iter/s)": 0.139311 }, { "acc": 0.8182869, "epoch": 3.008130081300813, "grad_norm": 1.3671875, "learning_rate": 3.915834522111269e-05, "loss": 0.6927434, "memory(GiB)": 16.34, "step": 1110, "train_speed(iter/s)": 0.13923 }, { "acc": 0.83515587, "epoch": 3.021680216802168, "grad_norm": 1.9140625, "learning_rate": 3.862339514978602e-05, "loss": 0.58284421, "memory(GiB)": 16.34, "step": 1115, "train_speed(iter/s)": 0.139319 }, { "acc": 0.84008894, "epoch": 3.035230352303523, "grad_norm": 1.8828125, "learning_rate": 3.808844507845934e-05, "loss": 0.59128432, "memory(GiB)": 16.34, "step": 1120, "train_speed(iter/s)": 0.139405 }, { "acc": 0.84298267, "epoch": 3.048780487804878, "grad_norm": 2.4375, "learning_rate": 3.755349500713266e-05, "loss": 0.56685705, "memory(GiB)": 16.34, "step": 1125, "train_speed(iter/s)": 0.139493 }, { "acc": 0.83680372, "epoch": 3.062330623306233, "grad_norm": 3.171875, "learning_rate": 3.701854493580599e-05, "loss": 0.55599914, "memory(GiB)": 16.34, "step": 1130, "train_speed(iter/s)": 0.139571 }, { "acc": 0.81269236, "epoch": 3.075880758807588, "grad_norm": 2.40625, "learning_rate": 3.648359486447931e-05, "loss": 0.63413367, "memory(GiB)": 16.34, "step": 1135, "train_speed(iter/s)": 0.139659 }, { "acc": 0.8300642, "epoch": 3.089430894308943, "grad_norm": 2.71875, "learning_rate": 3.5948644793152636e-05, "loss": 0.59288554, "memory(GiB)": 16.34, "step": 1140, "train_speed(iter/s)": 0.139738 }, { "acc": 0.84225779, "epoch": 3.102981029810298, "grad_norm": 2.703125, "learning_rate": 3.541369472182596e-05, "loss": 0.57104263, "memory(GiB)": 16.34, "step": 1145, "train_speed(iter/s)": 0.13982 }, { "acc": 0.86128368, "epoch": 3.116531165311653, "grad_norm": 3.0, "learning_rate": 3.487874465049928e-05, "loss": 0.47776127, "memory(GiB)": 16.34, "step": 1150, "train_speed(iter/s)": 0.139903 }, { "epoch": 3.116531165311653, "eval_acc": 0.6091876372234419, "eval_loss": 1.9067459106445312, "eval_runtime": 44.213, "eval_samples_per_second": 0.859, "eval_steps_per_second": 0.859, "step": 1150 }, { "acc": 0.81826935, "epoch": 3.130081300813008, "grad_norm": 2.234375, "learning_rate": 3.4343794579172606e-05, "loss": 0.64635777, "memory(GiB)": 16.34, "step": 1155, "train_speed(iter/s)": 0.13924 }, { "acc": 0.79423141, "epoch": 3.1436314363143634, "grad_norm": 2.859375, "learning_rate": 3.380884450784593e-05, "loss": 0.71202464, "memory(GiB)": 16.34, "step": 1160, "train_speed(iter/s)": 0.139319 }, { "acc": 0.81948729, "epoch": 3.1571815718157183, "grad_norm": 2.046875, "learning_rate": 3.327389443651925e-05, "loss": 0.62771091, "memory(GiB)": 16.34, "step": 1165, "train_speed(iter/s)": 0.1394 }, { "acc": 0.85081501, "epoch": 3.1707317073170733, "grad_norm": 2.125, "learning_rate": 3.2738944365192576e-05, "loss": 0.55290155, "memory(GiB)": 16.34, "step": 1170, "train_speed(iter/s)": 0.139484 }, { "acc": 0.84410248, "epoch": 3.1842818428184283, "grad_norm": 2.140625, "learning_rate": 3.2203994293865906e-05, "loss": 0.57252893, "memory(GiB)": 16.34, "step": 1175, "train_speed(iter/s)": 0.139567 }, { "acc": 0.78565025, "epoch": 3.1978319783197833, "grad_norm": 3.203125, "learning_rate": 3.166904422253922e-05, "loss": 0.73501797, "memory(GiB)": 16.34, "step": 1180, "train_speed(iter/s)": 0.13965 }, { "acc": 0.78738356, "epoch": 3.2113821138211383, "grad_norm": 2.953125, "learning_rate": 3.113409415121255e-05, "loss": 0.72265806, "memory(GiB)": 16.34, "step": 1185, "train_speed(iter/s)": 0.139732 }, { "acc": 0.818221, "epoch": 3.2249322493224932, "grad_norm": 2.21875, "learning_rate": 3.0599144079885876e-05, "loss": 0.61062384, "memory(GiB)": 16.34, "step": 1190, "train_speed(iter/s)": 0.139813 }, { "acc": 0.82556448, "epoch": 3.238482384823848, "grad_norm": 2.3125, "learning_rate": 3.00641940085592e-05, "loss": 0.61080799, "memory(GiB)": 16.34, "step": 1195, "train_speed(iter/s)": 0.139893 }, { "acc": 0.82586126, "epoch": 3.252032520325203, "grad_norm": 2.34375, "learning_rate": 2.952924393723252e-05, "loss": 0.6164794, "memory(GiB)": 16.34, "step": 1200, "train_speed(iter/s)": 0.139974 }, { "epoch": 3.252032520325203, "eval_acc": 0.6044587062996115, "eval_loss": 1.8586076498031616, "eval_runtime": 44.1645, "eval_samples_per_second": 0.86, "eval_steps_per_second": 0.86, "step": 1200 }, { "acc": 0.82835131, "epoch": 3.265582655826558, "grad_norm": 6.46875, "learning_rate": 2.8994293865905846e-05, "loss": 0.61753302, "memory(GiB)": 16.34, "step": 1205, "train_speed(iter/s)": 0.139339 }, { "acc": 0.82603951, "epoch": 3.279132791327913, "grad_norm": 2.90625, "learning_rate": 2.8459343794579173e-05, "loss": 0.58360481, "memory(GiB)": 16.34, "step": 1210, "train_speed(iter/s)": 0.139423 }, { "acc": 0.79192553, "epoch": 3.292682926829268, "grad_norm": 2.5, "learning_rate": 2.7924393723252493e-05, "loss": 0.75090132, "memory(GiB)": 16.34, "step": 1215, "train_speed(iter/s)": 0.1395 }, { "acc": 0.83583679, "epoch": 3.306233062330623, "grad_norm": 2.15625, "learning_rate": 2.738944365192582e-05, "loss": 0.56217456, "memory(GiB)": 16.34, "step": 1220, "train_speed(iter/s)": 0.139581 }, { "acc": 0.80481911, "epoch": 3.3197831978319785, "grad_norm": 4.125, "learning_rate": 2.685449358059914e-05, "loss": 0.73073688, "memory(GiB)": 16.34, "step": 1225, "train_speed(iter/s)": 0.139659 }, { "acc": 0.8173975, "epoch": 3.3333333333333335, "grad_norm": 3.1875, "learning_rate": 2.6319543509272466e-05, "loss": 0.59854188, "memory(GiB)": 16.34, "step": 1230, "train_speed(iter/s)": 0.139735 }, { "acc": 0.88763266, "epoch": 3.3468834688346885, "grad_norm": 2.078125, "learning_rate": 2.5784593437945793e-05, "loss": 0.3878314, "memory(GiB)": 16.34, "step": 1235, "train_speed(iter/s)": 0.139826 }, { "acc": 0.82199507, "epoch": 3.3604336043360434, "grad_norm": 2.28125, "learning_rate": 2.5249643366619113e-05, "loss": 0.65350094, "memory(GiB)": 16.34, "step": 1240, "train_speed(iter/s)": 0.139901 }, { "acc": 0.83115234, "epoch": 3.3739837398373984, "grad_norm": 2.484375, "learning_rate": 2.471469329529244e-05, "loss": 0.61815515, "memory(GiB)": 16.34, "step": 1245, "train_speed(iter/s)": 0.13998 }, { "acc": 0.81266117, "epoch": 3.3875338753387534, "grad_norm": 1.78125, "learning_rate": 2.417974322396576e-05, "loss": 0.65289016, "memory(GiB)": 16.34, "step": 1250, "train_speed(iter/s)": 0.140052 }, { "epoch": 3.3875338753387534, "eval_acc": 0.6037831447390644, "eval_loss": 1.893083095550537, "eval_runtime": 44.1639, "eval_samples_per_second": 0.86, "eval_steps_per_second": 0.86, "step": 1250 }, { "acc": 0.84776039, "epoch": 3.4010840108401084, "grad_norm": 2.046875, "learning_rate": 2.3644793152639086e-05, "loss": 0.53162961, "memory(GiB)": 16.34, "step": 1255, "train_speed(iter/s)": 0.139443 }, { "acc": 0.84318447, "epoch": 3.4146341463414633, "grad_norm": 2.234375, "learning_rate": 2.3109843081312406e-05, "loss": 0.52226324, "memory(GiB)": 16.34, "step": 1260, "train_speed(iter/s)": 0.139515 }, { "acc": 0.80521078, "epoch": 3.4281842818428183, "grad_norm": 2.25, "learning_rate": 2.2574893009985733e-05, "loss": 0.62338996, "memory(GiB)": 16.34, "step": 1265, "train_speed(iter/s)": 0.13959 }, { "acc": 0.83542995, "epoch": 3.4417344173441733, "grad_norm": 3.0625, "learning_rate": 2.203994293865906e-05, "loss": 0.60965805, "memory(GiB)": 16.34, "step": 1270, "train_speed(iter/s)": 0.139638 }, { "acc": 0.8333952, "epoch": 3.4552845528455283, "grad_norm": 2.625, "learning_rate": 2.150499286733238e-05, "loss": 0.60031776, "memory(GiB)": 16.34, "step": 1275, "train_speed(iter/s)": 0.139705 }, { "acc": 0.84367867, "epoch": 3.4688346883468837, "grad_norm": 1.859375, "learning_rate": 2.0970042796005706e-05, "loss": 0.54981174, "memory(GiB)": 16.34, "step": 1280, "train_speed(iter/s)": 0.13978 }, { "acc": 0.8121892, "epoch": 3.4823848238482387, "grad_norm": 2.046875, "learning_rate": 2.0435092724679026e-05, "loss": 0.67483497, "memory(GiB)": 16.34, "step": 1285, "train_speed(iter/s)": 0.139851 }, { "acc": 0.87883768, "epoch": 3.4959349593495936, "grad_norm": 1.6875, "learning_rate": 1.9900142653352353e-05, "loss": 0.45111437, "memory(GiB)": 16.34, "step": 1290, "train_speed(iter/s)": 0.139924 }, { "acc": 0.81952734, "epoch": 3.5094850948509486, "grad_norm": 2.640625, "learning_rate": 1.9365192582025676e-05, "loss": 0.66096244, "memory(GiB)": 16.34, "step": 1295, "train_speed(iter/s)": 0.139995 }, { "acc": 0.82709446, "epoch": 3.5230352303523036, "grad_norm": 2.140625, "learning_rate": 1.8830242510699e-05, "loss": 0.58278303, "memory(GiB)": 16.34, "step": 1300, "train_speed(iter/s)": 0.140068 }, { "epoch": 3.5230352303523036, "eval_acc": 0.603276473568654, "eval_loss": 1.8986655473709106, "eval_runtime": 44.3091, "eval_samples_per_second": 0.858, "eval_steps_per_second": 0.858, "step": 1300 }, { "acc": 0.83689537, "epoch": 3.5365853658536586, "grad_norm": 1.9453125, "learning_rate": 1.8295292439372323e-05, "loss": 0.55435977, "memory(GiB)": 16.34, "step": 1305, "train_speed(iter/s)": 0.139476 }, { "acc": 0.8525279, "epoch": 3.5501355013550135, "grad_norm": 2.21875, "learning_rate": 1.776034236804565e-05, "loss": 0.51831579, "memory(GiB)": 16.34, "step": 1310, "train_speed(iter/s)": 0.139549 }, { "acc": 0.83099527, "epoch": 3.5636856368563685, "grad_norm": 1.8828125, "learning_rate": 1.7225392296718973e-05, "loss": 0.61766572, "memory(GiB)": 16.34, "step": 1315, "train_speed(iter/s)": 0.139617 }, { "acc": 0.82115297, "epoch": 3.5772357723577235, "grad_norm": 2.765625, "learning_rate": 1.6690442225392296e-05, "loss": 0.62109137, "memory(GiB)": 16.34, "step": 1320, "train_speed(iter/s)": 0.13969 }, { "acc": 0.85132771, "epoch": 3.5907859078590785, "grad_norm": 2.421875, "learning_rate": 1.615549215406562e-05, "loss": 0.55242, "memory(GiB)": 16.34, "step": 1325, "train_speed(iter/s)": 0.139763 }, { "acc": 0.82085381, "epoch": 3.6043360433604335, "grad_norm": 2.8125, "learning_rate": 1.5620542082738943e-05, "loss": 0.62965093, "memory(GiB)": 16.34, "step": 1330, "train_speed(iter/s)": 0.139837 }, { "acc": 0.85592861, "epoch": 3.617886178861789, "grad_norm": 1.875, "learning_rate": 1.5085592011412266e-05, "loss": 0.54980545, "memory(GiB)": 16.34, "step": 1335, "train_speed(iter/s)": 0.139907 }, { "acc": 0.80786858, "epoch": 3.6314363143631434, "grad_norm": 2.328125, "learning_rate": 1.4550641940085591e-05, "loss": 0.67223382, "memory(GiB)": 16.34, "step": 1340, "train_speed(iter/s)": 0.139974 }, { "acc": 0.85045481, "epoch": 3.644986449864499, "grad_norm": 2.078125, "learning_rate": 1.4015691868758914e-05, "loss": 0.50758219, "memory(GiB)": 16.34, "step": 1345, "train_speed(iter/s)": 0.140054 }, { "acc": 0.85536547, "epoch": 3.658536585365854, "grad_norm": 1.7734375, "learning_rate": 1.348074179743224e-05, "loss": 0.53356686, "memory(GiB)": 16.34, "step": 1350, "train_speed(iter/s)": 0.140124 }, { "epoch": 3.658536585365854, "eval_acc": 0.6010808984968755, "eval_loss": 1.8972593545913696, "eval_runtime": 44.2915, "eval_samples_per_second": 0.858, "eval_steps_per_second": 0.858, "step": 1350 }, { "acc": 0.8181016, "epoch": 3.6720867208672088, "grad_norm": 2.3125, "learning_rate": 1.2945791726105563e-05, "loss": 0.59544349, "memory(GiB)": 16.34, "step": 1355, "train_speed(iter/s)": 0.139555 }, { "acc": 0.87799835, "epoch": 3.6856368563685638, "grad_norm": 2.03125, "learning_rate": 1.2410841654778886e-05, "loss": 0.42857742, "memory(GiB)": 16.34, "step": 1360, "train_speed(iter/s)": 0.139631 }, { "acc": 0.81485729, "epoch": 3.6991869918699187, "grad_norm": 1.65625, "learning_rate": 1.187589158345221e-05, "loss": 0.64030995, "memory(GiB)": 16.34, "step": 1365, "train_speed(iter/s)": 0.139703 }, { "acc": 0.82011805, "epoch": 3.7127371273712737, "grad_norm": 2.125, "learning_rate": 1.1340941512125534e-05, "loss": 0.63428659, "memory(GiB)": 16.34, "step": 1370, "train_speed(iter/s)": 0.139771 }, { "acc": 0.78548884, "epoch": 3.7262872628726287, "grad_norm": 3.984375, "learning_rate": 1.0805991440798858e-05, "loss": 0.74558139, "memory(GiB)": 16.34, "step": 1375, "train_speed(iter/s)": 0.139836 }, { "acc": 0.85822716, "epoch": 3.7398373983739837, "grad_norm": 2.328125, "learning_rate": 1.0271041369472183e-05, "loss": 0.47486768, "memory(GiB)": 16.34, "step": 1380, "train_speed(iter/s)": 0.139902 }, { "acc": 0.83469105, "epoch": 3.7533875338753386, "grad_norm": 2.53125, "learning_rate": 9.736091298145506e-06, "loss": 0.56590233, "memory(GiB)": 16.34, "step": 1385, "train_speed(iter/s)": 0.139971 }, { "acc": 0.80486965, "epoch": 3.7669376693766936, "grad_norm": 3.28125, "learning_rate": 9.20114122681883e-06, "loss": 0.68021841, "memory(GiB)": 16.34, "step": 1390, "train_speed(iter/s)": 0.140036 }, { "acc": 0.84560328, "epoch": 3.7804878048780486, "grad_norm": 2.421875, "learning_rate": 8.666191155492154e-06, "loss": 0.52926989, "memory(GiB)": 16.34, "step": 1395, "train_speed(iter/s)": 0.140102 }, { "acc": 0.83154497, "epoch": 3.794037940379404, "grad_norm": 2.671875, "learning_rate": 8.131241084165478e-06, "loss": 0.5866107, "memory(GiB)": 16.34, "step": 1400, "train_speed(iter/s)": 0.14017 }, { "epoch": 3.794037940379404, "eval_acc": 0.6036142543489276, "eval_loss": 1.8983793258666992, "eval_runtime": 44.4268, "eval_samples_per_second": 0.855, "eval_steps_per_second": 0.855, "step": 1400 }, { "acc": 0.84980259, "epoch": 3.8075880758807585, "grad_norm": 2.859375, "learning_rate": 7.596291012838801e-06, "loss": 0.56505013, "memory(GiB)": 16.34, "step": 1405, "train_speed(iter/s)": 0.139621 }, { "acc": 0.80126667, "epoch": 3.821138211382114, "grad_norm": 2.21875, "learning_rate": 7.061340941512125e-06, "loss": 0.67650762, "memory(GiB)": 16.34, "step": 1410, "train_speed(iter/s)": 0.139688 }, { "acc": 0.79974804, "epoch": 3.834688346883469, "grad_norm": 2.3125, "learning_rate": 6.5263908701854486e-06, "loss": 0.743855, "memory(GiB)": 16.34, "step": 1415, "train_speed(iter/s)": 0.139749 }, { "acc": 0.84023552, "epoch": 3.848238482384824, "grad_norm": 2.234375, "learning_rate": 5.991440798858773e-06, "loss": 0.57722979, "memory(GiB)": 16.34, "step": 1420, "train_speed(iter/s)": 0.139813 }, { "acc": 0.82472792, "epoch": 3.861788617886179, "grad_norm": 2.484375, "learning_rate": 5.456490727532097e-06, "loss": 0.56691737, "memory(GiB)": 16.34, "step": 1425, "train_speed(iter/s)": 0.139877 }, { "acc": 0.83573132, "epoch": 3.875338753387534, "grad_norm": 1.9765625, "learning_rate": 4.92154065620542e-06, "loss": 0.60298834, "memory(GiB)": 16.34, "step": 1430, "train_speed(iter/s)": 0.13994 }, { "acc": 0.83143454, "epoch": 3.888888888888889, "grad_norm": 1.9453125, "learning_rate": 4.386590584878744e-06, "loss": 0.61832671, "memory(GiB)": 16.34, "step": 1435, "train_speed(iter/s)": 0.140003 }, { "acc": 0.83209105, "epoch": 3.902439024390244, "grad_norm": 3.265625, "learning_rate": 3.851640513552068e-06, "loss": 0.6038147, "memory(GiB)": 16.34, "step": 1440, "train_speed(iter/s)": 0.140065 }, { "acc": 0.79715924, "epoch": 3.915989159891599, "grad_norm": 2.015625, "learning_rate": 3.316690442225392e-06, "loss": 0.70867634, "memory(GiB)": 16.34, "step": 1445, "train_speed(iter/s)": 0.140125 }, { "acc": 0.85556803, "epoch": 3.9295392953929538, "grad_norm": 1.7734375, "learning_rate": 2.781740370898716e-06, "loss": 0.52723417, "memory(GiB)": 16.34, "step": 1450, "train_speed(iter/s)": 0.140182 }, { "epoch": 3.9295392953929538, "eval_acc": 0.6029386927883803, "eval_loss": 1.8965932130813599, "eval_runtime": 44.4125, "eval_samples_per_second": 0.856, "eval_steps_per_second": 0.856, "step": 1450 }, { "acc": 0.83306837, "epoch": 3.943089430894309, "grad_norm": 2.25, "learning_rate": 2.2467902995720398e-06, "loss": 0.55906692, "memory(GiB)": 16.34, "step": 1455, "train_speed(iter/s)": 0.139645 }, { "acc": 0.81574478, "epoch": 3.9566395663956637, "grad_norm": 2.6875, "learning_rate": 1.7118402282453637e-06, "loss": 0.62746248, "memory(GiB)": 16.34, "step": 1460, "train_speed(iter/s)": 0.139708 }, { "acc": 0.83905201, "epoch": 3.970189701897019, "grad_norm": 2.46875, "learning_rate": 1.1768901569186875e-06, "loss": 0.56978045, "memory(GiB)": 16.34, "step": 1465, "train_speed(iter/s)": 0.139771 }, { "acc": 0.85569115, "epoch": 3.983739837398374, "grad_norm": 1.90625, "learning_rate": 6.419400855920114e-07, "loss": 0.48882861, "memory(GiB)": 16.34, "step": 1470, "train_speed(iter/s)": 0.139834 }, { "acc": 0.82850809, "epoch": 3.997289972899729, "grad_norm": 1.828125, "learning_rate": 1.0699001426533523e-07, "loss": 0.56869435, "memory(GiB)": 16.34, "step": 1475, "train_speed(iter/s)": 0.139901 }, { "epoch": 4.0, "eval_acc": 0.6037831447390644, "eval_loss": 1.8978389501571655, "eval_runtime": 44.2826, "eval_samples_per_second": 0.858, "eval_steps_per_second": 0.858, "step": 1476 } ], "logging_steps": 5, "max_steps": 1476, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 369, "total_flos": 1.991427194376192e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }