diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,3251 +1,2005 @@ { - "best_metric": 1.89783895, - "best_model_checkpoint": "D:\\_____NEW_NN\\LLM\\MiniCPM-V\\finetune\\output\\phi3-vision-128k-instruct\\v0-20240531-071942\\checkpoint-1476", - "epoch": 4.0, + "best_metric": 1.52509904, + "best_model_checkpoint": "D:\\_____NEW_NN\\LLM\\MiniCPM-V\\finetune\\output\\phi3-vision-128k-instruct\\v0-20240629-080216\\checkpoint-300", + "epoch": 3.5225048923679063, "eval_steps": 50, - "global_step": 1476, + "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "acc": 0.47843874, - "epoch": 0.0027100271002710027, - "grad_norm": 0.921875, - "learning_rate": 2.027027027027027e-06, - "loss": 2.52092218, - "memory(GiB)": 12.33, + "acc": 0.49833804, + "epoch": 0.003913894324853229, + "grad_norm": 0.77734375, + "learning_rate": 2.745098039215686e-06, + "loss": 2.37747383, + "memory(GiB)": 17.35, "step": 1, - "train_speed(iter/s)": 0.066334 + "train_speed(iter/s)": 0.076826 }, { - "acc": 0.51126236, - "epoch": 0.013550135501355014, - "grad_norm": 0.796875, - "learning_rate": 1.0135135135135135e-05, - "loss": 2.42214346, - "memory(GiB)": 13.38, + "acc": 0.50652587, + "epoch": 0.019569471624266144, + "grad_norm": 1.140625, + "learning_rate": 1.372549019607843e-05, + "loss": 2.29183841, + "memory(GiB)": 19.33, "step": 5, - "train_speed(iter/s)": 0.127981 + "train_speed(iter/s)": 0.082188 }, { - "acc": 0.48636103, - "epoch": 0.02710027100271003, - "grad_norm": 1.09375, - "learning_rate": 2.027027027027027e-05, - "loss": 2.53938828, - "memory(GiB)": 13.38, + "acc": 0.52587533, + "epoch": 0.03913894324853229, + "grad_norm": 0.68359375, + "learning_rate": 2.745098039215686e-05, + "loss": 2.22724895, + "memory(GiB)": 19.89, "step": 10, - "train_speed(iter/s)": 0.144622 + "train_speed(iter/s)": 0.082805 }, { - "acc": 0.47292571, - "epoch": 0.04065040650406504, - "grad_norm": 1.1328125, - "learning_rate": 3.0405405405405404e-05, - "loss": 2.50403137, - "memory(GiB)": 13.38, + "acc": 0.52128973, + "epoch": 0.05870841487279843, + "grad_norm": 0.8359375, + "learning_rate": 4.117647058823529e-05, + "loss": 2.27491264, + "memory(GiB)": 19.24, "step": 15, - "train_speed(iter/s)": 0.150329 + "train_speed(iter/s)": 0.082482 }, { - "acc": 0.52069569, - "epoch": 0.05420054200542006, - "grad_norm": 0.83984375, - "learning_rate": 4.054054054054054e-05, - "loss": 2.28312988, - "memory(GiB)": 13.38, + "acc": 0.51135335, + "epoch": 0.07827788649706457, + "grad_norm": 0.66015625, + "learning_rate": 5.490196078431372e-05, + "loss": 2.32762127, + "memory(GiB)": 19.86, "step": 20, - "train_speed(iter/s)": 0.152832 + "train_speed(iter/s)": 0.082557 }, { - "acc": 0.53072515, - "epoch": 0.06775067750677506, - "grad_norm": 0.59765625, - "learning_rate": 5.067567567567567e-05, - "loss": 2.20206394, - "memory(GiB)": 13.38, + "acc": 0.54442377, + "epoch": 0.09784735812133072, + "grad_norm": 0.65625, + "learning_rate": 6.862745098039214e-05, + "loss": 2.09772224, + "memory(GiB)": 19.05, "step": 25, - "train_speed(iter/s)": 0.154186 + "train_speed(iter/s)": 0.082348 }, { - "acc": 0.52423077, - "epoch": 0.08130081300813008, - "grad_norm": 0.89453125, - "learning_rate": 6.081081081081081e-05, - "loss": 2.31712227, - "memory(GiB)": 14.44, + "acc": 0.5545311, + "epoch": 0.11741682974559686, + "grad_norm": 0.62109375, + "learning_rate": 8.235294117647058e-05, + "loss": 2.00072975, + "memory(GiB)": 19.89, "step": 30, - "train_speed(iter/s)": 0.155403 + "train_speed(iter/s)": 0.082166 }, { - "acc": 0.54341574, - "epoch": 0.0948509485094851, - "grad_norm": 0.7109375, - "learning_rate": 7.094594594594594e-05, - "loss": 2.03825722, - "memory(GiB)": 14.44, + "acc": 0.57092514, + "epoch": 0.136986301369863, + "grad_norm": 0.9296875, + "learning_rate": 9.6078431372549e-05, + "loss": 1.94450474, + "memory(GiB)": 19.16, "step": 35, - "train_speed(iter/s)": 0.156383 + "train_speed(iter/s)": 0.081966 }, { - "acc": 0.56128917, - "epoch": 0.10840108401084012, - "grad_norm": 0.734375, - "learning_rate": 8.108108108108108e-05, - "loss": 1.98176575, - "memory(GiB)": 14.44, + "acc": 0.56716595, + "epoch": 0.15655577299412915, + "grad_norm": 0.7734375, + "learning_rate": 0.00010980392156862745, + "loss": 1.90242462, + "memory(GiB)": 19.62, "step": 40, - "train_speed(iter/s)": 0.156973 + "train_speed(iter/s)": 0.081987 }, { - "acc": 0.53511982, - "epoch": 0.12195121951219512, - "grad_norm": 1.1484375, - "learning_rate": 9.121621621621621e-05, - "loss": 2.07999401, - "memory(GiB)": 14.44, + "acc": 0.57822714, + "epoch": 0.1761252446183953, + "grad_norm": 0.74609375, + "learning_rate": 0.00012352941176470587, + "loss": 1.83147659, + "memory(GiB)": 19.99, "step": 45, - "train_speed(iter/s)": 0.157395 + "train_speed(iter/s)": 0.081878 }, { - "acc": 0.56112757, - "epoch": 0.13550135501355012, - "grad_norm": 0.83984375, - "learning_rate": 0.00010135135135135135, - "loss": 1.92188244, - "memory(GiB)": 14.44, + "acc": 0.57696896, + "epoch": 0.19569471624266144, + "grad_norm": 0.85546875, + "learning_rate": 0.00013725490196078428, + "loss": 1.82299595, + "memory(GiB)": 19.11, "step": 50, - "train_speed(iter/s)": 0.157924 + "train_speed(iter/s)": 0.081843 }, { - "epoch": 0.13550135501355012, - "eval_acc": 0.5618983279851376, - "eval_loss": 1.901513695716858, - "eval_runtime": 44.6741, - "eval_samples_per_second": 0.851, - "eval_steps_per_second": 0.851, + "epoch": 0.19569471624266144, + "eval_acc": 0.583503534956795, + "eval_loss": 1.8029242753982544, + "eval_runtime": 85.1254, + "eval_samples_per_second": 0.893, + "eval_steps_per_second": 0.446, "step": 50 }, { - "acc": 0.57467785, - "epoch": 0.14905149051490515, - "grad_norm": 0.9921875, - "learning_rate": 0.00011148648648648647, - "loss": 1.92710114, - "memory(GiB)": 15.21, + "acc": 0.59343066, + "epoch": 0.21526418786692758, + "grad_norm": 1.0, + "learning_rate": 0.0001399941138119636, + "loss": 1.82339039, + "memory(GiB)": 22.92, "step": 55, - "train_speed(iter/s)": 0.140365 + "train_speed(iter/s)": 0.072544 }, { - "acc": 0.55041471, - "epoch": 0.16260162601626016, - "grad_norm": 1.1484375, - "learning_rate": 0.00012162162162162162, - "loss": 2.09731407, - "memory(GiB)": 15.21, + "acc": 0.58571839, + "epoch": 0.23483365949119372, + "grad_norm": 0.7734375, + "learning_rate": 0.00013997020286964757, + "loss": 1.80549526, + "memory(GiB)": 19.43, "step": 60, - "train_speed(iter/s)": 0.141894 + "train_speed(iter/s)": 0.073269 }, { - "acc": 0.53853202, - "epoch": 0.17615176151761516, - "grad_norm": 1.03125, - "learning_rate": 0.00013175675675675675, - "loss": 2.01167774, - "memory(GiB)": 15.21, + "acc": 0.60369935, + "epoch": 0.25440313111545987, + "grad_norm": 0.99609375, + "learning_rate": 0.0001399279055646442, + "loss": 1.6768074, + "memory(GiB)": 19.57, "step": 65, - "train_speed(iter/s)": 0.143162 + "train_speed(iter/s)": 0.073897 }, { - "acc": 0.57625084, - "epoch": 0.1897018970189702, - "grad_norm": 0.81640625, - "learning_rate": 0.00014189189189189188, - "loss": 1.77697067, - "memory(GiB)": 15.21, + "acc": 0.58763909, + "epoch": 0.273972602739726, + "grad_norm": 1.1640625, + "learning_rate": 0.00013986723301159307, + "loss": 1.79169483, + "memory(GiB)": 19.48, "step": 70, - "train_speed(iter/s)": 0.144318 + "train_speed(iter/s)": 0.074533 }, { - "acc": 0.57468171, - "epoch": 0.2032520325203252, - "grad_norm": 0.96484375, - "learning_rate": 0.00014989300998573466, - "loss": 1.92596684, - "memory(GiB)": 15.21, + "acc": 0.58979025, + "epoch": 0.29354207436399216, + "grad_norm": 0.69140625, + "learning_rate": 0.00013978820115367462, + "loss": 1.72388344, + "memory(GiB)": 19.35, "step": 75, - "train_speed(iter/s)": 0.145344 + "train_speed(iter/s)": 0.075045 }, { - "acc": 0.58551345, - "epoch": 0.21680216802168023, - "grad_norm": 1.0546875, - "learning_rate": 0.00014935805991440798, - "loss": 1.85387783, - "memory(GiB)": 15.21, + "acc": 0.59725327, + "epoch": 0.3131115459882583, + "grad_norm": 0.75, + "learning_rate": 0.00013969083075842048, + "loss": 1.70864868, + "memory(GiB)": 19.49, "step": 80, - "train_speed(iter/s)": 0.146049 + "train_speed(iter/s)": 0.075523 }, { - "acc": 0.54782548, - "epoch": 0.23035230352303523, - "grad_norm": 1.0078125, - "learning_rate": 0.0001488231098430813, - "loss": 2.04980812, - "memory(GiB)": 15.21, + "acc": 0.60098982, + "epoch": 0.33268101761252444, + "grad_norm": 4.59375, + "learning_rate": 0.00013957514741225646, + "loss": 1.67311764, + "memory(GiB)": 20.01, "step": 85, - "train_speed(iter/s)": 0.146726 + "train_speed(iter/s)": 0.075928 }, { - "acc": 0.56594706, - "epoch": 0.24390243902439024, - "grad_norm": 1.0, - "learning_rate": 0.0001482881597717546, - "loss": 1.93099308, - "memory(GiB)": 15.21, + "acc": 0.58315139, + "epoch": 0.3522504892367906, + "grad_norm": 0.8359375, + "learning_rate": 0.00013944118151377894, + "loss": 1.74437752, + "memory(GiB)": 20.14, "step": 90, - "train_speed(iter/s)": 0.147437 + "train_speed(iter/s)": 0.076154 }, { - "acc": 0.57289362, - "epoch": 0.25745257452574527, - "grad_norm": 0.8046875, - "learning_rate": 0.00014775320970042795, - "loss": 1.84038963, - "memory(GiB)": 15.58, + "acc": 0.6138227, + "epoch": 0.37181996086105673, + "grad_norm": 0.75, + "learning_rate": 0.0001392889682657671, + "loss": 1.63750076, + "memory(GiB)": 19.59, "step": 95, - "train_speed(iter/s)": 0.147976 + "train_speed(iter/s)": 0.076253 }, { - "acc": 0.5938961, - "epoch": 0.27100271002710025, - "grad_norm": 1.015625, - "learning_rate": 0.00014721825962910127, - "loss": 1.71151619, - "memory(GiB)": 15.58, + "acc": 0.63383026, + "epoch": 0.3913894324853229, + "grad_norm": 0.8515625, + "learning_rate": 0.00013911854766593233, + "loss": 1.56653557, + "memory(GiB)": 19.5, "step": 100, - "train_speed(iter/s)": 0.148573 + "train_speed(iter/s)": 0.076386 }, { - "epoch": 0.27100271002710025, - "eval_acc": 0.587907448066205, - "eval_loss": 1.7437644004821777, - "eval_runtime": 44.775, - "eval_samples_per_second": 0.849, - "eval_steps_per_second": 0.849, + "epoch": 0.3913894324853229, + "eval_acc": 0.604241948153967, + "eval_loss": 1.6681365966796875, + "eval_runtime": 72.2811, + "eval_samples_per_second": 1.051, + "eval_steps_per_second": 0.526, "step": 100 }, { - "acc": 0.5828393, - "epoch": 0.2845528455284553, - "grad_norm": 1.078125, - "learning_rate": 0.00014668330955777461, - "loss": 1.90272522, - "memory(GiB)": 15.58, + "acc": 0.61646304, + "epoch": 0.410958904109589, + "grad_norm": 0.73046875, + "learning_rate": 0.00013892996449640807, + "loss": 1.59651537, + "memory(GiB)": 22.5, "step": 105, - "train_speed(iter/s)": 0.140241 + "train_speed(iter/s)": 0.072857 }, { - "acc": 0.57981925, - "epoch": 0.2981029810298103, - "grad_norm": 0.859375, - "learning_rate": 0.00014614835948644793, - "loss": 1.88997154, - "memory(GiB)": 15.58, + "acc": 0.60897431, + "epoch": 0.43052837573385516, + "grad_norm": 0.83984375, + "learning_rate": 0.00013872326831198205, + "loss": 1.70257473, + "memory(GiB)": 19.42, "step": 110, - "train_speed(iter/s)": 0.141057 + "train_speed(iter/s)": 0.073309 }, { - "acc": 0.56191244, - "epoch": 0.3116531165311653, - "grad_norm": 0.94140625, - "learning_rate": 0.00014561340941512125, - "loss": 1.92102909, - "memory(GiB)": 15.58, + "acc": 0.58328586, + "epoch": 0.4500978473581213, + "grad_norm": 0.9453125, + "learning_rate": 0.00013849851342707462, + "loss": 1.71216717, + "memory(GiB)": 19.47, "step": 115, - "train_speed(iter/s)": 0.141824 + "train_speed(iter/s)": 0.073753 }, { - "acc": 0.57420607, - "epoch": 0.3252032520325203, - "grad_norm": 1.171875, - "learning_rate": 0.00014507845934379456, - "loss": 1.80924416, - "memory(GiB)": 15.58, + "acc": 0.62397904, + "epoch": 0.46966731898238745, + "grad_norm": 0.80078125, + "learning_rate": 0.0001382557589014664, + "loss": 1.54239073, + "memory(GiB)": 19.33, "step": 120, - "train_speed(iter/s)": 0.14256 + "train_speed(iter/s)": 0.074078 }, { - "acc": 0.57703466, - "epoch": 0.33875338753387535, - "grad_norm": 0.84765625, - "learning_rate": 0.00014454350927246788, - "loss": 1.85804043, - "memory(GiB)": 15.58, + "acc": 0.60271235, + "epoch": 0.4892367906066536, + "grad_norm": 1.171875, + "learning_rate": 0.0001379950685247788, + "loss": 1.72333088, + "memory(GiB)": 19.37, "step": 125, - "train_speed(iter/s)": 0.143173 + "train_speed(iter/s)": 0.074428 }, { - "acc": 0.58581357, - "epoch": 0.3523035230352303, - "grad_norm": 0.96484375, - "learning_rate": 0.0001440085592011412, - "loss": 1.75103779, - "memory(GiB)": 15.58, + "acc": 0.5755064, + "epoch": 0.5088062622309197, + "grad_norm": 0.94921875, + "learning_rate": 0.00013771651079971182, + "loss": 1.81728477, + "memory(GiB)": 19.52, "step": 130, - "train_speed(iter/s)": 0.143759 + "train_speed(iter/s)": 0.074768 }, { - "acc": 0.59179254, - "epoch": 0.36585365853658536, - "grad_norm": 0.93359375, - "learning_rate": 0.00014347360912981454, - "loss": 1.67790794, - "memory(GiB)": 15.58, + "acc": 0.5844254, + "epoch": 0.5283757338551859, + "grad_norm": 0.8515625, + "learning_rate": 0.00013742015892404325, + "loss": 1.77252998, + "memory(GiB)": 19.51, "step": 135, - "train_speed(iter/s)": 0.14428 + "train_speed(iter/s)": 0.075066 }, { - "acc": 0.59449296, - "epoch": 0.3794037940379404, - "grad_norm": 0.74609375, - "learning_rate": 0.00014293865905848786, - "loss": 1.72561359, - "memory(GiB)": 15.58, + "acc": 0.5998323, + "epoch": 0.547945205479452, + "grad_norm": 0.8671875, + "learning_rate": 0.0001371060907713942, + "loss": 1.69012871, + "memory(GiB)": 19.54, "step": 140, - "train_speed(iter/s)": 0.144799 + "train_speed(iter/s)": 0.07528 }, { - "acc": 0.57711525, - "epoch": 0.39295392953929537, - "grad_norm": 1.109375, - "learning_rate": 0.0001424037089871612, - "loss": 1.79506092, - "memory(GiB)": 15.58, + "acc": 0.62686119, + "epoch": 0.5675146771037182, + "grad_norm": 0.68359375, + "learning_rate": 0.00013677438887076603, + "loss": 1.66314449, + "memory(GiB)": 19.54, "step": 145, - "train_speed(iter/s)": 0.14532 + "train_speed(iter/s)": 0.075467 }, { - "acc": 0.57541366, - "epoch": 0.4065040650406504, - "grad_norm": 5.46875, - "learning_rate": 0.00014186875891583452, - "loss": 1.80377541, - "memory(GiB)": 15.58, + "acc": 0.59954901, + "epoch": 0.5870841487279843, + "grad_norm": 0.6328125, + "learning_rate": 0.00013642514038485367, + "loss": 1.67525444, + "memory(GiB)": 19.55, "step": 150, - "train_speed(iter/s)": 0.145757 + "train_speed(iter/s)": 0.075722 }, { - "epoch": 0.4065040650406504, - "eval_acc": 0.5983786522546867, - "eval_loss": 1.676965594291687, - "eval_runtime": 44.3227, - "eval_samples_per_second": 0.857, - "eval_steps_per_second": 0.857, + "epoch": 0.5870841487279843, + "eval_acc": 0.6184603299293009, + "eval_loss": 1.5965631008148193, + "eval_runtime": 72.3005, + "eval_samples_per_second": 1.051, + "eval_steps_per_second": 0.526, "step": 150 }, { - "acc": 0.58702893, - "epoch": 0.42005420054200543, - "grad_norm": 1.0625, - "learning_rate": 0.00014133380884450783, - "loss": 1.86865826, - "memory(GiB)": 15.58, + "acc": 0.585955, + "epoch": 0.6066536203522505, + "grad_norm": 0.9375, + "learning_rate": 0.00013605843708714162, + "loss": 1.7486639, + "memory(GiB)": 23.22, "step": 155, - "train_speed(iter/s)": 0.140358 + "train_speed(iter/s)": 0.073368 }, { - "acc": 0.58541198, - "epoch": 0.43360433604336046, - "grad_norm": 1.015625, - "learning_rate": 0.00014079885877318115, - "loss": 1.82644196, - "memory(GiB)": 15.58, + "acc": 0.62769904, + "epoch": 0.6262230919765166, + "grad_norm": 0.7265625, + "learning_rate": 0.00013567437533778826, + "loss": 1.55238762, + "memory(GiB)": 19.62, "step": 160, - "train_speed(iter/s)": 0.140936 + "train_speed(iter/s)": 0.073628 }, { - "acc": 0.58856125, - "epoch": 0.44715447154471544, - "grad_norm": 0.9921875, - "learning_rate": 0.00014026390870185447, - "loss": 1.68714104, - "memory(GiB)": 15.58, + "acc": 0.63651643, + "epoch": 0.6457925636007827, + "grad_norm": 0.80078125, + "learning_rate": 0.00013527305605830488, + "loss": 1.54306393, + "memory(GiB)": 19.88, "step": 165, - "train_speed(iter/s)": 0.141495 + "train_speed(iter/s)": 0.073903 }, { - "acc": 0.59458299, - "epoch": 0.46070460704607047, - "grad_norm": 0.91015625, - "learning_rate": 0.0001397289586305278, - "loss": 1.72063637, - "memory(GiB)": 15.58, + "acc": 0.59288979, + "epoch": 0.6653620352250489, + "grad_norm": 0.703125, + "learning_rate": 0.0001348545847050361, + "loss": 1.69727612, + "memory(GiB)": 19.58, "step": 170, - "train_speed(iter/s)": 0.142005 + "train_speed(iter/s)": 0.074077 }, { - "acc": 0.57354069, - "epoch": 0.4742547425474255, - "grad_norm": 0.7734375, - "learning_rate": 0.00013919400855920113, - "loss": 1.83291931, - "memory(GiB)": 15.58, + "acc": 0.61248484, + "epoch": 0.684931506849315, + "grad_norm": 0.9140625, + "learning_rate": 0.00013441907124144866, + "loss": 1.65900764, + "memory(GiB)": 19.49, "step": 175, - "train_speed(iter/s)": 0.142509 + "train_speed(iter/s)": 0.074329 }, { - "acc": 0.58851786, - "epoch": 0.4878048780487805, - "grad_norm": 1.234375, - "learning_rate": 0.00013865905848787447, - "loss": 1.70540199, - "memory(GiB)": 15.58, + "acc": 0.61740661, + "epoch": 0.7045009784735812, + "grad_norm": 0.90625, + "learning_rate": 0.0001339666301092358, + "loss": 1.6518961, + "memory(GiB)": 19.68, "step": 180, - "train_speed(iter/s)": 0.143002 + "train_speed(iter/s)": 0.074558 }, { - "acc": 0.59666262, - "epoch": 0.5013550135501355, - "grad_norm": 0.87890625, - "learning_rate": 0.0001381241084165478, - "loss": 1.70772285, - "memory(GiB)": 15.58, + "acc": 0.62250223, + "epoch": 0.7240704500978473, + "grad_norm": 0.84765625, + "learning_rate": 0.00013349738019824512, + "loss": 1.55100412, + "memory(GiB)": 19.34, "step": 185, - "train_speed(iter/s)": 0.143463 + "train_speed(iter/s)": 0.07477 }, { - "acc": 0.58001013, - "epoch": 0.5149051490514905, - "grad_norm": 0.890625, - "learning_rate": 0.0001375891583452211, - "loss": 1.73730106, - "memory(GiB)": 15.58, + "acc": 0.61055808, + "epoch": 0.7436399217221135, + "grad_norm": 0.90625, + "learning_rate": 0.00013301144481523718, + "loss": 1.67241592, + "memory(GiB)": 19.56, "step": 190, - "train_speed(iter/s)": 0.143883 + "train_speed(iter/s)": 0.075006 }, { - "acc": 0.5934463, - "epoch": 0.5284552845528455, - "grad_norm": 1.421875, - "learning_rate": 0.00013705420827389442, - "loss": 1.68941402, - "memory(GiB)": 15.58, + "acc": 0.6389596, + "epoch": 0.7632093933463796, + "grad_norm": 0.83203125, + "learning_rate": 0.00013250895165148384, + "loss": 1.54227753, + "memory(GiB)": 19.29, "step": 195, - "train_speed(iter/s)": 0.144297 + "train_speed(iter/s)": 0.075192 }, { - "acc": 0.58476119, - "epoch": 0.5420054200542005, - "grad_norm": 0.91015625, - "learning_rate": 0.00013651925820256774, - "loss": 1.74896088, - "memory(GiB)": 15.58, + "acc": 0.59149747, + "epoch": 0.7827788649706457, + "grad_norm": 0.68359375, + "learning_rate": 0.00013199003274921416, + "loss": 1.71190453, + "memory(GiB)": 19.35, "step": 200, - "train_speed(iter/s)": 0.144655 + "train_speed(iter/s)": 0.075393 }, { - "epoch": 0.5420054200542005, - "eval_acc": 0.6105387603445364, - "eval_loss": 1.6297248601913452, - "eval_runtime": 44.3331, - "eval_samples_per_second": 0.857, - "eval_steps_per_second": 0.857, + "epoch": 0.7827788649706457, + "eval_acc": 0.6241162608012569, + "eval_loss": 1.5573129653930664, + "eval_runtime": 69.5471, + "eval_samples_per_second": 1.093, + "eval_steps_per_second": 0.546, "step": 200 }, { - "acc": 0.58323898, - "epoch": 0.5555555555555556, - "grad_norm": 0.79296875, - "learning_rate": 0.00013598430813124105, - "loss": 1.74196358, - "memory(GiB)": 15.58, + "acc": 0.62623324, + "epoch": 0.8023483365949119, + "grad_norm": 0.81640625, + "learning_rate": 0.00013145482446691724, + "loss": 1.55779324, + "memory(GiB)": 20.56, "step": 205, - "train_speed(iter/s)": 0.140612 + "train_speed(iter/s)": 0.073671 }, { - "acc": 0.59854908, - "epoch": 0.5691056910569106, - "grad_norm": 1.109375, - "learning_rate": 0.0001354493580599144, - "loss": 1.6279623, - "memory(GiB)": 15.58, + "acc": 0.61495056, + "epoch": 0.821917808219178, + "grad_norm": 1.03125, + "learning_rate": 0.00013090346744351058, + "loss": 1.56424398, + "memory(GiB)": 19.48, "step": 210, - "train_speed(iter/s)": 0.141053 + "train_speed(iter/s)": 0.073902 }, { - "acc": 0.58306313, - "epoch": 0.5826558265582655, - "grad_norm": 0.98828125, - "learning_rate": 0.00013491440798858771, - "loss": 1.85492191, - "memory(GiB)": 15.58, + "acc": 0.59643593, + "epoch": 0.8414872798434442, + "grad_norm": 1.0703125, + "learning_rate": 0.00013033610656138395, + "loss": 1.62190418, + "memory(GiB)": 19.5, "step": 215, - "train_speed(iter/s)": 0.141461 + "train_speed(iter/s)": 0.074133 }, { - "acc": 0.58454275, - "epoch": 0.5962059620596206, - "grad_norm": 0.890625, - "learning_rate": 0.00013437945791726106, - "loss": 1.75104046, - "memory(GiB)": 15.58, + "acc": 0.63052382, + "epoch": 0.8610567514677103, + "grad_norm": 0.59765625, + "learning_rate": 0.00012975289090832792, + "loss": 1.53521852, + "memory(GiB)": 19.53, "step": 220, - "train_speed(iter/s)": 0.141869 + "train_speed(iter/s)": 0.074334 }, { - "acc": 0.60898943, - "epoch": 0.6097560975609756, - "grad_norm": 0.828125, - "learning_rate": 0.00013384450784593437, - "loss": 1.57786808, - "memory(GiB)": 15.58, + "acc": 0.61408448, + "epoch": 0.8806262230919765, + "grad_norm": 0.7734375, + "learning_rate": 0.00012915397373835754, + "loss": 1.59712257, + "memory(GiB)": 19.52, "step": 225, - "train_speed(iter/s)": 0.142237 + "train_speed(iter/s)": 0.074533 }, { - "acc": 0.58954048, - "epoch": 0.6233062330623306, - "grad_norm": 1.1640625, - "learning_rate": 0.0001333095577746077, - "loss": 1.72581081, - "memory(GiB)": 15.58, + "acc": 0.62307076, + "epoch": 0.9001956947162426, + "grad_norm": 0.66796875, + "learning_rate": 0.00012853951243144105, + "loss": 1.57903328, + "memory(GiB)": 19.49, "step": 230, - "train_speed(iter/s)": 0.142622 + "train_speed(iter/s)": 0.074719 }, { - "acc": 0.59608021, - "epoch": 0.6368563685636857, - "grad_norm": 1.2265625, - "learning_rate": 0.000132774607703281, - "loss": 1.70160866, - "memory(GiB)": 15.58, + "acc": 0.61717134, + "epoch": 0.9197651663405088, + "grad_norm": 0.84375, + "learning_rate": 0.00012790966845214457, + "loss": 1.61422024, + "memory(GiB)": 19.25, "step": 235, - "train_speed(iter/s)": 0.142983 + "train_speed(iter/s)": 0.074916 }, { - "acc": 0.57084417, - "epoch": 0.6504065040650406, - "grad_norm": 1.1640625, - "learning_rate": 0.00013223965763195432, - "loss": 1.81941319, - "memory(GiB)": 15.58, + "acc": 0.62549253, + "epoch": 0.9393346379647749, + "grad_norm": 0.8125, + "learning_rate": 0.0001272646073072033, + "loss": 1.62806015, + "memory(GiB)": 19.36, "step": 240, - "train_speed(iter/s)": 0.143329 + "train_speed(iter/s)": 0.0751 }, { - "acc": 0.61476159, - "epoch": 0.6639566395663956, - "grad_norm": 0.9453125, - "learning_rate": 0.00013170470756062767, - "loss": 1.68505306, - "memory(GiB)": 15.58, + "acc": 0.61903515, + "epoch": 0.958904109589041, + "grad_norm": 0.74609375, + "learning_rate": 0.0001266044985020307, + "loss": 1.55927486, + "memory(GiB)": 19.36, "step": 245, - "train_speed(iter/s)": 0.143652 + "train_speed(iter/s)": 0.075266 }, { - "acc": 0.5995626, - "epoch": 0.6775067750677507, - "grad_norm": 0.9140625, - "learning_rate": 0.00013116975748930098, - "loss": 1.77789631, - "memory(GiB)": 15.58, + "acc": 0.61238952, + "epoch": 0.9784735812133072, + "grad_norm": 0.87890625, + "learning_rate": 0.00012592951549617683, + "loss": 1.52888412, + "memory(GiB)": 19.33, "step": 250, - "train_speed(iter/s)": 0.144002 + "train_speed(iter/s)": 0.075438 }, { - "epoch": 0.6775067750677507, - "eval_acc": 0.61087654112481, - "eval_loss": 1.6108067035675049, - "eval_runtime": 44.1094, - "eval_samples_per_second": 0.861, - "eval_steps_per_second": 0.861, + "epoch": 0.9784735812133072, + "eval_acc": 0.6267085624509033, + "eval_loss": 1.5281730890274048, + "eval_runtime": 69.069, + "eval_samples_per_second": 1.1, + "eval_steps_per_second": 0.55, "step": 250 }, { - "acc": 0.62165804, - "epoch": 0.6910569105691057, - "grad_norm": 1.6171875, - "learning_rate": 0.0001306348074179743, - "loss": 1.57510653, - "memory(GiB)": 15.58, + "acc": 0.63230977, + "epoch": 0.9980430528375733, + "grad_norm": 0.84765625, + "learning_rate": 0.00012523983565774753, + "loss": 1.53058205, + "memory(GiB)": 19.46, "step": 255, - "train_speed(iter/s)": 0.140805 + "train_speed(iter/s)": 0.074081 }, { - "acc": 0.59346585, - "epoch": 0.7046070460704607, - "grad_norm": 1.046875, - "learning_rate": 0.00013009985734664764, - "loss": 1.76646061, - "memory(GiB)": 15.58, + "acc": 0.66042156, + "epoch": 1.0176125244618395, + "grad_norm": 0.76171875, + "learning_rate": 0.00012453564021679692, + "loss": 1.37123928, + "memory(GiB)": 20.18, "step": 260, - "train_speed(iter/s)": 0.141165 + "train_speed(iter/s)": 0.074295 }, { - "acc": 0.60632119, - "epoch": 0.7181571815718157, - "grad_norm": 1.078125, - "learning_rate": 0.00012956490727532096, - "loss": 1.59940271, - "memory(GiB)": 15.95, + "acc": 0.67253222, + "epoch": 1.0371819960861057, + "grad_norm": 0.76953125, + "learning_rate": 0.00012381711421770455, + "loss": 1.28407507, + "memory(GiB)": 19.7, "step": 265, - "train_speed(iter/s)": 0.141502 + "train_speed(iter/s)": 0.074448 }, { - "acc": 0.59642992, - "epoch": 0.7317073170731707, - "grad_norm": 1.203125, - "learning_rate": 0.00012902995720399428, - "loss": 1.6549921, - "memory(GiB)": 15.95, + "acc": 0.66850777, + "epoch": 1.0567514677103718, + "grad_norm": 0.98046875, + "learning_rate": 0.0001230844464705507, + "loss": 1.27961807, + "memory(GiB)": 19.58, "step": 270, - "train_speed(iter/s)": 0.141855 + "train_speed(iter/s)": 0.07459 }, { - "acc": 0.60280037, - "epoch": 0.7452574525745257, - "grad_norm": 1.0625, - "learning_rate": 0.0001284950071326676, - "loss": 1.63179569, - "memory(GiB)": 15.95, + "acc": 0.67196817, + "epoch": 1.076320939334638, + "grad_norm": 0.9140625, + "learning_rate": 0.00012233782950150186, + "loss": 1.28494987, + "memory(GiB)": 19.61, "step": 275, - "train_speed(iter/s)": 0.142164 + "train_speed(iter/s)": 0.074728 }, { - "acc": 0.594034, - "epoch": 0.7588075880758808, - "grad_norm": 1.1796875, - "learning_rate": 0.00012796005706134094, - "loss": 1.61152973, - "memory(GiB)": 15.95, + "acc": 0.67708378, + "epoch": 1.095890410958904, + "grad_norm": 0.87109375, + "learning_rate": 0.00012157745950221989, + "loss": 1.29551096, + "memory(GiB)": 19.63, "step": 280, - "train_speed(iter/s)": 0.142474 + "train_speed(iter/s)": 0.074881 }, { - "acc": 0.59763761, - "epoch": 0.7723577235772358, + "acc": 0.66973438, + "epoch": 1.1154598825831703, "grad_norm": 1.0859375, - "learning_rate": 0.00012742510699001425, - "loss": 1.66443863, - "memory(GiB)": 15.95, + "learning_rate": 0.0001208035362783079, + "loss": 1.27705774, + "memory(GiB)": 19.49, "step": 285, - "train_speed(iter/s)": 0.142791 + "train_speed(iter/s)": 0.075029 }, { - "acc": 0.58316121, - "epoch": 0.7859078590785907, - "grad_norm": 0.73828125, - "learning_rate": 0.00012689015691868757, - "loss": 1.77028389, - "memory(GiB)": 15.95, + "acc": 0.6750237, + "epoch": 1.1350293542074363, + "grad_norm": 1.0859375, + "learning_rate": 0.00012001626319680648, + "loss": 1.25660419, + "memory(GiB)": 19.55, "step": 290, - "train_speed(iter/s)": 0.143067 + "train_speed(iter/s)": 0.07515 }, { - "acc": 0.60665183, - "epoch": 0.7994579945799458, - "grad_norm": 1.25, - "learning_rate": 0.00012635520684736091, - "loss": 1.64849663, - "memory(GiB)": 15.95, + "acc": 0.624368, + "epoch": 1.1545988258317026, + "grad_norm": 1.1953125, + "learning_rate": 0.00011921584713275411, + "loss": 1.5070508, + "memory(GiB)": 19.52, "step": 295, - "train_speed(iter/s)": 0.143366 + "train_speed(iter/s)": 0.075278 }, { - "acc": 0.55526123, - "epoch": 0.8130081300813008, - "grad_norm": 1.203125, - "learning_rate": 0.00012582025677603423, - "loss": 1.86367321, - "memory(GiB)": 15.95, + "acc": 0.66252189, + "epoch": 1.1741682974559686, + "grad_norm": 0.828125, + "learning_rate": 0.0001184024984148257, + "loss": 1.32014723, + "memory(GiB)": 19.92, "step": 300, - "train_speed(iter/s)": 0.143649 + "train_speed(iter/s)": 0.075433 }, { - "epoch": 0.8130081300813008, - "eval_acc": 0.6129032258064516, - "eval_loss": 1.5865528583526611, - "eval_runtime": 44.0975, - "eval_samples_per_second": 0.862, - "eval_steps_per_second": 0.862, + "epoch": 1.1741682974559686, + "eval_acc": 0.6282796543597801, + "eval_loss": 1.5250990390777588, + "eval_runtime": 70.3986, + "eval_samples_per_second": 1.08, + "eval_steps_per_second": 0.54, "step": 300 }, { - "acc": 0.6019598, - "epoch": 0.8265582655826558, - "grad_norm": 1.1328125, - "learning_rate": 0.00012528530670470755, - "loss": 1.71853142, - "memory(GiB)": 15.95, + "acc": 0.67028356, + "epoch": 1.1937377690802349, + "grad_norm": 1.7109375, + "learning_rate": 0.00011757643077006372, + "loss": 1.28037386, + "memory(GiB)": 22.6, "step": 305, - "train_speed(iter/s)": 0.14098 + "train_speed(iter/s)": 0.074243 }, { - "acc": 0.6009192, - "epoch": 0.8401084010840109, - "grad_norm": 1.359375, - "learning_rate": 0.00012475035663338086, - "loss": 1.72751312, - "memory(GiB)": 16.34, + "acc": 0.655305, + "epoch": 1.213307240704501, + "grad_norm": 1.1015625, + "learning_rate": 0.00011673786126771617, + "loss": 1.31057158, + "memory(GiB)": 19.72, "step": 310, - "train_speed(iter/s)": 0.141284 + "train_speed(iter/s)": 0.074392 }, { - "acc": 0.58907671, - "epoch": 0.8536585365853658, - "grad_norm": 0.98046875, - "learning_rate": 0.0001242154065620542, - "loss": 1.71860523, - "memory(GiB)": 16.34, + "acc": 0.66528535, + "epoch": 1.2328767123287672, + "grad_norm": 1.6171875, + "learning_rate": 0.0001158870102621965, + "loss": 1.29698696, + "memory(GiB)": 19.08, "step": 315, - "train_speed(iter/s)": 0.141562 + "train_speed(iter/s)": 0.074534 }, { - "acc": 0.62246222, - "epoch": 0.8672086720867209, - "grad_norm": 1.0234375, - "learning_rate": 0.00012368045649072752, - "loss": 1.58847914, - "memory(GiB)": 16.34, + "acc": 0.66950455, + "epoch": 1.2524461839530332, + "grad_norm": 1.2421875, + "learning_rate": 0.00011502410133517998, + "loss": 1.27706356, + "memory(GiB)": 19.87, "step": 320, - "train_speed(iter/s)": 0.141848 + "train_speed(iter/s)": 0.074667 }, { - "acc": 0.61927052, - "epoch": 0.8807588075880759, - "grad_norm": 1.59375, - "learning_rate": 0.00012314550641940084, - "loss": 1.54207745, - "memory(GiB)": 16.34, + "acc": 0.65843534, + "epoch": 1.2720156555772995, + "grad_norm": 1.2265625, + "learning_rate": 0.0001141493612368524, + "loss": 1.30308371, + "memory(GiB)": 19.87, "step": 325, - "train_speed(iter/s)": 0.142133 + "train_speed(iter/s)": 0.0748 }, { - "acc": 0.60672359, - "epoch": 0.8943089430894309, - "grad_norm": 2.140625, - "learning_rate": 0.00012261055634807416, - "loss": 1.71789646, - "memory(GiB)": 16.34, + "acc": 0.66441913, + "epoch": 1.2915851272015655, + "grad_norm": 1.2578125, + "learning_rate": 0.00011326301982632583, + "loss": 1.26109972, + "memory(GiB)": 19.09, "step": 330, - "train_speed(iter/s)": 0.142392 + "train_speed(iter/s)": 0.074935 }, { - "acc": 0.63249903, - "epoch": 0.907859078590786, - "grad_norm": 1.1328125, - "learning_rate": 0.0001220756062767475, - "loss": 1.46934195, - "memory(GiB)": 16.34, + "acc": 0.68711085, + "epoch": 1.3111545988258317, + "grad_norm": 0.95703125, + "learning_rate": 0.00011236531001123771, + "loss": 1.19278584, + "memory(GiB)": 19.73, "step": 335, - "train_speed(iter/s)": 0.142658 + "train_speed(iter/s)": 0.075053 }, { - "acc": 0.60973873, - "epoch": 0.9214092140921409, - "grad_norm": 0.84375, - "learning_rate": 0.00012154065620542082, - "loss": 1.67951736, - "memory(GiB)": 16.34, + "acc": 0.66676803, + "epoch": 1.3307240704500978, + "grad_norm": 1.96875, + "learning_rate": 0.0001114564676865486, + "loss": 1.3068346, + "memory(GiB)": 19.84, "step": 340, - "train_speed(iter/s)": 0.142914 + "train_speed(iter/s)": 0.075151 }, { - "acc": 0.58657136, - "epoch": 0.9349593495934959, - "grad_norm": 1.2734375, - "learning_rate": 0.00012100570613409413, - "loss": 1.68692303, - "memory(GiB)": 16.34, + "acc": 0.66865935, + "epoch": 1.350293542074364, + "grad_norm": 1.2421875, + "learning_rate": 0.00011053673167255516, + "loss": 1.30573978, + "memory(GiB)": 19.66, "step": 345, - "train_speed(iter/s)": 0.143167 + "train_speed(iter/s)": 0.075271 }, { - "acc": 0.59965162, - "epoch": 0.948509485094851, - "grad_norm": 1.421875, - "learning_rate": 0.00012047075606276746, - "loss": 1.71949959, - "memory(GiB)": 16.34, + "acc": 0.66606102, + "epoch": 1.36986301369863, + "grad_norm": 0.76171875, + "learning_rate": 0.00010960634365213437, + "loss": 1.26872787, + "memory(GiB)": 19.73, "step": 350, - "train_speed(iter/s)": 0.143387 + "train_speed(iter/s)": 0.075377 }, { - "epoch": 0.948509485094851, - "eval_acc": 0.6171254855598717, - "eval_loss": 1.562721610069275, - "eval_runtime": 44.0566, - "eval_samples_per_second": 0.863, - "eval_steps_per_second": 0.863, + "epoch": 1.36986301369863, + "eval_acc": 0.6315003927729772, + "eval_loss": 1.5066882371902466, + "eval_runtime": 72.5685, + "eval_samples_per_second": 1.047, + "eval_steps_per_second": 0.524, "step": 350 }, { - "acc": 0.59322553, - "epoch": 0.962059620596206, - "grad_norm": 1.3046875, - "learning_rate": 0.0001199358059914408, - "loss": 1.66712799, - "memory(GiB)": 16.34, + "acc": 0.67307239, + "epoch": 1.3894324853228963, + "grad_norm": 1.1796875, + "learning_rate": 0.0001086655481072354, + "loss": 1.27917318, + "memory(GiB)": 22.92, "step": 355, - "train_speed(iter/s)": 0.14111 + "train_speed(iter/s)": 0.074318 }, { - "acc": 0.59898105, - "epoch": 0.975609756097561, - "grad_norm": 1.265625, - "learning_rate": 0.00011940085592011411, - "loss": 1.66754684, - "memory(GiB)": 16.34, + "acc": 0.65870218, + "epoch": 1.4090019569471623, + "grad_norm": 3.609375, + "learning_rate": 0.00010771459225463617, + "loss": 1.33731461, + "memory(GiB)": 19.67, "step": 360, - "train_speed(iter/s)": 0.141373 + "train_speed(iter/s)": 0.074416 }, { - "acc": 0.60211391, - "epoch": 0.989159891598916, - "grad_norm": 1.359375, - "learning_rate": 0.00011886590584878744, - "loss": 1.67542057, - "memory(GiB)": 16.34, + "acc": 0.68150563, + "epoch": 1.4285714285714286, + "grad_norm": 0.9296875, + "learning_rate": 0.00010675372598098113, + "loss": 1.20515957, + "memory(GiB)": 19.99, "step": 365, - "train_speed(iter/s)": 0.141638 + "train_speed(iter/s)": 0.07451 }, { - "acc": 0.63076615, - "epoch": 1.002710027100271, - "grad_norm": 0.8515625, - "learning_rate": 0.00011833095577746076, - "loss": 1.51196938, - "memory(GiB)": 16.34, + "acc": 0.66793504, + "epoch": 1.4481409001956946, + "grad_norm": 1.03125, + "learning_rate": 0.00010578320177711743, + "loss": 1.31133595, + "memory(GiB)": 19.9, "step": 370, - "train_speed(iter/s)": 0.141355 + "train_speed(iter/s)": 0.074613 }, { - "acc": 0.63542643, - "epoch": 1.016260162601626, - "grad_norm": 0.828125, - "learning_rate": 0.00011779600570613407, - "loss": 1.38132334, - "memory(GiB)": 16.34, + "acc": 0.66840873, + "epoch": 1.467710371819961, + "grad_norm": 0.9453125, + "learning_rate": 0.00010480327467174705, + "loss": 1.27730675, + "memory(GiB)": 19.91, "step": 375, - "train_speed(iter/s)": 0.141603 + "train_speed(iter/s)": 0.074709 }, { - "acc": 0.66358423, - "epoch": 1.029810298102981, - "grad_norm": 0.859375, - "learning_rate": 0.00011726105563480742, - "loss": 1.41512909, - "memory(GiB)": 16.34, + "acc": 0.6621439, + "epoch": 1.487279843444227, + "grad_norm": 0.7890625, + "learning_rate": 0.00010381420216441152, + "loss": 1.29670372, + "memory(GiB)": 19.65, "step": 380, - "train_speed(iter/s)": 0.141845 + "train_speed(iter/s)": 0.074824 }, { - "acc": 0.63692493, - "epoch": 1.043360433604336, - "grad_norm": 1.0859375, - "learning_rate": 0.00011672610556348073, - "loss": 1.38078823, - "memory(GiB)": 16.34, + "acc": 0.66805882, + "epoch": 1.5068493150684932, + "grad_norm": 0.8203125, + "learning_rate": 0.00010281624415782804, + "loss": 1.23922901, + "memory(GiB)": 19.77, "step": 385, - "train_speed(iter/s)": 0.142082 + "train_speed(iter/s)": 0.074927 }, { - "acc": 0.6641499, - "epoch": 1.056910569105691, - "grad_norm": 1.125, - "learning_rate": 0.00011619115549215406, - "loss": 1.3458046, - "memory(GiB)": 16.34, + "acc": 0.66435666, + "epoch": 1.5264187866927594, + "grad_norm": 0.82421875, + "learning_rate": 0.0001018096628895935, + "loss": 1.27945633, + "memory(GiB)": 19.79, "step": 390, - "train_speed(iter/s)": 0.142305 + "train_speed(iter/s)": 0.075033 }, { - "acc": 0.67276783, - "epoch": 1.070460704607046, - "grad_norm": 0.98828125, - "learning_rate": 0.00011565620542082738, - "loss": 1.2076004, - "memory(GiB)": 16.34, + "acc": 0.68444743, + "epoch": 1.5459882583170255, + "grad_norm": 0.98046875, + "learning_rate": 0.00010079472286327533, + "loss": 1.2325819, + "memory(GiB)": 19.55, "step": 395, - "train_speed(iter/s)": 0.142533 + "train_speed(iter/s)": 0.075133 }, { - "acc": 0.65821433, - "epoch": 1.084010840108401, - "grad_norm": 1.15625, - "learning_rate": 0.0001151212553495007, - "loss": 1.38641891, - "memory(GiB)": 16.34, + "acc": 0.68633671, + "epoch": 1.5655577299412915, + "grad_norm": 1.171875, + "learning_rate": 9.977169077890672e-05, + "loss": 1.26248102, + "memory(GiB)": 19.79, "step": 400, - "train_speed(iter/s)": 0.142742 + "train_speed(iter/s)": 0.075233 }, { - "epoch": 1.084010840108401, - "eval_acc": 0.6162810336091876, - "eval_loss": 1.5887514352798462, - "eval_runtime": 44.1624, - "eval_samples_per_second": 0.86, - "eval_steps_per_second": 0.86, + "epoch": 1.5655577299412915, + "eval_acc": 0.6297721916732129, + "eval_loss": 1.5114485025405884, + "eval_runtime": 70.7985, + "eval_samples_per_second": 1.073, + "eval_steps_per_second": 0.537, "step": 400 }, { - "acc": 0.63352804, - "epoch": 1.0975609756097562, - "grad_norm": 1.2421875, - "learning_rate": 0.00011458630527817403, - "loss": 1.44696302, - "memory(GiB)": 16.34, + "acc": 0.67859097, + "epoch": 1.5851272015655578, + "grad_norm": 1.046875, + "learning_rate": 9.874083546290482e-05, + "loss": 1.2065486, + "memory(GiB)": 22.72, "step": 405, - "train_speed(iter/s)": 0.140759 + "train_speed(iter/s)": 0.074347 }, { - "acc": 0.61955671, - "epoch": 1.1111111111111112, - "grad_norm": 1.5546875, - "learning_rate": 0.00011405135520684734, - "loss": 1.48915672, - "memory(GiB)": 16.34, + "acc": 0.66178751, + "epoch": 1.604696673189824, + "grad_norm": 0.96484375, + "learning_rate": 9.770242779743008e-05, + "loss": 1.30969448, + "memory(GiB)": 20.13, "step": 410, - "train_speed(iter/s)": 0.140987 + "train_speed(iter/s)": 0.074453 }, { - "acc": 0.69259667, - "epoch": 1.1246612466124661, - "grad_norm": 0.796875, - "learning_rate": 0.00011351640513552069, - "loss": 1.22069292, - "memory(GiB)": 16.34, + "acc": 0.65872512, + "epoch": 1.62426614481409, + "grad_norm": 0.74609375, + "learning_rate": 9.665674064920533e-05, + "loss": 1.27483397, + "memory(GiB)": 20.17, "step": 415, - "train_speed(iter/s)": 0.141191 + "train_speed(iter/s)": 0.074534 }, { - "acc": 0.67346158, - "epoch": 1.1382113821138211, - "grad_norm": 1.171875, - "learning_rate": 0.000112981455064194, - "loss": 1.36795778, - "memory(GiB)": 16.34, + "acc": 0.66567349, + "epoch": 1.643835616438356, + "grad_norm": 0.87109375, + "learning_rate": 9.560404879781353e-05, + "loss": 1.31585007, + "memory(GiB)": 20.07, "step": 420, - "train_speed(iter/s)": 0.141417 + "train_speed(iter/s)": 0.074639 }, { - "acc": 0.67346702, - "epoch": 1.151761517615176, - "grad_norm": 1.0390625, - "learning_rate": 0.00011244650499286732, - "loss": 1.27111759, - "memory(GiB)": 16.34, + "acc": 0.66216898, + "epoch": 1.6634050880626223, + "grad_norm": 0.85546875, + "learning_rate": 9.454462886349281e-05, + "loss": 1.32738457, + "memory(GiB)": 19.43, "step": 425, - "train_speed(iter/s)": 0.141634 + "train_speed(iter/s)": 0.074732 }, { - "acc": 0.61625972, - "epoch": 1.165311653116531, - "grad_norm": 1.2578125, - "learning_rate": 0.00011191155492154065, - "loss": 1.42474003, - "memory(GiB)": 16.34, + "acc": 0.6608973, + "epoch": 1.6829745596868886, + "grad_norm": 1.1328125, + "learning_rate": 9.347875923444772e-05, + "loss": 1.2792593, + "memory(GiB)": 20.05, "step": 430, - "train_speed(iter/s)": 0.141853 + "train_speed(iter/s)": 0.074827 }, { - "acc": 0.65880041, - "epoch": 1.1788617886178863, - "grad_norm": 1.2578125, - "learning_rate": 0.00011137660485021397, - "loss": 1.33981524, - "memory(GiB)": 16.34, + "acc": 0.65830297, + "epoch": 1.7025440313111546, + "grad_norm": 0.94921875, + "learning_rate": 9.240671999369607e-05, + "loss": 1.34132614, + "memory(GiB)": 19.82, "step": 435, - "train_speed(iter/s)": 0.142048 + "train_speed(iter/s)": 0.074914 }, { - "acc": 0.63503499, - "epoch": 1.1924119241192412, - "grad_norm": 1.2265625, - "learning_rate": 0.00011084165477888728, - "loss": 1.44122829, - "memory(GiB)": 16.34, + "acc": 0.68926673, + "epoch": 1.7221135029354206, + "grad_norm": 0.76953125, + "learning_rate": 9.132879284547038e-05, + "loss": 1.15266266, + "memory(GiB)": 19.28, "step": 440, - "train_speed(iter/s)": 0.142254 + "train_speed(iter/s)": 0.074997 }, { - "acc": 0.61518383, - "epoch": 1.2059620596205962, - "grad_norm": 1.4765625, - "learning_rate": 0.00011030670470756061, - "loss": 1.54818697, - "memory(GiB)": 16.34, + "acc": 0.65699558, + "epoch": 1.741682974559687, + "grad_norm": 0.96484375, + "learning_rate": 9.024526104119312e-05, + "loss": 1.32417459, + "memory(GiB)": 19.29, "step": 445, - "train_speed(iter/s)": 0.142446 + "train_speed(iter/s)": 0.075079 }, { - "acc": 0.68010144, - "epoch": 1.2195121951219512, - "grad_norm": 1.3828125, - "learning_rate": 0.00010977175463623394, - "loss": 1.28560524, - "memory(GiB)": 16.34, + "acc": 0.68860197, + "epoch": 1.7612524461839532, + "grad_norm": 0.8203125, + "learning_rate": 8.91564093050458e-05, + "loss": 1.20134068, + "memory(GiB)": 19.33, "step": 450, - "train_speed(iter/s)": 0.142639 + "train_speed(iter/s)": 0.07515 }, { - "epoch": 1.2195121951219512, - "eval_acc": 0.6230366492146597, - "eval_loss": 1.5483555793762207, - "eval_runtime": 44.0588, - "eval_samples_per_second": 0.862, - "eval_steps_per_second": 0.862, + "epoch": 1.7612524461839532, + "eval_acc": 0.6351924587588373, + "eval_loss": 1.4908838272094727, + "eval_runtime": 71.5161, + "eval_samples_per_second": 1.063, + "eval_steps_per_second": 0.531, "step": 450 }, { - "acc": 0.64718337, - "epoch": 1.2330623306233062, - "grad_norm": 1.4375, - "learning_rate": 0.00010923680456490727, - "loss": 1.3778326, - "memory(GiB)": 16.34, + "acc": 0.65404687, + "epoch": 1.7808219178082192, + "grad_norm": 1.0078125, + "learning_rate": 8.806252375915052e-05, + "loss": 1.31502724, + "memory(GiB)": 19.13, "step": 455, - "train_speed(iter/s)": 0.140879 + "train_speed(iter/s)": 0.074358 }, { - "acc": 0.68637676, - "epoch": 1.2466124661246614, - "grad_norm": 1.453125, - "learning_rate": 0.00010870185449358059, - "loss": 1.21860657, - "memory(GiB)": 16.34, + "acc": 0.69379678, + "epoch": 1.8003913894324852, + "grad_norm": 1.1015625, + "learning_rate": 8.696389184838471e-05, + "loss": 1.1870966, + "memory(GiB)": 20.18, "step": 460, - "train_speed(iter/s)": 0.141097 + "train_speed(iter/s)": 0.074437 }, { - "acc": 0.63144946, - "epoch": 1.2601626016260163, - "grad_norm": 1.0703125, - "learning_rate": 0.00010816690442225392, - "loss": 1.38577023, - "memory(GiB)": 16.34, + "acc": 0.67447538, + "epoch": 1.8199608610567515, + "grad_norm": 1.2890625, + "learning_rate": 8.586080226484789e-05, + "loss": 1.19511604, + "memory(GiB)": 20.09, "step": 465, - "train_speed(iter/s)": 0.141294 + "train_speed(iter/s)": 0.074531 }, { - "acc": 0.66895781, - "epoch": 1.2737127371273713, - "grad_norm": 1.296875, - "learning_rate": 0.00010763195435092724, - "loss": 1.29167385, - "memory(GiB)": 16.34, + "acc": 0.67230067, + "epoch": 1.8395303326810177, + "grad_norm": 1.0390625, + "learning_rate": 8.475354487200092e-05, + "loss": 1.30591021, + "memory(GiB)": 19.29, "step": 470, - "train_speed(iter/s)": 0.141488 + "train_speed(iter/s)": 0.074608 }, { - "acc": 0.6553885, - "epoch": 1.2872628726287263, - "grad_norm": 1.1484375, - "learning_rate": 0.00010709700427960055, - "loss": 1.35011473, - "memory(GiB)": 16.34, + "acc": 0.65006552, + "epoch": 1.8590998043052838, + "grad_norm": 3.21875, + "learning_rate": 8.364241062849732e-05, + "loss": 1.35613279, + "memory(GiB)": 19.51, "step": 475, - "train_speed(iter/s)": 0.141687 + "train_speed(iter/s)": 0.07469 }, { - "acc": 0.65576148, - "epoch": 1.3008130081300813, - "grad_norm": 1.1796875, - "learning_rate": 0.00010656205420827388, - "loss": 1.39116755, - "memory(GiB)": 16.34, + "acc": 0.66248426, + "epoch": 1.8786692759295498, + "grad_norm": 1.0703125, + "learning_rate": 8.252769151172682e-05, + "loss": 1.34706697, + "memory(GiB)": 19.16, "step": 480, - "train_speed(iter/s)": 0.141863 + "train_speed(iter/s)": 0.074779 }, { - "acc": 0.62025633, - "epoch": 1.3143631436314362, - "grad_norm": 1.375, - "learning_rate": 0.00010602710413694721, - "loss": 1.55190315, - "memory(GiB)": 16.34, + "acc": 0.66462736, + "epoch": 1.898238747553816, + "grad_norm": 0.8515625, + "learning_rate": 8.140968044109134e-05, + "loss": 1.31343336, + "memory(GiB)": 19.17, "step": 485, - "train_speed(iter/s)": 0.142058 + "train_speed(iter/s)": 0.07486 }, { - "acc": 0.65667233, - "epoch": 1.3279132791327912, - "grad_norm": 1.1484375, - "learning_rate": 0.00010549215406562054, - "loss": 1.30497828, - "memory(GiB)": 16.34, + "acc": 0.65373287, + "epoch": 1.9178082191780823, + "grad_norm": 1.078125, + "learning_rate": 8.028867120103326e-05, + "loss": 1.31145601, + "memory(GiB)": 19.46, "step": 490, - "train_speed(iter/s)": 0.142237 + "train_speed(iter/s)": 0.074941 }, { - "acc": 0.6458045, - "epoch": 1.3414634146341464, - "grad_norm": 2.421875, - "learning_rate": 0.00010495720399429386, - "loss": 1.40175285, - "memory(GiB)": 16.34, + "acc": 0.6731041, + "epoch": 1.9373776908023483, + "grad_norm": 0.89453125, + "learning_rate": 7.916495836383648e-05, + "loss": 1.24272699, + "memory(GiB)": 19.45, "step": 495, - "train_speed(iter/s)": 0.14241 + "train_speed(iter/s)": 0.075011 }, { - "acc": 0.65004997, - "epoch": 1.3550135501355014, - "grad_norm": 1.265625, - "learning_rate": 0.00010442225392296718, - "loss": 1.31625471, - "memory(GiB)": 16.34, + "acc": 0.66485052, + "epoch": 1.9569471624266144, + "grad_norm": 1.03125, + "learning_rate": 7.80388372122204e-05, + "loss": 1.28164721, + "memory(GiB)": 19.24, "step": 500, - "train_speed(iter/s)": 0.142583 + "train_speed(iter/s)": 0.07509 }, { - "epoch": 1.3550135501355014, - "eval_acc": 0.61932106063165, - "eval_loss": 1.535814642906189, - "eval_runtime": 44.07, - "eval_samples_per_second": 0.862, - "eval_steps_per_second": 0.862, + "epoch": 1.9569471624266144, + "eval_acc": 0.6349567949725059, + "eval_loss": 1.483258843421936, + "eval_runtime": 72.4797, + "eval_samples_per_second": 1.049, + "eval_steps_per_second": 0.524, "step": 500 }, { - "acc": 0.65512385, - "epoch": 1.3685636856368564, - "grad_norm": 0.9375, - "learning_rate": 0.00010388730385164051, - "loss": 1.2989893, - "memory(GiB)": 16.34, + "acc": 0.68325486, + "epoch": 1.9765166340508806, + "grad_norm": 1.2890625, + "learning_rate": 7.691060366174728e-05, + "loss": 1.2257865, + "memory(GiB)": 22.98, "step": 505, - "train_speed(iter/s)": 0.140995 + "train_speed(iter/s)": 0.074371 }, { - "acc": 0.65444102, - "epoch": 1.3821138211382114, - "grad_norm": 1.5625, - "learning_rate": 0.00010335235378031382, - "loss": 1.32140932, - "memory(GiB)": 16.34, + "acc": 0.68977013, + "epoch": 1.9960861056751469, + "grad_norm": 1.0234375, + "learning_rate": 7.578055418306327e-05, + "loss": 1.25723343, + "memory(GiB)": 19.56, "step": 510, - "train_speed(iter/s)": 0.141172 + "train_speed(iter/s)": 0.074471 }, { - "acc": 0.66242504, - "epoch": 1.3956639566395663, - "grad_norm": 0.9140625, - "learning_rate": 0.00010281740370898714, - "loss": 1.34259834, - "memory(GiB)": 16.34, + "acc": 0.72185702, + "epoch": 2.015655577299413, + "grad_norm": 0.7890625, + "learning_rate": 7.464898572399353e-05, + "loss": 1.01715631, + "memory(GiB)": 20.07, "step": 515, - "train_speed(iter/s)": 0.141336 + "train_speed(iter/s)": 0.074591 }, { - "acc": 0.62716827, - "epoch": 1.4092140921409215, - "grad_norm": 1.4296875, - "learning_rate": 0.00010228245363766048, - "loss": 1.46508284, - "memory(GiB)": 16.34, + "acc": 0.71889682, + "epoch": 2.035225048923679, + "grad_norm": 1.0625, + "learning_rate": 7.351619563151208e-05, + "loss": 1.03077154, + "memory(GiB)": 19.92, "step": 520, - "train_speed(iter/s)": 0.141508 + "train_speed(iter/s)": 0.074683 }, { - "acc": 0.61702657, - "epoch": 1.4227642276422765, - "grad_norm": 1.5546875, - "learning_rate": 0.0001017475035663338, - "loss": 1.43188276, - "memory(GiB)": 16.34, + "acc": 0.7505311, + "epoch": 2.0547945205479454, + "grad_norm": 1.9609375, + "learning_rate": 7.238248157360663e-05, + "loss": 0.93218956, + "memory(GiB)": 19.85, "step": 525, - "train_speed(iter/s)": 0.141681 + "train_speed(iter/s)": 0.07477 }, { - "acc": 0.63238263, - "epoch": 1.4363143631436315, - "grad_norm": 1.3671875, - "learning_rate": 0.00010121255349500713, - "loss": 1.52214499, - "memory(GiB)": 16.34, + "acc": 0.7315311, + "epoch": 2.0743639921722115, + "grad_norm": 1.1875, + "learning_rate": 7.124814146105921e-05, + "loss": 0.96330833, + "memory(GiB)": 19.87, "step": 530, - "train_speed(iter/s)": 0.141854 + "train_speed(iter/s)": 0.074853 }, { - "acc": 0.6323854, - "epoch": 1.4498644986449865, - "grad_norm": 1.375, - "learning_rate": 0.00010067760342368045, - "loss": 1.38910236, - "memory(GiB)": 16.34, + "acc": 0.75555606, + "epoch": 2.0939334637964775, + "grad_norm": 1.3515625, + "learning_rate": 7.011347336916277e-05, + "loss": 0.86877937, + "memory(GiB)": 18.46, "step": 535, - "train_speed(iter/s)": 0.142019 + "train_speed(iter/s)": 0.074938 }, { - "acc": 0.60772176, - "epoch": 1.4634146341463414, - "grad_norm": 1.21875, - "learning_rate": 0.00010014265335235376, - "loss": 1.46768923, - "memory(GiB)": 16.34, + "acc": 0.74034052, + "epoch": 2.1135029354207435, + "grad_norm": 1.546875, + "learning_rate": 6.897877545939475e-05, + "loss": 0.90922012, + "memory(GiB)": 19.89, "step": 540, - "train_speed(iter/s)": 0.14218 + "train_speed(iter/s)": 0.075027 }, { - "acc": 0.67315965, - "epoch": 1.4769647696476964, - "grad_norm": 1.34375, - "learning_rate": 9.96077032810271e-05, - "loss": 1.25136938, - "memory(GiB)": 16.34, + "acc": 0.72400937, + "epoch": 2.1330724070450096, + "grad_norm": 1.90625, + "learning_rate": 6.784434590106808e-05, + "loss": 0.98424711, + "memory(GiB)": 19.11, "step": 545, - "train_speed(iter/s)": 0.142338 + "train_speed(iter/s)": 0.075114 }, { - "acc": 0.66408758, - "epoch": 1.4905149051490514, - "grad_norm": 2.046875, - "learning_rate": 9.907275320970041e-05, - "loss": 1.25655928, - "memory(GiB)": 16.34, + "acc": 0.77706275, + "epoch": 2.152641878669276, + "grad_norm": 1.359375, + "learning_rate": 6.671048279297972e-05, + "loss": 0.80820856, + "memory(GiB)": 19.86, "step": 550, - "train_speed(iter/s)": 0.142495 + "train_speed(iter/s)": 0.075193 }, { - "epoch": 1.4905149051490514, - "eval_acc": 0.6254011146765749, - "eval_loss": 1.5346177816390991, - "eval_runtime": 44.1805, - "eval_samples_per_second": 0.86, - "eval_steps_per_second": 0.86, + "epoch": 2.152641878669276, + "eval_acc": 0.6260015710919089, + "eval_loss": 1.6081812381744385, + "eval_runtime": 68.6973, + "eval_samples_per_second": 1.106, + "eval_steps_per_second": 0.553, "step": 550 }, { - "acc": 0.68655195, - "epoch": 1.5040650406504064, - "grad_norm": 1.125, - "learning_rate": 9.853780313837375e-05, - "loss": 1.08906736, - "memory(GiB)": 16.34, + "acc": 0.75351696, + "epoch": 2.172211350293542, + "grad_norm": 2.015625, + "learning_rate": 6.55774840850782e-05, + "loss": 0.86192131, + "memory(GiB)": 22.21, "step": 555, - "train_speed(iter/s)": 0.141037 + "train_speed(iter/s)": 0.074578 }, { - "acc": 0.65898876, - "epoch": 1.5176151761517616, - "grad_norm": 1.4296875, - "learning_rate": 9.800285306704707e-05, - "loss": 1.26696377, - "memory(GiB)": 16.34, + "acc": 0.74249997, + "epoch": 2.191780821917808, + "grad_norm": 1.4609375, + "learning_rate": 6.444564750017003e-05, + "loss": 0.91982813, + "memory(GiB)": 19.87, "step": 560, - "train_speed(iter/s)": 0.14119 + "train_speed(iter/s)": 0.074665 }, { - "acc": 0.63674688, - "epoch": 1.5311653116531165, - "grad_norm": 1.5390625, - "learning_rate": 9.746790299572039e-05, - "loss": 1.38277016, - "memory(GiB)": 16.34, + "acc": 0.73636398, + "epoch": 2.2113502935420746, + "grad_norm": 1.9375, + "learning_rate": 6.331527045568573e-05, + "loss": 0.93448582, + "memory(GiB)": 19.33, "step": 565, - "train_speed(iter/s)": 0.141349 + "train_speed(iter/s)": 0.074752 }, { - "acc": 0.67204466, - "epoch": 1.5447154471544715, - "grad_norm": 1.234375, - "learning_rate": 9.693295292439372e-05, - "loss": 1.25243311, - "memory(GiB)": 16.34, + "acc": 0.74081583, + "epoch": 2.2309197651663406, + "grad_norm": 2.21875, + "learning_rate": 6.218664998552634e-05, + "loss": 0.94956303, + "memory(GiB)": 19.8, "step": 570, - "train_speed(iter/s)": 0.141502 + "train_speed(iter/s)": 0.074842 }, { - "acc": 0.67878027, - "epoch": 1.5582655826558267, - "grad_norm": 1.3828125, - "learning_rate": 9.639800285306703e-05, - "loss": 1.18253031, - "memory(GiB)": 16.34, + "acc": 0.74573116, + "epoch": 2.2504892367906066, + "grad_norm": 2.546875, + "learning_rate": 6.106008266201046e-05, + "loss": 0.88486786, + "memory(GiB)": 19.92, "step": 575, - "train_speed(iter/s)": 0.141663 + "train_speed(iter/s)": 0.074925 }, { - "acc": 0.64645357, - "epoch": 1.5718157181571817, - "grad_norm": 1.671875, - "learning_rate": 9.586305278174036e-05, - "loss": 1.39789114, - "memory(GiB)": 16.34, + "acc": 0.75495067, + "epoch": 2.2700587084148727, + "grad_norm": 2.09375, + "learning_rate": 5.9935864517942844e-05, + "loss": 0.84776802, + "memory(GiB)": 19.89, "step": 580, - "train_speed(iter/s)": 0.141812 + "train_speed(iter/s)": 0.075 }, { - "acc": 0.63896599, - "epoch": 1.5853658536585367, - "grad_norm": 1.5546875, - "learning_rate": 9.532810271041368e-05, - "loss": 1.37434454, - "memory(GiB)": 16.34, + "acc": 0.74743519, + "epoch": 2.2896281800391387, + "grad_norm": 1.5859375, + "learning_rate": 5.881429096882449e-05, + "loss": 0.92330503, + "memory(GiB)": 19.03, "step": 585, - "train_speed(iter/s)": 0.141961 + "train_speed(iter/s)": 0.075076 }, { - "acc": 0.67930498, - "epoch": 1.5989159891598916, - "grad_norm": 1.3671875, - "learning_rate": 9.479315263908701e-05, - "loss": 1.24675446, - "memory(GiB)": 16.34, + "acc": 0.74913769, + "epoch": 2.309197651663405, + "grad_norm": 1.6640625, + "learning_rate": 5.769565673522515e-05, + "loss": 0.92942295, + "memory(GiB)": 20.04, "step": 590, - "train_speed(iter/s)": 0.142113 + "train_speed(iter/s)": 0.075149 }, { - "acc": 0.66582651, - "epoch": 1.6124661246612466, - "grad_norm": 1.1171875, - "learning_rate": 9.425820256776034e-05, - "loss": 1.33937092, - "memory(GiB)": 16.34, + "acc": 0.74875064, + "epoch": 2.328767123287671, + "grad_norm": 1.25, + "learning_rate": 5.658025576533832e-05, + "loss": 0.90142069, + "memory(GiB)": 19.96, "step": 595, - "train_speed(iter/s)": 0.142256 + "train_speed(iter/s)": 0.075215 }, { - "acc": 0.65553112, - "epoch": 1.6260162601626016, - "grad_norm": 1.5859375, - "learning_rate": 9.372325249643366e-05, - "loss": 1.25127707, - "memory(GiB)": 16.34, + "acc": 0.74648356, + "epoch": 2.3483365949119372, + "grad_norm": 1.65625, + "learning_rate": 5.546838115773929e-05, + "loss": 0.91528139, + "memory(GiB)": 19.84, "step": 600, - "train_speed(iter/s)": 0.142394 + "train_speed(iter/s)": 0.075292 }, { - "epoch": 1.6260162601626016, - "eval_acc": 0.6233744299949333, - "eval_loss": 1.5426762104034424, - "eval_runtime": 44.3564, - "eval_samples_per_second": 0.857, - "eval_steps_per_second": 0.857, + "epoch": 2.3483365949119372, + "eval_acc": 0.6284367635506677, + "eval_loss": 1.593437910079956, + "eval_runtime": 68.9856, + "eval_samples_per_second": 1.102, + "eval_steps_per_second": 0.551, "step": 600 }, { - "acc": 0.64805899, - "epoch": 1.6395663956639566, - "grad_norm": 1.078125, - "learning_rate": 9.318830242510699e-05, - "loss": 1.30654621, - "memory(GiB)": 16.34, + "acc": 0.75246172, + "epoch": 2.3679060665362037, + "grad_norm": 1.2109375, + "learning_rate": 5.4360325084366416e-05, + "loss": 0.87402363, + "memory(GiB)": 22.69, "step": 605, - "train_speed(iter/s)": 0.141073 + "train_speed(iter/s)": 0.074706 }, { - "acc": 0.67441335, - "epoch": 1.6531165311653115, - "grad_norm": 1.34375, - "learning_rate": 9.26533523537803e-05, - "loss": 1.20785751, - "memory(GiB)": 16.34, + "acc": 0.74078665, + "epoch": 2.3874755381604698, + "grad_norm": 1.0390625, + "learning_rate": 5.3256378713745815e-05, + "loss": 0.91142588, + "memory(GiB)": 20.15, "step": 610, - "train_speed(iter/s)": 0.141227 + "train_speed(iter/s)": 0.074788 }, { - "acc": 0.64989614, - "epoch": 1.6666666666666665, - "grad_norm": 2.25, - "learning_rate": 9.211840228245362e-05, - "loss": 1.41231976, - "memory(GiB)": 16.34, + "acc": 0.75772052, + "epoch": 2.407045009784736, + "grad_norm": 2.03125, + "learning_rate": 5.21568321344799e-05, + "loss": 0.85517597, + "memory(GiB)": 19.37, "step": 615, - "train_speed(iter/s)": 0.141374 + "train_speed(iter/s)": 0.074857 }, { - "acc": 0.65118213, - "epoch": 1.6802168021680217, - "grad_norm": 1.34375, - "learning_rate": 9.158345221112695e-05, - "loss": 1.33156195, - "memory(GiB)": 16.34, + "acc": 0.75341692, + "epoch": 2.426614481409002, + "grad_norm": 1.40625, + "learning_rate": 5.10619742790194e-05, + "loss": 0.87981377, + "memory(GiB)": 18.91, "step": 620, - "train_speed(iter/s)": 0.141527 + "train_speed(iter/s)": 0.074925 }, { - "acc": 0.68491917, - "epoch": 1.6937669376693767, - "grad_norm": 2.078125, - "learning_rate": 9.104850213980028e-05, - "loss": 1.15608921, - "memory(GiB)": 16.34, + "acc": 0.76221485, + "epoch": 2.446183953033268, + "grad_norm": 5.5625, + "learning_rate": 4.9972092847739603e-05, + "loss": 0.89623175, + "memory(GiB)": 20.27, "step": 625, - "train_speed(iter/s)": 0.141656 + "train_speed(iter/s)": 0.074994 }, { - "acc": 0.6636075, - "epoch": 1.7073170731707317, - "grad_norm": 1.2890625, - "learning_rate": 9.051355206847361e-05, - "loss": 1.3377409, - "memory(GiB)": 16.34, + "acc": 0.74322577, + "epoch": 2.4657534246575343, + "grad_norm": 1.6796875, + "learning_rate": 4.8887474233339963e-05, + "loss": 0.89493027, + "memory(GiB)": 19.38, "step": 630, - "train_speed(iter/s)": 0.141795 + "train_speed(iter/s)": 0.075068 }, { - "acc": 0.66115265, - "epoch": 1.7208672086720869, - "grad_norm": 1.796875, - "learning_rate": 8.997860199714693e-05, - "loss": 1.41666918, - "memory(GiB)": 16.34, + "acc": 0.74455509, + "epoch": 2.4853228962818004, + "grad_norm": 1.3046875, + "learning_rate": 4.780840344558753e-05, + "loss": 0.92399101, + "memory(GiB)": 19.32, "step": 635, - "train_speed(iter/s)": 0.141928 + "train_speed(iter/s)": 0.075143 }, { - "acc": 0.65316691, - "epoch": 1.7344173441734418, - "grad_norm": 1.875, - "learning_rate": 8.944365192582024e-05, - "loss": 1.37885714, - "memory(GiB)": 16.34, + "acc": 0.75597148, + "epoch": 2.5048923679060664, + "grad_norm": 1.65625, + "learning_rate": 4.673516403642383e-05, + "loss": 0.86396818, + "memory(GiB)": 19.52, "step": 640, - "train_speed(iter/s)": 0.14206 + "train_speed(iter/s)": 0.075214 }, { - "acc": 0.63365035, - "epoch": 1.7479674796747968, - "grad_norm": 2.03125, - "learning_rate": 8.890870185449357e-05, - "loss": 1.43597651, - "memory(GiB)": 16.34, + "acc": 0.75100412, + "epoch": 2.524461839530333, + "grad_norm": 1.5390625, + "learning_rate": 4.5668038025454554e-05, + "loss": 0.89630232, + "memory(GiB)": 19.54, "step": 645, - "train_speed(iter/s)": 0.142195 + "train_speed(iter/s)": 0.07528 }, { - "acc": 0.6461009, - "epoch": 1.7615176151761518, - "grad_norm": 1.390625, - "learning_rate": 8.837375178316689e-05, - "loss": 1.40720987, - "memory(GiB)": 16.34, + "acc": 0.74814, + "epoch": 2.544031311154599, + "grad_norm": 1.7265625, + "learning_rate": 4.460730582584228e-05, + "loss": 0.90660105, + "memory(GiB)": 19.46, "step": 650, - "train_speed(iter/s)": 0.142326 + "train_speed(iter/s)": 0.075343 }, { - "epoch": 1.7615176151761518, - "eval_acc": 0.6223610876541125, - "eval_loss": 1.5279418230056763, - "eval_runtime": 44.3195, - "eval_samples_per_second": 0.857, - "eval_steps_per_second": 0.857, + "epoch": 2.544031311154599, + "eval_acc": 0.6304006284367636, + "eval_loss": 1.6207610368728638, + "eval_runtime": 68.9365, + "eval_samples_per_second": 1.102, + "eval_steps_per_second": 0.551, "step": 650 }, { - "acc": 0.68952079, - "epoch": 1.7750677506775068, - "grad_norm": 1.5859375, - "learning_rate": 8.783880171184023e-05, - "loss": 1.1804883, - "memory(GiB)": 16.34, + "acc": 0.74153934, + "epoch": 2.563600782778865, + "grad_norm": 2.328125, + "learning_rate": 4.3553246170621e-05, + "loss": 0.90404129, + "memory(GiB)": 19.38, "step": 655, - "train_speed(iter/s)": 0.141093 + "train_speed(iter/s)": 0.074813 }, { - "acc": 0.63278737, - "epoch": 1.7886178861788617, - "grad_norm": 1.3125, - "learning_rate": 8.730385164051355e-05, - "loss": 1.45864544, - "memory(GiB)": 16.34, + "acc": 0.76082869, + "epoch": 2.583170254403131, + "grad_norm": 1.5390625, + "learning_rate": 4.2506136039452357e-05, + "loss": 0.90251627, + "memory(GiB)": 20.24, "step": 660, - "train_speed(iter/s)": 0.141227 + "train_speed(iter/s)": 0.074877 }, { - "acc": 0.6499536, - "epoch": 1.8021680216802167, - "grad_norm": 1.0859375, - "learning_rate": 8.676890156918687e-05, - "loss": 1.36629667, - "memory(GiB)": 16.34, + "acc": 0.76424356, + "epoch": 2.602739726027397, + "grad_norm": 1.109375, + "learning_rate": 4.146625058584251e-05, + "loss": 0.85076065, + "memory(GiB)": 19.4, "step": 665, - "train_speed(iter/s)": 0.14135 + "train_speed(iter/s)": 0.07494 }, { - "acc": 0.66262636, - "epoch": 1.8157181571815717, - "grad_norm": 2.8125, - "learning_rate": 8.62339514978602e-05, - "loss": 1.28030796, - "memory(GiB)": 16.34, + "acc": 0.75788155, + "epoch": 2.6223091976516635, + "grad_norm": 1.828125, + "learning_rate": 4.043386306483886e-05, + "loss": 0.8638917, + "memory(GiB)": 18.71, "step": 670, - "train_speed(iter/s)": 0.141486 + "train_speed(iter/s)": 0.075 }, { - "acc": 0.6478013, - "epoch": 1.8292682926829267, - "grad_norm": 1.3203125, - "learning_rate": 8.569900142653351e-05, - "loss": 1.37910089, - "memory(GiB)": 16.34, + "acc": 0.74567804, + "epoch": 2.6418786692759295, + "grad_norm": 1.5078125, + "learning_rate": 3.940924476122573e-05, + "loss": 0.91406345, + "memory(GiB)": 19.53, "step": 675, - "train_speed(iter/s)": 0.141617 + "train_speed(iter/s)": 0.075062 }, { - "acc": 0.65061078, - "epoch": 1.8428184281842819, - "grad_norm": 1.859375, - "learning_rate": 8.516405135520683e-05, - "loss": 1.24110394, - "memory(GiB)": 16.34, + "acc": 0.77229648, + "epoch": 2.6614481409001955, + "grad_norm": 1.3984375, + "learning_rate": 3.839266491823776e-05, + "loss": 0.79556112, + "memory(GiB)": 19.59, "step": 680, - "train_speed(iter/s)": 0.141749 + "train_speed(iter/s)": 0.075125 }, { - "acc": 0.66720443, - "epoch": 1.8563685636856369, - "grad_norm": 1.2734375, - "learning_rate": 8.462910128388016e-05, - "loss": 1.36949673, - "memory(GiB)": 16.34, + "acc": 0.7331708, + "epoch": 2.681017612524462, + "grad_norm": 1.6015625, + "learning_rate": 3.73843906668096e-05, + "loss": 0.95133247, + "memory(GiB)": 19.69, "step": 685, - "train_speed(iter/s)": 0.141882 + "train_speed(iter/s)": 0.075185 }, { - "acc": 0.65051932, - "epoch": 1.8699186991869918, - "grad_norm": 1.796875, - "learning_rate": 8.409415121255349e-05, - "loss": 1.3470686, - "memory(GiB)": 16.34, + "acc": 0.76955137, + "epoch": 2.700587084148728, + "grad_norm": 1.4140625, + "learning_rate": 3.6384686955380996e-05, + "loss": 0.82770052, + "memory(GiB)": 19.53, "step": 690, - "train_speed(iter/s)": 0.142007 + "train_speed(iter/s)": 0.075245 }, { - "acc": 0.64647999, - "epoch": 1.883468834688347, - "grad_norm": 1.0546875, - "learning_rate": 8.355920114122682e-05, - "loss": 1.27561255, - "memory(GiB)": 16.34, + "acc": 0.73245583, + "epoch": 2.720156555772994, + "grad_norm": 1.59375, + "learning_rate": 3.539381648027495e-05, + "loss": 0.93347349, + "memory(GiB)": 19.38, "step": 695, - "train_speed(iter/s)": 0.142124 + "train_speed(iter/s)": 0.075313 }, { - "acc": 0.64771528, - "epoch": 1.897018970189702, - "grad_norm": 2.5, - "learning_rate": 8.302425106990014e-05, - "loss": 1.3874403, - "memory(GiB)": 16.34, + "acc": 0.7664053, + "epoch": 2.73972602739726, + "grad_norm": 1.4296875, + "learning_rate": 3.441203961666818e-05, + "loss": 0.84118309, + "memory(GiB)": 19.55, "step": 700, - "train_speed(iter/s)": 0.142242 + "train_speed(iter/s)": 0.075373 }, { - "epoch": 1.897018970189702, - "eval_acc": 0.6274277993582165, - "eval_loss": 1.5236802101135254, - "eval_runtime": 44.4112, - "eval_samples_per_second": 0.856, - "eval_steps_per_second": 0.856, + "epoch": 2.73972602739726, + "eval_acc": 0.628750981932443, + "eval_loss": 1.5982366800308228, + "eval_runtime": 69.1268, + "eval_samples_per_second": 1.099, + "eval_steps_per_second": 0.55, "step": 700 }, { - "acc": 0.65817127, - "epoch": 1.910569105691057, - "grad_norm": 1.3203125, - "learning_rate": 8.248930099857345e-05, - "loss": 1.29632025, - "memory(GiB)": 16.34, + "acc": 0.74386759, + "epoch": 2.759295499021526, + "grad_norm": 2.21875, + "learning_rate": 3.343961435017094e-05, + "loss": 0.92712116, + "memory(GiB)": 23.1, "step": 705, - "train_speed(iter/s)": 0.141099 + "train_speed(iter/s)": 0.074881 }, { - "acc": 0.64647436, - "epoch": 1.924119241192412, - "grad_norm": 1.234375, - "learning_rate": 8.195435092724678e-05, - "loss": 1.39382238, - "memory(GiB)": 16.34, + "acc": 0.75352135, + "epoch": 2.7788649706457926, + "grad_norm": 1.5625, + "learning_rate": 3.247679620903533e-05, + "loss": 0.90610752, + "memory(GiB)": 19.56, "step": 710, - "train_speed(iter/s)": 0.141217 + "train_speed(iter/s)": 0.074934 }, { - "acc": 0.65741391, - "epoch": 1.937669376693767, - "grad_norm": 0.98828125, - "learning_rate": 8.14194008559201e-05, - "loss": 1.32606802, - "memory(GiB)": 16.34, + "acc": 0.75765467, + "epoch": 2.7984344422700587, + "grad_norm": 4.4375, + "learning_rate": 3.1523838197008956e-05, + "loss": 0.88628139, + "memory(GiB)": 19.44, "step": 715, - "train_speed(iter/s)": 0.141337 + "train_speed(iter/s)": 0.074999 }, { - "acc": 0.65078535, - "epoch": 1.951219512195122, - "grad_norm": 1.6171875, - "learning_rate": 8.088445078459343e-05, - "loss": 1.28092451, - "memory(GiB)": 16.34, + "acc": 0.763375, + "epoch": 2.8180039138943247, + "grad_norm": 1.1640625, + "learning_rate": 3.058099072685204e-05, + "loss": 0.86159172, + "memory(GiB)": 19.5, "step": 720, - "train_speed(iter/s)": 0.141457 + "train_speed(iter/s)": 0.075059 }, { - "acc": 0.64983764, - "epoch": 1.9647696476964769, - "grad_norm": 1.7890625, - "learning_rate": 8.034950071326676e-05, - "loss": 1.35295801, - "memory(GiB)": 16.34, + "acc": 0.75694184, + "epoch": 2.837573385518591, + "grad_norm": 1.6171875, + "learning_rate": 2.964850155453543e-05, + "loss": 0.85433092, + "memory(GiB)": 19.38, "step": 725, - "train_speed(iter/s)": 0.141585 + "train_speed(iter/s)": 0.075121 }, { - "acc": 0.64880919, - "epoch": 1.9783197831978319, - "grad_norm": 1.6640625, - "learning_rate": 7.981455064194009e-05, - "loss": 1.38945732, - "memory(GiB)": 16.34, + "acc": 0.76086893, + "epoch": 2.857142857142857, + "grad_norm": 1.5859375, + "learning_rate": 2.8726615714136827e-05, + "loss": 0.8608798, + "memory(GiB)": 19.58, "step": 730, - "train_speed(iter/s)": 0.141703 + "train_speed(iter/s)": 0.075181 }, { - "acc": 0.64617214, - "epoch": 1.9918699186991868, - "grad_norm": 1.3671875, - "learning_rate": 7.927960057061341e-05, - "loss": 1.31189098, - "memory(GiB)": 16.34, + "acc": 0.74008894, + "epoch": 2.8767123287671232, + "grad_norm": 1.4375, + "learning_rate": 2.7815575453452058e-05, + "loss": 0.98413734, + "memory(GiB)": 19.59, "step": 735, - "train_speed(iter/s)": 0.141817 + "train_speed(iter/s)": 0.075242 }, { - "acc": 0.67255039, - "epoch": 2.005420054200542, - "grad_norm": 1.3359375, - "learning_rate": 7.874465049928672e-05, - "loss": 1.17746153, - "memory(GiB)": 16.34, + "acc": 0.75941825, + "epoch": 2.8962818003913893, + "grad_norm": 1.7734375, + "learning_rate": 2.6915620170338612e-05, + "loss": 0.85438929, + "memory(GiB)": 19.39, "step": 740, - "train_speed(iter/s)": 0.141669 + "train_speed(iter/s)": 0.075307 }, { - "acc": 0.69897633, - "epoch": 2.0189701897018972, - "grad_norm": 1.1875, - "learning_rate": 7.820970042796005e-05, - "loss": 1.03704672, - "memory(GiB)": 16.34, + "acc": 0.77891464, + "epoch": 2.9158512720156553, + "grad_norm": 1.7265625, + "learning_rate": 2.6026986349808058e-05, + "loss": 0.79716868, + "memory(GiB)": 19.61, "step": 745, - "train_speed(iter/s)": 0.141785 + "train_speed(iter/s)": 0.075361 }, { - "acc": 0.71377811, - "epoch": 2.032520325203252, - "grad_norm": 1.640625, - "learning_rate": 7.767475035663337e-05, - "loss": 1.04516726, - "memory(GiB)": 16.34, + "acc": 0.75023217, + "epoch": 2.935420743639922, + "grad_norm": 1.28125, + "learning_rate": 2.514990750188399e-05, + "loss": 0.85774508, + "memory(GiB)": 18.86, "step": 750, - "train_speed(iter/s)": 0.141905 + "train_speed(iter/s)": 0.075417 }, { - "epoch": 2.032520325203252, - "eval_acc": 0.6208410741428813, - "eval_loss": 1.603255271911621, - "eval_runtime": 44.4359, - "eval_samples_per_second": 0.855, - "eval_steps_per_second": 0.855, + "epoch": 2.935420743639922, + "eval_acc": 0.6324430479183032, + "eval_loss": 1.5986852645874023, + "eval_runtime": 69.3348, + "eval_samples_per_second": 1.096, + "eval_steps_per_second": 0.548, "step": 750 }, { - "acc": 0.73705945, - "epoch": 2.046070460704607, - "grad_norm": 1.8203125, - "learning_rate": 7.713980028530669e-05, - "loss": 0.9383255, - "memory(GiB)": 16.34, + "acc": 0.74531512, + "epoch": 2.954990215264188, + "grad_norm": 1.5625, + "learning_rate": 2.4284614100241538e-05, + "loss": 0.93483381, + "memory(GiB)": 23.14, "step": 755, - "train_speed(iter/s)": 0.14084 + "train_speed(iter/s)": 0.074953 }, { - "acc": 0.73054934, - "epoch": 2.059620596205962, - "grad_norm": 1.8203125, - "learning_rate": 7.660485021398003e-05, - "loss": 0.91379232, - "memory(GiB)": 16.34, + "acc": 0.76761031, + "epoch": 2.974559686888454, + "grad_norm": 1.6171875, + "learning_rate": 2.343133352164477e-05, + "loss": 0.84630623, + "memory(GiB)": 19.36, "step": 760, - "train_speed(iter/s)": 0.140958 + "train_speed(iter/s)": 0.075015 }, { - "acc": 0.73154573, - "epoch": 2.073170731707317, - "grad_norm": 1.6796875, - "learning_rate": 7.606990014265335e-05, - "loss": 0.97700481, - "memory(GiB)": 16.34, + "acc": 0.75018072, + "epoch": 2.9941291585127203, + "grad_norm": 1.5703125, + "learning_rate": 2.2590289986198136e-05, + "loss": 0.89352074, + "memory(GiB)": 19.6, "step": 765, - "train_speed(iter/s)": 0.141076 + "train_speed(iter/s)": 0.075072 }, { - "acc": 0.72998781, - "epoch": 2.086720867208672, - "grad_norm": 1.4140625, - "learning_rate": 7.553495007132668e-05, - "loss": 0.93853807, - "memory(GiB)": 16.34, + "acc": 0.80383377, + "epoch": 3.0136986301369864, + "grad_norm": 1.453125, + "learning_rate": 2.1761704498427003e-05, + "loss": 0.68276234, + "memory(GiB)": 19.62, "step": 770, - "train_speed(iter/s)": 0.141186 + "train_speed(iter/s)": 0.075153 }, { - "acc": 0.73213534, - "epoch": 2.100271002710027, - "grad_norm": 1.78125, - "learning_rate": 7.5e-05, - "loss": 0.94589176, - "memory(GiB)": 16.34, + "acc": 0.82252359, + "epoch": 3.0332681017612524, + "grad_norm": 1.328125, + "learning_rate": 2.094579478920358e-05, + "loss": 0.64008789, + "memory(GiB)": 19.76, "step": 775, - "train_speed(iter/s)": 0.141303 + "train_speed(iter/s)": 0.075213 }, { - "acc": 0.73035493, - "epoch": 2.113821138211382, - "grad_norm": 1.6953125, - "learning_rate": 7.446504992867331e-05, - "loss": 0.95597754, - "memory(GiB)": 16.34, + "acc": 0.83448801, + "epoch": 3.0528375733855184, + "grad_norm": 1.8828125, + "learning_rate": 2.0142775258532654e-05, + "loss": 0.61610913, + "memory(GiB)": 19.59, "step": 780, - "train_speed(iter/s)": 0.141422 + "train_speed(iter/s)": 0.075271 }, { - "acc": 0.73428354, - "epoch": 2.127371273712737, - "grad_norm": 1.484375, - "learning_rate": 7.393009985734664e-05, - "loss": 0.98254423, - "memory(GiB)": 16.34, + "acc": 0.83116817, + "epoch": 3.072407045009785, + "grad_norm": 1.5546875, + "learning_rate": 1.9352856919212994e-05, + "loss": 0.58688097, + "memory(GiB)": 19.53, "step": 785, - "train_speed(iter/s)": 0.141543 + "train_speed(iter/s)": 0.075323 }, { - "acc": 0.69750729, - "epoch": 2.140921409214092, - "grad_norm": 1.828125, - "learning_rate": 7.339514978601997e-05, - "loss": 1.10206041, - "memory(GiB)": 16.34, + "acc": 0.82525949, + "epoch": 3.091976516634051, + "grad_norm": 1.4375, + "learning_rate": 1.8576247341388544e-05, + "loss": 0.62312498, + "memory(GiB)": 19.85, "step": 790, - "train_speed(iter/s)": 0.141651 + "train_speed(iter/s)": 0.07537 }, { - "acc": 0.73764381, - "epoch": 2.154471544715447, - "grad_norm": 1.296875, - "learning_rate": 7.286019971469329e-05, - "loss": 0.84075432, - "memory(GiB)": 16.34, + "acc": 0.81645441, + "epoch": 3.111545988258317, + "grad_norm": 1.65625, + "learning_rate": 1.7813150598004313e-05, + "loss": 0.62203112, + "memory(GiB)": 19.79, "step": 795, - "train_speed(iter/s)": 0.141765 + "train_speed(iter/s)": 0.075423 }, { - "acc": 0.72124152, - "epoch": 2.168021680216802, - "grad_norm": 1.4375, - "learning_rate": 7.23252496433666e-05, - "loss": 1.05863771, - "memory(GiB)": 16.34, + "acc": 0.83432789, + "epoch": 3.131115459882583, + "grad_norm": 1.5859375, + "learning_rate": 1.7063767211181333e-05, + "loss": 0.60077624, + "memory(GiB)": 19.52, "step": 800, - "train_speed(iter/s)": 0.141874 + "train_speed(iter/s)": 0.07548 }, { - "epoch": 2.168021680216802, - "eval_acc": 0.6240499915554805, - "eval_loss": 1.61227285861969, - "eval_runtime": 44.3145, - "eval_samples_per_second": 0.858, - "eval_steps_per_second": 0.858, + "epoch": 3.131115459882583, + "eval_acc": 0.6209740769835035, + "eval_loss": 1.7955598831176758, + "eval_runtime": 69.0109, + "eval_samples_per_second": 1.101, + "eval_steps_per_second": 0.551, "step": 800 }, { - "acc": 0.73518519, - "epoch": 2.181571815718157, - "grad_norm": 1.546875, - "learning_rate": 7.179029957203993e-05, - "loss": 0.96204119, - "memory(GiB)": 16.34, + "acc": 0.82124023, + "epoch": 3.1506849315068495, + "grad_norm": 1.7578125, + "learning_rate": 1.6328294099524644e-05, + "loss": 0.60847788, + "memory(GiB)": 22.65, "step": 805, - "train_speed(iter/s)": 0.140881 + "train_speed(iter/s)": 0.075043 }, { - "acc": 0.74290161, - "epoch": 2.1951219512195124, - "grad_norm": 2.015625, - "learning_rate": 7.125534950071326e-05, - "loss": 0.92387733, - "memory(GiB)": 16.34, + "acc": 0.83265171, + "epoch": 3.1702544031311155, + "grad_norm": 4.09375, + "learning_rate": 1.5606924526378136e-05, + "loss": 0.57863126, + "memory(GiB)": 18.89, "step": 810, - "train_speed(iter/s)": 0.14021 + "train_speed(iter/s)": 0.07509 }, { - "acc": 0.73101602, - "epoch": 2.2086720867208673, - "grad_norm": 1.6015625, - "learning_rate": 7.072039942938658e-05, - "loss": 0.96248655, - "memory(GiB)": 16.34, + "acc": 0.8407362, + "epoch": 3.1898238747553815, + "grad_norm": 1.1796875, + "learning_rate": 1.4899848049039881e-05, + "loss": 0.53706379, + "memory(GiB)": 19.37, "step": 815, - "train_speed(iter/s)": 0.140323 + "train_speed(iter/s)": 0.075142 }, { - "acc": 0.73887796, - "epoch": 2.2222222222222223, - "grad_norm": 1.796875, - "learning_rate": 7.018544935805991e-05, - "loss": 0.94751673, - "memory(GiB)": 16.34, + "acc": 0.82116756, + "epoch": 3.2093933463796476, + "grad_norm": 1.859375, + "learning_rate": 1.4207250468951426e-05, + "loss": 0.64039102, + "memory(GiB)": 19.52, "step": 820, - "train_speed(iter/s)": 0.140441 + "train_speed(iter/s)": 0.075197 }, { - "acc": 0.75793715, - "epoch": 2.2357723577235773, - "grad_norm": 2.015625, - "learning_rate": 6.965049928673323e-05, - "loss": 0.84263477, - "memory(GiB)": 16.34, + "acc": 0.85004549, + "epoch": 3.228962818003914, + "grad_norm": 1.0390625, + "learning_rate": 1.3529313782874023e-05, + "loss": 0.53315983, + "memory(GiB)": 19.52, "step": 825, - "train_speed(iter/s)": 0.140555 + "train_speed(iter/s)": 0.07525 }, { - "acc": 0.75873909, - "epoch": 2.2493224932249323, - "grad_norm": 1.953125, - "learning_rate": 6.911554921540656e-05, - "loss": 0.89048252, - "memory(GiB)": 16.34, + "acc": 0.83273296, + "epoch": 3.24853228962818, + "grad_norm": 1.578125, + "learning_rate": 1.2866216135064487e-05, + "loss": 0.58545351, + "memory(GiB)": 19.36, "step": 830, - "train_speed(iter/s)": 0.14067 + "train_speed(iter/s)": 0.075303 }, { - "acc": 0.74456077, - "epoch": 2.2628726287262872, - "grad_norm": 1.7890625, - "learning_rate": 6.858059914407987e-05, - "loss": 0.90777779, - "memory(GiB)": 16.34, + "acc": 0.80788403, + "epoch": 3.268101761252446, + "grad_norm": 2.296875, + "learning_rate": 1.2218131770463487e-05, + "loss": 0.67468171, + "memory(GiB)": 19.28, "step": 835, - "train_speed(iter/s)": 0.140775 + "train_speed(iter/s)": 0.075356 }, { - "acc": 0.75809846, - "epoch": 2.2764227642276422, - "grad_norm": 2.0625, - "learning_rate": 6.80456490727532e-05, - "loss": 0.87556753, - "memory(GiB)": 16.34, + "acc": 0.8440134, + "epoch": 3.287671232876712, + "grad_norm": 1.21875, + "learning_rate": 1.1585230988908576e-05, + "loss": 0.55293651, + "memory(GiB)": 19.37, "step": 840, - "train_speed(iter/s)": 0.14088 + "train_speed(iter/s)": 0.07541 }, { - "acc": 0.73194971, - "epoch": 2.289972899728997, - "grad_norm": 2.765625, - "learning_rate": 6.751069900142653e-05, - "loss": 0.9232769, - "memory(GiB)": 16.34, + "acc": 0.81569691, + "epoch": 3.3072407045009786, + "grad_norm": 1.671875, + "learning_rate": 1.0967680100383645e-05, + "loss": 0.61190109, + "memory(GiB)": 18.09, "step": 845, - "train_speed(iter/s)": 0.140986 + "train_speed(iter/s)": 0.075466 }, { - "acc": 0.74470835, - "epoch": 2.303523035230352, - "grad_norm": 1.8515625, - "learning_rate": 6.697574893009985e-05, - "loss": 0.88600664, - "memory(GiB)": 16.34, + "acc": 0.84766483, + "epoch": 3.3268101761252447, + "grad_norm": 1.8046875, + "learning_rate": 1.0365641381317113e-05, + "loss": 0.52525816, + "memory(GiB)": 19.31, "step": 850, - "train_speed(iter/s)": 0.141086 + "train_speed(iter/s)": 0.075523 }, { - "epoch": 2.303523035230352, - "eval_acc": 0.6174632663401453, - "eval_loss": 1.660492181777954, - "eval_runtime": 44.1393, - "eval_samples_per_second": 0.861, - "eval_steps_per_second": 0.861, + "epoch": 3.3268101761252447, + "eval_acc": 0.6203456402199529, + "eval_loss": 1.7881730794906616, + "eval_runtime": 69.1552, + "eval_samples_per_second": 1.099, + "eval_steps_per_second": 0.549, "step": 850 }, { - "acc": 0.76063514, - "epoch": 2.317073170731707, - "grad_norm": 2.40625, - "learning_rate": 6.644079885877318e-05, - "loss": 0.80236473, - "memory(GiB)": 16.34, + "acc": 0.84491625, + "epoch": 3.3463796477495107, + "grad_norm": 1.8046875, + "learning_rate": 9.779273031939692e-06, + "loss": 0.56272998, + "memory(GiB)": 23.04, "step": 855, - "train_speed(iter/s)": 0.140168 + "train_speed(iter/s)": 0.07511 }, { - "acc": 0.71505499, - "epoch": 2.330623306233062, - "grad_norm": 2.671875, - "learning_rate": 6.59058487874465e-05, - "loss": 1.01317081, - "memory(GiB)": 16.34, + "acc": 0.84104662, + "epoch": 3.3659491193737767, + "grad_norm": 1.796875, + "learning_rate": 9.20872913471363e-06, + "loss": 0.57019663, + "memory(GiB)": 19.42, "step": 860, - "train_speed(iter/s)": 0.140281 + "train_speed(iter/s)": 0.075157 }, { - "acc": 0.73396034, - "epoch": 2.3441734417344176, - "grad_norm": 1.703125, - "learning_rate": 6.537089871611983e-05, - "loss": 0.96496754, - "memory(GiB)": 16.34, + "acc": 0.84433002, + "epoch": 3.385518590998043, + "grad_norm": 1.6484375, + "learning_rate": 8.654159613843715e-06, + "loss": 0.55449514, + "memory(GiB)": 19.59, "step": 865, - "train_speed(iter/s)": 0.140387 + "train_speed(iter/s)": 0.07521 }, { - "acc": 0.74145699, - "epoch": 2.3577235772357725, - "grad_norm": 1.9453125, - "learning_rate": 6.483594864479316e-05, - "loss": 1.00490999, - "memory(GiB)": 16.34, + "acc": 0.80005312, + "epoch": 3.4050880626223092, + "grad_norm": 1.46875, + "learning_rate": 8.115710195881068e-06, + "loss": 0.73595409, + "memory(GiB)": 19.36, "step": 870, - "train_speed(iter/s)": 0.140492 + "train_speed(iter/s)": 0.075258 }, { - "acc": 0.76035104, - "epoch": 2.3712737127371275, - "grad_norm": 1.7265625, - "learning_rate": 6.430099857346647e-05, - "loss": 0.93969469, - "memory(GiB)": 16.34, + "acc": 0.83217945, + "epoch": 3.4246575342465753, + "grad_norm": 3.328125, + "learning_rate": 7.593522371429972e-06, + "loss": 0.58270836, + "memory(GiB)": 19.58, "step": 875, - "train_speed(iter/s)": 0.140599 + "train_speed(iter/s)": 0.075306 }, { - "acc": 0.75214877, - "epoch": 2.3848238482384825, - "grad_norm": 1.9296875, - "learning_rate": 6.376604850213979e-05, - "loss": 0.9385232, - "memory(GiB)": 16.34, + "acc": 0.82742786, + "epoch": 3.4442270058708413, + "grad_norm": 1.234375, + "learning_rate": 7.0877333579678585e-06, + "loss": 0.59052157, + "memory(GiB)": 19.6, "step": 880, - "train_speed(iter/s)": 0.140705 + "train_speed(iter/s)": 0.075358 }, { - "acc": 0.73846526, - "epoch": 2.3983739837398375, - "grad_norm": 2.421875, - "learning_rate": 6.323109843081312e-05, - "loss": 0.95887814, - "memory(GiB)": 16.34, + "acc": 0.81994705, + "epoch": 3.4637964774951078, + "grad_norm": 1.7578125, + "learning_rate": 6.598476063788036e-06, + "loss": 0.62256751, + "memory(GiB)": 19.56, "step": 885, - "train_speed(iter/s)": 0.140812 + "train_speed(iter/s)": 0.075405 }, { - "acc": 0.72576594, - "epoch": 2.4119241192411924, - "grad_norm": 1.46875, - "learning_rate": 6.269614835948645e-05, - "loss": 0.95252094, - "memory(GiB)": 16.34, + "acc": 0.8157341, + "epoch": 3.483365949119374, + "grad_norm": 1.8203125, + "learning_rate": 6.12587905307477e-06, + "loss": 0.66806622, + "memory(GiB)": 19.49, "step": 890, - "train_speed(iter/s)": 0.140912 + "train_speed(iter/s)": 0.075454 }, { - "acc": 0.74866586, - "epoch": 2.4254742547425474, - "grad_norm": 1.5546875, - "learning_rate": 6.216119828815977e-05, - "loss": 0.91533632, - "memory(GiB)": 16.34, + "acc": 0.82838688, + "epoch": 3.50293542074364, + "grad_norm": 1.515625, + "learning_rate": 5.67006651212008e-06, + "loss": 0.63044977, + "memory(GiB)": 19.54, "step": 895, - "train_speed(iter/s)": 0.141011 + "train_speed(iter/s)": 0.075497 }, { - "acc": 0.72289066, - "epoch": 2.4390243902439024, - "grad_norm": 1.375, - "learning_rate": 6.162624821683308e-05, - "loss": 0.996418, - "memory(GiB)": 16.34, + "acc": 0.79130597, + "epoch": 3.5225048923679063, + "grad_norm": 1.640625, + "learning_rate": 5.2311582166906605e-06, + "loss": 0.7558567, + "memory(GiB)": 19.28, "step": 900, - "train_speed(iter/s)": 0.13966 + "train_speed(iter/s)": 0.07555 }, { - "epoch": 2.4390243902439024, - "eval_acc": 0.6152676912683668, - "eval_loss": 1.6503233909606934, - "eval_runtime": 44.1939, - "eval_samples_per_second": 0.86, - "eval_steps_per_second": 0.86, + "epoch": 3.5225048923679063, + "eval_acc": 0.6211311861743912, + "eval_loss": 1.7854998111724854, + "eval_runtime": 69.2434, + "eval_samples_per_second": 1.098, + "eval_steps_per_second": 0.549, "step": 900 - }, - { - "acc": 0.73535914, - "epoch": 2.4525745257452574, - "grad_norm": 1.3125, - "learning_rate": 6.109129814550641e-05, - "loss": 1.00612879, - "memory(GiB)": 16.34, - "step": 905, - "train_speed(iter/s)": 0.138805 - }, - { - "acc": 0.7277998, - "epoch": 2.4661246612466123, - "grad_norm": 2.328125, - "learning_rate": 6.0556348074179737e-05, - "loss": 1.00636635, - "memory(GiB)": 16.34, - "step": 910, - "train_speed(iter/s)": 0.138904 - }, - { - "acc": 0.70795035, - "epoch": 2.4796747967479673, - "grad_norm": 1.6484375, - "learning_rate": 6.002139800285306e-05, - "loss": 0.98440151, - "memory(GiB)": 16.34, - "step": 915, - "train_speed(iter/s)": 0.139013 - }, - { - "acc": 0.7650095, - "epoch": 2.4932249322493227, - "grad_norm": 2.25, - "learning_rate": 5.948644793152638e-05, - "loss": 0.77239523, - "memory(GiB)": 16.34, - "step": 920, - "train_speed(iter/s)": 0.139126 - }, - { - "acc": 0.71504927, - "epoch": 2.5067750677506773, - "grad_norm": 1.6328125, - "learning_rate": 5.895149786019971e-05, - "loss": 1.01518288, - "memory(GiB)": 16.34, - "step": 925, - "train_speed(iter/s)": 0.139233 - }, - { - "acc": 0.71582847, - "epoch": 2.5203252032520327, - "grad_norm": 2.28125, - "learning_rate": 5.841654778887303e-05, - "loss": 0.97862291, - "memory(GiB)": 16.34, - "step": 930, - "train_speed(iter/s)": 0.13934 - }, - { - "acc": 0.74984369, - "epoch": 2.5338753387533877, - "grad_norm": 1.8515625, - "learning_rate": 5.788159771754635e-05, - "loss": 0.85021992, - "memory(GiB)": 16.34, - "step": 935, - "train_speed(iter/s)": 0.139448 - }, - { - "acc": 0.74142261, - "epoch": 2.5474254742547426, - "grad_norm": 1.796875, - "learning_rate": 5.734664764621968e-05, - "loss": 0.91843338, - "memory(GiB)": 16.34, - "step": 940, - "train_speed(iter/s)": 0.139564 - }, - { - "acc": 0.71266222, - "epoch": 2.5609756097560976, - "grad_norm": 1.875, - "learning_rate": 5.6811697574893007e-05, - "loss": 1.14017658, - "memory(GiB)": 16.34, - "step": 945, - "train_speed(iter/s)": 0.139667 - }, - { - "acc": 0.73374839, - "epoch": 2.5745257452574526, - "grad_norm": 1.671875, - "learning_rate": 5.627674750356633e-05, - "loss": 0.99979248, - "memory(GiB)": 16.34, - "step": 950, - "train_speed(iter/s)": 0.139769 - }, - { - "epoch": 2.5745257452574526, - "eval_acc": 0.6161121432190508, - "eval_loss": 1.6511973142623901, - "eval_runtime": 44.1881, - "eval_samples_per_second": 0.86, - "eval_steps_per_second": 0.86, - "step": 950 - }, - { - "acc": 0.74381332, - "epoch": 2.5880758807588076, - "grad_norm": 1.7734375, - "learning_rate": 5.5741797432239646e-05, - "loss": 0.93189411, - "memory(GiB)": 16.34, - "step": 955, - "train_speed(iter/s)": 0.138972 - }, - { - "acc": 0.73305674, - "epoch": 2.6016260162601625, - "grad_norm": 2.4375, - "learning_rate": 5.5206847360912977e-05, - "loss": 0.93946905, - "memory(GiB)": 16.34, - "step": 960, - "train_speed(iter/s)": 0.139079 - }, - { - "acc": 0.72961435, - "epoch": 2.6151761517615175, - "grad_norm": 1.84375, - "learning_rate": 5.46718972895863e-05, - "loss": 0.99850683, - "memory(GiB)": 16.34, - "step": 965, - "train_speed(iter/s)": 0.139175 - }, - { - "acc": 0.7133184, - "epoch": 2.6287262872628725, - "grad_norm": 1.96875, - "learning_rate": 5.413694721825962e-05, - "loss": 1.03378878, - "memory(GiB)": 16.34, - "step": 970, - "train_speed(iter/s)": 0.139282 - }, - { - "acc": 0.71164918, - "epoch": 2.642276422764228, - "grad_norm": 1.3828125, - "learning_rate": 5.360199714693295e-05, - "loss": 1.02268944, - "memory(GiB)": 16.34, - "step": 975, - "train_speed(iter/s)": 0.139382 - }, - { - "acc": 0.71824646, - "epoch": 2.6558265582655824, - "grad_norm": 1.84375, - "learning_rate": 5.306704707560627e-05, - "loss": 1.06014738, - "memory(GiB)": 16.34, - "step": 980, - "train_speed(iter/s)": 0.139479 - }, - { - "acc": 0.75523286, - "epoch": 2.669376693766938, - "grad_norm": 1.7734375, - "learning_rate": 5.253209700427959e-05, - "loss": 0.80533791, - "memory(GiB)": 16.34, - "step": 985, - "train_speed(iter/s)": 0.139581 - }, - { - "acc": 0.72265592, - "epoch": 2.682926829268293, - "grad_norm": 1.5546875, - "learning_rate": 5.199714693295292e-05, - "loss": 1.0223958, - "memory(GiB)": 16.34, - "step": 990, - "train_speed(iter/s)": 0.139675 - }, - { - "acc": 0.74640193, - "epoch": 2.696476964769648, - "grad_norm": 1.5546875, - "learning_rate": 5.1462196861626247e-05, - "loss": 0.8896265, - "memory(GiB)": 16.34, - "step": 995, - "train_speed(iter/s)": 0.139767 - }, - { - "acc": 0.75235152, - "epoch": 2.710027100271003, - "grad_norm": 1.6484375, - "learning_rate": 5.092724679029957e-05, - "loss": 0.87937946, - "memory(GiB)": 16.34, - "step": 1000, - "train_speed(iter/s)": 0.139867 - }, - { - "epoch": 2.710027100271003, - "eval_acc": 0.6123965546360413, - "eval_loss": 1.6541699171066284, - "eval_runtime": 44.2136, - "eval_samples_per_second": 0.859, - "eval_steps_per_second": 0.859, - "step": 1000 - }, - { - "acc": 0.73264089, - "epoch": 2.7235772357723578, - "grad_norm": 1.96875, - "learning_rate": 5.0392296718972886e-05, - "loss": 0.95834293, - "memory(GiB)": 16.34, - "step": 1005, - "train_speed(iter/s)": 0.139102 - }, - { - "acc": 0.76931543, - "epoch": 2.7371273712737128, - "grad_norm": 1.7890625, - "learning_rate": 4.9857346647646217e-05, - "loss": 0.82889891, - "memory(GiB)": 16.34, - "step": 1010, - "train_speed(iter/s)": 0.139199 - }, - { - "acc": 0.74891815, - "epoch": 2.7506775067750677, - "grad_norm": 2.65625, - "learning_rate": 4.932239657631954e-05, - "loss": 0.93503094, - "memory(GiB)": 16.34, - "step": 1015, - "train_speed(iter/s)": 0.139298 - }, - { - "acc": 0.72551074, - "epoch": 2.7642276422764227, - "grad_norm": 1.5703125, - "learning_rate": 4.878744650499286e-05, - "loss": 1.06086788, - "memory(GiB)": 16.34, - "step": 1020, - "train_speed(iter/s)": 0.139383 - }, - { - "acc": 0.75321589, - "epoch": 2.7777777777777777, - "grad_norm": 1.7734375, - "learning_rate": 4.825249643366619e-05, - "loss": 0.83553734, - "memory(GiB)": 16.34, - "step": 1025, - "train_speed(iter/s)": 0.139478 - }, - { - "acc": 0.73832054, - "epoch": 2.7913279132791327, - "grad_norm": 1.53125, - "learning_rate": 4.771754636233951e-05, - "loss": 0.94093418, - "memory(GiB)": 16.34, - "step": 1030, - "train_speed(iter/s)": 0.139572 - }, - { - "acc": 0.76351671, - "epoch": 2.8048780487804876, - "grad_norm": 1.2890625, - "learning_rate": 4.718259629101283e-05, - "loss": 0.84423561, - "memory(GiB)": 16.34, - "step": 1035, - "train_speed(iter/s)": 0.139664 - }, - { - "acc": 0.75100279, - "epoch": 2.818428184281843, - "grad_norm": 1.890625, - "learning_rate": 4.6647646219686156e-05, - "loss": 0.91757078, - "memory(GiB)": 16.34, - "step": 1040, - "train_speed(iter/s)": 0.139754 - }, - { - "acc": 0.73302913, - "epoch": 2.8319783197831976, - "grad_norm": 1.3671875, - "learning_rate": 4.6112696148359487e-05, - "loss": 0.94331446, - "memory(GiB)": 16.34, - "step": 1045, - "train_speed(iter/s)": 0.139841 - }, - { - "acc": 0.73944206, - "epoch": 2.845528455284553, - "grad_norm": 1.59375, - "learning_rate": 4.55777460770328e-05, - "loss": 0.98726377, - "memory(GiB)": 16.34, - "step": 1050, - "train_speed(iter/s)": 0.139931 - }, - { - "epoch": 2.845528455284553, - "eval_acc": 0.6145921297078196, - "eval_loss": 1.6571751832962036, - "eval_runtime": 44.3705, - "eval_samples_per_second": 0.856, - "eval_steps_per_second": 0.856, - "step": 1050 - }, - { - "acc": 0.73031135, - "epoch": 2.859078590785908, - "grad_norm": 2.25, - "learning_rate": 4.5042796005706126e-05, - "loss": 0.95681438, - "memory(GiB)": 16.34, - "step": 1055, - "train_speed(iter/s)": 0.139201 - }, - { - "acc": 0.71694627, - "epoch": 2.872628726287263, - "grad_norm": 1.6328125, - "learning_rate": 4.4507845934379456e-05, - "loss": 1.02266083, - "memory(GiB)": 16.34, - "step": 1060, - "train_speed(iter/s)": 0.139294 - }, - { - "acc": 0.72791638, - "epoch": 2.886178861788618, - "grad_norm": 1.7578125, - "learning_rate": 4.397289586305278e-05, - "loss": 0.99341927, - "memory(GiB)": 16.34, - "step": 1065, - "train_speed(iter/s)": 0.139385 - }, - { - "acc": 0.74939594, - "epoch": 2.899728997289973, - "grad_norm": 1.921875, - "learning_rate": 4.34379457917261e-05, - "loss": 0.91077061, - "memory(GiB)": 16.34, - "step": 1070, - "train_speed(iter/s)": 0.139478 - }, - { - "acc": 0.72694654, - "epoch": 2.913279132791328, - "grad_norm": 1.7265625, - "learning_rate": 4.290299572039942e-05, - "loss": 0.98774853, - "memory(GiB)": 16.34, - "step": 1075, - "train_speed(iter/s)": 0.139564 - }, - { - "acc": 0.70588508, - "epoch": 2.926829268292683, - "grad_norm": 2.15625, - "learning_rate": 4.236804564907275e-05, - "loss": 1.07887812, - "memory(GiB)": 16.34, - "step": 1080, - "train_speed(iter/s)": 0.139657 - }, - { - "acc": 0.74654303, - "epoch": 2.940379403794038, - "grad_norm": 1.7109375, - "learning_rate": 4.183309557774607e-05, - "loss": 0.91062069, - "memory(GiB)": 16.34, - "step": 1085, - "train_speed(iter/s)": 0.139743 - }, - { - "acc": 0.73493595, - "epoch": 2.953929539295393, - "grad_norm": 2.03125, - "learning_rate": 4.1298145506419396e-05, - "loss": 0.92819033, - "memory(GiB)": 16.34, - "step": 1090, - "train_speed(iter/s)": 0.139829 - }, - { - "acc": 0.71466756, - "epoch": 2.9674796747967482, - "grad_norm": 2.109375, - "learning_rate": 4.0763195435092727e-05, - "loss": 1.01220913, - "memory(GiB)": 16.34, - "step": 1095, - "train_speed(iter/s)": 0.139916 - }, - { - "acc": 0.7607831, - "epoch": 2.9810298102981028, - "grad_norm": 2.375, - "learning_rate": 4.022824536376604e-05, - "loss": 0.8505785, - "memory(GiB)": 16.34, - "step": 1100, - "train_speed(iter/s)": 0.140004 - }, - { - "epoch": 2.9810298102981028, - "eval_acc": 0.6206721837527445, - "eval_loss": 1.6525288820266724, - "eval_runtime": 44.2124, - "eval_samples_per_second": 0.859, - "eval_steps_per_second": 0.859, - "step": 1100 - }, - { - "acc": 0.74116454, - "epoch": 2.994579945799458, - "grad_norm": 2.65625, - "learning_rate": 3.9693295292439366e-05, - "loss": 0.92264805, - "memory(GiB)": 16.34, - "step": 1105, - "train_speed(iter/s)": 0.139311 - }, - { - "acc": 0.8182869, - "epoch": 3.008130081300813, - "grad_norm": 1.3671875, - "learning_rate": 3.915834522111269e-05, - "loss": 0.6927434, - "memory(GiB)": 16.34, - "step": 1110, - "train_speed(iter/s)": 0.13923 - }, - { - "acc": 0.83515587, - "epoch": 3.021680216802168, - "grad_norm": 1.9140625, - "learning_rate": 3.862339514978602e-05, - "loss": 0.58284421, - "memory(GiB)": 16.34, - "step": 1115, - "train_speed(iter/s)": 0.139319 - }, - { - "acc": 0.84008894, - "epoch": 3.035230352303523, - "grad_norm": 1.8828125, - "learning_rate": 3.808844507845934e-05, - "loss": 0.59128432, - "memory(GiB)": 16.34, - "step": 1120, - "train_speed(iter/s)": 0.139405 - }, - { - "acc": 0.84298267, - "epoch": 3.048780487804878, - "grad_norm": 2.4375, - "learning_rate": 3.755349500713266e-05, - "loss": 0.56685705, - "memory(GiB)": 16.34, - "step": 1125, - "train_speed(iter/s)": 0.139493 - }, - { - "acc": 0.83680372, - "epoch": 3.062330623306233, - "grad_norm": 3.171875, - "learning_rate": 3.701854493580599e-05, - "loss": 0.55599914, - "memory(GiB)": 16.34, - "step": 1130, - "train_speed(iter/s)": 0.139571 - }, - { - "acc": 0.81269236, - "epoch": 3.075880758807588, - "grad_norm": 2.40625, - "learning_rate": 3.648359486447931e-05, - "loss": 0.63413367, - "memory(GiB)": 16.34, - "step": 1135, - "train_speed(iter/s)": 0.139659 - }, - { - "acc": 0.8300642, - "epoch": 3.089430894308943, - "grad_norm": 2.71875, - "learning_rate": 3.5948644793152636e-05, - "loss": 0.59288554, - "memory(GiB)": 16.34, - "step": 1140, - "train_speed(iter/s)": 0.139738 - }, - { - "acc": 0.84225779, - "epoch": 3.102981029810298, - "grad_norm": 2.703125, - "learning_rate": 3.541369472182596e-05, - "loss": 0.57104263, - "memory(GiB)": 16.34, - "step": 1145, - "train_speed(iter/s)": 0.13982 - }, - { - "acc": 0.86128368, - "epoch": 3.116531165311653, - "grad_norm": 3.0, - "learning_rate": 3.487874465049928e-05, - "loss": 0.47776127, - "memory(GiB)": 16.34, - "step": 1150, - "train_speed(iter/s)": 0.139903 - }, - { - "epoch": 3.116531165311653, - "eval_acc": 0.6091876372234419, - "eval_loss": 1.9067459106445312, - "eval_runtime": 44.213, - "eval_samples_per_second": 0.859, - "eval_steps_per_second": 0.859, - "step": 1150 - }, - { - "acc": 0.81826935, - "epoch": 3.130081300813008, - "grad_norm": 2.234375, - "learning_rate": 3.4343794579172606e-05, - "loss": 0.64635777, - "memory(GiB)": 16.34, - "step": 1155, - "train_speed(iter/s)": 0.13924 - }, - { - "acc": 0.79423141, - "epoch": 3.1436314363143634, - "grad_norm": 2.859375, - "learning_rate": 3.380884450784593e-05, - "loss": 0.71202464, - "memory(GiB)": 16.34, - "step": 1160, - "train_speed(iter/s)": 0.139319 - }, - { - "acc": 0.81948729, - "epoch": 3.1571815718157183, - "grad_norm": 2.046875, - "learning_rate": 3.327389443651925e-05, - "loss": 0.62771091, - "memory(GiB)": 16.34, - "step": 1165, - "train_speed(iter/s)": 0.1394 - }, - { - "acc": 0.85081501, - "epoch": 3.1707317073170733, - "grad_norm": 2.125, - "learning_rate": 3.2738944365192576e-05, - "loss": 0.55290155, - "memory(GiB)": 16.34, - "step": 1170, - "train_speed(iter/s)": 0.139484 - }, - { - "acc": 0.84410248, - "epoch": 3.1842818428184283, - "grad_norm": 2.140625, - "learning_rate": 3.2203994293865906e-05, - "loss": 0.57252893, - "memory(GiB)": 16.34, - "step": 1175, - "train_speed(iter/s)": 0.139567 - }, - { - "acc": 0.78565025, - "epoch": 3.1978319783197833, - "grad_norm": 3.203125, - "learning_rate": 3.166904422253922e-05, - "loss": 0.73501797, - "memory(GiB)": 16.34, - "step": 1180, - "train_speed(iter/s)": 0.13965 - }, - { - "acc": 0.78738356, - "epoch": 3.2113821138211383, - "grad_norm": 2.953125, - "learning_rate": 3.113409415121255e-05, - "loss": 0.72265806, - "memory(GiB)": 16.34, - "step": 1185, - "train_speed(iter/s)": 0.139732 - }, - { - "acc": 0.818221, - "epoch": 3.2249322493224932, - "grad_norm": 2.21875, - "learning_rate": 3.0599144079885876e-05, - "loss": 0.61062384, - "memory(GiB)": 16.34, - "step": 1190, - "train_speed(iter/s)": 0.139813 - }, - { - "acc": 0.82556448, - "epoch": 3.238482384823848, - "grad_norm": 2.3125, - "learning_rate": 3.00641940085592e-05, - "loss": 0.61080799, - "memory(GiB)": 16.34, - "step": 1195, - "train_speed(iter/s)": 0.139893 - }, - { - "acc": 0.82586126, - "epoch": 3.252032520325203, - "grad_norm": 2.34375, - "learning_rate": 2.952924393723252e-05, - "loss": 0.6164794, - "memory(GiB)": 16.34, - "step": 1200, - "train_speed(iter/s)": 0.139974 - }, - { - "epoch": 3.252032520325203, - "eval_acc": 0.6044587062996115, - "eval_loss": 1.8586076498031616, - "eval_runtime": 44.1645, - "eval_samples_per_second": 0.86, - "eval_steps_per_second": 0.86, - "step": 1200 - }, - { - "acc": 0.82835131, - "epoch": 3.265582655826558, - "grad_norm": 6.46875, - "learning_rate": 2.8994293865905846e-05, - "loss": 0.61753302, - "memory(GiB)": 16.34, - "step": 1205, - "train_speed(iter/s)": 0.139339 - }, - { - "acc": 0.82603951, - "epoch": 3.279132791327913, - "grad_norm": 2.90625, - "learning_rate": 2.8459343794579173e-05, - "loss": 0.58360481, - "memory(GiB)": 16.34, - "step": 1210, - "train_speed(iter/s)": 0.139423 - }, - { - "acc": 0.79192553, - "epoch": 3.292682926829268, - "grad_norm": 2.5, - "learning_rate": 2.7924393723252493e-05, - "loss": 0.75090132, - "memory(GiB)": 16.34, - "step": 1215, - "train_speed(iter/s)": 0.1395 - }, - { - "acc": 0.83583679, - "epoch": 3.306233062330623, - "grad_norm": 2.15625, - "learning_rate": 2.738944365192582e-05, - "loss": 0.56217456, - "memory(GiB)": 16.34, - "step": 1220, - "train_speed(iter/s)": 0.139581 - }, - { - "acc": 0.80481911, - "epoch": 3.3197831978319785, - "grad_norm": 4.125, - "learning_rate": 2.685449358059914e-05, - "loss": 0.73073688, - "memory(GiB)": 16.34, - "step": 1225, - "train_speed(iter/s)": 0.139659 - }, - { - "acc": 0.8173975, - "epoch": 3.3333333333333335, - "grad_norm": 3.1875, - "learning_rate": 2.6319543509272466e-05, - "loss": 0.59854188, - "memory(GiB)": 16.34, - "step": 1230, - "train_speed(iter/s)": 0.139735 - }, - { - "acc": 0.88763266, - "epoch": 3.3468834688346885, - "grad_norm": 2.078125, - "learning_rate": 2.5784593437945793e-05, - "loss": 0.3878314, - "memory(GiB)": 16.34, - "step": 1235, - "train_speed(iter/s)": 0.139826 - }, - { - "acc": 0.82199507, - "epoch": 3.3604336043360434, - "grad_norm": 2.28125, - "learning_rate": 2.5249643366619113e-05, - "loss": 0.65350094, - "memory(GiB)": 16.34, - "step": 1240, - "train_speed(iter/s)": 0.139901 - }, - { - "acc": 0.83115234, - "epoch": 3.3739837398373984, - "grad_norm": 2.484375, - "learning_rate": 2.471469329529244e-05, - "loss": 0.61815515, - "memory(GiB)": 16.34, - "step": 1245, - "train_speed(iter/s)": 0.13998 - }, - { - "acc": 0.81266117, - "epoch": 3.3875338753387534, - "grad_norm": 1.78125, - "learning_rate": 2.417974322396576e-05, - "loss": 0.65289016, - "memory(GiB)": 16.34, - "step": 1250, - "train_speed(iter/s)": 0.140052 - }, - { - "epoch": 3.3875338753387534, - "eval_acc": 0.6037831447390644, - "eval_loss": 1.893083095550537, - "eval_runtime": 44.1639, - "eval_samples_per_second": 0.86, - "eval_steps_per_second": 0.86, - "step": 1250 - }, - { - "acc": 0.84776039, - "epoch": 3.4010840108401084, - "grad_norm": 2.046875, - "learning_rate": 2.3644793152639086e-05, - "loss": 0.53162961, - "memory(GiB)": 16.34, - "step": 1255, - "train_speed(iter/s)": 0.139443 - }, - { - "acc": 0.84318447, - "epoch": 3.4146341463414633, - "grad_norm": 2.234375, - "learning_rate": 2.3109843081312406e-05, - "loss": 0.52226324, - "memory(GiB)": 16.34, - "step": 1260, - "train_speed(iter/s)": 0.139515 - }, - { - "acc": 0.80521078, - "epoch": 3.4281842818428183, - "grad_norm": 2.25, - "learning_rate": 2.2574893009985733e-05, - "loss": 0.62338996, - "memory(GiB)": 16.34, - "step": 1265, - "train_speed(iter/s)": 0.13959 - }, - { - "acc": 0.83542995, - "epoch": 3.4417344173441733, - "grad_norm": 3.0625, - "learning_rate": 2.203994293865906e-05, - "loss": 0.60965805, - "memory(GiB)": 16.34, - "step": 1270, - "train_speed(iter/s)": 0.139638 - }, - { - "acc": 0.8333952, - "epoch": 3.4552845528455283, - "grad_norm": 2.625, - "learning_rate": 2.150499286733238e-05, - "loss": 0.60031776, - "memory(GiB)": 16.34, - "step": 1275, - "train_speed(iter/s)": 0.139705 - }, - { - "acc": 0.84367867, - "epoch": 3.4688346883468837, - "grad_norm": 1.859375, - "learning_rate": 2.0970042796005706e-05, - "loss": 0.54981174, - "memory(GiB)": 16.34, - "step": 1280, - "train_speed(iter/s)": 0.13978 - }, - { - "acc": 0.8121892, - "epoch": 3.4823848238482387, - "grad_norm": 2.046875, - "learning_rate": 2.0435092724679026e-05, - "loss": 0.67483497, - "memory(GiB)": 16.34, - "step": 1285, - "train_speed(iter/s)": 0.139851 - }, - { - "acc": 0.87883768, - "epoch": 3.4959349593495936, - "grad_norm": 1.6875, - "learning_rate": 1.9900142653352353e-05, - "loss": 0.45111437, - "memory(GiB)": 16.34, - "step": 1290, - "train_speed(iter/s)": 0.139924 - }, - { - "acc": 0.81952734, - "epoch": 3.5094850948509486, - "grad_norm": 2.640625, - "learning_rate": 1.9365192582025676e-05, - "loss": 0.66096244, - "memory(GiB)": 16.34, - "step": 1295, - "train_speed(iter/s)": 0.139995 - }, - { - "acc": 0.82709446, - "epoch": 3.5230352303523036, - "grad_norm": 2.140625, - "learning_rate": 1.8830242510699e-05, - "loss": 0.58278303, - "memory(GiB)": 16.34, - "step": 1300, - "train_speed(iter/s)": 0.140068 - }, - { - "epoch": 3.5230352303523036, - "eval_acc": 0.603276473568654, - "eval_loss": 1.8986655473709106, - "eval_runtime": 44.3091, - "eval_samples_per_second": 0.858, - "eval_steps_per_second": 0.858, - "step": 1300 - }, - { - "acc": 0.83689537, - "epoch": 3.5365853658536586, - "grad_norm": 1.9453125, - "learning_rate": 1.8295292439372323e-05, - "loss": 0.55435977, - "memory(GiB)": 16.34, - "step": 1305, - "train_speed(iter/s)": 0.139476 - }, - { - "acc": 0.8525279, - "epoch": 3.5501355013550135, - "grad_norm": 2.21875, - "learning_rate": 1.776034236804565e-05, - "loss": 0.51831579, - "memory(GiB)": 16.34, - "step": 1310, - "train_speed(iter/s)": 0.139549 - }, - { - "acc": 0.83099527, - "epoch": 3.5636856368563685, - "grad_norm": 1.8828125, - "learning_rate": 1.7225392296718973e-05, - "loss": 0.61766572, - "memory(GiB)": 16.34, - "step": 1315, - "train_speed(iter/s)": 0.139617 - }, - { - "acc": 0.82115297, - "epoch": 3.5772357723577235, - "grad_norm": 2.765625, - "learning_rate": 1.6690442225392296e-05, - "loss": 0.62109137, - "memory(GiB)": 16.34, - "step": 1320, - "train_speed(iter/s)": 0.13969 - }, - { - "acc": 0.85132771, - "epoch": 3.5907859078590785, - "grad_norm": 2.421875, - "learning_rate": 1.615549215406562e-05, - "loss": 0.55242, - "memory(GiB)": 16.34, - "step": 1325, - "train_speed(iter/s)": 0.139763 - }, - { - "acc": 0.82085381, - "epoch": 3.6043360433604335, - "grad_norm": 2.8125, - "learning_rate": 1.5620542082738943e-05, - "loss": 0.62965093, - "memory(GiB)": 16.34, - "step": 1330, - "train_speed(iter/s)": 0.139837 - }, - { - "acc": 0.85592861, - "epoch": 3.617886178861789, - "grad_norm": 1.875, - "learning_rate": 1.5085592011412266e-05, - "loss": 0.54980545, - "memory(GiB)": 16.34, - "step": 1335, - "train_speed(iter/s)": 0.139907 - }, - { - "acc": 0.80786858, - "epoch": 3.6314363143631434, - "grad_norm": 2.328125, - "learning_rate": 1.4550641940085591e-05, - "loss": 0.67223382, - "memory(GiB)": 16.34, - "step": 1340, - "train_speed(iter/s)": 0.139974 - }, - { - "acc": 0.85045481, - "epoch": 3.644986449864499, - "grad_norm": 2.078125, - "learning_rate": 1.4015691868758914e-05, - "loss": 0.50758219, - "memory(GiB)": 16.34, - "step": 1345, - "train_speed(iter/s)": 0.140054 - }, - { - "acc": 0.85536547, - "epoch": 3.658536585365854, - "grad_norm": 1.7734375, - "learning_rate": 1.348074179743224e-05, - "loss": 0.53356686, - "memory(GiB)": 16.34, - "step": 1350, - "train_speed(iter/s)": 0.140124 - }, - { - "epoch": 3.658536585365854, - "eval_acc": 0.6010808984968755, - "eval_loss": 1.8972593545913696, - "eval_runtime": 44.2915, - "eval_samples_per_second": 0.858, - "eval_steps_per_second": 0.858, - "step": 1350 - }, - { - "acc": 0.8181016, - "epoch": 3.6720867208672088, - "grad_norm": 2.3125, - "learning_rate": 1.2945791726105563e-05, - "loss": 0.59544349, - "memory(GiB)": 16.34, - "step": 1355, - "train_speed(iter/s)": 0.139555 - }, - { - "acc": 0.87799835, - "epoch": 3.6856368563685638, - "grad_norm": 2.03125, - "learning_rate": 1.2410841654778886e-05, - "loss": 0.42857742, - "memory(GiB)": 16.34, - "step": 1360, - "train_speed(iter/s)": 0.139631 - }, - { - "acc": 0.81485729, - "epoch": 3.6991869918699187, - "grad_norm": 1.65625, - "learning_rate": 1.187589158345221e-05, - "loss": 0.64030995, - "memory(GiB)": 16.34, - "step": 1365, - "train_speed(iter/s)": 0.139703 - }, - { - "acc": 0.82011805, - "epoch": 3.7127371273712737, - "grad_norm": 2.125, - "learning_rate": 1.1340941512125534e-05, - "loss": 0.63428659, - "memory(GiB)": 16.34, - "step": 1370, - "train_speed(iter/s)": 0.139771 - }, - { - "acc": 0.78548884, - "epoch": 3.7262872628726287, - "grad_norm": 3.984375, - "learning_rate": 1.0805991440798858e-05, - "loss": 0.74558139, - "memory(GiB)": 16.34, - "step": 1375, - "train_speed(iter/s)": 0.139836 - }, - { - "acc": 0.85822716, - "epoch": 3.7398373983739837, - "grad_norm": 2.328125, - "learning_rate": 1.0271041369472183e-05, - "loss": 0.47486768, - "memory(GiB)": 16.34, - "step": 1380, - "train_speed(iter/s)": 0.139902 - }, - { - "acc": 0.83469105, - "epoch": 3.7533875338753386, - "grad_norm": 2.53125, - "learning_rate": 9.736091298145506e-06, - "loss": 0.56590233, - "memory(GiB)": 16.34, - "step": 1385, - "train_speed(iter/s)": 0.139971 - }, - { - "acc": 0.80486965, - "epoch": 3.7669376693766936, - "grad_norm": 3.28125, - "learning_rate": 9.20114122681883e-06, - "loss": 0.68021841, - "memory(GiB)": 16.34, - "step": 1390, - "train_speed(iter/s)": 0.140036 - }, - { - "acc": 0.84560328, - "epoch": 3.7804878048780486, - "grad_norm": 2.421875, - "learning_rate": 8.666191155492154e-06, - "loss": 0.52926989, - "memory(GiB)": 16.34, - "step": 1395, - "train_speed(iter/s)": 0.140102 - }, - { - "acc": 0.83154497, - "epoch": 3.794037940379404, - "grad_norm": 2.671875, - "learning_rate": 8.131241084165478e-06, - "loss": 0.5866107, - "memory(GiB)": 16.34, - "step": 1400, - "train_speed(iter/s)": 0.14017 - }, - { - "epoch": 3.794037940379404, - "eval_acc": 0.6036142543489276, - "eval_loss": 1.8983793258666992, - "eval_runtime": 44.4268, - "eval_samples_per_second": 0.855, - "eval_steps_per_second": 0.855, - "step": 1400 - }, - { - "acc": 0.84980259, - "epoch": 3.8075880758807585, - "grad_norm": 2.859375, - "learning_rate": 7.596291012838801e-06, - "loss": 0.56505013, - "memory(GiB)": 16.34, - "step": 1405, - "train_speed(iter/s)": 0.139621 - }, - { - "acc": 0.80126667, - "epoch": 3.821138211382114, - "grad_norm": 2.21875, - "learning_rate": 7.061340941512125e-06, - "loss": 0.67650762, - "memory(GiB)": 16.34, - "step": 1410, - "train_speed(iter/s)": 0.139688 - }, - { - "acc": 0.79974804, - "epoch": 3.834688346883469, - "grad_norm": 2.3125, - "learning_rate": 6.5263908701854486e-06, - "loss": 0.743855, - "memory(GiB)": 16.34, - "step": 1415, - "train_speed(iter/s)": 0.139749 - }, - { - "acc": 0.84023552, - "epoch": 3.848238482384824, - "grad_norm": 2.234375, - "learning_rate": 5.991440798858773e-06, - "loss": 0.57722979, - "memory(GiB)": 16.34, - "step": 1420, - "train_speed(iter/s)": 0.139813 - }, - { - "acc": 0.82472792, - "epoch": 3.861788617886179, - "grad_norm": 2.484375, - "learning_rate": 5.456490727532097e-06, - "loss": 0.56691737, - "memory(GiB)": 16.34, - "step": 1425, - "train_speed(iter/s)": 0.139877 - }, - { - "acc": 0.83573132, - "epoch": 3.875338753387534, - "grad_norm": 1.9765625, - "learning_rate": 4.92154065620542e-06, - "loss": 0.60298834, - "memory(GiB)": 16.34, - "step": 1430, - "train_speed(iter/s)": 0.13994 - }, - { - "acc": 0.83143454, - "epoch": 3.888888888888889, - "grad_norm": 1.9453125, - "learning_rate": 4.386590584878744e-06, - "loss": 0.61832671, - "memory(GiB)": 16.34, - "step": 1435, - "train_speed(iter/s)": 0.140003 - }, - { - "acc": 0.83209105, - "epoch": 3.902439024390244, - "grad_norm": 3.265625, - "learning_rate": 3.851640513552068e-06, - "loss": 0.6038147, - "memory(GiB)": 16.34, - "step": 1440, - "train_speed(iter/s)": 0.140065 - }, - { - "acc": 0.79715924, - "epoch": 3.915989159891599, - "grad_norm": 2.015625, - "learning_rate": 3.316690442225392e-06, - "loss": 0.70867634, - "memory(GiB)": 16.34, - "step": 1445, - "train_speed(iter/s)": 0.140125 - }, - { - "acc": 0.85556803, - "epoch": 3.9295392953929538, - "grad_norm": 1.7734375, - "learning_rate": 2.781740370898716e-06, - "loss": 0.52723417, - "memory(GiB)": 16.34, - "step": 1450, - "train_speed(iter/s)": 0.140182 - }, - { - "epoch": 3.9295392953929538, - "eval_acc": 0.6029386927883803, - "eval_loss": 1.8965932130813599, - "eval_runtime": 44.4125, - "eval_samples_per_second": 0.856, - "eval_steps_per_second": 0.856, - "step": 1450 - }, - { - "acc": 0.83306837, - "epoch": 3.943089430894309, - "grad_norm": 2.25, - "learning_rate": 2.2467902995720398e-06, - "loss": 0.55906692, - "memory(GiB)": 16.34, - "step": 1455, - "train_speed(iter/s)": 0.139645 - }, - { - "acc": 0.81574478, - "epoch": 3.9566395663956637, - "grad_norm": 2.6875, - "learning_rate": 1.7118402282453637e-06, - "loss": 0.62746248, - "memory(GiB)": 16.34, - "step": 1460, - "train_speed(iter/s)": 0.139708 - }, - { - "acc": 0.83905201, - "epoch": 3.970189701897019, - "grad_norm": 2.46875, - "learning_rate": 1.1768901569186875e-06, - "loss": 0.56978045, - "memory(GiB)": 16.34, - "step": 1465, - "train_speed(iter/s)": 0.139771 - }, - { - "acc": 0.85569115, - "epoch": 3.983739837398374, - "grad_norm": 1.90625, - "learning_rate": 6.419400855920114e-07, - "loss": 0.48882861, - "memory(GiB)": 16.34, - "step": 1470, - "train_speed(iter/s)": 0.139834 - }, - { - "acc": 0.82850809, - "epoch": 3.997289972899729, - "grad_norm": 1.828125, - "learning_rate": 1.0699001426533523e-07, - "loss": 0.56869435, - "memory(GiB)": 16.34, - "step": 1475, - "train_speed(iter/s)": 0.139901 - }, - { - "epoch": 4.0, - "eval_acc": 0.6037831447390644, - "eval_loss": 1.8978389501571655, - "eval_runtime": 44.2826, - "eval_samples_per_second": 0.858, - "eval_steps_per_second": 0.858, - "step": 1476 } ], "logging_steps": 5, - "max_steps": 1476, + "max_steps": 1020, "num_input_tokens_seen": 0, "num_train_epochs": 4, - "save_steps": 369, - "total_flos": 1.991427194376192e+17, - "train_batch_size": 1, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.605539502350213e+17, + "train_batch_size": 2, "trial_name": null, "trial_params": null }