{ "best_metric": 1.52509904, "best_model_checkpoint": "D:\\_____NEW_NN\\LLM\\MiniCPM-V\\finetune\\output\\phi3-vision-128k-instruct\\v0-20240629-080216\\checkpoint-300", "epoch": 3.5225048923679063, "eval_steps": 50, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc": 0.49833804, "epoch": 0.003913894324853229, "grad_norm": 0.77734375, "learning_rate": 2.745098039215686e-06, "loss": 2.37747383, "memory(GiB)": 17.35, "step": 1, "train_speed(iter/s)": 0.076826 }, { "acc": 0.50652587, "epoch": 0.019569471624266144, "grad_norm": 1.140625, "learning_rate": 1.372549019607843e-05, "loss": 2.29183841, "memory(GiB)": 19.33, "step": 5, "train_speed(iter/s)": 0.082188 }, { "acc": 0.52587533, "epoch": 0.03913894324853229, "grad_norm": 0.68359375, "learning_rate": 2.745098039215686e-05, "loss": 2.22724895, "memory(GiB)": 19.89, "step": 10, "train_speed(iter/s)": 0.082805 }, { "acc": 0.52128973, "epoch": 0.05870841487279843, "grad_norm": 0.8359375, "learning_rate": 4.117647058823529e-05, "loss": 2.27491264, "memory(GiB)": 19.24, "step": 15, "train_speed(iter/s)": 0.082482 }, { "acc": 0.51135335, "epoch": 0.07827788649706457, "grad_norm": 0.66015625, "learning_rate": 5.490196078431372e-05, "loss": 2.32762127, "memory(GiB)": 19.86, "step": 20, "train_speed(iter/s)": 0.082557 }, { "acc": 0.54442377, "epoch": 0.09784735812133072, "grad_norm": 0.65625, "learning_rate": 6.862745098039214e-05, "loss": 2.09772224, "memory(GiB)": 19.05, "step": 25, "train_speed(iter/s)": 0.082348 }, { "acc": 0.5545311, "epoch": 0.11741682974559686, "grad_norm": 0.62109375, "learning_rate": 8.235294117647058e-05, "loss": 2.00072975, "memory(GiB)": 19.89, "step": 30, "train_speed(iter/s)": 0.082166 }, { "acc": 0.57092514, "epoch": 0.136986301369863, "grad_norm": 0.9296875, "learning_rate": 9.6078431372549e-05, "loss": 1.94450474, "memory(GiB)": 19.16, "step": 35, "train_speed(iter/s)": 0.081966 }, { "acc": 0.56716595, "epoch": 0.15655577299412915, "grad_norm": 0.7734375, "learning_rate": 0.00010980392156862745, "loss": 1.90242462, "memory(GiB)": 19.62, "step": 40, "train_speed(iter/s)": 0.081987 }, { "acc": 0.57822714, "epoch": 0.1761252446183953, "grad_norm": 0.74609375, "learning_rate": 0.00012352941176470587, "loss": 1.83147659, "memory(GiB)": 19.99, "step": 45, "train_speed(iter/s)": 0.081878 }, { "acc": 0.57696896, "epoch": 0.19569471624266144, "grad_norm": 0.85546875, "learning_rate": 0.00013725490196078428, "loss": 1.82299595, "memory(GiB)": 19.11, "step": 50, "train_speed(iter/s)": 0.081843 }, { "epoch": 0.19569471624266144, "eval_acc": 0.583503534956795, "eval_loss": 1.8029242753982544, "eval_runtime": 85.1254, "eval_samples_per_second": 0.893, "eval_steps_per_second": 0.446, "step": 50 }, { "acc": 0.59343066, "epoch": 0.21526418786692758, "grad_norm": 1.0, "learning_rate": 0.0001399941138119636, "loss": 1.82339039, "memory(GiB)": 22.92, "step": 55, "train_speed(iter/s)": 0.072544 }, { "acc": 0.58571839, "epoch": 0.23483365949119372, "grad_norm": 0.7734375, "learning_rate": 0.00013997020286964757, "loss": 1.80549526, "memory(GiB)": 19.43, "step": 60, "train_speed(iter/s)": 0.073269 }, { "acc": 0.60369935, "epoch": 0.25440313111545987, "grad_norm": 0.99609375, "learning_rate": 0.0001399279055646442, "loss": 1.6768074, "memory(GiB)": 19.57, "step": 65, "train_speed(iter/s)": 0.073897 }, { "acc": 0.58763909, "epoch": 0.273972602739726, "grad_norm": 1.1640625, "learning_rate": 0.00013986723301159307, "loss": 1.79169483, "memory(GiB)": 19.48, "step": 70, "train_speed(iter/s)": 0.074533 }, { "acc": 0.58979025, "epoch": 0.29354207436399216, "grad_norm": 0.69140625, "learning_rate": 0.00013978820115367462, "loss": 1.72388344, "memory(GiB)": 19.35, "step": 75, "train_speed(iter/s)": 0.075045 }, { "acc": 0.59725327, "epoch": 0.3131115459882583, "grad_norm": 0.75, "learning_rate": 0.00013969083075842048, "loss": 1.70864868, "memory(GiB)": 19.49, "step": 80, "train_speed(iter/s)": 0.075523 }, { "acc": 0.60098982, "epoch": 0.33268101761252444, "grad_norm": 4.59375, "learning_rate": 0.00013957514741225646, "loss": 1.67311764, "memory(GiB)": 20.01, "step": 85, "train_speed(iter/s)": 0.075928 }, { "acc": 0.58315139, "epoch": 0.3522504892367906, "grad_norm": 0.8359375, "learning_rate": 0.00013944118151377894, "loss": 1.74437752, "memory(GiB)": 20.14, "step": 90, "train_speed(iter/s)": 0.076154 }, { "acc": 0.6138227, "epoch": 0.37181996086105673, "grad_norm": 0.75, "learning_rate": 0.0001392889682657671, "loss": 1.63750076, "memory(GiB)": 19.59, "step": 95, "train_speed(iter/s)": 0.076253 }, { "acc": 0.63383026, "epoch": 0.3913894324853229, "grad_norm": 0.8515625, "learning_rate": 0.00013911854766593233, "loss": 1.56653557, "memory(GiB)": 19.5, "step": 100, "train_speed(iter/s)": 0.076386 }, { "epoch": 0.3913894324853229, "eval_acc": 0.604241948153967, "eval_loss": 1.6681365966796875, "eval_runtime": 72.2811, "eval_samples_per_second": 1.051, "eval_steps_per_second": 0.526, "step": 100 }, { "acc": 0.61646304, "epoch": 0.410958904109589, "grad_norm": 0.73046875, "learning_rate": 0.00013892996449640807, "loss": 1.59651537, "memory(GiB)": 22.5, "step": 105, "train_speed(iter/s)": 0.072857 }, { "acc": 0.60897431, "epoch": 0.43052837573385516, "grad_norm": 0.83984375, "learning_rate": 0.00013872326831198205, "loss": 1.70257473, "memory(GiB)": 19.42, "step": 110, "train_speed(iter/s)": 0.073309 }, { "acc": 0.58328586, "epoch": 0.4500978473581213, "grad_norm": 0.9453125, "learning_rate": 0.00013849851342707462, "loss": 1.71216717, "memory(GiB)": 19.47, "step": 115, "train_speed(iter/s)": 0.073753 }, { "acc": 0.62397904, "epoch": 0.46966731898238745, "grad_norm": 0.80078125, "learning_rate": 0.0001382557589014664, "loss": 1.54239073, "memory(GiB)": 19.33, "step": 120, "train_speed(iter/s)": 0.074078 }, { "acc": 0.60271235, "epoch": 0.4892367906066536, "grad_norm": 1.171875, "learning_rate": 0.0001379950685247788, "loss": 1.72333088, "memory(GiB)": 19.37, "step": 125, "train_speed(iter/s)": 0.074428 }, { "acc": 0.5755064, "epoch": 0.5088062622309197, "grad_norm": 0.94921875, "learning_rate": 0.00013771651079971182, "loss": 1.81728477, "memory(GiB)": 19.52, "step": 130, "train_speed(iter/s)": 0.074768 }, { "acc": 0.5844254, "epoch": 0.5283757338551859, "grad_norm": 0.8515625, "learning_rate": 0.00013742015892404325, "loss": 1.77252998, "memory(GiB)": 19.51, "step": 135, "train_speed(iter/s)": 0.075066 }, { "acc": 0.5998323, "epoch": 0.547945205479452, "grad_norm": 0.8671875, "learning_rate": 0.0001371060907713942, "loss": 1.69012871, "memory(GiB)": 19.54, "step": 140, "train_speed(iter/s)": 0.07528 }, { "acc": 0.62686119, "epoch": 0.5675146771037182, "grad_norm": 0.68359375, "learning_rate": 0.00013677438887076603, "loss": 1.66314449, "memory(GiB)": 19.54, "step": 145, "train_speed(iter/s)": 0.075467 }, { "acc": 0.59954901, "epoch": 0.5870841487279843, "grad_norm": 0.6328125, "learning_rate": 0.00013642514038485367, "loss": 1.67525444, "memory(GiB)": 19.55, "step": 150, "train_speed(iter/s)": 0.075722 }, { "epoch": 0.5870841487279843, "eval_acc": 0.6184603299293009, "eval_loss": 1.5965631008148193, "eval_runtime": 72.3005, "eval_samples_per_second": 1.051, "eval_steps_per_second": 0.526, "step": 150 }, { "acc": 0.585955, "epoch": 0.6066536203522505, "grad_norm": 0.9375, "learning_rate": 0.00013605843708714162, "loss": 1.7486639, "memory(GiB)": 23.22, "step": 155, "train_speed(iter/s)": 0.073368 }, { "acc": 0.62769904, "epoch": 0.6262230919765166, "grad_norm": 0.7265625, "learning_rate": 0.00013567437533778826, "loss": 1.55238762, "memory(GiB)": 19.62, "step": 160, "train_speed(iter/s)": 0.073628 }, { "acc": 0.63651643, "epoch": 0.6457925636007827, "grad_norm": 0.80078125, "learning_rate": 0.00013527305605830488, "loss": 1.54306393, "memory(GiB)": 19.88, "step": 165, "train_speed(iter/s)": 0.073903 }, { "acc": 0.59288979, "epoch": 0.6653620352250489, "grad_norm": 0.703125, "learning_rate": 0.0001348545847050361, "loss": 1.69727612, "memory(GiB)": 19.58, "step": 170, "train_speed(iter/s)": 0.074077 }, { "acc": 0.61248484, "epoch": 0.684931506849315, "grad_norm": 0.9140625, "learning_rate": 0.00013441907124144866, "loss": 1.65900764, "memory(GiB)": 19.49, "step": 175, "train_speed(iter/s)": 0.074329 }, { "acc": 0.61740661, "epoch": 0.7045009784735812, "grad_norm": 0.90625, "learning_rate": 0.0001339666301092358, "loss": 1.6518961, "memory(GiB)": 19.68, "step": 180, "train_speed(iter/s)": 0.074558 }, { "acc": 0.62250223, "epoch": 0.7240704500978473, "grad_norm": 0.84765625, "learning_rate": 0.00013349738019824512, "loss": 1.55100412, "memory(GiB)": 19.34, "step": 185, "train_speed(iter/s)": 0.07477 }, { "acc": 0.61055808, "epoch": 0.7436399217221135, "grad_norm": 0.90625, "learning_rate": 0.00013301144481523718, "loss": 1.67241592, "memory(GiB)": 19.56, "step": 190, "train_speed(iter/s)": 0.075006 }, { "acc": 0.6389596, "epoch": 0.7632093933463796, "grad_norm": 0.83203125, "learning_rate": 0.00013250895165148384, "loss": 1.54227753, "memory(GiB)": 19.29, "step": 195, "train_speed(iter/s)": 0.075192 }, { "acc": 0.59149747, "epoch": 0.7827788649706457, "grad_norm": 0.68359375, "learning_rate": 0.00013199003274921416, "loss": 1.71190453, "memory(GiB)": 19.35, "step": 200, "train_speed(iter/s)": 0.075393 }, { "epoch": 0.7827788649706457, "eval_acc": 0.6241162608012569, "eval_loss": 1.5573129653930664, "eval_runtime": 69.5471, "eval_samples_per_second": 1.093, "eval_steps_per_second": 0.546, "step": 200 }, { "acc": 0.62623324, "epoch": 0.8023483365949119, "grad_norm": 0.81640625, "learning_rate": 0.00013145482446691724, "loss": 1.55779324, "memory(GiB)": 20.56, "step": 205, "train_speed(iter/s)": 0.073671 }, { "acc": 0.61495056, "epoch": 0.821917808219178, "grad_norm": 1.03125, "learning_rate": 0.00013090346744351058, "loss": 1.56424398, "memory(GiB)": 19.48, "step": 210, "train_speed(iter/s)": 0.073902 }, { "acc": 0.59643593, "epoch": 0.8414872798434442, "grad_norm": 1.0703125, "learning_rate": 0.00013033610656138395, "loss": 1.62190418, "memory(GiB)": 19.5, "step": 215, "train_speed(iter/s)": 0.074133 }, { "acc": 0.63052382, "epoch": 0.8610567514677103, "grad_norm": 0.59765625, "learning_rate": 0.00012975289090832792, "loss": 1.53521852, "memory(GiB)": 19.53, "step": 220, "train_speed(iter/s)": 0.074334 }, { "acc": 0.61408448, "epoch": 0.8806262230919765, "grad_norm": 0.7734375, "learning_rate": 0.00012915397373835754, "loss": 1.59712257, "memory(GiB)": 19.52, "step": 225, "train_speed(iter/s)": 0.074533 }, { "acc": 0.62307076, "epoch": 0.9001956947162426, "grad_norm": 0.66796875, "learning_rate": 0.00012853951243144105, "loss": 1.57903328, "memory(GiB)": 19.49, "step": 230, "train_speed(iter/s)": 0.074719 }, { "acc": 0.61717134, "epoch": 0.9197651663405088, "grad_norm": 0.84375, "learning_rate": 0.00012790966845214457, "loss": 1.61422024, "memory(GiB)": 19.25, "step": 235, "train_speed(iter/s)": 0.074916 }, { "acc": 0.62549253, "epoch": 0.9393346379647749, "grad_norm": 0.8125, "learning_rate": 0.0001272646073072033, "loss": 1.62806015, "memory(GiB)": 19.36, "step": 240, "train_speed(iter/s)": 0.0751 }, { "acc": 0.61903515, "epoch": 0.958904109589041, "grad_norm": 0.74609375, "learning_rate": 0.0001266044985020307, "loss": 1.55927486, "memory(GiB)": 19.36, "step": 245, "train_speed(iter/s)": 0.075266 }, { "acc": 0.61238952, "epoch": 0.9784735812133072, "grad_norm": 0.87890625, "learning_rate": 0.00012592951549617683, "loss": 1.52888412, "memory(GiB)": 19.33, "step": 250, "train_speed(iter/s)": 0.075438 }, { "epoch": 0.9784735812133072, "eval_acc": 0.6267085624509033, "eval_loss": 1.5281730890274048, "eval_runtime": 69.069, "eval_samples_per_second": 1.1, "eval_steps_per_second": 0.55, "step": 250 }, { "acc": 0.63230977, "epoch": 0.9980430528375733, "grad_norm": 0.84765625, "learning_rate": 0.00012523983565774753, "loss": 1.53058205, "memory(GiB)": 19.46, "step": 255, "train_speed(iter/s)": 0.074081 }, { "acc": 0.66042156, "epoch": 1.0176125244618395, "grad_norm": 0.76171875, "learning_rate": 0.00012453564021679692, "loss": 1.37123928, "memory(GiB)": 20.18, "step": 260, "train_speed(iter/s)": 0.074295 }, { "acc": 0.67253222, "epoch": 1.0371819960861057, "grad_norm": 0.76953125, "learning_rate": 0.00012381711421770455, "loss": 1.28407507, "memory(GiB)": 19.7, "step": 265, "train_speed(iter/s)": 0.074448 }, { "acc": 0.66850777, "epoch": 1.0567514677103718, "grad_norm": 0.98046875, "learning_rate": 0.0001230844464705507, "loss": 1.27961807, "memory(GiB)": 19.58, "step": 270, "train_speed(iter/s)": 0.07459 }, { "acc": 0.67196817, "epoch": 1.076320939334638, "grad_norm": 0.9140625, "learning_rate": 0.00012233782950150186, "loss": 1.28494987, "memory(GiB)": 19.61, "step": 275, "train_speed(iter/s)": 0.074728 }, { "acc": 0.67708378, "epoch": 1.095890410958904, "grad_norm": 0.87109375, "learning_rate": 0.00012157745950221989, "loss": 1.29551096, "memory(GiB)": 19.63, "step": 280, "train_speed(iter/s)": 0.074881 }, { "acc": 0.66973438, "epoch": 1.1154598825831703, "grad_norm": 1.0859375, "learning_rate": 0.0001208035362783079, "loss": 1.27705774, "memory(GiB)": 19.49, "step": 285, "train_speed(iter/s)": 0.075029 }, { "acc": 0.6750237, "epoch": 1.1350293542074363, "grad_norm": 1.0859375, "learning_rate": 0.00012001626319680648, "loss": 1.25660419, "memory(GiB)": 19.55, "step": 290, "train_speed(iter/s)": 0.07515 }, { "acc": 0.624368, "epoch": 1.1545988258317026, "grad_norm": 1.1953125, "learning_rate": 0.00011921584713275411, "loss": 1.5070508, "memory(GiB)": 19.52, "step": 295, "train_speed(iter/s)": 0.075278 }, { "acc": 0.66252189, "epoch": 1.1741682974559686, "grad_norm": 0.828125, "learning_rate": 0.0001184024984148257, "loss": 1.32014723, "memory(GiB)": 19.92, "step": 300, "train_speed(iter/s)": 0.075433 }, { "epoch": 1.1741682974559686, "eval_acc": 0.6282796543597801, "eval_loss": 1.5250990390777588, "eval_runtime": 70.3986, "eval_samples_per_second": 1.08, "eval_steps_per_second": 0.54, "step": 300 }, { "acc": 0.67028356, "epoch": 1.1937377690802349, "grad_norm": 1.7109375, "learning_rate": 0.00011757643077006372, "loss": 1.28037386, "memory(GiB)": 22.6, "step": 305, "train_speed(iter/s)": 0.074243 }, { "acc": 0.655305, "epoch": 1.213307240704501, "grad_norm": 1.1015625, "learning_rate": 0.00011673786126771617, "loss": 1.31057158, "memory(GiB)": 19.72, "step": 310, "train_speed(iter/s)": 0.074392 }, { "acc": 0.66528535, "epoch": 1.2328767123287672, "grad_norm": 1.6171875, "learning_rate": 0.0001158870102621965, "loss": 1.29698696, "memory(GiB)": 19.08, "step": 315, "train_speed(iter/s)": 0.074534 }, { "acc": 0.66950455, "epoch": 1.2524461839530332, "grad_norm": 1.2421875, "learning_rate": 0.00011502410133517998, "loss": 1.27706356, "memory(GiB)": 19.87, "step": 320, "train_speed(iter/s)": 0.074667 }, { "acc": 0.65843534, "epoch": 1.2720156555772995, "grad_norm": 1.2265625, "learning_rate": 0.0001141493612368524, "loss": 1.30308371, "memory(GiB)": 19.87, "step": 325, "train_speed(iter/s)": 0.0748 }, { "acc": 0.66441913, "epoch": 1.2915851272015655, "grad_norm": 1.2578125, "learning_rate": 0.00011326301982632583, "loss": 1.26109972, "memory(GiB)": 19.09, "step": 330, "train_speed(iter/s)": 0.074935 }, { "acc": 0.68711085, "epoch": 1.3111545988258317, "grad_norm": 0.95703125, "learning_rate": 0.00011236531001123771, "loss": 1.19278584, "memory(GiB)": 19.73, "step": 335, "train_speed(iter/s)": 0.075053 }, { "acc": 0.66676803, "epoch": 1.3307240704500978, "grad_norm": 1.96875, "learning_rate": 0.0001114564676865486, "loss": 1.3068346, "memory(GiB)": 19.84, "step": 340, "train_speed(iter/s)": 0.075151 }, { "acc": 0.66865935, "epoch": 1.350293542074364, "grad_norm": 1.2421875, "learning_rate": 0.00011053673167255516, "loss": 1.30573978, "memory(GiB)": 19.66, "step": 345, "train_speed(iter/s)": 0.075271 }, { "acc": 0.66606102, "epoch": 1.36986301369863, "grad_norm": 0.76171875, "learning_rate": 0.00010960634365213437, "loss": 1.26872787, "memory(GiB)": 19.73, "step": 350, "train_speed(iter/s)": 0.075377 }, { "epoch": 1.36986301369863, "eval_acc": 0.6315003927729772, "eval_loss": 1.5066882371902466, "eval_runtime": 72.5685, "eval_samples_per_second": 1.047, "eval_steps_per_second": 0.524, "step": 350 }, { "acc": 0.67307239, "epoch": 1.3894324853228963, "grad_norm": 1.1796875, "learning_rate": 0.0001086655481072354, "loss": 1.27917318, "memory(GiB)": 22.92, "step": 355, "train_speed(iter/s)": 0.074318 }, { "acc": 0.65870218, "epoch": 1.4090019569471623, "grad_norm": 3.609375, "learning_rate": 0.00010771459225463617, "loss": 1.33731461, "memory(GiB)": 19.67, "step": 360, "train_speed(iter/s)": 0.074416 }, { "acc": 0.68150563, "epoch": 1.4285714285714286, "grad_norm": 0.9296875, "learning_rate": 0.00010675372598098113, "loss": 1.20515957, "memory(GiB)": 19.99, "step": 365, "train_speed(iter/s)": 0.07451 }, { "acc": 0.66793504, "epoch": 1.4481409001956946, "grad_norm": 1.03125, "learning_rate": 0.00010578320177711743, "loss": 1.31133595, "memory(GiB)": 19.9, "step": 370, "train_speed(iter/s)": 0.074613 }, { "acc": 0.66840873, "epoch": 1.467710371819961, "grad_norm": 0.9453125, "learning_rate": 0.00010480327467174705, "loss": 1.27730675, "memory(GiB)": 19.91, "step": 375, "train_speed(iter/s)": 0.074709 }, { "acc": 0.6621439, "epoch": 1.487279843444227, "grad_norm": 0.7890625, "learning_rate": 0.00010381420216441152, "loss": 1.29670372, "memory(GiB)": 19.65, "step": 380, "train_speed(iter/s)": 0.074824 }, { "acc": 0.66805882, "epoch": 1.5068493150684932, "grad_norm": 0.8203125, "learning_rate": 0.00010281624415782804, "loss": 1.23922901, "memory(GiB)": 19.77, "step": 385, "train_speed(iter/s)": 0.074927 }, { "acc": 0.66435666, "epoch": 1.5264187866927594, "grad_norm": 0.82421875, "learning_rate": 0.0001018096628895935, "loss": 1.27945633, "memory(GiB)": 19.79, "step": 390, "train_speed(iter/s)": 0.075033 }, { "acc": 0.68444743, "epoch": 1.5459882583170255, "grad_norm": 0.98046875, "learning_rate": 0.00010079472286327533, "loss": 1.2325819, "memory(GiB)": 19.55, "step": 395, "train_speed(iter/s)": 0.075133 }, { "acc": 0.68633671, "epoch": 1.5655577299412915, "grad_norm": 1.171875, "learning_rate": 9.977169077890672e-05, "loss": 1.26248102, "memory(GiB)": 19.79, "step": 400, "train_speed(iter/s)": 0.075233 }, { "epoch": 1.5655577299412915, "eval_acc": 0.6297721916732129, "eval_loss": 1.5114485025405884, "eval_runtime": 70.7985, "eval_samples_per_second": 1.073, "eval_steps_per_second": 0.537, "step": 400 }, { "acc": 0.67859097, "epoch": 1.5851272015655578, "grad_norm": 1.046875, "learning_rate": 9.874083546290482e-05, "loss": 1.2065486, "memory(GiB)": 22.72, "step": 405, "train_speed(iter/s)": 0.074347 }, { "acc": 0.66178751, "epoch": 1.604696673189824, "grad_norm": 0.96484375, "learning_rate": 9.770242779743008e-05, "loss": 1.30969448, "memory(GiB)": 20.13, "step": 410, "train_speed(iter/s)": 0.074453 }, { "acc": 0.65872512, "epoch": 1.62426614481409, "grad_norm": 0.74609375, "learning_rate": 9.665674064920533e-05, "loss": 1.27483397, "memory(GiB)": 20.17, "step": 415, "train_speed(iter/s)": 0.074534 }, { "acc": 0.66567349, "epoch": 1.643835616438356, "grad_norm": 0.87109375, "learning_rate": 9.560404879781353e-05, "loss": 1.31585007, "memory(GiB)": 20.07, "step": 420, "train_speed(iter/s)": 0.074639 }, { "acc": 0.66216898, "epoch": 1.6634050880626223, "grad_norm": 0.85546875, "learning_rate": 9.454462886349281e-05, "loss": 1.32738457, "memory(GiB)": 19.43, "step": 425, "train_speed(iter/s)": 0.074732 }, { "acc": 0.6608973, "epoch": 1.6829745596868886, "grad_norm": 1.1328125, "learning_rate": 9.347875923444772e-05, "loss": 1.2792593, "memory(GiB)": 20.05, "step": 430, "train_speed(iter/s)": 0.074827 }, { "acc": 0.65830297, "epoch": 1.7025440313111546, "grad_norm": 0.94921875, "learning_rate": 9.240671999369607e-05, "loss": 1.34132614, "memory(GiB)": 19.82, "step": 435, "train_speed(iter/s)": 0.074914 }, { "acc": 0.68926673, "epoch": 1.7221135029354206, "grad_norm": 0.76953125, "learning_rate": 9.132879284547038e-05, "loss": 1.15266266, "memory(GiB)": 19.28, "step": 440, "train_speed(iter/s)": 0.074997 }, { "acc": 0.65699558, "epoch": 1.741682974559687, "grad_norm": 0.96484375, "learning_rate": 9.024526104119312e-05, "loss": 1.32417459, "memory(GiB)": 19.29, "step": 445, "train_speed(iter/s)": 0.075079 }, { "acc": 0.68860197, "epoch": 1.7612524461839532, "grad_norm": 0.8203125, "learning_rate": 8.91564093050458e-05, "loss": 1.20134068, "memory(GiB)": 19.33, "step": 450, "train_speed(iter/s)": 0.07515 }, { "epoch": 1.7612524461839532, "eval_acc": 0.6351924587588373, "eval_loss": 1.4908838272094727, "eval_runtime": 71.5161, "eval_samples_per_second": 1.063, "eval_steps_per_second": 0.531, "step": 450 }, { "acc": 0.65404687, "epoch": 1.7808219178082192, "grad_norm": 1.0078125, "learning_rate": 8.806252375915052e-05, "loss": 1.31502724, "memory(GiB)": 19.13, "step": 455, "train_speed(iter/s)": 0.074358 }, { "acc": 0.69379678, "epoch": 1.8003913894324852, "grad_norm": 1.1015625, "learning_rate": 8.696389184838471e-05, "loss": 1.1870966, "memory(GiB)": 20.18, "step": 460, "train_speed(iter/s)": 0.074437 }, { "acc": 0.67447538, "epoch": 1.8199608610567515, "grad_norm": 1.2890625, "learning_rate": 8.586080226484789e-05, "loss": 1.19511604, "memory(GiB)": 20.09, "step": 465, "train_speed(iter/s)": 0.074531 }, { "acc": 0.67230067, "epoch": 1.8395303326810177, "grad_norm": 1.0390625, "learning_rate": 8.475354487200092e-05, "loss": 1.30591021, "memory(GiB)": 19.29, "step": 470, "train_speed(iter/s)": 0.074608 }, { "acc": 0.65006552, "epoch": 1.8590998043052838, "grad_norm": 3.21875, "learning_rate": 8.364241062849732e-05, "loss": 1.35613279, "memory(GiB)": 19.51, "step": 475, "train_speed(iter/s)": 0.07469 }, { "acc": 0.66248426, "epoch": 1.8786692759295498, "grad_norm": 1.0703125, "learning_rate": 8.252769151172682e-05, "loss": 1.34706697, "memory(GiB)": 19.16, "step": 480, "train_speed(iter/s)": 0.074779 }, { "acc": 0.66462736, "epoch": 1.898238747553816, "grad_norm": 0.8515625, "learning_rate": 8.140968044109134e-05, "loss": 1.31343336, "memory(GiB)": 19.17, "step": 485, "train_speed(iter/s)": 0.07486 }, { "acc": 0.65373287, "epoch": 1.9178082191780823, "grad_norm": 1.078125, "learning_rate": 8.028867120103326e-05, "loss": 1.31145601, "memory(GiB)": 19.46, "step": 490, "train_speed(iter/s)": 0.074941 }, { "acc": 0.6731041, "epoch": 1.9373776908023483, "grad_norm": 0.89453125, "learning_rate": 7.916495836383648e-05, "loss": 1.24272699, "memory(GiB)": 19.45, "step": 495, "train_speed(iter/s)": 0.075011 }, { "acc": 0.66485052, "epoch": 1.9569471624266144, "grad_norm": 1.03125, "learning_rate": 7.80388372122204e-05, "loss": 1.28164721, "memory(GiB)": 19.24, "step": 500, "train_speed(iter/s)": 0.07509 }, { "epoch": 1.9569471624266144, "eval_acc": 0.6349567949725059, "eval_loss": 1.483258843421936, "eval_runtime": 72.4797, "eval_samples_per_second": 1.049, "eval_steps_per_second": 0.524, "step": 500 }, { "acc": 0.68325486, "epoch": 1.9765166340508806, "grad_norm": 1.2890625, "learning_rate": 7.691060366174728e-05, "loss": 1.2257865, "memory(GiB)": 22.98, "step": 505, "train_speed(iter/s)": 0.074371 }, { "acc": 0.68977013, "epoch": 1.9960861056751469, "grad_norm": 1.0234375, "learning_rate": 7.578055418306327e-05, "loss": 1.25723343, "memory(GiB)": 19.56, "step": 510, "train_speed(iter/s)": 0.074471 }, { "acc": 0.72185702, "epoch": 2.015655577299413, "grad_norm": 0.7890625, "learning_rate": 7.464898572399353e-05, "loss": 1.01715631, "memory(GiB)": 20.07, "step": 515, "train_speed(iter/s)": 0.074591 }, { "acc": 0.71889682, "epoch": 2.035225048923679, "grad_norm": 1.0625, "learning_rate": 7.351619563151208e-05, "loss": 1.03077154, "memory(GiB)": 19.92, "step": 520, "train_speed(iter/s)": 0.074683 }, { "acc": 0.7505311, "epoch": 2.0547945205479454, "grad_norm": 1.9609375, "learning_rate": 7.238248157360663e-05, "loss": 0.93218956, "memory(GiB)": 19.85, "step": 525, "train_speed(iter/s)": 0.07477 }, { "acc": 0.7315311, "epoch": 2.0743639921722115, "grad_norm": 1.1875, "learning_rate": 7.124814146105921e-05, "loss": 0.96330833, "memory(GiB)": 19.87, "step": 530, "train_speed(iter/s)": 0.074853 }, { "acc": 0.75555606, "epoch": 2.0939334637964775, "grad_norm": 1.3515625, "learning_rate": 7.011347336916277e-05, "loss": 0.86877937, "memory(GiB)": 18.46, "step": 535, "train_speed(iter/s)": 0.074938 }, { "acc": 0.74034052, "epoch": 2.1135029354207435, "grad_norm": 1.546875, "learning_rate": 6.897877545939475e-05, "loss": 0.90922012, "memory(GiB)": 19.89, "step": 540, "train_speed(iter/s)": 0.075027 }, { "acc": 0.72400937, "epoch": 2.1330724070450096, "grad_norm": 1.90625, "learning_rate": 6.784434590106808e-05, "loss": 0.98424711, "memory(GiB)": 19.11, "step": 545, "train_speed(iter/s)": 0.075114 }, { "acc": 0.77706275, "epoch": 2.152641878669276, "grad_norm": 1.359375, "learning_rate": 6.671048279297972e-05, "loss": 0.80820856, "memory(GiB)": 19.86, "step": 550, "train_speed(iter/s)": 0.075193 }, { "epoch": 2.152641878669276, "eval_acc": 0.6260015710919089, "eval_loss": 1.6081812381744385, "eval_runtime": 68.6973, "eval_samples_per_second": 1.106, "eval_steps_per_second": 0.553, "step": 550 }, { "acc": 0.75351696, "epoch": 2.172211350293542, "grad_norm": 2.015625, "learning_rate": 6.55774840850782e-05, "loss": 0.86192131, "memory(GiB)": 22.21, "step": 555, "train_speed(iter/s)": 0.074578 }, { "acc": 0.74249997, "epoch": 2.191780821917808, "grad_norm": 1.4609375, "learning_rate": 6.444564750017003e-05, "loss": 0.91982813, "memory(GiB)": 19.87, "step": 560, "train_speed(iter/s)": 0.074665 }, { "acc": 0.73636398, "epoch": 2.2113502935420746, "grad_norm": 1.9375, "learning_rate": 6.331527045568573e-05, "loss": 0.93448582, "memory(GiB)": 19.33, "step": 565, "train_speed(iter/s)": 0.074752 }, { "acc": 0.74081583, "epoch": 2.2309197651663406, "grad_norm": 2.21875, "learning_rate": 6.218664998552634e-05, "loss": 0.94956303, "memory(GiB)": 19.8, "step": 570, "train_speed(iter/s)": 0.074842 }, { "acc": 0.74573116, "epoch": 2.2504892367906066, "grad_norm": 2.546875, "learning_rate": 6.106008266201046e-05, "loss": 0.88486786, "memory(GiB)": 19.92, "step": 575, "train_speed(iter/s)": 0.074925 }, { "acc": 0.75495067, "epoch": 2.2700587084148727, "grad_norm": 2.09375, "learning_rate": 5.9935864517942844e-05, "loss": 0.84776802, "memory(GiB)": 19.89, "step": 580, "train_speed(iter/s)": 0.075 }, { "acc": 0.74743519, "epoch": 2.2896281800391387, "grad_norm": 1.5859375, "learning_rate": 5.881429096882449e-05, "loss": 0.92330503, "memory(GiB)": 19.03, "step": 585, "train_speed(iter/s)": 0.075076 }, { "acc": 0.74913769, "epoch": 2.309197651663405, "grad_norm": 1.6640625, "learning_rate": 5.769565673522515e-05, "loss": 0.92942295, "memory(GiB)": 20.04, "step": 590, "train_speed(iter/s)": 0.075149 }, { "acc": 0.74875064, "epoch": 2.328767123287671, "grad_norm": 1.25, "learning_rate": 5.658025576533832e-05, "loss": 0.90142069, "memory(GiB)": 19.96, "step": 595, "train_speed(iter/s)": 0.075215 }, { "acc": 0.74648356, "epoch": 2.3483365949119372, "grad_norm": 1.65625, "learning_rate": 5.546838115773929e-05, "loss": 0.91528139, "memory(GiB)": 19.84, "step": 600, "train_speed(iter/s)": 0.075292 }, { "epoch": 2.3483365949119372, "eval_acc": 0.6284367635506677, "eval_loss": 1.593437910079956, "eval_runtime": 68.9856, "eval_samples_per_second": 1.102, "eval_steps_per_second": 0.551, "step": 600 }, { "acc": 0.75246172, "epoch": 2.3679060665362037, "grad_norm": 1.2109375, "learning_rate": 5.4360325084366416e-05, "loss": 0.87402363, "memory(GiB)": 22.69, "step": 605, "train_speed(iter/s)": 0.074706 }, { "acc": 0.74078665, "epoch": 2.3874755381604698, "grad_norm": 1.0390625, "learning_rate": 5.3256378713745815e-05, "loss": 0.91142588, "memory(GiB)": 20.15, "step": 610, "train_speed(iter/s)": 0.074788 }, { "acc": 0.75772052, "epoch": 2.407045009784736, "grad_norm": 2.03125, "learning_rate": 5.21568321344799e-05, "loss": 0.85517597, "memory(GiB)": 19.37, "step": 615, "train_speed(iter/s)": 0.074857 }, { "acc": 0.75341692, "epoch": 2.426614481409002, "grad_norm": 1.40625, "learning_rate": 5.10619742790194e-05, "loss": 0.87981377, "memory(GiB)": 18.91, "step": 620, "train_speed(iter/s)": 0.074925 }, { "acc": 0.76221485, "epoch": 2.446183953033268, "grad_norm": 5.5625, "learning_rate": 4.9972092847739603e-05, "loss": 0.89623175, "memory(GiB)": 20.27, "step": 625, "train_speed(iter/s)": 0.074994 }, { "acc": 0.74322577, "epoch": 2.4657534246575343, "grad_norm": 1.6796875, "learning_rate": 4.8887474233339963e-05, "loss": 0.89493027, "memory(GiB)": 19.38, "step": 630, "train_speed(iter/s)": 0.075068 }, { "acc": 0.74455509, "epoch": 2.4853228962818004, "grad_norm": 1.3046875, "learning_rate": 4.780840344558753e-05, "loss": 0.92399101, "memory(GiB)": 19.32, "step": 635, "train_speed(iter/s)": 0.075143 }, { "acc": 0.75597148, "epoch": 2.5048923679060664, "grad_norm": 1.65625, "learning_rate": 4.673516403642383e-05, "loss": 0.86396818, "memory(GiB)": 19.52, "step": 640, "train_speed(iter/s)": 0.075214 }, { "acc": 0.75100412, "epoch": 2.524461839530333, "grad_norm": 1.5390625, "learning_rate": 4.5668038025454554e-05, "loss": 0.89630232, "memory(GiB)": 19.54, "step": 645, "train_speed(iter/s)": 0.07528 }, { "acc": 0.74814, "epoch": 2.544031311154599, "grad_norm": 1.7265625, "learning_rate": 4.460730582584228e-05, "loss": 0.90660105, "memory(GiB)": 19.46, "step": 650, "train_speed(iter/s)": 0.075343 }, { "epoch": 2.544031311154599, "eval_acc": 0.6304006284367636, "eval_loss": 1.6207610368728638, "eval_runtime": 68.9365, "eval_samples_per_second": 1.102, "eval_steps_per_second": 0.551, "step": 650 }, { "acc": 0.74153934, "epoch": 2.563600782778865, "grad_norm": 2.328125, "learning_rate": 4.3553246170621e-05, "loss": 0.90404129, "memory(GiB)": 19.38, "step": 655, "train_speed(iter/s)": 0.074813 }, { "acc": 0.76082869, "epoch": 2.583170254403131, "grad_norm": 1.5390625, "learning_rate": 4.2506136039452357e-05, "loss": 0.90251627, "memory(GiB)": 20.24, "step": 660, "train_speed(iter/s)": 0.074877 }, { "acc": 0.76424356, "epoch": 2.602739726027397, "grad_norm": 1.109375, "learning_rate": 4.146625058584251e-05, "loss": 0.85076065, "memory(GiB)": 19.4, "step": 665, "train_speed(iter/s)": 0.07494 }, { "acc": 0.75788155, "epoch": 2.6223091976516635, "grad_norm": 1.828125, "learning_rate": 4.043386306483886e-05, "loss": 0.8638917, "memory(GiB)": 18.71, "step": 670, "train_speed(iter/s)": 0.075 }, { "acc": 0.74567804, "epoch": 2.6418786692759295, "grad_norm": 1.5078125, "learning_rate": 3.940924476122573e-05, "loss": 0.91406345, "memory(GiB)": 19.53, "step": 675, "train_speed(iter/s)": 0.075062 }, { "acc": 0.77229648, "epoch": 2.6614481409001955, "grad_norm": 1.3984375, "learning_rate": 3.839266491823776e-05, "loss": 0.79556112, "memory(GiB)": 19.59, "step": 680, "train_speed(iter/s)": 0.075125 }, { "acc": 0.7331708, "epoch": 2.681017612524462, "grad_norm": 1.6015625, "learning_rate": 3.73843906668096e-05, "loss": 0.95133247, "memory(GiB)": 19.69, "step": 685, "train_speed(iter/s)": 0.075185 }, { "acc": 0.76955137, "epoch": 2.700587084148728, "grad_norm": 1.4140625, "learning_rate": 3.6384686955380996e-05, "loss": 0.82770052, "memory(GiB)": 19.53, "step": 690, "train_speed(iter/s)": 0.075245 }, { "acc": 0.73245583, "epoch": 2.720156555772994, "grad_norm": 1.59375, "learning_rate": 3.539381648027495e-05, "loss": 0.93347349, "memory(GiB)": 19.38, "step": 695, "train_speed(iter/s)": 0.075313 }, { "acc": 0.7664053, "epoch": 2.73972602739726, "grad_norm": 1.4296875, "learning_rate": 3.441203961666818e-05, "loss": 0.84118309, "memory(GiB)": 19.55, "step": 700, "train_speed(iter/s)": 0.075373 }, { "epoch": 2.73972602739726, "eval_acc": 0.628750981932443, "eval_loss": 1.5982366800308228, "eval_runtime": 69.1268, "eval_samples_per_second": 1.099, "eval_steps_per_second": 0.55, "step": 700 }, { "acc": 0.74386759, "epoch": 2.759295499021526, "grad_norm": 2.21875, "learning_rate": 3.343961435017094e-05, "loss": 0.92712116, "memory(GiB)": 23.1, "step": 705, "train_speed(iter/s)": 0.074881 }, { "acc": 0.75352135, "epoch": 2.7788649706457926, "grad_norm": 1.5625, "learning_rate": 3.247679620903533e-05, "loss": 0.90610752, "memory(GiB)": 19.56, "step": 710, "train_speed(iter/s)": 0.074934 }, { "acc": 0.75765467, "epoch": 2.7984344422700587, "grad_norm": 4.4375, "learning_rate": 3.1523838197008956e-05, "loss": 0.88628139, "memory(GiB)": 19.44, "step": 715, "train_speed(iter/s)": 0.074999 }, { "acc": 0.763375, "epoch": 2.8180039138943247, "grad_norm": 1.1640625, "learning_rate": 3.058099072685204e-05, "loss": 0.86159172, "memory(GiB)": 19.5, "step": 720, "train_speed(iter/s)": 0.075059 }, { "acc": 0.75694184, "epoch": 2.837573385518591, "grad_norm": 1.6171875, "learning_rate": 2.964850155453543e-05, "loss": 0.85433092, "memory(GiB)": 19.38, "step": 725, "train_speed(iter/s)": 0.075121 }, { "acc": 0.76086893, "epoch": 2.857142857142857, "grad_norm": 1.5859375, "learning_rate": 2.8726615714136827e-05, "loss": 0.8608798, "memory(GiB)": 19.58, "step": 730, "train_speed(iter/s)": 0.075181 }, { "acc": 0.74008894, "epoch": 2.8767123287671232, "grad_norm": 1.4375, "learning_rate": 2.7815575453452058e-05, "loss": 0.98413734, "memory(GiB)": 19.59, "step": 735, "train_speed(iter/s)": 0.075242 }, { "acc": 0.75941825, "epoch": 2.8962818003913893, "grad_norm": 1.7734375, "learning_rate": 2.6915620170338612e-05, "loss": 0.85438929, "memory(GiB)": 19.39, "step": 740, "train_speed(iter/s)": 0.075307 }, { "acc": 0.77891464, "epoch": 2.9158512720156553, "grad_norm": 1.7265625, "learning_rate": 2.6026986349808058e-05, "loss": 0.79716868, "memory(GiB)": 19.61, "step": 745, "train_speed(iter/s)": 0.075361 }, { "acc": 0.75023217, "epoch": 2.935420743639922, "grad_norm": 1.28125, "learning_rate": 2.514990750188399e-05, "loss": 0.85774508, "memory(GiB)": 18.86, "step": 750, "train_speed(iter/s)": 0.075417 }, { "epoch": 2.935420743639922, "eval_acc": 0.6324430479183032, "eval_loss": 1.5986852645874023, "eval_runtime": 69.3348, "eval_samples_per_second": 1.096, "eval_steps_per_second": 0.548, "step": 750 }, { "acc": 0.74531512, "epoch": 2.954990215264188, "grad_norm": 1.5625, "learning_rate": 2.4284614100241538e-05, "loss": 0.93483381, "memory(GiB)": 23.14, "step": 755, "train_speed(iter/s)": 0.074953 }, { "acc": 0.76761031, "epoch": 2.974559686888454, "grad_norm": 1.6171875, "learning_rate": 2.343133352164477e-05, "loss": 0.84630623, "memory(GiB)": 19.36, "step": 760, "train_speed(iter/s)": 0.075015 }, { "acc": 0.75018072, "epoch": 2.9941291585127203, "grad_norm": 1.5703125, "learning_rate": 2.2590289986198136e-05, "loss": 0.89352074, "memory(GiB)": 19.6, "step": 765, "train_speed(iter/s)": 0.075072 }, { "acc": 0.80383377, "epoch": 3.0136986301369864, "grad_norm": 1.453125, "learning_rate": 2.1761704498427003e-05, "loss": 0.68276234, "memory(GiB)": 19.62, "step": 770, "train_speed(iter/s)": 0.075153 }, { "acc": 0.82252359, "epoch": 3.0332681017612524, "grad_norm": 1.328125, "learning_rate": 2.094579478920358e-05, "loss": 0.64008789, "memory(GiB)": 19.76, "step": 775, "train_speed(iter/s)": 0.075213 }, { "acc": 0.83448801, "epoch": 3.0528375733855184, "grad_norm": 1.8828125, "learning_rate": 2.0142775258532654e-05, "loss": 0.61610913, "memory(GiB)": 19.59, "step": 780, "train_speed(iter/s)": 0.075271 }, { "acc": 0.83116817, "epoch": 3.072407045009785, "grad_norm": 1.5546875, "learning_rate": 1.9352856919212994e-05, "loss": 0.58688097, "memory(GiB)": 19.53, "step": 785, "train_speed(iter/s)": 0.075323 }, { "acc": 0.82525949, "epoch": 3.091976516634051, "grad_norm": 1.4375, "learning_rate": 1.8576247341388544e-05, "loss": 0.62312498, "memory(GiB)": 19.85, "step": 790, "train_speed(iter/s)": 0.07537 }, { "acc": 0.81645441, "epoch": 3.111545988258317, "grad_norm": 1.65625, "learning_rate": 1.7813150598004313e-05, "loss": 0.62203112, "memory(GiB)": 19.79, "step": 795, "train_speed(iter/s)": 0.075423 }, { "acc": 0.83432789, "epoch": 3.131115459882583, "grad_norm": 1.5859375, "learning_rate": 1.7063767211181333e-05, "loss": 0.60077624, "memory(GiB)": 19.52, "step": 800, "train_speed(iter/s)": 0.07548 }, { "epoch": 3.131115459882583, "eval_acc": 0.6209740769835035, "eval_loss": 1.7955598831176758, "eval_runtime": 69.0109, "eval_samples_per_second": 1.101, "eval_steps_per_second": 0.551, "step": 800 }, { "acc": 0.82124023, "epoch": 3.1506849315068495, "grad_norm": 1.7578125, "learning_rate": 1.6328294099524644e-05, "loss": 0.60847788, "memory(GiB)": 22.65, "step": 805, "train_speed(iter/s)": 0.075043 }, { "acc": 0.83265171, "epoch": 3.1702544031311155, "grad_norm": 4.09375, "learning_rate": 1.5606924526378136e-05, "loss": 0.57863126, "memory(GiB)": 18.89, "step": 810, "train_speed(iter/s)": 0.07509 }, { "acc": 0.8407362, "epoch": 3.1898238747553815, "grad_norm": 1.1796875, "learning_rate": 1.4899848049039881e-05, "loss": 0.53706379, "memory(GiB)": 19.37, "step": 815, "train_speed(iter/s)": 0.075142 }, { "acc": 0.82116756, "epoch": 3.2093933463796476, "grad_norm": 1.859375, "learning_rate": 1.4207250468951426e-05, "loss": 0.64039102, "memory(GiB)": 19.52, "step": 820, "train_speed(iter/s)": 0.075197 }, { "acc": 0.85004549, "epoch": 3.228962818003914, "grad_norm": 1.0390625, "learning_rate": 1.3529313782874023e-05, "loss": 0.53315983, "memory(GiB)": 19.52, "step": 825, "train_speed(iter/s)": 0.07525 }, { "acc": 0.83273296, "epoch": 3.24853228962818, "grad_norm": 1.578125, "learning_rate": 1.2866216135064487e-05, "loss": 0.58545351, "memory(GiB)": 19.36, "step": 830, "train_speed(iter/s)": 0.075303 }, { "acc": 0.80788403, "epoch": 3.268101761252446, "grad_norm": 2.296875, "learning_rate": 1.2218131770463487e-05, "loss": 0.67468171, "memory(GiB)": 19.28, "step": 835, "train_speed(iter/s)": 0.075356 }, { "acc": 0.8440134, "epoch": 3.287671232876712, "grad_norm": 1.21875, "learning_rate": 1.1585230988908576e-05, "loss": 0.55293651, "memory(GiB)": 19.37, "step": 840, "train_speed(iter/s)": 0.07541 }, { "acc": 0.81569691, "epoch": 3.3072407045009786, "grad_norm": 1.671875, "learning_rate": 1.0967680100383645e-05, "loss": 0.61190109, "memory(GiB)": 18.09, "step": 845, "train_speed(iter/s)": 0.075466 }, { "acc": 0.84766483, "epoch": 3.3268101761252447, "grad_norm": 1.8046875, "learning_rate": 1.0365641381317113e-05, "loss": 0.52525816, "memory(GiB)": 19.31, "step": 850, "train_speed(iter/s)": 0.075523 }, { "epoch": 3.3268101761252447, "eval_acc": 0.6203456402199529, "eval_loss": 1.7881730794906616, "eval_runtime": 69.1552, "eval_samples_per_second": 1.099, "eval_steps_per_second": 0.549, "step": 850 }, { "acc": 0.84491625, "epoch": 3.3463796477495107, "grad_norm": 1.8046875, "learning_rate": 9.779273031939692e-06, "loss": 0.56272998, "memory(GiB)": 23.04, "step": 855, "train_speed(iter/s)": 0.07511 }, { "acc": 0.84104662, "epoch": 3.3659491193737767, "grad_norm": 1.796875, "learning_rate": 9.20872913471363e-06, "loss": 0.57019663, "memory(GiB)": 19.42, "step": 860, "train_speed(iter/s)": 0.075157 }, { "acc": 0.84433002, "epoch": 3.385518590998043, "grad_norm": 1.6484375, "learning_rate": 8.654159613843715e-06, "loss": 0.55449514, "memory(GiB)": 19.59, "step": 865, "train_speed(iter/s)": 0.07521 }, { "acc": 0.80005312, "epoch": 3.4050880626223092, "grad_norm": 1.46875, "learning_rate": 8.115710195881068e-06, "loss": 0.73595409, "memory(GiB)": 19.36, "step": 870, "train_speed(iter/s)": 0.075258 }, { "acc": 0.83217945, "epoch": 3.4246575342465753, "grad_norm": 3.328125, "learning_rate": 7.593522371429972e-06, "loss": 0.58270836, "memory(GiB)": 19.58, "step": 875, "train_speed(iter/s)": 0.075306 }, { "acc": 0.82742786, "epoch": 3.4442270058708413, "grad_norm": 1.234375, "learning_rate": 7.0877333579678585e-06, "loss": 0.59052157, "memory(GiB)": 19.6, "step": 880, "train_speed(iter/s)": 0.075358 }, { "acc": 0.81994705, "epoch": 3.4637964774951078, "grad_norm": 1.7578125, "learning_rate": 6.598476063788036e-06, "loss": 0.62256751, "memory(GiB)": 19.56, "step": 885, "train_speed(iter/s)": 0.075405 }, { "acc": 0.8157341, "epoch": 3.483365949119374, "grad_norm": 1.8203125, "learning_rate": 6.12587905307477e-06, "loss": 0.66806622, "memory(GiB)": 19.49, "step": 890, "train_speed(iter/s)": 0.075454 }, { "acc": 0.82838688, "epoch": 3.50293542074364, "grad_norm": 1.515625, "learning_rate": 5.67006651212008e-06, "loss": 0.63044977, "memory(GiB)": 19.54, "step": 895, "train_speed(iter/s)": 0.075497 }, { "acc": 0.79130597, "epoch": 3.5225048923679063, "grad_norm": 1.640625, "learning_rate": 5.2311582166906605e-06, "loss": 0.7558567, "memory(GiB)": 19.28, "step": 900, "train_speed(iter/s)": 0.07555 }, { "epoch": 3.5225048923679063, "eval_acc": 0.6211311861743912, "eval_loss": 1.7854998111724854, "eval_runtime": 69.2434, "eval_samples_per_second": 1.098, "eval_steps_per_second": 0.549, "step": 900 } ], "logging_steps": 5, "max_steps": 1020, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.605539502350213e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }