{ "best_metric": 1.47908163, "best_model_checkpoint": "D:\\_____NEW_NN\\LLM\\MiniCPM-V\\finetune\\output\\phi3-vision-128k-instruct\\v9-20240710-235159\\checkpoint-500", "epoch": 2.8828828828828827, "eval_steps": 50, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc": 0.4856407, "epoch": 0.0036036036036036037, "grad_norm": 0.734375, "learning_rate": 2.4107142857142856e-06, "loss": 2.42667556, "memory(GiB)": 18.11, "step": 1, "train_speed(iter/s)": 0.072451 }, { "acc": 0.50815099, "epoch": 0.018018018018018018, "grad_norm": 0.671875, "learning_rate": 1.2053571428571429e-05, "loss": 2.28746271, "memory(GiB)": 19.3, "step": 5, "train_speed(iter/s)": 0.081978 }, { "acc": 0.50680609, "epoch": 0.036036036036036036, "grad_norm": 0.76953125, "learning_rate": 2.4107142857142858e-05, "loss": 2.29894772, "memory(GiB)": 19.3, "step": 10, "train_speed(iter/s)": 0.084125 }, { "acc": 0.51412601, "epoch": 0.05405405405405406, "grad_norm": 0.76171875, "learning_rate": 3.616071428571428e-05, "loss": 2.34161263, "memory(GiB)": 19.7, "step": 15, "train_speed(iter/s)": 0.08456 }, { "acc": 0.52338777, "epoch": 0.07207207207207207, "grad_norm": 0.6015625, "learning_rate": 4.8214285714285716e-05, "loss": 2.23036633, "memory(GiB)": 19.88, "step": 20, "train_speed(iter/s)": 0.084117 }, { "acc": 0.55944238, "epoch": 0.09009009009009009, "grad_norm": 0.66796875, "learning_rate": 6.026785714285715e-05, "loss": 2.01084595, "memory(GiB)": 19.93, "step": 25, "train_speed(iter/s)": 0.084444 }, { "acc": 0.57758675, "epoch": 0.10810810810810811, "grad_norm": 0.765625, "learning_rate": 7.232142857142856e-05, "loss": 1.94100876, "memory(GiB)": 20.21, "step": 30, "train_speed(iter/s)": 0.085158 }, { "acc": 0.5666451, "epoch": 0.12612612612612611, "grad_norm": 0.796875, "learning_rate": 8.4375e-05, "loss": 1.96992569, "memory(GiB)": 19.42, "step": 35, "train_speed(iter/s)": 0.085562 }, { "acc": 0.55766659, "epoch": 0.14414414414414414, "grad_norm": 0.828125, "learning_rate": 9.642857142857143e-05, "loss": 2.01305885, "memory(GiB)": 19.71, "step": 40, "train_speed(iter/s)": 0.0857 }, { "acc": 0.56964116, "epoch": 0.16216216216216217, "grad_norm": 0.83203125, "learning_rate": 0.00010848214285714286, "loss": 1.925914, "memory(GiB)": 19.68, "step": 45, "train_speed(iter/s)": 0.08577 }, { "acc": 0.56270452, "epoch": 0.18018018018018017, "grad_norm": 0.9375, "learning_rate": 0.0001205357142857143, "loss": 1.94923038, "memory(GiB)": 19.65, "step": 50, "train_speed(iter/s)": 0.085942 }, { "epoch": 0.18018018018018017, "eval_acc": 0.5890983000739098, "eval_loss": 1.795773983001709, "eval_runtime": 136.6505, "eval_samples_per_second": 1.105, "eval_steps_per_second": 0.556, "step": 50 }, { "acc": 0.57772484, "epoch": 0.1981981981981982, "grad_norm": 0.7265625, "learning_rate": 0.00013258928571428571, "loss": 1.86195869, "memory(GiB)": 23.11, "step": 55, "train_speed(iter/s)": 0.070857 }, { "acc": 0.59196444, "epoch": 0.21621621621621623, "grad_norm": 0.8125, "learning_rate": 0.00013499518432841625, "loss": 1.74724998, "memory(GiB)": 19.42, "step": 60, "train_speed(iter/s)": 0.071911 }, { "acc": 0.57253065, "epoch": 0.23423423423423423, "grad_norm": 0.69921875, "learning_rate": 0.00013497562184025362, "loss": 1.87580814, "memory(GiB)": 19.61, "step": 65, "train_speed(iter/s)": 0.072807 }, { "acc": 0.59546819, "epoch": 0.25225225225225223, "grad_norm": 0.73046875, "learning_rate": 0.00013494101591406666, "loss": 1.73464546, "memory(GiB)": 19.58, "step": 70, "train_speed(iter/s)": 0.073652 }, { "acc": 0.59667702, "epoch": 0.2702702702702703, "grad_norm": 0.8203125, "learning_rate": 0.00013489137426511745, "loss": 1.69518318, "memory(GiB)": 18.19, "step": 75, "train_speed(iter/s)": 0.074445 }, { "acc": 0.61824327, "epoch": 0.2882882882882883, "grad_norm": 0.828125, "learning_rate": 0.00013482670796082633, "loss": 1.64374161, "memory(GiB)": 19.52, "step": 80, "train_speed(iter/s)": 0.075071 }, { "acc": 0.60798159, "epoch": 0.3063063063063063, "grad_norm": 0.7734375, "learning_rate": 0.00013474703141830443, "loss": 1.68669338, "memory(GiB)": 19.57, "step": 85, "train_speed(iter/s)": 0.07562 }, { "acc": 0.5981144, "epoch": 0.32432432432432434, "grad_norm": 0.80078125, "learning_rate": 0.00013465236240113953, "loss": 1.701264, "memory(GiB)": 20.19, "step": 90, "train_speed(iter/s)": 0.076188 }, { "acc": 0.59871612, "epoch": 0.34234234234234234, "grad_norm": 1.0234375, "learning_rate": 0.00013454272201543564, "loss": 1.76608849, "memory(GiB)": 19.35, "step": 95, "train_speed(iter/s)": 0.076637 }, { "acc": 0.61396523, "epoch": 0.36036036036036034, "grad_norm": 0.7109375, "learning_rate": 0.00013441813470510747, "loss": 1.61449242, "memory(GiB)": 19.69, "step": 100, "train_speed(iter/s)": 0.077075 }, { "epoch": 0.36036036036036034, "eval_acc": 0.6091648189209165, "eval_loss": 1.6449466943740845, "eval_runtime": 134.5726, "eval_samples_per_second": 1.122, "eval_steps_per_second": 0.565, "step": 100 }, { "acc": 0.61147785, "epoch": 0.3783783783783784, "grad_norm": 0.69921875, "learning_rate": 0.00013427862824643083, "loss": 1.60589867, "memory(GiB)": 21.03, "step": 105, "train_speed(iter/s)": 0.070426 }, { "acc": 0.6038115, "epoch": 0.3963963963963964, "grad_norm": 0.88671875, "learning_rate": 0.00013412423374184996, "loss": 1.69055023, "memory(GiB)": 19.44, "step": 110, "train_speed(iter/s)": 0.07105 }, { "acc": 0.62303677, "epoch": 0.4144144144144144, "grad_norm": 0.84375, "learning_rate": 0.00013395498561304334, "loss": 1.5716897, "memory(GiB)": 19.27, "step": 115, "train_speed(iter/s)": 0.071618 }, { "acc": 0.6214046, "epoch": 0.43243243243243246, "grad_norm": 0.640625, "learning_rate": 0.00013377092159324956, "loss": 1.57531881, "memory(GiB)": 19.36, "step": 120, "train_speed(iter/s)": 0.07209 }, { "acc": 0.58676672, "epoch": 0.45045045045045046, "grad_norm": 0.68359375, "learning_rate": 0.00013357208271885473, "loss": 1.74933128, "memory(GiB)": 19.32, "step": 125, "train_speed(iter/s)": 0.072581 }, { "acc": 0.59380612, "epoch": 0.46846846846846846, "grad_norm": 0.7890625, "learning_rate": 0.00013335851332024374, "loss": 1.69583378, "memory(GiB)": 20.18, "step": 130, "train_speed(iter/s)": 0.073016 }, { "acc": 0.62007999, "epoch": 0.4864864864864865, "grad_norm": 0.73828125, "learning_rate": 0.0001331302610119168, "loss": 1.60020466, "memory(GiB)": 19.52, "step": 135, "train_speed(iter/s)": 0.073417 }, { "acc": 0.6116991, "epoch": 0.5045045045045045, "grad_norm": 1.1015625, "learning_rate": 0.00013288737668187408, "loss": 1.62470894, "memory(GiB)": 19.47, "step": 140, "train_speed(iter/s)": 0.073817 }, { "acc": 0.60051751, "epoch": 0.5225225225225225, "grad_norm": 0.87109375, "learning_rate": 0.00013262991448027034, "loss": 1.6651041, "memory(GiB)": 19.42, "step": 145, "train_speed(iter/s)": 0.074194 }, { "acc": 0.60736594, "epoch": 0.5405405405405406, "grad_norm": 0.76953125, "learning_rate": 0.00013235793180734238, "loss": 1.64281559, "memory(GiB)": 19.53, "step": 150, "train_speed(iter/s)": 0.074547 }, { "epoch": 0.5405405405405406, "eval_acc": 0.6190317812269032, "eval_loss": 1.5917434692382812, "eval_runtime": 135.0141, "eval_samples_per_second": 1.118, "eval_steps_per_second": 0.563, "step": 150 }, { "acc": 0.61663914, "epoch": 0.5585585585585585, "grad_norm": 1.0625, "learning_rate": 0.00013207148930061195, "loss": 1.60914173, "memory(GiB)": 23.05, "step": 155, "train_speed(iter/s)": 0.070306 }, { "acc": 0.60967774, "epoch": 0.5765765765765766, "grad_norm": 0.76953125, "learning_rate": 0.00013177065082136668, "loss": 1.59582939, "memory(GiB)": 19.47, "step": 160, "train_speed(iter/s)": 0.070712 }, { "acc": 0.63630972, "epoch": 0.5945945945945946, "grad_norm": 0.70703125, "learning_rate": 0.00013145548344042262, "loss": 1.50356016, "memory(GiB)": 19.62, "step": 165, "train_speed(iter/s)": 0.071104 }, { "acc": 0.60439692, "epoch": 0.6126126126126126, "grad_norm": 0.73046875, "learning_rate": 0.00013112605742317095, "loss": 1.67050171, "memory(GiB)": 19.41, "step": 170, "train_speed(iter/s)": 0.071478 }, { "acc": 0.62380457, "epoch": 0.6306306306306306, "grad_norm": 0.76171875, "learning_rate": 0.0001307824462139125, "loss": 1.53042831, "memory(GiB)": 19.5, "step": 175, "train_speed(iter/s)": 0.071843 }, { "acc": 0.61549187, "epoch": 0.6486486486486487, "grad_norm": 0.7578125, "learning_rate": 0.00013042472641948386, "loss": 1.59476538, "memory(GiB)": 19.53, "step": 180, "train_speed(iter/s)": 0.072168 }, { "acc": 0.64418182, "epoch": 0.6666666666666666, "grad_norm": 1.1796875, "learning_rate": 0.0001300529777921779, "loss": 1.47999802, "memory(GiB)": 19.32, "step": 185, "train_speed(iter/s)": 0.072501 }, { "acc": 0.62201657, "epoch": 0.6846846846846847, "grad_norm": 0.6484375, "learning_rate": 0.00012966728321196346, "loss": 1.5685544, "memory(GiB)": 19.47, "step": 190, "train_speed(iter/s)": 0.072821 }, { "acc": 0.61418505, "epoch": 0.7027027027027027, "grad_norm": 0.8984375, "learning_rate": 0.00012926772866800757, "loss": 1.6284462, "memory(GiB)": 19.45, "step": 195, "train_speed(iter/s)": 0.073127 }, { "acc": 0.62820964, "epoch": 0.7207207207207207, "grad_norm": 0.8515625, "learning_rate": 0.00012885440323950434, "loss": 1.54364405, "memory(GiB)": 19.53, "step": 200, "train_speed(iter/s)": 0.073413 }, { "epoch": 0.7207207207207207, "eval_acc": 0.6269770879526977, "eval_loss": 1.5466336011886597, "eval_runtime": 134.7868, "eval_samples_per_second": 1.12, "eval_steps_per_second": 0.564, "step": 200 }, { "acc": 0.6605804, "epoch": 0.7387387387387387, "grad_norm": 0.7578125, "learning_rate": 0.00012842739907581525, "loss": 1.42957153, "memory(GiB)": 23.0, "step": 205, "train_speed(iter/s)": 0.070232 }, { "acc": 0.61267309, "epoch": 0.7567567567567568, "grad_norm": 0.90234375, "learning_rate": 0.00012798681137592477, "loss": 1.62853241, "memory(GiB)": 17.96, "step": 210, "train_speed(iter/s)": 0.070571 }, { "acc": 0.63069816, "epoch": 0.7747747747747747, "grad_norm": 0.89453125, "learning_rate": 0.00012753273836721597, "loss": 1.56295233, "memory(GiB)": 19.4, "step": 215, "train_speed(iter/s)": 0.070892 }, { "acc": 0.60362072, "epoch": 0.7927927927927928, "grad_norm": 1.0703125, "learning_rate": 0.00012706528128357127, "loss": 1.63038826, "memory(GiB)": 19.37, "step": 220, "train_speed(iter/s)": 0.071181 }, { "acc": 0.62272639, "epoch": 0.8108108108108109, "grad_norm": 0.8828125, "learning_rate": 0.00012658454434280253, "loss": 1.5756237, "memory(GiB)": 19.62, "step": 225, "train_speed(iter/s)": 0.071466 }, { "acc": 0.59926658, "epoch": 0.8288288288288288, "grad_norm": 0.75390625, "learning_rate": 0.00012609063472341633, "loss": 1.60503426, "memory(GiB)": 19.63, "step": 230, "train_speed(iter/s)": 0.071751 }, { "acc": 0.60133944, "epoch": 0.8468468468468469, "grad_norm": 1.3515625, "learning_rate": 0.0001255836625407187, "loss": 1.64450779, "memory(GiB)": 19.31, "step": 235, "train_speed(iter/s)": 0.072034 }, { "acc": 0.64020758, "epoch": 0.8648648648648649, "grad_norm": 0.9375, "learning_rate": 0.00012506374082226534, "loss": 1.47053967, "memory(GiB)": 18.85, "step": 240, "train_speed(iter/s)": 0.072286 }, { "acc": 0.62713485, "epoch": 0.8828828828828829, "grad_norm": 0.82421875, "learning_rate": 0.00012453098548266276, "loss": 1.51464148, "memory(GiB)": 19.35, "step": 245, "train_speed(iter/s)": 0.07254 }, { "acc": 0.6202302, "epoch": 0.9009009009009009, "grad_norm": 0.625, "learning_rate": 0.0001239855152977253, "loss": 1.54778471, "memory(GiB)": 19.53, "step": 250, "train_speed(iter/s)": 0.072758 }, { "epoch": 0.9009009009009009, "eval_acc": 0.6308573540280857, "eval_loss": 1.510523796081543, "eval_runtime": 134.5445, "eval_samples_per_second": 1.122, "eval_steps_per_second": 0.565, "step": 250 }, { "acc": 0.63671951, "epoch": 0.918918918918919, "grad_norm": 1.7109375, "learning_rate": 0.00012342745187799459, "loss": 1.48321924, "memory(GiB)": 19.53, "step": 255, "train_speed(iter/s)": 0.070273 }, { "acc": 0.63577223, "epoch": 0.9369369369369369, "grad_norm": 0.7890625, "learning_rate": 0.000122856919641627, "loss": 1.50699987, "memory(GiB)": 19.94, "step": 260, "train_speed(iter/s)": 0.070553 }, { "acc": 0.64953299, "epoch": 0.954954954954955, "grad_norm": 0.85546875, "learning_rate": 0.000122274045786655, "loss": 1.46005678, "memory(GiB)": 20.1, "step": 265, "train_speed(iter/s)": 0.070802 }, { "acc": 0.62153759, "epoch": 0.972972972972973, "grad_norm": 1.0625, "learning_rate": 0.00012167896026262893, "loss": 1.55834417, "memory(GiB)": 19.86, "step": 270, "train_speed(iter/s)": 0.071052 }, { "acc": 0.64055209, "epoch": 0.990990990990991, "grad_norm": 1.125, "learning_rate": 0.00012107179574164504, "loss": 1.54932261, "memory(GiB)": 20.06, "step": 275, "train_speed(iter/s)": 0.071274 }, { "acc": 0.62708969, "epoch": 1.009009009009009, "grad_norm": 0.671875, "learning_rate": 0.00012045268758876699, "loss": 1.49731979, "memory(GiB)": 19.82, "step": 280, "train_speed(iter/s)": 0.07152 }, { "acc": 0.6689836, "epoch": 1.027027027027027, "grad_norm": 0.859375, "learning_rate": 0.00011982177383184648, "loss": 1.2817215, "memory(GiB)": 19.85, "step": 285, "train_speed(iter/s)": 0.07175 }, { "acc": 0.67519293, "epoch": 1.045045045045045, "grad_norm": 1.046875, "learning_rate": 0.00011917919513075066, "loss": 1.28632126, "memory(GiB)": 19.98, "step": 290, "train_speed(iter/s)": 0.071951 }, { "acc": 0.67276659, "epoch": 1.063063063063063, "grad_norm": 0.8984375, "learning_rate": 0.00011852509474600237, "loss": 1.27065611, "memory(GiB)": 20.03, "step": 295, "train_speed(iter/s)": 0.072155 }, { "acc": 0.64641519, "epoch": 1.0810810810810811, "grad_norm": 0.98046875, "learning_rate": 0.00011785961850684083, "loss": 1.38271847, "memory(GiB)": 19.09, "step": 300, "train_speed(iter/s)": 0.072371 }, { "epoch": 1.0810810810810811, "eval_acc": 0.6305617147080562, "eval_loss": 1.523685097694397, "eval_runtime": 134.8234, "eval_samples_per_second": 1.12, "eval_steps_per_second": 0.564, "step": 300 }, { "acc": 0.67837138, "epoch": 1.0990990990990992, "grad_norm": 0.953125, "learning_rate": 0.00011718291477870959, "loss": 1.29290819, "memory(GiB)": 22.8, "step": 305, "train_speed(iter/s)": 0.070277 }, { "acc": 0.67195911, "epoch": 1.117117117117117, "grad_norm": 1.796875, "learning_rate": 0.00011649513443017889, "loss": 1.24073734, "memory(GiB)": 19.39, "step": 310, "train_speed(iter/s)": 0.070516 }, { "acc": 0.69478951, "epoch": 1.135135135135135, "grad_norm": 1.203125, "learning_rate": 0.00011579643079931018, "loss": 1.20378675, "memory(GiB)": 19.38, "step": 315, "train_speed(iter/s)": 0.070713 }, { "acc": 0.68726826, "epoch": 1.1531531531531531, "grad_norm": 0.98828125, "learning_rate": 0.00011508695965946992, "loss": 1.23284683, "memory(GiB)": 19.98, "step": 320, "train_speed(iter/s)": 0.070919 }, { "acc": 0.65419765, "epoch": 1.1711711711711712, "grad_norm": 0.93359375, "learning_rate": 0.00011436687918460052, "loss": 1.37520065, "memory(GiB)": 20.02, "step": 325, "train_speed(iter/s)": 0.071117 }, { "acc": 0.66610641, "epoch": 1.1891891891891893, "grad_norm": 0.8671875, "learning_rate": 0.000113636349913956, "loss": 1.30743008, "memory(GiB)": 19.35, "step": 330, "train_speed(iter/s)": 0.071322 }, { "acc": 0.67390976, "epoch": 1.2072072072072073, "grad_norm": 1.6640625, "learning_rate": 0.00011289553471631045, "loss": 1.28322783, "memory(GiB)": 19.49, "step": 335, "train_speed(iter/s)": 0.071518 }, { "acc": 0.68137512, "epoch": 1.2252252252252251, "grad_norm": 0.6953125, "learning_rate": 0.00011214459875364693, "loss": 1.23027716, "memory(GiB)": 19.38, "step": 340, "train_speed(iter/s)": 0.071692 }, { "acc": 0.67859125, "epoch": 1.2432432432432432, "grad_norm": 0.78515625, "learning_rate": 0.00011138370944433531, "loss": 1.22896252, "memory(GiB)": 20.06, "step": 345, "train_speed(iter/s)": 0.071876 }, { "acc": 0.66445112, "epoch": 1.2612612612612613, "grad_norm": 0.90234375, "learning_rate": 0.00011061303642580694, "loss": 1.30674038, "memory(GiB)": 19.49, "step": 350, "train_speed(iter/s)": 0.072045 }, { "epoch": 1.2612612612612613, "eval_acc": 0.6356245380635624, "eval_loss": 1.5072119235992432, "eval_runtime": 134.5232, "eval_samples_per_second": 1.122, "eval_steps_per_second": 0.565, "step": 350 }, { "acc": 0.67729836, "epoch": 1.2792792792792793, "grad_norm": 0.90625, "learning_rate": 0.00010983275151673467, "loss": 1.24173574, "memory(GiB)": 18.93, "step": 355, "train_speed(iter/s)": 0.07029 }, { "acc": 0.7040791, "epoch": 1.2972972972972974, "grad_norm": 0.84765625, "learning_rate": 0.00010904302867872639, "loss": 1.17582674, "memory(GiB)": 19.29, "step": 360, "train_speed(iter/s)": 0.070479 }, { "acc": 0.66356058, "epoch": 1.3153153153153152, "grad_norm": 0.82421875, "learning_rate": 0.00010824404397754104, "loss": 1.26798725, "memory(GiB)": 19.36, "step": 365, "train_speed(iter/s)": 0.070661 }, { "acc": 0.69379635, "epoch": 1.3333333333333333, "grad_norm": 0.98828125, "learning_rate": 0.0001074359755438354, "loss": 1.24331112, "memory(GiB)": 20.16, "step": 370, "train_speed(iter/s)": 0.070843 }, { "acc": 0.68220735, "epoch": 1.3513513513513513, "grad_norm": 0.94140625, "learning_rate": 0.00010661900353345051, "loss": 1.20891714, "memory(GiB)": 19.61, "step": 375, "train_speed(iter/s)": 0.071015 }, { "acc": 0.67620883, "epoch": 1.3693693693693694, "grad_norm": 1.0625, "learning_rate": 0.0001057933100872466, "loss": 1.23957863, "memory(GiB)": 20.17, "step": 380, "train_speed(iter/s)": 0.071181 }, { "acc": 0.63655629, "epoch": 1.3873873873873874, "grad_norm": 0.78515625, "learning_rate": 0.00010495907929049546, "loss": 1.44390507, "memory(GiB)": 19.25, "step": 385, "train_speed(iter/s)": 0.071356 }, { "acc": 0.67883902, "epoch": 1.4054054054054055, "grad_norm": 0.8828125, "learning_rate": 0.00010411649713183925, "loss": 1.29691544, "memory(GiB)": 18.78, "step": 390, "train_speed(iter/s)": 0.071515 }, { "acc": 0.67202511, "epoch": 1.4234234234234235, "grad_norm": 0.953125, "learning_rate": 0.00010326575146182521, "loss": 1.31318274, "memory(GiB)": 19.88, "step": 395, "train_speed(iter/s)": 0.071677 }, { "acc": 0.69274058, "epoch": 1.4414414414414414, "grad_norm": 0.82421875, "learning_rate": 0.00010240703195102489, "loss": 1.15976305, "memory(GiB)": 19.46, "step": 400, "train_speed(iter/s)": 0.071832 }, { "epoch": 1.4414414414414414, "eval_acc": 0.6368440502586844, "eval_loss": 1.4986343383789062, "eval_runtime": 134.3425, "eval_samples_per_second": 1.124, "eval_steps_per_second": 0.566, "step": 400 }, { "acc": 0.71039405, "epoch": 1.4594594594594594, "grad_norm": 0.77734375, "learning_rate": 0.0001015405300477479, "loss": 1.12253609, "memory(GiB)": 19.92, "step": 405, "train_speed(iter/s)": 0.070298 }, { "acc": 0.71356583, "epoch": 1.4774774774774775, "grad_norm": 0.84375, "learning_rate": 0.0001006664389353592, "loss": 1.13753939, "memory(GiB)": 19.31, "step": 410, "train_speed(iter/s)": 0.070457 }, { "acc": 0.675458, "epoch": 1.4954954954954955, "grad_norm": 1.1328125, "learning_rate": 9.978495348920958e-05, "loss": 1.29233532, "memory(GiB)": 19.06, "step": 415, "train_speed(iter/s)": 0.070616 }, { "acc": 0.67761598, "epoch": 1.5135135135135136, "grad_norm": 0.6875, "learning_rate": 9.889627023318897e-05, "loss": 1.22440186, "memory(GiB)": 19.16, "step": 420, "train_speed(iter/s)": 0.070773 }, { "acc": 0.67492404, "epoch": 1.5315315315315314, "grad_norm": 0.81640625, "learning_rate": 9.800058729591212e-05, "loss": 1.22408361, "memory(GiB)": 19.97, "step": 425, "train_speed(iter/s)": 0.070935 }, { "acc": 0.68050842, "epoch": 1.5495495495495497, "grad_norm": 0.84765625, "learning_rate": 9.70981043665466e-05, "loss": 1.2078824, "memory(GiB)": 19.92, "step": 430, "train_speed(iter/s)": 0.07109 }, { "acc": 0.6750885, "epoch": 1.5675675675675675, "grad_norm": 0.66796875, "learning_rate": 9.618902265029284e-05, "loss": 1.28742075, "memory(GiB)": 19.27, "step": 435, "train_speed(iter/s)": 0.071229 }, { "acc": 0.64411507, "epoch": 1.5855855855855856, "grad_norm": 0.95703125, "learning_rate": 9.527354482352616e-05, "loss": 1.37240067, "memory(GiB)": 20.21, "step": 440, "train_speed(iter/s)": 0.071374 }, { "acc": 0.67574663, "epoch": 1.6036036036036037, "grad_norm": 0.83984375, "learning_rate": 9.435187498861085e-05, "loss": 1.27780771, "memory(GiB)": 19.95, "step": 445, "train_speed(iter/s)": 0.071519 }, { "acc": 0.67897987, "epoch": 1.6216216216216215, "grad_norm": 1.2265625, "learning_rate": 9.342421862839632e-05, "loss": 1.26616125, "memory(GiB)": 19.32, "step": 450, "train_speed(iter/s)": 0.071661 }, { "epoch": 1.6216216216216215, "eval_acc": 0.6424611973392461, "eval_loss": 1.4772522449493408, "eval_runtime": 134.5995, "eval_samples_per_second": 1.122, "eval_steps_per_second": 0.565, "step": 450 }, { "acc": 0.66755495, "epoch": 1.6396396396396398, "grad_norm": 1.0390625, "learning_rate": 9.249078256040541e-05, "loss": 1.30118093, "memory(GiB)": 22.82, "step": 455, "train_speed(iter/s)": 0.070312 }, { "acc": 0.66560607, "epoch": 1.6576576576576576, "grad_norm": 1.0546875, "learning_rate": 9.155177489072527e-05, "loss": 1.31042576, "memory(GiB)": 19.56, "step": 460, "train_speed(iter/s)": 0.070454 }, { "acc": 0.67957892, "epoch": 1.6756756756756757, "grad_norm": 1.3828125, "learning_rate": 9.060740496761082e-05, "loss": 1.31165123, "memory(GiB)": 19.38, "step": 465, "train_speed(iter/s)": 0.070592 }, { "acc": 0.6744031, "epoch": 1.6936936936936937, "grad_norm": 1.4140625, "learning_rate": 8.965788333481144e-05, "loss": 1.26758223, "memory(GiB)": 19.42, "step": 470, "train_speed(iter/s)": 0.070726 }, { "acc": 0.66551232, "epoch": 1.7117117117117115, "grad_norm": 0.98046875, "learning_rate": 8.870342168463085e-05, "loss": 1.27216129, "memory(GiB)": 19.27, "step": 475, "train_speed(iter/s)": 0.070864 }, { "acc": 0.65833273, "epoch": 1.7297297297297298, "grad_norm": 0.9140625, "learning_rate": 8.77442328107313e-05, "loss": 1.32684155, "memory(GiB)": 19.48, "step": 480, "train_speed(iter/s)": 0.070997 }, { "acc": 0.68646383, "epoch": 1.7477477477477477, "grad_norm": 1.3671875, "learning_rate": 8.678053056069184e-05, "loss": 1.2200016, "memory(GiB)": 19.24, "step": 485, "train_speed(iter/s)": 0.071136 }, { "acc": 0.69040904, "epoch": 1.7657657657657657, "grad_norm": 1.6171875, "learning_rate": 8.581252978833194e-05, "loss": 1.18706884, "memory(GiB)": 19.53, "step": 490, "train_speed(iter/s)": 0.07127 }, { "acc": 0.66571455, "epoch": 1.7837837837837838, "grad_norm": 0.8515625, "learning_rate": 8.484044630581057e-05, "loss": 1.29456005, "memory(GiB)": 20.09, "step": 495, "train_speed(iter/s)": 0.071401 }, { "acc": 0.67682033, "epoch": 1.8018018018018018, "grad_norm": 1.0, "learning_rate": 8.386449683551164e-05, "loss": 1.20547714, "memory(GiB)": 19.95, "step": 500, "train_speed(iter/s)": 0.071533 }, { "epoch": 1.8018018018018018, "eval_acc": 0.6413155949741316, "eval_loss": 1.479081630706787, "eval_runtime": 134.2299, "eval_samples_per_second": 1.125, "eval_steps_per_second": 0.566, "step": 500 }, { "acc": 0.67326751, "epoch": 1.8198198198198199, "grad_norm": 1.0546875, "learning_rate": 8.288489896172669e-05, "loss": 1.25247726, "memory(GiB)": 20.29, "step": 505, "train_speed(iter/s)": 0.070304 }, { "acc": 0.66375732, "epoch": 1.8378378378378377, "grad_norm": 0.9296875, "learning_rate": 8.190187108214514e-05, "loss": 1.28065901, "memory(GiB)": 20.04, "step": 510, "train_speed(iter/s)": 0.070438 }, { "acc": 0.69006267, "epoch": 1.855855855855856, "grad_norm": 1.0234375, "learning_rate": 8.091563235916343e-05, "loss": 1.13905525, "memory(GiB)": 20.03, "step": 515, "train_speed(iter/s)": 0.070569 }, { "acc": 0.69745221, "epoch": 1.8738738738738738, "grad_norm": 0.96484375, "learning_rate": 7.992640267102351e-05, "loss": 1.14712362, "memory(GiB)": 18.5, "step": 520, "train_speed(iter/s)": 0.070709 }, { "acc": 0.6707756, "epoch": 1.8918918918918919, "grad_norm": 1.328125, "learning_rate": 7.893440256279186e-05, "loss": 1.30717278, "memory(GiB)": 20.66, "step": 525, "train_speed(iter/s)": 0.07083 }, { "acc": 0.66872559, "epoch": 1.90990990990991, "grad_norm": 0.9765625, "learning_rate": 7.793985319718982e-05, "loss": 1.28408003, "memory(GiB)": 19.48, "step": 530, "train_speed(iter/s)": 0.070948 }, { "acc": 0.68111048, "epoch": 1.9279279279279278, "grad_norm": 0.76171875, "learning_rate": 7.694297630528612e-05, "loss": 1.21391411, "memory(GiB)": 19.88, "step": 535, "train_speed(iter/s)": 0.071071 }, { "acc": 0.65094652, "epoch": 1.945945945945946, "grad_norm": 0.83203125, "learning_rate": 7.594399413706277e-05, "loss": 1.34138126, "memory(GiB)": 19.9, "step": 540, "train_speed(iter/s)": 0.071193 }, { "acc": 0.67896776, "epoch": 1.9639639639639639, "grad_norm": 0.796875, "learning_rate": 7.494312941186529e-05, "loss": 1.22575331, "memory(GiB)": 19.43, "step": 545, "train_speed(iter/s)": 0.071302 }, { "acc": 0.6839644, "epoch": 1.981981981981982, "grad_norm": 0.78515625, "learning_rate": 7.394060526874825e-05, "loss": 1.25017443, "memory(GiB)": 19.25, "step": 550, "train_speed(iter/s)": 0.07142 }, { "epoch": 1.981981981981982, "eval_acc": 0.645269770879527, "eval_loss": 1.4606801271438599, "eval_runtime": 134.7756, "eval_samples_per_second": 1.12, "eval_steps_per_second": 0.564, "step": 550 }, { "acc": 0.68771811, "epoch": 2.0, "grad_norm": 0.81640625, "learning_rate": 7.293664521672729e-05, "loss": 1.22415581, "memory(GiB)": 22.67, "step": 555, "train_speed(iter/s)": 0.070304 }, { "acc": 0.741537, "epoch": 2.018018018018018, "grad_norm": 0.6171875, "learning_rate": 7.193147308494851e-05, "loss": 0.95370378, "memory(GiB)": 19.64, "step": 560, "train_speed(iter/s)": 0.070425 }, { "acc": 0.75044699, "epoch": 2.036036036036036, "grad_norm": 1.09375, "learning_rate": 7.09253129727867e-05, "loss": 0.95568914, "memory(GiB)": 19.4, "step": 565, "train_speed(iter/s)": 0.070541 }, { "acc": 0.75126195, "epoch": 2.054054054054054, "grad_norm": 1.3671875, "learning_rate": 6.991838919988322e-05, "loss": 0.92719631, "memory(GiB)": 19.54, "step": 570, "train_speed(iter/s)": 0.070658 }, { "acc": 0.74883032, "epoch": 2.0720720720720722, "grad_norm": 1.0078125, "learning_rate": 6.891092625613469e-05, "loss": 0.92080975, "memory(GiB)": 20.17, "step": 575, "train_speed(iter/s)": 0.07077 }, { "acc": 0.76222944, "epoch": 2.09009009009009, "grad_norm": 0.99609375, "learning_rate": 6.790314875164393e-05, "loss": 0.88407106, "memory(GiB)": 19.57, "step": 580, "train_speed(iter/s)": 0.070882 }, { "acc": 0.76224823, "epoch": 2.108108108108108, "grad_norm": 1.0859375, "learning_rate": 6.689528136664377e-05, "loss": 0.85150976, "memory(GiB)": 19.54, "step": 585, "train_speed(iter/s)": 0.070995 }, { "acc": 0.73958569, "epoch": 2.126126126126126, "grad_norm": 1.3828125, "learning_rate": 6.588754880140573e-05, "loss": 0.92128286, "memory(GiB)": 19.58, "step": 590, "train_speed(iter/s)": 0.071101 }, { "acc": 0.74549003, "epoch": 2.144144144144144, "grad_norm": 1.359375, "learning_rate": 6.488017572614363e-05, "loss": 0.90851021, "memory(GiB)": 18.59, "step": 595, "train_speed(iter/s)": 0.071211 }, { "acc": 0.73912826, "epoch": 2.1621621621621623, "grad_norm": 1.3125, "learning_rate": 6.387338673092443e-05, "loss": 0.92900734, "memory(GiB)": 19.54, "step": 600, "train_speed(iter/s)": 0.071321 }, { "epoch": 2.1621621621621623, "eval_acc": 0.6320768662232077, "eval_loss": 1.5818341970443726, "eval_runtime": 134.4691, "eval_samples_per_second": 1.123, "eval_steps_per_second": 0.565, "step": 600 }, { "acc": 0.75979438, "epoch": 2.18018018018018, "grad_norm": 1.09375, "learning_rate": 6.286740627559656e-05, "loss": 0.89129753, "memory(GiB)": 22.37, "step": 605, "train_speed(iter/s)": 0.070301 }, { "acc": 0.72820721, "epoch": 2.1981981981981984, "grad_norm": 2.15625, "learning_rate": 6.186245863974757e-05, "loss": 0.96495447, "memory(GiB)": 19.6, "step": 610, "train_speed(iter/s)": 0.070413 }, { "acc": 0.75764585, "epoch": 2.2162162162162162, "grad_norm": 1.0078125, "learning_rate": 6.0858767872701715e-05, "loss": 0.89218092, "memory(GiB)": 20.15, "step": 615, "train_speed(iter/s)": 0.070515 }, { "acc": 0.75772595, "epoch": 2.234234234234234, "grad_norm": 1.6328125, "learning_rate": 5.985655774356901e-05, "loss": 0.89191771, "memory(GiB)": 19.46, "step": 620, "train_speed(iter/s)": 0.070627 }, { "acc": 0.7377079, "epoch": 2.2522522522522523, "grad_norm": 1.1875, "learning_rate": 5.8856051691356884e-05, "loss": 0.94241228, "memory(GiB)": 19.35, "step": 625, "train_speed(iter/s)": 0.070733 }, { "acc": 0.77948771, "epoch": 2.27027027027027, "grad_norm": 1.2890625, "learning_rate": 5.785747277515506e-05, "loss": 0.79317036, "memory(GiB)": 20.48, "step": 630, "train_speed(iter/s)": 0.070844 }, { "acc": 0.76766949, "epoch": 2.2882882882882885, "grad_norm": 0.97265625, "learning_rate": 5.686104362440552e-05, "loss": 0.82855272, "memory(GiB)": 20.12, "step": 635, "train_speed(iter/s)": 0.070945 }, { "acc": 0.74998231, "epoch": 2.3063063063063063, "grad_norm": 2.9375, "learning_rate": 5.586698638926811e-05, "loss": 0.93049393, "memory(GiB)": 20.06, "step": 640, "train_speed(iter/s)": 0.071044 }, { "acc": 0.75094385, "epoch": 2.3243243243243246, "grad_norm": 1.1875, "learning_rate": 5.487552269109287e-05, "loss": 0.86875353, "memory(GiB)": 19.33, "step": 645, "train_speed(iter/s)": 0.071146 }, { "acc": 0.74836354, "epoch": 2.3423423423423424, "grad_norm": 1.1328125, "learning_rate": 5.388687357301051e-05, "loss": 0.88861446, "memory(GiB)": 20.11, "step": 650, "train_speed(iter/s)": 0.071249 }, { "epoch": 2.3423423423423424, "eval_acc": 0.630709534368071, "eval_loss": 1.5767972469329834, "eval_runtime": 134.3063, "eval_samples_per_second": 1.124, "eval_steps_per_second": 0.566, "step": 650 }, { "acc": 0.76697993, "epoch": 2.3603603603603602, "grad_norm": 1.2734375, "learning_rate": 5.290125945065162e-05, "loss": 0.85701361, "memory(GiB)": 22.96, "step": 655, "train_speed(iter/s)": 0.070324 }, { "acc": 0.76252317, "epoch": 2.3783783783783785, "grad_norm": 1.0390625, "learning_rate": 5.191890006300573e-05, "loss": 0.85787058, "memory(GiB)": 20.13, "step": 660, "train_speed(iter/s)": 0.070422 }, { "acc": 0.7651772, "epoch": 2.3963963963963963, "grad_norm": 1.1875, "learning_rate": 5.094001442343155e-05, "loss": 0.8521904, "memory(GiB)": 19.86, "step": 665, "train_speed(iter/s)": 0.070523 }, { "acc": 0.73847542, "epoch": 2.4144144144144146, "grad_norm": 1.2734375, "learning_rate": 4.996482077082849e-05, "loss": 0.95858736, "memory(GiB)": 19.29, "step": 670, "train_speed(iter/s)": 0.070628 }, { "acc": 0.74675932, "epoch": 2.4324324324324325, "grad_norm": 1.2734375, "learning_rate": 4.899353652098139e-05, "loss": 0.86487961, "memory(GiB)": 18.64, "step": 675, "train_speed(iter/s)": 0.070727 }, { "acc": 0.73309464, "epoch": 2.4504504504504503, "grad_norm": 1.8671875, "learning_rate": 4.802637821808819e-05, "loss": 0.93775883, "memory(GiB)": 19.78, "step": 680, "train_speed(iter/s)": 0.070825 }, { "acc": 0.76575212, "epoch": 2.4684684684684686, "grad_norm": 1.03125, "learning_rate": 4.706356148648246e-05, "loss": 0.8259285, "memory(GiB)": 19.9, "step": 685, "train_speed(iter/s)": 0.07092 }, { "acc": 0.76865396, "epoch": 2.4864864864864864, "grad_norm": 1.3125, "learning_rate": 4.6105300982560625e-05, "loss": 0.84868517, "memory(GiB)": 19.19, "step": 690, "train_speed(iter/s)": 0.071014 }, { "acc": 0.75694928, "epoch": 2.5045045045045047, "grad_norm": 1.03125, "learning_rate": 4.515181034692515e-05, "loss": 0.87043924, "memory(GiB)": 19.95, "step": 695, "train_speed(iter/s)": 0.071105 }, { "acc": 0.75771561, "epoch": 2.5225225225225225, "grad_norm": 1.3515625, "learning_rate": 4.420330215675415e-05, "loss": 0.86245804, "memory(GiB)": 19.18, "step": 700, "train_speed(iter/s)": 0.071194 }, { "epoch": 2.5225225225225225, "eval_acc": 0.6335181079083518, "eval_loss": 1.5894646644592285, "eval_runtime": 134.225, "eval_samples_per_second": 1.125, "eval_steps_per_second": 0.566, "step": 700 }, { "acc": 0.76191721, "epoch": 2.5405405405405403, "grad_norm": 1.71875, "learning_rate": 4.325998787840818e-05, "loss": 0.85848246, "memory(GiB)": 19.14, "step": 705, "train_speed(iter/s)": 0.070324 }, { "acc": 0.76571012, "epoch": 2.5585585585585586, "grad_norm": 1.15625, "learning_rate": 4.2322077820284477e-05, "loss": 0.85979414, "memory(GiB)": 20.01, "step": 710, "train_speed(iter/s)": 0.070422 }, { "acc": 0.73852654, "epoch": 2.5765765765765765, "grad_norm": 1.6484375, "learning_rate": 4.138978108592962e-05, "loss": 0.90148897, "memory(GiB)": 19.05, "step": 715, "train_speed(iter/s)": 0.070518 }, { "acc": 0.76960816, "epoch": 2.5945945945945947, "grad_norm": 3.71875, "learning_rate": 4.046330552742053e-05, "loss": 0.88053255, "memory(GiB)": 19.25, "step": 720, "train_speed(iter/s)": 0.070616 }, { "acc": 0.77552128, "epoch": 2.6126126126126126, "grad_norm": 0.96484375, "learning_rate": 3.954285769902474e-05, "loss": 0.83608866, "memory(GiB)": 19.96, "step": 725, "train_speed(iter/s)": 0.070707 }, { "acc": 0.76034231, "epoch": 2.6306306306306304, "grad_norm": 1.078125, "learning_rate": 3.8628642811149894e-05, "loss": 0.84258709, "memory(GiB)": 19.75, "step": 730, "train_speed(iter/s)": 0.070796 }, { "acc": 0.73506665, "epoch": 2.6486486486486487, "grad_norm": 2.125, "learning_rate": 3.772086468459271e-05, "loss": 0.96418314, "memory(GiB)": 19.94, "step": 735, "train_speed(iter/s)": 0.070887 }, { "acc": 0.74339218, "epoch": 2.6666666666666665, "grad_norm": 1.3359375, "learning_rate": 3.6819725705098094e-05, "loss": 0.94632616, "memory(GiB)": 19.98, "step": 740, "train_speed(iter/s)": 0.070978 }, { "acc": 0.75258017, "epoch": 2.684684684684685, "grad_norm": 1.328125, "learning_rate": 3.592542677823787e-05, "loss": 0.89630384, "memory(GiB)": 19.9, "step": 745, "train_speed(iter/s)": 0.071065 }, { "acc": 0.7422905, "epoch": 2.7027027027027026, "grad_norm": 1.46875, "learning_rate": 3.503816728461963e-05, "loss": 0.92554636, "memory(GiB)": 19.94, "step": 750, "train_speed(iter/s)": 0.071152 }, { "epoch": 2.7027027027027026, "eval_acc": 0.6360679970436068, "eval_loss": 1.577430248260498, "eval_runtime": 134.0595, "eval_samples_per_second": 1.126, "eval_steps_per_second": 0.567, "step": 750 }, { "acc": 0.76009235, "epoch": 2.7207207207207205, "grad_norm": 1.7265625, "learning_rate": 3.415814503543563e-05, "loss": 0.89433851, "memory(GiB)": 19.38, "step": 755, "train_speed(iter/s)": 0.070345 }, { "acc": 0.75049233, "epoch": 2.7387387387387387, "grad_norm": 1.453125, "learning_rate": 3.3285556228361483e-05, "loss": 0.90194426, "memory(GiB)": 19.78, "step": 760, "train_speed(iter/s)": 0.070432 }, { "acc": 0.73652792, "epoch": 2.756756756756757, "grad_norm": 1.375, "learning_rate": 3.2420595403814615e-05, "loss": 0.94170513, "memory(GiB)": 19.18, "step": 765, "train_speed(iter/s)": 0.070517 }, { "acc": 0.74097948, "epoch": 2.774774774774775, "grad_norm": 1.171875, "learning_rate": 3.156345540158226e-05, "loss": 0.92526283, "memory(GiB)": 19.96, "step": 770, "train_speed(iter/s)": 0.070603 }, { "acc": 0.77357135, "epoch": 2.7927927927927927, "grad_norm": 1.21875, "learning_rate": 3.0714327317828445e-05, "loss": 0.84344234, "memory(GiB)": 19.42, "step": 775, "train_speed(iter/s)": 0.070681 }, { "acc": 0.76570077, "epoch": 2.810810810810811, "grad_norm": 1.4765625, "learning_rate": 2.9873400462489982e-05, "loss": 0.85261898, "memory(GiB)": 19.91, "step": 780, "train_speed(iter/s)": 0.070768 }, { "acc": 0.73979292, "epoch": 2.828828828828829, "grad_norm": 1.375, "learning_rate": 2.904086231707032e-05, "loss": 0.94777365, "memory(GiB)": 19.72, "step": 785, "train_speed(iter/s)": 0.07085 }, { "acc": 0.75035534, "epoch": 2.846846846846847, "grad_norm": 1.1484375, "learning_rate": 2.8216898492841355e-05, "loss": 0.88380022, "memory(GiB)": 19.09, "step": 790, "train_speed(iter/s)": 0.070936 }, { "acc": 0.76033754, "epoch": 2.864864864864865, "grad_norm": 1.078125, "learning_rate": 2.7401692689462153e-05, "loss": 0.84767551, "memory(GiB)": 20.02, "step": 795, "train_speed(iter/s)": 0.071016 }, { "acc": 0.74806399, "epoch": 2.8828828828828827, "grad_norm": 1.53125, "learning_rate": 2.6595426654023643e-05, "loss": 0.92544088, "memory(GiB)": 19.88, "step": 800, "train_speed(iter/s)": 0.0711 }, { "epoch": 2.8828828828828827, "eval_acc": 0.635920177383592, "eval_loss": 1.5869847536087036, "eval_runtime": 134.517, "eval_samples_per_second": 1.123, "eval_steps_per_second": 0.565, "step": 800 } ], "logging_steps": 5, "max_steps": 1108, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.3166381763355443e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }