{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 119380, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "learning_rate": 2.0833333333333333e-05, "loss": 7.7598, "step": 1000 }, { "epoch": 0.17, "learning_rate": 4.1666666666666665e-05, "loss": 5.5454, "step": 2000 }, { "epoch": 0.25, "learning_rate": 6.25e-05, "loss": 4.9947, "step": 3000 }, { "epoch": 0.34, "learning_rate": 8.333333333333333e-05, "loss": 4.6428, "step": 4000 }, { "epoch": 0.42, "learning_rate": 0.00010416666666666667, "loss": 4.3201, "step": 5000 }, { "epoch": 0.5, "learning_rate": 0.000125, "loss": 4.0651, "step": 6000 }, { "epoch": 0.59, "learning_rate": 0.00014583333333333335, "loss": 3.8937, "step": 7000 }, { "epoch": 0.67, "learning_rate": 0.00016666666666666666, "loss": 3.7657, "step": 8000 }, { "epoch": 0.75, "learning_rate": 0.0001875, "loss": 3.6866, "step": 9000 }, { "epoch": 0.84, "learning_rate": 0.00020833333333333335, "loss": 3.5962, "step": 10000 }, { "epoch": 0.92, "learning_rate": 0.00022916666666666666, "loss": 3.5129, "step": 11000 }, { "epoch": 1.0, "eval_accuracy": 0.3523433003120361, "eval_loss": 3.4626712799072266, "eval_runtime": 8.7029, "eval_samples_per_second": 8043.269, "eval_steps_per_second": 15.742, "step": 11938 }, { "epoch": 1.01, "learning_rate": 0.00025, "loss": 3.4983, "step": 12000 }, { "epoch": 1.09, "learning_rate": 0.0002708333333333333, "loss": 3.4416, "step": 13000 }, { "epoch": 1.17, "learning_rate": 0.0002916666666666667, "loss": 3.4233, "step": 14000 }, { "epoch": 1.26, "learning_rate": 0.0003125, "loss": 3.3965, "step": 15000 }, { "epoch": 1.34, "learning_rate": 0.0003333333333333333, "loss": 3.3694, "step": 16000 }, { "epoch": 1.42, "learning_rate": 0.0003541666666666667, "loss": 3.3621, "step": 17000 }, { "epoch": 1.51, "learning_rate": 0.000375, "loss": 3.333, "step": 18000 }, { "epoch": 1.59, "learning_rate": 0.0003958333333333333, "loss": 3.3604, "step": 19000 }, { "epoch": 1.68, "learning_rate": 0.0004166666666666667, "loss": 3.3273, "step": 20000 }, { "epoch": 1.76, "learning_rate": 0.0004375, "loss": 3.3097, "step": 21000 }, { "epoch": 1.84, "learning_rate": 0.0004583333333333333, "loss": 3.3087, "step": 22000 }, { "epoch": 1.93, "learning_rate": 0.0004791666666666667, "loss": 3.319, "step": 23000 }, { "epoch": 2.0, "eval_accuracy": 0.36410225308548383, "eval_loss": 3.3322079181671143, "eval_runtime": 9.4912, "eval_samples_per_second": 7375.242, "eval_steps_per_second": 14.434, "step": 23876 }, { "epoch": 2.01, "learning_rate": 0.0005, "loss": 3.3119, "step": 24000 }, { "epoch": 2.09, "learning_rate": 0.0004947578108618159, "loss": 3.304, "step": 25000 }, { "epoch": 2.18, "learning_rate": 0.0004895156217236318, "loss": 3.2935, "step": 26000 }, { "epoch": 2.26, "learning_rate": 0.0004842734325854477, "loss": 3.2888, "step": 27000 }, { "epoch": 2.35, "learning_rate": 0.0004790312434472636, "loss": 3.2313, "step": 28000 }, { "epoch": 2.43, "learning_rate": 0.0004737890543090795, "loss": 3.2641, "step": 29000 }, { "epoch": 2.51, "learning_rate": 0.0004685468651708954, "loss": 3.2296, "step": 30000 }, { "epoch": 2.6, "learning_rate": 0.00046330467603271125, "loss": 3.2117, "step": 31000 }, { "epoch": 2.68, "learning_rate": 0.00045806248689452716, "loss": 3.1985, "step": 32000 }, { "epoch": 2.76, "learning_rate": 0.000452820297756343, "loss": 3.2068, "step": 33000 }, { "epoch": 2.85, "learning_rate": 0.00044757810861815897, "loss": 3.1658, "step": 34000 }, { "epoch": 2.93, "learning_rate": 0.0004423359194799749, "loss": 3.1577, "step": 35000 }, { "epoch": 3.0, "eval_accuracy": 0.38102238455359716, "eval_loss": 3.184136390686035, "eval_runtime": 8.8582, "eval_samples_per_second": 7902.295, "eval_steps_per_second": 15.466, "step": 35814 }, { "epoch": 3.02, "learning_rate": 0.00043709373034179073, "loss": 3.1581, "step": 36000 }, { "epoch": 3.1, "learning_rate": 0.00043185154120360664, "loss": 3.1229, "step": 37000 }, { "epoch": 3.18, "learning_rate": 0.0004266093520654225, "loss": 3.1043, "step": 38000 }, { "epoch": 3.27, "learning_rate": 0.0004213671629272384, "loss": 3.0881, "step": 39000 }, { "epoch": 3.35, "learning_rate": 0.00041612497378905436, "loss": 3.095, "step": 40000 }, { "epoch": 3.43, "learning_rate": 0.0004108827846508702, "loss": 3.0888, "step": 41000 }, { "epoch": 3.52, "learning_rate": 0.0004056405955126861, "loss": 3.0583, "step": 42000 }, { "epoch": 3.6, "learning_rate": 0.00040039840637450197, "loss": 3.0716, "step": 43000 }, { "epoch": 3.69, "learning_rate": 0.0003951562172363179, "loss": 3.046, "step": 44000 }, { "epoch": 3.77, "learning_rate": 0.0003899140280981338, "loss": 3.0566, "step": 45000 }, { "epoch": 3.85, "learning_rate": 0.0003846718389599497, "loss": 3.0571, "step": 46000 }, { "epoch": 3.94, "learning_rate": 0.0003794296498217656, "loss": 3.0357, "step": 47000 }, { "epoch": 4.0, "eval_accuracy": 0.3981597974867745, "eval_loss": 3.058806896209717, "eval_runtime": 9.478, "eval_samples_per_second": 7385.506, "eval_steps_per_second": 14.454, "step": 47752 }, { "epoch": 4.02, "learning_rate": 0.00037418746068358145, "loss": 3.0134, "step": 48000 }, { "epoch": 4.1, "learning_rate": 0.00036894527154539736, "loss": 2.9809, "step": 49000 }, { "epoch": 4.19, "learning_rate": 0.00036370308240721327, "loss": 2.9708, "step": 50000 }, { "epoch": 4.27, "learning_rate": 0.0003584608932690291, "loss": 2.9975, "step": 51000 }, { "epoch": 4.36, "learning_rate": 0.0003532187041308451, "loss": 2.9888, "step": 52000 }, { "epoch": 4.44, "learning_rate": 0.00034797651499266093, "loss": 2.9647, "step": 53000 }, { "epoch": 4.52, "learning_rate": 0.00034273432585447684, "loss": 2.9853, "step": 54000 }, { "epoch": 4.61, "learning_rate": 0.00033749213671629275, "loss": 2.9783, "step": 55000 }, { "epoch": 4.69, "learning_rate": 0.0003322499475781086, "loss": 2.9553, "step": 56000 }, { "epoch": 4.77, "learning_rate": 0.0003270077584399245, "loss": 2.9312, "step": 57000 }, { "epoch": 4.86, "learning_rate": 0.0003217655693017404, "loss": 2.9468, "step": 58000 }, { "epoch": 4.94, "learning_rate": 0.0003165233801635563, "loss": 2.9606, "step": 59000 }, { "epoch": 5.0, "eval_accuracy": 0.41093517308452115, "eval_loss": 2.9534778594970703, "eval_runtime": 8.7667, "eval_samples_per_second": 7984.805, "eval_steps_per_second": 15.627, "step": 59690 }, { "epoch": 5.03, "learning_rate": 0.00031128119102537223, "loss": 2.9322, "step": 60000 }, { "epoch": 5.11, "learning_rate": 0.0003060390018871881, "loss": 2.9048, "step": 61000 }, { "epoch": 5.19, "learning_rate": 0.000300796812749004, "loss": 2.8633, "step": 62000 }, { "epoch": 5.28, "learning_rate": 0.00029555462361081984, "loss": 2.914, "step": 63000 }, { "epoch": 5.36, "learning_rate": 0.0002903124344726358, "loss": 2.881, "step": 64000 }, { "epoch": 5.44, "learning_rate": 0.0002850702453344517, "loss": 2.8721, "step": 65000 }, { "epoch": 5.53, "learning_rate": 0.00027982805619626756, "loss": 2.886, "step": 66000 }, { "epoch": 5.61, "learning_rate": 0.00027458586705808347, "loss": 2.8801, "step": 67000 }, { "epoch": 5.7, "learning_rate": 0.0002693436779198993, "loss": 2.87, "step": 68000 }, { "epoch": 5.78, "learning_rate": 0.00026410148878171523, "loss": 2.8705, "step": 69000 }, { "epoch": 5.86, "learning_rate": 0.0002588592996435312, "loss": 2.8638, "step": 70000 }, { "epoch": 5.95, "learning_rate": 0.00025361711050534704, "loss": 2.87, "step": 71000 }, { "epoch": 6.0, "eval_accuracy": 0.42207386406938285, "eval_loss": 2.8744776248931885, "eval_runtime": 8.7882, "eval_samples_per_second": 7965.243, "eval_steps_per_second": 15.589, "step": 71628 }, { "epoch": 6.03, "learning_rate": 0.00024837492136716295, "loss": 2.8343, "step": 72000 }, { "epoch": 6.11, "learning_rate": 0.00024313273222897883, "loss": 2.8347, "step": 73000 }, { "epoch": 6.2, "learning_rate": 0.0002378905430907947, "loss": 2.8269, "step": 74000 }, { "epoch": 6.28, "learning_rate": 0.00023264835395261061, "loss": 2.8174, "step": 75000 }, { "epoch": 6.37, "learning_rate": 0.0002274061648144265, "loss": 2.8174, "step": 76000 }, { "epoch": 6.45, "learning_rate": 0.0002221639756762424, "loss": 2.8216, "step": 77000 }, { "epoch": 6.53, "learning_rate": 0.0002169217865380583, "loss": 2.805, "step": 78000 }, { "epoch": 6.62, "learning_rate": 0.0002116795973998742, "loss": 2.8225, "step": 79000 }, { "epoch": 6.7, "learning_rate": 0.00020643740826169007, "loss": 2.8059, "step": 80000 }, { "epoch": 6.79, "learning_rate": 0.00020119521912350598, "loss": 2.8046, "step": 81000 }, { "epoch": 6.87, "learning_rate": 0.00019595302998532188, "loss": 2.7754, "step": 82000 }, { "epoch": 6.95, "learning_rate": 0.00019071084084713776, "loss": 2.7817, "step": 83000 }, { "epoch": 7.0, "eval_accuracy": 0.4283667840107286, "eval_loss": 2.8350658416748047, "eval_runtime": 8.7803, "eval_samples_per_second": 7972.401, "eval_steps_per_second": 15.603, "step": 83566 }, { "epoch": 7.04, "learning_rate": 0.00018546865170895367, "loss": 2.7572, "step": 84000 }, { "epoch": 7.12, "learning_rate": 0.00018022646257076955, "loss": 2.7627, "step": 85000 }, { "epoch": 7.2, "learning_rate": 0.00017498427343258543, "loss": 2.7528, "step": 86000 }, { "epoch": 7.29, "learning_rate": 0.00016974208429440136, "loss": 2.7468, "step": 87000 }, { "epoch": 7.37, "learning_rate": 0.00016449989515621724, "loss": 2.7556, "step": 88000 }, { "epoch": 7.46, "learning_rate": 0.00015925770601803312, "loss": 2.7687, "step": 89000 }, { "epoch": 7.54, "learning_rate": 0.00015401551687984903, "loss": 2.7265, "step": 90000 }, { "epoch": 7.62, "learning_rate": 0.0001487733277416649, "loss": 2.7375, "step": 91000 }, { "epoch": 7.71, "learning_rate": 0.00014353113860348082, "loss": 2.7091, "step": 92000 }, { "epoch": 7.79, "learning_rate": 0.00013828894946529672, "loss": 2.7401, "step": 93000 }, { "epoch": 7.87, "learning_rate": 0.0001330467603271126, "loss": 2.7341, "step": 94000 }, { "epoch": 7.96, "learning_rate": 0.00012780457118892848, "loss": 2.7388, "step": 95000 }, { "epoch": 8.0, "eval_accuracy": 0.44167649816982785, "eval_loss": 2.7535808086395264, "eval_runtime": 8.9073, "eval_samples_per_second": 7858.746, "eval_steps_per_second": 15.381, "step": 95504 }, { "epoch": 8.04, "learning_rate": 0.0001225623820507444, "loss": 2.7064, "step": 96000 }, { "epoch": 8.13, "learning_rate": 0.00011732019291256028, "loss": 2.6997, "step": 97000 }, { "epoch": 8.21, "learning_rate": 0.00011207800377437619, "loss": 2.6857, "step": 98000 }, { "epoch": 8.29, "learning_rate": 0.00010683581463619207, "loss": 2.6924, "step": 99000 }, { "epoch": 8.38, "learning_rate": 0.00010159362549800798, "loss": 2.6781, "step": 100000 }, { "epoch": 8.46, "learning_rate": 9.635143635982387e-05, "loss": 2.6773, "step": 101000 }, { "epoch": 8.54, "learning_rate": 9.110924722163975e-05, "loss": 2.6722, "step": 102000 }, { "epoch": 8.63, "learning_rate": 8.586705808345566e-05, "loss": 2.6965, "step": 103000 }, { "epoch": 8.71, "learning_rate": 8.062486894527155e-05, "loss": 2.6532, "step": 104000 }, { "epoch": 8.8, "learning_rate": 7.538267980708744e-05, "loss": 2.6804, "step": 105000 }, { "epoch": 8.88, "learning_rate": 7.014049066890334e-05, "loss": 2.6515, "step": 106000 }, { "epoch": 8.96, "learning_rate": 6.489830153071923e-05, "loss": 2.6618, "step": 107000 }, { "epoch": 9.0, "eval_accuracy": 0.4423983455491013, "eval_loss": 2.7308034896850586, "eval_runtime": 8.6936, "eval_samples_per_second": 8051.873, "eval_steps_per_second": 15.759, "step": 107442 }, { "epoch": 9.05, "learning_rate": 5.965611239253512e-05, "loss": 2.6375, "step": 108000 }, { "epoch": 9.13, "learning_rate": 5.441392325435102e-05, "loss": 2.6582, "step": 109000 }, { "epoch": 9.21, "learning_rate": 4.917173411616691e-05, "loss": 2.6559, "step": 110000 }, { "epoch": 9.3, "learning_rate": 4.3929544977982805e-05, "loss": 2.6294, "step": 111000 }, { "epoch": 9.38, "learning_rate": 3.8687355839798705e-05, "loss": 2.6339, "step": 112000 }, { "epoch": 9.47, "learning_rate": 3.344516670161459e-05, "loss": 2.633, "step": 113000 }, { "epoch": 9.55, "learning_rate": 2.820297756343049e-05, "loss": 2.6274, "step": 114000 }, { "epoch": 9.63, "learning_rate": 2.2960788425246382e-05, "loss": 2.6228, "step": 115000 }, { "epoch": 9.72, "learning_rate": 1.771859928706228e-05, "loss": 2.6251, "step": 116000 }, { "epoch": 9.8, "learning_rate": 1.2476410148878172e-05, "loss": 2.6179, "step": 117000 }, { "epoch": 9.88, "learning_rate": 7.234221010694066e-06, "loss": 2.6305, "step": 118000 }, { "epoch": 9.97, "learning_rate": 1.99203187250996e-06, "loss": 2.6258, "step": 119000 }, { "epoch": 10.0, "eval_accuracy": 0.45217033845404303, "eval_loss": 2.688030242919922, "eval_runtime": 8.8102, "eval_samples_per_second": 7945.334, "eval_steps_per_second": 15.55, "step": 119380 }, { "epoch": 10.0, "step": 119380, "total_flos": 6558690447189504.0, "train_loss": 3.080881611061448, "train_runtime": 7359.6812, "train_samples_per_second": 1038.074, "train_steps_per_second": 16.221 } ], "logging_steps": 1000, "max_steps": 119380, "num_train_epochs": 10, "save_steps": 2000, "total_flos": 6558690447189504.0, "trial_name": null, "trial_params": null }