{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 119380, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "learning_rate": 2.0833333333333333e-05, "loss": 7.7261, "step": 1000 }, { "epoch": 0.17, "learning_rate": 4.1666666666666665e-05, "loss": 5.5648, "step": 2000 }, { "epoch": 0.25, "learning_rate": 6.25e-05, "loss": 5.0119, "step": 3000 }, { "epoch": 0.34, "learning_rate": 8.333333333333333e-05, "loss": 4.6599, "step": 4000 }, { "epoch": 0.42, "learning_rate": 0.00010416666666666667, "loss": 4.3568, "step": 5000 }, { "epoch": 0.5, "learning_rate": 0.000125, "loss": 4.1194, "step": 6000 }, { "epoch": 0.59, "learning_rate": 0.00014583333333333335, "loss": 3.8926, "step": 7000 }, { "epoch": 0.67, "learning_rate": 0.00016666666666666666, "loss": 3.778, "step": 8000 }, { "epoch": 0.75, "learning_rate": 0.0001875, "loss": 3.65, "step": 9000 }, { "epoch": 0.84, "learning_rate": 0.00020833333333333335, "loss": 3.6165, "step": 10000 }, { "epoch": 0.92, "learning_rate": 0.00022916666666666666, "loss": 3.5523, "step": 11000 }, { "epoch": 1.0, "eval_accuracy": 0.3512480392306169, "eval_loss": 3.490408420562744, "eval_runtime": 8.7154, "eval_samples_per_second": 8031.783, "eval_steps_per_second": 15.719, "step": 11938 }, { "epoch": 1.01, "learning_rate": 0.00025, "loss": 3.4876, "step": 12000 }, { "epoch": 1.09, "learning_rate": 0.0002708333333333333, "loss": 3.4466, "step": 13000 }, { "epoch": 1.17, "learning_rate": 0.0002916666666666667, "loss": 3.4306, "step": 14000 }, { "epoch": 1.26, "learning_rate": 0.0003125, "loss": 3.39, "step": 15000 }, { "epoch": 1.34, "learning_rate": 0.0003333333333333333, "loss": 3.3891, "step": 16000 }, { "epoch": 1.42, "learning_rate": 0.0003541666666666667, "loss": 3.367, "step": 17000 }, { "epoch": 1.51, "learning_rate": 0.000375, "loss": 3.3546, "step": 18000 }, { "epoch": 1.59, "learning_rate": 0.0003958333333333333, "loss": 3.3348, "step": 19000 }, { "epoch": 1.68, "learning_rate": 0.0004166666666666667, "loss": 3.3235, "step": 20000 }, { "epoch": 1.76, "learning_rate": 0.0004375, "loss": 3.3207, "step": 21000 }, { "epoch": 1.84, "learning_rate": 0.0004583333333333333, "loss": 3.3028, "step": 22000 }, { "epoch": 1.93, "learning_rate": 0.0004791666666666667, "loss": 3.3151, "step": 23000 }, { "epoch": 2.0, "eval_accuracy": 0.36783764827859605, "eval_loss": 3.319209098815918, "eval_runtime": 8.6438, "eval_samples_per_second": 8098.333, "eval_steps_per_second": 15.85, "step": 23876 }, { "epoch": 2.01, "learning_rate": 0.0005, "loss": 3.2991, "step": 24000 }, { "epoch": 2.09, "learning_rate": 0.0004947578108618159, "loss": 3.3032, "step": 25000 }, { "epoch": 2.18, "learning_rate": 0.0004895156217236318, "loss": 3.2872, "step": 26000 }, { "epoch": 2.26, "learning_rate": 0.0004842734325854477, "loss": 3.2814, "step": 27000 }, { "epoch": 2.35, "learning_rate": 0.0004790312434472636, "loss": 3.2595, "step": 28000 }, { "epoch": 2.43, "learning_rate": 0.0004737890543090795, "loss": 3.2423, "step": 29000 }, { "epoch": 2.51, "learning_rate": 0.0004685468651708954, "loss": 3.2336, "step": 30000 }, { "epoch": 2.6, "learning_rate": 0.00046330467603271125, "loss": 3.2224, "step": 31000 }, { "epoch": 2.68, "learning_rate": 0.00045806248689452716, "loss": 3.1981, "step": 32000 }, { "epoch": 2.76, "learning_rate": 0.000452820297756343, "loss": 3.1808, "step": 33000 }, { "epoch": 2.85, "learning_rate": 0.00044757810861815897, "loss": 3.1593, "step": 34000 }, { "epoch": 2.93, "learning_rate": 0.0004423359194799749, "loss": 3.1526, "step": 35000 }, { "epoch": 3.0, "eval_accuracy": 0.3893979057591623, "eval_loss": 3.154465675354004, "eval_runtime": 8.7169, "eval_samples_per_second": 8030.369, "eval_steps_per_second": 15.717, "step": 35814 }, { "epoch": 3.02, "learning_rate": 0.00043709373034179073, "loss": 3.1466, "step": 36000 }, { "epoch": 3.1, "learning_rate": 0.00043185154120360664, "loss": 3.1035, "step": 37000 }, { "epoch": 3.18, "learning_rate": 0.0004266093520654225, "loss": 3.0893, "step": 38000 }, { "epoch": 3.27, "learning_rate": 0.0004213671629272384, "loss": 3.1148, "step": 39000 }, { "epoch": 3.35, "learning_rate": 0.00041612497378905436, "loss": 3.0801, "step": 40000 }, { "epoch": 3.43, "learning_rate": 0.0004108827846508702, "loss": 3.0814, "step": 41000 }, { "epoch": 3.52, "learning_rate": 0.0004056405955126861, "loss": 3.0665, "step": 42000 }, { "epoch": 3.6, "learning_rate": 0.00040039840637450197, "loss": 3.0718, "step": 43000 }, { "epoch": 3.69, "learning_rate": 0.0003951562172363179, "loss": 3.0462, "step": 44000 }, { "epoch": 3.77, "learning_rate": 0.0003899140280981338, "loss": 3.0328, "step": 45000 }, { "epoch": 3.85, "learning_rate": 0.0003846718389599497, "loss": 3.0601, "step": 46000 }, { "epoch": 3.94, "learning_rate": 0.0003794296498217656, "loss": 3.0031, "step": 47000 }, { "epoch": 4.0, "eval_accuracy": 0.3971048095550437, "eval_loss": 3.0706326961517334, "eval_runtime": 8.6604, "eval_samples_per_second": 8082.758, "eval_steps_per_second": 15.819, "step": 47752 }, { "epoch": 4.02, "learning_rate": 0.00037418746068358145, "loss": 3.0134, "step": 48000 }, { "epoch": 4.1, "learning_rate": 0.00036894527154539736, "loss": 3.0064, "step": 49000 }, { "epoch": 4.19, "learning_rate": 0.00036370308240721327, "loss": 2.9887, "step": 50000 }, { "epoch": 4.27, "learning_rate": 0.0003584608932690291, "loss": 2.9757, "step": 51000 }, { "epoch": 4.36, "learning_rate": 0.0003532187041308451, "loss": 2.9678, "step": 52000 }, { "epoch": 4.44, "learning_rate": 0.00034797651499266093, "loss": 2.964, "step": 53000 }, { "epoch": 4.52, "learning_rate": 0.00034273432585447684, "loss": 2.9642, "step": 54000 }, { "epoch": 4.61, "learning_rate": 0.00033749213671629275, "loss": 2.9797, "step": 55000 }, { "epoch": 4.69, "learning_rate": 0.0003322499475781086, "loss": 2.9534, "step": 56000 }, { "epoch": 4.77, "learning_rate": 0.0003270077584399245, "loss": 2.9476, "step": 57000 }, { "epoch": 4.86, "learning_rate": 0.0003217655693017404, "loss": 2.9431, "step": 58000 }, { "epoch": 4.94, "learning_rate": 0.0003165233801635563, "loss": 2.9668, "step": 59000 }, { "epoch": 5.0, "eval_accuracy": 0.41213198432230425, "eval_loss": 2.9605636596679688, "eval_runtime": 8.7459, "eval_samples_per_second": 8003.788, "eval_steps_per_second": 15.665, "step": 59690 }, { "epoch": 5.03, "learning_rate": 0.00031128119102537223, "loss": 2.9106, "step": 60000 }, { "epoch": 5.11, "learning_rate": 0.0003060390018871881, "loss": 2.9053, "step": 61000 }, { "epoch": 5.19, "learning_rate": 0.000300796812749004, "loss": 2.8983, "step": 62000 }, { "epoch": 5.28, "learning_rate": 0.00029555462361081984, "loss": 2.8828, "step": 63000 }, { "epoch": 5.36, "learning_rate": 0.0002903124344726358, "loss": 2.8971, "step": 64000 }, { "epoch": 5.44, "learning_rate": 0.0002850702453344517, "loss": 2.9074, "step": 65000 }, { "epoch": 5.53, "learning_rate": 0.00027982805619626756, "loss": 2.8792, "step": 66000 }, { "epoch": 5.61, "learning_rate": 0.00027458586705808347, "loss": 2.8902, "step": 67000 }, { "epoch": 5.7, "learning_rate": 0.0002693436779198993, "loss": 2.8666, "step": 68000 }, { "epoch": 5.78, "learning_rate": 0.00026410148878171523, "loss": 2.8868, "step": 69000 }, { "epoch": 5.86, "learning_rate": 0.0002588592996435312, "loss": 2.8509, "step": 70000 }, { "epoch": 5.95, "learning_rate": 0.00025361711050534704, "loss": 2.8548, "step": 71000 }, { "epoch": 6.0, "eval_accuracy": 0.4243559291065229, "eval_loss": 2.8717477321624756, "eval_runtime": 8.7188, "eval_samples_per_second": 8028.594, "eval_steps_per_second": 15.713, "step": 71628 }, { "epoch": 6.03, "learning_rate": 0.00024837492136716295, "loss": 2.8484, "step": 72000 }, { "epoch": 6.11, "learning_rate": 0.00024313273222897883, "loss": 2.8266, "step": 73000 }, { "epoch": 6.2, "learning_rate": 0.0002378905430907947, "loss": 2.8247, "step": 74000 }, { "epoch": 6.28, "learning_rate": 0.00023264835395261061, "loss": 2.8151, "step": 75000 }, { "epoch": 6.37, "learning_rate": 0.0002274061648144265, "loss": 2.8078, "step": 76000 }, { "epoch": 6.45, "learning_rate": 0.0002221639756762424, "loss": 2.8143, "step": 77000 }, { "epoch": 6.53, "learning_rate": 0.0002169217865380583, "loss": 2.8043, "step": 78000 }, { "epoch": 6.62, "learning_rate": 0.0002116795973998742, "loss": 2.8159, "step": 79000 }, { "epoch": 6.7, "learning_rate": 0.00020643740826169007, "loss": 2.7825, "step": 80000 }, { "epoch": 6.79, "learning_rate": 0.00020119521912350598, "loss": 2.812, "step": 81000 }, { "epoch": 6.87, "learning_rate": 0.00019595302998532188, "loss": 2.766, "step": 82000 }, { "epoch": 6.95, "learning_rate": 0.00019071084084713776, "loss": 2.7686, "step": 83000 }, { "epoch": 7.0, "eval_accuracy": 0.4311035312390553, "eval_loss": 2.8147239685058594, "eval_runtime": 8.6579, "eval_samples_per_second": 8085.129, "eval_steps_per_second": 15.824, "step": 83566 }, { "epoch": 7.04, "learning_rate": 0.00018546865170895367, "loss": 2.7648, "step": 84000 }, { "epoch": 7.12, "learning_rate": 0.00018022646257076955, "loss": 2.7609, "step": 85000 }, { "epoch": 7.2, "learning_rate": 0.00017498427343258543, "loss": 2.7306, "step": 86000 }, { "epoch": 7.29, "learning_rate": 0.00016974208429440136, "loss": 2.718, "step": 87000 }, { "epoch": 7.37, "learning_rate": 0.00016449989515621724, "loss": 2.7218, "step": 88000 }, { "epoch": 7.46, "learning_rate": 0.00015925770601803312, "loss": 2.7624, "step": 89000 }, { "epoch": 7.54, "learning_rate": 0.00015401551687984903, "loss": 2.7478, "step": 90000 }, { "epoch": 7.62, "learning_rate": 0.0001487733277416649, "loss": 2.7372, "step": 91000 }, { "epoch": 7.71, "learning_rate": 0.00014353113860348082, "loss": 2.7467, "step": 92000 }, { "epoch": 7.79, "learning_rate": 0.00013828894946529672, "loss": 2.7205, "step": 93000 }, { "epoch": 7.87, "learning_rate": 0.0001330467603271126, "loss": 2.7269, "step": 94000 }, { "epoch": 7.96, "learning_rate": 0.00012780457118892848, "loss": 2.7195, "step": 95000 }, { "epoch": 8.0, "eval_accuracy": 0.4389084267882209, "eval_loss": 2.7656898498535156, "eval_runtime": 8.6162, "eval_samples_per_second": 8124.227, "eval_steps_per_second": 15.9, "step": 95504 }, { "epoch": 8.04, "learning_rate": 0.0001225623820507444, "loss": 2.7004, "step": 96000 }, { "epoch": 8.13, "learning_rate": 0.00011732019291256028, "loss": 2.7043, "step": 97000 }, { "epoch": 8.21, "learning_rate": 0.00011207800377437619, "loss": 2.7322, "step": 98000 }, { "epoch": 8.29, "learning_rate": 0.00010683581463619207, "loss": 2.6874, "step": 99000 }, { "epoch": 8.38, "learning_rate": 0.00010159362549800798, "loss": 2.6771, "step": 100000 }, { "epoch": 8.46, "learning_rate": 9.635143635982387e-05, "loss": 2.6827, "step": 101000 }, { "epoch": 8.54, "learning_rate": 9.110924722163975e-05, "loss": 2.6756, "step": 102000 }, { "epoch": 8.63, "learning_rate": 8.586705808345566e-05, "loss": 2.6575, "step": 103000 }, { "epoch": 8.71, "learning_rate": 8.062486894527155e-05, "loss": 2.669, "step": 104000 }, { "epoch": 8.8, "learning_rate": 7.538267980708744e-05, "loss": 2.6623, "step": 105000 }, { "epoch": 8.88, "learning_rate": 7.014049066890334e-05, "loss": 2.6518, "step": 106000 }, { "epoch": 8.96, "learning_rate": 6.489830153071923e-05, "loss": 2.6659, "step": 107000 }, { "epoch": 9.0, "eval_accuracy": 0.4483279917760174, "eval_loss": 2.710298538208008, "eval_runtime": 8.5942, "eval_samples_per_second": 8145.024, "eval_steps_per_second": 15.941, "step": 107442 }, { "epoch": 9.05, "learning_rate": 5.965611239253512e-05, "loss": 2.6505, "step": 108000 }, { "epoch": 9.13, "learning_rate": 5.441392325435102e-05, "loss": 2.6367, "step": 109000 }, { "epoch": 9.21, "learning_rate": 4.917173411616691e-05, "loss": 2.6232, "step": 110000 }, { "epoch": 9.3, "learning_rate": 4.3929544977982805e-05, "loss": 2.6453, "step": 111000 }, { "epoch": 9.38, "learning_rate": 3.8687355839798705e-05, "loss": 2.6565, "step": 112000 }, { "epoch": 9.47, "learning_rate": 3.344516670161459e-05, "loss": 2.6244, "step": 113000 }, { "epoch": 9.55, "learning_rate": 2.820297756343049e-05, "loss": 2.6429, "step": 114000 }, { "epoch": 9.63, "learning_rate": 2.2960788425246382e-05, "loss": 2.6117, "step": 115000 }, { "epoch": 9.72, "learning_rate": 1.771859928706228e-05, "loss": 2.6289, "step": 116000 }, { "epoch": 9.8, "learning_rate": 1.2476410148878172e-05, "loss": 2.6174, "step": 117000 }, { "epoch": 9.88, "learning_rate": 7.234221010694066e-06, "loss": 2.6142, "step": 118000 }, { "epoch": 9.97, "learning_rate": 1.99203187250996e-06, "loss": 2.6254, "step": 119000 }, { "epoch": 10.0, "eval_accuracy": 0.4510256799878438, "eval_loss": 2.699005126953125, "eval_runtime": 8.5626, "eval_samples_per_second": 8175.05, "eval_steps_per_second": 16.0, "step": 119380 }, { "epoch": 10.0, "step": 119380, "total_flos": 6556448698096128.0, "train_loss": 3.079910436302079, "train_runtime": 7252.3341, "train_samples_per_second": 1053.439, "train_steps_per_second": 16.461 } ], "logging_steps": 1000, "max_steps": 119380, "num_train_epochs": 10, "save_steps": 2000, "total_flos": 6556448698096128.0, "trial_name": null, "trial_params": null }