{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 119380, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "learning_rate": 2.0833333333333333e-05, "loss": 7.6872, "step": 1000 }, { "epoch": 0.17, "learning_rate": 4.1666666666666665e-05, "loss": 5.503, "step": 2000 }, { "epoch": 0.25, "learning_rate": 6.25e-05, "loss": 4.9934, "step": 3000 }, { "epoch": 0.34, "learning_rate": 8.333333333333333e-05, "loss": 4.6664, "step": 4000 }, { "epoch": 0.42, "learning_rate": 0.00010416666666666667, "loss": 4.3638, "step": 5000 }, { "epoch": 0.5, "learning_rate": 0.000125, "loss": 4.1058, "step": 6000 }, { "epoch": 0.59, "learning_rate": 0.00014583333333333335, "loss": 3.8999, "step": 7000 }, { "epoch": 0.67, "learning_rate": 0.00016666666666666666, "loss": 3.7697, "step": 8000 }, { "epoch": 0.75, "learning_rate": 0.0001875, "loss": 3.6506, "step": 9000 }, { "epoch": 0.84, "learning_rate": 0.00020833333333333335, "loss": 3.5937, "step": 10000 }, { "epoch": 0.92, "learning_rate": 0.00022916666666666666, "loss": 3.5334, "step": 11000 }, { "epoch": 1.0, "eval_accuracy": 0.3463196185119521, "eval_loss": 3.4895663261413574, "eval_runtime": 8.5644, "eval_samples_per_second": 8173.325, "eval_steps_per_second": 15.996, "step": 11938 }, { "epoch": 1.01, "learning_rate": 0.00025, "loss": 3.504, "step": 12000 }, { "epoch": 1.09, "learning_rate": 0.0002708333333333333, "loss": 3.4409, "step": 13000 }, { "epoch": 1.17, "learning_rate": 0.0002916666666666667, "loss": 3.3981, "step": 14000 }, { "epoch": 1.26, "learning_rate": 0.0003125, "loss": 3.3919, "step": 15000 }, { "epoch": 1.34, "learning_rate": 0.0003333333333333333, "loss": 3.3718, "step": 16000 }, { "epoch": 1.42, "learning_rate": 0.0003541666666666667, "loss": 3.3781, "step": 17000 }, { "epoch": 1.51, "learning_rate": 0.000375, "loss": 3.3678, "step": 18000 }, { "epoch": 1.59, "learning_rate": 0.0003958333333333333, "loss": 3.3523, "step": 19000 }, { "epoch": 1.68, "learning_rate": 0.0004166666666666667, "loss": 3.3197, "step": 20000 }, { "epoch": 1.76, "learning_rate": 0.0004375, "loss": 3.3632, "step": 21000 }, { "epoch": 1.84, "learning_rate": 0.0004583333333333333, "loss": 3.3426, "step": 22000 }, { "epoch": 1.93, "learning_rate": 0.0004791666666666667, "loss": 3.3402, "step": 23000 }, { "epoch": 2.0, "eval_accuracy": 0.35903201627583253, "eval_loss": 3.3813889026641846, "eval_runtime": 8.6325, "eval_samples_per_second": 8108.854, "eval_steps_per_second": 15.87, "step": 23876 }, { "epoch": 2.01, "learning_rate": 0.0005, "loss": 3.3366, "step": 24000 }, { "epoch": 2.09, "learning_rate": 0.0004947578108618159, "loss": 3.3048, "step": 25000 }, { "epoch": 2.18, "learning_rate": 0.0004895156217236318, "loss": 3.2956, "step": 26000 }, { "epoch": 2.26, "learning_rate": 0.0004842734325854477, "loss": 3.2634, "step": 27000 }, { "epoch": 2.35, "learning_rate": 0.0004790312434472636, "loss": 3.2722, "step": 28000 }, { "epoch": 2.43, "learning_rate": 0.0004737890543090795, "loss": 3.2701, "step": 29000 }, { "epoch": 2.51, "learning_rate": 0.0004685468651708954, "loss": 3.2337, "step": 30000 }, { "epoch": 2.6, "learning_rate": 0.00046330467603271125, "loss": 3.2225, "step": 31000 }, { "epoch": 2.68, "learning_rate": 0.00045806248689452716, "loss": 3.2134, "step": 32000 }, { "epoch": 2.76, "learning_rate": 0.000452820297756343, "loss": 3.1857, "step": 33000 }, { "epoch": 2.85, "learning_rate": 0.00044757810861815897, "loss": 3.1779, "step": 34000 }, { "epoch": 2.93, "learning_rate": 0.0004423359194799749, "loss": 3.1641, "step": 35000 }, { "epoch": 3.0, "eval_accuracy": 0.3844471778051114, "eval_loss": 3.1702160835266113, "eval_runtime": 8.6474, "eval_samples_per_second": 8094.894, "eval_steps_per_second": 15.843, "step": 35814 }, { "epoch": 3.02, "learning_rate": 0.00043709373034179073, "loss": 3.1689, "step": 36000 }, { "epoch": 3.1, "learning_rate": 0.00043185154120360664, "loss": 3.1411, "step": 37000 }, { "epoch": 3.18, "learning_rate": 0.0004266093520654225, "loss": 3.1109, "step": 38000 }, { "epoch": 3.27, "learning_rate": 0.0004213671629272384, "loss": 3.1049, "step": 39000 }, { "epoch": 3.35, "learning_rate": 0.00041612497378905436, "loss": 3.121, "step": 40000 }, { "epoch": 3.43, "learning_rate": 0.0004108827846508702, "loss": 3.0791, "step": 41000 }, { "epoch": 3.52, "learning_rate": 0.0004056405955126861, "loss": 3.0776, "step": 42000 }, { "epoch": 3.6, "learning_rate": 0.00040039840637450197, "loss": 3.0606, "step": 43000 }, { "epoch": 3.69, "learning_rate": 0.0003951562172363179, "loss": 3.0772, "step": 44000 }, { "epoch": 3.77, "learning_rate": 0.0003899140280981338, "loss": 3.0507, "step": 45000 }, { "epoch": 3.85, "learning_rate": 0.0003846718389599497, "loss": 3.0437, "step": 46000 }, { "epoch": 3.94, "learning_rate": 0.0003794296498217656, "loss": 3.0325, "step": 47000 }, { "epoch": 4.0, "eval_accuracy": 0.40186195535538866, "eval_loss": 3.047513961791992, "eval_runtime": 8.6099, "eval_samples_per_second": 8130.172, "eval_steps_per_second": 15.912, "step": 47752 }, { "epoch": 4.02, "learning_rate": 0.00037418746068358145, "loss": 3.028, "step": 48000 }, { "epoch": 4.1, "learning_rate": 0.00036894527154539736, "loss": 3.0231, "step": 49000 }, { "epoch": 4.19, "learning_rate": 0.00036370308240721327, "loss": 2.9995, "step": 50000 }, { "epoch": 4.27, "learning_rate": 0.0003584608932690291, "loss": 2.9881, "step": 51000 }, { "epoch": 4.36, "learning_rate": 0.0003532187041308451, "loss": 2.9755, "step": 52000 }, { "epoch": 4.44, "learning_rate": 0.00034797651499266093, "loss": 2.9888, "step": 53000 }, { "epoch": 4.52, "learning_rate": 0.00034273432585447684, "loss": 2.9792, "step": 54000 }, { "epoch": 4.61, "learning_rate": 0.00033749213671629275, "loss": 2.9666, "step": 55000 }, { "epoch": 4.69, "learning_rate": 0.0003322499475781086, "loss": 2.9836, "step": 56000 }, { "epoch": 4.77, "learning_rate": 0.0003270077584399245, "loss": 2.9389, "step": 57000 }, { "epoch": 4.86, "learning_rate": 0.0003217655693017404, "loss": 2.944, "step": 58000 }, { "epoch": 4.94, "learning_rate": 0.0003165233801635563, "loss": 2.951, "step": 59000 }, { "epoch": 5.0, "eval_accuracy": 0.4095356186962242, "eval_loss": 2.966625452041626, "eval_runtime": 8.6323, "eval_samples_per_second": 8109.048, "eval_steps_per_second": 15.871, "step": 59690 }, { "epoch": 5.03, "learning_rate": 0.00031128119102537223, "loss": 2.9252, "step": 60000 }, { "epoch": 5.11, "learning_rate": 0.0003060390018871881, "loss": 2.915, "step": 61000 }, { "epoch": 5.19, "learning_rate": 0.000300796812749004, "loss": 2.9111, "step": 62000 }, { "epoch": 5.28, "learning_rate": 0.00029555462361081984, "loss": 2.9048, "step": 63000 }, { "epoch": 5.36, "learning_rate": 0.0002903124344726358, "loss": 2.8744, "step": 64000 }, { "epoch": 5.44, "learning_rate": 0.0002850702453344517, "loss": 2.8936, "step": 65000 }, { "epoch": 5.53, "learning_rate": 0.00027982805619626756, "loss": 2.9017, "step": 66000 }, { "epoch": 5.61, "learning_rate": 0.00027458586705808347, "loss": 2.8971, "step": 67000 }, { "epoch": 5.7, "learning_rate": 0.0002693436779198993, "loss": 2.8617, "step": 68000 }, { "epoch": 5.78, "learning_rate": 0.00026410148878171523, "loss": 2.8399, "step": 69000 }, { "epoch": 5.86, "learning_rate": 0.0002588592996435312, "loss": 2.868, "step": 70000 }, { "epoch": 5.95, "learning_rate": 0.00025361711050534704, "loss": 2.8583, "step": 71000 }, { "epoch": 6.0, "eval_accuracy": 0.4200933305051362, "eval_loss": 2.8908023834228516, "eval_runtime": 8.6923, "eval_samples_per_second": 8053.075, "eval_steps_per_second": 15.761, "step": 71628 }, { "epoch": 6.03, "learning_rate": 0.00024837492136716295, "loss": 2.8551, "step": 72000 }, { "epoch": 6.11, "learning_rate": 0.00024313273222897883, "loss": 2.8157, "step": 73000 }, { "epoch": 6.2, "learning_rate": 0.0002378905430907947, "loss": 2.8135, "step": 74000 }, { "epoch": 6.28, "learning_rate": 0.00023264835395261061, "loss": 2.8351, "step": 75000 }, { "epoch": 6.37, "learning_rate": 0.0002274061648144265, "loss": 2.8116, "step": 76000 }, { "epoch": 6.45, "learning_rate": 0.0002221639756762424, "loss": 2.8285, "step": 77000 }, { "epoch": 6.53, "learning_rate": 0.0002169217865380583, "loss": 2.8201, "step": 78000 }, { "epoch": 6.62, "learning_rate": 0.0002116795973998742, "loss": 2.7951, "step": 79000 }, { "epoch": 6.7, "learning_rate": 0.00020643740826169007, "loss": 2.8164, "step": 80000 }, { "epoch": 6.79, "learning_rate": 0.00020119521912350598, "loss": 2.8162, "step": 81000 }, { "epoch": 6.87, "learning_rate": 0.00019595302998532188, "loss": 2.8221, "step": 82000 }, { "epoch": 6.95, "learning_rate": 0.00019071084084713776, "loss": 2.7872, "step": 83000 }, { "epoch": 7.0, "eval_accuracy": 0.43096100087072087, "eval_loss": 2.8298897743225098, "eval_runtime": 8.7511, "eval_samples_per_second": 7998.992, "eval_steps_per_second": 15.655, "step": 83566 }, { "epoch": 7.04, "learning_rate": 0.00018546865170895367, "loss": 2.7705, "step": 84000 }, { "epoch": 7.12, "learning_rate": 0.00018022646257076955, "loss": 2.7635, "step": 85000 }, { "epoch": 7.2, "learning_rate": 0.00017498427343258543, "loss": 2.7752, "step": 86000 }, { "epoch": 7.29, "learning_rate": 0.00016974208429440136, "loss": 2.774, "step": 87000 }, { "epoch": 7.37, "learning_rate": 0.00016449989515621724, "loss": 2.7576, "step": 88000 }, { "epoch": 7.46, "learning_rate": 0.00015925770601803312, "loss": 2.7551, "step": 89000 }, { "epoch": 7.54, "learning_rate": 0.00015401551687984903, "loss": 2.7467, "step": 90000 }, { "epoch": 7.62, "learning_rate": 0.0001487733277416649, "loss": 2.7414, "step": 91000 }, { "epoch": 7.71, "learning_rate": 0.00014353113860348082, "loss": 2.7554, "step": 92000 }, { "epoch": 7.79, "learning_rate": 0.00013828894946529672, "loss": 2.7518, "step": 93000 }, { "epoch": 7.87, "learning_rate": 0.0001330467603271126, "loss": 2.7183, "step": 94000 }, { "epoch": 7.96, "learning_rate": 0.00012780457118892848, "loss": 2.7348, "step": 95000 }, { "epoch": 8.0, "eval_accuracy": 0.4334868640344232, "eval_loss": 2.789966106414795, "eval_runtime": 8.5842, "eval_samples_per_second": 8154.547, "eval_steps_per_second": 15.96, "step": 95504 }, { "epoch": 8.04, "learning_rate": 0.0001225623820507444, "loss": 2.7162, "step": 96000 }, { "epoch": 8.13, "learning_rate": 0.00011732019291256028, "loss": 2.706, "step": 97000 }, { "epoch": 8.21, "learning_rate": 0.00011207800377437619, "loss": 2.6916, "step": 98000 }, { "epoch": 8.29, "learning_rate": 0.00010683581463619207, "loss": 2.6985, "step": 99000 }, { "epoch": 8.38, "learning_rate": 0.00010159362549800798, "loss": 2.6958, "step": 100000 }, { "epoch": 8.46, "learning_rate": 9.635143635982387e-05, "loss": 2.6912, "step": 101000 }, { "epoch": 8.54, "learning_rate": 9.110924722163975e-05, "loss": 2.7134, "step": 102000 }, { "epoch": 8.63, "learning_rate": 8.586705808345566e-05, "loss": 2.6608, "step": 103000 }, { "epoch": 8.71, "learning_rate": 8.062486894527155e-05, "loss": 2.674, "step": 104000 }, { "epoch": 8.8, "learning_rate": 7.538267980708744e-05, "loss": 2.6678, "step": 105000 }, { "epoch": 8.88, "learning_rate": 7.014049066890334e-05, "loss": 2.6799, "step": 106000 }, { "epoch": 8.96, "learning_rate": 6.489830153071923e-05, "loss": 2.6584, "step": 107000 }, { "epoch": 9.0, "eval_accuracy": 0.4442780062057686, "eval_loss": 2.7272143363952637, "eval_runtime": 9.2939, "eval_samples_per_second": 7531.799, "eval_steps_per_second": 14.741, "step": 107442 }, { "epoch": 9.05, "learning_rate": 5.965611239253512e-05, "loss": 2.6407, "step": 108000 }, { "epoch": 9.13, "learning_rate": 5.441392325435102e-05, "loss": 2.6569, "step": 109000 }, { "epoch": 9.21, "learning_rate": 4.917173411616691e-05, "loss": 2.6657, "step": 110000 }, { "epoch": 9.3, "learning_rate": 4.3929544977982805e-05, "loss": 2.6412, "step": 111000 }, { "epoch": 9.38, "learning_rate": 3.8687355839798705e-05, "loss": 2.6475, "step": 112000 }, { "epoch": 9.47, "learning_rate": 3.344516670161459e-05, "loss": 2.6345, "step": 113000 }, { "epoch": 9.55, "learning_rate": 2.820297756343049e-05, "loss": 2.6401, "step": 114000 }, { "epoch": 9.63, "learning_rate": 2.2960788425246382e-05, "loss": 2.6381, "step": 115000 }, { "epoch": 9.72, "learning_rate": 1.771859928706228e-05, "loss": 2.6571, "step": 116000 }, { "epoch": 9.8, "learning_rate": 1.2476410148878172e-05, "loss": 2.6252, "step": 117000 }, { "epoch": 9.88, "learning_rate": 7.234221010694066e-06, "loss": 2.6229, "step": 118000 }, { "epoch": 9.97, "learning_rate": 1.99203187250996e-06, "loss": 2.6462, "step": 119000 }, { "epoch": 10.0, "eval_accuracy": 0.4501244611741849, "eval_loss": 2.696192502975464, "eval_runtime": 8.7519, "eval_samples_per_second": 7998.23, "eval_steps_per_second": 15.654, "step": 119380 }, { "epoch": 10.0, "step": 119380, "total_flos": 6557990097185280.0, "train_loss": 3.087518670990711, "train_runtime": 7284.0934, "train_samples_per_second": 1048.846, "train_steps_per_second": 16.389 } ], "logging_steps": 1000, "max_steps": 119380, "num_train_epochs": 10, "save_steps": 2000, "total_flos": 6557990097185280.0, "trial_name": null, "trial_params": null }