{ "best_metric": 0.9703001976013184, "best_model_checkpoint": "/kaggle/output/checkpoint-114000", "epoch": 4.644719687092568, "eval_steps": 1000, "global_step": 114000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.7777777777777777e-11, "loss": 1.2184, "step": 1 }, { "epoch": 0.04, "learning_rate": 2.7750000000000004e-08, "loss": 1.1394, "step": 1000 }, { "epoch": 0.04, "eval_accuracy": 0.3327345309381238, "eval_loss": 1.1149410009384155, "eval_runtime": 20.6803, "eval_samples_per_second": 242.26, "eval_steps_per_second": 30.319, "step": 1000 }, { "epoch": 0.08, "learning_rate": 5.5527777777777784e-08, "loss": 1.1141, "step": 2000 }, { "epoch": 0.08, "eval_accuracy": 0.3401197604790419, "eval_loss": 1.104099988937378, "eval_runtime": 20.8477, "eval_samples_per_second": 240.314, "eval_steps_per_second": 30.075, "step": 2000 }, { "epoch": 0.12, "learning_rate": 8.330555555555556e-08, "loss": 1.116, "step": 3000 }, { "epoch": 0.12, "eval_accuracy": 0.3407185628742515, "eval_loss": 1.1040862798690796, "eval_runtime": 20.6818, "eval_samples_per_second": 242.242, "eval_steps_per_second": 30.317, "step": 3000 }, { "epoch": 0.16, "learning_rate": 1.1108333333333333e-07, "loss": 1.1158, "step": 4000 }, { "epoch": 0.16, "eval_accuracy": 0.32894211576846305, "eval_loss": 1.1020556688308716, "eval_runtime": 20.8541, "eval_samples_per_second": 240.241, "eval_steps_per_second": 30.066, "step": 4000 }, { "epoch": 0.2, "learning_rate": 1.3883333333333335e-07, "loss": 1.1135, "step": 5000 }, { "epoch": 0.2, "eval_accuracy": 0.34271457085828344, "eval_loss": 1.1008552312850952, "eval_runtime": 20.8055, "eval_samples_per_second": 240.802, "eval_steps_per_second": 30.136, "step": 5000 }, { "epoch": 0.24, "learning_rate": 1.6658333333333335e-07, "loss": 1.1121, "step": 6000 }, { "epoch": 0.24, "eval_accuracy": 0.3395209580838323, "eval_loss": 1.1004050970077515, "eval_runtime": 20.8985, "eval_samples_per_second": 239.731, "eval_steps_per_second": 30.002, "step": 6000 }, { "epoch": 0.29, "learning_rate": 1.9436111111111112e-07, "loss": 1.1089, "step": 7000 }, { "epoch": 0.29, "eval_accuracy": 0.35788423153692617, "eval_loss": 1.0985721349716187, "eval_runtime": 20.84, "eval_samples_per_second": 240.403, "eval_steps_per_second": 30.086, "step": 7000 }, { "epoch": 0.33, "learning_rate": 2.2213888888888891e-07, "loss": 1.1079, "step": 8000 }, { "epoch": 0.33, "eval_accuracy": 0.3331337325349301, "eval_loss": 1.098374843597412, "eval_runtime": 20.7886, "eval_samples_per_second": 240.998, "eval_steps_per_second": 30.161, "step": 8000 }, { "epoch": 0.37, "learning_rate": 2.4988888888888893e-07, "loss": 1.1087, "step": 9000 }, { "epoch": 0.37, "eval_accuracy": 0.34510978043912177, "eval_loss": 1.0993521213531494, "eval_runtime": 20.782, "eval_samples_per_second": 241.074, "eval_steps_per_second": 30.17, "step": 9000 }, { "epoch": 0.41, "learning_rate": 2.776666666666667e-07, "loss": 1.109, "step": 10000 }, { "epoch": 0.41, "eval_accuracy": 0.3475049900199601, "eval_loss": 1.0967597961425781, "eval_runtime": 20.6798, "eval_samples_per_second": 242.265, "eval_steps_per_second": 30.319, "step": 10000 }, { "epoch": 0.45, "learning_rate": 3.054444444444444e-07, "loss": 1.1052, "step": 11000 }, { "epoch": 0.45, "eval_accuracy": 0.37544910179640717, "eval_loss": 1.0941349267959595, "eval_runtime": 20.8641, "eval_samples_per_second": 240.126, "eval_steps_per_second": 30.052, "step": 11000 }, { "epoch": 0.49, "learning_rate": 3.3322222222222225e-07, "loss": 1.105, "step": 12000 }, { "epoch": 0.49, "eval_accuracy": 0.3834331337325349, "eval_loss": 1.0927647352218628, "eval_runtime": 20.6541, "eval_samples_per_second": 242.567, "eval_steps_per_second": 30.357, "step": 12000 }, { "epoch": 0.53, "learning_rate": 3.609722222222222e-07, "loss": 1.1016, "step": 13000 }, { "epoch": 0.53, "eval_accuracy": 0.3457085828343313, "eval_loss": 1.0942081212997437, "eval_runtime": 21.0733, "eval_samples_per_second": 237.742, "eval_steps_per_second": 29.753, "step": 13000 }, { "epoch": 0.57, "learning_rate": 3.8875e-07, "loss": 1.1031, "step": 14000 }, { "epoch": 0.57, "eval_accuracy": 0.37005988023952097, "eval_loss": 1.0918152332305908, "eval_runtime": 20.9151, "eval_samples_per_second": 239.54, "eval_steps_per_second": 29.978, "step": 14000 }, { "epoch": 0.61, "learning_rate": 4.1652777777777786e-07, "loss": 1.1026, "step": 15000 }, { "epoch": 0.61, "eval_accuracy": 0.3790419161676647, "eval_loss": 1.0895211696624756, "eval_runtime": 21.0591, "eval_samples_per_second": 237.902, "eval_steps_per_second": 29.773, "step": 15000 }, { "epoch": 0.65, "learning_rate": 4.4427777777777783e-07, "loss": 1.0988, "step": 16000 }, { "epoch": 0.65, "eval_accuracy": 0.4101796407185629, "eval_loss": 1.0852997303009033, "eval_runtime": 20.9509, "eval_samples_per_second": 239.131, "eval_steps_per_second": 29.927, "step": 16000 }, { "epoch": 0.69, "learning_rate": 4.720555555555556e-07, "loss": 1.0974, "step": 17000 }, { "epoch": 0.69, "eval_accuracy": 0.43213572854291415, "eval_loss": 1.0791982412338257, "eval_runtime": 20.7526, "eval_samples_per_second": 241.415, "eval_steps_per_second": 30.213, "step": 17000 }, { "epoch": 0.73, "learning_rate": 4.998055555555556e-07, "loss": 1.0932, "step": 18000 }, { "epoch": 0.73, "eval_accuracy": 0.4275449101796407, "eval_loss": 1.072191596031189, "eval_runtime": 21.2435, "eval_samples_per_second": 235.837, "eval_steps_per_second": 29.515, "step": 18000 }, { "epoch": 0.77, "learning_rate": 5.275833333333334e-07, "loss": 1.0833, "step": 19000 }, { "epoch": 0.77, "eval_accuracy": 0.43233532934131735, "eval_loss": 1.06425940990448, "eval_runtime": 20.7923, "eval_samples_per_second": 240.955, "eval_steps_per_second": 30.155, "step": 19000 }, { "epoch": 0.81, "learning_rate": 5.553333333333334e-07, "loss": 1.0787, "step": 20000 }, { "epoch": 0.81, "eval_accuracy": 0.4295409181636727, "eval_loss": 1.0638529062271118, "eval_runtime": 21.0018, "eval_samples_per_second": 238.551, "eval_steps_per_second": 29.855, "step": 20000 }, { "epoch": 0.86, "learning_rate": 5.831111111111111e-07, "loss": 1.0779, "step": 21000 }, { "epoch": 0.86, "eval_accuracy": 0.4243512974051896, "eval_loss": 1.0603673458099365, "eval_runtime": 20.9689, "eval_samples_per_second": 238.926, "eval_steps_per_second": 29.901, "step": 21000 }, { "epoch": 0.9, "learning_rate": 6.108888888888888e-07, "loss": 1.0751, "step": 22000 }, { "epoch": 0.9, "eval_accuracy": 0.43233532934131735, "eval_loss": 1.0603009462356567, "eval_runtime": 20.8897, "eval_samples_per_second": 239.831, "eval_steps_per_second": 30.015, "step": 22000 }, { "epoch": 0.94, "learning_rate": 6.386388888888889e-07, "loss": 1.0776, "step": 23000 }, { "epoch": 0.94, "eval_accuracy": 0.42734530938123755, "eval_loss": 1.0591468811035156, "eval_runtime": 20.964, "eval_samples_per_second": 238.981, "eval_steps_per_second": 29.908, "step": 23000 }, { "epoch": 0.98, "learning_rate": 6.664166666666667e-07, "loss": 1.0754, "step": 24000 }, { "epoch": 0.98, "eval_accuracy": 0.4245508982035928, "eval_loss": 1.0589721202850342, "eval_runtime": 20.9053, "eval_samples_per_second": 239.652, "eval_steps_per_second": 29.992, "step": 24000 }, { "epoch": 1.02, "learning_rate": 6.941666666666667e-07, "loss": 1.0736, "step": 25000 }, { "epoch": 1.02, "eval_accuracy": 0.43213572854291415, "eval_loss": 1.0583962202072144, "eval_runtime": 21.3265, "eval_samples_per_second": 234.919, "eval_steps_per_second": 29.4, "step": 25000 }, { "epoch": 1.06, "learning_rate": 7.219444444444444e-07, "loss": 1.0717, "step": 26000 }, { "epoch": 1.06, "eval_accuracy": 0.4305389221556886, "eval_loss": 1.0561293363571167, "eval_runtime": 21.3034, "eval_samples_per_second": 235.174, "eval_steps_per_second": 29.432, "step": 26000 }, { "epoch": 1.1, "learning_rate": 7.496944444444444e-07, "loss": 1.0709, "step": 27000 }, { "epoch": 1.1, "eval_accuracy": 0.4281437125748503, "eval_loss": 1.0555357933044434, "eval_runtime": 21.2178, "eval_samples_per_second": 236.123, "eval_steps_per_second": 29.551, "step": 27000 }, { "epoch": 1.14, "learning_rate": 7.774722222222223e-07, "loss": 1.0701, "step": 28000 }, { "epoch": 1.14, "eval_accuracy": 0.4217564870259481, "eval_loss": 1.054961085319519, "eval_runtime": 21.1775, "eval_samples_per_second": 236.571, "eval_steps_per_second": 29.607, "step": 28000 }, { "epoch": 1.18, "learning_rate": 8.052222222222223e-07, "loss": 1.0641, "step": 29000 }, { "epoch": 1.18, "eval_accuracy": 0.4291417165668663, "eval_loss": 1.0518379211425781, "eval_runtime": 21.0932, "eval_samples_per_second": 237.517, "eval_steps_per_second": 29.725, "step": 29000 }, { "epoch": 1.22, "learning_rate": 8.330000000000001e-07, "loss": 1.064, "step": 30000 }, { "epoch": 1.22, "eval_accuracy": 0.43173652694610776, "eval_loss": 1.0493717193603516, "eval_runtime": 21.2843, "eval_samples_per_second": 235.385, "eval_steps_per_second": 29.458, "step": 30000 }, { "epoch": 1.26, "learning_rate": 8.607500000000001e-07, "loss": 1.0693, "step": 31000 }, { "epoch": 1.26, "eval_accuracy": 0.4291417165668663, "eval_loss": 1.0521764755249023, "eval_runtime": 21.2278, "eval_samples_per_second": 236.011, "eval_steps_per_second": 29.537, "step": 31000 }, { "epoch": 1.3, "learning_rate": 8.885277777777779e-07, "loss": 1.0649, "step": 32000 }, { "epoch": 1.3, "eval_accuracy": 0.42375249500998, "eval_loss": 1.0528494119644165, "eval_runtime": 21.1249, "eval_samples_per_second": 237.161, "eval_steps_per_second": 29.681, "step": 32000 }, { "epoch": 1.34, "learning_rate": 9.163055555555556e-07, "loss": 1.0619, "step": 33000 }, { "epoch": 1.34, "eval_accuracy": 0.43293413173652695, "eval_loss": 1.049193263053894, "eval_runtime": 21.241, "eval_samples_per_second": 235.865, "eval_steps_per_second": 29.518, "step": 33000 }, { "epoch": 1.39, "learning_rate": 9.440555555555557e-07, "loss": 1.0582, "step": 34000 }, { "epoch": 1.39, "eval_accuracy": 0.4357285429141717, "eval_loss": 1.04512619972229, "eval_runtime": 21.2531, "eval_samples_per_second": 235.73, "eval_steps_per_second": 29.502, "step": 34000 }, { "epoch": 1.43, "learning_rate": 9.718333333333334e-07, "loss": 1.0629, "step": 35000 }, { "epoch": 1.43, "eval_accuracy": 0.43253493013972055, "eval_loss": 1.043523907661438, "eval_runtime": 21.2176, "eval_samples_per_second": 236.124, "eval_steps_per_second": 29.551, "step": 35000 }, { "epoch": 1.47, "learning_rate": 9.995833333333334e-07, "loss": 1.0588, "step": 36000 }, { "epoch": 1.47, "eval_accuracy": 0.4307385229540918, "eval_loss": 1.0413768291473389, "eval_runtime": 21.2225, "eval_samples_per_second": 236.07, "eval_steps_per_second": 29.544, "step": 36000 }, { "epoch": 1.51, "learning_rate": 1.0273611111111112e-06, "loss": 1.0552, "step": 37000 }, { "epoch": 1.51, "eval_accuracy": 0.4415169660678643, "eval_loss": 1.0397700071334839, "eval_runtime": 21.0651, "eval_samples_per_second": 237.835, "eval_steps_per_second": 29.765, "step": 37000 }, { "epoch": 1.55, "learning_rate": 1.055138888888889e-06, "loss": 1.0567, "step": 38000 }, { "epoch": 1.55, "eval_accuracy": 0.4419161676646707, "eval_loss": 1.0390877723693848, "eval_runtime": 21.2858, "eval_samples_per_second": 235.368, "eval_steps_per_second": 29.456, "step": 38000 }, { "epoch": 1.59, "learning_rate": 1.0829166666666667e-06, "loss": 1.054, "step": 39000 }, { "epoch": 1.59, "eval_accuracy": 0.43952095808383235, "eval_loss": 1.0404103994369507, "eval_runtime": 21.2544, "eval_samples_per_second": 235.715, "eval_steps_per_second": 29.5, "step": 39000 }, { "epoch": 1.63, "learning_rate": 1.1106666666666668e-06, "loss": 1.049, "step": 40000 }, { "epoch": 1.63, "eval_accuracy": 0.4477045908183633, "eval_loss": 1.0360453128814697, "eval_runtime": 21.0993, "eval_samples_per_second": 237.448, "eval_steps_per_second": 29.717, "step": 40000 }, { "epoch": 1.67, "learning_rate": 1.1384444444444446e-06, "loss": 1.0522, "step": 41000 }, { "epoch": 1.67, "eval_accuracy": 0.443313373253493, "eval_loss": 1.0359567403793335, "eval_runtime": 21.2828, "eval_samples_per_second": 235.401, "eval_steps_per_second": 29.46, "step": 41000 }, { "epoch": 1.71, "learning_rate": 1.1661944444444447e-06, "loss": 1.0459, "step": 42000 }, { "epoch": 1.71, "eval_accuracy": 0.437125748502994, "eval_loss": 1.0328505039215088, "eval_runtime": 21.2626, "eval_samples_per_second": 235.625, "eval_steps_per_second": 29.488, "step": 42000 }, { "epoch": 1.75, "learning_rate": 1.1939722222222222e-06, "loss": 1.0488, "step": 43000 }, { "epoch": 1.75, "eval_accuracy": 0.4379241516966068, "eval_loss": 1.0460196733474731, "eval_runtime": 21.2023, "eval_samples_per_second": 236.295, "eval_steps_per_second": 29.572, "step": 43000 }, { "epoch": 1.79, "learning_rate": 1.2217222222222223e-06, "loss": 1.0504, "step": 44000 }, { "epoch": 1.79, "eval_accuracy": 0.4421157684630739, "eval_loss": 1.0345144271850586, "eval_runtime": 21.2684, "eval_samples_per_second": 235.561, "eval_steps_per_second": 29.48, "step": 44000 }, { "epoch": 1.83, "learning_rate": 1.2494722222222224e-06, "loss": 1.0482, "step": 45000 }, { "epoch": 1.83, "eval_accuracy": 0.46467065868263474, "eval_loss": 1.0280784368515015, "eval_runtime": 21.1069, "eval_samples_per_second": 237.363, "eval_steps_per_second": 29.706, "step": 45000 }, { "epoch": 1.87, "learning_rate": 1.2772500000000001e-06, "loss": 1.046, "step": 46000 }, { "epoch": 1.87, "eval_accuracy": 0.4554890219560878, "eval_loss": 1.031961441040039, "eval_runtime": 21.2098, "eval_samples_per_second": 236.211, "eval_steps_per_second": 29.562, "step": 46000 }, { "epoch": 1.91, "learning_rate": 1.3050277777777777e-06, "loss": 1.0391, "step": 47000 }, { "epoch": 1.91, "eval_accuracy": 0.46706586826347307, "eval_loss": 1.023941159248352, "eval_runtime": 21.2269, "eval_samples_per_second": 236.021, "eval_steps_per_second": 29.538, "step": 47000 }, { "epoch": 1.96, "learning_rate": 1.3328055555555555e-06, "loss": 1.0444, "step": 48000 }, { "epoch": 1.96, "eval_accuracy": 0.4491017964071856, "eval_loss": 1.036636471748352, "eval_runtime": 21.1179, "eval_samples_per_second": 237.239, "eval_steps_per_second": 29.69, "step": 48000 }, { "epoch": 2.0, "learning_rate": 1.3605555555555555e-06, "loss": 1.0386, "step": 49000 }, { "epoch": 2.0, "eval_accuracy": 0.47305389221556887, "eval_loss": 1.0199605226516724, "eval_runtime": 21.2669, "eval_samples_per_second": 235.577, "eval_steps_per_second": 29.482, "step": 49000 }, { "epoch": 2.04, "learning_rate": 1.3883333333333333e-06, "loss": 1.0379, "step": 50000 }, { "epoch": 2.04, "eval_accuracy": 0.4738522954091816, "eval_loss": 1.0184080600738525, "eval_runtime": 21.2806, "eval_samples_per_second": 235.426, "eval_steps_per_second": 29.463, "step": 50000 }, { "epoch": 2.08, "learning_rate": 1.416111111111111e-06, "loss": 1.0365, "step": 51000 }, { "epoch": 2.08, "eval_accuracy": 0.474251497005988, "eval_loss": 1.0143232345581055, "eval_runtime": 21.1003, "eval_samples_per_second": 237.437, "eval_steps_per_second": 29.715, "step": 51000 }, { "epoch": 2.12, "learning_rate": 1.4438888888888889e-06, "loss": 1.0356, "step": 52000 }, { "epoch": 2.12, "eval_accuracy": 0.4782435129740519, "eval_loss": 1.0179694890975952, "eval_runtime": 21.2847, "eval_samples_per_second": 235.38, "eval_steps_per_second": 29.458, "step": 52000 }, { "epoch": 2.16, "learning_rate": 1.471611111111111e-06, "loss": 1.0401, "step": 53000 }, { "epoch": 2.16, "eval_accuracy": 0.4864271457085828, "eval_loss": 1.008955955505371, "eval_runtime": 21.0808, "eval_samples_per_second": 237.657, "eval_steps_per_second": 29.743, "step": 53000 }, { "epoch": 2.2, "learning_rate": 1.4993888888888888e-06, "loss": 1.0338, "step": 54000 }, { "epoch": 2.2, "eval_accuracy": 0.4870259481037924, "eval_loss": 1.0100921392440796, "eval_runtime": 21.2581, "eval_samples_per_second": 235.674, "eval_steps_per_second": 29.495, "step": 54000 }, { "epoch": 2.24, "learning_rate": 1.5271388888888889e-06, "loss": 1.0322, "step": 55000 }, { "epoch": 2.24, "eval_accuracy": 0.48403193612774453, "eval_loss": 1.0082924365997314, "eval_runtime": 21.2835, "eval_samples_per_second": 235.394, "eval_steps_per_second": 29.459, "step": 55000 }, { "epoch": 2.28, "learning_rate": 1.5549166666666666e-06, "loss": 1.0287, "step": 56000 }, { "epoch": 2.28, "eval_accuracy": 0.47065868263473054, "eval_loss": 1.0229328870773315, "eval_runtime": 21.1356, "eval_samples_per_second": 237.04, "eval_steps_per_second": 29.666, "step": 56000 }, { "epoch": 2.32, "learning_rate": 1.5826944444444446e-06, "loss": 1.0291, "step": 57000 }, { "epoch": 2.32, "eval_accuracy": 0.4844311377245509, "eval_loss": 1.0074658393859863, "eval_runtime": 21.2919, "eval_samples_per_second": 235.301, "eval_steps_per_second": 29.448, "step": 57000 }, { "epoch": 2.36, "learning_rate": 1.6104722222222222e-06, "loss": 1.0304, "step": 58000 }, { "epoch": 2.36, "eval_accuracy": 0.4874251497005988, "eval_loss": 1.007895588874817, "eval_runtime": 21.1997, "eval_samples_per_second": 236.324, "eval_steps_per_second": 29.576, "step": 58000 }, { "epoch": 2.4, "learning_rate": 1.6381944444444445e-06, "loss": 1.0259, "step": 59000 }, { "epoch": 2.4, "eval_accuracy": 0.492814371257485, "eval_loss": 0.9993996620178223, "eval_runtime": 21.0668, "eval_samples_per_second": 237.815, "eval_steps_per_second": 29.763, "step": 59000 }, { "epoch": 2.44, "learning_rate": 1.6659722222222223e-06, "loss": 1.0314, "step": 60000 }, { "epoch": 2.44, "eval_accuracy": 0.48902195608782434, "eval_loss": 1.0036838054656982, "eval_runtime": 21.2359, "eval_samples_per_second": 235.921, "eval_steps_per_second": 29.525, "step": 60000 }, { "epoch": 2.49, "learning_rate": 1.6937500000000003e-06, "loss": 1.0276, "step": 61000 }, { "epoch": 2.49, "eval_accuracy": 0.48502994011976047, "eval_loss": 1.0065844058990479, "eval_runtime": 21.07, "eval_samples_per_second": 237.778, "eval_steps_per_second": 29.758, "step": 61000 }, { "epoch": 2.53, "learning_rate": 1.7215000000000002e-06, "loss": 1.0298, "step": 62000 }, { "epoch": 2.53, "eval_accuracy": 0.48263473053892214, "eval_loss": 1.0085350275039673, "eval_runtime": 21.4949, "eval_samples_per_second": 233.079, "eval_steps_per_second": 29.17, "step": 62000 }, { "epoch": 2.57, "learning_rate": 1.7492777777777777e-06, "loss": 1.0292, "step": 63000 }, { "epoch": 2.57, "eval_accuracy": 0.49560878243512974, "eval_loss": 0.9984678030014038, "eval_runtime": 21.4688, "eval_samples_per_second": 233.362, "eval_steps_per_second": 29.205, "step": 63000 }, { "epoch": 2.61, "learning_rate": 1.7770277777777778e-06, "loss": 1.0273, "step": 64000 }, { "epoch": 2.61, "eval_accuracy": 0.4942115768463074, "eval_loss": 0.9960917830467224, "eval_runtime": 21.307, "eval_samples_per_second": 235.134, "eval_steps_per_second": 29.427, "step": 64000 }, { "epoch": 2.65, "learning_rate": 1.8048055555555558e-06, "loss": 1.0256, "step": 65000 }, { "epoch": 2.65, "eval_accuracy": 0.49840319361277446, "eval_loss": 0.9994720816612244, "eval_runtime": 21.5116, "eval_samples_per_second": 232.897, "eval_steps_per_second": 29.147, "step": 65000 }, { "epoch": 2.69, "learning_rate": 1.8325833333333333e-06, "loss": 1.0306, "step": 66000 }, { "epoch": 2.69, "eval_accuracy": 0.49660678642714573, "eval_loss": 0.9951530694961548, "eval_runtime": 21.5301, "eval_samples_per_second": 232.697, "eval_steps_per_second": 29.122, "step": 66000 }, { "epoch": 2.73, "learning_rate": 1.8603333333333334e-06, "loss": 1.0245, "step": 67000 }, { "epoch": 2.73, "eval_accuracy": 0.499001996007984, "eval_loss": 0.9978940486907959, "eval_runtime": 21.0729, "eval_samples_per_second": 237.746, "eval_steps_per_second": 29.754, "step": 67000 }, { "epoch": 2.77, "learning_rate": 1.8881111111111114e-06, "loss": 1.0247, "step": 68000 }, { "epoch": 2.77, "eval_accuracy": 0.5, "eval_loss": 0.9951012134552002, "eval_runtime": 21.1793, "eval_samples_per_second": 236.552, "eval_steps_per_second": 29.604, "step": 68000 }, { "epoch": 2.81, "learning_rate": 1.9158611111111115e-06, "loss": 1.0241, "step": 69000 }, { "epoch": 2.81, "eval_accuracy": 0.5021956087824351, "eval_loss": 0.9969767332077026, "eval_runtime": 21.2229, "eval_samples_per_second": 236.066, "eval_steps_per_second": 29.544, "step": 69000 }, { "epoch": 2.85, "learning_rate": 1.943638888888889e-06, "loss": 1.0242, "step": 70000 }, { "epoch": 2.85, "eval_accuracy": 0.49720558882235527, "eval_loss": 1.00163733959198, "eval_runtime": 21.0442, "eval_samples_per_second": 238.071, "eval_steps_per_second": 29.794, "step": 70000 }, { "epoch": 2.89, "learning_rate": 1.971416666666667e-06, "loss": 1.017, "step": 71000 }, { "epoch": 2.89, "eval_accuracy": 0.49600798403193613, "eval_loss": 1.0031572580337524, "eval_runtime": 21.1987, "eval_samples_per_second": 236.336, "eval_steps_per_second": 29.577, "step": 71000 }, { "epoch": 2.93, "learning_rate": 1.999166666666667e-06, "loss": 1.0237, "step": 72000 }, { "epoch": 2.93, "eval_accuracy": 0.49001996007984033, "eval_loss": 1.0067389011383057, "eval_runtime": 21.2825, "eval_samples_per_second": 235.405, "eval_steps_per_second": 29.461, "step": 72000 }, { "epoch": 2.97, "learning_rate": 2.0269444444444444e-06, "loss": 1.0208, "step": 73000 }, { "epoch": 2.97, "eval_accuracy": 0.49560878243512974, "eval_loss": 1.0011804103851318, "eval_runtime": 21.137, "eval_samples_per_second": 237.025, "eval_steps_per_second": 29.664, "step": 73000 }, { "epoch": 3.01, "learning_rate": 2.0546944444444447e-06, "loss": 1.021, "step": 74000 }, { "epoch": 3.01, "eval_accuracy": 0.49580838323353293, "eval_loss": 0.9941307902336121, "eval_runtime": 21.161, "eval_samples_per_second": 236.757, "eval_steps_per_second": 29.63, "step": 74000 }, { "epoch": 3.06, "learning_rate": 2.0824722222222223e-06, "loss": 1.0154, "step": 75000 }, { "epoch": 3.06, "eval_accuracy": 0.49580838323353293, "eval_loss": 0.9971462488174438, "eval_runtime": 21.0263, "eval_samples_per_second": 238.273, "eval_steps_per_second": 29.82, "step": 75000 }, { "epoch": 3.1, "learning_rate": 2.1102222222222226e-06, "loss": 1.0173, "step": 76000 }, { "epoch": 3.1, "eval_accuracy": 0.5011976047904192, "eval_loss": 0.9884433150291443, "eval_runtime": 21.2199, "eval_samples_per_second": 236.099, "eval_steps_per_second": 29.548, "step": 76000 }, { "epoch": 3.14, "learning_rate": 2.138e-06, "loss": 1.0181, "step": 77000 }, { "epoch": 3.14, "eval_accuracy": 0.4808383233532934, "eval_loss": 1.0162075757980347, "eval_runtime": 21.2108, "eval_samples_per_second": 236.201, "eval_steps_per_second": 29.56, "step": 77000 }, { "epoch": 3.18, "learning_rate": 2.16575e-06, "loss": 1.0112, "step": 78000 }, { "epoch": 3.18, "eval_accuracy": 0.49880239520958086, "eval_loss": 0.9902569651603699, "eval_runtime": 21.032, "eval_samples_per_second": 238.209, "eval_steps_per_second": 29.812, "step": 78000 }, { "epoch": 3.22, "learning_rate": 2.193527777777778e-06, "loss": 1.0166, "step": 79000 }, { "epoch": 3.22, "eval_accuracy": 0.49500998003992014, "eval_loss": 1.0056451559066772, "eval_runtime": 21.1866, "eval_samples_per_second": 236.47, "eval_steps_per_second": 29.594, "step": 79000 }, { "epoch": 3.26, "learning_rate": 2.221277777777778e-06, "loss": 1.0135, "step": 80000 }, { "epoch": 3.26, "eval_accuracy": 0.49161676646706587, "eval_loss": 1.004488229751587, "eval_runtime": 21.2075, "eval_samples_per_second": 236.237, "eval_steps_per_second": 29.565, "step": 80000 }, { "epoch": 3.3, "learning_rate": 2.249055555555556e-06, "loss": 1.0147, "step": 81000 }, { "epoch": 3.3, "eval_accuracy": 0.49101796407185627, "eval_loss": 1.0022324323654175, "eval_runtime": 21.1622, "eval_samples_per_second": 236.743, "eval_steps_per_second": 29.628, "step": 81000 }, { "epoch": 3.34, "learning_rate": 2.2768055555555557e-06, "loss": 1.0249, "step": 82000 }, { "epoch": 3.34, "eval_accuracy": 0.49540918163672654, "eval_loss": 0.9961836338043213, "eval_runtime": 21.2661, "eval_samples_per_second": 235.586, "eval_steps_per_second": 29.484, "step": 82000 }, { "epoch": 3.38, "learning_rate": 2.3045833333333336e-06, "loss": 1.012, "step": 83000 }, { "epoch": 3.38, "eval_accuracy": 0.49520958083832334, "eval_loss": 0.997968316078186, "eval_runtime": 21.1127, "eval_samples_per_second": 237.298, "eval_steps_per_second": 29.698, "step": 83000 }, { "epoch": 3.42, "learning_rate": 2.3323333333333335e-06, "loss": 1.0153, "step": 84000 }, { "epoch": 3.42, "eval_accuracy": 0.5063872255489021, "eval_loss": 0.9885319471359253, "eval_runtime": 21.2015, "eval_samples_per_second": 236.304, "eval_steps_per_second": 29.573, "step": 84000 }, { "epoch": 3.46, "learning_rate": 2.360111111111111e-06, "loss": 1.0113, "step": 85000 }, { "epoch": 3.46, "eval_accuracy": 0.49860279441117766, "eval_loss": 0.9927662014961243, "eval_runtime": 21.2447, "eval_samples_per_second": 235.823, "eval_steps_per_second": 29.513, "step": 85000 }, { "epoch": 3.5, "learning_rate": 2.3878611111111113e-06, "loss": 1.0222, "step": 86000 }, { "epoch": 3.5, "eval_accuracy": 0.5073852295409181, "eval_loss": 0.9855450987815857, "eval_runtime": 21.03, "eval_samples_per_second": 238.231, "eval_steps_per_second": 29.815, "step": 86000 }, { "epoch": 3.54, "learning_rate": 2.4156388888888893e-06, "loss": 1.0062, "step": 87000 }, { "epoch": 3.54, "eval_accuracy": 0.5075848303393213, "eval_loss": 0.9856505990028381, "eval_runtime": 21.1796, "eval_samples_per_second": 236.548, "eval_steps_per_second": 29.604, "step": 87000 }, { "epoch": 3.59, "learning_rate": 2.443388888888889e-06, "loss": 1.0157, "step": 88000 }, { "epoch": 3.59, "eval_accuracy": 0.5115768463073852, "eval_loss": 0.9844857454299927, "eval_runtime": 21.2024, "eval_samples_per_second": 236.294, "eval_steps_per_second": 29.572, "step": 88000 }, { "epoch": 3.63, "learning_rate": 2.4711666666666668e-06, "loss": 1.0069, "step": 89000 }, { "epoch": 3.63, "eval_accuracy": 0.499001996007984, "eval_loss": 0.9894497394561768, "eval_runtime": 21.2532, "eval_samples_per_second": 235.729, "eval_steps_per_second": 29.501, "step": 89000 }, { "epoch": 3.67, "learning_rate": 2.4989166666666666e-06, "loss": 1.0164, "step": 90000 }, { "epoch": 3.67, "eval_accuracy": 0.5101796407185629, "eval_loss": 0.9842925667762756, "eval_runtime": 21.1914, "eval_samples_per_second": 236.416, "eval_steps_per_second": 29.587, "step": 90000 }, { "epoch": 3.71, "learning_rate": 2.5266944444444446e-06, "loss": 1.0175, "step": 91000 }, { "epoch": 3.71, "eval_accuracy": 0.5055888223552895, "eval_loss": 0.9913986325263977, "eval_runtime": 21.0589, "eval_samples_per_second": 237.904, "eval_steps_per_second": 29.774, "step": 91000 }, { "epoch": 3.75, "learning_rate": 2.554444444444445e-06, "loss": 1.013, "step": 92000 }, { "epoch": 3.75, "eval_accuracy": 0.5057884231536927, "eval_loss": 0.9887382388114929, "eval_runtime": 21.2965, "eval_samples_per_second": 235.25, "eval_steps_per_second": 29.441, "step": 92000 }, { "epoch": 3.79, "learning_rate": 2.5822222222222224e-06, "loss": 1.0157, "step": 93000 }, { "epoch": 3.79, "eval_accuracy": 0.5127744510978044, "eval_loss": 0.9854277968406677, "eval_runtime": 21.2839, "eval_samples_per_second": 235.389, "eval_steps_per_second": 29.459, "step": 93000 }, { "epoch": 3.83, "learning_rate": 2.6099722222222223e-06, "loss": 1.0108, "step": 94000 }, { "epoch": 3.83, "eval_accuracy": 0.5039920159680639, "eval_loss": 0.9907957315444946, "eval_runtime": 21.0477, "eval_samples_per_second": 238.03, "eval_steps_per_second": 29.789, "step": 94000 }, { "epoch": 3.87, "learning_rate": 2.6377500000000003e-06, "loss": 1.0132, "step": 95000 }, { "epoch": 3.87, "eval_accuracy": 0.5161676646706587, "eval_loss": 0.9830310940742493, "eval_runtime": 21.2625, "eval_samples_per_second": 235.626, "eval_steps_per_second": 29.489, "step": 95000 }, { "epoch": 3.91, "learning_rate": 2.6655e-06, "loss": 1.0139, "step": 96000 }, { "epoch": 3.91, "eval_accuracy": 0.5125748502994012, "eval_loss": 0.9806137681007385, "eval_runtime": 21.1848, "eval_samples_per_second": 236.49, "eval_steps_per_second": 29.597, "step": 96000 }, { "epoch": 3.95, "learning_rate": 2.6932777777777777e-06, "loss": 1.0104, "step": 97000 }, { "epoch": 3.95, "eval_accuracy": 0.49640718562874253, "eval_loss": 0.9965940117835999, "eval_runtime": 21.1145, "eval_samples_per_second": 237.278, "eval_steps_per_second": 29.695, "step": 97000 }, { "epoch": 3.99, "learning_rate": 2.721027777777778e-06, "loss": 1.011, "step": 98000 }, { "epoch": 3.99, "eval_accuracy": 0.49800399201596807, "eval_loss": 0.9846762418746948, "eval_runtime": 21.2444, "eval_samples_per_second": 235.827, "eval_steps_per_second": 29.514, "step": 98000 }, { "epoch": 4.03, "learning_rate": 2.748805555555556e-06, "loss": 1.0063, "step": 99000 }, { "epoch": 4.03, "eval_accuracy": 0.5103792415169661, "eval_loss": 0.9853057265281677, "eval_runtime": 21.0515, "eval_samples_per_second": 237.988, "eval_steps_per_second": 29.784, "step": 99000 }, { "epoch": 4.07, "learning_rate": 2.776555555555556e-06, "loss": 1.007, "step": 100000 }, { "epoch": 4.07, "eval_accuracy": 0.5115768463073852, "eval_loss": 0.983065128326416, "eval_runtime": 21.2494, "eval_samples_per_second": 235.771, "eval_steps_per_second": 29.507, "step": 100000 }, { "epoch": 4.12, "learning_rate": 2.8043333333333334e-06, "loss": 1.0107, "step": 101000 }, { "epoch": 4.12, "eval_accuracy": 0.5129740518962076, "eval_loss": 0.9863881468772888, "eval_runtime": 21.2658, "eval_samples_per_second": 235.59, "eval_steps_per_second": 29.484, "step": 101000 }, { "epoch": 4.16, "learning_rate": 2.8320833333333333e-06, "loss": 1.0055, "step": 102000 }, { "epoch": 4.16, "eval_accuracy": 0.49840319361277446, "eval_loss": 0.9891018867492676, "eval_runtime": 21.0702, "eval_samples_per_second": 237.777, "eval_steps_per_second": 29.758, "step": 102000 }, { "epoch": 4.2, "learning_rate": 2.8598611111111112e-06, "loss": 1.0115, "step": 103000 }, { "epoch": 4.2, "eval_accuracy": 0.5115768463073852, "eval_loss": 0.9800674319267273, "eval_runtime": 21.2539, "eval_samples_per_second": 235.722, "eval_steps_per_second": 29.5, "step": 103000 }, { "epoch": 4.24, "learning_rate": 2.8876111111111115e-06, "loss": 1.0039, "step": 104000 }, { "epoch": 4.24, "eval_accuracy": 0.49500998003992014, "eval_loss": 0.9979402422904968, "eval_runtime": 21.1981, "eval_samples_per_second": 236.342, "eval_steps_per_second": 29.578, "step": 104000 }, { "epoch": 4.28, "learning_rate": 2.915388888888889e-06, "loss": 1.0028, "step": 105000 }, { "epoch": 4.28, "eval_accuracy": 0.492814371257485, "eval_loss": 1.007880687713623, "eval_runtime": 21.1024, "eval_samples_per_second": 237.413, "eval_steps_per_second": 29.712, "step": 105000 }, { "epoch": 4.32, "learning_rate": 2.943138888888889e-06, "loss": 0.9993, "step": 106000 }, { "epoch": 4.32, "eval_accuracy": 0.513373253493014, "eval_loss": 0.9815743565559387, "eval_runtime": 21.2317, "eval_samples_per_second": 235.968, "eval_steps_per_second": 29.531, "step": 106000 }, { "epoch": 4.36, "learning_rate": 2.9709166666666665e-06, "loss": 1.0033, "step": 107000 }, { "epoch": 4.36, "eval_accuracy": 0.5021956087824351, "eval_loss": 0.9937964081764221, "eval_runtime": 21.2363, "eval_samples_per_second": 235.917, "eval_steps_per_second": 29.525, "step": 107000 }, { "epoch": 4.4, "learning_rate": 2.9986666666666668e-06, "loss": 1.003, "step": 108000 }, { "epoch": 4.4, "eval_accuracy": 0.5035928143712575, "eval_loss": 0.9884979128837585, "eval_runtime": 21.1776, "eval_samples_per_second": 236.57, "eval_steps_per_second": 29.607, "step": 108000 }, { "epoch": 4.44, "learning_rate": 3.0264444444444448e-06, "loss": 0.9986, "step": 109000 }, { "epoch": 4.44, "eval_accuracy": 0.5155688622754491, "eval_loss": 0.982122004032135, "eval_runtime": 21.1623, "eval_samples_per_second": 236.741, "eval_steps_per_second": 29.628, "step": 109000 }, { "epoch": 4.48, "learning_rate": 3.0541944444444446e-06, "loss": 1.0062, "step": 110000 }, { "epoch": 4.48, "eval_accuracy": 0.5125748502994012, "eval_loss": 0.9810440540313721, "eval_runtime": 21.0636, "eval_samples_per_second": 237.851, "eval_steps_per_second": 29.767, "step": 110000 }, { "epoch": 4.52, "learning_rate": 3.081972222222222e-06, "loss": 0.9979, "step": 111000 }, { "epoch": 4.52, "eval_accuracy": 0.5147704590818363, "eval_loss": 0.9768257141113281, "eval_runtime": 21.2634, "eval_samples_per_second": 235.616, "eval_steps_per_second": 29.487, "step": 111000 }, { "epoch": 4.56, "learning_rate": 3.109722222222222e-06, "loss": 1.0098, "step": 112000 }, { "epoch": 4.56, "eval_accuracy": 0.5179640718562875, "eval_loss": 0.9725316762924194, "eval_runtime": 21.2038, "eval_samples_per_second": 236.279, "eval_steps_per_second": 29.57, "step": 112000 }, { "epoch": 4.6, "learning_rate": 3.1375e-06, "loss": 0.9928, "step": 113000 }, { "epoch": 4.6, "eval_accuracy": 0.5055888223552895, "eval_loss": 0.9796192049980164, "eval_runtime": 21.0973, "eval_samples_per_second": 237.471, "eval_steps_per_second": 29.719, "step": 113000 }, { "epoch": 4.64, "learning_rate": 3.1652500000000003e-06, "loss": 1.0021, "step": 114000 }, { "epoch": 4.64, "eval_accuracy": 0.513373253493014, "eval_loss": 0.9703001976013184, "eval_runtime": 21.2766, "eval_samples_per_second": 235.47, "eval_steps_per_second": 29.469, "step": 114000 } ], "logging_steps": 1000, "max_steps": 10000000, "num_train_epochs": 408, "save_steps": 1000, "total_flos": 7.94374498984919e+16, "trial_name": null, "trial_params": null }