{ "best_metric": 0.9560614824295044, "best_model_checkpoint": "/kaggle/output/checkpoint-112000", "epoch": 4.644719687092568, "eval_steps": 1000, "global_step": 114000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.7777777777777777e-11, "loss": 1.05, "step": 1 }, { "epoch": 0.04, "learning_rate": 2.7750000000000004e-08, "loss": 1.1403, "step": 1000 }, { "epoch": 0.04, "eval_accuracy": 0.33712574850299404, "eval_loss": 1.1030837297439575, "eval_runtime": 29.7882, "eval_samples_per_second": 168.187, "eval_steps_per_second": 21.049, "step": 1000 }, { "epoch": 0.08, "learning_rate": 5.5527777777777784e-08, "loss": 1.1194, "step": 2000 }, { "epoch": 0.08, "eval_accuracy": 0.34331337325349304, "eval_loss": 1.1002165079116821, "eval_runtime": 29.8685, "eval_samples_per_second": 167.735, "eval_steps_per_second": 20.992, "step": 2000 }, { "epoch": 0.12, "learning_rate": 8.327777777777778e-08, "loss": 1.1194, "step": 3000 }, { "epoch": 0.12, "eval_accuracy": 0.3471057884231537, "eval_loss": 1.101266622543335, "eval_runtime": 29.709, "eval_samples_per_second": 168.636, "eval_steps_per_second": 21.105, "step": 3000 }, { "epoch": 0.16, "learning_rate": 1.1105555555555557e-07, "loss": 1.1166, "step": 4000 }, { "epoch": 0.16, "eval_accuracy": 0.34311377245508984, "eval_loss": 1.0984796285629272, "eval_runtime": 29.9401, "eval_samples_per_second": 167.334, "eval_steps_per_second": 20.942, "step": 4000 }, { "epoch": 0.2, "learning_rate": 1.3883333333333335e-07, "loss": 1.1137, "step": 5000 }, { "epoch": 0.2, "eval_accuracy": 0.3403193612774451, "eval_loss": 1.0974279642105103, "eval_runtime": 29.8185, "eval_samples_per_second": 168.016, "eval_steps_per_second": 21.027, "step": 5000 }, { "epoch": 0.24, "learning_rate": 1.6658333333333335e-07, "loss": 1.1109, "step": 6000 }, { "epoch": 0.24, "eval_accuracy": 0.3592814371257485, "eval_loss": 1.0984892845153809, "eval_runtime": 29.893, "eval_samples_per_second": 167.598, "eval_steps_per_second": 20.975, "step": 6000 }, { "epoch": 0.29, "learning_rate": 1.9436111111111112e-07, "loss": 1.1122, "step": 7000 }, { "epoch": 0.29, "eval_accuracy": 0.36327345309381237, "eval_loss": 1.0960686206817627, "eval_runtime": 29.8315, "eval_samples_per_second": 167.943, "eval_steps_per_second": 21.018, "step": 7000 }, { "epoch": 0.33, "learning_rate": 2.2213888888888891e-07, "loss": 1.1091, "step": 8000 }, { "epoch": 0.33, "eval_accuracy": 0.35728542914171657, "eval_loss": 1.0958523750305176, "eval_runtime": 29.9494, "eval_samples_per_second": 167.282, "eval_steps_per_second": 20.935, "step": 8000 }, { "epoch": 0.37, "learning_rate": 2.4988888888888893e-07, "loss": 1.1077, "step": 9000 }, { "epoch": 0.37, "eval_accuracy": 0.35528942115768464, "eval_loss": 1.1003851890563965, "eval_runtime": 29.9762, "eval_samples_per_second": 167.133, "eval_steps_per_second": 20.917, "step": 9000 }, { "epoch": 0.41, "learning_rate": 2.776666666666667e-07, "loss": 1.1091, "step": 10000 }, { "epoch": 0.41, "eval_accuracy": 0.3790419161676647, "eval_loss": 1.0951762199401855, "eval_runtime": 29.8593, "eval_samples_per_second": 167.787, "eval_steps_per_second": 20.998, "step": 10000 }, { "epoch": 0.45, "learning_rate": 3.0541666666666667e-07, "loss": 1.1047, "step": 11000 }, { "epoch": 0.45, "eval_accuracy": 0.3780439121756487, "eval_loss": 1.0928910970687866, "eval_runtime": 29.8768, "eval_samples_per_second": 167.689, "eval_steps_per_second": 20.986, "step": 11000 }, { "epoch": 0.49, "learning_rate": 3.3319444444444444e-07, "loss": 1.1053, "step": 12000 }, { "epoch": 0.49, "eval_accuracy": 0.38323353293413176, "eval_loss": 1.0910215377807617, "eval_runtime": 29.8703, "eval_samples_per_second": 167.725, "eval_steps_per_second": 20.991, "step": 12000 }, { "epoch": 0.53, "learning_rate": 3.6094444444444446e-07, "loss": 1.1035, "step": 13000 }, { "epoch": 0.53, "eval_accuracy": 0.34850299401197604, "eval_loss": 1.0930161476135254, "eval_runtime": 29.8865, "eval_samples_per_second": 167.634, "eval_steps_per_second": 20.979, "step": 13000 }, { "epoch": 0.57, "learning_rate": 3.8872222222222223e-07, "loss": 1.1002, "step": 14000 }, { "epoch": 0.57, "eval_accuracy": 0.3646706586826347, "eval_loss": 1.093497633934021, "eval_runtime": 30.0222, "eval_samples_per_second": 166.877, "eval_steps_per_second": 20.885, "step": 14000 }, { "epoch": 0.61, "learning_rate": 4.1647222222222225e-07, "loss": 1.1025, "step": 15000 }, { "epoch": 0.61, "eval_accuracy": 0.37924151696606784, "eval_loss": 1.0900779962539673, "eval_runtime": 30.0049, "eval_samples_per_second": 166.973, "eval_steps_per_second": 20.897, "step": 15000 }, { "epoch": 0.65, "learning_rate": 4.4425e-07, "loss": 1.0992, "step": 16000 }, { "epoch": 0.65, "eval_accuracy": 0.40119760479041916, "eval_loss": 1.0855979919433594, "eval_runtime": 30.1095, "eval_samples_per_second": 166.393, "eval_steps_per_second": 20.824, "step": 16000 }, { "epoch": 0.69, "learning_rate": 4.7200000000000004e-07, "loss": 1.0965, "step": 17000 }, { "epoch": 0.69, "eval_accuracy": 0.4171656686626746, "eval_loss": 1.0845381021499634, "eval_runtime": 30.0586, "eval_samples_per_second": 166.674, "eval_steps_per_second": 20.859, "step": 17000 }, { "epoch": 0.73, "learning_rate": 4.997777777777779e-07, "loss": 1.0962, "step": 18000 }, { "epoch": 0.73, "eval_accuracy": 0.4087824351297405, "eval_loss": 1.0860552787780762, "eval_runtime": 30.2343, "eval_samples_per_second": 165.706, "eval_steps_per_second": 20.738, "step": 18000 }, { "epoch": 0.77, "learning_rate": 5.275277777777778e-07, "loss": 1.0936, "step": 19000 }, { "epoch": 0.77, "eval_accuracy": 0.4039920159680639, "eval_loss": 1.0822259187698364, "eval_runtime": 30.216, "eval_samples_per_second": 165.806, "eval_steps_per_second": 20.751, "step": 19000 }, { "epoch": 0.81, "learning_rate": 5.553055555555556e-07, "loss": 1.0923, "step": 20000 }, { "epoch": 0.81, "eval_accuracy": 0.4095808383233533, "eval_loss": 1.0820951461791992, "eval_runtime": 29.968, "eval_samples_per_second": 167.178, "eval_steps_per_second": 20.922, "step": 20000 }, { "epoch": 0.86, "learning_rate": 5.830555555555556e-07, "loss": 1.0897, "step": 21000 }, { "epoch": 0.86, "eval_accuracy": 0.4149700598802395, "eval_loss": 1.0763800144195557, "eval_runtime": 30.021, "eval_samples_per_second": 166.883, "eval_steps_per_second": 20.885, "step": 21000 }, { "epoch": 0.9, "learning_rate": 6.108333333333333e-07, "loss": 1.0888, "step": 22000 }, { "epoch": 0.9, "eval_accuracy": 0.43193612774451096, "eval_loss": 1.0687414407730103, "eval_runtime": 30.1577, "eval_samples_per_second": 166.127, "eval_steps_per_second": 20.791, "step": 22000 }, { "epoch": 0.94, "learning_rate": 6.385833333333334e-07, "loss": 1.0823, "step": 23000 }, { "epoch": 0.94, "eval_accuracy": 0.4285429141716567, "eval_loss": 1.0642070770263672, "eval_runtime": 30.0701, "eval_samples_per_second": 166.611, "eval_steps_per_second": 20.851, "step": 23000 }, { "epoch": 0.98, "learning_rate": 6.663611111111112e-07, "loss": 1.0786, "step": 24000 }, { "epoch": 0.98, "eval_accuracy": 0.4363273453093812, "eval_loss": 1.0598841905593872, "eval_runtime": 30.0689, "eval_samples_per_second": 166.617, "eval_steps_per_second": 20.852, "step": 24000 }, { "epoch": 1.02, "learning_rate": 6.941111111111112e-07, "loss": 1.0708, "step": 25000 }, { "epoch": 1.02, "eval_accuracy": 0.4429141716566866, "eval_loss": 1.0561795234680176, "eval_runtime": 30.1301, "eval_samples_per_second": 166.279, "eval_steps_per_second": 20.81, "step": 25000 }, { "epoch": 1.06, "learning_rate": 7.218888888888889e-07, "loss": 1.072, "step": 26000 }, { "epoch": 1.06, "eval_accuracy": 0.4303393213572854, "eval_loss": 1.0574887990951538, "eval_runtime": 30.0473, "eval_samples_per_second": 166.737, "eval_steps_per_second": 20.867, "step": 26000 }, { "epoch": 1.1, "learning_rate": 7.496666666666667e-07, "loss": 1.0701, "step": 27000 }, { "epoch": 1.1, "eval_accuracy": 0.4421157684630739, "eval_loss": 1.055459976196289, "eval_runtime": 30.1049, "eval_samples_per_second": 166.418, "eval_steps_per_second": 20.827, "step": 27000 }, { "epoch": 1.14, "learning_rate": 7.774166666666668e-07, "loss": 1.0677, "step": 28000 }, { "epoch": 1.14, "eval_accuracy": 0.4285429141716567, "eval_loss": 1.0527067184448242, "eval_runtime": 30.1476, "eval_samples_per_second": 166.182, "eval_steps_per_second": 20.798, "step": 28000 }, { "epoch": 1.18, "learning_rate": 8.051944444444445e-07, "loss": 1.065, "step": 29000 }, { "epoch": 1.18, "eval_accuracy": 0.43852295409181635, "eval_loss": 1.0494154691696167, "eval_runtime": 30.036, "eval_samples_per_second": 166.8, "eval_steps_per_second": 20.875, "step": 29000 }, { "epoch": 1.22, "learning_rate": 8.329444444444445e-07, "loss": 1.0628, "step": 30000 }, { "epoch": 1.22, "eval_accuracy": 0.4377245508982036, "eval_loss": 1.0461777448654175, "eval_runtime": 30.1005, "eval_samples_per_second": 166.442, "eval_steps_per_second": 20.83, "step": 30000 }, { "epoch": 1.26, "learning_rate": 8.607222222222223e-07, "loss": 1.0672, "step": 31000 }, { "epoch": 1.26, "eval_accuracy": 0.4435129740518962, "eval_loss": 1.0531694889068604, "eval_runtime": 30.1733, "eval_samples_per_second": 166.041, "eval_steps_per_second": 20.78, "step": 31000 }, { "epoch": 1.3, "learning_rate": 8.884722222222224e-07, "loss": 1.0638, "step": 32000 }, { "epoch": 1.3, "eval_accuracy": 0.4297405189620758, "eval_loss": 1.0501775741577148, "eval_runtime": 30.058, "eval_samples_per_second": 166.678, "eval_steps_per_second": 20.86, "step": 32000 }, { "epoch": 1.34, "learning_rate": 9.162500000000001e-07, "loss": 1.0637, "step": 33000 }, { "epoch": 1.34, "eval_accuracy": 0.44491017964071855, "eval_loss": 1.0452704429626465, "eval_runtime": 30.3623, "eval_samples_per_second": 165.007, "eval_steps_per_second": 20.651, "step": 33000 }, { "epoch": 1.39, "learning_rate": 9.440000000000001e-07, "loss": 1.0558, "step": 34000 }, { "epoch": 1.39, "eval_accuracy": 0.44510978043912175, "eval_loss": 1.0434744358062744, "eval_runtime": 30.2239, "eval_samples_per_second": 165.763, "eval_steps_per_second": 20.745, "step": 34000 }, { "epoch": 1.43, "learning_rate": 9.71777777777778e-07, "loss": 1.0598, "step": 35000 }, { "epoch": 1.43, "eval_accuracy": 0.4473053892215569, "eval_loss": 1.039801001548767, "eval_runtime": 30.107, "eval_samples_per_second": 166.407, "eval_steps_per_second": 20.826, "step": 35000 }, { "epoch": 1.47, "learning_rate": 9.995277777777778e-07, "loss": 1.0601, "step": 36000 }, { "epoch": 1.47, "eval_accuracy": 0.4469061876247505, "eval_loss": 1.037074327468872, "eval_runtime": 30.1954, "eval_samples_per_second": 165.919, "eval_steps_per_second": 20.765, "step": 36000 }, { "epoch": 1.51, "learning_rate": 1.0273055555555556e-06, "loss": 1.0529, "step": 37000 }, { "epoch": 1.51, "eval_accuracy": 0.45229540918163674, "eval_loss": 1.0346434116363525, "eval_runtime": 30.2856, "eval_samples_per_second": 165.425, "eval_steps_per_second": 20.703, "step": 37000 }, { "epoch": 1.55, "learning_rate": 1.0550555555555557e-06, "loss": 1.0544, "step": 38000 }, { "epoch": 1.55, "eval_accuracy": 0.45129740518962075, "eval_loss": 1.0338674783706665, "eval_runtime": 30.1511, "eval_samples_per_second": 166.163, "eval_steps_per_second": 20.795, "step": 38000 }, { "epoch": 1.59, "learning_rate": 1.0828333333333334e-06, "loss": 1.0528, "step": 39000 }, { "epoch": 1.59, "eval_accuracy": 0.44550898203592815, "eval_loss": 1.0376836061477661, "eval_runtime": 30.304, "eval_samples_per_second": 165.324, "eval_steps_per_second": 20.69, "step": 39000 }, { "epoch": 1.63, "learning_rate": 1.1106111111111112e-06, "loss": 1.0482, "step": 40000 }, { "epoch": 1.63, "eval_accuracy": 0.4540918163672655, "eval_loss": 1.031366229057312, "eval_runtime": 30.2062, "eval_samples_per_second": 165.86, "eval_steps_per_second": 20.757, "step": 40000 }, { "epoch": 1.67, "learning_rate": 1.1383611111111113e-06, "loss": 1.0494, "step": 41000 }, { "epoch": 1.67, "eval_accuracy": 0.4552894211576846, "eval_loss": 1.032992959022522, "eval_runtime": 30.1667, "eval_samples_per_second": 166.077, "eval_steps_per_second": 20.784, "step": 41000 }, { "epoch": 1.71, "learning_rate": 1.166138888888889e-06, "loss": 1.0474, "step": 42000 }, { "epoch": 1.71, "eval_accuracy": 0.45129740518962075, "eval_loss": 1.0255250930786133, "eval_runtime": 30.2122, "eval_samples_per_second": 165.827, "eval_steps_per_second": 20.753, "step": 42000 }, { "epoch": 1.75, "learning_rate": 1.193888888888889e-06, "loss": 1.0448, "step": 43000 }, { "epoch": 1.75, "eval_accuracy": 0.4483033932135729, "eval_loss": 1.0412585735321045, "eval_runtime": 30.2083, "eval_samples_per_second": 165.849, "eval_steps_per_second": 20.756, "step": 43000 }, { "epoch": 1.79, "learning_rate": 1.2216666666666667e-06, "loss": 1.0458, "step": 44000 }, { "epoch": 1.79, "eval_accuracy": 0.46427145708582834, "eval_loss": 1.0211690664291382, "eval_runtime": 30.4051, "eval_samples_per_second": 164.775, "eval_steps_per_second": 20.622, "step": 44000 }, { "epoch": 1.83, "learning_rate": 1.2494166666666668e-06, "loss": 1.0468, "step": 45000 }, { "epoch": 1.83, "eval_accuracy": 0.47105788423153694, "eval_loss": 1.0185887813568115, "eval_runtime": 30.1465, "eval_samples_per_second": 166.188, "eval_steps_per_second": 20.798, "step": 45000 }, { "epoch": 1.87, "learning_rate": 1.2771944444444445e-06, "loss": 1.0414, "step": 46000 }, { "epoch": 1.87, "eval_accuracy": 0.46966067864271455, "eval_loss": 1.0215678215026855, "eval_runtime": 30.1615, "eval_samples_per_second": 166.106, "eval_steps_per_second": 20.788, "step": 46000 }, { "epoch": 1.91, "learning_rate": 1.3049444444444446e-06, "loss": 1.0374, "step": 47000 }, { "epoch": 1.91, "eval_accuracy": 0.47684630738522954, "eval_loss": 1.0100239515304565, "eval_runtime": 30.0919, "eval_samples_per_second": 166.49, "eval_steps_per_second": 20.836, "step": 47000 }, { "epoch": 1.96, "learning_rate": 1.3327222222222224e-06, "loss": 1.0423, "step": 48000 }, { "epoch": 1.96, "eval_accuracy": 0.462874251497006, "eval_loss": 1.0277544260025024, "eval_runtime": 30.0926, "eval_samples_per_second": 166.486, "eval_steps_per_second": 20.836, "step": 48000 }, { "epoch": 2.0, "learning_rate": 1.3604722222222224e-06, "loss": 1.0332, "step": 49000 }, { "epoch": 2.0, "eval_accuracy": 0.4870259481037924, "eval_loss": 1.0046255588531494, "eval_runtime": 30.0835, "eval_samples_per_second": 166.536, "eval_steps_per_second": 20.842, "step": 49000 }, { "epoch": 2.04, "learning_rate": 1.3882500000000002e-06, "loss": 1.0337, "step": 50000 }, { "epoch": 2.04, "eval_accuracy": 0.48542914171656687, "eval_loss": 1.00551438331604, "eval_runtime": 30.1523, "eval_samples_per_second": 166.157, "eval_steps_per_second": 20.794, "step": 50000 }, { "epoch": 2.08, "learning_rate": 1.416e-06, "loss": 1.0335, "step": 51000 }, { "epoch": 2.08, "eval_accuracy": 0.4782435129740519, "eval_loss": 1.0022392272949219, "eval_runtime": 30.0244, "eval_samples_per_second": 166.864, "eval_steps_per_second": 20.883, "step": 51000 }, { "epoch": 2.12, "learning_rate": 1.4437777777777779e-06, "loss": 1.0272, "step": 52000 }, { "epoch": 2.12, "eval_accuracy": 0.48003992015968067, "eval_loss": 1.014728307723999, "eval_runtime": 30.0497, "eval_samples_per_second": 166.724, "eval_steps_per_second": 20.865, "step": 52000 }, { "epoch": 2.16, "learning_rate": 1.471527777777778e-06, "loss": 1.0315, "step": 53000 }, { "epoch": 2.16, "eval_accuracy": 0.4848303393213573, "eval_loss": 0.9981946349143982, "eval_runtime": 30.2291, "eval_samples_per_second": 165.734, "eval_steps_per_second": 20.742, "step": 53000 }, { "epoch": 2.2, "learning_rate": 1.4993055555555557e-06, "loss": 1.0291, "step": 54000 }, { "epoch": 2.2, "eval_accuracy": 0.49021956087824353, "eval_loss": 0.9988633990287781, "eval_runtime": 30.0961, "eval_samples_per_second": 166.467, "eval_steps_per_second": 20.833, "step": 54000 }, { "epoch": 2.24, "learning_rate": 1.5270555555555558e-06, "loss": 1.027, "step": 55000 }, { "epoch": 2.24, "eval_accuracy": 0.48602794411177647, "eval_loss": 0.99713534116745, "eval_runtime": 30.1599, "eval_samples_per_second": 166.115, "eval_steps_per_second": 20.789, "step": 55000 }, { "epoch": 2.28, "learning_rate": 1.5548333333333335e-06, "loss": 1.0215, "step": 56000 }, { "epoch": 2.28, "eval_accuracy": 0.46487025948103794, "eval_loss": 1.0212756395339966, "eval_runtime": 30.2641, "eval_samples_per_second": 165.543, "eval_steps_per_second": 20.718, "step": 56000 }, { "epoch": 2.32, "learning_rate": 1.5825833333333334e-06, "loss": 1.0203, "step": 57000 }, { "epoch": 2.32, "eval_accuracy": 0.4874251497005988, "eval_loss": 0.9981949925422668, "eval_runtime": 30.1111, "eval_samples_per_second": 166.384, "eval_steps_per_second": 20.823, "step": 57000 }, { "epoch": 2.36, "learning_rate": 1.6103611111111112e-06, "loss": 1.0238, "step": 58000 }, { "epoch": 2.36, "eval_accuracy": 0.48562874251497007, "eval_loss": 1.0043253898620605, "eval_runtime": 30.4057, "eval_samples_per_second": 164.771, "eval_steps_per_second": 20.621, "step": 58000 }, { "epoch": 2.4, "learning_rate": 1.638111111111111e-06, "loss": 1.0211, "step": 59000 }, { "epoch": 2.4, "eval_accuracy": 0.49820359281437127, "eval_loss": 0.9920447468757629, "eval_runtime": 30.4581, "eval_samples_per_second": 164.488, "eval_steps_per_second": 20.586, "step": 59000 }, { "epoch": 2.44, "learning_rate": 1.665888888888889e-06, "loss": 1.0236, "step": 60000 }, { "epoch": 2.44, "eval_accuracy": 0.49241516966067866, "eval_loss": 0.9953387975692749, "eval_runtime": 30.3298, "eval_samples_per_second": 165.184, "eval_steps_per_second": 20.673, "step": 60000 }, { "epoch": 2.49, "learning_rate": 1.693638888888889e-06, "loss": 1.0207, "step": 61000 }, { "epoch": 2.49, "eval_accuracy": 0.49760479041916167, "eval_loss": 0.9943307042121887, "eval_runtime": 30.3886, "eval_samples_per_second": 164.865, "eval_steps_per_second": 20.633, "step": 61000 }, { "epoch": 2.53, "learning_rate": 1.7214166666666666e-06, "loss": 1.0222, "step": 62000 }, { "epoch": 2.53, "eval_accuracy": 0.4880239520958084, "eval_loss": 1.005075454711914, "eval_runtime": 30.41, "eval_samples_per_second": 164.748, "eval_steps_per_second": 20.618, "step": 62000 }, { "epoch": 2.57, "learning_rate": 1.7491666666666667e-06, "loss": 1.0221, "step": 63000 }, { "epoch": 2.57, "eval_accuracy": 0.5011976047904192, "eval_loss": 0.9902356266975403, "eval_runtime": 30.3384, "eval_samples_per_second": 165.137, "eval_steps_per_second": 20.667, "step": 63000 }, { "epoch": 2.61, "learning_rate": 1.7769444444444447e-06, "loss": 1.0188, "step": 64000 }, { "epoch": 2.61, "eval_accuracy": 0.5003992015968064, "eval_loss": 0.9907160997390747, "eval_runtime": 30.3592, "eval_samples_per_second": 165.024, "eval_steps_per_second": 20.653, "step": 64000 }, { "epoch": 2.65, "learning_rate": 1.8046944444444446e-06, "loss": 1.0187, "step": 65000 }, { "epoch": 2.65, "eval_accuracy": 0.49820359281437127, "eval_loss": 0.9914441704750061, "eval_runtime": 30.3614, "eval_samples_per_second": 165.012, "eval_steps_per_second": 20.651, "step": 65000 }, { "epoch": 2.69, "learning_rate": 1.8324722222222223e-06, "loss": 1.023, "step": 66000 }, { "epoch": 2.69, "eval_accuracy": 0.5007984031936128, "eval_loss": 0.9877445101737976, "eval_runtime": 30.341, "eval_samples_per_second": 165.123, "eval_steps_per_second": 20.665, "step": 66000 }, { "epoch": 2.73, "learning_rate": 1.8602222222222222e-06, "loss": 1.0195, "step": 67000 }, { "epoch": 2.73, "eval_accuracy": 0.49660678642714573, "eval_loss": 0.9916561245918274, "eval_runtime": 30.3344, "eval_samples_per_second": 165.159, "eval_steps_per_second": 20.67, "step": 67000 }, { "epoch": 2.77, "learning_rate": 1.8880000000000002e-06, "loss": 1.0181, "step": 68000 }, { "epoch": 2.77, "eval_accuracy": 0.4992015968063872, "eval_loss": 0.985427975654602, "eval_runtime": 30.3261, "eval_samples_per_second": 165.204, "eval_steps_per_second": 20.675, "step": 68000 }, { "epoch": 2.81, "learning_rate": 1.9157500000000003e-06, "loss": 1.0183, "step": 69000 }, { "epoch": 2.81, "eval_accuracy": 0.5033932135728543, "eval_loss": 0.9862416982650757, "eval_runtime": 30.3869, "eval_samples_per_second": 164.874, "eval_steps_per_second": 20.634, "step": 69000 }, { "epoch": 2.85, "learning_rate": 1.943527777777778e-06, "loss": 1.0176, "step": 70000 }, { "epoch": 2.85, "eval_accuracy": 0.501996007984032, "eval_loss": 0.9887968897819519, "eval_runtime": 30.3292, "eval_samples_per_second": 165.187, "eval_steps_per_second": 20.673, "step": 70000 }, { "epoch": 2.89, "learning_rate": 1.9712777777777777e-06, "loss": 1.0108, "step": 71000 }, { "epoch": 2.89, "eval_accuracy": 0.49520958083832334, "eval_loss": 0.992777407169342, "eval_runtime": 30.4, "eval_samples_per_second": 164.803, "eval_steps_per_second": 20.625, "step": 71000 }, { "epoch": 2.93, "learning_rate": 1.9990555555555557e-06, "loss": 1.0172, "step": 72000 }, { "epoch": 2.93, "eval_accuracy": 0.49141716566866267, "eval_loss": 0.9982264041900635, "eval_runtime": 30.36, "eval_samples_per_second": 165.02, "eval_steps_per_second": 20.652, "step": 72000 }, { "epoch": 2.97, "learning_rate": 2.026805555555556e-06, "loss": 1.0151, "step": 73000 }, { "epoch": 2.97, "eval_accuracy": 0.49720558882235527, "eval_loss": 0.9918563365936279, "eval_runtime": 30.3051, "eval_samples_per_second": 165.318, "eval_steps_per_second": 20.69, "step": 73000 }, { "epoch": 3.01, "learning_rate": 2.0545833333333335e-06, "loss": 1.0129, "step": 74000 }, { "epoch": 3.01, "eval_accuracy": 0.49840319361277446, "eval_loss": 0.9832558631896973, "eval_runtime": 30.3597, "eval_samples_per_second": 165.022, "eval_steps_per_second": 20.652, "step": 74000 }, { "epoch": 3.06, "learning_rate": 2.0823611111111115e-06, "loss": 1.006, "step": 75000 }, { "epoch": 3.06, "eval_accuracy": 0.5033932135728543, "eval_loss": 0.9880185127258301, "eval_runtime": 30.2854, "eval_samples_per_second": 165.426, "eval_steps_per_second": 20.703, "step": 75000 }, { "epoch": 3.1, "learning_rate": 2.1101111111111113e-06, "loss": 1.0069, "step": 76000 }, { "epoch": 3.1, "eval_accuracy": 0.5045908183632735, "eval_loss": 0.9803372025489807, "eval_runtime": 30.3285, "eval_samples_per_second": 165.191, "eval_steps_per_second": 20.674, "step": 76000 }, { "epoch": 3.14, "learning_rate": 2.137888888888889e-06, "loss": 1.0106, "step": 77000 }, { "epoch": 3.14, "eval_accuracy": 0.47724550898203594, "eval_loss": 1.0122599601745605, "eval_runtime": 30.3162, "eval_samples_per_second": 165.258, "eval_steps_per_second": 20.682, "step": 77000 }, { "epoch": 3.18, "learning_rate": 2.1656388888888888e-06, "loss": 1.0019, "step": 78000 }, { "epoch": 3.18, "eval_accuracy": 0.5045908183632735, "eval_loss": 0.9800828099250793, "eval_runtime": 30.2724, "eval_samples_per_second": 165.497, "eval_steps_per_second": 20.712, "step": 78000 }, { "epoch": 3.22, "learning_rate": 2.1934166666666667e-06, "loss": 1.0093, "step": 79000 }, { "epoch": 3.22, "eval_accuracy": 0.493812375249501, "eval_loss": 0.9990217089653015, "eval_runtime": 30.2198, "eval_samples_per_second": 165.785, "eval_steps_per_second": 20.748, "step": 79000 }, { "epoch": 3.26, "learning_rate": 2.221166666666667e-06, "loss": 1.0039, "step": 80000 }, { "epoch": 3.26, "eval_accuracy": 0.5067864271457085, "eval_loss": 0.98616623878479, "eval_runtime": 30.3059, "eval_samples_per_second": 165.315, "eval_steps_per_second": 20.689, "step": 80000 }, { "epoch": 3.3, "learning_rate": 2.2489444444444446e-06, "loss": 1.0062, "step": 81000 }, { "epoch": 3.3, "eval_accuracy": 0.5013972055888224, "eval_loss": 0.9929732084274292, "eval_runtime": 30.2754, "eval_samples_per_second": 165.481, "eval_steps_per_second": 20.71, "step": 81000 }, { "epoch": 3.34, "learning_rate": 2.2766944444444444e-06, "loss": 1.0145, "step": 82000 }, { "epoch": 3.34, "eval_accuracy": 0.5051896207584831, "eval_loss": 0.9891365766525269, "eval_runtime": 30.204, "eval_samples_per_second": 165.872, "eval_steps_per_second": 20.759, "step": 82000 }, { "epoch": 3.38, "learning_rate": 2.3044722222222224e-06, "loss": 1.0012, "step": 83000 }, { "epoch": 3.38, "eval_accuracy": 0.5067864271457085, "eval_loss": 0.984894871711731, "eval_runtime": 30.2196, "eval_samples_per_second": 165.787, "eval_steps_per_second": 20.748, "step": 83000 }, { "epoch": 3.42, "learning_rate": 2.3322222222222223e-06, "loss": 1.0072, "step": 84000 }, { "epoch": 3.42, "eval_accuracy": 0.5075848303393213, "eval_loss": 0.979631781578064, "eval_runtime": 30.1636, "eval_samples_per_second": 166.094, "eval_steps_per_second": 20.787, "step": 84000 }, { "epoch": 3.46, "learning_rate": 2.36e-06, "loss": 1.0038, "step": 85000 }, { "epoch": 3.46, "eval_accuracy": 0.5035928143712575, "eval_loss": 0.9814818501472473, "eval_runtime": 30.2922, "eval_samples_per_second": 165.389, "eval_steps_per_second": 20.698, "step": 85000 }, { "epoch": 3.5, "learning_rate": 2.38775e-06, "loss": 1.0107, "step": 86000 }, { "epoch": 3.5, "eval_accuracy": 0.5117764471057884, "eval_loss": 0.9743403792381287, "eval_runtime": 30.2926, "eval_samples_per_second": 165.387, "eval_steps_per_second": 20.698, "step": 86000 }, { "epoch": 3.54, "learning_rate": 2.415527777777778e-06, "loss": 1.0011, "step": 87000 }, { "epoch": 3.54, "eval_accuracy": 0.5073852295409181, "eval_loss": 0.9760627150535583, "eval_runtime": 30.3925, "eval_samples_per_second": 164.843, "eval_steps_per_second": 20.63, "step": 87000 }, { "epoch": 3.59, "learning_rate": 2.443277777777778e-06, "loss": 1.0045, "step": 88000 }, { "epoch": 3.59, "eval_accuracy": 0.5101796407185629, "eval_loss": 0.9735654592514038, "eval_runtime": 30.2088, "eval_samples_per_second": 165.846, "eval_steps_per_second": 20.756, "step": 88000 }, { "epoch": 3.63, "learning_rate": 2.4710555555555555e-06, "loss": 0.999, "step": 89000 }, { "epoch": 3.63, "eval_accuracy": 0.4996007984031936, "eval_loss": 0.9788551926612854, "eval_runtime": 30.2584, "eval_samples_per_second": 165.574, "eval_steps_per_second": 20.721, "step": 89000 }, { "epoch": 3.67, "learning_rate": 2.498805555555556e-06, "loss": 1.0075, "step": 90000 }, { "epoch": 3.67, "eval_accuracy": 0.5095808383233533, "eval_loss": 0.972398579120636, "eval_runtime": 30.1697, "eval_samples_per_second": 166.061, "eval_steps_per_second": 20.782, "step": 90000 }, { "epoch": 3.71, "learning_rate": 2.5265833333333334e-06, "loss": 1.0085, "step": 91000 }, { "epoch": 3.71, "eval_accuracy": 0.5073852295409181, "eval_loss": 0.9791931509971619, "eval_runtime": 30.1687, "eval_samples_per_second": 166.066, "eval_steps_per_second": 20.783, "step": 91000 }, { "epoch": 3.75, "learning_rate": 2.5543333333333337e-06, "loss": 1.0022, "step": 92000 }, { "epoch": 3.75, "eval_accuracy": 0.5105788423153692, "eval_loss": 0.9835652112960815, "eval_runtime": 30.389, "eval_samples_per_second": 164.862, "eval_steps_per_second": 20.632, "step": 92000 }, { "epoch": 3.79, "learning_rate": 2.5821111111111112e-06, "loss": 1.006, "step": 93000 }, { "epoch": 3.79, "eval_accuracy": 0.5127744510978044, "eval_loss": 0.9716958403587341, "eval_runtime": 30.2822, "eval_samples_per_second": 165.444, "eval_steps_per_second": 20.705, "step": 93000 }, { "epoch": 3.83, "learning_rate": 2.609861111111111e-06, "loss": 1.0001, "step": 94000 }, { "epoch": 3.83, "eval_accuracy": 0.49880239520958086, "eval_loss": 0.982902467250824, "eval_runtime": 30.3141, "eval_samples_per_second": 165.27, "eval_steps_per_second": 20.683, "step": 94000 }, { "epoch": 3.87, "learning_rate": 2.637638888888889e-06, "loss": 1.0018, "step": 95000 }, { "epoch": 3.87, "eval_accuracy": 0.5117764471057884, "eval_loss": 0.9711886644363403, "eval_runtime": 30.3419, "eval_samples_per_second": 165.118, "eval_steps_per_second": 20.665, "step": 95000 }, { "epoch": 3.91, "learning_rate": 2.6653888888888894e-06, "loss": 1.001, "step": 96000 }, { "epoch": 3.91, "eval_accuracy": 0.5085828343313373, "eval_loss": 0.9692357182502747, "eval_runtime": 30.2793, "eval_samples_per_second": 165.46, "eval_steps_per_second": 20.707, "step": 96000 }, { "epoch": 3.95, "learning_rate": 2.693166666666667e-06, "loss": 0.9967, "step": 97000 }, { "epoch": 3.95, "eval_accuracy": 0.5035928143712575, "eval_loss": 0.9811190366744995, "eval_runtime": 30.3522, "eval_samples_per_second": 165.062, "eval_steps_per_second": 20.657, "step": 97000 }, { "epoch": 3.99, "learning_rate": 2.7209166666666668e-06, "loss": 0.9963, "step": 98000 }, { "epoch": 3.99, "eval_accuracy": 0.5117764471057884, "eval_loss": 0.9744167923927307, "eval_runtime": 30.2586, "eval_samples_per_second": 165.573, "eval_steps_per_second": 20.721, "step": 98000 }, { "epoch": 4.03, "learning_rate": 2.7486944444444448e-06, "loss": 0.9923, "step": 99000 }, { "epoch": 4.03, "eval_accuracy": 0.5155688622754491, "eval_loss": 0.9705401659011841, "eval_runtime": 30.3652, "eval_samples_per_second": 164.992, "eval_steps_per_second": 20.649, "step": 99000 }, { "epoch": 4.07, "learning_rate": 2.7764444444444446e-06, "loss": 0.9925, "step": 100000 }, { "epoch": 4.07, "eval_accuracy": 0.5161676646706587, "eval_loss": 0.967511773109436, "eval_runtime": 30.3989, "eval_samples_per_second": 164.809, "eval_steps_per_second": 20.626, "step": 100000 }, { "epoch": 4.12, "learning_rate": 2.804222222222222e-06, "loss": 0.9965, "step": 101000 }, { "epoch": 4.12, "eval_accuracy": 0.5179640718562875, "eval_loss": 0.9703009724617004, "eval_runtime": 30.306, "eval_samples_per_second": 165.314, "eval_steps_per_second": 20.689, "step": 101000 }, { "epoch": 4.16, "learning_rate": 2.8319722222222225e-06, "loss": 0.994, "step": 102000 }, { "epoch": 4.16, "eval_accuracy": 0.5027944111776447, "eval_loss": 0.9799131155014038, "eval_runtime": 30.4057, "eval_samples_per_second": 164.772, "eval_steps_per_second": 20.621, "step": 102000 }, { "epoch": 4.2, "learning_rate": 2.8597500000000004e-06, "loss": 0.9989, "step": 103000 }, { "epoch": 4.2, "eval_accuracy": 0.5175648702594811, "eval_loss": 0.965115487575531, "eval_runtime": 30.5041, "eval_samples_per_second": 164.24, "eval_steps_per_second": 20.555, "step": 103000 }, { "epoch": 4.24, "learning_rate": 2.8875000000000003e-06, "loss": 0.9909, "step": 104000 }, { "epoch": 4.24, "eval_accuracy": 0.5005988023952096, "eval_loss": 0.9864978790283203, "eval_runtime": 30.393, "eval_samples_per_second": 164.84, "eval_steps_per_second": 20.63, "step": 104000 }, { "epoch": 4.28, "learning_rate": 2.915277777777778e-06, "loss": 0.9873, "step": 105000 }, { "epoch": 4.28, "eval_accuracy": 0.5017964071856288, "eval_loss": 0.9913762211799622, "eval_runtime": 30.3351, "eval_samples_per_second": 165.155, "eval_steps_per_second": 20.669, "step": 105000 }, { "epoch": 4.32, "learning_rate": 2.9430277777777777e-06, "loss": 0.9826, "step": 106000 }, { "epoch": 4.32, "eval_accuracy": 0.5201596806387225, "eval_loss": 0.9676294922828674, "eval_runtime": 30.2604, "eval_samples_per_second": 165.563, "eval_steps_per_second": 20.72, "step": 106000 }, { "epoch": 4.36, "learning_rate": 2.9708055555555557e-06, "loss": 0.9832, "step": 107000 }, { "epoch": 4.36, "eval_accuracy": 0.5173652694610779, "eval_loss": 0.9818444848060608, "eval_runtime": 30.3828, "eval_samples_per_second": 164.896, "eval_steps_per_second": 20.637, "step": 107000 }, { "epoch": 4.4, "learning_rate": 2.998555555555556e-06, "loss": 0.984, "step": 108000 }, { "epoch": 4.4, "eval_accuracy": 0.511377245508982, "eval_loss": 0.9720831513404846, "eval_runtime": 30.2651, "eval_samples_per_second": 165.537, "eval_steps_per_second": 20.717, "step": 108000 }, { "epoch": 4.44, "learning_rate": 3.0263333333333336e-06, "loss": 0.982, "step": 109000 }, { "epoch": 4.44, "eval_accuracy": 0.5209580838323353, "eval_loss": 0.9712573885917664, "eval_runtime": 30.204, "eval_samples_per_second": 165.872, "eval_steps_per_second": 20.759, "step": 109000 }, { "epoch": 4.48, "learning_rate": 3.0540833333333334e-06, "loss": 0.9876, "step": 110000 }, { "epoch": 4.48, "eval_accuracy": 0.5265469061876248, "eval_loss": 0.9630448222160339, "eval_runtime": 30.2241, "eval_samples_per_second": 165.762, "eval_steps_per_second": 20.745, "step": 110000 }, { "epoch": 4.52, "learning_rate": 3.0818611111111114e-06, "loss": 0.9826, "step": 111000 }, { "epoch": 4.52, "eval_accuracy": 0.524750499001996, "eval_loss": 0.9607908129692078, "eval_runtime": 30.5687, "eval_samples_per_second": 163.893, "eval_steps_per_second": 20.511, "step": 111000 }, { "epoch": 4.56, "learning_rate": 3.1096111111111113e-06, "loss": 0.9922, "step": 112000 }, { "epoch": 4.56, "eval_accuracy": 0.5295409181636727, "eval_loss": 0.9560614824295044, "eval_runtime": 30.2441, "eval_samples_per_second": 165.652, "eval_steps_per_second": 20.731, "step": 112000 }, { "epoch": 4.6, "learning_rate": 3.137388888888889e-06, "loss": 0.9784, "step": 113000 }, { "epoch": 4.6, "eval_accuracy": 0.5245508982035928, "eval_loss": 0.9655715823173523, "eval_runtime": 30.341, "eval_samples_per_second": 165.123, "eval_steps_per_second": 20.665, "step": 113000 }, { "epoch": 4.64, "learning_rate": 3.165138888888889e-06, "loss": 0.9819, "step": 114000 }, { "epoch": 4.64, "eval_accuracy": 0.5255489021956088, "eval_loss": 0.9573906660079956, "eval_runtime": 30.3563, "eval_samples_per_second": 165.04, "eval_steps_per_second": 20.655, "step": 114000 } ], "logging_steps": 1000, "max_steps": 10000000, "num_train_epochs": 408, "save_steps": 1000, "total_flos": 1.1915294734496563e+17, "trial_name": null, "trial_params": null }