{ "best_metric": null, "best_model_checkpoint": null, "epoch": 37.51465416178195, "global_step": 32000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.12, "learning_rate": 5e-06, "loss": 0.8323, "step": 100 }, { "epoch": 0.23, "learning_rate": 5e-06, "loss": 0.793, "step": 200 }, { "epoch": 0.35, "learning_rate": 5e-06, "loss": 0.7508, "step": 300 }, { "epoch": 0.47, "learning_rate": 5e-06, "loss": 0.7777, "step": 400 }, { "epoch": 0.59, "learning_rate": 5e-06, "loss": 0.7741, "step": 500 }, { "epoch": 0.7, "learning_rate": 5e-06, "loss": 0.7435, "step": 600 }, { "epoch": 0.82, "learning_rate": 5e-06, "loss": 0.7681, "step": 700 }, { "epoch": 0.94, "learning_rate": 5e-06, "loss": 0.7505, "step": 800 }, { "epoch": 1.06, "learning_rate": 5e-06, "loss": 0.7285, "step": 900 }, { "epoch": 1.17, "learning_rate": 5e-06, "loss": 0.7243, "step": 1000 }, { "epoch": 1.29, "learning_rate": 5e-06, "loss": 0.7562, "step": 1100 }, { "epoch": 1.41, "learning_rate": 5e-06, "loss": 0.743, "step": 1200 }, { "epoch": 1.52, "learning_rate": 5e-06, "loss": 0.7086, "step": 1300 }, { "epoch": 1.64, "learning_rate": 5e-06, "loss": 0.7177, "step": 1400 }, { "epoch": 1.76, "learning_rate": 5e-06, "loss": 0.727, "step": 1500 }, { "epoch": 1.76, "eval_loss": 0.6887246966362, "eval_runtime": 70.0491, "eval_samples_per_second": 3.041, "eval_steps_per_second": 3.041, "step": 1500 }, { "epoch": 1.88, "learning_rate": 5e-06, "loss": 0.7781, "step": 1600 }, { "epoch": 1.99, "learning_rate": 5e-06, "loss": 0.7509, "step": 1700 }, { "epoch": 2.11, "learning_rate": 5e-06, "loss": 0.7057, "step": 1800 }, { "epoch": 2.23, "learning_rate": 5e-06, "loss": 0.7292, "step": 1900 }, { "epoch": 2.34, "learning_rate": 5e-06, "loss": 0.7037, "step": 2000 }, { "epoch": 2.46, "learning_rate": 5e-06, "loss": 0.7005, "step": 2100 }, { "epoch": 2.58, "learning_rate": 5e-06, "loss": 0.7501, "step": 2200 }, { "epoch": 2.7, "learning_rate": 5e-06, "loss": 0.7162, "step": 2300 }, { "epoch": 2.81, "learning_rate": 5e-06, "loss": 0.7428, "step": 2400 }, { "epoch": 2.93, "learning_rate": 5e-06, "loss": 0.7403, "step": 2500 }, { "epoch": 3.05, "learning_rate": 5e-06, "loss": 0.7286, "step": 2600 }, { "epoch": 3.17, "learning_rate": 5e-06, "loss": 0.72, "step": 2700 }, { "epoch": 3.28, "learning_rate": 5e-06, "loss": 0.6998, "step": 2800 }, { "epoch": 3.4, "learning_rate": 5e-06, "loss": 0.7515, "step": 2900 }, { "epoch": 3.52, "learning_rate": 5e-06, "loss": 0.7263, "step": 3000 }, { "epoch": 3.52, "eval_loss": 0.6762834191322327, "eval_runtime": 70.0592, "eval_samples_per_second": 3.04, "eval_steps_per_second": 3.04, "step": 3000 }, { "epoch": 3.63, "learning_rate": 5e-06, "loss": 0.6931, "step": 3100 }, { "epoch": 3.75, "learning_rate": 5e-06, "loss": 0.6907, "step": 3200 }, { "epoch": 3.87, "learning_rate": 5e-06, "loss": 0.6818, "step": 3300 }, { "epoch": 3.99, "learning_rate": 5e-06, "loss": 0.7417, "step": 3400 }, { "epoch": 4.1, "learning_rate": 5e-06, "loss": 0.6987, "step": 3500 }, { "epoch": 4.22, "learning_rate": 5e-06, "loss": 0.6854, "step": 3600 }, { "epoch": 4.34, "learning_rate": 5e-06, "loss": 0.7236, "step": 3700 }, { "epoch": 4.45, "learning_rate": 5e-06, "loss": 0.7045, "step": 3800 }, { "epoch": 4.57, "learning_rate": 5e-06, "loss": 0.699, "step": 3900 }, { "epoch": 4.69, "learning_rate": 5e-06, "loss": 0.7179, "step": 4000 }, { "epoch": 4.81, "learning_rate": 5e-06, "loss": 0.7066, "step": 4100 }, { "epoch": 4.92, "learning_rate": 5e-06, "loss": 0.713, "step": 4200 }, { "epoch": 5.04, "learning_rate": 5e-06, "loss": 0.6749, "step": 4300 }, { "epoch": 5.16, "learning_rate": 5e-06, "loss": 0.6882, "step": 4400 }, { "epoch": 5.28, "learning_rate": 5e-06, "loss": 0.6809, "step": 4500 }, { "epoch": 5.28, "eval_loss": 0.6606600284576416, "eval_runtime": 69.4571, "eval_samples_per_second": 3.067, "eval_steps_per_second": 3.067, "step": 4500 }, { "epoch": 5.39, "learning_rate": 5e-06, "loss": 0.7433, "step": 4600 }, { "epoch": 5.51, "learning_rate": 5e-06, "loss": 0.6881, "step": 4700 }, { "epoch": 5.63, "learning_rate": 5e-06, "loss": 0.7343, "step": 4800 }, { "epoch": 5.74, "learning_rate": 5e-06, "loss": 0.6675, "step": 4900 }, { "epoch": 5.86, "learning_rate": 5e-06, "loss": 0.7187, "step": 5000 }, { "epoch": 5.98, "learning_rate": 5e-06, "loss": 0.6837, "step": 5100 }, { "epoch": 6.1, "learning_rate": 5e-06, "loss": 0.6825, "step": 5200 }, { "epoch": 6.21, "learning_rate": 5e-06, "loss": 0.6976, "step": 5300 }, { "epoch": 6.33, "learning_rate": 5e-06, "loss": 0.7055, "step": 5400 }, { "epoch": 6.45, "learning_rate": 5e-06, "loss": 0.6584, "step": 5500 }, { "epoch": 6.57, "learning_rate": 5e-06, "loss": 0.6819, "step": 5600 }, { "epoch": 6.68, "learning_rate": 5e-06, "loss": 0.6652, "step": 5700 }, { "epoch": 6.8, "learning_rate": 5e-06, "loss": 0.6728, "step": 5800 }, { "epoch": 6.92, "learning_rate": 5e-06, "loss": 0.6916, "step": 5900 }, { "epoch": 7.03, "learning_rate": 5e-06, "loss": 0.657, "step": 6000 }, { "epoch": 7.03, "eval_loss": 0.6467106938362122, "eval_runtime": 69.8327, "eval_samples_per_second": 3.05, "eval_steps_per_second": 3.05, "step": 6000 }, { "epoch": 7.15, "learning_rate": 5e-06, "loss": 0.7331, "step": 6100 }, { "epoch": 7.27, "learning_rate": 5e-06, "loss": 0.6598, "step": 6200 }, { "epoch": 7.39, "learning_rate": 5e-06, "loss": 0.6602, "step": 6300 }, { "epoch": 7.5, "learning_rate": 5e-06, "loss": 0.6819, "step": 6400 }, { "epoch": 7.62, "learning_rate": 5e-06, "loss": 0.6764, "step": 6500 }, { "epoch": 7.74, "learning_rate": 5e-06, "loss": 0.6674, "step": 6600 }, { "epoch": 7.85, "learning_rate": 5e-06, "loss": 0.6848, "step": 6700 }, { "epoch": 7.97, "learning_rate": 5e-06, "loss": 0.6446, "step": 6800 }, { "epoch": 8.09, "learning_rate": 5e-06, "loss": 0.6601, "step": 6900 }, { "epoch": 8.21, "learning_rate": 5e-06, "loss": 0.6649, "step": 7000 }, { "epoch": 8.32, "learning_rate": 5e-06, "loss": 0.657, "step": 7100 }, { "epoch": 8.44, "learning_rate": 5e-06, "loss": 0.6326, "step": 7200 }, { "epoch": 8.56, "learning_rate": 5e-06, "loss": 0.6541, "step": 7300 }, { "epoch": 8.68, "learning_rate": 5e-06, "loss": 0.6579, "step": 7400 }, { "epoch": 8.79, "learning_rate": 5e-06, "loss": 0.6784, "step": 7500 }, { "epoch": 8.79, "eval_loss": 0.6341073513031006, "eval_runtime": 70.1746, "eval_samples_per_second": 3.035, "eval_steps_per_second": 3.035, "step": 7500 }, { "epoch": 8.91, "learning_rate": 5e-06, "loss": 0.6889, "step": 7600 }, { "epoch": 9.03, "learning_rate": 5e-06, "loss": 0.6752, "step": 7700 }, { "epoch": 9.14, "learning_rate": 5e-06, "loss": 0.6654, "step": 7800 }, { "epoch": 9.26, "learning_rate": 5e-06, "loss": 0.6516, "step": 7900 }, { "epoch": 9.38, "learning_rate": 5e-06, "loss": 0.6847, "step": 8000 }, { "epoch": 9.5, "learning_rate": 5e-06, "loss": 0.6396, "step": 8100 }, { "epoch": 9.61, "learning_rate": 5e-06, "loss": 0.6484, "step": 8200 }, { "epoch": 9.73, "learning_rate": 5e-06, "loss": 0.6396, "step": 8300 }, { "epoch": 9.85, "learning_rate": 5e-06, "loss": 0.6951, "step": 8400 }, { "epoch": 9.96, "learning_rate": 5e-06, "loss": 0.641, "step": 8500 }, { "epoch": 10.08, "learning_rate": 5e-06, "loss": 0.6379, "step": 8600 }, { "epoch": 10.2, "learning_rate": 5e-06, "loss": 0.6264, "step": 8700 }, { "epoch": 10.32, "learning_rate": 5e-06, "loss": 0.6364, "step": 8800 }, { "epoch": 10.43, "learning_rate": 5e-06, "loss": 0.676, "step": 8900 }, { "epoch": 10.55, "learning_rate": 5e-06, "loss": 0.6756, "step": 9000 }, { "epoch": 10.55, "eval_loss": 0.6359875202178955, "eval_runtime": 70.4519, "eval_samples_per_second": 3.023, "eval_steps_per_second": 3.023, "step": 9000 }, { "epoch": 10.67, "learning_rate": 5e-06, "loss": 0.6641, "step": 9100 }, { "epoch": 10.79, "learning_rate": 5e-06, "loss": 0.6126, "step": 9200 }, { "epoch": 10.9, "learning_rate": 5e-06, "loss": 0.6538, "step": 9300 }, { "epoch": 11.02, "learning_rate": 5e-06, "loss": 0.641, "step": 9400 }, { "epoch": 11.14, "learning_rate": 5e-06, "loss": 0.6501, "step": 9500 }, { "epoch": 11.25, "learning_rate": 5e-06, "loss": 0.647, "step": 9600 }, { "epoch": 11.37, "learning_rate": 5e-06, "loss": 0.6463, "step": 9700 }, { "epoch": 11.49, "learning_rate": 5e-06, "loss": 0.6507, "step": 9800 }, { "epoch": 11.61, "learning_rate": 5e-06, "loss": 0.6525, "step": 9900 }, { "epoch": 11.72, "learning_rate": 5e-06, "loss": 0.6194, "step": 10000 }, { "epoch": 11.84, "learning_rate": 5e-06, "loss": 0.6338, "step": 10100 }, { "epoch": 11.96, "learning_rate": 5e-06, "loss": 0.6492, "step": 10200 }, { "epoch": 12.08, "learning_rate": 5e-06, "loss": 0.6531, "step": 10300 }, { "epoch": 12.19, "learning_rate": 5e-06, "loss": 0.6073, "step": 10400 }, { "epoch": 12.31, "learning_rate": 5e-06, "loss": 0.6307, "step": 10500 }, { "epoch": 12.31, "eval_loss": 0.6309817433357239, "eval_runtime": 70.2077, "eval_samples_per_second": 3.034, "eval_steps_per_second": 3.034, "step": 10500 }, { "epoch": 12.43, "learning_rate": 5e-06, "loss": 0.6608, "step": 10600 }, { "epoch": 12.54, "learning_rate": 5e-06, "loss": 0.6252, "step": 10700 }, { "epoch": 12.66, "learning_rate": 5e-06, "loss": 0.6258, "step": 10800 }, { "epoch": 12.78, "learning_rate": 5e-06, "loss": 0.6504, "step": 10900 }, { "epoch": 12.9, "learning_rate": 5e-06, "loss": 0.6281, "step": 11000 }, { "epoch": 13.01, "learning_rate": 5e-06, "loss": 0.6398, "step": 11100 }, { "epoch": 13.13, "learning_rate": 5e-06, "loss": 0.6318, "step": 11200 }, { "epoch": 13.25, "learning_rate": 5e-06, "loss": 0.6162, "step": 11300 }, { "epoch": 13.36, "learning_rate": 5e-06, "loss": 0.6101, "step": 11400 }, { "epoch": 13.48, "learning_rate": 5e-06, "loss": 0.6124, "step": 11500 }, { "epoch": 13.6, "learning_rate": 5e-06, "loss": 0.5994, "step": 11600 }, { "epoch": 13.72, "learning_rate": 5e-06, "loss": 0.6599, "step": 11700 }, { "epoch": 13.83, "learning_rate": 5e-06, "loss": 0.6192, "step": 11800 }, { "epoch": 13.95, "learning_rate": 5e-06, "loss": 0.6341, "step": 11900 }, { "epoch": 14.07, "learning_rate": 5e-06, "loss": 0.6155, "step": 12000 }, { "epoch": 14.07, "eval_loss": 0.63248211145401, "eval_runtime": 70.8191, "eval_samples_per_second": 3.008, "eval_steps_per_second": 3.008, "step": 12000 }, { "epoch": 14.19, "learning_rate": 5e-06, "loss": 0.6562, "step": 12100 }, { "epoch": 14.3, "learning_rate": 5e-06, "loss": 0.633, "step": 12200 }, { "epoch": 14.42, "learning_rate": 5e-06, "loss": 0.6169, "step": 12300 }, { "epoch": 14.54, "learning_rate": 5e-06, "loss": 0.6312, "step": 12400 }, { "epoch": 14.65, "learning_rate": 5e-06, "loss": 0.6401, "step": 12500 }, { "epoch": 14.77, "learning_rate": 5e-06, "loss": 0.6365, "step": 12600 }, { "epoch": 14.89, "learning_rate": 5e-06, "loss": 0.6286, "step": 12700 }, { "epoch": 15.01, "learning_rate": 5e-06, "loss": 0.5877, "step": 12800 }, { "epoch": 15.12, "learning_rate": 5e-06, "loss": 0.6334, "step": 12900 }, { "epoch": 15.24, "learning_rate": 5e-06, "loss": 0.5785, "step": 13000 }, { "epoch": 15.36, "learning_rate": 5e-06, "loss": 0.6155, "step": 13100 }, { "epoch": 15.47, "learning_rate": 5e-06, "loss": 0.6404, "step": 13200 }, { "epoch": 15.59, "learning_rate": 5e-06, "loss": 0.6302, "step": 13300 }, { "epoch": 15.71, "learning_rate": 5e-06, "loss": 0.6154, "step": 13400 }, { "epoch": 15.83, "learning_rate": 5e-06, "loss": 0.6119, "step": 13500 }, { "epoch": 15.83, "eval_loss": 0.6228322386741638, "eval_runtime": 70.1345, "eval_samples_per_second": 3.037, "eval_steps_per_second": 3.037, "step": 13500 }, { "epoch": 15.94, "learning_rate": 5e-06, "loss": 0.6193, "step": 13600 }, { "epoch": 16.06, "learning_rate": 5e-06, "loss": 0.6161, "step": 13700 }, { "epoch": 16.18, "learning_rate": 5e-06, "loss": 0.5785, "step": 13800 }, { "epoch": 16.3, "learning_rate": 5e-06, "loss": 0.6043, "step": 13900 }, { "epoch": 16.41, "learning_rate": 5e-06, "loss": 0.6205, "step": 14000 }, { "epoch": 16.53, "learning_rate": 5e-06, "loss": 0.6321, "step": 14100 }, { "epoch": 16.65, "learning_rate": 5e-06, "loss": 0.5996, "step": 14200 }, { "epoch": 16.76, "learning_rate": 5e-06, "loss": 0.6232, "step": 14300 }, { "epoch": 16.88, "learning_rate": 5e-06, "loss": 0.6148, "step": 14400 }, { "epoch": 17.0, "learning_rate": 5e-06, "loss": 0.602, "step": 14500 }, { "epoch": 17.12, "learning_rate": 5e-06, "loss": 0.5716, "step": 14600 }, { "epoch": 17.23, "learning_rate": 5e-06, "loss": 0.629, "step": 14700 }, { "epoch": 17.35, "learning_rate": 5e-06, "loss": 0.6134, "step": 14800 }, { "epoch": 17.47, "learning_rate": 5e-06, "loss": 0.6023, "step": 14900 }, { "epoch": 17.58, "learning_rate": 5e-06, "loss": 0.5943, "step": 15000 }, { "epoch": 17.58, "eval_loss": 0.623548686504364, "eval_runtime": 70.4978, "eval_samples_per_second": 3.021, "eval_steps_per_second": 3.021, "step": 15000 }, { "epoch": 17.7, "learning_rate": 5e-06, "loss": 0.6092, "step": 15100 }, { "epoch": 17.82, "learning_rate": 5e-06, "loss": 0.6172, "step": 15200 }, { "epoch": 17.94, "learning_rate": 5e-06, "loss": 0.6247, "step": 15300 }, { "epoch": 18.05, "learning_rate": 5e-06, "loss": 0.6043, "step": 15400 }, { "epoch": 18.17, "learning_rate": 5e-06, "loss": 0.6186, "step": 15500 }, { "epoch": 18.29, "learning_rate": 5e-06, "loss": 0.5877, "step": 15600 }, { "epoch": 18.41, "learning_rate": 5e-06, "loss": 0.5993, "step": 15700 }, { "epoch": 18.52, "learning_rate": 5e-06, "loss": 0.5949, "step": 15800 }, { "epoch": 18.64, "learning_rate": 5e-06, "loss": 0.5775, "step": 15900 }, { "epoch": 18.76, "learning_rate": 5e-06, "loss": 0.6147, "step": 16000 }, { "epoch": 18.87, "learning_rate": 5e-06, "loss": 0.5973, "step": 16100 }, { "epoch": 18.99, "learning_rate": 5e-06, "loss": 0.6103, "step": 16200 }, { "epoch": 19.11, "learning_rate": 5e-06, "loss": 0.6024, "step": 16300 }, { "epoch": 19.23, "learning_rate": 5e-06, "loss": 0.5729, "step": 16400 }, { "epoch": 19.34, "learning_rate": 5e-06, "loss": 0.6012, "step": 16500 }, { "epoch": 19.34, "eval_loss": 0.6155942678451538, "eval_runtime": 70.5596, "eval_samples_per_second": 3.019, "eval_steps_per_second": 3.019, "step": 16500 }, { "epoch": 19.46, "learning_rate": 5e-06, "loss": 0.6123, "step": 16600 }, { "epoch": 19.58, "learning_rate": 5e-06, "loss": 0.5937, "step": 16700 }, { "epoch": 19.7, "learning_rate": 5e-06, "loss": 0.5824, "step": 16800 }, { "epoch": 19.81, "learning_rate": 5e-06, "loss": 0.6433, "step": 16900 }, { "epoch": 19.93, "learning_rate": 5e-06, "loss": 0.5799, "step": 17000 }, { "epoch": 20.05, "learning_rate": 5e-06, "loss": 0.593, "step": 17100 }, { "epoch": 20.16, "learning_rate": 5e-06, "loss": 0.5909, "step": 17200 }, { "epoch": 20.28, "learning_rate": 5e-06, "loss": 0.5918, "step": 17300 }, { "epoch": 20.4, "learning_rate": 5e-06, "loss": 0.5908, "step": 17400 }, { "epoch": 20.52, "learning_rate": 5e-06, "loss": 0.5932, "step": 17500 }, { "epoch": 20.63, "learning_rate": 5e-06, "loss": 0.6085, "step": 17600 }, { "epoch": 20.75, "learning_rate": 5e-06, "loss": 0.5737, "step": 17700 }, { "epoch": 20.87, "learning_rate": 5e-06, "loss": 0.5926, "step": 17800 }, { "epoch": 20.98, "learning_rate": 5e-06, "loss": 0.606, "step": 17900 }, { "epoch": 21.1, "learning_rate": 5e-06, "loss": 0.5834, "step": 18000 }, { "epoch": 21.1, "eval_loss": 0.6064698100090027, "eval_runtime": 70.551, "eval_samples_per_second": 3.019, "eval_steps_per_second": 3.019, "step": 18000 }, { "epoch": 21.22, "learning_rate": 5e-06, "loss": 0.57, "step": 18100 }, { "epoch": 21.34, "learning_rate": 5e-06, "loss": 0.5878, "step": 18200 }, { "epoch": 21.45, "learning_rate": 5e-06, "loss": 0.5623, "step": 18300 }, { "epoch": 21.57, "learning_rate": 5e-06, "loss": 0.5978, "step": 18400 }, { "epoch": 21.69, "learning_rate": 5e-06, "loss": 0.594, "step": 18500 }, { "epoch": 21.81, "learning_rate": 5e-06, "loss": 0.6013, "step": 18600 }, { "epoch": 21.92, "learning_rate": 5e-06, "loss": 0.5576, "step": 18700 }, { "epoch": 22.04, "learning_rate": 5e-06, "loss": 0.5794, "step": 18800 }, { "epoch": 22.16, "learning_rate": 5e-06, "loss": 0.5863, "step": 18900 }, { "epoch": 22.27, "learning_rate": 5e-06, "loss": 0.5956, "step": 19000 }, { "epoch": 22.39, "learning_rate": 5e-06, "loss": 0.5849, "step": 19100 }, { "epoch": 22.51, "learning_rate": 5e-06, "loss": 0.5705, "step": 19200 }, { "epoch": 22.63, "learning_rate": 5e-06, "loss": 0.5945, "step": 19300 }, { "epoch": 22.74, "learning_rate": 5e-06, "loss": 0.5673, "step": 19400 }, { "epoch": 22.86, "learning_rate": 5e-06, "loss": 0.5942, "step": 19500 }, { "epoch": 22.86, "eval_loss": 0.6019883155822754, "eval_runtime": 70.6262, "eval_samples_per_second": 3.016, "eval_steps_per_second": 3.016, "step": 19500 }, { "epoch": 22.98, "learning_rate": 5e-06, "loss": 0.556, "step": 19600 }, { "epoch": 23.09, "learning_rate": 5e-06, "loss": 0.5919, "step": 19700 }, { "epoch": 23.21, "learning_rate": 5e-06, "loss": 0.554, "step": 19800 }, { "epoch": 23.33, "learning_rate": 5e-06, "loss": 0.5708, "step": 19900 }, { "epoch": 23.45, "learning_rate": 5e-06, "loss": 0.5555, "step": 20000 }, { "epoch": 23.56, "learning_rate": 5e-06, "loss": 0.6004, "step": 20100 }, { "epoch": 23.68, "learning_rate": 5e-06, "loss": 0.5894, "step": 20200 }, { "epoch": 23.8, "learning_rate": 5e-06, "loss": 0.5718, "step": 20300 }, { "epoch": 23.92, "learning_rate": 5e-06, "loss": 0.5744, "step": 20400 }, { "epoch": 24.03, "learning_rate": 5e-06, "loss": 0.5602, "step": 20500 }, { "epoch": 24.15, "learning_rate": 5e-06, "loss": 0.5656, "step": 20600 }, { "epoch": 24.27, "learning_rate": 5e-06, "loss": 0.5657, "step": 20700 }, { "epoch": 24.38, "learning_rate": 5e-06, "loss": 0.5553, "step": 20800 }, { "epoch": 24.5, "learning_rate": 5e-06, "loss": 0.5962, "step": 20900 }, { "epoch": 24.62, "learning_rate": 5e-06, "loss": 0.5982, "step": 21000 }, { "epoch": 24.62, "eval_loss": 0.5987153053283691, "eval_runtime": 70.2465, "eval_samples_per_second": 3.032, "eval_steps_per_second": 3.032, "step": 21000 }, { "epoch": 24.74, "learning_rate": 5e-06, "loss": 0.5946, "step": 21100 }, { "epoch": 24.85, "learning_rate": 5e-06, "loss": 0.5473, "step": 21200 }, { "epoch": 24.97, "learning_rate": 5e-06, "loss": 0.5605, "step": 21300 }, { "epoch": 25.09, "learning_rate": 5e-06, "loss": 0.5953, "step": 21400 }, { "epoch": 25.21, "learning_rate": 5e-06, "loss": 0.5697, "step": 21500 }, { "epoch": 25.32, "learning_rate": 5e-06, "loss": 0.5627, "step": 21600 }, { "epoch": 25.44, "learning_rate": 5e-06, "loss": 0.567, "step": 21700 }, { "epoch": 25.56, "learning_rate": 5e-06, "loss": 0.5394, "step": 21800 }, { "epoch": 25.67, "learning_rate": 5e-06, "loss": 0.5461, "step": 21900 }, { "epoch": 25.79, "learning_rate": 5e-06, "loss": 0.5615, "step": 22000 }, { "epoch": 25.91, "learning_rate": 5e-06, "loss": 0.5547, "step": 22100 }, { "epoch": 26.03, "learning_rate": 5e-06, "loss": 0.5534, "step": 22200 }, { "epoch": 26.14, "learning_rate": 5e-06, "loss": 0.5494, "step": 22300 }, { "epoch": 26.26, "learning_rate": 5e-06, "loss": 0.569, "step": 22400 }, { "epoch": 26.38, "learning_rate": 5e-06, "loss": 0.5352, "step": 22500 }, { "epoch": 26.38, "eval_loss": 0.6042129397392273, "eval_runtime": 69.7858, "eval_samples_per_second": 3.052, "eval_steps_per_second": 3.052, "step": 22500 }, { "epoch": 26.49, "learning_rate": 5e-06, "loss": 0.5754, "step": 22600 }, { "epoch": 26.61, "learning_rate": 5e-06, "loss": 0.5443, "step": 22700 }, { "epoch": 26.73, "learning_rate": 5e-06, "loss": 0.5765, "step": 22800 }, { "epoch": 26.85, "learning_rate": 5e-06, "loss": 0.5494, "step": 22900 }, { "epoch": 26.96, "learning_rate": 5e-06, "loss": 0.5598, "step": 23000 }, { "epoch": 27.08, "learning_rate": 5e-06, "loss": 0.5634, "step": 23100 }, { "epoch": 27.2, "learning_rate": 5e-06, "loss": 0.534, "step": 23200 }, { "epoch": 27.32, "learning_rate": 5e-06, "loss": 0.5626, "step": 23300 }, { "epoch": 27.43, "learning_rate": 5e-06, "loss": 0.5681, "step": 23400 }, { "epoch": 27.55, "learning_rate": 5e-06, "loss": 0.5735, "step": 23500 }, { "epoch": 27.67, "learning_rate": 5e-06, "loss": 0.5464, "step": 23600 }, { "epoch": 27.78, "learning_rate": 5e-06, "loss": 0.529, "step": 23700 }, { "epoch": 27.9, "learning_rate": 5e-06, "loss": 0.548, "step": 23800 }, { "epoch": 28.02, "learning_rate": 5e-06, "loss": 0.5699, "step": 23900 }, { "epoch": 28.14, "learning_rate": 5e-06, "loss": 0.5746, "step": 24000 }, { "epoch": 28.14, "eval_loss": 0.5994372367858887, "eval_runtime": 70.1931, "eval_samples_per_second": 3.034, "eval_steps_per_second": 3.034, "step": 24000 }, { "epoch": 28.25, "learning_rate": 5e-06, "loss": 0.5537, "step": 24100 }, { "epoch": 28.37, "learning_rate": 5e-06, "loss": 0.5479, "step": 24200 }, { "epoch": 28.49, "learning_rate": 5e-06, "loss": 0.5643, "step": 24300 }, { "epoch": 28.6, "learning_rate": 5e-06, "loss": 0.5273, "step": 24400 }, { "epoch": 28.72, "learning_rate": 5e-06, "loss": 0.544, "step": 24500 }, { "epoch": 28.84, "learning_rate": 5e-06, "loss": 0.5172, "step": 24600 }, { "epoch": 28.96, "learning_rate": 5e-06, "loss": 0.5658, "step": 24700 }, { "epoch": 29.07, "learning_rate": 5e-06, "loss": 0.5343, "step": 24800 }, { "epoch": 29.19, "learning_rate": 5e-06, "loss": 0.5307, "step": 24900 }, { "epoch": 29.31, "learning_rate": 5e-06, "loss": 0.5386, "step": 25000 }, { "epoch": 29.43, "learning_rate": 5e-06, "loss": 0.5553, "step": 25100 }, { "epoch": 29.54, "learning_rate": 5e-06, "loss": 0.5309, "step": 25200 }, { "epoch": 29.66, "learning_rate": 5e-06, "loss": 0.5323, "step": 25300 }, { "epoch": 29.78, "learning_rate": 5e-06, "loss": 0.5477, "step": 25400 }, { "epoch": 29.89, "learning_rate": 5e-06, "loss": 0.5618, "step": 25500 }, { "epoch": 29.89, "eval_loss": 0.5992656350135803, "eval_runtime": 70.8526, "eval_samples_per_second": 3.006, "eval_steps_per_second": 3.006, "step": 25500 }, { "epoch": 30.01, "learning_rate": 5e-06, "loss": 0.5368, "step": 25600 }, { "epoch": 30.13, "learning_rate": 5e-06, "loss": 0.55, "step": 25700 }, { "epoch": 30.25, "learning_rate": 5e-06, "loss": 0.5138, "step": 25800 }, { "epoch": 30.36, "learning_rate": 5e-06, "loss": 0.5266, "step": 25900 }, { "epoch": 30.48, "learning_rate": 5e-06, "loss": 0.5539, "step": 26000 }, { "epoch": 30.6, "learning_rate": 5e-06, "loss": 0.536, "step": 26100 }, { "epoch": 30.72, "learning_rate": 5e-06, "loss": 0.5427, "step": 26200 }, { "epoch": 30.83, "learning_rate": 5e-06, "loss": 0.5496, "step": 26300 }, { "epoch": 30.95, "learning_rate": 5e-06, "loss": 0.5127, "step": 26400 }, { "epoch": 31.07, "learning_rate": 5e-06, "loss": 0.5569, "step": 26500 }, { "epoch": 31.18, "learning_rate": 5e-06, "loss": 0.5196, "step": 26600 }, { "epoch": 31.3, "learning_rate": 5e-06, "loss": 0.5268, "step": 26700 }, { "epoch": 31.42, "learning_rate": 5e-06, "loss": 0.5419, "step": 26800 }, { "epoch": 31.54, "learning_rate": 5e-06, "loss": 0.5087, "step": 26900 }, { "epoch": 31.65, "learning_rate": 5e-06, "loss": 0.5254, "step": 27000 }, { "epoch": 31.65, "eval_loss": 0.5909192562103271, "eval_runtime": 70.5089, "eval_samples_per_second": 3.021, "eval_steps_per_second": 3.021, "step": 27000 }, { "epoch": 31.77, "learning_rate": 5e-06, "loss": 0.5346, "step": 27100 }, { "epoch": 31.89, "learning_rate": 5e-06, "loss": 0.5279, "step": 27200 }, { "epoch": 32.0, "learning_rate": 5e-06, "loss": 0.5711, "step": 27300 }, { "epoch": 32.12, "learning_rate": 5e-06, "loss": 0.5079, "step": 27400 }, { "epoch": 32.24, "learning_rate": 5e-06, "loss": 0.5303, "step": 27500 }, { "epoch": 32.36, "learning_rate": 5e-06, "loss": 0.5347, "step": 27600 }, { "epoch": 32.47, "learning_rate": 5e-06, "loss": 0.4936, "step": 27700 }, { "epoch": 32.59, "learning_rate": 5e-06, "loss": 0.5303, "step": 27800 }, { "epoch": 32.71, "learning_rate": 5e-06, "loss": 0.5543, "step": 27900 }, { "epoch": 32.83, "learning_rate": 5e-06, "loss": 0.5266, "step": 28000 }, { "epoch": 32.94, "learning_rate": 5e-06, "loss": 0.5258, "step": 28100 }, { "epoch": 33.06, "learning_rate": 5e-06, "loss": 0.5559, "step": 28200 }, { "epoch": 33.18, "learning_rate": 5e-06, "loss": 0.5096, "step": 28300 }, { "epoch": 33.29, "learning_rate": 5e-06, "loss": 0.5427, "step": 28400 }, { "epoch": 33.41, "learning_rate": 5e-06, "loss": 0.5336, "step": 28500 }, { "epoch": 33.41, "eval_loss": 0.587517261505127, "eval_runtime": 70.6475, "eval_samples_per_second": 3.015, "eval_steps_per_second": 3.015, "step": 28500 }, { "epoch": 33.53, "learning_rate": 5e-06, "loss": 0.5419, "step": 28600 }, { "epoch": 33.65, "learning_rate": 5e-06, "loss": 0.5235, "step": 28700 }, { "epoch": 33.76, "learning_rate": 5e-06, "loss": 0.5266, "step": 28800 }, { "epoch": 33.88, "learning_rate": 5e-06, "loss": 0.5308, "step": 28900 }, { "epoch": 34.0, "learning_rate": 5e-06, "loss": 0.5203, "step": 29000 }, { "epoch": 34.11, "learning_rate": 5e-06, "loss": 0.4988, "step": 29100 }, { "epoch": 34.23, "learning_rate": 5e-06, "loss": 0.502, "step": 29200 }, { "epoch": 34.35, "learning_rate": 5e-06, "loss": 0.5111, "step": 29300 }, { "epoch": 34.47, "learning_rate": 5e-06, "loss": 0.539, "step": 29400 }, { "epoch": 34.58, "learning_rate": 5e-06, "loss": 0.5086, "step": 29500 }, { "epoch": 34.7, "learning_rate": 5e-06, "loss": 0.5285, "step": 29600 }, { "epoch": 34.82, "learning_rate": 5e-06, "loss": 0.5153, "step": 29700 }, { "epoch": 34.94, "learning_rate": 5e-06, "loss": 0.5366, "step": 29800 }, { "epoch": 35.05, "learning_rate": 5e-06, "loss": 0.5307, "step": 29900 }, { "epoch": 35.17, "learning_rate": 5e-06, "loss": 0.5677, "step": 30000 }, { "epoch": 35.17, "eval_loss": 0.591385543346405, "eval_runtime": 70.1376, "eval_samples_per_second": 3.037, "eval_steps_per_second": 3.037, "step": 30000 }, { "epoch": 35.29, "learning_rate": 5e-06, "loss": 0.5283, "step": 30100 }, { "epoch": 35.4, "learning_rate": 5e-06, "loss": 0.5281, "step": 30200 }, { "epoch": 35.52, "learning_rate": 5e-06, "loss": 0.5087, "step": 30300 }, { "epoch": 35.64, "learning_rate": 5e-06, "loss": 0.4965, "step": 30400 }, { "epoch": 35.76, "learning_rate": 5e-06, "loss": 0.5085, "step": 30500 }, { "epoch": 35.87, "learning_rate": 5e-06, "loss": 0.5159, "step": 30600 }, { "epoch": 35.99, "learning_rate": 5e-06, "loss": 0.5149, "step": 30700 }, { "epoch": 36.11, "learning_rate": 5e-06, "loss": 0.5281, "step": 30800 }, { "epoch": 36.23, "learning_rate": 5e-06, "loss": 0.511, "step": 30900 }, { "epoch": 36.34, "learning_rate": 5e-06, "loss": 0.5327, "step": 31000 }, { "epoch": 36.46, "learning_rate": 5e-06, "loss": 0.5267, "step": 31100 }, { "epoch": 36.58, "learning_rate": 5e-06, "loss": 0.5124, "step": 31200 }, { "epoch": 36.69, "learning_rate": 5e-06, "loss": 0.5069, "step": 31300 }, { "epoch": 36.81, "learning_rate": 5e-06, "loss": 0.4839, "step": 31400 }, { "epoch": 36.93, "learning_rate": 5e-06, "loss": 0.5009, "step": 31500 }, { "epoch": 36.93, "eval_loss": 0.6007654070854187, "eval_runtime": 70.1373, "eval_samples_per_second": 3.037, "eval_steps_per_second": 3.037, "step": 31500 }, { "epoch": 37.05, "learning_rate": 5e-06, "loss": 0.5169, "step": 31600 }, { "epoch": 37.16, "learning_rate": 5e-06, "loss": 0.5003, "step": 31700 }, { "epoch": 37.28, "learning_rate": 5e-06, "loss": 0.502, "step": 31800 }, { "epoch": 37.4, "learning_rate": 5e-06, "loss": 0.5149, "step": 31900 }, { "epoch": 37.51, "learning_rate": 5e-06, "loss": 0.4903, "step": 32000 } ], "max_steps": 255900, "num_train_epochs": 300, "total_flos": 3.193778270424269e+19, "trial_name": null, "trial_params": null }