{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.976, "eval_steps": 100, "global_step": 6100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 83.78450775146484, "learning_rate": 7.44e-06, "loss": 3.2768, "step": 100 }, { "epoch": 0.016, "eval_all-nli-dev_cosine_accuracy": 0.815, "eval_all-nli-dev_dot_accuracy": 0.316, "eval_all-nli-dev_euclidean_accuracy": 0.804, "eval_all-nli-dev_manhattan_accuracy": 0.833, "eval_all-nli-dev_max_accuracy": 0.833, "eval_loss": 1.8052657842636108, "eval_runtime": 8.0216, "eval_samples_per_second": 124.664, "eval_steps_per_second": 7.854, "step": 100 }, { "epoch": 0.032, "grad_norm": 62.54200744628906, "learning_rate": 1.544e-05, "loss": 1.1697, "step": 200 }, { "epoch": 0.032, "eval_all-nli-dev_cosine_accuracy": 0.842, "eval_all-nli-dev_dot_accuracy": 0.171, "eval_all-nli-dev_euclidean_accuracy": 0.838, "eval_all-nli-dev_manhattan_accuracy": 0.861, "eval_all-nli-dev_max_accuracy": 0.861, "eval_loss": 1.2878086566925049, "eval_runtime": 7.8831, "eval_samples_per_second": 126.853, "eval_steps_per_second": 7.992, "step": 200 }, { "epoch": 0.048, "grad_norm": 75.54035949707031, "learning_rate": 2.344e-05, "loss": 1.372, "step": 300 }, { "epoch": 0.048, "eval_all-nli-dev_cosine_accuracy": 0.841, "eval_all-nli-dev_dot_accuracy": 0.181, "eval_all-nli-dev_euclidean_accuracy": 0.844, "eval_all-nli-dev_manhattan_accuracy": 0.861, "eval_all-nli-dev_max_accuracy": 0.861, "eval_loss": 1.2466014623641968, "eval_runtime": 7.8792, "eval_samples_per_second": 126.916, "eval_steps_per_second": 7.996, "step": 300 }, { "epoch": 0.064, "grad_norm": 33.85651779174805, "learning_rate": 3.136e-05, "loss": 1.0476, "step": 400 }, { "epoch": 0.064, "eval_all-nli-dev_cosine_accuracy": 0.848, "eval_all-nli-dev_dot_accuracy": 0.201, "eval_all-nli-dev_euclidean_accuracy": 0.848, "eval_all-nli-dev_manhattan_accuracy": 0.863, "eval_all-nli-dev_max_accuracy": 0.863, "eval_loss": 1.2291330099105835, "eval_runtime": 7.8507, "eval_samples_per_second": 127.378, "eval_steps_per_second": 8.025, "step": 400 }, { "epoch": 0.08, "grad_norm": 27.536640167236328, "learning_rate": 3.936e-05, "loss": 0.8588, "step": 500 }, { "epoch": 0.08, "eval_all-nli-dev_cosine_accuracy": 0.821, "eval_all-nli-dev_dot_accuracy": 0.213, "eval_all-nli-dev_euclidean_accuracy": 0.828, "eval_all-nli-dev_manhattan_accuracy": 0.838, "eval_all-nli-dev_max_accuracy": 0.838, "eval_loss": 1.5258921384811401, "eval_runtime": 7.8331, "eval_samples_per_second": 127.663, "eval_steps_per_second": 8.043, "step": 500 }, { "epoch": 0.096, "grad_norm": 0.15947222709655762, "learning_rate": 4.736000000000001e-05, "loss": 2.9781, "step": 600 }, { "epoch": 0.096, "eval_all-nli-dev_cosine_accuracy": 0.462, "eval_all-nli-dev_dot_accuracy": 0.363, "eval_all-nli-dev_euclidean_accuracy": 0.462, "eval_all-nli-dev_manhattan_accuracy": 0.463, "eval_all-nli-dev_max_accuracy": 0.463, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9535, "eval_samples_per_second": 125.73, "eval_steps_per_second": 7.921, "step": 600 }, { "epoch": 0.112, "grad_norm": 0.09273126721382141, "learning_rate": 4.9404444444444447e-05, "loss": 3.4982, "step": 700 }, { "epoch": 0.112, "eval_all-nli-dev_cosine_accuracy": 0.449, "eval_all-nli-dev_dot_accuracy": 0.381, "eval_all-nli-dev_euclidean_accuracy": 0.449, "eval_all-nli-dev_manhattan_accuracy": 0.457, "eval_all-nli-dev_max_accuracy": 0.457, "eval_loss": 3.4308860301971436, "eval_runtime": 7.905, "eval_samples_per_second": 126.501, "eval_steps_per_second": 7.97, "step": 700 }, { "epoch": 0.128, "grad_norm": 0.047491393983364105, "learning_rate": 4.851555555555556e-05, "loss": 3.467, "step": 800 }, { "epoch": 0.128, "eval_all-nli-dev_cosine_accuracy": 0.471, "eval_all-nli-dev_dot_accuracy": 0.366, "eval_all-nli-dev_euclidean_accuracy": 0.471, "eval_all-nli-dev_manhattan_accuracy": 0.479, "eval_all-nli-dev_max_accuracy": 0.479, "eval_loss": 3.4308860301971436, "eval_runtime": 7.8623, "eval_samples_per_second": 127.188, "eval_steps_per_second": 8.013, "step": 800 }, { "epoch": 0.144, "grad_norm": 0.05457993969321251, "learning_rate": 4.762666666666667e-05, "loss": 3.4665, "step": 900 }, { "epoch": 0.144, "eval_all-nli-dev_cosine_accuracy": 0.452, "eval_all-nli-dev_dot_accuracy": 0.342, "eval_all-nli-dev_euclidean_accuracy": 0.452, "eval_all-nli-dev_manhattan_accuracy": 0.446, "eval_all-nli-dev_max_accuracy": 0.452, "eval_loss": 3.4308860301971436, "eval_runtime": 8.0107, "eval_samples_per_second": 124.832, "eval_steps_per_second": 7.864, "step": 900 }, { "epoch": 0.16, "grad_norm": 0.03311806544661522, "learning_rate": 4.673777777777778e-05, "loss": 3.4664, "step": 1000 }, { "epoch": 0.16, "eval_all-nli-dev_cosine_accuracy": 0.477, "eval_all-nli-dev_dot_accuracy": 0.352, "eval_all-nli-dev_euclidean_accuracy": 0.477, "eval_all-nli-dev_manhattan_accuracy": 0.468, "eval_all-nli-dev_max_accuracy": 0.477, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9233, "eval_samples_per_second": 126.21, "eval_steps_per_second": 7.951, "step": 1000 }, { "epoch": 0.176, "grad_norm": 0.02707391045987606, "learning_rate": 4.584888888888889e-05, "loss": 3.4663, "step": 1100 }, { "epoch": 0.176, "eval_all-nli-dev_cosine_accuracy": 0.458, "eval_all-nli-dev_dot_accuracy": 0.376, "eval_all-nli-dev_euclidean_accuracy": 0.458, "eval_all-nli-dev_manhattan_accuracy": 0.452, "eval_all-nli-dev_max_accuracy": 0.458, "eval_loss": 3.4308860301971436, "eval_runtime": 7.8521, "eval_samples_per_second": 127.354, "eval_steps_per_second": 8.023, "step": 1100 }, { "epoch": 0.192, "grad_norm": 0.030668186023831367, "learning_rate": 4.496e-05, "loss": 3.4661, "step": 1200 }, { "epoch": 0.192, "eval_all-nli-dev_cosine_accuracy": 0.46, "eval_all-nli-dev_dot_accuracy": 0.39, "eval_all-nli-dev_euclidean_accuracy": 0.46, "eval_all-nli-dev_manhattan_accuracy": 0.462, "eval_all-nli-dev_max_accuracy": 0.462, "eval_loss": 3.4308862686157227, "eval_runtime": 8.0488, "eval_samples_per_second": 124.243, "eval_steps_per_second": 7.827, "step": 1200 }, { "epoch": 0.208, "grad_norm": 0.04911087080836296, "learning_rate": 4.4071111111111115e-05, "loss": 3.4658, "step": 1300 }, { "epoch": 0.208, "eval_all-nli-dev_cosine_accuracy": 0.441, "eval_all-nli-dev_dot_accuracy": 0.338, "eval_all-nli-dev_euclidean_accuracy": 0.441, "eval_all-nli-dev_manhattan_accuracy": 0.45, "eval_all-nli-dev_max_accuracy": 0.45, "eval_loss": 3.4308857917785645, "eval_runtime": 7.9015, "eval_samples_per_second": 126.558, "eval_steps_per_second": 7.973, "step": 1300 }, { "epoch": 0.224, "grad_norm": 0.02693816088140011, "learning_rate": 4.3182222222222226e-05, "loss": 3.4661, "step": 1400 }, { "epoch": 0.224, "eval_all-nli-dev_cosine_accuracy": 0.475, "eval_all-nli-dev_dot_accuracy": 0.31, "eval_all-nli-dev_euclidean_accuracy": 0.475, "eval_all-nli-dev_manhattan_accuracy": 0.481, "eval_all-nli-dev_max_accuracy": 0.481, "eval_loss": 3.4308862686157227, "eval_runtime": 7.9424, "eval_samples_per_second": 125.906, "eval_steps_per_second": 7.932, "step": 1400 }, { "epoch": 0.24, "grad_norm": 0.17906944453716278, "learning_rate": 4.229333333333334e-05, "loss": 3.4877, "step": 1500 }, { "epoch": 0.24, "eval_all-nli-dev_cosine_accuracy": 0.464, "eval_all-nli-dev_dot_accuracy": 0.358, "eval_all-nli-dev_euclidean_accuracy": 0.464, "eval_all-nli-dev_manhattan_accuracy": 0.458, "eval_all-nli-dev_max_accuracy": 0.464, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9335, "eval_samples_per_second": 126.048, "eval_steps_per_second": 7.941, "step": 1500 }, { "epoch": 0.256, "grad_norm": 0.07473237067461014, "learning_rate": 4.140444444444445e-05, "loss": 3.4675, "step": 1600 }, { "epoch": 0.256, "eval_all-nli-dev_cosine_accuracy": 0.462, "eval_all-nli-dev_dot_accuracy": 0.347, "eval_all-nli-dev_euclidean_accuracy": 0.462, "eval_all-nli-dev_manhattan_accuracy": 0.457, "eval_all-nli-dev_max_accuracy": 0.462, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9155, "eval_samples_per_second": 126.335, "eval_steps_per_second": 7.959, "step": 1600 }, { "epoch": 0.272, "grad_norm": 0.06472893804311752, "learning_rate": 4.051555555555556e-05, "loss": 3.4665, "step": 1700 }, { "epoch": 0.272, "eval_all-nli-dev_cosine_accuracy": 0.488, "eval_all-nli-dev_dot_accuracy": 0.394, "eval_all-nli-dev_euclidean_accuracy": 0.488, "eval_all-nli-dev_manhattan_accuracy": 0.487, "eval_all-nli-dev_max_accuracy": 0.488, "eval_loss": 3.4308862686157227, "eval_runtime": 7.7762, "eval_samples_per_second": 128.597, "eval_steps_per_second": 8.102, "step": 1700 }, { "epoch": 0.288, "grad_norm": 0.05817211791872978, "learning_rate": 3.9626666666666664e-05, "loss": 3.4667, "step": 1800 }, { "epoch": 0.288, "eval_all-nli-dev_cosine_accuracy": 0.492, "eval_all-nli-dev_dot_accuracy": 0.396, "eval_all-nli-dev_euclidean_accuracy": 0.492, "eval_all-nli-dev_manhattan_accuracy": 0.483, "eval_all-nli-dev_max_accuracy": 0.492, "eval_loss": 3.4308860301971436, "eval_runtime": 7.8429, "eval_samples_per_second": 127.504, "eval_steps_per_second": 8.033, "step": 1800 }, { "epoch": 0.304, "grad_norm": 0.06118469312787056, "learning_rate": 3.8737777777777776e-05, "loss": 3.4664, "step": 1900 }, { "epoch": 0.304, "eval_all-nli-dev_cosine_accuracy": 0.452, "eval_all-nli-dev_dot_accuracy": 0.377, "eval_all-nli-dev_euclidean_accuracy": 0.452, "eval_all-nli-dev_manhattan_accuracy": 0.455, "eval_all-nli-dev_max_accuracy": 0.455, "eval_loss": 3.4308860301971436, "eval_runtime": 7.8193, "eval_samples_per_second": 127.888, "eval_steps_per_second": 8.057, "step": 1900 }, { "epoch": 0.32, "grad_norm": 0.034288015216588974, "learning_rate": 3.784888888888889e-05, "loss": 3.4661, "step": 2000 }, { "epoch": 0.32, "eval_all-nli-dev_cosine_accuracy": 0.446, "eval_all-nli-dev_dot_accuracy": 0.35, "eval_all-nli-dev_euclidean_accuracy": 0.446, "eval_all-nli-dev_manhattan_accuracy": 0.453, "eval_all-nli-dev_max_accuracy": 0.453, "eval_loss": 3.4308860301971436, "eval_runtime": 7.8176, "eval_samples_per_second": 127.916, "eval_steps_per_second": 8.059, "step": 2000 }, { "epoch": 0.336, "grad_norm": 0.06111547723412514, "learning_rate": 3.696e-05, "loss": 3.4666, "step": 2100 }, { "epoch": 0.336, "eval_all-nli-dev_cosine_accuracy": 0.469, "eval_all-nli-dev_dot_accuracy": 0.39, "eval_all-nli-dev_euclidean_accuracy": 0.469, "eval_all-nli-dev_manhattan_accuracy": 0.477, "eval_all-nli-dev_max_accuracy": 0.477, "eval_loss": 3.4308862686157227, "eval_runtime": 7.8728, "eval_samples_per_second": 127.02, "eval_steps_per_second": 8.002, "step": 2100 }, { "epoch": 0.352, "grad_norm": 0.06624756008386612, "learning_rate": 3.607111111111111e-05, "loss": 3.4683, "step": 2200 }, { "epoch": 0.352, "eval_all-nli-dev_cosine_accuracy": 0.478, "eval_all-nli-dev_dot_accuracy": 0.338, "eval_all-nli-dev_euclidean_accuracy": 0.478, "eval_all-nli-dev_manhattan_accuracy": 0.48, "eval_all-nli-dev_max_accuracy": 0.48, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9247, "eval_samples_per_second": 126.188, "eval_steps_per_second": 7.95, "step": 2200 }, { "epoch": 0.368, "grad_norm": 0.031209247186779976, "learning_rate": 3.518222222222222e-05, "loss": 3.4663, "step": 2300 }, { "epoch": 0.368, "eval_all-nli-dev_cosine_accuracy": 0.469, "eval_all-nli-dev_dot_accuracy": 0.312, "eval_all-nli-dev_euclidean_accuracy": 0.469, "eval_all-nli-dev_manhattan_accuracy": 0.464, "eval_all-nli-dev_max_accuracy": 0.469, "eval_loss": 3.4308860301971436, "eval_runtime": 7.8393, "eval_samples_per_second": 127.562, "eval_steps_per_second": 8.036, "step": 2300 }, { "epoch": 0.384, "grad_norm": 0.133990079164505, "learning_rate": 3.429333333333333e-05, "loss": 3.4667, "step": 2400 }, { "epoch": 0.384, "eval_all-nli-dev_cosine_accuracy": 0.448, "eval_all-nli-dev_dot_accuracy": 0.404, "eval_all-nli-dev_euclidean_accuracy": 0.448, "eval_all-nli-dev_manhattan_accuracy": 0.445, "eval_all-nli-dev_max_accuracy": 0.448, "eval_loss": 3.4308860301971436, "eval_runtime": 8.021, "eval_samples_per_second": 124.672, "eval_steps_per_second": 7.854, "step": 2400 }, { "epoch": 0.4, "grad_norm": 0.04892706498503685, "learning_rate": 3.3404444444444444e-05, "loss": 3.4669, "step": 2500 }, { "epoch": 0.4, "eval_all-nli-dev_cosine_accuracy": 0.499, "eval_all-nli-dev_dot_accuracy": 0.365, "eval_all-nli-dev_euclidean_accuracy": 0.499, "eval_all-nli-dev_manhattan_accuracy": 0.492, "eval_all-nli-dev_max_accuracy": 0.499, "eval_loss": 3.4308862686157227, "eval_runtime": 7.8451, "eval_samples_per_second": 127.468, "eval_steps_per_second": 8.03, "step": 2500 }, { "epoch": 0.416, "grad_norm": 0.053150493651628494, "learning_rate": 3.2515555555555555e-05, "loss": 3.4661, "step": 2600 }, { "epoch": 0.416, "eval_all-nli-dev_cosine_accuracy": 0.453, "eval_all-nli-dev_dot_accuracy": 0.349, "eval_all-nli-dev_euclidean_accuracy": 0.453, "eval_all-nli-dev_manhattan_accuracy": 0.45, "eval_all-nli-dev_max_accuracy": 0.453, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9334, "eval_samples_per_second": 126.05, "eval_steps_per_second": 7.941, "step": 2600 }, { "epoch": 0.432, "grad_norm": 0.09555792808532715, "learning_rate": 3.1626666666666667e-05, "loss": 3.4656, "step": 2700 }, { "epoch": 0.432, "eval_all-nli-dev_cosine_accuracy": 0.466, "eval_all-nli-dev_dot_accuracy": 0.336, "eval_all-nli-dev_euclidean_accuracy": 0.466, "eval_all-nli-dev_manhattan_accuracy": 0.467, "eval_all-nli-dev_max_accuracy": 0.467, "eval_loss": 3.4308860301971436, "eval_runtime": 7.8562, "eval_samples_per_second": 127.287, "eval_steps_per_second": 8.019, "step": 2700 }, { "epoch": 0.448, "grad_norm": 0.01744948886334896, "learning_rate": 3.0737777777777785e-05, "loss": 3.4662, "step": 2800 }, { "epoch": 0.448, "eval_all-nli-dev_cosine_accuracy": 0.506, "eval_all-nli-dev_dot_accuracy": 0.4, "eval_all-nli-dev_euclidean_accuracy": 0.506, "eval_all-nli-dev_manhattan_accuracy": 0.507, "eval_all-nli-dev_max_accuracy": 0.507, "eval_loss": 3.4308860301971436, "eval_runtime": 7.8617, "eval_samples_per_second": 127.199, "eval_steps_per_second": 8.014, "step": 2800 }, { "epoch": 0.464, "grad_norm": 0.2527749538421631, "learning_rate": 2.986666666666667e-05, "loss": 3.4902, "step": 2900 }, { "epoch": 0.464, "eval_all-nli-dev_cosine_accuracy": 0.463, "eval_all-nli-dev_dot_accuracy": 0.338, "eval_all-nli-dev_euclidean_accuracy": 0.463, "eval_all-nli-dev_manhattan_accuracy": 0.473, "eval_all-nli-dev_max_accuracy": 0.473, "eval_loss": 3.4308860301971436, "eval_runtime": 7.8074, "eval_samples_per_second": 128.083, "eval_steps_per_second": 8.069, "step": 2900 }, { "epoch": 0.48, "grad_norm": 0.02826513536274433, "learning_rate": 2.897777777777778e-05, "loss": 3.4663, "step": 3000 }, { "epoch": 0.48, "eval_all-nli-dev_cosine_accuracy": 0.454, "eval_all-nli-dev_dot_accuracy": 0.338, "eval_all-nli-dev_euclidean_accuracy": 0.454, "eval_all-nli-dev_manhattan_accuracy": 0.469, "eval_all-nli-dev_max_accuracy": 0.469, "eval_loss": 3.4308860301971436, "eval_runtime": 7.8421, "eval_samples_per_second": 127.516, "eval_steps_per_second": 8.034, "step": 3000 }, { "epoch": 0.496, "grad_norm": 0.07985329627990723, "learning_rate": 2.8088888888888893e-05, "loss": 3.554, "step": 3100 }, { "epoch": 0.496, "eval_all-nli-dev_cosine_accuracy": 0.456, "eval_all-nli-dev_dot_accuracy": 0.359, "eval_all-nli-dev_euclidean_accuracy": 0.456, "eval_all-nli-dev_manhattan_accuracy": 0.46, "eval_all-nli-dev_max_accuracy": 0.46, "eval_loss": 3.4308860301971436, "eval_runtime": 7.8756, "eval_samples_per_second": 126.975, "eval_steps_per_second": 7.999, "step": 3100 }, { "epoch": 0.512, "grad_norm": 0.1259102076292038, "learning_rate": 2.7200000000000004e-05, "loss": 3.4664, "step": 3200 }, { "epoch": 0.512, "eval_all-nli-dev_cosine_accuracy": 0.455, "eval_all-nli-dev_dot_accuracy": 0.257, "eval_all-nli-dev_euclidean_accuracy": 0.455, "eval_all-nli-dev_manhattan_accuracy": 0.454, "eval_all-nli-dev_max_accuracy": 0.455, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9019, "eval_samples_per_second": 126.552, "eval_steps_per_second": 7.973, "step": 3200 }, { "epoch": 0.528, "grad_norm": 0.06493524461984634, "learning_rate": 2.6311111111111115e-05, "loss": 3.4668, "step": 3300 }, { "epoch": 0.528, "eval_all-nli-dev_cosine_accuracy": 0.448, "eval_all-nli-dev_dot_accuracy": 0.383, "eval_all-nli-dev_euclidean_accuracy": 0.448, "eval_all-nli-dev_manhattan_accuracy": 0.46, "eval_all-nli-dev_max_accuracy": 0.46, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9333, "eval_samples_per_second": 126.051, "eval_steps_per_second": 7.941, "step": 3300 }, { "epoch": 0.544, "grad_norm": 0.0680716335773468, "learning_rate": 2.5422222222222227e-05, "loss": 3.4661, "step": 3400 }, { "epoch": 0.544, "eval_all-nli-dev_cosine_accuracy": 0.492, "eval_all-nli-dev_dot_accuracy": 0.407, "eval_all-nli-dev_euclidean_accuracy": 0.492, "eval_all-nli-dev_manhattan_accuracy": 0.481, "eval_all-nli-dev_max_accuracy": 0.492, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9192, "eval_samples_per_second": 126.276, "eval_steps_per_second": 7.955, "step": 3400 }, { "epoch": 0.56, "grad_norm": 0.04947361350059509, "learning_rate": 2.4533333333333334e-05, "loss": 3.4667, "step": 3500 }, { "epoch": 0.56, "eval_all-nli-dev_cosine_accuracy": 0.432, "eval_all-nli-dev_dot_accuracy": 0.391, "eval_all-nli-dev_euclidean_accuracy": 0.432, "eval_all-nli-dev_manhattan_accuracy": 0.427, "eval_all-nli-dev_max_accuracy": 0.432, "eval_loss": 3.4308857917785645, "eval_runtime": 7.8702, "eval_samples_per_second": 127.061, "eval_steps_per_second": 8.005, "step": 3500 }, { "epoch": 0.576, "grad_norm": 0.07287462800741196, "learning_rate": 2.3644444444444446e-05, "loss": 3.4668, "step": 3600 }, { "epoch": 0.576, "eval_all-nli-dev_cosine_accuracy": 0.482, "eval_all-nli-dev_dot_accuracy": 0.374, "eval_all-nli-dev_euclidean_accuracy": 0.482, "eval_all-nli-dev_manhattan_accuracy": 0.486, "eval_all-nli-dev_max_accuracy": 0.486, "eval_loss": 3.4308860301971436, "eval_runtime": 7.8562, "eval_samples_per_second": 127.288, "eval_steps_per_second": 8.019, "step": 3600 }, { "epoch": 0.592, "grad_norm": 0.028324555605649948, "learning_rate": 2.2755555555555557e-05, "loss": 3.4666, "step": 3700 }, { "epoch": 0.592, "eval_all-nli-dev_cosine_accuracy": 0.469, "eval_all-nli-dev_dot_accuracy": 0.351, "eval_all-nli-dev_euclidean_accuracy": 0.469, "eval_all-nli-dev_manhattan_accuracy": 0.46, "eval_all-nli-dev_max_accuracy": 0.469, "eval_loss": 3.4308865070343018, "eval_runtime": 7.879, "eval_samples_per_second": 126.919, "eval_steps_per_second": 7.996, "step": 3700 }, { "epoch": 0.608, "grad_norm": 0.04251887649297714, "learning_rate": 2.186666666666667e-05, "loss": 3.4669, "step": 3800 }, { "epoch": 0.608, "eval_all-nli-dev_cosine_accuracy": 0.473, "eval_all-nli-dev_dot_accuracy": 0.369, "eval_all-nli-dev_euclidean_accuracy": 0.473, "eval_all-nli-dev_manhattan_accuracy": 0.467, "eval_all-nli-dev_max_accuracy": 0.473, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9346, "eval_samples_per_second": 126.031, "eval_steps_per_second": 7.94, "step": 3800 }, { "epoch": 0.624, "grad_norm": 0.01685403846204281, "learning_rate": 2.097777777777778e-05, "loss": 3.4658, "step": 3900 }, { "epoch": 0.624, "eval_all-nli-dev_cosine_accuracy": 0.487, "eval_all-nli-dev_dot_accuracy": 0.333, "eval_all-nli-dev_euclidean_accuracy": 0.487, "eval_all-nli-dev_manhattan_accuracy": 0.486, "eval_all-nli-dev_max_accuracy": 0.487, "eval_loss": 3.4308860301971436, "eval_runtime": 8.0469, "eval_samples_per_second": 124.272, "eval_steps_per_second": 7.829, "step": 3900 }, { "epoch": 0.64, "grad_norm": 0.16402187943458557, "learning_rate": 2.008888888888889e-05, "loss": 3.4663, "step": 4000 }, { "epoch": 0.64, "eval_all-nli-dev_cosine_accuracy": 0.448, "eval_all-nli-dev_dot_accuracy": 0.355, "eval_all-nli-dev_euclidean_accuracy": 0.448, "eval_all-nli-dev_manhattan_accuracy": 0.442, "eval_all-nli-dev_max_accuracy": 0.448, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9755, "eval_samples_per_second": 125.384, "eval_steps_per_second": 7.899, "step": 4000 }, { "epoch": 0.656, "grad_norm": 0.08633382618427277, "learning_rate": 1.9200000000000003e-05, "loss": 3.4663, "step": 4100 }, { "epoch": 0.656, "eval_all-nli-dev_cosine_accuracy": 0.463, "eval_all-nli-dev_dot_accuracy": 0.3, "eval_all-nli-dev_euclidean_accuracy": 0.463, "eval_all-nli-dev_manhattan_accuracy": 0.465, "eval_all-nli-dev_max_accuracy": 0.465, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9722, "eval_samples_per_second": 125.436, "eval_steps_per_second": 7.902, "step": 4100 }, { "epoch": 0.672, "grad_norm": 0.08357453346252441, "learning_rate": 1.8311111111111114e-05, "loss": 3.4664, "step": 4200 }, { "epoch": 0.672, "eval_all-nli-dev_cosine_accuracy": 0.48, "eval_all-nli-dev_dot_accuracy": 0.343, "eval_all-nli-dev_euclidean_accuracy": 0.48, "eval_all-nli-dev_manhattan_accuracy": 0.484, "eval_all-nli-dev_max_accuracy": 0.484, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9287, "eval_samples_per_second": 126.125, "eval_steps_per_second": 7.946, "step": 4200 }, { "epoch": 0.688, "grad_norm": 0.20162004232406616, "learning_rate": 1.7422222222222222e-05, "loss": 3.4663, "step": 4300 }, { "epoch": 0.688, "eval_all-nli-dev_cosine_accuracy": 0.451, "eval_all-nli-dev_dot_accuracy": 0.354, "eval_all-nli-dev_euclidean_accuracy": 0.451, "eval_all-nli-dev_manhattan_accuracy": 0.469, "eval_all-nli-dev_max_accuracy": 0.469, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9242, "eval_samples_per_second": 126.195, "eval_steps_per_second": 7.95, "step": 4300 }, { "epoch": 0.704, "grad_norm": 0.17491748929023743, "learning_rate": 1.6533333333333333e-05, "loss": 3.4661, "step": 4400 }, { "epoch": 0.704, "eval_all-nli-dev_cosine_accuracy": 0.47, "eval_all-nli-dev_dot_accuracy": 0.367, "eval_all-nli-dev_euclidean_accuracy": 0.47, "eval_all-nli-dev_manhattan_accuracy": 0.478, "eval_all-nli-dev_max_accuracy": 0.478, "eval_loss": 3.4308862686157227, "eval_runtime": 7.989, "eval_samples_per_second": 125.172, "eval_steps_per_second": 7.886, "step": 4400 }, { "epoch": 0.72, "grad_norm": 0.1597195565700531, "learning_rate": 1.5644444444444444e-05, "loss": 3.4669, "step": 4500 }, { "epoch": 0.72, "eval_all-nli-dev_cosine_accuracy": 0.466, "eval_all-nli-dev_dot_accuracy": 0.375, "eval_all-nli-dev_euclidean_accuracy": 0.466, "eval_all-nli-dev_manhattan_accuracy": 0.467, "eval_all-nli-dev_max_accuracy": 0.467, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9327, "eval_samples_per_second": 126.06, "eval_steps_per_second": 7.942, "step": 4500 }, { "epoch": 0.736, "grad_norm": 0.08718598634004593, "learning_rate": 1.4755555555555556e-05, "loss": 3.4664, "step": 4600 }, { "epoch": 0.736, "eval_all-nli-dev_cosine_accuracy": 0.44, "eval_all-nli-dev_dot_accuracy": 0.307, "eval_all-nli-dev_euclidean_accuracy": 0.44, "eval_all-nli-dev_manhattan_accuracy": 0.455, "eval_all-nli-dev_max_accuracy": 0.455, "eval_loss": 3.4308862686157227, "eval_runtime": 7.9594, "eval_samples_per_second": 125.637, "eval_steps_per_second": 7.915, "step": 4600 }, { "epoch": 0.752, "grad_norm": 0.03150181472301483, "learning_rate": 1.3866666666666667e-05, "loss": 3.4664, "step": 4700 }, { "epoch": 0.752, "eval_all-nli-dev_cosine_accuracy": 0.469, "eval_all-nli-dev_dot_accuracy": 0.336, "eval_all-nli-dev_euclidean_accuracy": 0.469, "eval_all-nli-dev_manhattan_accuracy": 0.481, "eval_all-nli-dev_max_accuracy": 0.481, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9652, "eval_samples_per_second": 125.546, "eval_steps_per_second": 7.909, "step": 4700 }, { "epoch": 0.768, "grad_norm": 0.027570225298404694, "learning_rate": 1.2977777777777777e-05, "loss": 3.4659, "step": 4800 }, { "epoch": 0.768, "eval_all-nli-dev_cosine_accuracy": 0.461, "eval_all-nli-dev_dot_accuracy": 0.282, "eval_all-nli-dev_euclidean_accuracy": 0.461, "eval_all-nli-dev_manhattan_accuracy": 0.466, "eval_all-nli-dev_max_accuracy": 0.466, "eval_loss": 3.4308862686157227, "eval_runtime": 8.0206, "eval_samples_per_second": 124.678, "eval_steps_per_second": 7.855, "step": 4800 }, { "epoch": 0.784, "grad_norm": 0.16807572543621063, "learning_rate": 1.208888888888889e-05, "loss": 3.466, "step": 4900 }, { "epoch": 0.784, "eval_all-nli-dev_cosine_accuracy": 0.45, "eval_all-nli-dev_dot_accuracy": 0.375, "eval_all-nli-dev_euclidean_accuracy": 0.45, "eval_all-nli-dev_manhattan_accuracy": 0.451, "eval_all-nli-dev_max_accuracy": 0.451, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9772, "eval_samples_per_second": 125.358, "eval_steps_per_second": 7.898, "step": 4900 }, { "epoch": 0.8, "grad_norm": 0.03361041471362114, "learning_rate": 1.1200000000000001e-05, "loss": 3.466, "step": 5000 }, { "epoch": 0.8, "eval_all-nli-dev_cosine_accuracy": 0.464, "eval_all-nli-dev_dot_accuracy": 0.374, "eval_all-nli-dev_euclidean_accuracy": 0.464, "eval_all-nli-dev_manhattan_accuracy": 0.473, "eval_all-nli-dev_max_accuracy": 0.473, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9287, "eval_samples_per_second": 126.123, "eval_steps_per_second": 7.946, "step": 5000 }, { "epoch": 0.816, "grad_norm": 0.1332545429468155, "learning_rate": 1.031111111111111e-05, "loss": 3.4664, "step": 5100 }, { "epoch": 0.816, "eval_all-nli-dev_cosine_accuracy": 0.44, "eval_all-nli-dev_dot_accuracy": 0.341, "eval_all-nli-dev_euclidean_accuracy": 0.44, "eval_all-nli-dev_manhattan_accuracy": 0.44, "eval_all-nli-dev_max_accuracy": 0.44, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9414, "eval_samples_per_second": 125.922, "eval_steps_per_second": 7.933, "step": 5100 }, { "epoch": 0.832, "grad_norm": 0.029825175181031227, "learning_rate": 9.422222222222222e-06, "loss": 3.4658, "step": 5200 }, { "epoch": 0.832, "eval_all-nli-dev_cosine_accuracy": 0.496, "eval_all-nli-dev_dot_accuracy": 0.356, "eval_all-nli-dev_euclidean_accuracy": 0.496, "eval_all-nli-dev_manhattan_accuracy": 0.497, "eval_all-nli-dev_max_accuracy": 0.497, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9321, "eval_samples_per_second": 126.07, "eval_steps_per_second": 7.942, "step": 5200 }, { "epoch": 0.848, "grad_norm": 0.023120058700442314, "learning_rate": 8.533333333333334e-06, "loss": 3.4664, "step": 5300 }, { "epoch": 0.848, "eval_all-nli-dev_cosine_accuracy": 0.464, "eval_all-nli-dev_dot_accuracy": 0.321, "eval_all-nli-dev_euclidean_accuracy": 0.464, "eval_all-nli-dev_manhattan_accuracy": 0.474, "eval_all-nli-dev_max_accuracy": 0.474, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9178, "eval_samples_per_second": 126.297, "eval_steps_per_second": 7.957, "step": 5300 }, { "epoch": 0.864, "grad_norm": 0.10328388214111328, "learning_rate": 7.644444444444445e-06, "loss": 3.4658, "step": 5400 }, { "epoch": 0.864, "eval_all-nli-dev_cosine_accuracy": 0.446, "eval_all-nli-dev_dot_accuracy": 0.309, "eval_all-nli-dev_euclidean_accuracy": 0.446, "eval_all-nli-dev_manhattan_accuracy": 0.449, "eval_all-nli-dev_max_accuracy": 0.449, "eval_loss": 3.4308860301971436, "eval_runtime": 8.0066, "eval_samples_per_second": 124.897, "eval_steps_per_second": 7.869, "step": 5400 }, { "epoch": 0.88, "grad_norm": 0.03729160130023956, "learning_rate": 6.755555555555555e-06, "loss": 3.4662, "step": 5500 }, { "epoch": 0.88, "eval_all-nli-dev_cosine_accuracy": 0.466, "eval_all-nli-dev_dot_accuracy": 0.311, "eval_all-nli-dev_euclidean_accuracy": 0.466, "eval_all-nli-dev_manhattan_accuracy": 0.464, "eval_all-nli-dev_max_accuracy": 0.466, "eval_loss": 3.4308860301971436, "eval_runtime": 7.964, "eval_samples_per_second": 125.565, "eval_steps_per_second": 7.911, "step": 5500 }, { "epoch": 0.896, "grad_norm": 0.02747642807662487, "learning_rate": 5.866666666666667e-06, "loss": 3.4663, "step": 5600 }, { "epoch": 0.896, "eval_all-nli-dev_cosine_accuracy": 0.476, "eval_all-nli-dev_dot_accuracy": 0.369, "eval_all-nli-dev_euclidean_accuracy": 0.476, "eval_all-nli-dev_manhattan_accuracy": 0.468, "eval_all-nli-dev_max_accuracy": 0.476, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9426, "eval_samples_per_second": 125.903, "eval_steps_per_second": 7.932, "step": 5600 }, { "epoch": 0.912, "grad_norm": 0.053027376532554626, "learning_rate": 4.977777777777778e-06, "loss": 3.4667, "step": 5700 }, { "epoch": 0.912, "eval_all-nli-dev_cosine_accuracy": 0.455, "eval_all-nli-dev_dot_accuracy": 0.369, "eval_all-nli-dev_euclidean_accuracy": 0.455, "eval_all-nli-dev_manhattan_accuracy": 0.454, "eval_all-nli-dev_max_accuracy": 0.455, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9874, "eval_samples_per_second": 125.197, "eval_steps_per_second": 7.887, "step": 5700 }, { "epoch": 0.928, "grad_norm": 0.02996170148253441, "learning_rate": 4.088888888888889e-06, "loss": 3.4669, "step": 5800 }, { "epoch": 0.928, "eval_all-nli-dev_cosine_accuracy": 0.456, "eval_all-nli-dev_dot_accuracy": 0.229, "eval_all-nli-dev_euclidean_accuracy": 0.456, "eval_all-nli-dev_manhattan_accuracy": 0.463, "eval_all-nli-dev_max_accuracy": 0.463, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9568, "eval_samples_per_second": 125.679, "eval_steps_per_second": 7.918, "step": 5800 }, { "epoch": 0.944, "grad_norm": 0.05124541372060776, "learning_rate": 3.2000000000000003e-06, "loss": 3.4657, "step": 5900 }, { "epoch": 0.944, "eval_all-nli-dev_cosine_accuracy": 0.466, "eval_all-nli-dev_dot_accuracy": 0.388, "eval_all-nli-dev_euclidean_accuracy": 0.467, "eval_all-nli-dev_manhattan_accuracy": 0.464, "eval_all-nli-dev_max_accuracy": 0.467, "eval_loss": 3.4308862686157227, "eval_runtime": 7.9623, "eval_samples_per_second": 125.591, "eval_steps_per_second": 7.912, "step": 5900 }, { "epoch": 0.96, "grad_norm": 0.06664357334375381, "learning_rate": 2.311111111111111e-06, "loss": 3.4671, "step": 6000 }, { "epoch": 0.96, "eval_all-nli-dev_cosine_accuracy": 0.452, "eval_all-nli-dev_dot_accuracy": 0.33, "eval_all-nli-dev_euclidean_accuracy": 0.452, "eval_all-nli-dev_manhattan_accuracy": 0.456, "eval_all-nli-dev_max_accuracy": 0.456, "eval_loss": 3.4308862686157227, "eval_runtime": 7.9667, "eval_samples_per_second": 125.523, "eval_steps_per_second": 7.908, "step": 6000 }, { "epoch": 0.976, "grad_norm": 0.18055689334869385, "learning_rate": 1.4222222222222223e-06, "loss": 2.9471, "step": 6100 }, { "epoch": 0.976, "eval_all-nli-dev_cosine_accuracy": 0.479, "eval_all-nli-dev_dot_accuracy": 0.378, "eval_all-nli-dev_euclidean_accuracy": 0.479, "eval_all-nli-dev_manhattan_accuracy": 0.484, "eval_all-nli-dev_max_accuracy": 0.484, "eval_loss": 3.4308860301971436, "eval_runtime": 7.9406, "eval_samples_per_second": 125.936, "eval_steps_per_second": 7.934, "step": 6100 } ], "logging_steps": 100, "max_steps": 6250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }