|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.992, |
|
"eval_steps": 100, |
|
"global_step": 6200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 83.78450775146484, |
|
"learning_rate": 7.44e-06, |
|
"loss": 3.2768, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"eval_all-nli-dev_cosine_accuracy": 0.815, |
|
"eval_all-nli-dev_dot_accuracy": 0.316, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.804, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.833, |
|
"eval_all-nli-dev_max_accuracy": 0.833, |
|
"eval_loss": 1.8052657842636108, |
|
"eval_runtime": 8.0216, |
|
"eval_samples_per_second": 124.664, |
|
"eval_steps_per_second": 7.854, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 62.54200744628906, |
|
"learning_rate": 1.544e-05, |
|
"loss": 1.1697, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"eval_all-nli-dev_cosine_accuracy": 0.842, |
|
"eval_all-nli-dev_dot_accuracy": 0.171, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.838, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.861, |
|
"eval_all-nli-dev_max_accuracy": 0.861, |
|
"eval_loss": 1.2878086566925049, |
|
"eval_runtime": 7.8831, |
|
"eval_samples_per_second": 126.853, |
|
"eval_steps_per_second": 7.992, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 75.54035949707031, |
|
"learning_rate": 2.344e-05, |
|
"loss": 1.372, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"eval_all-nli-dev_cosine_accuracy": 0.841, |
|
"eval_all-nli-dev_dot_accuracy": 0.181, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.844, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.861, |
|
"eval_all-nli-dev_max_accuracy": 0.861, |
|
"eval_loss": 1.2466014623641968, |
|
"eval_runtime": 7.8792, |
|
"eval_samples_per_second": 126.916, |
|
"eval_steps_per_second": 7.996, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 33.85651779174805, |
|
"learning_rate": 3.136e-05, |
|
"loss": 1.0476, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"eval_all-nli-dev_cosine_accuracy": 0.848, |
|
"eval_all-nli-dev_dot_accuracy": 0.201, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.848, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.863, |
|
"eval_all-nli-dev_max_accuracy": 0.863, |
|
"eval_loss": 1.2291330099105835, |
|
"eval_runtime": 7.8507, |
|
"eval_samples_per_second": 127.378, |
|
"eval_steps_per_second": 8.025, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 27.536640167236328, |
|
"learning_rate": 3.936e-05, |
|
"loss": 0.8588, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_all-nli-dev_cosine_accuracy": 0.821, |
|
"eval_all-nli-dev_dot_accuracy": 0.213, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.828, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.838, |
|
"eval_all-nli-dev_max_accuracy": 0.838, |
|
"eval_loss": 1.5258921384811401, |
|
"eval_runtime": 7.8331, |
|
"eval_samples_per_second": 127.663, |
|
"eval_steps_per_second": 8.043, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 0.15947222709655762, |
|
"learning_rate": 4.736000000000001e-05, |
|
"loss": 2.9781, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"eval_all-nli-dev_cosine_accuracy": 0.462, |
|
"eval_all-nli-dev_dot_accuracy": 0.363, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.462, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.463, |
|
"eval_all-nli-dev_max_accuracy": 0.463, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9535, |
|
"eval_samples_per_second": 125.73, |
|
"eval_steps_per_second": 7.921, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 0.09273126721382141, |
|
"learning_rate": 4.9404444444444447e-05, |
|
"loss": 3.4982, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"eval_all-nli-dev_cosine_accuracy": 0.449, |
|
"eval_all-nli-dev_dot_accuracy": 0.381, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.449, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.457, |
|
"eval_all-nli-dev_max_accuracy": 0.457, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.905, |
|
"eval_samples_per_second": 126.501, |
|
"eval_steps_per_second": 7.97, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.047491393983364105, |
|
"learning_rate": 4.851555555555556e-05, |
|
"loss": 3.467, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"eval_all-nli-dev_cosine_accuracy": 0.471, |
|
"eval_all-nli-dev_dot_accuracy": 0.366, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.471, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.479, |
|
"eval_all-nli-dev_max_accuracy": 0.479, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.8623, |
|
"eval_samples_per_second": 127.188, |
|
"eval_steps_per_second": 8.013, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 0.05457993969321251, |
|
"learning_rate": 4.762666666666667e-05, |
|
"loss": 3.4665, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"eval_all-nli-dev_cosine_accuracy": 0.452, |
|
"eval_all-nli-dev_dot_accuracy": 0.342, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.452, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.446, |
|
"eval_all-nli-dev_max_accuracy": 0.452, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 8.0107, |
|
"eval_samples_per_second": 124.832, |
|
"eval_steps_per_second": 7.864, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.03311806544661522, |
|
"learning_rate": 4.673777777777778e-05, |
|
"loss": 3.4664, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_all-nli-dev_cosine_accuracy": 0.477, |
|
"eval_all-nli-dev_dot_accuracy": 0.352, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.477, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.468, |
|
"eval_all-nli-dev_max_accuracy": 0.477, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9233, |
|
"eval_samples_per_second": 126.21, |
|
"eval_steps_per_second": 7.951, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 0.02707391045987606, |
|
"learning_rate": 4.584888888888889e-05, |
|
"loss": 3.4663, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"eval_all-nli-dev_cosine_accuracy": 0.458, |
|
"eval_all-nli-dev_dot_accuracy": 0.376, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.458, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.452, |
|
"eval_all-nli-dev_max_accuracy": 0.458, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.8521, |
|
"eval_samples_per_second": 127.354, |
|
"eval_steps_per_second": 8.023, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.030668186023831367, |
|
"learning_rate": 4.496e-05, |
|
"loss": 3.4661, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"eval_all-nli-dev_cosine_accuracy": 0.46, |
|
"eval_all-nli-dev_dot_accuracy": 0.39, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.46, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.462, |
|
"eval_all-nli-dev_max_accuracy": 0.462, |
|
"eval_loss": 3.4308862686157227, |
|
"eval_runtime": 8.0488, |
|
"eval_samples_per_second": 124.243, |
|
"eval_steps_per_second": 7.827, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 0.04911087080836296, |
|
"learning_rate": 4.4071111111111115e-05, |
|
"loss": 3.4658, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"eval_all-nli-dev_cosine_accuracy": 0.441, |
|
"eval_all-nli-dev_dot_accuracy": 0.338, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.441, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.45, |
|
"eval_all-nli-dev_max_accuracy": 0.45, |
|
"eval_loss": 3.4308857917785645, |
|
"eval_runtime": 7.9015, |
|
"eval_samples_per_second": 126.558, |
|
"eval_steps_per_second": 7.973, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 0.02693816088140011, |
|
"learning_rate": 4.3182222222222226e-05, |
|
"loss": 3.4661, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"eval_all-nli-dev_cosine_accuracy": 0.475, |
|
"eval_all-nli-dev_dot_accuracy": 0.31, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.475, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.481, |
|
"eval_all-nli-dev_max_accuracy": 0.481, |
|
"eval_loss": 3.4308862686157227, |
|
"eval_runtime": 7.9424, |
|
"eval_samples_per_second": 125.906, |
|
"eval_steps_per_second": 7.932, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.17906944453716278, |
|
"learning_rate": 4.229333333333334e-05, |
|
"loss": 3.4877, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_all-nli-dev_cosine_accuracy": 0.464, |
|
"eval_all-nli-dev_dot_accuracy": 0.358, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.464, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.458, |
|
"eval_all-nli-dev_max_accuracy": 0.464, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9335, |
|
"eval_samples_per_second": 126.048, |
|
"eval_steps_per_second": 7.941, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 0.07473237067461014, |
|
"learning_rate": 4.140444444444445e-05, |
|
"loss": 3.4675, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"eval_all-nli-dev_cosine_accuracy": 0.462, |
|
"eval_all-nli-dev_dot_accuracy": 0.347, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.462, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.457, |
|
"eval_all-nli-dev_max_accuracy": 0.462, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9155, |
|
"eval_samples_per_second": 126.335, |
|
"eval_steps_per_second": 7.959, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 0.06472893804311752, |
|
"learning_rate": 4.051555555555556e-05, |
|
"loss": 3.4665, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"eval_all-nli-dev_cosine_accuracy": 0.488, |
|
"eval_all-nli-dev_dot_accuracy": 0.394, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.488, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.487, |
|
"eval_all-nli-dev_max_accuracy": 0.488, |
|
"eval_loss": 3.4308862686157227, |
|
"eval_runtime": 7.7762, |
|
"eval_samples_per_second": 128.597, |
|
"eval_steps_per_second": 8.102, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 0.05817211791872978, |
|
"learning_rate": 3.9626666666666664e-05, |
|
"loss": 3.4667, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"eval_all-nli-dev_cosine_accuracy": 0.492, |
|
"eval_all-nli-dev_dot_accuracy": 0.396, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.492, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.483, |
|
"eval_all-nli-dev_max_accuracy": 0.492, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.8429, |
|
"eval_samples_per_second": 127.504, |
|
"eval_steps_per_second": 8.033, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 0.06118469312787056, |
|
"learning_rate": 3.8737777777777776e-05, |
|
"loss": 3.4664, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"eval_all-nli-dev_cosine_accuracy": 0.452, |
|
"eval_all-nli-dev_dot_accuracy": 0.377, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.452, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.455, |
|
"eval_all-nli-dev_max_accuracy": 0.455, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.8193, |
|
"eval_samples_per_second": 127.888, |
|
"eval_steps_per_second": 8.057, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.034288015216588974, |
|
"learning_rate": 3.784888888888889e-05, |
|
"loss": 3.4661, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_all-nli-dev_cosine_accuracy": 0.446, |
|
"eval_all-nli-dev_dot_accuracy": 0.35, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.446, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.453, |
|
"eval_all-nli-dev_max_accuracy": 0.453, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.8176, |
|
"eval_samples_per_second": 127.916, |
|
"eval_steps_per_second": 8.059, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 0.06111547723412514, |
|
"learning_rate": 3.696e-05, |
|
"loss": 3.4666, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"eval_all-nli-dev_cosine_accuracy": 0.469, |
|
"eval_all-nli-dev_dot_accuracy": 0.39, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.469, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.477, |
|
"eval_all-nli-dev_max_accuracy": 0.477, |
|
"eval_loss": 3.4308862686157227, |
|
"eval_runtime": 7.8728, |
|
"eval_samples_per_second": 127.02, |
|
"eval_steps_per_second": 8.002, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 0.06624756008386612, |
|
"learning_rate": 3.607111111111111e-05, |
|
"loss": 3.4683, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"eval_all-nli-dev_cosine_accuracy": 0.478, |
|
"eval_all-nli-dev_dot_accuracy": 0.338, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.478, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.48, |
|
"eval_all-nli-dev_max_accuracy": 0.48, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9247, |
|
"eval_samples_per_second": 126.188, |
|
"eval_steps_per_second": 7.95, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 0.031209247186779976, |
|
"learning_rate": 3.518222222222222e-05, |
|
"loss": 3.4663, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"eval_all-nli-dev_cosine_accuracy": 0.469, |
|
"eval_all-nli-dev_dot_accuracy": 0.312, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.469, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.464, |
|
"eval_all-nli-dev_max_accuracy": 0.469, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.8393, |
|
"eval_samples_per_second": 127.562, |
|
"eval_steps_per_second": 8.036, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.133990079164505, |
|
"learning_rate": 3.429333333333333e-05, |
|
"loss": 3.4667, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"eval_all-nli-dev_cosine_accuracy": 0.448, |
|
"eval_all-nli-dev_dot_accuracy": 0.404, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.448, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.445, |
|
"eval_all-nli-dev_max_accuracy": 0.448, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 8.021, |
|
"eval_samples_per_second": 124.672, |
|
"eval_steps_per_second": 7.854, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.04892706498503685, |
|
"learning_rate": 3.3404444444444444e-05, |
|
"loss": 3.4669, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_all-nli-dev_cosine_accuracy": 0.499, |
|
"eval_all-nli-dev_dot_accuracy": 0.365, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.499, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.492, |
|
"eval_all-nli-dev_max_accuracy": 0.499, |
|
"eval_loss": 3.4308862686157227, |
|
"eval_runtime": 7.8451, |
|
"eval_samples_per_second": 127.468, |
|
"eval_steps_per_second": 8.03, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.053150493651628494, |
|
"learning_rate": 3.2515555555555555e-05, |
|
"loss": 3.4661, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"eval_all-nli-dev_cosine_accuracy": 0.453, |
|
"eval_all-nli-dev_dot_accuracy": 0.349, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.453, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.45, |
|
"eval_all-nli-dev_max_accuracy": 0.453, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9334, |
|
"eval_samples_per_second": 126.05, |
|
"eval_steps_per_second": 7.941, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 0.09555792808532715, |
|
"learning_rate": 3.1626666666666667e-05, |
|
"loss": 3.4656, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"eval_all-nli-dev_cosine_accuracy": 0.466, |
|
"eval_all-nli-dev_dot_accuracy": 0.336, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.466, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.467, |
|
"eval_all-nli-dev_max_accuracy": 0.467, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.8562, |
|
"eval_samples_per_second": 127.287, |
|
"eval_steps_per_second": 8.019, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 0.01744948886334896, |
|
"learning_rate": 3.0737777777777785e-05, |
|
"loss": 3.4662, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"eval_all-nli-dev_cosine_accuracy": 0.506, |
|
"eval_all-nli-dev_dot_accuracy": 0.4, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.506, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.507, |
|
"eval_all-nli-dev_max_accuracy": 0.507, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.8617, |
|
"eval_samples_per_second": 127.199, |
|
"eval_steps_per_second": 8.014, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 0.2527749538421631, |
|
"learning_rate": 2.986666666666667e-05, |
|
"loss": 3.4902, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"eval_all-nli-dev_cosine_accuracy": 0.463, |
|
"eval_all-nli-dev_dot_accuracy": 0.338, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.463, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.473, |
|
"eval_all-nli-dev_max_accuracy": 0.473, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.8074, |
|
"eval_samples_per_second": 128.083, |
|
"eval_steps_per_second": 8.069, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.02826513536274433, |
|
"learning_rate": 2.897777777777778e-05, |
|
"loss": 3.4663, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_all-nli-dev_cosine_accuracy": 0.454, |
|
"eval_all-nli-dev_dot_accuracy": 0.338, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.454, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.469, |
|
"eval_all-nli-dev_max_accuracy": 0.469, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.8421, |
|
"eval_samples_per_second": 127.516, |
|
"eval_steps_per_second": 8.034, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 0.07985329627990723, |
|
"learning_rate": 2.8088888888888893e-05, |
|
"loss": 3.554, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"eval_all-nli-dev_cosine_accuracy": 0.456, |
|
"eval_all-nli-dev_dot_accuracy": 0.359, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.456, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.46, |
|
"eval_all-nli-dev_max_accuracy": 0.46, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.8756, |
|
"eval_samples_per_second": 126.975, |
|
"eval_steps_per_second": 7.999, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.1259102076292038, |
|
"learning_rate": 2.7200000000000004e-05, |
|
"loss": 3.4664, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"eval_all-nli-dev_cosine_accuracy": 0.455, |
|
"eval_all-nli-dev_dot_accuracy": 0.257, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.455, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.454, |
|
"eval_all-nli-dev_max_accuracy": 0.455, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9019, |
|
"eval_samples_per_second": 126.552, |
|
"eval_steps_per_second": 7.973, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 0.06493524461984634, |
|
"learning_rate": 2.6311111111111115e-05, |
|
"loss": 3.4668, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"eval_all-nli-dev_cosine_accuracy": 0.448, |
|
"eval_all-nli-dev_dot_accuracy": 0.383, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.448, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.46, |
|
"eval_all-nli-dev_max_accuracy": 0.46, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9333, |
|
"eval_samples_per_second": 126.051, |
|
"eval_steps_per_second": 7.941, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 0.0680716335773468, |
|
"learning_rate": 2.5422222222222227e-05, |
|
"loss": 3.4661, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"eval_all-nli-dev_cosine_accuracy": 0.492, |
|
"eval_all-nli-dev_dot_accuracy": 0.407, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.492, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.481, |
|
"eval_all-nli-dev_max_accuracy": 0.492, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9192, |
|
"eval_samples_per_second": 126.276, |
|
"eval_steps_per_second": 7.955, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.04947361350059509, |
|
"learning_rate": 2.4533333333333334e-05, |
|
"loss": 3.4667, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_all-nli-dev_cosine_accuracy": 0.432, |
|
"eval_all-nli-dev_dot_accuracy": 0.391, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.432, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.427, |
|
"eval_all-nli-dev_max_accuracy": 0.432, |
|
"eval_loss": 3.4308857917785645, |
|
"eval_runtime": 7.8702, |
|
"eval_samples_per_second": 127.061, |
|
"eval_steps_per_second": 8.005, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.07287462800741196, |
|
"learning_rate": 2.3644444444444446e-05, |
|
"loss": 3.4668, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"eval_all-nli-dev_cosine_accuracy": 0.482, |
|
"eval_all-nli-dev_dot_accuracy": 0.374, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.482, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.486, |
|
"eval_all-nli-dev_max_accuracy": 0.486, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.8562, |
|
"eval_samples_per_second": 127.288, |
|
"eval_steps_per_second": 8.019, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 0.028324555605649948, |
|
"learning_rate": 2.2755555555555557e-05, |
|
"loss": 3.4666, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"eval_all-nli-dev_cosine_accuracy": 0.469, |
|
"eval_all-nli-dev_dot_accuracy": 0.351, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.469, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.46, |
|
"eval_all-nli-dev_max_accuracy": 0.469, |
|
"eval_loss": 3.4308865070343018, |
|
"eval_runtime": 7.879, |
|
"eval_samples_per_second": 126.919, |
|
"eval_steps_per_second": 7.996, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 0.04251887649297714, |
|
"learning_rate": 2.186666666666667e-05, |
|
"loss": 3.4669, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"eval_all-nli-dev_cosine_accuracy": 0.473, |
|
"eval_all-nli-dev_dot_accuracy": 0.369, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.473, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.467, |
|
"eval_all-nli-dev_max_accuracy": 0.473, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9346, |
|
"eval_samples_per_second": 126.031, |
|
"eval_steps_per_second": 7.94, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 0.01685403846204281, |
|
"learning_rate": 2.097777777777778e-05, |
|
"loss": 3.4658, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"eval_all-nli-dev_cosine_accuracy": 0.487, |
|
"eval_all-nli-dev_dot_accuracy": 0.333, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.487, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.486, |
|
"eval_all-nli-dev_max_accuracy": 0.487, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 8.0469, |
|
"eval_samples_per_second": 124.272, |
|
"eval_steps_per_second": 7.829, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.16402187943458557, |
|
"learning_rate": 2.008888888888889e-05, |
|
"loss": 3.4663, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_all-nli-dev_cosine_accuracy": 0.448, |
|
"eval_all-nli-dev_dot_accuracy": 0.355, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.448, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.442, |
|
"eval_all-nli-dev_max_accuracy": 0.448, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9755, |
|
"eval_samples_per_second": 125.384, |
|
"eval_steps_per_second": 7.899, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 0.08633382618427277, |
|
"learning_rate": 1.9200000000000003e-05, |
|
"loss": 3.4663, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"eval_all-nli-dev_cosine_accuracy": 0.463, |
|
"eval_all-nli-dev_dot_accuracy": 0.3, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.463, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.465, |
|
"eval_all-nli-dev_max_accuracy": 0.465, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9722, |
|
"eval_samples_per_second": 125.436, |
|
"eval_steps_per_second": 7.902, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.08357453346252441, |
|
"learning_rate": 1.8311111111111114e-05, |
|
"loss": 3.4664, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"eval_all-nli-dev_cosine_accuracy": 0.48, |
|
"eval_all-nli-dev_dot_accuracy": 0.343, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.48, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.484, |
|
"eval_all-nli-dev_max_accuracy": 0.484, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9287, |
|
"eval_samples_per_second": 126.125, |
|
"eval_steps_per_second": 7.946, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 0.20162004232406616, |
|
"learning_rate": 1.7422222222222222e-05, |
|
"loss": 3.4663, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"eval_all-nli-dev_cosine_accuracy": 0.451, |
|
"eval_all-nli-dev_dot_accuracy": 0.354, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.451, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.469, |
|
"eval_all-nli-dev_max_accuracy": 0.469, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9242, |
|
"eval_samples_per_second": 126.195, |
|
"eval_steps_per_second": 7.95, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 0.17491748929023743, |
|
"learning_rate": 1.6533333333333333e-05, |
|
"loss": 3.4661, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"eval_all-nli-dev_cosine_accuracy": 0.47, |
|
"eval_all-nli-dev_dot_accuracy": 0.367, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.47, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.478, |
|
"eval_all-nli-dev_max_accuracy": 0.478, |
|
"eval_loss": 3.4308862686157227, |
|
"eval_runtime": 7.989, |
|
"eval_samples_per_second": 125.172, |
|
"eval_steps_per_second": 7.886, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.1597195565700531, |
|
"learning_rate": 1.5644444444444444e-05, |
|
"loss": 3.4669, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_all-nli-dev_cosine_accuracy": 0.466, |
|
"eval_all-nli-dev_dot_accuracy": 0.375, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.466, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.467, |
|
"eval_all-nli-dev_max_accuracy": 0.467, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9327, |
|
"eval_samples_per_second": 126.06, |
|
"eval_steps_per_second": 7.942, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 0.08718598634004593, |
|
"learning_rate": 1.4755555555555556e-05, |
|
"loss": 3.4664, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"eval_all-nli-dev_cosine_accuracy": 0.44, |
|
"eval_all-nli-dev_dot_accuracy": 0.307, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.44, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.455, |
|
"eval_all-nli-dev_max_accuracy": 0.455, |
|
"eval_loss": 3.4308862686157227, |
|
"eval_runtime": 7.9594, |
|
"eval_samples_per_second": 125.637, |
|
"eval_steps_per_second": 7.915, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 0.03150181472301483, |
|
"learning_rate": 1.3866666666666667e-05, |
|
"loss": 3.4664, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"eval_all-nli-dev_cosine_accuracy": 0.469, |
|
"eval_all-nli-dev_dot_accuracy": 0.336, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.469, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.481, |
|
"eval_all-nli-dev_max_accuracy": 0.481, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9652, |
|
"eval_samples_per_second": 125.546, |
|
"eval_steps_per_second": 7.909, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.027570225298404694, |
|
"learning_rate": 1.2977777777777777e-05, |
|
"loss": 3.4659, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"eval_all-nli-dev_cosine_accuracy": 0.461, |
|
"eval_all-nli-dev_dot_accuracy": 0.282, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.461, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.466, |
|
"eval_all-nli-dev_max_accuracy": 0.466, |
|
"eval_loss": 3.4308862686157227, |
|
"eval_runtime": 8.0206, |
|
"eval_samples_per_second": 124.678, |
|
"eval_steps_per_second": 7.855, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 0.16807572543621063, |
|
"learning_rate": 1.208888888888889e-05, |
|
"loss": 3.466, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"eval_all-nli-dev_cosine_accuracy": 0.45, |
|
"eval_all-nli-dev_dot_accuracy": 0.375, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.45, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.451, |
|
"eval_all-nli-dev_max_accuracy": 0.451, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9772, |
|
"eval_samples_per_second": 125.358, |
|
"eval_steps_per_second": 7.898, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.03361041471362114, |
|
"learning_rate": 1.1200000000000001e-05, |
|
"loss": 3.466, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_all-nli-dev_cosine_accuracy": 0.464, |
|
"eval_all-nli-dev_dot_accuracy": 0.374, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.464, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.473, |
|
"eval_all-nli-dev_max_accuracy": 0.473, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9287, |
|
"eval_samples_per_second": 126.123, |
|
"eval_steps_per_second": 7.946, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 0.1332545429468155, |
|
"learning_rate": 1.031111111111111e-05, |
|
"loss": 3.4664, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"eval_all-nli-dev_cosine_accuracy": 0.44, |
|
"eval_all-nli-dev_dot_accuracy": 0.341, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.44, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.44, |
|
"eval_all-nli-dev_max_accuracy": 0.44, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9414, |
|
"eval_samples_per_second": 125.922, |
|
"eval_steps_per_second": 7.933, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.029825175181031227, |
|
"learning_rate": 9.422222222222222e-06, |
|
"loss": 3.4658, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"eval_all-nli-dev_cosine_accuracy": 0.496, |
|
"eval_all-nli-dev_dot_accuracy": 0.356, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.496, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.497, |
|
"eval_all-nli-dev_max_accuracy": 0.497, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9321, |
|
"eval_samples_per_second": 126.07, |
|
"eval_steps_per_second": 7.942, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 0.023120058700442314, |
|
"learning_rate": 8.533333333333334e-06, |
|
"loss": 3.4664, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"eval_all-nli-dev_cosine_accuracy": 0.464, |
|
"eval_all-nli-dev_dot_accuracy": 0.321, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.464, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.474, |
|
"eval_all-nli-dev_max_accuracy": 0.474, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9178, |
|
"eval_samples_per_second": 126.297, |
|
"eval_steps_per_second": 7.957, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.10328388214111328, |
|
"learning_rate": 7.644444444444445e-06, |
|
"loss": 3.4658, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"eval_all-nli-dev_cosine_accuracy": 0.446, |
|
"eval_all-nli-dev_dot_accuracy": 0.309, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.446, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.449, |
|
"eval_all-nli-dev_max_accuracy": 0.449, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 8.0066, |
|
"eval_samples_per_second": 124.897, |
|
"eval_steps_per_second": 7.869, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.03729160130023956, |
|
"learning_rate": 6.755555555555555e-06, |
|
"loss": 3.4662, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_all-nli-dev_cosine_accuracy": 0.466, |
|
"eval_all-nli-dev_dot_accuracy": 0.311, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.466, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.464, |
|
"eval_all-nli-dev_max_accuracy": 0.466, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.964, |
|
"eval_samples_per_second": 125.565, |
|
"eval_steps_per_second": 7.911, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.02747642807662487, |
|
"learning_rate": 5.866666666666667e-06, |
|
"loss": 3.4663, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"eval_all-nli-dev_cosine_accuracy": 0.476, |
|
"eval_all-nli-dev_dot_accuracy": 0.369, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.476, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.468, |
|
"eval_all-nli-dev_max_accuracy": 0.476, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9426, |
|
"eval_samples_per_second": 125.903, |
|
"eval_steps_per_second": 7.932, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 0.053027376532554626, |
|
"learning_rate": 4.977777777777778e-06, |
|
"loss": 3.4667, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"eval_all-nli-dev_cosine_accuracy": 0.455, |
|
"eval_all-nli-dev_dot_accuracy": 0.369, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.455, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.454, |
|
"eval_all-nli-dev_max_accuracy": 0.455, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9874, |
|
"eval_samples_per_second": 125.197, |
|
"eval_steps_per_second": 7.887, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 0.02996170148253441, |
|
"learning_rate": 4.088888888888889e-06, |
|
"loss": 3.4669, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"eval_all-nli-dev_cosine_accuracy": 0.456, |
|
"eval_all-nli-dev_dot_accuracy": 0.229, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.456, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.463, |
|
"eval_all-nli-dev_max_accuracy": 0.463, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9568, |
|
"eval_samples_per_second": 125.679, |
|
"eval_steps_per_second": 7.918, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 0.05124541372060776, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 3.4657, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"eval_all-nli-dev_cosine_accuracy": 0.466, |
|
"eval_all-nli-dev_dot_accuracy": 0.388, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.467, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.464, |
|
"eval_all-nli-dev_max_accuracy": 0.467, |
|
"eval_loss": 3.4308862686157227, |
|
"eval_runtime": 7.9623, |
|
"eval_samples_per_second": 125.591, |
|
"eval_steps_per_second": 7.912, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.06664357334375381, |
|
"learning_rate": 2.311111111111111e-06, |
|
"loss": 3.4671, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_all-nli-dev_cosine_accuracy": 0.452, |
|
"eval_all-nli-dev_dot_accuracy": 0.33, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.452, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.456, |
|
"eval_all-nli-dev_max_accuracy": 0.456, |
|
"eval_loss": 3.4308862686157227, |
|
"eval_runtime": 7.9667, |
|
"eval_samples_per_second": 125.523, |
|
"eval_steps_per_second": 7.908, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 0.18055689334869385, |
|
"learning_rate": 1.4222222222222223e-06, |
|
"loss": 2.9471, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"eval_all-nli-dev_cosine_accuracy": 0.479, |
|
"eval_all-nli-dev_dot_accuracy": 0.378, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.479, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.484, |
|
"eval_all-nli-dev_max_accuracy": 0.484, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9406, |
|
"eval_samples_per_second": 125.936, |
|
"eval_steps_per_second": 7.934, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 0.10885396599769592, |
|
"learning_rate": 5.333333333333333e-07, |
|
"loss": 0.6929, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"eval_all-nli-dev_cosine_accuracy": 0.452, |
|
"eval_all-nli-dev_dot_accuracy": 0.34, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.452, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.456, |
|
"eval_all-nli-dev_max_accuracy": 0.456, |
|
"eval_loss": 3.4308860301971436, |
|
"eval_runtime": 7.9933, |
|
"eval_samples_per_second": 125.105, |
|
"eval_steps_per_second": 7.882, |
|
"step": 6200 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 6250, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|