manu's picture
Upload folder using huggingface_hub
264b250 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.992,
"eval_steps": 100,
"global_step": 6200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016,
"grad_norm": 83.78450775146484,
"learning_rate": 7.44e-06,
"loss": 3.2768,
"step": 100
},
{
"epoch": 0.016,
"eval_all-nli-dev_cosine_accuracy": 0.815,
"eval_all-nli-dev_dot_accuracy": 0.316,
"eval_all-nli-dev_euclidean_accuracy": 0.804,
"eval_all-nli-dev_manhattan_accuracy": 0.833,
"eval_all-nli-dev_max_accuracy": 0.833,
"eval_loss": 1.8052657842636108,
"eval_runtime": 8.0216,
"eval_samples_per_second": 124.664,
"eval_steps_per_second": 7.854,
"step": 100
},
{
"epoch": 0.032,
"grad_norm": 62.54200744628906,
"learning_rate": 1.544e-05,
"loss": 1.1697,
"step": 200
},
{
"epoch": 0.032,
"eval_all-nli-dev_cosine_accuracy": 0.842,
"eval_all-nli-dev_dot_accuracy": 0.171,
"eval_all-nli-dev_euclidean_accuracy": 0.838,
"eval_all-nli-dev_manhattan_accuracy": 0.861,
"eval_all-nli-dev_max_accuracy": 0.861,
"eval_loss": 1.2878086566925049,
"eval_runtime": 7.8831,
"eval_samples_per_second": 126.853,
"eval_steps_per_second": 7.992,
"step": 200
},
{
"epoch": 0.048,
"grad_norm": 75.54035949707031,
"learning_rate": 2.344e-05,
"loss": 1.372,
"step": 300
},
{
"epoch": 0.048,
"eval_all-nli-dev_cosine_accuracy": 0.841,
"eval_all-nli-dev_dot_accuracy": 0.181,
"eval_all-nli-dev_euclidean_accuracy": 0.844,
"eval_all-nli-dev_manhattan_accuracy": 0.861,
"eval_all-nli-dev_max_accuracy": 0.861,
"eval_loss": 1.2466014623641968,
"eval_runtime": 7.8792,
"eval_samples_per_second": 126.916,
"eval_steps_per_second": 7.996,
"step": 300
},
{
"epoch": 0.064,
"grad_norm": 33.85651779174805,
"learning_rate": 3.136e-05,
"loss": 1.0476,
"step": 400
},
{
"epoch": 0.064,
"eval_all-nli-dev_cosine_accuracy": 0.848,
"eval_all-nli-dev_dot_accuracy": 0.201,
"eval_all-nli-dev_euclidean_accuracy": 0.848,
"eval_all-nli-dev_manhattan_accuracy": 0.863,
"eval_all-nli-dev_max_accuracy": 0.863,
"eval_loss": 1.2291330099105835,
"eval_runtime": 7.8507,
"eval_samples_per_second": 127.378,
"eval_steps_per_second": 8.025,
"step": 400
},
{
"epoch": 0.08,
"grad_norm": 27.536640167236328,
"learning_rate": 3.936e-05,
"loss": 0.8588,
"step": 500
},
{
"epoch": 0.08,
"eval_all-nli-dev_cosine_accuracy": 0.821,
"eval_all-nli-dev_dot_accuracy": 0.213,
"eval_all-nli-dev_euclidean_accuracy": 0.828,
"eval_all-nli-dev_manhattan_accuracy": 0.838,
"eval_all-nli-dev_max_accuracy": 0.838,
"eval_loss": 1.5258921384811401,
"eval_runtime": 7.8331,
"eval_samples_per_second": 127.663,
"eval_steps_per_second": 8.043,
"step": 500
},
{
"epoch": 0.096,
"grad_norm": 0.15947222709655762,
"learning_rate": 4.736000000000001e-05,
"loss": 2.9781,
"step": 600
},
{
"epoch": 0.096,
"eval_all-nli-dev_cosine_accuracy": 0.462,
"eval_all-nli-dev_dot_accuracy": 0.363,
"eval_all-nli-dev_euclidean_accuracy": 0.462,
"eval_all-nli-dev_manhattan_accuracy": 0.463,
"eval_all-nli-dev_max_accuracy": 0.463,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9535,
"eval_samples_per_second": 125.73,
"eval_steps_per_second": 7.921,
"step": 600
},
{
"epoch": 0.112,
"grad_norm": 0.09273126721382141,
"learning_rate": 4.9404444444444447e-05,
"loss": 3.4982,
"step": 700
},
{
"epoch": 0.112,
"eval_all-nli-dev_cosine_accuracy": 0.449,
"eval_all-nli-dev_dot_accuracy": 0.381,
"eval_all-nli-dev_euclidean_accuracy": 0.449,
"eval_all-nli-dev_manhattan_accuracy": 0.457,
"eval_all-nli-dev_max_accuracy": 0.457,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.905,
"eval_samples_per_second": 126.501,
"eval_steps_per_second": 7.97,
"step": 700
},
{
"epoch": 0.128,
"grad_norm": 0.047491393983364105,
"learning_rate": 4.851555555555556e-05,
"loss": 3.467,
"step": 800
},
{
"epoch": 0.128,
"eval_all-nli-dev_cosine_accuracy": 0.471,
"eval_all-nli-dev_dot_accuracy": 0.366,
"eval_all-nli-dev_euclidean_accuracy": 0.471,
"eval_all-nli-dev_manhattan_accuracy": 0.479,
"eval_all-nli-dev_max_accuracy": 0.479,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.8623,
"eval_samples_per_second": 127.188,
"eval_steps_per_second": 8.013,
"step": 800
},
{
"epoch": 0.144,
"grad_norm": 0.05457993969321251,
"learning_rate": 4.762666666666667e-05,
"loss": 3.4665,
"step": 900
},
{
"epoch": 0.144,
"eval_all-nli-dev_cosine_accuracy": 0.452,
"eval_all-nli-dev_dot_accuracy": 0.342,
"eval_all-nli-dev_euclidean_accuracy": 0.452,
"eval_all-nli-dev_manhattan_accuracy": 0.446,
"eval_all-nli-dev_max_accuracy": 0.452,
"eval_loss": 3.4308860301971436,
"eval_runtime": 8.0107,
"eval_samples_per_second": 124.832,
"eval_steps_per_second": 7.864,
"step": 900
},
{
"epoch": 0.16,
"grad_norm": 0.03311806544661522,
"learning_rate": 4.673777777777778e-05,
"loss": 3.4664,
"step": 1000
},
{
"epoch": 0.16,
"eval_all-nli-dev_cosine_accuracy": 0.477,
"eval_all-nli-dev_dot_accuracy": 0.352,
"eval_all-nli-dev_euclidean_accuracy": 0.477,
"eval_all-nli-dev_manhattan_accuracy": 0.468,
"eval_all-nli-dev_max_accuracy": 0.477,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9233,
"eval_samples_per_second": 126.21,
"eval_steps_per_second": 7.951,
"step": 1000
},
{
"epoch": 0.176,
"grad_norm": 0.02707391045987606,
"learning_rate": 4.584888888888889e-05,
"loss": 3.4663,
"step": 1100
},
{
"epoch": 0.176,
"eval_all-nli-dev_cosine_accuracy": 0.458,
"eval_all-nli-dev_dot_accuracy": 0.376,
"eval_all-nli-dev_euclidean_accuracy": 0.458,
"eval_all-nli-dev_manhattan_accuracy": 0.452,
"eval_all-nli-dev_max_accuracy": 0.458,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.8521,
"eval_samples_per_second": 127.354,
"eval_steps_per_second": 8.023,
"step": 1100
},
{
"epoch": 0.192,
"grad_norm": 0.030668186023831367,
"learning_rate": 4.496e-05,
"loss": 3.4661,
"step": 1200
},
{
"epoch": 0.192,
"eval_all-nli-dev_cosine_accuracy": 0.46,
"eval_all-nli-dev_dot_accuracy": 0.39,
"eval_all-nli-dev_euclidean_accuracy": 0.46,
"eval_all-nli-dev_manhattan_accuracy": 0.462,
"eval_all-nli-dev_max_accuracy": 0.462,
"eval_loss": 3.4308862686157227,
"eval_runtime": 8.0488,
"eval_samples_per_second": 124.243,
"eval_steps_per_second": 7.827,
"step": 1200
},
{
"epoch": 0.208,
"grad_norm": 0.04911087080836296,
"learning_rate": 4.4071111111111115e-05,
"loss": 3.4658,
"step": 1300
},
{
"epoch": 0.208,
"eval_all-nli-dev_cosine_accuracy": 0.441,
"eval_all-nli-dev_dot_accuracy": 0.338,
"eval_all-nli-dev_euclidean_accuracy": 0.441,
"eval_all-nli-dev_manhattan_accuracy": 0.45,
"eval_all-nli-dev_max_accuracy": 0.45,
"eval_loss": 3.4308857917785645,
"eval_runtime": 7.9015,
"eval_samples_per_second": 126.558,
"eval_steps_per_second": 7.973,
"step": 1300
},
{
"epoch": 0.224,
"grad_norm": 0.02693816088140011,
"learning_rate": 4.3182222222222226e-05,
"loss": 3.4661,
"step": 1400
},
{
"epoch": 0.224,
"eval_all-nli-dev_cosine_accuracy": 0.475,
"eval_all-nli-dev_dot_accuracy": 0.31,
"eval_all-nli-dev_euclidean_accuracy": 0.475,
"eval_all-nli-dev_manhattan_accuracy": 0.481,
"eval_all-nli-dev_max_accuracy": 0.481,
"eval_loss": 3.4308862686157227,
"eval_runtime": 7.9424,
"eval_samples_per_second": 125.906,
"eval_steps_per_second": 7.932,
"step": 1400
},
{
"epoch": 0.24,
"grad_norm": 0.17906944453716278,
"learning_rate": 4.229333333333334e-05,
"loss": 3.4877,
"step": 1500
},
{
"epoch": 0.24,
"eval_all-nli-dev_cosine_accuracy": 0.464,
"eval_all-nli-dev_dot_accuracy": 0.358,
"eval_all-nli-dev_euclidean_accuracy": 0.464,
"eval_all-nli-dev_manhattan_accuracy": 0.458,
"eval_all-nli-dev_max_accuracy": 0.464,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9335,
"eval_samples_per_second": 126.048,
"eval_steps_per_second": 7.941,
"step": 1500
},
{
"epoch": 0.256,
"grad_norm": 0.07473237067461014,
"learning_rate": 4.140444444444445e-05,
"loss": 3.4675,
"step": 1600
},
{
"epoch": 0.256,
"eval_all-nli-dev_cosine_accuracy": 0.462,
"eval_all-nli-dev_dot_accuracy": 0.347,
"eval_all-nli-dev_euclidean_accuracy": 0.462,
"eval_all-nli-dev_manhattan_accuracy": 0.457,
"eval_all-nli-dev_max_accuracy": 0.462,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9155,
"eval_samples_per_second": 126.335,
"eval_steps_per_second": 7.959,
"step": 1600
},
{
"epoch": 0.272,
"grad_norm": 0.06472893804311752,
"learning_rate": 4.051555555555556e-05,
"loss": 3.4665,
"step": 1700
},
{
"epoch": 0.272,
"eval_all-nli-dev_cosine_accuracy": 0.488,
"eval_all-nli-dev_dot_accuracy": 0.394,
"eval_all-nli-dev_euclidean_accuracy": 0.488,
"eval_all-nli-dev_manhattan_accuracy": 0.487,
"eval_all-nli-dev_max_accuracy": 0.488,
"eval_loss": 3.4308862686157227,
"eval_runtime": 7.7762,
"eval_samples_per_second": 128.597,
"eval_steps_per_second": 8.102,
"step": 1700
},
{
"epoch": 0.288,
"grad_norm": 0.05817211791872978,
"learning_rate": 3.9626666666666664e-05,
"loss": 3.4667,
"step": 1800
},
{
"epoch": 0.288,
"eval_all-nli-dev_cosine_accuracy": 0.492,
"eval_all-nli-dev_dot_accuracy": 0.396,
"eval_all-nli-dev_euclidean_accuracy": 0.492,
"eval_all-nli-dev_manhattan_accuracy": 0.483,
"eval_all-nli-dev_max_accuracy": 0.492,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.8429,
"eval_samples_per_second": 127.504,
"eval_steps_per_second": 8.033,
"step": 1800
},
{
"epoch": 0.304,
"grad_norm": 0.06118469312787056,
"learning_rate": 3.8737777777777776e-05,
"loss": 3.4664,
"step": 1900
},
{
"epoch": 0.304,
"eval_all-nli-dev_cosine_accuracy": 0.452,
"eval_all-nli-dev_dot_accuracy": 0.377,
"eval_all-nli-dev_euclidean_accuracy": 0.452,
"eval_all-nli-dev_manhattan_accuracy": 0.455,
"eval_all-nli-dev_max_accuracy": 0.455,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.8193,
"eval_samples_per_second": 127.888,
"eval_steps_per_second": 8.057,
"step": 1900
},
{
"epoch": 0.32,
"grad_norm": 0.034288015216588974,
"learning_rate": 3.784888888888889e-05,
"loss": 3.4661,
"step": 2000
},
{
"epoch": 0.32,
"eval_all-nli-dev_cosine_accuracy": 0.446,
"eval_all-nli-dev_dot_accuracy": 0.35,
"eval_all-nli-dev_euclidean_accuracy": 0.446,
"eval_all-nli-dev_manhattan_accuracy": 0.453,
"eval_all-nli-dev_max_accuracy": 0.453,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.8176,
"eval_samples_per_second": 127.916,
"eval_steps_per_second": 8.059,
"step": 2000
},
{
"epoch": 0.336,
"grad_norm": 0.06111547723412514,
"learning_rate": 3.696e-05,
"loss": 3.4666,
"step": 2100
},
{
"epoch": 0.336,
"eval_all-nli-dev_cosine_accuracy": 0.469,
"eval_all-nli-dev_dot_accuracy": 0.39,
"eval_all-nli-dev_euclidean_accuracy": 0.469,
"eval_all-nli-dev_manhattan_accuracy": 0.477,
"eval_all-nli-dev_max_accuracy": 0.477,
"eval_loss": 3.4308862686157227,
"eval_runtime": 7.8728,
"eval_samples_per_second": 127.02,
"eval_steps_per_second": 8.002,
"step": 2100
},
{
"epoch": 0.352,
"grad_norm": 0.06624756008386612,
"learning_rate": 3.607111111111111e-05,
"loss": 3.4683,
"step": 2200
},
{
"epoch": 0.352,
"eval_all-nli-dev_cosine_accuracy": 0.478,
"eval_all-nli-dev_dot_accuracy": 0.338,
"eval_all-nli-dev_euclidean_accuracy": 0.478,
"eval_all-nli-dev_manhattan_accuracy": 0.48,
"eval_all-nli-dev_max_accuracy": 0.48,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9247,
"eval_samples_per_second": 126.188,
"eval_steps_per_second": 7.95,
"step": 2200
},
{
"epoch": 0.368,
"grad_norm": 0.031209247186779976,
"learning_rate": 3.518222222222222e-05,
"loss": 3.4663,
"step": 2300
},
{
"epoch": 0.368,
"eval_all-nli-dev_cosine_accuracy": 0.469,
"eval_all-nli-dev_dot_accuracy": 0.312,
"eval_all-nli-dev_euclidean_accuracy": 0.469,
"eval_all-nli-dev_manhattan_accuracy": 0.464,
"eval_all-nli-dev_max_accuracy": 0.469,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.8393,
"eval_samples_per_second": 127.562,
"eval_steps_per_second": 8.036,
"step": 2300
},
{
"epoch": 0.384,
"grad_norm": 0.133990079164505,
"learning_rate": 3.429333333333333e-05,
"loss": 3.4667,
"step": 2400
},
{
"epoch": 0.384,
"eval_all-nli-dev_cosine_accuracy": 0.448,
"eval_all-nli-dev_dot_accuracy": 0.404,
"eval_all-nli-dev_euclidean_accuracy": 0.448,
"eval_all-nli-dev_manhattan_accuracy": 0.445,
"eval_all-nli-dev_max_accuracy": 0.448,
"eval_loss": 3.4308860301971436,
"eval_runtime": 8.021,
"eval_samples_per_second": 124.672,
"eval_steps_per_second": 7.854,
"step": 2400
},
{
"epoch": 0.4,
"grad_norm": 0.04892706498503685,
"learning_rate": 3.3404444444444444e-05,
"loss": 3.4669,
"step": 2500
},
{
"epoch": 0.4,
"eval_all-nli-dev_cosine_accuracy": 0.499,
"eval_all-nli-dev_dot_accuracy": 0.365,
"eval_all-nli-dev_euclidean_accuracy": 0.499,
"eval_all-nli-dev_manhattan_accuracy": 0.492,
"eval_all-nli-dev_max_accuracy": 0.499,
"eval_loss": 3.4308862686157227,
"eval_runtime": 7.8451,
"eval_samples_per_second": 127.468,
"eval_steps_per_second": 8.03,
"step": 2500
},
{
"epoch": 0.416,
"grad_norm": 0.053150493651628494,
"learning_rate": 3.2515555555555555e-05,
"loss": 3.4661,
"step": 2600
},
{
"epoch": 0.416,
"eval_all-nli-dev_cosine_accuracy": 0.453,
"eval_all-nli-dev_dot_accuracy": 0.349,
"eval_all-nli-dev_euclidean_accuracy": 0.453,
"eval_all-nli-dev_manhattan_accuracy": 0.45,
"eval_all-nli-dev_max_accuracy": 0.453,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9334,
"eval_samples_per_second": 126.05,
"eval_steps_per_second": 7.941,
"step": 2600
},
{
"epoch": 0.432,
"grad_norm": 0.09555792808532715,
"learning_rate": 3.1626666666666667e-05,
"loss": 3.4656,
"step": 2700
},
{
"epoch": 0.432,
"eval_all-nli-dev_cosine_accuracy": 0.466,
"eval_all-nli-dev_dot_accuracy": 0.336,
"eval_all-nli-dev_euclidean_accuracy": 0.466,
"eval_all-nli-dev_manhattan_accuracy": 0.467,
"eval_all-nli-dev_max_accuracy": 0.467,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.8562,
"eval_samples_per_second": 127.287,
"eval_steps_per_second": 8.019,
"step": 2700
},
{
"epoch": 0.448,
"grad_norm": 0.01744948886334896,
"learning_rate": 3.0737777777777785e-05,
"loss": 3.4662,
"step": 2800
},
{
"epoch": 0.448,
"eval_all-nli-dev_cosine_accuracy": 0.506,
"eval_all-nli-dev_dot_accuracy": 0.4,
"eval_all-nli-dev_euclidean_accuracy": 0.506,
"eval_all-nli-dev_manhattan_accuracy": 0.507,
"eval_all-nli-dev_max_accuracy": 0.507,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.8617,
"eval_samples_per_second": 127.199,
"eval_steps_per_second": 8.014,
"step": 2800
},
{
"epoch": 0.464,
"grad_norm": 0.2527749538421631,
"learning_rate": 2.986666666666667e-05,
"loss": 3.4902,
"step": 2900
},
{
"epoch": 0.464,
"eval_all-nli-dev_cosine_accuracy": 0.463,
"eval_all-nli-dev_dot_accuracy": 0.338,
"eval_all-nli-dev_euclidean_accuracy": 0.463,
"eval_all-nli-dev_manhattan_accuracy": 0.473,
"eval_all-nli-dev_max_accuracy": 0.473,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.8074,
"eval_samples_per_second": 128.083,
"eval_steps_per_second": 8.069,
"step": 2900
},
{
"epoch": 0.48,
"grad_norm": 0.02826513536274433,
"learning_rate": 2.897777777777778e-05,
"loss": 3.4663,
"step": 3000
},
{
"epoch": 0.48,
"eval_all-nli-dev_cosine_accuracy": 0.454,
"eval_all-nli-dev_dot_accuracy": 0.338,
"eval_all-nli-dev_euclidean_accuracy": 0.454,
"eval_all-nli-dev_manhattan_accuracy": 0.469,
"eval_all-nli-dev_max_accuracy": 0.469,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.8421,
"eval_samples_per_second": 127.516,
"eval_steps_per_second": 8.034,
"step": 3000
},
{
"epoch": 0.496,
"grad_norm": 0.07985329627990723,
"learning_rate": 2.8088888888888893e-05,
"loss": 3.554,
"step": 3100
},
{
"epoch": 0.496,
"eval_all-nli-dev_cosine_accuracy": 0.456,
"eval_all-nli-dev_dot_accuracy": 0.359,
"eval_all-nli-dev_euclidean_accuracy": 0.456,
"eval_all-nli-dev_manhattan_accuracy": 0.46,
"eval_all-nli-dev_max_accuracy": 0.46,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.8756,
"eval_samples_per_second": 126.975,
"eval_steps_per_second": 7.999,
"step": 3100
},
{
"epoch": 0.512,
"grad_norm": 0.1259102076292038,
"learning_rate": 2.7200000000000004e-05,
"loss": 3.4664,
"step": 3200
},
{
"epoch": 0.512,
"eval_all-nli-dev_cosine_accuracy": 0.455,
"eval_all-nli-dev_dot_accuracy": 0.257,
"eval_all-nli-dev_euclidean_accuracy": 0.455,
"eval_all-nli-dev_manhattan_accuracy": 0.454,
"eval_all-nli-dev_max_accuracy": 0.455,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9019,
"eval_samples_per_second": 126.552,
"eval_steps_per_second": 7.973,
"step": 3200
},
{
"epoch": 0.528,
"grad_norm": 0.06493524461984634,
"learning_rate": 2.6311111111111115e-05,
"loss": 3.4668,
"step": 3300
},
{
"epoch": 0.528,
"eval_all-nli-dev_cosine_accuracy": 0.448,
"eval_all-nli-dev_dot_accuracy": 0.383,
"eval_all-nli-dev_euclidean_accuracy": 0.448,
"eval_all-nli-dev_manhattan_accuracy": 0.46,
"eval_all-nli-dev_max_accuracy": 0.46,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9333,
"eval_samples_per_second": 126.051,
"eval_steps_per_second": 7.941,
"step": 3300
},
{
"epoch": 0.544,
"grad_norm": 0.0680716335773468,
"learning_rate": 2.5422222222222227e-05,
"loss": 3.4661,
"step": 3400
},
{
"epoch": 0.544,
"eval_all-nli-dev_cosine_accuracy": 0.492,
"eval_all-nli-dev_dot_accuracy": 0.407,
"eval_all-nli-dev_euclidean_accuracy": 0.492,
"eval_all-nli-dev_manhattan_accuracy": 0.481,
"eval_all-nli-dev_max_accuracy": 0.492,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9192,
"eval_samples_per_second": 126.276,
"eval_steps_per_second": 7.955,
"step": 3400
},
{
"epoch": 0.56,
"grad_norm": 0.04947361350059509,
"learning_rate": 2.4533333333333334e-05,
"loss": 3.4667,
"step": 3500
},
{
"epoch": 0.56,
"eval_all-nli-dev_cosine_accuracy": 0.432,
"eval_all-nli-dev_dot_accuracy": 0.391,
"eval_all-nli-dev_euclidean_accuracy": 0.432,
"eval_all-nli-dev_manhattan_accuracy": 0.427,
"eval_all-nli-dev_max_accuracy": 0.432,
"eval_loss": 3.4308857917785645,
"eval_runtime": 7.8702,
"eval_samples_per_second": 127.061,
"eval_steps_per_second": 8.005,
"step": 3500
},
{
"epoch": 0.576,
"grad_norm": 0.07287462800741196,
"learning_rate": 2.3644444444444446e-05,
"loss": 3.4668,
"step": 3600
},
{
"epoch": 0.576,
"eval_all-nli-dev_cosine_accuracy": 0.482,
"eval_all-nli-dev_dot_accuracy": 0.374,
"eval_all-nli-dev_euclidean_accuracy": 0.482,
"eval_all-nli-dev_manhattan_accuracy": 0.486,
"eval_all-nli-dev_max_accuracy": 0.486,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.8562,
"eval_samples_per_second": 127.288,
"eval_steps_per_second": 8.019,
"step": 3600
},
{
"epoch": 0.592,
"grad_norm": 0.028324555605649948,
"learning_rate": 2.2755555555555557e-05,
"loss": 3.4666,
"step": 3700
},
{
"epoch": 0.592,
"eval_all-nli-dev_cosine_accuracy": 0.469,
"eval_all-nli-dev_dot_accuracy": 0.351,
"eval_all-nli-dev_euclidean_accuracy": 0.469,
"eval_all-nli-dev_manhattan_accuracy": 0.46,
"eval_all-nli-dev_max_accuracy": 0.469,
"eval_loss": 3.4308865070343018,
"eval_runtime": 7.879,
"eval_samples_per_second": 126.919,
"eval_steps_per_second": 7.996,
"step": 3700
},
{
"epoch": 0.608,
"grad_norm": 0.04251887649297714,
"learning_rate": 2.186666666666667e-05,
"loss": 3.4669,
"step": 3800
},
{
"epoch": 0.608,
"eval_all-nli-dev_cosine_accuracy": 0.473,
"eval_all-nli-dev_dot_accuracy": 0.369,
"eval_all-nli-dev_euclidean_accuracy": 0.473,
"eval_all-nli-dev_manhattan_accuracy": 0.467,
"eval_all-nli-dev_max_accuracy": 0.473,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9346,
"eval_samples_per_second": 126.031,
"eval_steps_per_second": 7.94,
"step": 3800
},
{
"epoch": 0.624,
"grad_norm": 0.01685403846204281,
"learning_rate": 2.097777777777778e-05,
"loss": 3.4658,
"step": 3900
},
{
"epoch": 0.624,
"eval_all-nli-dev_cosine_accuracy": 0.487,
"eval_all-nli-dev_dot_accuracy": 0.333,
"eval_all-nli-dev_euclidean_accuracy": 0.487,
"eval_all-nli-dev_manhattan_accuracy": 0.486,
"eval_all-nli-dev_max_accuracy": 0.487,
"eval_loss": 3.4308860301971436,
"eval_runtime": 8.0469,
"eval_samples_per_second": 124.272,
"eval_steps_per_second": 7.829,
"step": 3900
},
{
"epoch": 0.64,
"grad_norm": 0.16402187943458557,
"learning_rate": 2.008888888888889e-05,
"loss": 3.4663,
"step": 4000
},
{
"epoch": 0.64,
"eval_all-nli-dev_cosine_accuracy": 0.448,
"eval_all-nli-dev_dot_accuracy": 0.355,
"eval_all-nli-dev_euclidean_accuracy": 0.448,
"eval_all-nli-dev_manhattan_accuracy": 0.442,
"eval_all-nli-dev_max_accuracy": 0.448,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9755,
"eval_samples_per_second": 125.384,
"eval_steps_per_second": 7.899,
"step": 4000
},
{
"epoch": 0.656,
"grad_norm": 0.08633382618427277,
"learning_rate": 1.9200000000000003e-05,
"loss": 3.4663,
"step": 4100
},
{
"epoch": 0.656,
"eval_all-nli-dev_cosine_accuracy": 0.463,
"eval_all-nli-dev_dot_accuracy": 0.3,
"eval_all-nli-dev_euclidean_accuracy": 0.463,
"eval_all-nli-dev_manhattan_accuracy": 0.465,
"eval_all-nli-dev_max_accuracy": 0.465,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9722,
"eval_samples_per_second": 125.436,
"eval_steps_per_second": 7.902,
"step": 4100
},
{
"epoch": 0.672,
"grad_norm": 0.08357453346252441,
"learning_rate": 1.8311111111111114e-05,
"loss": 3.4664,
"step": 4200
},
{
"epoch": 0.672,
"eval_all-nli-dev_cosine_accuracy": 0.48,
"eval_all-nli-dev_dot_accuracy": 0.343,
"eval_all-nli-dev_euclidean_accuracy": 0.48,
"eval_all-nli-dev_manhattan_accuracy": 0.484,
"eval_all-nli-dev_max_accuracy": 0.484,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9287,
"eval_samples_per_second": 126.125,
"eval_steps_per_second": 7.946,
"step": 4200
},
{
"epoch": 0.688,
"grad_norm": 0.20162004232406616,
"learning_rate": 1.7422222222222222e-05,
"loss": 3.4663,
"step": 4300
},
{
"epoch": 0.688,
"eval_all-nli-dev_cosine_accuracy": 0.451,
"eval_all-nli-dev_dot_accuracy": 0.354,
"eval_all-nli-dev_euclidean_accuracy": 0.451,
"eval_all-nli-dev_manhattan_accuracy": 0.469,
"eval_all-nli-dev_max_accuracy": 0.469,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9242,
"eval_samples_per_second": 126.195,
"eval_steps_per_second": 7.95,
"step": 4300
},
{
"epoch": 0.704,
"grad_norm": 0.17491748929023743,
"learning_rate": 1.6533333333333333e-05,
"loss": 3.4661,
"step": 4400
},
{
"epoch": 0.704,
"eval_all-nli-dev_cosine_accuracy": 0.47,
"eval_all-nli-dev_dot_accuracy": 0.367,
"eval_all-nli-dev_euclidean_accuracy": 0.47,
"eval_all-nli-dev_manhattan_accuracy": 0.478,
"eval_all-nli-dev_max_accuracy": 0.478,
"eval_loss": 3.4308862686157227,
"eval_runtime": 7.989,
"eval_samples_per_second": 125.172,
"eval_steps_per_second": 7.886,
"step": 4400
},
{
"epoch": 0.72,
"grad_norm": 0.1597195565700531,
"learning_rate": 1.5644444444444444e-05,
"loss": 3.4669,
"step": 4500
},
{
"epoch": 0.72,
"eval_all-nli-dev_cosine_accuracy": 0.466,
"eval_all-nli-dev_dot_accuracy": 0.375,
"eval_all-nli-dev_euclidean_accuracy": 0.466,
"eval_all-nli-dev_manhattan_accuracy": 0.467,
"eval_all-nli-dev_max_accuracy": 0.467,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9327,
"eval_samples_per_second": 126.06,
"eval_steps_per_second": 7.942,
"step": 4500
},
{
"epoch": 0.736,
"grad_norm": 0.08718598634004593,
"learning_rate": 1.4755555555555556e-05,
"loss": 3.4664,
"step": 4600
},
{
"epoch": 0.736,
"eval_all-nli-dev_cosine_accuracy": 0.44,
"eval_all-nli-dev_dot_accuracy": 0.307,
"eval_all-nli-dev_euclidean_accuracy": 0.44,
"eval_all-nli-dev_manhattan_accuracy": 0.455,
"eval_all-nli-dev_max_accuracy": 0.455,
"eval_loss": 3.4308862686157227,
"eval_runtime": 7.9594,
"eval_samples_per_second": 125.637,
"eval_steps_per_second": 7.915,
"step": 4600
},
{
"epoch": 0.752,
"grad_norm": 0.03150181472301483,
"learning_rate": 1.3866666666666667e-05,
"loss": 3.4664,
"step": 4700
},
{
"epoch": 0.752,
"eval_all-nli-dev_cosine_accuracy": 0.469,
"eval_all-nli-dev_dot_accuracy": 0.336,
"eval_all-nli-dev_euclidean_accuracy": 0.469,
"eval_all-nli-dev_manhattan_accuracy": 0.481,
"eval_all-nli-dev_max_accuracy": 0.481,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9652,
"eval_samples_per_second": 125.546,
"eval_steps_per_second": 7.909,
"step": 4700
},
{
"epoch": 0.768,
"grad_norm": 0.027570225298404694,
"learning_rate": 1.2977777777777777e-05,
"loss": 3.4659,
"step": 4800
},
{
"epoch": 0.768,
"eval_all-nli-dev_cosine_accuracy": 0.461,
"eval_all-nli-dev_dot_accuracy": 0.282,
"eval_all-nli-dev_euclidean_accuracy": 0.461,
"eval_all-nli-dev_manhattan_accuracy": 0.466,
"eval_all-nli-dev_max_accuracy": 0.466,
"eval_loss": 3.4308862686157227,
"eval_runtime": 8.0206,
"eval_samples_per_second": 124.678,
"eval_steps_per_second": 7.855,
"step": 4800
},
{
"epoch": 0.784,
"grad_norm": 0.16807572543621063,
"learning_rate": 1.208888888888889e-05,
"loss": 3.466,
"step": 4900
},
{
"epoch": 0.784,
"eval_all-nli-dev_cosine_accuracy": 0.45,
"eval_all-nli-dev_dot_accuracy": 0.375,
"eval_all-nli-dev_euclidean_accuracy": 0.45,
"eval_all-nli-dev_manhattan_accuracy": 0.451,
"eval_all-nli-dev_max_accuracy": 0.451,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9772,
"eval_samples_per_second": 125.358,
"eval_steps_per_second": 7.898,
"step": 4900
},
{
"epoch": 0.8,
"grad_norm": 0.03361041471362114,
"learning_rate": 1.1200000000000001e-05,
"loss": 3.466,
"step": 5000
},
{
"epoch": 0.8,
"eval_all-nli-dev_cosine_accuracy": 0.464,
"eval_all-nli-dev_dot_accuracy": 0.374,
"eval_all-nli-dev_euclidean_accuracy": 0.464,
"eval_all-nli-dev_manhattan_accuracy": 0.473,
"eval_all-nli-dev_max_accuracy": 0.473,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9287,
"eval_samples_per_second": 126.123,
"eval_steps_per_second": 7.946,
"step": 5000
},
{
"epoch": 0.816,
"grad_norm": 0.1332545429468155,
"learning_rate": 1.031111111111111e-05,
"loss": 3.4664,
"step": 5100
},
{
"epoch": 0.816,
"eval_all-nli-dev_cosine_accuracy": 0.44,
"eval_all-nli-dev_dot_accuracy": 0.341,
"eval_all-nli-dev_euclidean_accuracy": 0.44,
"eval_all-nli-dev_manhattan_accuracy": 0.44,
"eval_all-nli-dev_max_accuracy": 0.44,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9414,
"eval_samples_per_second": 125.922,
"eval_steps_per_second": 7.933,
"step": 5100
},
{
"epoch": 0.832,
"grad_norm": 0.029825175181031227,
"learning_rate": 9.422222222222222e-06,
"loss": 3.4658,
"step": 5200
},
{
"epoch": 0.832,
"eval_all-nli-dev_cosine_accuracy": 0.496,
"eval_all-nli-dev_dot_accuracy": 0.356,
"eval_all-nli-dev_euclidean_accuracy": 0.496,
"eval_all-nli-dev_manhattan_accuracy": 0.497,
"eval_all-nli-dev_max_accuracy": 0.497,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9321,
"eval_samples_per_second": 126.07,
"eval_steps_per_second": 7.942,
"step": 5200
},
{
"epoch": 0.848,
"grad_norm": 0.023120058700442314,
"learning_rate": 8.533333333333334e-06,
"loss": 3.4664,
"step": 5300
},
{
"epoch": 0.848,
"eval_all-nli-dev_cosine_accuracy": 0.464,
"eval_all-nli-dev_dot_accuracy": 0.321,
"eval_all-nli-dev_euclidean_accuracy": 0.464,
"eval_all-nli-dev_manhattan_accuracy": 0.474,
"eval_all-nli-dev_max_accuracy": 0.474,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9178,
"eval_samples_per_second": 126.297,
"eval_steps_per_second": 7.957,
"step": 5300
},
{
"epoch": 0.864,
"grad_norm": 0.10328388214111328,
"learning_rate": 7.644444444444445e-06,
"loss": 3.4658,
"step": 5400
},
{
"epoch": 0.864,
"eval_all-nli-dev_cosine_accuracy": 0.446,
"eval_all-nli-dev_dot_accuracy": 0.309,
"eval_all-nli-dev_euclidean_accuracy": 0.446,
"eval_all-nli-dev_manhattan_accuracy": 0.449,
"eval_all-nli-dev_max_accuracy": 0.449,
"eval_loss": 3.4308860301971436,
"eval_runtime": 8.0066,
"eval_samples_per_second": 124.897,
"eval_steps_per_second": 7.869,
"step": 5400
},
{
"epoch": 0.88,
"grad_norm": 0.03729160130023956,
"learning_rate": 6.755555555555555e-06,
"loss": 3.4662,
"step": 5500
},
{
"epoch": 0.88,
"eval_all-nli-dev_cosine_accuracy": 0.466,
"eval_all-nli-dev_dot_accuracy": 0.311,
"eval_all-nli-dev_euclidean_accuracy": 0.466,
"eval_all-nli-dev_manhattan_accuracy": 0.464,
"eval_all-nli-dev_max_accuracy": 0.466,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.964,
"eval_samples_per_second": 125.565,
"eval_steps_per_second": 7.911,
"step": 5500
},
{
"epoch": 0.896,
"grad_norm": 0.02747642807662487,
"learning_rate": 5.866666666666667e-06,
"loss": 3.4663,
"step": 5600
},
{
"epoch": 0.896,
"eval_all-nli-dev_cosine_accuracy": 0.476,
"eval_all-nli-dev_dot_accuracy": 0.369,
"eval_all-nli-dev_euclidean_accuracy": 0.476,
"eval_all-nli-dev_manhattan_accuracy": 0.468,
"eval_all-nli-dev_max_accuracy": 0.476,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9426,
"eval_samples_per_second": 125.903,
"eval_steps_per_second": 7.932,
"step": 5600
},
{
"epoch": 0.912,
"grad_norm": 0.053027376532554626,
"learning_rate": 4.977777777777778e-06,
"loss": 3.4667,
"step": 5700
},
{
"epoch": 0.912,
"eval_all-nli-dev_cosine_accuracy": 0.455,
"eval_all-nli-dev_dot_accuracy": 0.369,
"eval_all-nli-dev_euclidean_accuracy": 0.455,
"eval_all-nli-dev_manhattan_accuracy": 0.454,
"eval_all-nli-dev_max_accuracy": 0.455,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9874,
"eval_samples_per_second": 125.197,
"eval_steps_per_second": 7.887,
"step": 5700
},
{
"epoch": 0.928,
"grad_norm": 0.02996170148253441,
"learning_rate": 4.088888888888889e-06,
"loss": 3.4669,
"step": 5800
},
{
"epoch": 0.928,
"eval_all-nli-dev_cosine_accuracy": 0.456,
"eval_all-nli-dev_dot_accuracy": 0.229,
"eval_all-nli-dev_euclidean_accuracy": 0.456,
"eval_all-nli-dev_manhattan_accuracy": 0.463,
"eval_all-nli-dev_max_accuracy": 0.463,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9568,
"eval_samples_per_second": 125.679,
"eval_steps_per_second": 7.918,
"step": 5800
},
{
"epoch": 0.944,
"grad_norm": 0.05124541372060776,
"learning_rate": 3.2000000000000003e-06,
"loss": 3.4657,
"step": 5900
},
{
"epoch": 0.944,
"eval_all-nli-dev_cosine_accuracy": 0.466,
"eval_all-nli-dev_dot_accuracy": 0.388,
"eval_all-nli-dev_euclidean_accuracy": 0.467,
"eval_all-nli-dev_manhattan_accuracy": 0.464,
"eval_all-nli-dev_max_accuracy": 0.467,
"eval_loss": 3.4308862686157227,
"eval_runtime": 7.9623,
"eval_samples_per_second": 125.591,
"eval_steps_per_second": 7.912,
"step": 5900
},
{
"epoch": 0.96,
"grad_norm": 0.06664357334375381,
"learning_rate": 2.311111111111111e-06,
"loss": 3.4671,
"step": 6000
},
{
"epoch": 0.96,
"eval_all-nli-dev_cosine_accuracy": 0.452,
"eval_all-nli-dev_dot_accuracy": 0.33,
"eval_all-nli-dev_euclidean_accuracy": 0.452,
"eval_all-nli-dev_manhattan_accuracy": 0.456,
"eval_all-nli-dev_max_accuracy": 0.456,
"eval_loss": 3.4308862686157227,
"eval_runtime": 7.9667,
"eval_samples_per_second": 125.523,
"eval_steps_per_second": 7.908,
"step": 6000
},
{
"epoch": 0.976,
"grad_norm": 0.18055689334869385,
"learning_rate": 1.4222222222222223e-06,
"loss": 2.9471,
"step": 6100
},
{
"epoch": 0.976,
"eval_all-nli-dev_cosine_accuracy": 0.479,
"eval_all-nli-dev_dot_accuracy": 0.378,
"eval_all-nli-dev_euclidean_accuracy": 0.479,
"eval_all-nli-dev_manhattan_accuracy": 0.484,
"eval_all-nli-dev_max_accuracy": 0.484,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9406,
"eval_samples_per_second": 125.936,
"eval_steps_per_second": 7.934,
"step": 6100
},
{
"epoch": 0.992,
"grad_norm": 0.10885396599769592,
"learning_rate": 5.333333333333333e-07,
"loss": 0.6929,
"step": 6200
},
{
"epoch": 0.992,
"eval_all-nli-dev_cosine_accuracy": 0.452,
"eval_all-nli-dev_dot_accuracy": 0.34,
"eval_all-nli-dev_euclidean_accuracy": 0.452,
"eval_all-nli-dev_manhattan_accuracy": 0.456,
"eval_all-nli-dev_max_accuracy": 0.456,
"eval_loss": 3.4308860301971436,
"eval_runtime": 7.9933,
"eval_samples_per_second": 125.105,
"eval_steps_per_second": 7.882,
"step": 6200
}
],
"logging_steps": 100,
"max_steps": 6250,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}