german-jeopardy-mt5-large-256 / trainer_state.json
Marvin
Initial commit
f436953 unverified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 19.789564097058193,
"eval_steps": 500,
"global_step": 720,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.99,
"learning_rate": 0.0001,
"loss": 5.932,
"step": 36
},
{
"epoch": 0.99,
"eval_bleu": 3.7906,
"eval_bp": 1.0,
"eval_counts_1": 5614,
"eval_counts_2": 1426,
"eval_counts_3": 527,
"eval_counts_4": 204,
"eval_exact_match": 0.0,
"eval_f1": 0.2127,
"eval_gen_len": 11.4306,
"eval_loss": 2.450988292694092,
"eval_precisions_1": 19.4694,
"eval_precisions_2": 5.3547,
"eval_precisions_3": 2.1574,
"eval_precisions_4": 0.918,
"eval_ref_len": 21250,
"eval_rouge1": 0.1946,
"eval_rouge2": 0.0763,
"eval_rougeL": 0.1843,
"eval_rougeLsum": 0.1843,
"eval_runtime": 1846.0551,
"eval_samples_per_second": 1.194,
"eval_steps_per_second": 1.194,
"eval_sys_len": 28835,
"eval_totals_1": 28835,
"eval_totals_2": 26631,
"eval_totals_3": 24427,
"eval_totals_4": 22223,
"step": 36
},
{
"epoch": 1.98,
"learning_rate": 0.0001,
"loss": 2.3089,
"step": 72
},
{
"epoch": 1.98,
"eval_bleu": 11.3254,
"eval_bp": 0.7904,
"eval_counts_1": 7578,
"eval_counts_2": 2696,
"eval_counts_3": 1244,
"eval_counts_4": 580,
"eval_exact_match": 0.01,
"eval_f1": 0.3246,
"eval_gen_len": 12.6583,
"eval_loss": 1.3963948488235474,
"eval_precisions_1": 44.0505,
"eval_precisions_2": 17.9745,
"eval_precisions_3": 9.7225,
"eval_precisions_4": 5.4763,
"eval_ref_len": 21250,
"eval_rouge1": 0.3312,
"eval_rouge2": 0.1655,
"eval_rougeL": 0.316,
"eval_rougeLsum": 0.3162,
"eval_runtime": 1912.8668,
"eval_samples_per_second": 1.152,
"eval_steps_per_second": 1.152,
"eval_sys_len": 17203,
"eval_totals_1": 17203,
"eval_totals_2": 14999,
"eval_totals_3": 12795,
"eval_totals_4": 10591,
"step": 72
},
{
"epoch": 3.0,
"learning_rate": 0.0001,
"loss": 1.6778,
"step": 109
},
{
"epoch": 3.0,
"eval_bleu": 13.128,
"eval_bp": 0.7826,
"eval_counts_1": 7961,
"eval_counts_2": 3020,
"eval_counts_3": 1480,
"eval_counts_4": 747,
"eval_exact_match": 0.0195,
"eval_f1": 0.3517,
"eval_gen_len": 12.4682,
"eval_loss": 1.2659858465194702,
"eval_precisions_1": 46.6456,
"eval_precisions_2": 20.3189,
"eval_precisions_3": 11.6913,
"eval_precisions_4": 7.1449,
"eval_ref_len": 21250,
"eval_rouge1": 0.3608,
"eval_rouge2": 0.1881,
"eval_rougeL": 0.3456,
"eval_rougeLsum": 0.3454,
"eval_runtime": 1864.6637,
"eval_samples_per_second": 1.182,
"eval_steps_per_second": 1.182,
"eval_sys_len": 17067,
"eval_totals_1": 17067,
"eval_totals_2": 14863,
"eval_totals_3": 12659,
"eval_totals_4": 10455,
"step": 109
},
{
"epoch": 3.99,
"learning_rate": 0.0001,
"loss": 1.5383,
"step": 145
},
{
"epoch": 3.99,
"eval_bleu": 13.625,
"eval_bp": 0.7612,
"eval_counts_1": 7948,
"eval_counts_2": 3121,
"eval_counts_3": 1558,
"eval_counts_4": 796,
"eval_exact_match": 0.024,
"eval_f1": 0.3554,
"eval_gen_len": 12.221,
"eval_loss": 1.2212449312210083,
"eval_precisions_1": 47.6099,
"eval_precisions_2": 21.539,
"eval_precisions_3": 12.6811,
"eval_precisions_4": 7.8953,
"eval_ref_len": 21250,
"eval_rouge1": 0.3663,
"eval_rouge2": 0.1989,
"eval_rougeL": 0.3523,
"eval_rougeLsum": 0.352,
"eval_runtime": 1862.6167,
"eval_samples_per_second": 1.183,
"eval_steps_per_second": 1.183,
"eval_sys_len": 16694,
"eval_totals_1": 16694,
"eval_totals_2": 14490,
"eval_totals_3": 12286,
"eval_totals_4": 10082,
"step": 145
},
{
"epoch": 4.97,
"learning_rate": 0.0001,
"loss": 1.423,
"step": 181
},
{
"epoch": 4.97,
"eval_bleu": 15.7567,
"eval_bp": 0.8219,
"eval_counts_1": 8746,
"eval_counts_2": 3590,
"eval_counts_3": 1840,
"eval_counts_4": 963,
"eval_exact_match": 0.0304,
"eval_f1": 0.3941,
"eval_gen_len": 13.0277,
"eval_loss": 1.1706066131591797,
"eval_precisions_1": 49.2316,
"eval_precisions_2": 23.0705,
"eval_precisions_3": 13.7755,
"eval_precisions_4": 8.6344,
"eval_ref_len": 21250,
"eval_rouge1": 0.4033,
"eval_rouge2": 0.2224,
"eval_rougeL": 0.3876,
"eval_rougeLsum": 0.3874,
"eval_runtime": 1923.5708,
"eval_samples_per_second": 1.146,
"eval_steps_per_second": 1.146,
"eval_sys_len": 17765,
"eval_totals_1": 17765,
"eval_totals_2": 15561,
"eval_totals_3": 13357,
"eval_totals_4": 11153,
"step": 181
},
{
"epoch": 5.99,
"learning_rate": 0.0001,
"loss": 1.2861,
"step": 218
},
{
"epoch": 5.99,
"eval_bleu": 16.123,
"eval_bp": 0.8018,
"eval_counts_1": 8885,
"eval_counts_2": 3646,
"eval_counts_3": 1864,
"eval_counts_4": 1005,
"eval_exact_match": 0.0331,
"eval_f1": 0.4092,
"eval_gen_len": 12.9142,
"eval_loss": 1.1327459812164307,
"eval_precisions_1": 51.0456,
"eval_precisions_2": 23.9837,
"eval_precisions_3": 14.3407,
"eval_precisions_4": 9.3107,
"eval_ref_len": 21250,
"eval_rouge1": 0.4181,
"eval_rouge2": 0.2295,
"eval_rougeL": 0.4022,
"eval_rougeLsum": 0.402,
"eval_runtime": 2271.0986,
"eval_samples_per_second": 0.97,
"eval_steps_per_second": 0.97,
"eval_sys_len": 17406,
"eval_totals_1": 17406,
"eval_totals_2": 15202,
"eval_totals_3": 12998,
"eval_totals_4": 10794,
"step": 218
},
{
"epoch": 6.98,
"learning_rate": 0.0001,
"loss": 1.2372,
"step": 254
},
{
"epoch": 6.98,
"eval_bleu": 17.0334,
"eval_bp": 0.7964,
"eval_counts_1": 9122,
"eval_counts_2": 3824,
"eval_counts_3": 1997,
"eval_counts_4": 1084,
"eval_exact_match": 0.0358,
"eval_f1": 0.4236,
"eval_gen_len": 12.8412,
"eval_loss": 1.1248232126235962,
"eval_precisions_1": 52.6979,
"eval_precisions_2": 25.3144,
"eval_precisions_3": 15.4782,
"eval_precisions_4": 10.1327,
"eval_ref_len": 21250,
"eval_rouge1": 0.4313,
"eval_rouge2": 0.239,
"eval_rougeL": 0.4175,
"eval_rougeLsum": 0.4172,
"eval_runtime": 2069.7482,
"eval_samples_per_second": 1.065,
"eval_steps_per_second": 1.065,
"eval_sys_len": 17310,
"eval_totals_1": 17310,
"eval_totals_2": 15106,
"eval_totals_3": 12902,
"eval_totals_4": 10698,
"step": 254
},
{
"epoch": 8.0,
"learning_rate": 0.0001,
"loss": 1.1307,
"step": 291
},
{
"epoch": 8.0,
"eval_bleu": 18.0474,
"eval_bp": 0.8389,
"eval_counts_1": 9423,
"eval_counts_2": 4019,
"eval_counts_3": 2136,
"eval_counts_4": 1190,
"eval_exact_match": 0.0404,
"eval_f1": 0.4327,
"eval_gen_len": 13.4138,
"eval_loss": 1.0998262166976929,
"eval_precisions_1": 52.1357,
"eval_precisions_2": 25.3245,
"eval_precisions_3": 15.63,
"eval_precisions_4": 10.3821,
"eval_ref_len": 21250,
"eval_rouge1": 0.441,
"eval_rouge2": 0.249,
"eval_rougeL": 0.4255,
"eval_rougeLsum": 0.4252,
"eval_runtime": 2305.9504,
"eval_samples_per_second": 0.956,
"eval_steps_per_second": 0.956,
"eval_sys_len": 18074,
"eval_totals_1": 18074,
"eval_totals_2": 15870,
"eval_totals_3": 13666,
"eval_totals_4": 11462,
"step": 291
},
{
"epoch": 8.99,
"learning_rate": 0.0001,
"loss": 1.0982,
"step": 327
},
{
"epoch": 8.99,
"eval_bleu": 18.0367,
"eval_bp": 0.8427,
"eval_counts_1": 9450,
"eval_counts_2": 4003,
"eval_counts_3": 2147,
"eval_counts_4": 1184,
"eval_exact_match": 0.0426,
"eval_f1": 0.4344,
"eval_gen_len": 13.4465,
"eval_loss": 1.1051570177078247,
"eval_precisions_1": 52.0805,
"eval_precisions_2": 25.1113,
"eval_precisions_3": 15.6293,
"eval_precisions_4": 10.2662,
"eval_ref_len": 21250,
"eval_rouge1": 0.4427,
"eval_rouge2": 0.2492,
"eval_rougeL": 0.4266,
"eval_rougeLsum": 0.4261,
"eval_runtime": 2511.4033,
"eval_samples_per_second": 0.878,
"eval_steps_per_second": 0.878,
"eval_sys_len": 18145,
"eval_totals_1": 18145,
"eval_totals_2": 15941,
"eval_totals_3": 13737,
"eval_totals_4": 11533,
"step": 327
},
{
"epoch": 9.98,
"learning_rate": 0.0001,
"loss": 1.0449,
"step": 363
},
{
"epoch": 9.98,
"eval_bleu": 18.0793,
"eval_bp": 0.8385,
"eval_counts_1": 9471,
"eval_counts_2": 4036,
"eval_counts_3": 2149,
"eval_counts_4": 1180,
"eval_exact_match": 0.0404,
"eval_f1": 0.4341,
"eval_gen_len": 13.333,
"eval_loss": 1.0995820760726929,
"eval_precisions_1": 52.4215,
"eval_precisions_2": 25.4429,
"eval_precisions_3": 15.7332,
"eval_precisions_4": 10.3012,
"eval_ref_len": 21250,
"eval_rouge1": 0.4422,
"eval_rouge2": 0.2477,
"eval_rougeL": 0.4261,
"eval_rougeLsum": 0.4257,
"eval_runtime": 1973.5312,
"eval_samples_per_second": 1.117,
"eval_steps_per_second": 1.117,
"eval_sys_len": 18067,
"eval_totals_1": 18067,
"eval_totals_2": 15863,
"eval_totals_3": 13659,
"eval_totals_4": 11455,
"step": 363
},
{
"epoch": 10.99,
"learning_rate": 0.0001,
"loss": 0.9686,
"step": 400
},
{
"epoch": 10.99,
"eval_bleu": 18.6914,
"eval_bp": 0.8339,
"eval_counts_1": 9612,
"eval_counts_2": 4165,
"eval_counts_3": 2240,
"eval_counts_4": 1233,
"eval_exact_match": 0.0449,
"eval_f1": 0.4458,
"eval_gen_len": 13.3534,
"eval_loss": 1.1012390851974487,
"eval_precisions_1": 53.4505,
"eval_precisions_2": 26.3958,
"eval_precisions_3": 16.5009,
"eval_precisions_4": 10.8434,
"eval_ref_len": 21250,
"eval_rouge1": 0.4534,
"eval_rouge2": 0.2591,
"eval_rougeL": 0.4381,
"eval_rougeLsum": 0.4378,
"eval_runtime": 2028.4468,
"eval_samples_per_second": 1.087,
"eval_steps_per_second": 1.087,
"eval_sys_len": 17983,
"eval_totals_1": 17983,
"eval_totals_2": 15779,
"eval_totals_3": 13575,
"eval_totals_4": 11371,
"step": 400
},
{
"epoch": 11.98,
"learning_rate": 0.0001,
"loss": 0.9465,
"step": 436
},
{
"epoch": 11.98,
"eval_bleu": 18.6863,
"eval_bp": 0.8466,
"eval_counts_1": 9670,
"eval_counts_2": 4154,
"eval_counts_3": 2229,
"eval_counts_4": 1239,
"eval_exact_match": 0.0445,
"eval_f1": 0.4452,
"eval_gen_len": 13.5912,
"eval_loss": 1.1026833057403564,
"eval_precisions_1": 53.0823,
"eval_precisions_2": 25.9414,
"eval_precisions_3": 16.1416,
"eval_precisions_4": 10.6764,
"eval_ref_len": 21250,
"eval_rouge1": 0.4531,
"eval_rouge2": 0.258,
"eval_rougeL": 0.4377,
"eval_rougeLsum": 0.4374,
"eval_runtime": 4413.1713,
"eval_samples_per_second": 0.499,
"eval_steps_per_second": 0.499,
"eval_sys_len": 18217,
"eval_totals_1": 18217,
"eval_totals_2": 16013,
"eval_totals_3": 13809,
"eval_totals_4": 11605,
"step": 436
},
{
"epoch": 12.97,
"learning_rate": 0.0001,
"loss": 0.9025,
"step": 472
},
{
"epoch": 12.97,
"eval_bleu": 18.7344,
"eval_bp": 0.839,
"eval_counts_1": 9627,
"eval_counts_2": 4155,
"eval_counts_3": 2241,
"eval_counts_4": 1247,
"eval_exact_match": 0.0436,
"eval_f1": 0.4452,
"eval_gen_len": 13.5259,
"eval_loss": 1.11244535446167,
"eval_precisions_1": 53.2585,
"eval_precisions_2": 26.1782,
"eval_precisions_3": 16.396,
"eval_precisions_4": 10.8775,
"eval_ref_len": 21250,
"eval_rouge1": 0.4531,
"eval_rouge2": 0.2583,
"eval_rougeL": 0.4386,
"eval_rougeLsum": 0.4382,
"eval_runtime": 3852.502,
"eval_samples_per_second": 0.572,
"eval_steps_per_second": 0.572,
"eval_sys_len": 18076,
"eval_totals_1": 18076,
"eval_totals_2": 15872,
"eval_totals_3": 13668,
"eval_totals_4": 11464,
"step": 472
},
{
"epoch": 13.99,
"learning_rate": 0.0001,
"loss": 0.8402,
"step": 509
},
{
"epoch": 13.99,
"eval_bleu": 18.3062,
"eval_bp": 0.7981,
"eval_counts_1": 9425,
"eval_counts_2": 4071,
"eval_counts_3": 2176,
"eval_counts_4": 1207,
"eval_exact_match": 0.0445,
"eval_f1": 0.4417,
"eval_gen_len": 12.9129,
"eval_loss": 1.139233946800232,
"eval_precisions_1": 54.3572,
"eval_precisions_2": 26.8979,
"eval_precisions_3": 16.8278,
"eval_precisions_4": 11.252,
"eval_ref_len": 21250,
"eval_rouge1": 0.4495,
"eval_rouge2": 0.2568,
"eval_rougeL": 0.4365,
"eval_rougeLsum": 0.4358,
"eval_runtime": 3679.9149,
"eval_samples_per_second": 0.599,
"eval_steps_per_second": 0.599,
"eval_sys_len": 17339,
"eval_totals_1": 17339,
"eval_totals_2": 15135,
"eval_totals_3": 12931,
"eval_totals_4": 10727,
"step": 509
},
{
"epoch": 14.98,
"learning_rate": 0.0001,
"loss": 0.8282,
"step": 545
},
{
"epoch": 14.98,
"eval_bleu": 19.2695,
"eval_bp": 0.87,
"eval_counts_1": 9803,
"eval_counts_2": 4274,
"eval_counts_3": 2316,
"eval_counts_4": 1305,
"eval_exact_match": 0.0463,
"eval_f1": 0.4496,
"eval_gen_len": 14.0104,
"eval_loss": 1.1227205991744995,
"eval_precisions_1": 52.5574,
"eval_precisions_2": 25.9849,
"eval_precisions_3": 16.2595,
"eval_precisions_4": 10.8389,
"eval_ref_len": 21250,
"eval_rouge1": 0.4573,
"eval_rouge2": 0.2627,
"eval_rougeL": 0.4418,
"eval_rougeLsum": 0.4414,
"eval_runtime": 3897.0455,
"eval_samples_per_second": 0.566,
"eval_steps_per_second": 0.566,
"eval_sys_len": 18652,
"eval_totals_1": 18652,
"eval_totals_2": 16448,
"eval_totals_3": 14244,
"eval_totals_4": 12040,
"step": 545
},
{
"epoch": 16.0,
"learning_rate": 0.0001,
"loss": 0.7694,
"step": 582
},
{
"epoch": 16.0,
"eval_bleu": 19.1704,
"eval_bp": 0.8501,
"eval_counts_1": 9740,
"eval_counts_2": 4240,
"eval_counts_3": 2299,
"eval_counts_4": 1296,
"eval_exact_match": 0.0476,
"eval_f1": 0.4492,
"eval_gen_len": 13.6475,
"eval_loss": 1.139431357383728,
"eval_precisions_1": 53.2794,
"eval_precisions_2": 26.3731,
"eval_precisions_3": 16.5718,
"eval_precisions_4": 11.1064,
"eval_ref_len": 21250,
"eval_rouge1": 0.4572,
"eval_rouge2": 0.2629,
"eval_rougeL": 0.4411,
"eval_rougeLsum": 0.4412,
"eval_runtime": 3879.8331,
"eval_samples_per_second": 0.568,
"eval_steps_per_second": 0.568,
"eval_sys_len": 18281,
"eval_totals_1": 18281,
"eval_totals_2": 16077,
"eval_totals_3": 13873,
"eval_totals_4": 11669,
"step": 582
},
{
"epoch": 16.99,
"learning_rate": 0.0001,
"loss": 0.7589,
"step": 618
},
{
"epoch": 16.99,
"eval_bleu": 18.5906,
"eval_bp": 0.8572,
"eval_counts_1": 9663,
"eval_counts_2": 4140,
"eval_counts_3": 2214,
"eval_counts_4": 1232,
"eval_exact_match": 0.044,
"eval_f1": 0.4432,
"eval_gen_len": 13.7926,
"eval_loss": 1.1496515274047852,
"eval_precisions_1": 52.4821,
"eval_precisions_2": 25.5429,
"eval_precisions_3": 15.8098,
"eval_precisions_4": 10.4407,
"eval_ref_len": 21250,
"eval_rouge1": 0.4515,
"eval_rouge2": 0.2561,
"eval_rougeL": 0.4359,
"eval_rougeLsum": 0.4358,
"eval_runtime": 3896.3752,
"eval_samples_per_second": 0.566,
"eval_steps_per_second": 0.566,
"eval_sys_len": 18412,
"eval_totals_1": 18412,
"eval_totals_2": 16208,
"eval_totals_3": 14004,
"eval_totals_4": 11800,
"step": 618
},
{
"epoch": 17.98,
"learning_rate": 0.0001,
"loss": 0.724,
"step": 654
},
{
"epoch": 17.98,
"eval_bleu": 19.2167,
"eval_bp": 0.8566,
"eval_counts_1": 9743,
"eval_counts_2": 4246,
"eval_counts_3": 2316,
"eval_counts_4": 1300,
"eval_exact_match": 0.0472,
"eval_f1": 0.4474,
"eval_gen_len": 13.7214,
"eval_loss": 1.1680002212524414,
"eval_precisions_1": 52.9453,
"eval_precisions_2": 26.2131,
"eval_precisions_3": 16.5499,
"eval_precisions_4": 11.0263,
"eval_ref_len": 21250,
"eval_rouge1": 0.4562,
"eval_rouge2": 0.2625,
"eval_rougeL": 0.4408,
"eval_rougeLsum": 0.441,
"eval_runtime": 3957.7799,
"eval_samples_per_second": 0.557,
"eval_steps_per_second": 0.557,
"eval_sys_len": 18402,
"eval_totals_1": 18402,
"eval_totals_2": 16198,
"eval_totals_3": 13994,
"eval_totals_4": 11790,
"step": 654
},
{
"epoch": 18.99,
"learning_rate": 0.0001,
"loss": 0.6755,
"step": 691
},
{
"epoch": 18.99,
"eval_bleu": 19.4647,
"eval_bp": 0.8496,
"eval_counts_1": 9722,
"eval_counts_2": 4266,
"eval_counts_3": 2351,
"eval_counts_4": 1341,
"eval_exact_match": 0.0495,
"eval_f1": 0.4469,
"eval_gen_len": 13.6071,
"eval_loss": 1.1873786449432373,
"eval_precisions_1": 53.2071,
"eval_precisions_2": 26.5497,
"eval_precisions_3": 16.9576,
"eval_precisions_4": 11.5009,
"eval_ref_len": 21250,
"eval_rouge1": 0.4559,
"eval_rouge2": 0.2639,
"eval_rougeL": 0.4417,
"eval_rougeLsum": 0.4413,
"eval_runtime": 3873.6961,
"eval_samples_per_second": 0.569,
"eval_steps_per_second": 0.569,
"eval_sys_len": 18272,
"eval_totals_1": 18272,
"eval_totals_2": 16068,
"eval_totals_3": 13864,
"eval_totals_4": 11660,
"step": 691
},
{
"epoch": 19.79,
"learning_rate": 0.0001,
"loss": 0.657,
"step": 720
},
{
"epoch": 19.79,
"eval_bleu": 19.8248,
"eval_bp": 0.8822,
"eval_counts_1": 9920,
"eval_counts_2": 4361,
"eval_counts_3": 2402,
"eval_counts_4": 1373,
"eval_exact_match": 0.0467,
"eval_f1": 0.4508,
"eval_gen_len": 14.2001,
"eval_loss": 1.1845453977584839,
"eval_precisions_1": 52.5312,
"eval_precisions_2": 26.1451,
"eval_precisions_3": 16.593,
"eval_precisions_4": 11.1881,
"eval_ref_len": 21250,
"eval_rouge1": 0.4594,
"eval_rouge2": 0.2647,
"eval_rougeL": 0.4423,
"eval_rougeLsum": 0.4421,
"eval_runtime": 3958.6673,
"eval_samples_per_second": 0.557,
"eval_steps_per_second": 0.557,
"eval_sys_len": 18884,
"eval_totals_1": 18884,
"eval_totals_2": 16680,
"eval_totals_3": 14476,
"eval_totals_4": 12272,
"step": 720
},
{
"epoch": 19.79,
"step": 720,
"total_flos": 1.102412878184448e+18,
"train_loss": 1.3412119759453667,
"train_runtime": 128063.7495,
"train_samples_per_second": 1.455,
"train_steps_per_second": 0.006
}
],
"logging_steps": 500,
"max_steps": 720,
"num_train_epochs": 20,
"save_steps": 500,
"total_flos": 1.102412878184448e+18,
"trial_name": null,
"trial_params": null
}