{ "best_metric": 17.3273, "best_model_checkpoint": "models/mt0-xl_russian_natprompt_adafactor_updated/checkpoint-6150", "epoch": 14.999024390243903, "eval_steps": 500, "global_step": 7687, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "learning_rate": 4.75e-05, "loss": 2.0388, "step": 512 }, { "epoch": 1.0, "eval_gen_len": 16.58484100877193, "eval_loss": 1.6734575033187866, "eval_rouge1": 14.1367, "eval_rouge2": 7.0437, "eval_rougeL": 14.0625, "eval_rougeLsum": 14.0916, "eval_runtime": 270.6111, "eval_samples_per_second": 26.932, "eval_steps_per_second": 0.843, "step": 512 }, { "epoch": 2.0, "learning_rate": 4.4995117187500005e-05, "loss": 1.7098, "step": 1025 }, { "epoch": 2.0, "eval_gen_len": 16.68050986842105, "eval_loss": 1.6203718185424805, "eval_rouge1": 15.2619, "eval_rouge2": 7.8124, "eval_rougeL": 15.159, "eval_rougeLsum": 15.2078, "eval_runtime": 276.6842, "eval_samples_per_second": 26.341, "eval_steps_per_second": 0.824, "step": 1025 }, { "epoch": 3.0, "learning_rate": 4.24951171875e-05, "loss": 1.539, "step": 1537 }, { "epoch": 3.0, "eval_gen_len": 16.61417214912281, "eval_loss": 1.6058766841888428, "eval_rouge1": 15.9942, "eval_rouge2": 8.1827, "eval_rougeL": 15.872, "eval_rougeLsum": 15.9105, "eval_runtime": 263.8074, "eval_samples_per_second": 27.626, "eval_steps_per_second": 0.864, "step": 1537 }, { "epoch": 4.0, "learning_rate": 3.9990234375e-05, "loss": 1.403, "step": 2050 }, { "epoch": 4.0, "eval_gen_len": 16.26343201754386, "eval_loss": 1.6041721105575562, "eval_rouge1": 16.6383, "eval_rouge2": 8.4603, "eval_rougeL": 16.5096, "eval_rougeLsum": 16.5635, "eval_runtime": 251.4581, "eval_samples_per_second": 28.983, "eval_steps_per_second": 0.907, "step": 2050 }, { "epoch": 5.0, "learning_rate": 3.7490234375e-05, "loss": 1.295, "step": 2562 }, { "epoch": 5.0, "eval_gen_len": 15.741365131578947, "eval_loss": 1.6226089000701904, "eval_rouge1": 16.9189, "eval_rouge2": 8.8384, "eval_rougeL": 16.7799, "eval_rougeLsum": 16.8258, "eval_runtime": 169.6881, "eval_samples_per_second": 42.949, "eval_steps_per_second": 1.344, "step": 2562 }, { "epoch": 6.0, "learning_rate": 3.49853515625e-05, "loss": 1.1984, "step": 3075 }, { "epoch": 6.0, "eval_gen_len": 15.888157894736842, "eval_loss": 1.6289030313491821, "eval_rouge1": 16.9788, "eval_rouge2": 8.7272, "eval_rougeL": 16.8238, "eval_rougeLsum": 16.8765, "eval_runtime": 175.0677, "eval_samples_per_second": 41.63, "eval_steps_per_second": 1.302, "step": 3075 }, { "epoch": 7.0, "learning_rate": 3.2485351562499996e-05, "loss": 1.1195, "step": 3587 }, { "epoch": 7.0, "eval_gen_len": 16.23519736842105, "eval_loss": 1.6697918176651, "eval_rouge1": 17.0912, "eval_rouge2": 8.7061, "eval_rougeL": 16.9084, "eval_rougeLsum": 16.9633, "eval_runtime": 171.9395, "eval_samples_per_second": 42.387, "eval_steps_per_second": 1.326, "step": 3587 }, { "epoch": 8.0, "learning_rate": 2.998046875e-05, "loss": 1.0463, "step": 4100 }, { "epoch": 8.0, "eval_gen_len": 16.14761513157895, "eval_loss": 1.6845269203186035, "eval_rouge1": 17.201, "eval_rouge2": 8.7395, "eval_rougeL": 17.003, "eval_rougeLsum": 17.052, "eval_runtime": 252.7052, "eval_samples_per_second": 28.84, "eval_steps_per_second": 0.902, "step": 4100 }, { "epoch": 9.0, "learning_rate": 2.748046875e-05, "loss": 0.9866, "step": 4612 }, { "epoch": 9.0, "eval_gen_len": 15.878837719298245, "eval_loss": 1.726230502128601, "eval_rouge1": 17.3223, "eval_rouge2": 8.8289, "eval_rougeL": 17.1413, "eval_rougeLsum": 17.1756, "eval_runtime": 182.5703, "eval_samples_per_second": 39.919, "eval_steps_per_second": 1.249, "step": 4612 }, { "epoch": 10.0, "learning_rate": 2.49755859375e-05, "loss": 0.9326, "step": 5125 }, { "epoch": 10.0, "eval_gen_len": 15.797149122807017, "eval_loss": 1.7532711029052734, "eval_rouge1": 17.2655, "eval_rouge2": 8.7512, "eval_rougeL": 17.0508, "eval_rougeLsum": 17.1055, "eval_runtime": 168.7949, "eval_samples_per_second": 43.177, "eval_steps_per_second": 1.351, "step": 5125 }, { "epoch": 11.0, "learning_rate": 2.24755859375e-05, "loss": 0.8844, "step": 5637 }, { "epoch": 11.0, "eval_gen_len": 16.32360197368421, "eval_loss": 1.7794246673583984, "eval_rouge1": 17.008, "eval_rouge2": 8.5404, "eval_rougeL": 16.8044, "eval_rougeLsum": 16.848, "eval_runtime": 168.6102, "eval_samples_per_second": 43.224, "eval_steps_per_second": 1.352, "step": 5637 }, { "epoch": 12.0, "learning_rate": 1.9970703125e-05, "loss": 0.8393, "step": 6150 }, { "epoch": 12.0, "eval_gen_len": 16.143092105263158, "eval_loss": 1.7995822429656982, "eval_rouge1": 17.3273, "eval_rouge2": 8.7829, "eval_rougeL": 17.097, "eval_rougeLsum": 17.1644, "eval_runtime": 171.5723, "eval_samples_per_second": 42.478, "eval_steps_per_second": 1.329, "step": 6150 }, { "epoch": 13.0, "learning_rate": 1.7470703125000003e-05, "loss": 0.8046, "step": 6662 }, { "epoch": 13.0, "eval_gen_len": 16.090597587719298, "eval_loss": 1.8266295194625854, "eval_rouge1": 17.1859, "eval_rouge2": 8.6524, "eval_rougeL": 16.9605, "eval_rougeLsum": 17.0118, "eval_runtime": 259.1646, "eval_samples_per_second": 28.121, "eval_steps_per_second": 0.88, "step": 6662 }, { "epoch": 14.0, "learning_rate": 1.49658203125e-05, "loss": 0.7682, "step": 7175 }, { "epoch": 14.0, "eval_gen_len": 16.11239035087719, "eval_loss": 1.8624775409698486, "eval_rouge1": 17.0184, "eval_rouge2": 8.5314, "eval_rougeL": 16.8019, "eval_rougeLsum": 16.847, "eval_runtime": 170.9938, "eval_samples_per_second": 42.621, "eval_steps_per_second": 1.333, "step": 7175 }, { "epoch": 15.0, "learning_rate": 1.2465820312500002e-05, "loss": 0.7419, "step": 7687 }, { "epoch": 15.0, "eval_gen_len": 15.95751096491228, "eval_loss": 1.8779526948928833, "eval_rouge1": 17.2742, "eval_rouge2": 8.6795, "eval_rougeL": 17.0699, "eval_rougeLsum": 17.1118, "eval_runtime": 177.9916, "eval_samples_per_second": 40.946, "eval_steps_per_second": 1.281, "step": 7687 }, { "epoch": 15.0, "step": 7687, "total_flos": 1.7085595424946913e+18, "train_loss": 1.153788715837463, "train_runtime": 20083.7121, "train_samples_per_second": 65.311, "train_steps_per_second": 0.51 } ], "logging_steps": 500, "max_steps": 10240, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 1.7085595424946913e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }