|
{ |
|
"best_metric": 2.3254711627960205, |
|
"best_model_checkpoint": "/home/co-ou1/rds/hpc-work/models/longt5_xl_sfd_4096/longt5_xl_sfd_4096_e10/checkpoint-28", |
|
"epoch": 9.73913043478261, |
|
"eval_steps": 500, |
|
"global_step": 140, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.001, |
|
"loss": 3.2585, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 0.001, |
|
"loss": 3.3242, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 0.001, |
|
"loss": 3.6288, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 0.001, |
|
"loss": 3.1948, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 0.001, |
|
"loss": 2.9524, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 0.001, |
|
"loss": 2.8889, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 0.001, |
|
"loss": 3.0332, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"eval_loss": 2.5423872470855713, |
|
"eval_runtime": 96.2671, |
|
"eval_samples_per_second": 3.511, |
|
"eval_steps_per_second": 0.447, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"learning_rate": 0.001, |
|
"loss": 2.6074, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"learning_rate": 0.001, |
|
"loss": 2.6808, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"learning_rate": 0.001, |
|
"loss": 2.6085, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"learning_rate": 0.001, |
|
"loss": 2.5063, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"learning_rate": 0.001, |
|
"loss": 2.4614, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"learning_rate": 0.001, |
|
"loss": 2.4519, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"learning_rate": 0.001, |
|
"loss": 2.4105, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"eval_loss": 2.3254711627960205, |
|
"eval_runtime": 99.9342, |
|
"eval_samples_per_second": 3.382, |
|
"eval_steps_per_second": 0.43, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"learning_rate": 0.001, |
|
"loss": 2.1628, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"learning_rate": 0.001, |
|
"loss": 2.0701, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"learning_rate": 0.001, |
|
"loss": 2.0992, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"learning_rate": 0.001, |
|
"loss": 2.0401, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"learning_rate": 0.001, |
|
"loss": 2.0299, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"learning_rate": 0.001, |
|
"loss": 2.0812, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"learning_rate": 0.001, |
|
"loss": 2.0496, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"eval_loss": 2.3419768810272217, |
|
"eval_runtime": 99.9699, |
|
"eval_samples_per_second": 3.381, |
|
"eval_steps_per_second": 0.43, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"learning_rate": 0.001, |
|
"loss": 1.9994, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"learning_rate": 0.001, |
|
"loss": 1.7276, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"learning_rate": 0.001, |
|
"loss": 1.7639, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"learning_rate": 0.001, |
|
"loss": 1.7624, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"learning_rate": 0.001, |
|
"loss": 1.7726, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"learning_rate": 0.001, |
|
"loss": 1.7218, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"learning_rate": 0.001, |
|
"loss": 1.7473, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"eval_loss": 2.352036476135254, |
|
"eval_runtime": 109.1375, |
|
"eval_samples_per_second": 3.097, |
|
"eval_steps_per_second": 0.394, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"learning_rate": 0.001, |
|
"loss": 1.6586, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"learning_rate": 0.001, |
|
"loss": 1.3888, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"learning_rate": 0.001, |
|
"loss": 1.4192, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"learning_rate": 0.001, |
|
"loss": 1.4003, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"learning_rate": 0.001, |
|
"loss": 1.4405, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"learning_rate": 0.001, |
|
"loss": 1.3766, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"learning_rate": 0.001, |
|
"loss": 1.4007, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"eval_loss": 2.4979982376098633, |
|
"eval_runtime": 101.842, |
|
"eval_samples_per_second": 3.319, |
|
"eval_steps_per_second": 0.422, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 5.01, |
|
"learning_rate": 0.001, |
|
"loss": 1.3547, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 5.15, |
|
"learning_rate": 0.001, |
|
"loss": 1.3243, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 5.29, |
|
"learning_rate": 0.001, |
|
"loss": 1.3494, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"learning_rate": 0.001, |
|
"loss": 1.3982, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 5.57, |
|
"learning_rate": 0.001, |
|
"loss": 1.3294, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"learning_rate": 0.001, |
|
"loss": 1.404, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"learning_rate": 0.001, |
|
"loss": 1.371, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"learning_rate": 0.001, |
|
"loss": 1.3809, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"eval_loss": 2.4785053730010986, |
|
"eval_runtime": 78.2069, |
|
"eval_samples_per_second": 4.322, |
|
"eval_steps_per_second": 0.55, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 6.12, |
|
"learning_rate": 0.001, |
|
"loss": 1.0798, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 6.26, |
|
"learning_rate": 0.001, |
|
"loss": 1.0476, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"learning_rate": 0.001, |
|
"loss": 1.111, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 6.54, |
|
"learning_rate": 0.001, |
|
"loss": 1.0734, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 6.68, |
|
"learning_rate": 0.001, |
|
"loss": 1.0563, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 6.82, |
|
"learning_rate": 0.001, |
|
"loss": 1.1215, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 6.96, |
|
"learning_rate": 0.001, |
|
"loss": 1.1153, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 6.96, |
|
"eval_loss": 2.732642650604248, |
|
"eval_runtime": 78.0611, |
|
"eval_samples_per_second": 4.33, |
|
"eval_steps_per_second": 0.551, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"learning_rate": 0.001, |
|
"loss": 0.9032, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 7.23, |
|
"learning_rate": 0.001, |
|
"loss": 0.8517, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"learning_rate": 0.001, |
|
"loss": 0.8711, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 7.51, |
|
"learning_rate": 0.001, |
|
"loss": 0.8849, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 7.65, |
|
"learning_rate": 0.001, |
|
"loss": 0.898, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 7.79, |
|
"learning_rate": 0.001, |
|
"loss": 0.9153, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 7.93, |
|
"learning_rate": 0.001, |
|
"loss": 0.9129, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 2.9232447147369385, |
|
"eval_runtime": 78.4249, |
|
"eval_samples_per_second": 4.31, |
|
"eval_steps_per_second": 0.548, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 8.07, |
|
"learning_rate": 0.001, |
|
"loss": 0.7377, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 8.21, |
|
"learning_rate": 0.001, |
|
"loss": 0.6558, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 8.35, |
|
"learning_rate": 0.001, |
|
"loss": 0.7047, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 8.49, |
|
"learning_rate": 0.001, |
|
"loss": 0.7382, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 8.63, |
|
"learning_rate": 0.001, |
|
"loss": 0.6919, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 8.77, |
|
"learning_rate": 0.001, |
|
"loss": 0.7257, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 8.9, |
|
"learning_rate": 0.001, |
|
"loss": 0.7118, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 8.97, |
|
"eval_loss": 3.047579526901245, |
|
"eval_runtime": 78.168, |
|
"eval_samples_per_second": 4.324, |
|
"eval_steps_per_second": 0.55, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 9.04, |
|
"learning_rate": 0.001, |
|
"loss": 0.6401, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 9.18, |
|
"learning_rate": 0.001, |
|
"loss": 0.5032, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 9.32, |
|
"learning_rate": 0.001, |
|
"loss": 0.548, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"learning_rate": 0.001, |
|
"loss": 0.5218, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"learning_rate": 0.001, |
|
"loss": 0.5744, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"learning_rate": 0.001, |
|
"loss": 0.5883, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"eval_loss": 3.3644142150878906, |
|
"eval_runtime": 78.3345, |
|
"eval_samples_per_second": 4.315, |
|
"eval_steps_per_second": 0.549, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"step": 140, |
|
"total_flos": 2.447850236380324e+18, |
|
"train_loss": 1.5745136559009552, |
|
"train_runtime": 36489.2213, |
|
"train_samples_per_second": 1.007, |
|
"train_steps_per_second": 0.004 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 140, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 2.447850236380324e+18, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|