|
{ |
|
"best_metric": 1.0, |
|
"best_model_checkpoint": "vit-base-patch16-224-dmae-va-U3-40A/checkpoint-119", |
|
"epoch": 40.0, |
|
"eval_steps": 500, |
|
"global_step": 280, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.37373737373737376, |
|
"eval_loss": 1.329177737236023, |
|
"eval_runtime": 1.2424, |
|
"eval_samples_per_second": 79.683, |
|
"eval_steps_per_second": 3.22, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 3.085270404815674, |
|
"learning_rate": 2.1428571428571428e-05, |
|
"loss": 1.3407, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.5151515151515151, |
|
"eval_loss": 1.107913851737976, |
|
"eval_runtime": 1.2457, |
|
"eval_samples_per_second": 79.472, |
|
"eval_steps_per_second": 3.211, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.6262626262626263, |
|
"eval_loss": 0.8917983770370483, |
|
"eval_runtime": 1.2574, |
|
"eval_samples_per_second": 78.737, |
|
"eval_steps_per_second": 3.181, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 3.550651788711548, |
|
"learning_rate": 4.2857142857142856e-05, |
|
"loss": 0.9919, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.7878787878787878, |
|
"eval_loss": 0.6446935534477234, |
|
"eval_runtime": 1.2679, |
|
"eval_samples_per_second": 78.083, |
|
"eval_steps_per_second": 3.155, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.8282828282828283, |
|
"eval_loss": 0.45023515820503235, |
|
"eval_runtime": 1.4064, |
|
"eval_samples_per_second": 70.393, |
|
"eval_steps_per_second": 2.844, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 5.14, |
|
"grad_norm": 3.4338462352752686, |
|
"learning_rate": 4.841269841269841e-05, |
|
"loss": 0.5761, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.9191919191919192, |
|
"eval_loss": 0.27204275131225586, |
|
"eval_runtime": 1.4909, |
|
"eval_samples_per_second": 66.403, |
|
"eval_steps_per_second": 2.683, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 6.86, |
|
"grad_norm": 1.6468698978424072, |
|
"learning_rate": 4.603174603174603e-05, |
|
"loss": 0.3111, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.9292929292929293, |
|
"eval_loss": 0.23024092614650726, |
|
"eval_runtime": 1.3066, |
|
"eval_samples_per_second": 75.766, |
|
"eval_steps_per_second": 3.061, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.9494949494949495, |
|
"eval_loss": 0.16501076519489288, |
|
"eval_runtime": 1.3125, |
|
"eval_samples_per_second": 75.428, |
|
"eval_steps_per_second": 3.048, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 8.57, |
|
"grad_norm": 1.7793350219726562, |
|
"learning_rate": 4.3650793650793655e-05, |
|
"loss": 0.204, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.9494949494949495, |
|
"eval_loss": 0.1503186672925949, |
|
"eval_runtime": 1.316, |
|
"eval_samples_per_second": 75.227, |
|
"eval_steps_per_second": 3.039, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.9797979797979798, |
|
"eval_loss": 0.08136877417564392, |
|
"eval_runtime": 1.3015, |
|
"eval_samples_per_second": 76.065, |
|
"eval_steps_per_second": 3.073, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 10.29, |
|
"grad_norm": 1.3909555673599243, |
|
"learning_rate": 4.126984126984127e-05, |
|
"loss": 0.1518, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.9797979797979798, |
|
"eval_loss": 0.06037978082895279, |
|
"eval_runtime": 1.2829, |
|
"eval_samples_per_second": 77.171, |
|
"eval_steps_per_second": 3.118, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 1.9186725616455078, |
|
"learning_rate": 3.888888888888889e-05, |
|
"loss": 0.1272, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.9494949494949495, |
|
"eval_loss": 0.1265028864145279, |
|
"eval_runtime": 1.2863, |
|
"eval_samples_per_second": 76.965, |
|
"eval_steps_per_second": 3.11, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.9797979797979798, |
|
"eval_loss": 0.05176503211259842, |
|
"eval_runtime": 1.3002, |
|
"eval_samples_per_second": 76.143, |
|
"eval_steps_per_second": 3.076, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 13.71, |
|
"grad_norm": 1.4530500173568726, |
|
"learning_rate": 3.650793650793651e-05, |
|
"loss": 0.1379, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.98989898989899, |
|
"eval_loss": 0.044787079095840454, |
|
"eval_runtime": 1.303, |
|
"eval_samples_per_second": 75.979, |
|
"eval_steps_per_second": 3.07, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.98989898989899, |
|
"eval_loss": 0.03611420467495918, |
|
"eval_runtime": 1.309, |
|
"eval_samples_per_second": 75.628, |
|
"eval_steps_per_second": 3.056, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 15.43, |
|
"grad_norm": 1.3324140310287476, |
|
"learning_rate": 3.412698412698413e-05, |
|
"loss": 0.092, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.98989898989899, |
|
"eval_loss": 0.032215822488069534, |
|
"eval_runtime": 1.3002, |
|
"eval_samples_per_second": 76.141, |
|
"eval_steps_per_second": 3.076, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.021263618022203445, |
|
"eval_runtime": 1.307, |
|
"eval_samples_per_second": 75.748, |
|
"eval_steps_per_second": 3.061, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 17.14, |
|
"grad_norm": 2.2976629734039307, |
|
"learning_rate": 3.1746031746031745e-05, |
|
"loss": 0.0762, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.98989898989899, |
|
"eval_loss": 0.04688708856701851, |
|
"eval_runtime": 1.315, |
|
"eval_samples_per_second": 75.288, |
|
"eval_steps_per_second": 3.042, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 18.86, |
|
"grad_norm": 2.308321714401245, |
|
"learning_rate": 2.9365079365079366e-05, |
|
"loss": 0.0954, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.98989898989899, |
|
"eval_loss": 0.061472997069358826, |
|
"eval_runtime": 1.3031, |
|
"eval_samples_per_second": 75.971, |
|
"eval_steps_per_second": 3.07, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.98989898989899, |
|
"eval_loss": 0.03133073449134827, |
|
"eval_runtime": 1.2993, |
|
"eval_samples_per_second": 76.194, |
|
"eval_steps_per_second": 3.079, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 20.57, |
|
"grad_norm": 1.274895191192627, |
|
"learning_rate": 2.6984126984126984e-05, |
|
"loss": 0.0795, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_accuracy": 0.98989898989899, |
|
"eval_loss": 0.0380566343665123, |
|
"eval_runtime": 1.3136, |
|
"eval_samples_per_second": 75.368, |
|
"eval_steps_per_second": 3.045, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.013774486258625984, |
|
"eval_runtime": 1.309, |
|
"eval_samples_per_second": 75.628, |
|
"eval_steps_per_second": 3.056, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 22.29, |
|
"grad_norm": 1.425993800163269, |
|
"learning_rate": 2.4603174603174602e-05, |
|
"loss": 0.077, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.01703532226383686, |
|
"eval_runtime": 1.3122, |
|
"eval_samples_per_second": 75.446, |
|
"eval_steps_per_second": 3.048, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 0.986053466796875, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 0.0675, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.010675261728465557, |
|
"eval_runtime": 1.3159, |
|
"eval_samples_per_second": 75.231, |
|
"eval_steps_per_second": 3.04, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_accuracy": 0.98989898989899, |
|
"eval_loss": 0.019328022375702858, |
|
"eval_runtime": 1.3069, |
|
"eval_samples_per_second": 75.752, |
|
"eval_steps_per_second": 3.061, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 25.71, |
|
"grad_norm": 2.7184066772460938, |
|
"learning_rate": 1.984126984126984e-05, |
|
"loss": 0.0659, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.98989898989899, |
|
"eval_loss": 0.025486120954155922, |
|
"eval_runtime": 1.3084, |
|
"eval_samples_per_second": 75.667, |
|
"eval_steps_per_second": 3.057, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_accuracy": 0.98989898989899, |
|
"eval_loss": 0.020136240869760513, |
|
"eval_runtime": 1.3096, |
|
"eval_samples_per_second": 75.596, |
|
"eval_steps_per_second": 3.054, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 27.43, |
|
"grad_norm": 2.4670774936676025, |
|
"learning_rate": 1.746031746031746e-05, |
|
"loss": 0.0758, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.98989898989899, |
|
"eval_loss": 0.03251149132847786, |
|
"eval_runtime": 1.2895, |
|
"eval_samples_per_second": 76.775, |
|
"eval_steps_per_second": 3.102, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.011012612842023373, |
|
"eval_runtime": 1.3105, |
|
"eval_samples_per_second": 75.542, |
|
"eval_steps_per_second": 3.052, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 29.14, |
|
"grad_norm": 1.395484209060669, |
|
"learning_rate": 1.5079365079365079e-05, |
|
"loss": 0.0589, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.0159281175583601, |
|
"eval_runtime": 1.3316, |
|
"eval_samples_per_second": 74.346, |
|
"eval_steps_per_second": 3.004, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 30.86, |
|
"grad_norm": 1.4966486692428589, |
|
"learning_rate": 1.2698412698412699e-05, |
|
"loss": 0.0521, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_accuracy": 0.98989898989899, |
|
"eval_loss": 0.031857237219810486, |
|
"eval_runtime": 1.2919, |
|
"eval_samples_per_second": 76.629, |
|
"eval_steps_per_second": 3.096, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 0.9797979797979798, |
|
"eval_loss": 0.029438314959406853, |
|
"eval_runtime": 1.3011, |
|
"eval_samples_per_second": 76.091, |
|
"eval_steps_per_second": 3.074, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 32.57, |
|
"grad_norm": 1.3882635831832886, |
|
"learning_rate": 1.0317460317460318e-05, |
|
"loss": 0.0618, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_accuracy": 0.9797979797979798, |
|
"eval_loss": 0.039191748946905136, |
|
"eval_runtime": 1.3063, |
|
"eval_samples_per_second": 75.788, |
|
"eval_steps_per_second": 3.062, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_accuracy": 0.98989898989899, |
|
"eval_loss": 0.026867415755987167, |
|
"eval_runtime": 1.2944, |
|
"eval_samples_per_second": 76.481, |
|
"eval_steps_per_second": 3.09, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 34.29, |
|
"grad_norm": 1.203762412071228, |
|
"learning_rate": 7.936507936507936e-06, |
|
"loss": 0.0422, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_accuracy": 0.98989898989899, |
|
"eval_loss": 0.021003253757953644, |
|
"eval_runtime": 1.3003, |
|
"eval_samples_per_second": 76.137, |
|
"eval_steps_per_second": 3.076, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"grad_norm": 1.9063193798065186, |
|
"learning_rate": 5.555555555555556e-06, |
|
"loss": 0.0551, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_accuracy": 0.98989898989899, |
|
"eval_loss": 0.01777764968574047, |
|
"eval_runtime": 1.456, |
|
"eval_samples_per_second": 67.993, |
|
"eval_steps_per_second": 2.747, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_accuracy": 0.98989898989899, |
|
"eval_loss": 0.01593274623155594, |
|
"eval_runtime": 1.3158, |
|
"eval_samples_per_second": 75.242, |
|
"eval_steps_per_second": 3.04, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 37.71, |
|
"grad_norm": 1.4439316987991333, |
|
"learning_rate": 3.1746031746031746e-06, |
|
"loss": 0.0518, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_accuracy": 0.98989898989899, |
|
"eval_loss": 0.012379774823784828, |
|
"eval_runtime": 1.3454, |
|
"eval_samples_per_second": 73.583, |
|
"eval_steps_per_second": 2.973, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.011175006628036499, |
|
"eval_runtime": 1.4492, |
|
"eval_samples_per_second": 68.315, |
|
"eval_steps_per_second": 2.76, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 39.43, |
|
"grad_norm": 1.1414995193481445, |
|
"learning_rate": 7.936507936507937e-07, |
|
"loss": 0.0313, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.01096320990473032, |
|
"eval_runtime": 1.3095, |
|
"eval_samples_per_second": 75.6, |
|
"eval_steps_per_second": 3.055, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"step": 280, |
|
"total_flos": 2.7494650758139085e+18, |
|
"train_loss": 0.2073148890797581, |
|
"train_runtime": 1492.3317, |
|
"train_samples_per_second": 23.775, |
|
"train_steps_per_second": 0.188 |
|
} |
|
], |
|
"logging_steps": 12, |
|
"max_steps": 280, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 40, |
|
"save_steps": 500, |
|
"total_flos": 2.7494650758139085e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|