|
{ |
|
"best_metric": 0.605766236782074, |
|
"best_model_checkpoint": "./vit-base-cats/checkpoint-500", |
|
"epoch": 2.857142857142857, |
|
"eval_steps": 100, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 104665.1796875, |
|
"learning_rate": 0.00019714285714285716, |
|
"loss": 2.8888, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 102043.53125, |
|
"learning_rate": 0.0001942857142857143, |
|
"loss": 2.6146, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 97863.03125, |
|
"learning_rate": 0.00019142857142857145, |
|
"loss": 2.3775, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 98961.4453125, |
|
"learning_rate": 0.00018857142857142857, |
|
"loss": 2.0395, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 142313.515625, |
|
"learning_rate": 0.00018571428571428572, |
|
"loss": 1.8599, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 112630.1328125, |
|
"learning_rate": 0.00018285714285714286, |
|
"loss": 1.7573, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 180249.9375, |
|
"learning_rate": 0.00018, |
|
"loss": 1.5562, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 124194.484375, |
|
"learning_rate": 0.00017714285714285713, |
|
"loss": 1.4203, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 187150.671875, |
|
"learning_rate": 0.0001742857142857143, |
|
"loss": 1.2605, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 151804.65625, |
|
"learning_rate": 0.00017142857142857143, |
|
"loss": 1.2422, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"eval_accuracy": 0.7314285714285714, |
|
"eval_loss": 1.1657222509384155, |
|
"eval_runtime": 15.2094, |
|
"eval_samples_per_second": 46.024, |
|
"eval_steps_per_second": 2.893, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 134052.609375, |
|
"learning_rate": 0.00016857142857142857, |
|
"loss": 1.1438, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 163657.515625, |
|
"learning_rate": 0.00016571428571428575, |
|
"loss": 1.2158, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 159906.9375, |
|
"learning_rate": 0.00016285714285714287, |
|
"loss": 1.0575, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 171710.640625, |
|
"learning_rate": 0.00016, |
|
"loss": 1.0049, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 171094.046875, |
|
"learning_rate": 0.00015714285714285716, |
|
"loss": 1.0786, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 140285.921875, |
|
"learning_rate": 0.0001542857142857143, |
|
"loss": 0.9912, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 170517.640625, |
|
"learning_rate": 0.00015142857142857143, |
|
"loss": 0.9448, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 125416.1484375, |
|
"learning_rate": 0.00014857142857142857, |
|
"loss": 0.826, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 205800.3125, |
|
"learning_rate": 0.00014571428571428572, |
|
"loss": 0.7471, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 74866.34375, |
|
"learning_rate": 0.00014285714285714287, |
|
"loss": 0.5948, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"eval_accuracy": 0.8, |
|
"eval_loss": 0.7744152545928955, |
|
"eval_runtime": 11.8882, |
|
"eval_samples_per_second": 58.882, |
|
"eval_steps_per_second": 3.701, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 191153.328125, |
|
"learning_rate": 0.00014, |
|
"loss": 0.5425, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 181985.578125, |
|
"learning_rate": 0.00013714285714285716, |
|
"loss": 0.6606, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 193704.78125, |
|
"learning_rate": 0.00013428571428571428, |
|
"loss": 0.5681, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 135817.890625, |
|
"learning_rate": 0.00013142857142857143, |
|
"loss": 0.5497, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 154213.90625, |
|
"learning_rate": 0.00012857142857142858, |
|
"loss": 0.562, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 132152.8125, |
|
"learning_rate": 0.00012571428571428572, |
|
"loss": 0.5728, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 207587.0625, |
|
"learning_rate": 0.00012285714285714287, |
|
"loss": 0.6254, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 98314.2109375, |
|
"learning_rate": 0.00012, |
|
"loss": 0.4033, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 107417.6328125, |
|
"learning_rate": 0.00011714285714285715, |
|
"loss": 0.4892, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 250248.421875, |
|
"learning_rate": 0.00011428571428571428, |
|
"loss": 0.5324, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"eval_accuracy": 0.8128571428571428, |
|
"eval_loss": 0.6736084818840027, |
|
"eval_runtime": 13.3903, |
|
"eval_samples_per_second": 52.277, |
|
"eval_steps_per_second": 3.286, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 165674.15625, |
|
"learning_rate": 0.00011142857142857144, |
|
"loss": 0.4641, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 194581.40625, |
|
"learning_rate": 0.00010857142857142856, |
|
"loss": 0.4818, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 138774.1875, |
|
"learning_rate": 0.00010571428571428572, |
|
"loss": 0.4677, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 203262.421875, |
|
"learning_rate": 0.00010285714285714286, |
|
"loss": 0.4034, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 128663.1875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3978, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 35406.80078125, |
|
"learning_rate": 9.714285714285715e-05, |
|
"loss": 0.2415, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 128107.3046875, |
|
"learning_rate": 9.428571428571429e-05, |
|
"loss": 0.2798, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 52566.515625, |
|
"learning_rate": 9.142857142857143e-05, |
|
"loss": 0.2742, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 49925.12890625, |
|
"learning_rate": 8.857142857142857e-05, |
|
"loss": 0.2293, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 175336.984375, |
|
"learning_rate": 8.571428571428571e-05, |
|
"loss": 0.212, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"eval_accuracy": 0.8085714285714286, |
|
"eval_loss": 0.7056272625923157, |
|
"eval_runtime": 11.5847, |
|
"eval_samples_per_second": 60.424, |
|
"eval_steps_per_second": 3.798, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 24206.984375, |
|
"learning_rate": 8.285714285714287e-05, |
|
"loss": 0.2311, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 124311.4140625, |
|
"learning_rate": 8e-05, |
|
"loss": 0.2342, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 68467.265625, |
|
"learning_rate": 7.714285714285715e-05, |
|
"loss": 0.2055, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 104893.53125, |
|
"learning_rate": 7.428571428571429e-05, |
|
"loss": 0.1814, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 76138.921875, |
|
"learning_rate": 7.142857142857143e-05, |
|
"loss": 0.2127, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 138337.28125, |
|
"learning_rate": 6.857142857142858e-05, |
|
"loss": 0.1863, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 47311.3203125, |
|
"learning_rate": 6.571428571428571e-05, |
|
"loss": 0.2281, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 107393.203125, |
|
"learning_rate": 6.285714285714286e-05, |
|
"loss": 0.1875, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 167001.0, |
|
"learning_rate": 6e-05, |
|
"loss": 0.1896, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 90490.375, |
|
"learning_rate": 5.714285714285714e-05, |
|
"loss": 0.2158, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"eval_accuracy": 0.82, |
|
"eval_loss": 0.605766236782074, |
|
"eval_runtime": 11.55, |
|
"eval_samples_per_second": 60.606, |
|
"eval_steps_per_second": 3.81, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 700, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"total_flos": 1.240071860256768e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|