|
{ |
|
"best_metric": 0.10501641035079956, |
|
"best_model_checkpoint": "./vit-base-lcdoctypev1_session3/checkpoint-335", |
|
"epoch": 10.0, |
|
"eval_steps": 5, |
|
"global_step": 600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.08, |
|
"eval_accuracy": 0.9090909090909091, |
|
"eval_loss": 0.3158922493457794, |
|
"eval_runtime": 9.7781, |
|
"eval_samples_per_second": 12.375, |
|
"eval_steps_per_second": 1.636, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 6.890113353729248, |
|
"learning_rate": 0.00019666666666666666, |
|
"loss": 0.1798, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"eval_accuracy": 0.9338842975206612, |
|
"eval_loss": 0.22616656124591827, |
|
"eval_runtime": 8.5759, |
|
"eval_samples_per_second": 14.109, |
|
"eval_steps_per_second": 1.866, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_accuracy": 0.7768595041322314, |
|
"eval_loss": 0.9910252690315247, |
|
"eval_runtime": 9.2477, |
|
"eval_samples_per_second": 13.084, |
|
"eval_steps_per_second": 1.73, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.934413909912109, |
|
"learning_rate": 0.0001936666666666667, |
|
"loss": 0.3815, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"eval_accuracy": 0.9008264462809917, |
|
"eval_loss": 0.3035498857498169, |
|
"eval_runtime": 8.7721, |
|
"eval_samples_per_second": 13.794, |
|
"eval_steps_per_second": 1.824, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"eval_accuracy": 0.9338842975206612, |
|
"eval_loss": 0.21773552894592285, |
|
"eval_runtime": 9.0833, |
|
"eval_samples_per_second": 13.321, |
|
"eval_steps_per_second": 1.761, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.3238607347011566, |
|
"learning_rate": 0.00019033333333333334, |
|
"loss": 0.1429, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_accuracy": 0.8842975206611571, |
|
"eval_loss": 0.4908874034881592, |
|
"eval_runtime": 8.456, |
|
"eval_samples_per_second": 14.309, |
|
"eval_steps_per_second": 1.892, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"eval_accuracy": 0.9256198347107438, |
|
"eval_loss": 0.3096350133419037, |
|
"eval_runtime": 10.0423, |
|
"eval_samples_per_second": 12.049, |
|
"eval_steps_per_second": 1.593, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 5.55199670791626, |
|
"learning_rate": 0.00018700000000000002, |
|
"loss": 0.2424, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"eval_accuracy": 0.9173553719008265, |
|
"eval_loss": 0.32702526450157166, |
|
"eval_runtime": 8.357, |
|
"eval_samples_per_second": 14.479, |
|
"eval_steps_per_second": 1.915, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_accuracy": 0.9173553719008265, |
|
"eval_loss": 0.2554916441440582, |
|
"eval_runtime": 9.2174, |
|
"eval_samples_per_second": 13.127, |
|
"eval_steps_per_second": 1.736, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 14.463412284851074, |
|
"learning_rate": 0.00018366666666666667, |
|
"loss": 0.1172, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"eval_accuracy": 0.9173553719008265, |
|
"eval_loss": 0.2309066504240036, |
|
"eval_runtime": 9.1271, |
|
"eval_samples_per_second": 13.257, |
|
"eval_steps_per_second": 1.753, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"eval_accuracy": 0.9173553719008265, |
|
"eval_loss": 0.2952496409416199, |
|
"eval_runtime": 9.6618, |
|
"eval_samples_per_second": 12.523, |
|
"eval_steps_per_second": 1.656, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.9041020274162292, |
|
"learning_rate": 0.00018033333333333334, |
|
"loss": 0.1185, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.9173553719008265, |
|
"eval_loss": 0.295705646276474, |
|
"eval_runtime": 8.6323, |
|
"eval_samples_per_second": 14.017, |
|
"eval_steps_per_second": 1.854, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"eval_accuracy": 0.8925619834710744, |
|
"eval_loss": 0.3724129796028137, |
|
"eval_runtime": 8.5924, |
|
"eval_samples_per_second": 14.082, |
|
"eval_steps_per_second": 1.862, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 24.987598419189453, |
|
"learning_rate": 0.00017700000000000002, |
|
"loss": 0.1594, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"eval_accuracy": 0.8842975206611571, |
|
"eval_loss": 0.4216250777244568, |
|
"eval_runtime": 9.0834, |
|
"eval_samples_per_second": 13.321, |
|
"eval_steps_per_second": 1.761, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_accuracy": 0.9173553719008265, |
|
"eval_loss": 0.347516268491745, |
|
"eval_runtime": 8.9447, |
|
"eval_samples_per_second": 13.528, |
|
"eval_steps_per_second": 1.789, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.3783435523509979, |
|
"learning_rate": 0.00017366666666666667, |
|
"loss": 0.1231, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"eval_accuracy": 0.8925619834710744, |
|
"eval_loss": 0.323406845331192, |
|
"eval_runtime": 9.122, |
|
"eval_samples_per_second": 13.265, |
|
"eval_steps_per_second": 1.754, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"eval_accuracy": 0.8842975206611571, |
|
"eval_loss": 0.4309641718864441, |
|
"eval_runtime": 8.8847, |
|
"eval_samples_per_second": 13.619, |
|
"eval_steps_per_second": 1.801, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 5.4656147956848145, |
|
"learning_rate": 0.00017033333333333334, |
|
"loss": 0.0875, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_accuracy": 0.9256198347107438, |
|
"eval_loss": 0.3598105013370514, |
|
"eval_runtime": 9.086, |
|
"eval_samples_per_second": 13.317, |
|
"eval_steps_per_second": 1.761, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"eval_accuracy": 0.9256198347107438, |
|
"eval_loss": 0.3038037419319153, |
|
"eval_runtime": 8.8538, |
|
"eval_samples_per_second": 13.666, |
|
"eval_steps_per_second": 1.807, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.0762423649430275, |
|
"learning_rate": 0.000167, |
|
"loss": 0.0897, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"eval_accuracy": 0.9338842975206612, |
|
"eval_loss": 0.25987809896469116, |
|
"eval_runtime": 8.7997, |
|
"eval_samples_per_second": 13.751, |
|
"eval_steps_per_second": 1.818, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"eval_accuracy": 0.9586776859504132, |
|
"eval_loss": 0.1683536171913147, |
|
"eval_runtime": 8.0037, |
|
"eval_samples_per_second": 15.118, |
|
"eval_steps_per_second": 1.999, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 1.4948017597198486, |
|
"learning_rate": 0.00016366666666666667, |
|
"loss": 0.1797, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"eval_accuracy": 0.9504132231404959, |
|
"eval_loss": 0.1412244588136673, |
|
"eval_runtime": 8.3997, |
|
"eval_samples_per_second": 14.405, |
|
"eval_steps_per_second": 1.905, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"eval_accuracy": 0.9586776859504132, |
|
"eval_loss": 0.14531350135803223, |
|
"eval_runtime": 8.6407, |
|
"eval_samples_per_second": 14.003, |
|
"eval_steps_per_second": 1.852, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 6.408501148223877, |
|
"learning_rate": 0.00016033333333333335, |
|
"loss": 0.1178, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.8925619834710744, |
|
"eval_loss": 0.3830728530883789, |
|
"eval_runtime": 9.0064, |
|
"eval_samples_per_second": 13.435, |
|
"eval_steps_per_second": 1.777, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"eval_accuracy": 0.9090909090909091, |
|
"eval_loss": 0.3321413993835449, |
|
"eval_runtime": 8.8462, |
|
"eval_samples_per_second": 13.678, |
|
"eval_steps_per_second": 1.809, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 0.7101590633392334, |
|
"learning_rate": 0.00015700000000000002, |
|
"loss": 0.1969, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"eval_accuracy": 0.9090909090909091, |
|
"eval_loss": 0.25461918115615845, |
|
"eval_runtime": 8.969, |
|
"eval_samples_per_second": 13.491, |
|
"eval_steps_per_second": 1.784, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"eval_accuracy": 0.9504132231404959, |
|
"eval_loss": 0.18391890823841095, |
|
"eval_runtime": 8.644, |
|
"eval_samples_per_second": 13.998, |
|
"eval_steps_per_second": 1.851, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 0.9350224733352661, |
|
"learning_rate": 0.00015366666666666667, |
|
"loss": 0.0362, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"eval_accuracy": 0.9586776859504132, |
|
"eval_loss": 0.20266053080558777, |
|
"eval_runtime": 8.5546, |
|
"eval_samples_per_second": 14.144, |
|
"eval_steps_per_second": 1.87, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"eval_accuracy": 0.9090909090909091, |
|
"eval_loss": 0.28766530752182007, |
|
"eval_runtime": 8.0124, |
|
"eval_samples_per_second": 15.102, |
|
"eval_steps_per_second": 1.997, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 8.35146427154541, |
|
"learning_rate": 0.00015033333333333335, |
|
"loss": 0.1047, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_accuracy": 0.8925619834710744, |
|
"eval_loss": 0.4503512978553772, |
|
"eval_runtime": 8.454, |
|
"eval_samples_per_second": 14.313, |
|
"eval_steps_per_second": 1.893, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"eval_accuracy": 0.9504132231404959, |
|
"eval_loss": 0.1810603141784668, |
|
"eval_runtime": 9.2694, |
|
"eval_samples_per_second": 13.054, |
|
"eval_steps_per_second": 1.726, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 0.6059785485267639, |
|
"learning_rate": 0.000147, |
|
"loss": 0.1232, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"eval_accuracy": 0.9421487603305785, |
|
"eval_loss": 0.21074515581130981, |
|
"eval_runtime": 8.8489, |
|
"eval_samples_per_second": 13.674, |
|
"eval_steps_per_second": 1.808, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"eval_accuracy": 0.9504132231404959, |
|
"eval_loss": 0.20863419771194458, |
|
"eval_runtime": 8.9684, |
|
"eval_samples_per_second": 13.492, |
|
"eval_steps_per_second": 1.784, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 0.04269757494330406, |
|
"learning_rate": 0.00014366666666666667, |
|
"loss": 0.0611, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"eval_accuracy": 0.9338842975206612, |
|
"eval_loss": 0.2971450686454773, |
|
"eval_runtime": 9.1231, |
|
"eval_samples_per_second": 13.263, |
|
"eval_steps_per_second": 1.754, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"eval_accuracy": 0.9338842975206612, |
|
"eval_loss": 0.2731765806674957, |
|
"eval_runtime": 8.9974, |
|
"eval_samples_per_second": 13.448, |
|
"eval_steps_per_second": 1.778, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.9442762732505798, |
|
"learning_rate": 0.00014033333333333335, |
|
"loss": 0.0815, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.9586776859504132, |
|
"eval_loss": 0.16794723272323608, |
|
"eval_runtime": 8.9455, |
|
"eval_samples_per_second": 13.526, |
|
"eval_steps_per_second": 1.789, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"eval_accuracy": 0.9338842975206612, |
|
"eval_loss": 0.24155646562576294, |
|
"eval_runtime": 8.3613, |
|
"eval_samples_per_second": 14.471, |
|
"eval_steps_per_second": 1.914, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 2.08608078956604, |
|
"learning_rate": 0.00013700000000000002, |
|
"loss": 0.0469, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"eval_accuracy": 0.9256198347107438, |
|
"eval_loss": 0.29269692301750183, |
|
"eval_runtime": 8.9407, |
|
"eval_samples_per_second": 13.534, |
|
"eval_steps_per_second": 1.79, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"eval_accuracy": 0.9338842975206612, |
|
"eval_loss": 0.28314581513404846, |
|
"eval_runtime": 8.7918, |
|
"eval_samples_per_second": 13.763, |
|
"eval_steps_per_second": 1.82, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.8798501491546631, |
|
"learning_rate": 0.00013366666666666667, |
|
"loss": 0.0443, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"eval_accuracy": 0.9421487603305785, |
|
"eval_loss": 0.2744951546192169, |
|
"eval_runtime": 8.612, |
|
"eval_samples_per_second": 14.05, |
|
"eval_steps_per_second": 1.858, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"eval_accuracy": 0.8925619834710744, |
|
"eval_loss": 0.4193201959133148, |
|
"eval_runtime": 8.9147, |
|
"eval_samples_per_second": 13.573, |
|
"eval_steps_per_second": 1.795, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.03738969564437866, |
|
"learning_rate": 0.00013033333333333332, |
|
"loss": 0.0823, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"eval_accuracy": 0.9173553719008265, |
|
"eval_loss": 0.3746081292629242, |
|
"eval_runtime": 8.5854, |
|
"eval_samples_per_second": 14.094, |
|
"eval_steps_per_second": 1.864, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"eval_accuracy": 0.9421487603305785, |
|
"eval_loss": 0.30296453833580017, |
|
"eval_runtime": 8.8651, |
|
"eval_samples_per_second": 13.649, |
|
"eval_steps_per_second": 1.805, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 0.03203318268060684, |
|
"learning_rate": 0.000127, |
|
"loss": 0.0101, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"eval_accuracy": 0.9504132231404959, |
|
"eval_loss": 0.21464580297470093, |
|
"eval_runtime": 8.8029, |
|
"eval_samples_per_second": 13.745, |
|
"eval_steps_per_second": 1.818, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"eval_accuracy": 0.9421487603305785, |
|
"eval_loss": 0.2514008581638336, |
|
"eval_runtime": 9.2073, |
|
"eval_samples_per_second": 13.142, |
|
"eval_steps_per_second": 1.738, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.04610053077340126, |
|
"learning_rate": 0.00012366666666666667, |
|
"loss": 0.16, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"eval_accuracy": 0.9421487603305785, |
|
"eval_loss": 0.25517505407333374, |
|
"eval_runtime": 8.8885, |
|
"eval_samples_per_second": 13.613, |
|
"eval_steps_per_second": 1.8, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"eval_accuracy": 0.9421487603305785, |
|
"eval_loss": 0.22389596700668335, |
|
"eval_runtime": 8.5203, |
|
"eval_samples_per_second": 14.201, |
|
"eval_steps_per_second": 1.878, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 3.5699214935302734, |
|
"learning_rate": 0.00012033333333333335, |
|
"loss": 0.1687, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.9256198347107438, |
|
"eval_loss": 0.25712552666664124, |
|
"eval_runtime": 8.7329, |
|
"eval_samples_per_second": 13.856, |
|
"eval_steps_per_second": 1.832, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.13568438589572906, |
|
"eval_runtime": 8.642, |
|
"eval_samples_per_second": 14.001, |
|
"eval_steps_per_second": 1.851, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 2.446256637573242, |
|
"learning_rate": 0.000117, |
|
"loss": 0.0758, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"eval_accuracy": 0.9504132231404959, |
|
"eval_loss": 0.17341962456703186, |
|
"eval_runtime": 8.0872, |
|
"eval_samples_per_second": 14.962, |
|
"eval_steps_per_second": 1.978, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.11965644359588623, |
|
"eval_runtime": 9.0168, |
|
"eval_samples_per_second": 13.419, |
|
"eval_steps_per_second": 1.774, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 0.048665851354599, |
|
"learning_rate": 0.00011366666666666667, |
|
"loss": 0.042, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"eval_accuracy": 0.9421487603305785, |
|
"eval_loss": 0.23387375473976135, |
|
"eval_runtime": 8.7974, |
|
"eval_samples_per_second": 13.754, |
|
"eval_steps_per_second": 1.819, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"eval_accuracy": 0.9173553719008265, |
|
"eval_loss": 0.2923980951309204, |
|
"eval_runtime": 8.7236, |
|
"eval_samples_per_second": 13.87, |
|
"eval_steps_per_second": 1.834, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.07859846204519272, |
|
"learning_rate": 0.00011033333333333334, |
|
"loss": 0.0114, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"eval_accuracy": 0.9504132231404959, |
|
"eval_loss": 0.23183898627758026, |
|
"eval_runtime": 8.7284, |
|
"eval_samples_per_second": 13.863, |
|
"eval_steps_per_second": 1.833, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"eval_accuracy": 0.9586776859504132, |
|
"eval_loss": 0.17654092609882355, |
|
"eval_runtime": 8.8357, |
|
"eval_samples_per_second": 13.694, |
|
"eval_steps_per_second": 1.811, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 0.266812801361084, |
|
"learning_rate": 0.00010700000000000001, |
|
"loss": 0.0197, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.12631382048130035, |
|
"eval_runtime": 7.9448, |
|
"eval_samples_per_second": 15.23, |
|
"eval_steps_per_second": 2.014, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.12528358399868011, |
|
"eval_runtime": 8.1841, |
|
"eval_samples_per_second": 14.785, |
|
"eval_steps_per_second": 1.955, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 0.8625770807266235, |
|
"learning_rate": 0.00010366666666666666, |
|
"loss": 0.0283, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.1239379420876503, |
|
"eval_runtime": 9.1766, |
|
"eval_samples_per_second": 13.186, |
|
"eval_steps_per_second": 1.744, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.12782499194145203, |
|
"eval_runtime": 8.9807, |
|
"eval_samples_per_second": 13.473, |
|
"eval_steps_per_second": 1.782, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.08663175255060196, |
|
"learning_rate": 0.00010033333333333335, |
|
"loss": 0.1115, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.9338842975206612, |
|
"eval_loss": 0.2527827322483063, |
|
"eval_runtime": 8.5733, |
|
"eval_samples_per_second": 14.114, |
|
"eval_steps_per_second": 1.866, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 5.08, |
|
"eval_accuracy": 0.9338842975206612, |
|
"eval_loss": 0.3164093792438507, |
|
"eval_runtime": 8.5647, |
|
"eval_samples_per_second": 14.128, |
|
"eval_steps_per_second": 1.868, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 5.17, |
|
"grad_norm": 0.11408742517232895, |
|
"learning_rate": 9.7e-05, |
|
"loss": 0.0404, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 5.17, |
|
"eval_accuracy": 0.9338842975206612, |
|
"eval_loss": 0.2841833829879761, |
|
"eval_runtime": 8.7395, |
|
"eval_samples_per_second": 13.845, |
|
"eval_steps_per_second": 1.831, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"eval_accuracy": 0.9504132231404959, |
|
"eval_loss": 0.17133790254592896, |
|
"eval_runtime": 8.8895, |
|
"eval_samples_per_second": 13.612, |
|
"eval_steps_per_second": 1.8, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 5.33, |
|
"grad_norm": 0.08482904732227325, |
|
"learning_rate": 9.366666666666668e-05, |
|
"loss": 0.0719, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 5.33, |
|
"eval_accuracy": 0.9338842975206612, |
|
"eval_loss": 0.18959270417690277, |
|
"eval_runtime": 8.641, |
|
"eval_samples_per_second": 14.003, |
|
"eval_steps_per_second": 1.852, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 5.42, |
|
"eval_accuracy": 0.9256198347107438, |
|
"eval_loss": 0.18550463020801544, |
|
"eval_runtime": 8.219, |
|
"eval_samples_per_second": 14.722, |
|
"eval_steps_per_second": 1.947, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 4.354619026184082, |
|
"learning_rate": 9.033333333333334e-05, |
|
"loss": 0.0435, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.15409986674785614, |
|
"eval_runtime": 8.5474, |
|
"eval_samples_per_second": 14.156, |
|
"eval_steps_per_second": 1.872, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 5.58, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.10501641035079956, |
|
"eval_runtime": 8.6962, |
|
"eval_samples_per_second": 13.914, |
|
"eval_steps_per_second": 1.84, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 5.67, |
|
"grad_norm": 0.05306649208068848, |
|
"learning_rate": 8.7e-05, |
|
"loss": 0.0129, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 5.67, |
|
"eval_accuracy": 0.9586776859504132, |
|
"eval_loss": 0.10632016509771347, |
|
"eval_runtime": 8.7849, |
|
"eval_samples_per_second": 13.774, |
|
"eval_steps_per_second": 1.821, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"eval_accuracy": 0.9586776859504132, |
|
"eval_loss": 0.11378511786460876, |
|
"eval_runtime": 8.0973, |
|
"eval_samples_per_second": 14.943, |
|
"eval_steps_per_second": 1.976, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 5.83, |
|
"grad_norm": 0.02987060882151127, |
|
"learning_rate": 8.366666666666668e-05, |
|
"loss": 0.0222, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 5.83, |
|
"eval_accuracy": 0.9586776859504132, |
|
"eval_loss": 0.11444854736328125, |
|
"eval_runtime": 8.7513, |
|
"eval_samples_per_second": 13.827, |
|
"eval_steps_per_second": 1.828, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.12378235161304474, |
|
"eval_runtime": 9.2818, |
|
"eval_samples_per_second": 13.036, |
|
"eval_steps_per_second": 1.724, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.02307078428566456, |
|
"learning_rate": 8.033333333333334e-05, |
|
"loss": 0.0431, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.1342514306306839, |
|
"eval_runtime": 7.9783, |
|
"eval_samples_per_second": 15.166, |
|
"eval_steps_per_second": 2.005, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.144140362739563, |
|
"eval_runtime": 8.763, |
|
"eval_samples_per_second": 13.808, |
|
"eval_steps_per_second": 1.826, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 6.17, |
|
"grad_norm": 0.016979066655039787, |
|
"learning_rate": 7.7e-05, |
|
"loss": 0.0064, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 6.17, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.1470753401517868, |
|
"eval_runtime": 8.9895, |
|
"eval_samples_per_second": 13.46, |
|
"eval_steps_per_second": 1.78, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.1360587477684021, |
|
"eval_runtime": 8.9513, |
|
"eval_samples_per_second": 13.518, |
|
"eval_steps_per_second": 1.787, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 6.33, |
|
"grad_norm": 0.022434255108237267, |
|
"learning_rate": 7.366666666666668e-05, |
|
"loss": 0.0576, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 6.33, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.13161548972129822, |
|
"eval_runtime": 8.7711, |
|
"eval_samples_per_second": 13.795, |
|
"eval_steps_per_second": 1.824, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 6.42, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.12319940328598022, |
|
"eval_runtime": 8.3275, |
|
"eval_samples_per_second": 14.53, |
|
"eval_steps_per_second": 1.921, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.2556796371936798, |
|
"learning_rate": 7.033333333333334e-05, |
|
"loss": 0.0298, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.1254538893699646, |
|
"eval_runtime": 8.6034, |
|
"eval_samples_per_second": 14.064, |
|
"eval_steps_per_second": 1.86, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 6.58, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.13591305911540985, |
|
"eval_runtime": 8.8244, |
|
"eval_samples_per_second": 13.712, |
|
"eval_steps_per_second": 1.813, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"grad_norm": 0.03436155617237091, |
|
"learning_rate": 6.7e-05, |
|
"loss": 0.0097, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.1434980034828186, |
|
"eval_runtime": 9.1677, |
|
"eval_samples_per_second": 13.199, |
|
"eval_steps_per_second": 1.745, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.14506025612354279, |
|
"eval_runtime": 8.7551, |
|
"eval_samples_per_second": 13.82, |
|
"eval_steps_per_second": 1.827, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"grad_norm": 0.019292179495096207, |
|
"learning_rate": 6.366666666666668e-05, |
|
"loss": 0.0153, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.14391547441482544, |
|
"eval_runtime": 8.6401, |
|
"eval_samples_per_second": 14.004, |
|
"eval_steps_per_second": 1.852, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.1352916657924652, |
|
"eval_runtime": 8.9781, |
|
"eval_samples_per_second": 13.477, |
|
"eval_steps_per_second": 1.782, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.1764516532421112, |
|
"learning_rate": 6.033333333333334e-05, |
|
"loss": 0.0406, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.13157208263874054, |
|
"eval_runtime": 8.4013, |
|
"eval_samples_per_second": 14.402, |
|
"eval_steps_per_second": 1.904, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 7.08, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.13093091547489166, |
|
"eval_runtime": 9.2645, |
|
"eval_samples_per_second": 13.061, |
|
"eval_steps_per_second": 1.727, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 7.17, |
|
"grad_norm": 0.025451194494962692, |
|
"learning_rate": 5.6999999999999996e-05, |
|
"loss": 0.0154, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 7.17, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.13050581514835358, |
|
"eval_runtime": 8.9669, |
|
"eval_samples_per_second": 13.494, |
|
"eval_steps_per_second": 1.784, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.13096679747104645, |
|
"eval_runtime": 8.8412, |
|
"eval_samples_per_second": 13.686, |
|
"eval_steps_per_second": 1.81, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 7.33, |
|
"grad_norm": 0.01770775578916073, |
|
"learning_rate": 5.3666666666666666e-05, |
|
"loss": 0.0209, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 7.33, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.13012637197971344, |
|
"eval_runtime": 8.8578, |
|
"eval_samples_per_second": 13.66, |
|
"eval_steps_per_second": 1.806, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 7.42, |
|
"eval_accuracy": 0.9586776859504132, |
|
"eval_loss": 0.145931214094162, |
|
"eval_runtime": 8.6371, |
|
"eval_samples_per_second": 14.009, |
|
"eval_steps_per_second": 1.852, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 17.443572998046875, |
|
"learning_rate": 5.0333333333333335e-05, |
|
"loss": 0.0298, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"eval_accuracy": 0.9586776859504132, |
|
"eval_loss": 0.16629938781261444, |
|
"eval_runtime": 8.2752, |
|
"eval_samples_per_second": 14.622, |
|
"eval_steps_per_second": 1.933, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 7.58, |
|
"eval_accuracy": 0.9586776859504132, |
|
"eval_loss": 0.15594066679477692, |
|
"eval_runtime": 8.806, |
|
"eval_samples_per_second": 13.741, |
|
"eval_steps_per_second": 1.817, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 7.67, |
|
"grad_norm": 0.030463455244898796, |
|
"learning_rate": 4.7e-05, |
|
"loss": 0.0052, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 7.67, |
|
"eval_accuracy": 0.9586776859504132, |
|
"eval_loss": 0.15159618854522705, |
|
"eval_runtime": 8.9863, |
|
"eval_samples_per_second": 13.465, |
|
"eval_steps_per_second": 1.78, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"eval_accuracy": 0.9586776859504132, |
|
"eval_loss": 0.13964863121509552, |
|
"eval_runtime": 8.4469, |
|
"eval_samples_per_second": 14.325, |
|
"eval_steps_per_second": 1.894, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 7.83, |
|
"grad_norm": 0.11391649395227432, |
|
"learning_rate": 4.3666666666666666e-05, |
|
"loss": 0.0172, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 7.83, |
|
"eval_accuracy": 0.9586776859504132, |
|
"eval_loss": 0.13303633034229279, |
|
"eval_runtime": 8.9949, |
|
"eval_samples_per_second": 13.452, |
|
"eval_steps_per_second": 1.779, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 7.92, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.12355250120162964, |
|
"eval_runtime": 8.1109, |
|
"eval_samples_per_second": 14.918, |
|
"eval_steps_per_second": 1.973, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.059335533529520035, |
|
"learning_rate": 4.0333333333333336e-05, |
|
"loss": 0.0348, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.12098132073879242, |
|
"eval_runtime": 8.8837, |
|
"eval_samples_per_second": 13.621, |
|
"eval_steps_per_second": 1.801, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 8.08, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.11751802265644073, |
|
"eval_runtime": 9.8045, |
|
"eval_samples_per_second": 12.341, |
|
"eval_steps_per_second": 1.632, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 8.17, |
|
"grad_norm": 0.019469719380140305, |
|
"learning_rate": 3.7e-05, |
|
"loss": 0.0068, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 8.17, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.1185019463300705, |
|
"eval_runtime": 8.8757, |
|
"eval_samples_per_second": 13.633, |
|
"eval_steps_per_second": 1.803, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.12291049212217331, |
|
"eval_runtime": 8.9999, |
|
"eval_samples_per_second": 13.445, |
|
"eval_steps_per_second": 1.778, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 8.33, |
|
"grad_norm": 0.029128719121217728, |
|
"learning_rate": 3.366666666666667e-05, |
|
"loss": 0.0305, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 8.33, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.12297818809747696, |
|
"eval_runtime": 9.0782, |
|
"eval_samples_per_second": 13.329, |
|
"eval_steps_per_second": 1.762, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 8.42, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.12048203498125076, |
|
"eval_runtime": 8.2377, |
|
"eval_samples_per_second": 14.688, |
|
"eval_steps_per_second": 1.942, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.01591232791543007, |
|
"learning_rate": 3.0333333333333337e-05, |
|
"loss": 0.0154, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.11965296417474747, |
|
"eval_runtime": 9.3646, |
|
"eval_samples_per_second": 12.921, |
|
"eval_steps_per_second": 1.709, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 8.58, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.12173377722501755, |
|
"eval_runtime": 9.3963, |
|
"eval_samples_per_second": 12.877, |
|
"eval_steps_per_second": 1.703, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 8.67, |
|
"grad_norm": 0.02258380502462387, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 0.0177, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 8.67, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.12387024611234665, |
|
"eval_runtime": 9.1961, |
|
"eval_samples_per_second": 13.158, |
|
"eval_steps_per_second": 1.74, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.12437601387500763, |
|
"eval_runtime": 9.196, |
|
"eval_samples_per_second": 13.158, |
|
"eval_steps_per_second": 1.74, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 8.83, |
|
"grad_norm": 0.020360412076115608, |
|
"learning_rate": 2.3666666666666668e-05, |
|
"loss": 0.0123, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 8.83, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.12708893418312073, |
|
"eval_runtime": 8.8063, |
|
"eval_samples_per_second": 13.74, |
|
"eval_steps_per_second": 1.817, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.1300380975008011, |
|
"eval_runtime": 8.9605, |
|
"eval_samples_per_second": 13.504, |
|
"eval_steps_per_second": 1.786, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.6857224702835083, |
|
"learning_rate": 2.0333333333333334e-05, |
|
"loss": 0.0154, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.13137011229991913, |
|
"eval_runtime": 8.8274, |
|
"eval_samples_per_second": 13.707, |
|
"eval_steps_per_second": 1.813, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 9.08, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.1295723170042038, |
|
"eval_runtime": 8.8666, |
|
"eval_samples_per_second": 13.647, |
|
"eval_steps_per_second": 1.805, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 9.17, |
|
"grad_norm": 0.4675326943397522, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 0.0331, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 9.17, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.12511087954044342, |
|
"eval_runtime": 9.3211, |
|
"eval_samples_per_second": 12.981, |
|
"eval_steps_per_second": 1.717, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.12691423296928406, |
|
"eval_runtime": 9.1389, |
|
"eval_samples_per_second": 13.24, |
|
"eval_steps_per_second": 1.751, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 9.33, |
|
"grad_norm": 0.02136796899139881, |
|
"learning_rate": 1.3666666666666666e-05, |
|
"loss": 0.0196, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 9.33, |
|
"eval_accuracy": 0.9752066115702479, |
|
"eval_loss": 0.12836582958698273, |
|
"eval_runtime": 8.6274, |
|
"eval_samples_per_second": 14.025, |
|
"eval_steps_per_second": 1.855, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 9.42, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.1298026293516159, |
|
"eval_runtime": 8.3157, |
|
"eval_samples_per_second": 14.551, |
|
"eval_steps_per_second": 1.924, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"grad_norm": 0.014311849139630795, |
|
"learning_rate": 1.0333333333333333e-05, |
|
"loss": 0.0058, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.13134212791919708, |
|
"eval_runtime": 9.1247, |
|
"eval_samples_per_second": 13.261, |
|
"eval_steps_per_second": 1.753, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 9.58, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.13212385773658752, |
|
"eval_runtime": 8.6468, |
|
"eval_samples_per_second": 13.994, |
|
"eval_steps_per_second": 1.85, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 9.67, |
|
"grad_norm": 0.4894683361053467, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 0.012, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 9.67, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.1326775997877121, |
|
"eval_runtime": 9.2181, |
|
"eval_samples_per_second": 13.126, |
|
"eval_steps_per_second": 1.736, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.13264916837215424, |
|
"eval_runtime": 8.9299, |
|
"eval_samples_per_second": 13.55, |
|
"eval_steps_per_second": 1.792, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 9.83, |
|
"grad_norm": 0.015242637135088444, |
|
"learning_rate": 3.666666666666667e-06, |
|
"loss": 0.0081, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 9.83, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.13293592631816864, |
|
"eval_runtime": 9.2123, |
|
"eval_samples_per_second": 13.135, |
|
"eval_steps_per_second": 1.737, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 9.92, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.13364511728286743, |
|
"eval_runtime": 8.9976, |
|
"eval_samples_per_second": 13.448, |
|
"eval_steps_per_second": 1.778, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.014774391427636147, |
|
"learning_rate": 3.3333333333333335e-07, |
|
"loss": 0.0083, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.9669421487603306, |
|
"eval_loss": 0.1337580680847168, |
|
"eval_runtime": 8.6721, |
|
"eval_samples_per_second": 13.953, |
|
"eval_steps_per_second": 1.845, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 600, |
|
"total_flos": 7.39286832673751e+17, |
|
"train_loss": 0.06743512197087208, |
|
"train_runtime": 3546.8991, |
|
"train_samples_per_second": 2.69, |
|
"train_steps_per_second": 0.169 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 5, |
|
"total_flos": 7.39286832673751e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|