{
  "best_metric": 0.9716312056737588,
  "best_model_checkpoint": "./results/checkpoint-3807",
  "epoch": 70.0,
  "eval_steps": 500,
  "global_step": 5670,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "grad_norm": 6.570446014404297,
      "learning_rate": 1.9728395061728395e-05,
      "loss": 2.6389,
      "step": 81
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.475177304964539,
      "eval_loss": 0.7098350524902344,
      "eval_runtime": 0.3441,
      "eval_samples_per_second": 819.478,
      "eval_steps_per_second": 52.307,
      "step": 81
    },
    {
      "epoch": 2.0,
      "grad_norm": 4.623419284820557,
      "learning_rate": 1.944268077601411e-05,
      "loss": 0.6477,
      "step": 162
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.48226950354609927,
      "eval_loss": 0.7516428828239441,
      "eval_runtime": 0.3434,
      "eval_samples_per_second": 821.083,
      "eval_steps_per_second": 52.41,
      "step": 162
    },
    {
      "epoch": 3.0,
      "grad_norm": 10.926794052124023,
      "learning_rate": 1.9156966490299824e-05,
      "loss": 0.6227,
      "step": 243
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.4929078014184397,
      "eval_loss": 0.8317187428474426,
      "eval_runtime": 0.3439,
      "eval_samples_per_second": 820.006,
      "eval_steps_per_second": 52.341,
      "step": 243
    },
    {
      "epoch": 4.0,
      "grad_norm": 12.648384094238281,
      "learning_rate": 1.887125220458554e-05,
      "loss": 0.5403,
      "step": 324
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.4929078014184397,
      "eval_loss": 1.9380121231079102,
      "eval_runtime": 0.3424,
      "eval_samples_per_second": 823.561,
      "eval_steps_per_second": 52.568,
      "step": 324
    },
    {
      "epoch": 5.0,
      "grad_norm": 23.567258834838867,
      "learning_rate": 1.8585537918871256e-05,
      "loss": 0.5108,
      "step": 405
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.49645390070921985,
      "eval_loss": 2.270359754562378,
      "eval_runtime": 0.3437,
      "eval_samples_per_second": 820.538,
      "eval_steps_per_second": 52.375,
      "step": 405
    },
    {
      "epoch": 6.0,
      "grad_norm": 3.5719075202941895,
      "learning_rate": 1.830335097001764e-05,
      "loss": 0.4677,
      "step": 486
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.48936170212765956,
      "eval_loss": 1.6858181953430176,
      "eval_runtime": 0.3432,
      "eval_samples_per_second": 821.693,
      "eval_steps_per_second": 52.449,
      "step": 486
    },
    {
      "epoch": 7.0,
      "grad_norm": 7.08165168762207,
      "learning_rate": 1.8017636684303353e-05,
      "loss": 0.4798,
      "step": 567
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.49645390070921985,
      "eval_loss": 1.623734712600708,
      "eval_runtime": 0.3436,
      "eval_samples_per_second": 820.682,
      "eval_steps_per_second": 52.384,
      "step": 567
    },
    {
      "epoch": 8.0,
      "grad_norm": 10.894269943237305,
      "learning_rate": 1.773192239858907e-05,
      "loss": 0.4817,
      "step": 648
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.5141843971631206,
      "eval_loss": 1.3935478925704956,
      "eval_runtime": 0.3435,
      "eval_samples_per_second": 821.029,
      "eval_steps_per_second": 52.406,
      "step": 648
    },
    {
      "epoch": 9.0,
      "grad_norm": 7.739453315734863,
      "learning_rate": 1.744620811287478e-05,
      "loss": 0.4668,
      "step": 729
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.5177304964539007,
      "eval_loss": 1.259345531463623,
      "eval_runtime": 0.343,
      "eval_samples_per_second": 822.053,
      "eval_steps_per_second": 52.471,
      "step": 729
    },
    {
      "epoch": 10.0,
      "grad_norm": 17.012800216674805,
      "learning_rate": 1.7160493827160498e-05,
      "loss": 0.4359,
      "step": 810
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.5354609929078015,
      "eval_loss": 1.310729742050171,
      "eval_runtime": 0.3436,
      "eval_samples_per_second": 820.694,
      "eval_steps_per_second": 52.385,
      "step": 810
    },
    {
      "epoch": 11.0,
      "grad_norm": 1.6642764806747437,
      "learning_rate": 1.687477954144621e-05,
      "loss": 0.3956,
      "step": 891
    },
    {
      "epoch": 11.0,
      "eval_accuracy": 0.8226950354609929,
      "eval_loss": 0.43421775102615356,
      "eval_runtime": 0.3435,
      "eval_samples_per_second": 820.982,
      "eval_steps_per_second": 52.403,
      "step": 891
    },
    {
      "epoch": 12.0,
      "grad_norm": 0.3688388168811798,
      "learning_rate": 1.6589065255731923e-05,
      "loss": 0.2906,
      "step": 972
    },
    {
      "epoch": 12.0,
      "eval_accuracy": 0.9290780141843972,
      "eval_loss": 0.23947754502296448,
      "eval_runtime": 0.3442,
      "eval_samples_per_second": 819.25,
      "eval_steps_per_second": 52.293,
      "step": 972
    },
    {
      "epoch": 13.0,
      "grad_norm": 37.02349853515625,
      "learning_rate": 1.630335097001764e-05,
      "loss": 0.2146,
      "step": 1053
    },
    {
      "epoch": 13.0,
      "eval_accuracy": 0.9397163120567376,
      "eval_loss": 0.33284759521484375,
      "eval_runtime": 0.3437,
      "eval_samples_per_second": 820.462,
      "eval_steps_per_second": 52.37,
      "step": 1053
    },
    {
      "epoch": 14.0,
      "grad_norm": 0.11939908564090729,
      "learning_rate": 1.601763668430335e-05,
      "loss": 0.1462,
      "step": 1134
    },
    {
      "epoch": 14.0,
      "eval_accuracy": 0.950354609929078,
      "eval_loss": 0.3009294867515564,
      "eval_runtime": 0.3439,
      "eval_samples_per_second": 819.984,
      "eval_steps_per_second": 52.339,
      "step": 1134
    },
    {
      "epoch": 15.0,
      "grad_norm": 0.08733003586530685,
      "learning_rate": 1.5731922398589064e-05,
      "loss": 0.1062,
      "step": 1215
    },
    {
      "epoch": 15.0,
      "eval_accuracy": 0.9290780141843972,
      "eval_loss": 0.21407951414585114,
      "eval_runtime": 0.3436,
      "eval_samples_per_second": 820.805,
      "eval_steps_per_second": 52.392,
      "step": 1215
    },
    {
      "epoch": 16.0,
      "grad_norm": 0.21886540949344635,
      "learning_rate": 1.544620811287478e-05,
      "loss": 0.0813,
      "step": 1296
    },
    {
      "epoch": 16.0,
      "eval_accuracy": 0.9432624113475178,
      "eval_loss": 0.34917283058166504,
      "eval_runtime": 0.344,
      "eval_samples_per_second": 819.711,
      "eval_steps_per_second": 52.322,
      "step": 1296
    },
    {
      "epoch": 17.0,
      "grad_norm": 0.5847246646881104,
      "learning_rate": 1.5160493827160495e-05,
      "loss": 0.1027,
      "step": 1377
    },
    {
      "epoch": 17.0,
      "eval_accuracy": 0.9219858156028369,
      "eval_loss": 0.3432806432247162,
      "eval_runtime": 0.3446,
      "eval_samples_per_second": 818.425,
      "eval_steps_per_second": 52.24,
      "step": 1377
    },
    {
      "epoch": 18.0,
      "grad_norm": 0.6198065280914307,
      "learning_rate": 1.4874779541446209e-05,
      "loss": 0.0736,
      "step": 1458
    },
    {
      "epoch": 18.0,
      "eval_accuracy": 0.9539007092198581,
      "eval_loss": 0.27183273434638977,
      "eval_runtime": 0.3437,
      "eval_samples_per_second": 820.405,
      "eval_steps_per_second": 52.366,
      "step": 1458
    },
    {
      "epoch": 19.0,
      "grad_norm": 0.5257266163825989,
      "learning_rate": 1.4589065255731925e-05,
      "loss": 0.0684,
      "step": 1539
    },
    {
      "epoch": 19.0,
      "eval_accuracy": 0.9645390070921985,
      "eval_loss": 0.25684282183647156,
      "eval_runtime": 0.3434,
      "eval_samples_per_second": 821.157,
      "eval_steps_per_second": 52.414,
      "step": 1539
    },
    {
      "epoch": 20.0,
      "grad_norm": 0.0009818405378609896,
      "learning_rate": 1.4303350970017638e-05,
      "loss": 0.0779,
      "step": 1620
    },
    {
      "epoch": 20.0,
      "eval_accuracy": 0.9609929078014184,
      "eval_loss": 0.2152564525604248,
      "eval_runtime": 0.3431,
      "eval_samples_per_second": 821.93,
      "eval_steps_per_second": 52.464,
      "step": 1620
    },
    {
      "epoch": 21.0,
      "grad_norm": 0.4532203674316406,
      "learning_rate": 1.4021164021164022e-05,
      "loss": 0.0745,
      "step": 1701
    },
    {
      "epoch": 21.0,
      "eval_accuracy": 0.9645390070921985,
      "eval_loss": 0.1914406418800354,
      "eval_runtime": 0.344,
      "eval_samples_per_second": 819.813,
      "eval_steps_per_second": 52.329,
      "step": 1701
    },
    {
      "epoch": 22.0,
      "grad_norm": 17.428327560424805,
      "learning_rate": 1.3735449735449738e-05,
      "loss": 0.1106,
      "step": 1782
    },
    {
      "epoch": 22.0,
      "eval_accuracy": 0.9574468085106383,
      "eval_loss": 0.2807099223136902,
      "eval_runtime": 0.3441,
      "eval_samples_per_second": 819.457,
      "eval_steps_per_second": 52.306,
      "step": 1782
    },
    {
      "epoch": 23.0,
      "grad_norm": 0.00047796443686820567,
      "learning_rate": 1.344973544973545e-05,
      "loss": 0.0755,
      "step": 1863
    },
    {
      "epoch": 23.0,
      "eval_accuracy": 0.9539007092198581,
      "eval_loss": 0.331978976726532,
      "eval_runtime": 0.3453,
      "eval_samples_per_second": 816.672,
      "eval_steps_per_second": 52.128,
      "step": 1863
    },
    {
      "epoch": 24.0,
      "grad_norm": 1.006925106048584,
      "learning_rate": 1.3164021164021166e-05,
      "loss": 0.0833,
      "step": 1944
    },
    {
      "epoch": 24.0,
      "eval_accuracy": 0.9539007092198581,
      "eval_loss": 0.34625303745269775,
      "eval_runtime": 0.3436,
      "eval_samples_per_second": 820.661,
      "eval_steps_per_second": 52.383,
      "step": 1944
    },
    {
      "epoch": 25.0,
      "grad_norm": 0.506279706954956,
      "learning_rate": 1.288183421516755e-05,
      "loss": 0.0754,
      "step": 2025
    },
    {
      "epoch": 25.0,
      "eval_accuracy": 0.9432624113475178,
      "eval_loss": 0.34365448355674744,
      "eval_runtime": 0.3432,
      "eval_samples_per_second": 821.691,
      "eval_steps_per_second": 52.448,
      "step": 2025
    },
    {
      "epoch": 26.0,
      "grad_norm": 0.1998976171016693,
      "learning_rate": 1.2596119929453263e-05,
      "loss": 0.0772,
      "step": 2106
    },
    {
      "epoch": 26.0,
      "eval_accuracy": 0.950354609929078,
      "eval_loss": 0.3350883424282074,
      "eval_runtime": 0.3435,
      "eval_samples_per_second": 820.852,
      "eval_steps_per_second": 52.395,
      "step": 2106
    },
    {
      "epoch": 27.0,
      "grad_norm": 0.19478876888751984,
      "learning_rate": 1.2310405643738979e-05,
      "loss": 0.076,
      "step": 2187
    },
    {
      "epoch": 27.0,
      "eval_accuracy": 0.9468085106382979,
      "eval_loss": 0.4145265519618988,
      "eval_runtime": 0.3445,
      "eval_samples_per_second": 818.483,
      "eval_steps_per_second": 52.244,
      "step": 2187
    },
    {
      "epoch": 28.0,
      "grad_norm": 0.27469512820243835,
      "learning_rate": 1.2024691358024691e-05,
      "loss": 0.0625,
      "step": 2268
    },
    {
      "epoch": 28.0,
      "eval_accuracy": 0.950354609929078,
      "eval_loss": 0.44451093673706055,
      "eval_runtime": 0.3439,
      "eval_samples_per_second": 819.913,
      "eval_steps_per_second": 52.335,
      "step": 2268
    },
    {
      "epoch": 29.0,
      "grad_norm": 26.14291000366211,
      "learning_rate": 1.1738977072310408e-05,
      "loss": 0.0741,
      "step": 2349
    },
    {
      "epoch": 29.0,
      "eval_accuracy": 0.9468085106382979,
      "eval_loss": 0.29801085591316223,
      "eval_runtime": 0.3448,
      "eval_samples_per_second": 817.812,
      "eval_steps_per_second": 52.201,
      "step": 2349
    },
    {
      "epoch": 30.0,
      "grad_norm": 0.0004499799106270075,
      "learning_rate": 1.145326278659612e-05,
      "loss": 0.0649,
      "step": 2430
    },
    {
      "epoch": 30.0,
      "eval_accuracy": 0.9574468085106383,
      "eval_loss": 0.28359255194664,
      "eval_runtime": 0.3442,
      "eval_samples_per_second": 819.247,
      "eval_steps_per_second": 52.292,
      "step": 2430
    },
    {
      "epoch": 31.0,
      "grad_norm": 0.0018564946949481964,
      "learning_rate": 1.1167548500881835e-05,
      "loss": 0.0688,
      "step": 2511
    },
    {
      "epoch": 31.0,
      "eval_accuracy": 0.9574468085106383,
      "eval_loss": 0.21793903410434723,
      "eval_runtime": 0.3445,
      "eval_samples_per_second": 818.498,
      "eval_steps_per_second": 52.245,
      "step": 2511
    },
    {
      "epoch": 32.0,
      "grad_norm": 0.0009469461510889232,
      "learning_rate": 1.088183421516755e-05,
      "loss": 0.0735,
      "step": 2592
    },
    {
      "epoch": 32.0,
      "eval_accuracy": 0.9539007092198581,
      "eval_loss": 0.22946923971176147,
      "eval_runtime": 0.3449,
      "eval_samples_per_second": 817.666,
      "eval_steps_per_second": 52.191,
      "step": 2592
    },
    {
      "epoch": 33.0,
      "grad_norm": 0.4778638184070587,
      "learning_rate": 1.0596119929453263e-05,
      "loss": 0.0648,
      "step": 2673
    },
    {
      "epoch": 33.0,
      "eval_accuracy": 0.9468085106382979,
      "eval_loss": 0.42410480976104736,
      "eval_runtime": 0.3433,
      "eval_samples_per_second": 821.406,
      "eval_steps_per_second": 52.43,
      "step": 2673
    },
    {
      "epoch": 34.0,
      "grad_norm": 0.21737487614154816,
      "learning_rate": 1.031040564373898e-05,
      "loss": 0.0672,
      "step": 2754
    },
    {
      "epoch": 34.0,
      "eval_accuracy": 0.9539007092198581,
      "eval_loss": 0.2829430401325226,
      "eval_runtime": 0.3447,
      "eval_samples_per_second": 818.124,
      "eval_steps_per_second": 52.221,
      "step": 2754
    },
    {
      "epoch": 35.0,
      "grad_norm": 0.08269879966974258,
      "learning_rate": 1.0024691358024692e-05,
      "loss": 0.067,
      "step": 2835
    },
    {
      "epoch": 35.0,
      "eval_accuracy": 0.9468085106382979,
      "eval_loss": 0.3723122179508209,
      "eval_runtime": 0.3448,
      "eval_samples_per_second": 817.778,
      "eval_steps_per_second": 52.199,
      "step": 2835
    },
    {
      "epoch": 36.0,
      "grad_norm": 0.3665499687194824,
      "learning_rate": 9.738977072310406e-06,
      "loss": 0.0768,
      "step": 2916
    },
    {
      "epoch": 36.0,
      "eval_accuracy": 0.9574468085106383,
      "eval_loss": 0.25441667437553406,
      "eval_runtime": 0.3447,
      "eval_samples_per_second": 818.182,
      "eval_steps_per_second": 52.224,
      "step": 2916
    },
    {
      "epoch": 37.0,
      "grad_norm": 0.11919476091861725,
      "learning_rate": 9.45326278659612e-06,
      "loss": 0.0691,
      "step": 2997
    },
    {
      "epoch": 37.0,
      "eval_accuracy": 0.9609929078014184,
      "eval_loss": 0.20481815934181213,
      "eval_runtime": 0.3445,
      "eval_samples_per_second": 818.558,
      "eval_steps_per_second": 52.248,
      "step": 2997
    },
    {
      "epoch": 38.0,
      "grad_norm": 0.0036801116075366735,
      "learning_rate": 9.167548500881835e-06,
      "loss": 0.0661,
      "step": 3078
    },
    {
      "epoch": 38.0,
      "eval_accuracy": 0.9680851063829787,
      "eval_loss": 0.20478524267673492,
      "eval_runtime": 0.3445,
      "eval_samples_per_second": 818.468,
      "eval_steps_per_second": 52.243,
      "step": 3078
    },
    {
      "epoch": 39.0,
      "grad_norm": 0.12663815915584564,
      "learning_rate": 8.88183421516755e-06,
      "loss": 0.0409,
      "step": 3159
    },
    {
      "epoch": 39.0,
      "eval_accuracy": 0.9645390070921985,
      "eval_loss": 0.18502239882946014,
      "eval_runtime": 0.3434,
      "eval_samples_per_second": 821.144,
      "eval_steps_per_second": 52.413,
      "step": 3159
    },
    {
      "epoch": 40.0,
      "grad_norm": 0.06950168311595917,
      "learning_rate": 8.596119929453264e-06,
      "loss": 0.0424,
      "step": 3240
    },
    {
      "epoch": 40.0,
      "eval_accuracy": 0.9645390070921985,
      "eval_loss": 0.20747074484825134,
      "eval_runtime": 0.3445,
      "eval_samples_per_second": 818.693,
      "eval_steps_per_second": 52.257,
      "step": 3240
    },
    {
      "epoch": 41.0,
      "grad_norm": 0.09251494705677032,
      "learning_rate": 8.310405643738978e-06,
      "loss": 0.0381,
      "step": 3321
    },
    {
      "epoch": 41.0,
      "eval_accuracy": 0.9645390070921985,
      "eval_loss": 0.2633875906467438,
      "eval_runtime": 0.3468,
      "eval_samples_per_second": 813.14,
      "eval_steps_per_second": 51.903,
      "step": 3321
    },
    {
      "epoch": 42.0,
      "grad_norm": 0.06917154043912888,
      "learning_rate": 8.024691358024692e-06,
      "loss": 0.0383,
      "step": 3402
    },
    {
      "epoch": 42.0,
      "eval_accuracy": 0.9574468085106383,
      "eval_loss": 0.3520617187023163,
      "eval_runtime": 0.3447,
      "eval_samples_per_second": 818.036,
      "eval_steps_per_second": 52.215,
      "step": 3402
    },
    {
      "epoch": 43.0,
      "grad_norm": 0.0010325413895770907,
      "learning_rate": 7.738977072310407e-06,
      "loss": 0.0288,
      "step": 3483
    },
    {
      "epoch": 43.0,
      "eval_accuracy": 0.9680851063829787,
      "eval_loss": 0.2726523280143738,
      "eval_runtime": 0.3428,
      "eval_samples_per_second": 822.588,
      "eval_steps_per_second": 52.506,
      "step": 3483
    },
    {
      "epoch": 44.0,
      "grad_norm": 0.04726780578494072,
      "learning_rate": 7.45326278659612e-06,
      "loss": 0.035,
      "step": 3564
    },
    {
      "epoch": 44.0,
      "eval_accuracy": 0.9645390070921985,
      "eval_loss": 0.2995310127735138,
      "eval_runtime": 0.3442,
      "eval_samples_per_second": 819.308,
      "eval_steps_per_second": 52.296,
      "step": 3564
    },
    {
      "epoch": 45.0,
      "grad_norm": 0.09283600747585297,
      "learning_rate": 7.167548500881835e-06,
      "loss": 0.0265,
      "step": 3645
    },
    {
      "epoch": 45.0,
      "eval_accuracy": 0.9609929078014184,
      "eval_loss": 0.33694958686828613,
      "eval_runtime": 0.3443,
      "eval_samples_per_second": 818.994,
      "eval_steps_per_second": 52.276,
      "step": 3645
    },
    {
      "epoch": 46.0,
      "grad_norm": 0.03685113787651062,
      "learning_rate": 6.881834215167549e-06,
      "loss": 0.0217,
      "step": 3726
    },
    {
      "epoch": 46.0,
      "eval_accuracy": 0.9609929078014184,
      "eval_loss": 0.35722091794013977,
      "eval_runtime": 0.3438,
      "eval_samples_per_second": 820.281,
      "eval_steps_per_second": 52.358,
      "step": 3726
    },
    {
      "epoch": 47.0,
      "grad_norm": 0.04708189144730568,
      "learning_rate": 6.596119929453263e-06,
      "loss": 0.0259,
      "step": 3807
    },
    {
      "epoch": 47.0,
      "eval_accuracy": 0.9716312056737588,
      "eval_loss": 0.21833930909633636,
      "eval_runtime": 0.3427,
      "eval_samples_per_second": 822.913,
      "eval_steps_per_second": 52.526,
      "step": 3807
    },
    {
      "epoch": 48.0,
      "grad_norm": 0.06329997628927231,
      "learning_rate": 6.310405643738977e-06,
      "loss": 0.0264,
      "step": 3888
    },
    {
      "epoch": 48.0,
      "eval_accuracy": 0.9609929078014184,
      "eval_loss": 0.2745024561882019,
      "eval_runtime": 0.3436,
      "eval_samples_per_second": 820.777,
      "eval_steps_per_second": 52.39,
      "step": 3888
    },
    {
      "epoch": 49.0,
      "grad_norm": 0.13020673394203186,
      "learning_rate": 6.024691358024692e-06,
      "loss": 0.027,
      "step": 3969
    },
    {
      "epoch": 49.0,
      "eval_accuracy": 0.9539007092198581,
      "eval_loss": 0.3425739109516144,
      "eval_runtime": 0.3449,
      "eval_samples_per_second": 817.548,
      "eval_steps_per_second": 52.184,
      "step": 3969
    },
    {
      "epoch": 50.0,
      "grad_norm": 0.04181819409132004,
      "learning_rate": 5.7389770723104065e-06,
      "loss": 0.023,
      "step": 4050
    },
    {
      "epoch": 50.0,
      "eval_accuracy": 0.950354609929078,
      "eval_loss": 0.37068530917167664,
      "eval_runtime": 0.3441,
      "eval_samples_per_second": 819.471,
      "eval_steps_per_second": 52.307,
      "step": 4050
    },
    {
      "epoch": 51.0,
      "grad_norm": 0.03754027560353279,
      "learning_rate": 5.453262786596121e-06,
      "loss": 0.0241,
      "step": 4131
    },
    {
      "epoch": 51.0,
      "eval_accuracy": 0.9645390070921985,
      "eval_loss": 0.3041815459728241,
      "eval_runtime": 0.3443,
      "eval_samples_per_second": 819.127,
      "eval_steps_per_second": 52.285,
      "step": 4131
    },
    {
      "epoch": 52.0,
      "grad_norm": 0.06724414229393005,
      "learning_rate": 5.167548500881835e-06,
      "loss": 0.0248,
      "step": 4212
    },
    {
      "epoch": 52.0,
      "eval_accuracy": 0.9609929078014184,
      "eval_loss": 0.3282240927219391,
      "eval_runtime": 0.3433,
      "eval_samples_per_second": 821.512,
      "eval_steps_per_second": 52.437,
      "step": 4212
    },
    {
      "epoch": 53.0,
      "grad_norm": 0.044111430644989014,
      "learning_rate": 4.881834215167549e-06,
      "loss": 0.0267,
      "step": 4293
    },
    {
      "epoch": 53.0,
      "eval_accuracy": 0.9680851063829787,
      "eval_loss": 0.2480100840330124,
      "eval_runtime": 0.3438,
      "eval_samples_per_second": 820.176,
      "eval_steps_per_second": 52.352,
      "step": 4293
    },
    {
      "epoch": 54.0,
      "grad_norm": 0.09385800361633301,
      "learning_rate": 4.596119929453263e-06,
      "loss": 0.019,
      "step": 4374
    },
    {
      "epoch": 54.0,
      "eval_accuracy": 0.9680851063829787,
      "eval_loss": 0.2954387366771698,
      "eval_runtime": 0.3444,
      "eval_samples_per_second": 818.748,
      "eval_steps_per_second": 52.261,
      "step": 4374
    },
    {
      "epoch": 55.0,
      "grad_norm": 0.00036285247188061476,
      "learning_rate": 4.3104056437389775e-06,
      "loss": 0.0233,
      "step": 4455
    },
    {
      "epoch": 55.0,
      "eval_accuracy": 0.9645390070921985,
      "eval_loss": 0.26300373673439026,
      "eval_runtime": 0.3483,
      "eval_samples_per_second": 809.563,
      "eval_steps_per_second": 51.674,
      "step": 4455
    },
    {
      "epoch": 56.0,
      "grad_norm": 0.03549063578248024,
      "learning_rate": 4.024691358024692e-06,
      "loss": 0.0231,
      "step": 4536
    },
    {
      "epoch": 56.0,
      "eval_accuracy": 0.9645390070921985,
      "eval_loss": 0.26614007353782654,
      "eval_runtime": 0.3434,
      "eval_samples_per_second": 821.294,
      "eval_steps_per_second": 52.423,
      "step": 4536
    },
    {
      "epoch": 57.0,
      "grad_norm": 0.0008688592351973057,
      "learning_rate": 3.7389770723104058e-06,
      "loss": 0.0188,
      "step": 4617
    },
    {
      "epoch": 57.0,
      "eval_accuracy": 0.9574468085106383,
      "eval_loss": 0.3676702678203583,
      "eval_runtime": 0.3441,
      "eval_samples_per_second": 819.514,
      "eval_steps_per_second": 52.309,
      "step": 4617
    },
    {
      "epoch": 58.0,
      "grad_norm": 0.00031407736241817474,
      "learning_rate": 3.4532627865961205e-06,
      "loss": 0.0263,
      "step": 4698
    },
    {
      "epoch": 58.0,
      "eval_accuracy": 0.9539007092198581,
      "eval_loss": 0.36925771832466125,
      "eval_runtime": 0.348,
      "eval_samples_per_second": 810.368,
      "eval_steps_per_second": 51.726,
      "step": 4698
    },
    {
      "epoch": 59.0,
      "grad_norm": 0.040128860622644424,
      "learning_rate": 3.1675485008818345e-06,
      "loss": 0.019,
      "step": 4779
    },
    {
      "epoch": 59.0,
      "eval_accuracy": 0.9574468085106383,
      "eval_loss": 0.35094693303108215,
      "eval_runtime": 0.3436,
      "eval_samples_per_second": 820.815,
      "eval_steps_per_second": 52.392,
      "step": 4779
    },
    {
      "epoch": 60.0,
      "grad_norm": 0.0004439246258698404,
      "learning_rate": 2.881834215167549e-06,
      "loss": 0.0202,
      "step": 4860
    },
    {
      "epoch": 60.0,
      "eval_accuracy": 0.9609929078014184,
      "eval_loss": 0.3040333092212677,
      "eval_runtime": 0.3445,
      "eval_samples_per_second": 818.559,
      "eval_steps_per_second": 52.248,
      "step": 4860
    },
    {
      "epoch": 61.0,
      "grad_norm": 0.07529360055923462,
      "learning_rate": 2.5961199294532628e-06,
      "loss": 0.0208,
      "step": 4941
    },
    {
      "epoch": 61.0,
      "eval_accuracy": 0.9468085106382979,
      "eval_loss": 0.5039365887641907,
      "eval_runtime": 0.3439,
      "eval_samples_per_second": 819.902,
      "eval_steps_per_second": 52.334,
      "step": 4941
    },
    {
      "epoch": 62.0,
      "grad_norm": 0.00026053638430312276,
      "learning_rate": 2.310405643738977e-06,
      "loss": 0.0242,
      "step": 5022
    },
    {
      "epoch": 62.0,
      "eval_accuracy": 0.950354609929078,
      "eval_loss": 0.4803861677646637,
      "eval_runtime": 0.3445,
      "eval_samples_per_second": 818.64,
      "eval_steps_per_second": 52.254,
      "step": 5022
    },
    {
      "epoch": 63.0,
      "grad_norm": 0.06742388755083084,
      "learning_rate": 2.0246913580246915e-06,
      "loss": 0.023,
      "step": 5103
    },
    {
      "epoch": 63.0,
      "eval_accuracy": 0.9609929078014184,
      "eval_loss": 0.3538144826889038,
      "eval_runtime": 0.3445,
      "eval_samples_per_second": 818.51,
      "eval_steps_per_second": 52.245,
      "step": 5103
    },
    {
      "epoch": 64.0,
      "grad_norm": 0.00042550539365038276,
      "learning_rate": 1.7389770723104056e-06,
      "loss": 0.0189,
      "step": 5184
    },
    {
      "epoch": 64.0,
      "eval_accuracy": 0.9574468085106383,
      "eval_loss": 0.37617096304893494,
      "eval_runtime": 0.3442,
      "eval_samples_per_second": 819.198,
      "eval_steps_per_second": 52.289,
      "step": 5184
    },
    {
      "epoch": 65.0,
      "grad_norm": 0.02407378889620304,
      "learning_rate": 1.45326278659612e-06,
      "loss": 0.0209,
      "step": 5265
    },
    {
      "epoch": 65.0,
      "eval_accuracy": 0.950354609929078,
      "eval_loss": 0.43608424067497253,
      "eval_runtime": 0.3438,
      "eval_samples_per_second": 820.243,
      "eval_steps_per_second": 52.356,
      "step": 5265
    },
    {
      "epoch": 66.0,
      "grad_norm": 0.054311446845531464,
      "learning_rate": 1.1675485008818344e-06,
      "loss": 0.0209,
      "step": 5346
    },
    {
      "epoch": 66.0,
      "eval_accuracy": 0.950354609929078,
      "eval_loss": 0.41794532537460327,
      "eval_runtime": 0.3436,
      "eval_samples_per_second": 820.791,
      "eval_steps_per_second": 52.391,
      "step": 5346
    },
    {
      "epoch": 67.0,
      "grad_norm": 0.04109662398695946,
      "learning_rate": 8.818342151675485e-07,
      "loss": 0.0198,
      "step": 5427
    },
    {
      "epoch": 67.0,
      "eval_accuracy": 0.9539007092198581,
      "eval_loss": 0.3815895617008209,
      "eval_runtime": 0.3443,
      "eval_samples_per_second": 819.013,
      "eval_steps_per_second": 52.277,
      "step": 5427
    },
    {
      "epoch": 68.0,
      "grad_norm": 0.13629287481307983,
      "learning_rate": 5.961199294532629e-07,
      "loss": 0.0197,
      "step": 5508
    },
    {
      "epoch": 68.0,
      "eval_accuracy": 0.950354609929078,
      "eval_loss": 0.39786896109580994,
      "eval_runtime": 0.3445,
      "eval_samples_per_second": 818.46,
      "eval_steps_per_second": 52.242,
      "step": 5508
    },
    {
      "epoch": 69.0,
      "grad_norm": 0.039983708411455154,
      "learning_rate": 3.104056437389771e-07,
      "loss": 0.0192,
      "step": 5589
    },
    {
      "epoch": 69.0,
      "eval_accuracy": 0.950354609929078,
      "eval_loss": 0.411296546459198,
      "eval_runtime": 0.3435,
      "eval_samples_per_second": 820.901,
      "eval_steps_per_second": 52.398,
      "step": 5589
    },
    {
      "epoch": 70.0,
      "grad_norm": 0.00027353325276635587,
      "learning_rate": 2.469135802469136e-08,
      "loss": 0.0177,
      "step": 5670
    },
    {
      "epoch": 70.0,
      "eval_accuracy": 0.9539007092198581,
      "eval_loss": 0.40772485733032227,
      "eval_runtime": 0.3437,
      "eval_samples_per_second": 820.466,
      "eval_steps_per_second": 52.37,
      "step": 5670
    }
  ],
  "logging_steps": 100,
  "max_steps": 5670,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 70,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9735501528974304.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}