|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.23143277929728592, |
|
"eval_steps": 10, |
|
"global_step": 550, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004207868714496108, |
|
"grad_norm": 3.2594902515411377, |
|
"learning_rate": 0.00039272727272727273, |
|
"loss": 0.6971, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.004207868714496108, |
|
"eval_accuracy": 0.5914159417152405, |
|
"eval_loss": 0.7148427963256836, |
|
"eval_runtime": 585.2226, |
|
"eval_samples_per_second": 8.122, |
|
"eval_steps_per_second": 2.032, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.008415737428992216, |
|
"grad_norm": 22.503503799438477, |
|
"learning_rate": 0.0003854545454545455, |
|
"loss": 0.7436, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.008415737428992216, |
|
"eval_accuracy": 0.5914159417152405, |
|
"eval_loss": 1.42451012134552, |
|
"eval_runtime": 577.2378, |
|
"eval_samples_per_second": 8.234, |
|
"eval_steps_per_second": 2.06, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.012623606143488323, |
|
"grad_norm": 1.2163587808609009, |
|
"learning_rate": 0.0003781818181818182, |
|
"loss": 0.9446, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.012623606143488323, |
|
"eval_accuracy": 0.4085840582847595, |
|
"eval_loss": 0.7073472738265991, |
|
"eval_runtime": 581.0519, |
|
"eval_samples_per_second": 8.18, |
|
"eval_steps_per_second": 2.046, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.016831474857984433, |
|
"grad_norm": 2.1413767337799072, |
|
"learning_rate": 0.0003709090909090909, |
|
"loss": 0.6945, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.016831474857984433, |
|
"eval_accuracy": 0.5914159417152405, |
|
"eval_loss": 0.7796906232833862, |
|
"eval_runtime": 591.5892, |
|
"eval_samples_per_second": 8.034, |
|
"eval_steps_per_second": 2.01, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.021039343572480537, |
|
"grad_norm": 0.8763795495033264, |
|
"learning_rate": 0.00036363636363636367, |
|
"loss": 0.653, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.021039343572480537, |
|
"eval_accuracy": 0.5867872834205627, |
|
"eval_loss": 0.6816809773445129, |
|
"eval_runtime": 588.5616, |
|
"eval_samples_per_second": 8.076, |
|
"eval_steps_per_second": 2.02, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.025247212286976645, |
|
"grad_norm": 0.9567063450813293, |
|
"learning_rate": 0.0003563636363636364, |
|
"loss": 0.6479, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.025247212286976645, |
|
"eval_accuracy": 0.8102251291275024, |
|
"eval_loss": 0.5443565845489502, |
|
"eval_runtime": 581.8352, |
|
"eval_samples_per_second": 8.169, |
|
"eval_steps_per_second": 2.044, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.029455081001472753, |
|
"grad_norm": 4.155121803283691, |
|
"learning_rate": 0.0003490909090909091, |
|
"loss": 0.502, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.029455081001472753, |
|
"eval_accuracy": 0.5952030420303345, |
|
"eval_loss": 1.1268996000289917, |
|
"eval_runtime": 583.4269, |
|
"eval_samples_per_second": 8.147, |
|
"eval_steps_per_second": 2.038, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.033662949715968865, |
|
"grad_norm": 3.364042282104492, |
|
"learning_rate": 0.0003418181818181818, |
|
"loss": 0.6052, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.033662949715968865, |
|
"eval_accuracy": 0.8194824457168579, |
|
"eval_loss": 0.6350404024124146, |
|
"eval_runtime": 585.0166, |
|
"eval_samples_per_second": 8.125, |
|
"eval_steps_per_second": 2.032, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.03787081843046497, |
|
"grad_norm": 1.5822795629501343, |
|
"learning_rate": 0.00033454545454545456, |
|
"loss": 0.3917, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.03787081843046497, |
|
"eval_accuracy": 0.5914159417152405, |
|
"eval_loss": 1.0552942752838135, |
|
"eval_runtime": 579.8411, |
|
"eval_samples_per_second": 8.197, |
|
"eval_steps_per_second": 2.051, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.042078687144961074, |
|
"grad_norm": 1.6902482509613037, |
|
"learning_rate": 0.0003272727272727273, |
|
"loss": 0.648, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.042078687144961074, |
|
"eval_accuracy": 0.8116979002952576, |
|
"eval_loss": 0.7334651947021484, |
|
"eval_runtime": 577.8009, |
|
"eval_samples_per_second": 8.226, |
|
"eval_steps_per_second": 2.058, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.046286555859457186, |
|
"grad_norm": 1.05351984500885, |
|
"learning_rate": 0.00032, |
|
"loss": 0.431, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.046286555859457186, |
|
"eval_accuracy": 0.8480959534645081, |
|
"eval_loss": 0.39465391635894775, |
|
"eval_runtime": 576.7671, |
|
"eval_samples_per_second": 8.241, |
|
"eval_steps_per_second": 2.061, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.05049442457395329, |
|
"grad_norm": 0.6553493738174438, |
|
"learning_rate": 0.00031272727272727273, |
|
"loss": 0.5428, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.05049442457395329, |
|
"eval_accuracy": 0.8729223608970642, |
|
"eval_loss": 0.38810551166534424, |
|
"eval_runtime": 579.7258, |
|
"eval_samples_per_second": 8.199, |
|
"eval_steps_per_second": 2.051, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0547022932884494, |
|
"grad_norm": 0.42037296295166016, |
|
"learning_rate": 0.0003054545454545455, |
|
"loss": 0.34, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.0547022932884494, |
|
"eval_accuracy": 0.8840731978416443, |
|
"eval_loss": 0.35628741979599, |
|
"eval_runtime": 582.4004, |
|
"eval_samples_per_second": 8.161, |
|
"eval_steps_per_second": 2.042, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.05891016200294551, |
|
"grad_norm": 0.47820377349853516, |
|
"learning_rate": 0.0002981818181818182, |
|
"loss": 0.398, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.05891016200294551, |
|
"eval_accuracy": 0.8842836022377014, |
|
"eval_loss": 0.3024275004863739, |
|
"eval_runtime": 581.4736, |
|
"eval_samples_per_second": 8.174, |
|
"eval_steps_per_second": 2.045, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.06311803071744161, |
|
"grad_norm": 0.9904082417488098, |
|
"learning_rate": 0.0002909090909090909, |
|
"loss": 0.1834, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.06311803071744161, |
|
"eval_accuracy": 0.87250155210495, |
|
"eval_loss": 0.38005706667900085, |
|
"eval_runtime": 580.8975, |
|
"eval_samples_per_second": 8.182, |
|
"eval_steps_per_second": 2.047, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.06732589943193773, |
|
"grad_norm": 1.6720079183578491, |
|
"learning_rate": 0.0002836363636363637, |
|
"loss": 0.5052, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.06732589943193773, |
|
"eval_accuracy": 0.9114243388175964, |
|
"eval_loss": 0.23850664496421814, |
|
"eval_runtime": 581.0949, |
|
"eval_samples_per_second": 8.179, |
|
"eval_steps_per_second": 2.046, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.07153376814643383, |
|
"grad_norm": 1.5653446912765503, |
|
"learning_rate": 0.0002763636363636364, |
|
"loss": 0.3953, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.07153376814643383, |
|
"eval_accuracy": 0.8544077277183533, |
|
"eval_loss": 0.41211748123168945, |
|
"eval_runtime": 581.5472, |
|
"eval_samples_per_second": 8.173, |
|
"eval_steps_per_second": 2.045, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.07574163686092994, |
|
"grad_norm": 1.7199907302856445, |
|
"learning_rate": 0.0002690909090909091, |
|
"loss": 0.407, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.07574163686092994, |
|
"eval_accuracy": 0.9044813513755798, |
|
"eval_loss": 0.26600247621536255, |
|
"eval_runtime": 580.7861, |
|
"eval_samples_per_second": 8.184, |
|
"eval_steps_per_second": 2.047, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.07994950557542604, |
|
"grad_norm": 2.303934335708618, |
|
"learning_rate": 0.00026181818181818185, |
|
"loss": 0.3571, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.07994950557542604, |
|
"eval_accuracy": 0.8844940066337585, |
|
"eval_loss": 0.3394128680229187, |
|
"eval_runtime": 584.97, |
|
"eval_samples_per_second": 8.125, |
|
"eval_steps_per_second": 2.033, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.08415737428992215, |
|
"grad_norm": 2.5085370540618896, |
|
"learning_rate": 0.00025454545454545456, |
|
"loss": 0.2747, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08415737428992215, |
|
"eval_accuracy": 0.9147906303405762, |
|
"eval_loss": 0.2246033400297165, |
|
"eval_runtime": 585.8187, |
|
"eval_samples_per_second": 8.113, |
|
"eval_steps_per_second": 2.03, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08836524300441827, |
|
"grad_norm": 4.109118461608887, |
|
"learning_rate": 0.00024727272727272727, |
|
"loss": 0.2863, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.08836524300441827, |
|
"eval_accuracy": 0.9244687557220459, |
|
"eval_loss": 0.2438182830810547, |
|
"eval_runtime": 582.1202, |
|
"eval_samples_per_second": 8.165, |
|
"eval_steps_per_second": 2.043, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.09257311171891437, |
|
"grad_norm": 0.5757103562355042, |
|
"learning_rate": 0.00024, |
|
"loss": 0.2334, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.09257311171891437, |
|
"eval_accuracy": 0.922154426574707, |
|
"eval_loss": 0.21005088090896606, |
|
"eval_runtime": 579.1924, |
|
"eval_samples_per_second": 8.206, |
|
"eval_steps_per_second": 2.053, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.09678098043341048, |
|
"grad_norm": 0.36710911989212036, |
|
"learning_rate": 0.00023272727272727271, |
|
"loss": 0.1744, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.09678098043341048, |
|
"eval_accuracy": 0.8529350161552429, |
|
"eval_loss": 0.41234469413757324, |
|
"eval_runtime": 589.1406, |
|
"eval_samples_per_second": 8.068, |
|
"eval_steps_per_second": 2.018, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.10098884914790658, |
|
"grad_norm": 3.2360424995422363, |
|
"learning_rate": 0.00022545454545454545, |
|
"loss": 0.1948, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.10098884914790658, |
|
"eval_accuracy": 0.9253103137016296, |
|
"eval_loss": 0.22991585731506348, |
|
"eval_runtime": 587.1353, |
|
"eval_samples_per_second": 8.095, |
|
"eval_steps_per_second": 2.025, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.1051967178624027, |
|
"grad_norm": 0.21486328542232513, |
|
"learning_rate": 0.00021818181818181818, |
|
"loss": 0.2382, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1051967178624027, |
|
"eval_accuracy": 0.9322533011436462, |
|
"eval_loss": 0.27035772800445557, |
|
"eval_runtime": 587.222, |
|
"eval_samples_per_second": 8.094, |
|
"eval_steps_per_second": 2.025, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1094045865768988, |
|
"grad_norm": 1.8511269092559814, |
|
"learning_rate": 0.0002109090909090909, |
|
"loss": 0.219, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.1094045865768988, |
|
"eval_accuracy": 0.9137386679649353, |
|
"eval_loss": 0.3539877235889435, |
|
"eval_runtime": 585.6282, |
|
"eval_samples_per_second": 8.116, |
|
"eval_steps_per_second": 2.03, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.11361245529139491, |
|
"grad_norm": 0.11713656038045883, |
|
"learning_rate": 0.00020363636363636363, |
|
"loss": 0.1122, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.11361245529139491, |
|
"eval_accuracy": 0.9339364767074585, |
|
"eval_loss": 0.2783205509185791, |
|
"eval_runtime": 585.5155, |
|
"eval_samples_per_second": 8.118, |
|
"eval_steps_per_second": 2.031, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.11782032400589101, |
|
"grad_norm": 2.4492409229278564, |
|
"learning_rate": 0.00019636363636363636, |
|
"loss": 0.1902, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.11782032400589101, |
|
"eval_accuracy": 0.9322533011436462, |
|
"eval_loss": 0.28133705258369446, |
|
"eval_runtime": 587.357, |
|
"eval_samples_per_second": 8.092, |
|
"eval_steps_per_second": 2.024, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.12202819272038712, |
|
"grad_norm": 0.09383056312799454, |
|
"learning_rate": 0.0001890909090909091, |
|
"loss": 0.1279, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.12202819272038712, |
|
"eval_accuracy": 0.9297285676002502, |
|
"eval_loss": 0.22569426894187927, |
|
"eval_runtime": 586.2579, |
|
"eval_samples_per_second": 8.107, |
|
"eval_steps_per_second": 2.028, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.12623606143488322, |
|
"grad_norm": 1.5099377632141113, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 0.168, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.12623606143488322, |
|
"eval_accuracy": 0.9347780346870422, |
|
"eval_loss": 0.2831152081489563, |
|
"eval_runtime": 587.962, |
|
"eval_samples_per_second": 8.084, |
|
"eval_steps_per_second": 2.022, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.13044393014937933, |
|
"grad_norm": 16.816967010498047, |
|
"learning_rate": 0.00017454545454545454, |
|
"loss": 0.1351, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.13044393014937933, |
|
"eval_accuracy": 0.9293078184127808, |
|
"eval_loss": 0.3322593569755554, |
|
"eval_runtime": 586.8235, |
|
"eval_samples_per_second": 8.1, |
|
"eval_steps_per_second": 2.026, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.13465179886387546, |
|
"grad_norm": 0.6043083667755127, |
|
"learning_rate": 0.00016727272727272728, |
|
"loss": 0.0422, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.13465179886387546, |
|
"eval_accuracy": 0.9427729845046997, |
|
"eval_loss": 0.28099876642227173, |
|
"eval_runtime": 587.4493, |
|
"eval_samples_per_second": 8.091, |
|
"eval_steps_per_second": 2.024, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.13885966757837157, |
|
"grad_norm": 16.680456161499023, |
|
"learning_rate": 0.00016, |
|
"loss": 0.1904, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.13885966757837157, |
|
"eval_accuracy": 0.9349884390830994, |
|
"eval_loss": 0.3474605977535248, |
|
"eval_runtime": 579.304, |
|
"eval_samples_per_second": 8.205, |
|
"eval_steps_per_second": 2.052, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.14306753629286767, |
|
"grad_norm": 0.6815859079360962, |
|
"learning_rate": 0.00015272727272727275, |
|
"loss": 0.0864, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.14306753629286767, |
|
"eval_accuracy": 0.9438249468803406, |
|
"eval_loss": 0.3012893497943878, |
|
"eval_runtime": 588.316, |
|
"eval_samples_per_second": 8.079, |
|
"eval_steps_per_second": 2.021, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.14727540500736377, |
|
"grad_norm": 0.01422311831265688, |
|
"learning_rate": 0.00014545454545454546, |
|
"loss": 0.0198, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.14727540500736377, |
|
"eval_accuracy": 0.9335156679153442, |
|
"eval_loss": 0.38236290216445923, |
|
"eval_runtime": 589.516, |
|
"eval_samples_per_second": 8.063, |
|
"eval_steps_per_second": 2.017, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.15148327372185988, |
|
"grad_norm": 2.280247211456299, |
|
"learning_rate": 0.0001381818181818182, |
|
"loss": 0.2155, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.15148327372185988, |
|
"eval_accuracy": 0.9463496804237366, |
|
"eval_loss": 0.3106628656387329, |
|
"eval_runtime": 581.9576, |
|
"eval_samples_per_second": 8.167, |
|
"eval_steps_per_second": 2.043, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.15569114243635598, |
|
"grad_norm": 15.617796897888184, |
|
"learning_rate": 0.00013090909090909093, |
|
"loss": 0.2275, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.15569114243635598, |
|
"eval_accuracy": 0.9450873136520386, |
|
"eval_loss": 0.2654193341732025, |
|
"eval_runtime": 582.2813, |
|
"eval_samples_per_second": 8.163, |
|
"eval_steps_per_second": 2.042, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.1598990111508521, |
|
"grad_norm": 9.974563598632812, |
|
"learning_rate": 0.00012363636363636364, |
|
"loss": 0.1118, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.1598990111508521, |
|
"eval_accuracy": 0.9421418309211731, |
|
"eval_loss": 0.2898730933666229, |
|
"eval_runtime": 580.2758, |
|
"eval_samples_per_second": 8.191, |
|
"eval_steps_per_second": 2.049, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.1641068798653482, |
|
"grad_norm": 1.2296732664108276, |
|
"learning_rate": 0.00011636363636363636, |
|
"loss": 0.0258, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.1641068798653482, |
|
"eval_accuracy": 0.9570797681808472, |
|
"eval_loss": 0.18523547053337097, |
|
"eval_runtime": 580.5297, |
|
"eval_samples_per_second": 8.187, |
|
"eval_steps_per_second": 2.048, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.1683147485798443, |
|
"grad_norm": 1.1257351636886597, |
|
"learning_rate": 0.00010909090909090909, |
|
"loss": 0.0816, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1683147485798443, |
|
"eval_accuracy": 0.9535030722618103, |
|
"eval_loss": 0.18315376341342926, |
|
"eval_runtime": 582.012, |
|
"eval_samples_per_second": 8.166, |
|
"eval_steps_per_second": 2.043, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.17252261729434043, |
|
"grad_norm": 0.1417863965034485, |
|
"learning_rate": 0.00010181818181818181, |
|
"loss": 0.1385, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.17252261729434043, |
|
"eval_accuracy": 0.9606564044952393, |
|
"eval_loss": 0.1722693145275116, |
|
"eval_runtime": 583.7175, |
|
"eval_samples_per_second": 8.143, |
|
"eval_steps_per_second": 2.037, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.17673048600883653, |
|
"grad_norm": 0.05042952299118042, |
|
"learning_rate": 9.454545454545455e-05, |
|
"loss": 0.1194, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.17673048600883653, |
|
"eval_accuracy": 0.9391962885856628, |
|
"eval_loss": 0.26152685284614563, |
|
"eval_runtime": 583.6437, |
|
"eval_samples_per_second": 8.144, |
|
"eval_steps_per_second": 2.037, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.18093835472333264, |
|
"grad_norm": 0.06302843242883682, |
|
"learning_rate": 8.727272727272727e-05, |
|
"loss": 0.2722, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.18093835472333264, |
|
"eval_accuracy": 0.9667578339576721, |
|
"eval_loss": 0.1336488574743271, |
|
"eval_runtime": 585.4585, |
|
"eval_samples_per_second": 8.118, |
|
"eval_steps_per_second": 2.031, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.18514622343782874, |
|
"grad_norm": 1.160020112991333, |
|
"learning_rate": 8e-05, |
|
"loss": 0.1969, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.18514622343782874, |
|
"eval_accuracy": 0.9520303010940552, |
|
"eval_loss": 0.1606164574623108, |
|
"eval_runtime": 582.6331, |
|
"eval_samples_per_second": 8.158, |
|
"eval_steps_per_second": 2.041, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.18935409215232485, |
|
"grad_norm": 0.8594697713851929, |
|
"learning_rate": 7.272727272727273e-05, |
|
"loss": 0.109, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.18935409215232485, |
|
"eval_accuracy": 0.9610772132873535, |
|
"eval_loss": 0.13081230223178864, |
|
"eval_runtime": 598.3887, |
|
"eval_samples_per_second": 7.943, |
|
"eval_steps_per_second": 1.987, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.19356196086682095, |
|
"grad_norm": 0.1255054622888565, |
|
"learning_rate": 6.545454545454546e-05, |
|
"loss": 0.1662, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.19356196086682095, |
|
"eval_accuracy": 0.9657058715820312, |
|
"eval_loss": 0.1277003139257431, |
|
"eval_runtime": 596.8696, |
|
"eval_samples_per_second": 7.963, |
|
"eval_steps_per_second": 1.992, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.19776982958131706, |
|
"grad_norm": 0.09831862151622772, |
|
"learning_rate": 5.818181818181818e-05, |
|
"loss": 0.0393, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.19776982958131706, |
|
"eval_accuracy": 0.964022696018219, |
|
"eval_loss": 0.12812593579292297, |
|
"eval_runtime": 594.9871, |
|
"eval_samples_per_second": 7.988, |
|
"eval_steps_per_second": 1.998, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.20197769829581316, |
|
"grad_norm": 0.08306915313005447, |
|
"learning_rate": 5.090909090909091e-05, |
|
"loss": 0.1268, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.20197769829581316, |
|
"eval_accuracy": 0.9644435048103333, |
|
"eval_loss": 0.13266168534755707, |
|
"eval_runtime": 586.2759, |
|
"eval_samples_per_second": 8.107, |
|
"eval_steps_per_second": 2.028, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.20618556701030927, |
|
"grad_norm": 0.08147989958524704, |
|
"learning_rate": 4.3636363636363636e-05, |
|
"loss": 0.0548, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.20618556701030927, |
|
"eval_accuracy": 0.9646539092063904, |
|
"eval_loss": 0.14851805567741394, |
|
"eval_runtime": 591.656, |
|
"eval_samples_per_second": 8.033, |
|
"eval_steps_per_second": 2.01, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.2103934357248054, |
|
"grad_norm": 0.6941895484924316, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 0.0484, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2103934357248054, |
|
"eval_accuracy": 0.9629707336425781, |
|
"eval_loss": 0.16351090371608734, |
|
"eval_runtime": 589.9697, |
|
"eval_samples_per_second": 8.056, |
|
"eval_steps_per_second": 2.015, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2146013044393015, |
|
"grad_norm": 0.04926018416881561, |
|
"learning_rate": 2.909090909090909e-05, |
|
"loss": 0.022, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.2146013044393015, |
|
"eval_accuracy": 0.964022696018219, |
|
"eval_loss": 0.15824884176254272, |
|
"eval_runtime": 597.3025, |
|
"eval_samples_per_second": 7.957, |
|
"eval_steps_per_second": 1.991, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.2188091731537976, |
|
"grad_norm": 0.043105900287628174, |
|
"learning_rate": 2.1818181818181818e-05, |
|
"loss": 0.1426, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.2188091731537976, |
|
"eval_accuracy": 0.9642331004142761, |
|
"eval_loss": 0.15840177237987518, |
|
"eval_runtime": 596.1264, |
|
"eval_samples_per_second": 7.973, |
|
"eval_steps_per_second": 1.995, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.22301704186829371, |
|
"grad_norm": 0.04364515841007233, |
|
"learning_rate": 1.4545454545454545e-05, |
|
"loss": 0.0611, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.22301704186829371, |
|
"eval_accuracy": 0.9636019468307495, |
|
"eval_loss": 0.1682334691286087, |
|
"eval_runtime": 590.7102, |
|
"eval_samples_per_second": 8.046, |
|
"eval_steps_per_second": 2.013, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.22722491058278982, |
|
"grad_norm": 0.9690969586372375, |
|
"learning_rate": 7.272727272727272e-06, |
|
"loss": 0.0668, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.22722491058278982, |
|
"eval_accuracy": 0.9633915424346924, |
|
"eval_loss": 0.16814225912094116, |
|
"eval_runtime": 593.6369, |
|
"eval_samples_per_second": 8.007, |
|
"eval_steps_per_second": 2.003, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.23143277929728592, |
|
"grad_norm": 0.053498703986406326, |
|
"learning_rate": 0.0, |
|
"loss": 0.0471, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.23143277929728592, |
|
"eval_accuracy": 0.964022696018219, |
|
"eval_loss": 0.16578398644924164, |
|
"eval_runtime": 583.1222, |
|
"eval_samples_per_second": 8.151, |
|
"eval_steps_per_second": 2.039, |
|
"step": 550 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 550, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.208452205931052e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|