{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.23143277929728592, "eval_steps": 10, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004207868714496108, "grad_norm": 3.2594902515411377, "learning_rate": 0.00039272727272727273, "loss": 0.6971, "step": 10 }, { "epoch": 0.004207868714496108, "eval_accuracy": 0.5914159417152405, "eval_loss": 0.7148427963256836, "eval_runtime": 585.2226, "eval_samples_per_second": 8.122, "eval_steps_per_second": 2.032, "step": 10 }, { "epoch": 0.008415737428992216, "grad_norm": 22.503503799438477, "learning_rate": 0.0003854545454545455, "loss": 0.7436, "step": 20 }, { "epoch": 0.008415737428992216, "eval_accuracy": 0.5914159417152405, "eval_loss": 1.42451012134552, "eval_runtime": 577.2378, "eval_samples_per_second": 8.234, "eval_steps_per_second": 2.06, "step": 20 }, { "epoch": 0.012623606143488323, "grad_norm": 1.2163587808609009, "learning_rate": 0.0003781818181818182, "loss": 0.9446, "step": 30 }, { "epoch": 0.012623606143488323, "eval_accuracy": 0.4085840582847595, "eval_loss": 0.7073472738265991, "eval_runtime": 581.0519, "eval_samples_per_second": 8.18, "eval_steps_per_second": 2.046, "step": 30 }, { "epoch": 0.016831474857984433, "grad_norm": 2.1413767337799072, "learning_rate": 0.0003709090909090909, "loss": 0.6945, "step": 40 }, { "epoch": 0.016831474857984433, "eval_accuracy": 0.5914159417152405, "eval_loss": 0.7796906232833862, "eval_runtime": 591.5892, "eval_samples_per_second": 8.034, "eval_steps_per_second": 2.01, "step": 40 }, { "epoch": 0.021039343572480537, "grad_norm": 0.8763795495033264, "learning_rate": 0.00036363636363636367, "loss": 0.653, "step": 50 }, { "epoch": 0.021039343572480537, "eval_accuracy": 0.5867872834205627, "eval_loss": 0.6816809773445129, "eval_runtime": 588.5616, "eval_samples_per_second": 8.076, "eval_steps_per_second": 2.02, "step": 50 }, { "epoch": 0.025247212286976645, "grad_norm": 0.9567063450813293, "learning_rate": 0.0003563636363636364, "loss": 0.6479, "step": 60 }, { "epoch": 0.025247212286976645, "eval_accuracy": 0.8102251291275024, "eval_loss": 0.5443565845489502, "eval_runtime": 581.8352, "eval_samples_per_second": 8.169, "eval_steps_per_second": 2.044, "step": 60 }, { "epoch": 0.029455081001472753, "grad_norm": 4.155121803283691, "learning_rate": 0.0003490909090909091, "loss": 0.502, "step": 70 }, { "epoch": 0.029455081001472753, "eval_accuracy": 0.5952030420303345, "eval_loss": 1.1268996000289917, "eval_runtime": 583.4269, "eval_samples_per_second": 8.147, "eval_steps_per_second": 2.038, "step": 70 }, { "epoch": 0.033662949715968865, "grad_norm": 3.364042282104492, "learning_rate": 0.0003418181818181818, "loss": 0.6052, "step": 80 }, { "epoch": 0.033662949715968865, "eval_accuracy": 0.8194824457168579, "eval_loss": 0.6350404024124146, "eval_runtime": 585.0166, "eval_samples_per_second": 8.125, "eval_steps_per_second": 2.032, "step": 80 }, { "epoch": 0.03787081843046497, "grad_norm": 1.5822795629501343, "learning_rate": 0.00033454545454545456, "loss": 0.3917, "step": 90 }, { "epoch": 0.03787081843046497, "eval_accuracy": 0.5914159417152405, "eval_loss": 1.0552942752838135, "eval_runtime": 579.8411, "eval_samples_per_second": 8.197, "eval_steps_per_second": 2.051, "step": 90 }, { "epoch": 0.042078687144961074, "grad_norm": 1.6902482509613037, "learning_rate": 0.0003272727272727273, "loss": 0.648, "step": 100 }, { "epoch": 0.042078687144961074, "eval_accuracy": 0.8116979002952576, "eval_loss": 0.7334651947021484, "eval_runtime": 577.8009, "eval_samples_per_second": 8.226, "eval_steps_per_second": 2.058, "step": 100 }, { "epoch": 0.046286555859457186, "grad_norm": 1.05351984500885, "learning_rate": 0.00032, "loss": 0.431, "step": 110 }, { "epoch": 0.046286555859457186, "eval_accuracy": 0.8480959534645081, "eval_loss": 0.39465391635894775, "eval_runtime": 576.7671, "eval_samples_per_second": 8.241, "eval_steps_per_second": 2.061, "step": 110 }, { "epoch": 0.05049442457395329, "grad_norm": 0.6553493738174438, "learning_rate": 0.00031272727272727273, "loss": 0.5428, "step": 120 }, { "epoch": 0.05049442457395329, "eval_accuracy": 0.8729223608970642, "eval_loss": 0.38810551166534424, "eval_runtime": 579.7258, "eval_samples_per_second": 8.199, "eval_steps_per_second": 2.051, "step": 120 }, { "epoch": 0.0547022932884494, "grad_norm": 0.42037296295166016, "learning_rate": 0.0003054545454545455, "loss": 0.34, "step": 130 }, { "epoch": 0.0547022932884494, "eval_accuracy": 0.8840731978416443, "eval_loss": 0.35628741979599, "eval_runtime": 582.4004, "eval_samples_per_second": 8.161, "eval_steps_per_second": 2.042, "step": 130 }, { "epoch": 0.05891016200294551, "grad_norm": 0.47820377349853516, "learning_rate": 0.0002981818181818182, "loss": 0.398, "step": 140 }, { "epoch": 0.05891016200294551, "eval_accuracy": 0.8842836022377014, "eval_loss": 0.3024275004863739, "eval_runtime": 581.4736, "eval_samples_per_second": 8.174, "eval_steps_per_second": 2.045, "step": 140 }, { "epoch": 0.06311803071744161, "grad_norm": 0.9904082417488098, "learning_rate": 0.0002909090909090909, "loss": 0.1834, "step": 150 }, { "epoch": 0.06311803071744161, "eval_accuracy": 0.87250155210495, "eval_loss": 0.38005706667900085, "eval_runtime": 580.8975, "eval_samples_per_second": 8.182, "eval_steps_per_second": 2.047, "step": 150 }, { "epoch": 0.06732589943193773, "grad_norm": 1.6720079183578491, "learning_rate": 0.0002836363636363637, "loss": 0.5052, "step": 160 }, { "epoch": 0.06732589943193773, "eval_accuracy": 0.9114243388175964, "eval_loss": 0.23850664496421814, "eval_runtime": 581.0949, "eval_samples_per_second": 8.179, "eval_steps_per_second": 2.046, "step": 160 }, { "epoch": 0.07153376814643383, "grad_norm": 1.5653446912765503, "learning_rate": 0.0002763636363636364, "loss": 0.3953, "step": 170 }, { "epoch": 0.07153376814643383, "eval_accuracy": 0.8544077277183533, "eval_loss": 0.41211748123168945, "eval_runtime": 581.5472, "eval_samples_per_second": 8.173, "eval_steps_per_second": 2.045, "step": 170 }, { "epoch": 0.07574163686092994, "grad_norm": 1.7199907302856445, "learning_rate": 0.0002690909090909091, "loss": 0.407, "step": 180 }, { "epoch": 0.07574163686092994, "eval_accuracy": 0.9044813513755798, "eval_loss": 0.26600247621536255, "eval_runtime": 580.7861, "eval_samples_per_second": 8.184, "eval_steps_per_second": 2.047, "step": 180 }, { "epoch": 0.07994950557542604, "grad_norm": 2.303934335708618, "learning_rate": 0.00026181818181818185, "loss": 0.3571, "step": 190 }, { "epoch": 0.07994950557542604, "eval_accuracy": 0.8844940066337585, "eval_loss": 0.3394128680229187, "eval_runtime": 584.97, "eval_samples_per_second": 8.125, "eval_steps_per_second": 2.033, "step": 190 }, { "epoch": 0.08415737428992215, "grad_norm": 2.5085370540618896, "learning_rate": 0.00025454545454545456, "loss": 0.2747, "step": 200 }, { "epoch": 0.08415737428992215, "eval_accuracy": 0.9147906303405762, "eval_loss": 0.2246033400297165, "eval_runtime": 585.8187, "eval_samples_per_second": 8.113, "eval_steps_per_second": 2.03, "step": 200 }, { "epoch": 0.08836524300441827, "grad_norm": 4.109118461608887, "learning_rate": 0.00024727272727272727, "loss": 0.2863, "step": 210 }, { "epoch": 0.08836524300441827, "eval_accuracy": 0.9244687557220459, "eval_loss": 0.2438182830810547, "eval_runtime": 582.1202, "eval_samples_per_second": 8.165, "eval_steps_per_second": 2.043, "step": 210 }, { "epoch": 0.09257311171891437, "grad_norm": 0.5757103562355042, "learning_rate": 0.00024, "loss": 0.2334, "step": 220 }, { "epoch": 0.09257311171891437, "eval_accuracy": 0.922154426574707, "eval_loss": 0.21005088090896606, "eval_runtime": 579.1924, "eval_samples_per_second": 8.206, "eval_steps_per_second": 2.053, "step": 220 }, { "epoch": 0.09678098043341048, "grad_norm": 0.36710911989212036, "learning_rate": 0.00023272727272727271, "loss": 0.1744, "step": 230 }, { "epoch": 0.09678098043341048, "eval_accuracy": 0.8529350161552429, "eval_loss": 0.41234469413757324, "eval_runtime": 589.1406, "eval_samples_per_second": 8.068, "eval_steps_per_second": 2.018, "step": 230 }, { "epoch": 0.10098884914790658, "grad_norm": 3.2360424995422363, "learning_rate": 0.00022545454545454545, "loss": 0.1948, "step": 240 }, { "epoch": 0.10098884914790658, "eval_accuracy": 0.9253103137016296, "eval_loss": 0.22991585731506348, "eval_runtime": 587.1353, "eval_samples_per_second": 8.095, "eval_steps_per_second": 2.025, "step": 240 }, { "epoch": 0.1051967178624027, "grad_norm": 0.21486328542232513, "learning_rate": 0.00021818181818181818, "loss": 0.2382, "step": 250 }, { "epoch": 0.1051967178624027, "eval_accuracy": 0.9322533011436462, "eval_loss": 0.27035772800445557, "eval_runtime": 587.222, "eval_samples_per_second": 8.094, "eval_steps_per_second": 2.025, "step": 250 }, { "epoch": 0.1094045865768988, "grad_norm": 1.8511269092559814, "learning_rate": 0.0002109090909090909, "loss": 0.219, "step": 260 }, { "epoch": 0.1094045865768988, "eval_accuracy": 0.9137386679649353, "eval_loss": 0.3539877235889435, "eval_runtime": 585.6282, "eval_samples_per_second": 8.116, "eval_steps_per_second": 2.03, "step": 260 }, { "epoch": 0.11361245529139491, "grad_norm": 0.11713656038045883, "learning_rate": 0.00020363636363636363, "loss": 0.1122, "step": 270 }, { "epoch": 0.11361245529139491, "eval_accuracy": 0.9339364767074585, "eval_loss": 0.2783205509185791, "eval_runtime": 585.5155, "eval_samples_per_second": 8.118, "eval_steps_per_second": 2.031, "step": 270 }, { "epoch": 0.11782032400589101, "grad_norm": 2.4492409229278564, "learning_rate": 0.00019636363636363636, "loss": 0.1902, "step": 280 }, { "epoch": 0.11782032400589101, "eval_accuracy": 0.9322533011436462, "eval_loss": 0.28133705258369446, "eval_runtime": 587.357, "eval_samples_per_second": 8.092, "eval_steps_per_second": 2.024, "step": 280 }, { "epoch": 0.12202819272038712, "grad_norm": 0.09383056312799454, "learning_rate": 0.0001890909090909091, "loss": 0.1279, "step": 290 }, { "epoch": 0.12202819272038712, "eval_accuracy": 0.9297285676002502, "eval_loss": 0.22569426894187927, "eval_runtime": 586.2579, "eval_samples_per_second": 8.107, "eval_steps_per_second": 2.028, "step": 290 }, { "epoch": 0.12623606143488322, "grad_norm": 1.5099377632141113, "learning_rate": 0.00018181818181818183, "loss": 0.168, "step": 300 }, { "epoch": 0.12623606143488322, "eval_accuracy": 0.9347780346870422, "eval_loss": 0.2831152081489563, "eval_runtime": 587.962, "eval_samples_per_second": 8.084, "eval_steps_per_second": 2.022, "step": 300 }, { "epoch": 0.13044393014937933, "grad_norm": 16.816967010498047, "learning_rate": 0.00017454545454545454, "loss": 0.1351, "step": 310 }, { "epoch": 0.13044393014937933, "eval_accuracy": 0.9293078184127808, "eval_loss": 0.3322593569755554, "eval_runtime": 586.8235, "eval_samples_per_second": 8.1, "eval_steps_per_second": 2.026, "step": 310 }, { "epoch": 0.13465179886387546, "grad_norm": 0.6043083667755127, "learning_rate": 0.00016727272727272728, "loss": 0.0422, "step": 320 }, { "epoch": 0.13465179886387546, "eval_accuracy": 0.9427729845046997, "eval_loss": 0.28099876642227173, "eval_runtime": 587.4493, "eval_samples_per_second": 8.091, "eval_steps_per_second": 2.024, "step": 320 }, { "epoch": 0.13885966757837157, "grad_norm": 16.680456161499023, "learning_rate": 0.00016, "loss": 0.1904, "step": 330 }, { "epoch": 0.13885966757837157, "eval_accuracy": 0.9349884390830994, "eval_loss": 0.3474605977535248, "eval_runtime": 579.304, "eval_samples_per_second": 8.205, "eval_steps_per_second": 2.052, "step": 330 }, { "epoch": 0.14306753629286767, "grad_norm": 0.6815859079360962, "learning_rate": 0.00015272727272727275, "loss": 0.0864, "step": 340 }, { "epoch": 0.14306753629286767, "eval_accuracy": 0.9438249468803406, "eval_loss": 0.3012893497943878, "eval_runtime": 588.316, "eval_samples_per_second": 8.079, "eval_steps_per_second": 2.021, "step": 340 }, { "epoch": 0.14727540500736377, "grad_norm": 0.01422311831265688, "learning_rate": 0.00014545454545454546, "loss": 0.0198, "step": 350 }, { "epoch": 0.14727540500736377, "eval_accuracy": 0.9335156679153442, "eval_loss": 0.38236290216445923, "eval_runtime": 589.516, "eval_samples_per_second": 8.063, "eval_steps_per_second": 2.017, "step": 350 }, { "epoch": 0.15148327372185988, "grad_norm": 2.280247211456299, "learning_rate": 0.0001381818181818182, "loss": 0.2155, "step": 360 }, { "epoch": 0.15148327372185988, "eval_accuracy": 0.9463496804237366, "eval_loss": 0.3106628656387329, "eval_runtime": 581.9576, "eval_samples_per_second": 8.167, "eval_steps_per_second": 2.043, "step": 360 }, { "epoch": 0.15569114243635598, "grad_norm": 15.617796897888184, "learning_rate": 0.00013090909090909093, "loss": 0.2275, "step": 370 }, { "epoch": 0.15569114243635598, "eval_accuracy": 0.9450873136520386, "eval_loss": 0.2654193341732025, "eval_runtime": 582.2813, "eval_samples_per_second": 8.163, "eval_steps_per_second": 2.042, "step": 370 }, { "epoch": 0.1598990111508521, "grad_norm": 9.974563598632812, "learning_rate": 0.00012363636363636364, "loss": 0.1118, "step": 380 }, { "epoch": 0.1598990111508521, "eval_accuracy": 0.9421418309211731, "eval_loss": 0.2898730933666229, "eval_runtime": 580.2758, "eval_samples_per_second": 8.191, "eval_steps_per_second": 2.049, "step": 380 }, { "epoch": 0.1641068798653482, "grad_norm": 1.2296732664108276, "learning_rate": 0.00011636363636363636, "loss": 0.0258, "step": 390 }, { "epoch": 0.1641068798653482, "eval_accuracy": 0.9570797681808472, "eval_loss": 0.18523547053337097, "eval_runtime": 580.5297, "eval_samples_per_second": 8.187, "eval_steps_per_second": 2.048, "step": 390 }, { "epoch": 0.1683147485798443, "grad_norm": 1.1257351636886597, "learning_rate": 0.00010909090909090909, "loss": 0.0816, "step": 400 }, { "epoch": 0.1683147485798443, "eval_accuracy": 0.9535030722618103, "eval_loss": 0.18315376341342926, "eval_runtime": 582.012, "eval_samples_per_second": 8.166, "eval_steps_per_second": 2.043, "step": 400 }, { "epoch": 0.17252261729434043, "grad_norm": 0.1417863965034485, "learning_rate": 0.00010181818181818181, "loss": 0.1385, "step": 410 }, { "epoch": 0.17252261729434043, "eval_accuracy": 0.9606564044952393, "eval_loss": 0.1722693145275116, "eval_runtime": 583.7175, "eval_samples_per_second": 8.143, "eval_steps_per_second": 2.037, "step": 410 }, { "epoch": 0.17673048600883653, "grad_norm": 0.05042952299118042, "learning_rate": 9.454545454545455e-05, "loss": 0.1194, "step": 420 }, { "epoch": 0.17673048600883653, "eval_accuracy": 0.9391962885856628, "eval_loss": 0.26152685284614563, "eval_runtime": 583.6437, "eval_samples_per_second": 8.144, "eval_steps_per_second": 2.037, "step": 420 }, { "epoch": 0.18093835472333264, "grad_norm": 0.06302843242883682, "learning_rate": 8.727272727272727e-05, "loss": 0.2722, "step": 430 }, { "epoch": 0.18093835472333264, "eval_accuracy": 0.9667578339576721, "eval_loss": 0.1336488574743271, "eval_runtime": 585.4585, "eval_samples_per_second": 8.118, "eval_steps_per_second": 2.031, "step": 430 }, { "epoch": 0.18514622343782874, "grad_norm": 1.160020112991333, "learning_rate": 8e-05, "loss": 0.1969, "step": 440 }, { "epoch": 0.18514622343782874, "eval_accuracy": 0.9520303010940552, "eval_loss": 0.1606164574623108, "eval_runtime": 582.6331, "eval_samples_per_second": 8.158, "eval_steps_per_second": 2.041, "step": 440 }, { "epoch": 0.18935409215232485, "grad_norm": 0.8594697713851929, "learning_rate": 7.272727272727273e-05, "loss": 0.109, "step": 450 }, { "epoch": 0.18935409215232485, "eval_accuracy": 0.9610772132873535, "eval_loss": 0.13081230223178864, "eval_runtime": 598.3887, "eval_samples_per_second": 7.943, "eval_steps_per_second": 1.987, "step": 450 }, { "epoch": 0.19356196086682095, "grad_norm": 0.1255054622888565, "learning_rate": 6.545454545454546e-05, "loss": 0.1662, "step": 460 }, { "epoch": 0.19356196086682095, "eval_accuracy": 0.9657058715820312, "eval_loss": 0.1277003139257431, "eval_runtime": 596.8696, "eval_samples_per_second": 7.963, "eval_steps_per_second": 1.992, "step": 460 }, { "epoch": 0.19776982958131706, "grad_norm": 0.09831862151622772, "learning_rate": 5.818181818181818e-05, "loss": 0.0393, "step": 470 }, { "epoch": 0.19776982958131706, "eval_accuracy": 0.964022696018219, "eval_loss": 0.12812593579292297, "eval_runtime": 594.9871, "eval_samples_per_second": 7.988, "eval_steps_per_second": 1.998, "step": 470 }, { "epoch": 0.20197769829581316, "grad_norm": 0.08306915313005447, "learning_rate": 5.090909090909091e-05, "loss": 0.1268, "step": 480 }, { "epoch": 0.20197769829581316, "eval_accuracy": 0.9644435048103333, "eval_loss": 0.13266168534755707, "eval_runtime": 586.2759, "eval_samples_per_second": 8.107, "eval_steps_per_second": 2.028, "step": 480 }, { "epoch": 0.20618556701030927, "grad_norm": 0.08147989958524704, "learning_rate": 4.3636363636363636e-05, "loss": 0.0548, "step": 490 }, { "epoch": 0.20618556701030927, "eval_accuracy": 0.9646539092063904, "eval_loss": 0.14851805567741394, "eval_runtime": 591.656, "eval_samples_per_second": 8.033, "eval_steps_per_second": 2.01, "step": 490 }, { "epoch": 0.2103934357248054, "grad_norm": 0.6941895484924316, "learning_rate": 3.6363636363636364e-05, "loss": 0.0484, "step": 500 }, { "epoch": 0.2103934357248054, "eval_accuracy": 0.9629707336425781, "eval_loss": 0.16351090371608734, "eval_runtime": 589.9697, "eval_samples_per_second": 8.056, "eval_steps_per_second": 2.015, "step": 500 }, { "epoch": 0.2146013044393015, "grad_norm": 0.04926018416881561, "learning_rate": 2.909090909090909e-05, "loss": 0.022, "step": 510 }, { "epoch": 0.2146013044393015, "eval_accuracy": 0.964022696018219, "eval_loss": 0.15824884176254272, "eval_runtime": 597.3025, "eval_samples_per_second": 7.957, "eval_steps_per_second": 1.991, "step": 510 }, { "epoch": 0.2188091731537976, "grad_norm": 0.043105900287628174, "learning_rate": 2.1818181818181818e-05, "loss": 0.1426, "step": 520 }, { "epoch": 0.2188091731537976, "eval_accuracy": 0.9642331004142761, "eval_loss": 0.15840177237987518, "eval_runtime": 596.1264, "eval_samples_per_second": 7.973, "eval_steps_per_second": 1.995, "step": 520 }, { "epoch": 0.22301704186829371, "grad_norm": 0.04364515841007233, "learning_rate": 1.4545454545454545e-05, "loss": 0.0611, "step": 530 }, { "epoch": 0.22301704186829371, "eval_accuracy": 0.9636019468307495, "eval_loss": 0.1682334691286087, "eval_runtime": 590.7102, "eval_samples_per_second": 8.046, "eval_steps_per_second": 2.013, "step": 530 }, { "epoch": 0.22722491058278982, "grad_norm": 0.9690969586372375, "learning_rate": 7.272727272727272e-06, "loss": 0.0668, "step": 540 }, { "epoch": 0.22722491058278982, "eval_accuracy": 0.9633915424346924, "eval_loss": 0.16814225912094116, "eval_runtime": 593.6369, "eval_samples_per_second": 8.007, "eval_steps_per_second": 2.003, "step": 540 }, { "epoch": 0.23143277929728592, "grad_norm": 0.053498703986406326, "learning_rate": 0.0, "loss": 0.0471, "step": 550 }, { "epoch": 0.23143277929728592, "eval_accuracy": 0.964022696018219, "eval_loss": 0.16578398644924164, "eval_runtime": 583.1222, "eval_samples_per_second": 8.151, "eval_steps_per_second": 2.039, "step": 550 } ], "logging_steps": 10, "max_steps": 550, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.208452205931052e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }