{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.23143277929728592, "eval_steps": 10, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004207868714496108, "grad_norm": 3.7386245727539062, "learning_rate": 0.00019636363636363636, "loss": 0.6798, "step": 10 }, { "epoch": 0.004207868714496108, "eval_accuracy": 0.6183463335037231, "eval_loss": 0.6594762802124023, "eval_runtime": 597.7321, "eval_samples_per_second": 7.952, "eval_steps_per_second": 1.989, "step": 10 }, { "epoch": 0.008415737428992216, "grad_norm": 23.72800636291504, "learning_rate": 0.00019272727272727274, "loss": 0.734, "step": 20 }, { "epoch": 0.008415737428992216, "eval_accuracy": 0.6175047159194946, "eval_loss": 0.6504878401756287, "eval_runtime": 581.0465, "eval_samples_per_second": 8.18, "eval_steps_per_second": 2.046, "step": 20 }, { "epoch": 0.012623606143488323, "grad_norm": 9.25478458404541, "learning_rate": 0.0001890909090909091, "loss": 0.6702, "step": 30 }, { "epoch": 0.012623606143488323, "eval_accuracy": 0.6919839978218079, "eval_loss": 0.5777503252029419, "eval_runtime": 578.0228, "eval_samples_per_second": 8.223, "eval_steps_per_second": 2.057, "step": 30 }, { "epoch": 0.016831474857984433, "grad_norm": 7.304983615875244, "learning_rate": 0.00018545454545454545, "loss": 0.4556, "step": 40 }, { "epoch": 0.016831474857984433, "eval_accuracy": 0.8396801948547363, "eval_loss": 0.40889662504196167, "eval_runtime": 571.8895, "eval_samples_per_second": 8.311, "eval_steps_per_second": 2.079, "step": 40 }, { "epoch": 0.021039343572480537, "grad_norm": 7.751391887664795, "learning_rate": 0.00018181818181818183, "loss": 0.569, "step": 50 }, { "epoch": 0.021039343572480537, "eval_accuracy": 0.7944456338882446, "eval_loss": 0.4470515251159668, "eval_runtime": 573.7399, "eval_samples_per_second": 8.284, "eval_steps_per_second": 2.072, "step": 50 }, { "epoch": 0.025247212286976645, "grad_norm": 1.6625334024429321, "learning_rate": 0.0001781818181818182, "loss": 0.4117, "step": 60 }, { "epoch": 0.025247212286976645, "eval_accuracy": 0.8788133859634399, "eval_loss": 0.2980070412158966, "eval_runtime": 568.2097, "eval_samples_per_second": 8.365, "eval_steps_per_second": 2.093, "step": 60 }, { "epoch": 0.029455081001472753, "grad_norm": 3.0882933139801025, "learning_rate": 0.00017454545454545454, "loss": 0.4465, "step": 70 }, { "epoch": 0.029455081001472753, "eval_accuracy": 0.8990111351013184, "eval_loss": 0.24757108092308044, "eval_runtime": 561.4024, "eval_samples_per_second": 8.466, "eval_steps_per_second": 2.118, "step": 70 }, { "epoch": 0.033662949715968865, "grad_norm": 1.2464115619659424, "learning_rate": 0.0001709090909090909, "loss": 0.4248, "step": 80 }, { "epoch": 0.033662949715968865, "eval_accuracy": 0.9202609062194824, "eval_loss": 0.2152455598115921, "eval_runtime": 561.7534, "eval_samples_per_second": 8.461, "eval_steps_per_second": 2.117, "step": 80 }, { "epoch": 0.03787081843046497, "grad_norm": 3.7030742168426514, "learning_rate": 0.00016727272727272728, "loss": 0.2108, "step": 90 }, { "epoch": 0.03787081843046497, "eval_accuracy": 0.9377235174179077, "eval_loss": 0.18127086758613586, "eval_runtime": 560.5533, "eval_samples_per_second": 8.479, "eval_steps_per_second": 2.121, "step": 90 }, { "epoch": 0.042078687144961074, "grad_norm": 3.4107439517974854, "learning_rate": 0.00016363636363636366, "loss": 0.3354, "step": 100 }, { "epoch": 0.042078687144961074, "eval_accuracy": 0.9215232729911804, "eval_loss": 0.19136299192905426, "eval_runtime": 554.2596, "eval_samples_per_second": 8.575, "eval_steps_per_second": 2.145, "step": 100 }, { "epoch": 0.046286555859457186, "grad_norm": 9.938206672668457, "learning_rate": 0.00016, "loss": 0.2655, "step": 110 }, { "epoch": 0.046286555859457186, "eval_accuracy": 0.9354091882705688, "eval_loss": 0.1784314066171646, "eval_runtime": 556.5406, "eval_samples_per_second": 8.54, "eval_steps_per_second": 2.136, "step": 110 }, { "epoch": 0.05049442457395329, "grad_norm": 7.809841156005859, "learning_rate": 0.00015636363636363637, "loss": 0.3885, "step": 120 }, { "epoch": 0.05049442457395329, "eval_accuracy": 0.8979591727256775, "eval_loss": 0.2238394021987915, "eval_runtime": 558.4451, "eval_samples_per_second": 8.511, "eval_steps_per_second": 2.129, "step": 120 }, { "epoch": 0.0547022932884494, "grad_norm": 0.18238992989063263, "learning_rate": 0.00015272727272727275, "loss": 0.1268, "step": 130 }, { "epoch": 0.0547022932884494, "eval_accuracy": 0.965285062789917, "eval_loss": 0.08994818478822708, "eval_runtime": 564.2804, "eval_samples_per_second": 8.423, "eval_steps_per_second": 2.107, "step": 130 }, { "epoch": 0.05891016200294551, "grad_norm": 21.266611099243164, "learning_rate": 0.0001490909090909091, "loss": 0.222, "step": 140 }, { "epoch": 0.05891016200294551, "eval_accuracy": 0.9591836929321289, "eval_loss": 0.2074023336172104, "eval_runtime": 563.4722, "eval_samples_per_second": 8.435, "eval_steps_per_second": 2.11, "step": 140 }, { "epoch": 0.06311803071744161, "grad_norm": 11.509130477905273, "learning_rate": 0.00014545454545454546, "loss": 0.4098, "step": 150 }, { "epoch": 0.06311803071744161, "eval_accuracy": 0.8276877999305725, "eval_loss": 0.7424523234367371, "eval_runtime": 560.1213, "eval_samples_per_second": 8.486, "eval_steps_per_second": 2.123, "step": 150 }, { "epoch": 0.06732589943193773, "grad_norm": 7.520476818084717, "learning_rate": 0.00014181818181818184, "loss": 0.4331, "step": 160 }, { "epoch": 0.06732589943193773, "eval_accuracy": 0.9326741099357605, "eval_loss": 0.2065199315547943, "eval_runtime": 562.1855, "eval_samples_per_second": 8.455, "eval_steps_per_second": 2.115, "step": 160 }, { "epoch": 0.07153376814643383, "grad_norm": 12.3318510055542, "learning_rate": 0.0001381818181818182, "loss": 0.6336, "step": 170 }, { "epoch": 0.07153376814643383, "eval_accuracy": 0.859036386013031, "eval_loss": 0.29903730750083923, "eval_runtime": 564.8753, "eval_samples_per_second": 8.414, "eval_steps_per_second": 2.105, "step": 170 }, { "epoch": 0.07574163686092994, "grad_norm": 7.766119956970215, "learning_rate": 0.00013454545454545455, "loss": 0.2881, "step": 180 }, { "epoch": 0.07574163686092994, "eval_accuracy": 0.948032796382904, "eval_loss": 0.11360938847064972, "eval_runtime": 565.4404, "eval_samples_per_second": 8.406, "eval_steps_per_second": 2.103, "step": 180 }, { "epoch": 0.07994950557542604, "grad_norm": 6.830145359039307, "learning_rate": 0.00013090909090909093, "loss": 0.277, "step": 190 }, { "epoch": 0.07994950557542604, "eval_accuracy": 0.9400378465652466, "eval_loss": 0.209593266248703, "eval_runtime": 562.6503, "eval_samples_per_second": 8.448, "eval_steps_per_second": 2.113, "step": 190 }, { "epoch": 0.08415737428992215, "grad_norm": 4.35469913482666, "learning_rate": 0.00012727272727272728, "loss": 0.1586, "step": 200 }, { "epoch": 0.08415737428992215, "eval_accuracy": 0.9337260723114014, "eval_loss": 0.23578013479709625, "eval_runtime": 562.1129, "eval_samples_per_second": 8.456, "eval_steps_per_second": 2.115, "step": 200 }, { "epoch": 0.08836524300441827, "grad_norm": 2.565586566925049, "learning_rate": 0.00012363636363636364, "loss": 0.1664, "step": 210 }, { "epoch": 0.08836524300441827, "eval_accuracy": 0.967809796333313, "eval_loss": 0.11096801608800888, "eval_runtime": 556.8442, "eval_samples_per_second": 8.536, "eval_steps_per_second": 2.135, "step": 210 }, { "epoch": 0.09257311171891437, "grad_norm": 12.48508358001709, "learning_rate": 0.00012, "loss": 0.3419, "step": 220 }, { "epoch": 0.09257311171891437, "eval_accuracy": 0.9421418309211731, "eval_loss": 0.14782342314720154, "eval_runtime": 560.1835, "eval_samples_per_second": 8.485, "eval_steps_per_second": 2.123, "step": 220 }, { "epoch": 0.09678098043341048, "grad_norm": 10.233806610107422, "learning_rate": 0.00011636363636363636, "loss": 0.187, "step": 230 }, { "epoch": 0.09678098043341048, "eval_accuracy": 0.9579213261604309, "eval_loss": 0.11080725491046906, "eval_runtime": 563.1087, "eval_samples_per_second": 8.441, "eval_steps_per_second": 2.111, "step": 230 }, { "epoch": 0.10098884914790658, "grad_norm": 0.16775061190128326, "learning_rate": 0.00011272727272727272, "loss": 0.1834, "step": 240 }, { "epoch": 0.10098884914790658, "eval_accuracy": 0.8727119565010071, "eval_loss": 0.44505125284194946, "eval_runtime": 564.6564, "eval_samples_per_second": 8.418, "eval_steps_per_second": 2.106, "step": 240 }, { "epoch": 0.1051967178624027, "grad_norm": 5.313834190368652, "learning_rate": 0.00010909090909090909, "loss": 0.2383, "step": 250 }, { "epoch": 0.1051967178624027, "eval_accuracy": 0.9726488590240479, "eval_loss": 0.07795428484678268, "eval_runtime": 565.2854, "eval_samples_per_second": 8.408, "eval_steps_per_second": 2.103, "step": 250 }, { "epoch": 0.1094045865768988, "grad_norm": 0.07698975503444672, "learning_rate": 0.00010545454545454545, "loss": 0.2157, "step": 260 }, { "epoch": 0.1094045865768988, "eval_accuracy": 0.9758047461509705, "eval_loss": 0.0759287104010582, "eval_runtime": 564.058, "eval_samples_per_second": 8.426, "eval_steps_per_second": 2.108, "step": 260 }, { "epoch": 0.11361245529139491, "grad_norm": 0.13711199164390564, "learning_rate": 0.00010181818181818181, "loss": 0.2665, "step": 270 }, { "epoch": 0.11361245529139491, "eval_accuracy": 0.9787502884864807, "eval_loss": 0.06089087575674057, "eval_runtime": 563.7962, "eval_samples_per_second": 8.43, "eval_steps_per_second": 2.109, "step": 270 }, { "epoch": 0.11782032400589101, "grad_norm": 0.17997244000434875, "learning_rate": 9.818181818181818e-05, "loss": 0.1226, "step": 280 }, { "epoch": 0.11782032400589101, "eval_accuracy": 0.9808542132377625, "eval_loss": 0.052085429430007935, "eval_runtime": 564.5871, "eval_samples_per_second": 8.419, "eval_steps_per_second": 2.106, "step": 280 }, { "epoch": 0.12202819272038712, "grad_norm": 0.1597593128681183, "learning_rate": 9.454545454545455e-05, "loss": 0.1287, "step": 290 }, { "epoch": 0.12202819272038712, "eval_accuracy": 0.9558174014091492, "eval_loss": 0.09927653521299362, "eval_runtime": 563.2882, "eval_samples_per_second": 8.438, "eval_steps_per_second": 2.111, "step": 290 }, { "epoch": 0.12623606143488322, "grad_norm": 14.694509506225586, "learning_rate": 9.090909090909092e-05, "loss": 0.1088, "step": 300 }, { "epoch": 0.12623606143488322, "eval_accuracy": 0.9309909343719482, "eval_loss": 0.19473901391029358, "eval_runtime": 565.488, "eval_samples_per_second": 8.405, "eval_steps_per_second": 2.103, "step": 300 }, { "epoch": 0.13044393014937933, "grad_norm": 4.671857833862305, "learning_rate": 8.727272727272727e-05, "loss": 0.1778, "step": 310 }, { "epoch": 0.13044393014937933, "eval_accuracy": 0.9814853668212891, "eval_loss": 0.057908281683921814, "eval_runtime": 564.4128, "eval_samples_per_second": 8.421, "eval_steps_per_second": 2.107, "step": 310 }, { "epoch": 0.13465179886387546, "grad_norm": 1.1494044065475464, "learning_rate": 8.363636363636364e-05, "loss": 0.1686, "step": 320 }, { "epoch": 0.13465179886387546, "eval_accuracy": 0.9798022508621216, "eval_loss": 0.06721032410860062, "eval_runtime": 563.963, "eval_samples_per_second": 8.428, "eval_steps_per_second": 2.108, "step": 320 }, { "epoch": 0.13885966757837157, "grad_norm": 20.857423782348633, "learning_rate": 8e-05, "loss": 0.19, "step": 330 }, { "epoch": 0.13885966757837157, "eval_accuracy": 0.9793814420700073, "eval_loss": 0.07669834792613983, "eval_runtime": 558.0764, "eval_samples_per_second": 8.517, "eval_steps_per_second": 2.131, "step": 330 }, { "epoch": 0.14306753629286767, "grad_norm": 0.02689620479941368, "learning_rate": 7.636363636363637e-05, "loss": 0.0889, "step": 340 }, { "epoch": 0.14306753629286767, "eval_accuracy": 0.9776982665061951, "eval_loss": 0.0889260396361351, "eval_runtime": 561.1157, "eval_samples_per_second": 8.471, "eval_steps_per_second": 2.119, "step": 340 }, { "epoch": 0.14727540500736377, "grad_norm": 0.01651788130402565, "learning_rate": 7.272727272727273e-05, "loss": 0.0046, "step": 350 }, { "epoch": 0.14727540500736377, "eval_accuracy": 0.9774879217147827, "eval_loss": 0.0977482944726944, "eval_runtime": 557.4467, "eval_samples_per_second": 8.526, "eval_steps_per_second": 2.133, "step": 350 }, { "epoch": 0.15148327372185988, "grad_norm": 0.49462929368019104, "learning_rate": 6.90909090909091e-05, "loss": 0.2327, "step": 360 }, { "epoch": 0.15148327372185988, "eval_accuracy": 0.9833789467811584, "eval_loss": 0.05215293914079666, "eval_runtime": 559.0767, "eval_samples_per_second": 8.502, "eval_steps_per_second": 2.127, "step": 360 }, { "epoch": 0.15569114243635598, "grad_norm": 1.9083374738693237, "learning_rate": 6.545454545454546e-05, "loss": 0.021, "step": 370 }, { "epoch": 0.15569114243635598, "eval_accuracy": 0.9629707336425781, "eval_loss": 0.11694369465112686, "eval_runtime": 559.3448, "eval_samples_per_second": 8.497, "eval_steps_per_second": 2.126, "step": 370 }, { "epoch": 0.1598990111508521, "grad_norm": 23.373567581176758, "learning_rate": 6.181818181818182e-05, "loss": 0.2001, "step": 380 }, { "epoch": 0.1598990111508521, "eval_accuracy": 0.9829581379890442, "eval_loss": 0.060037508606910706, "eval_runtime": 560.337, "eval_samples_per_second": 8.482, "eval_steps_per_second": 2.122, "step": 380 }, { "epoch": 0.1641068798653482, "grad_norm": 0.10927582532167435, "learning_rate": 5.818181818181818e-05, "loss": 0.0711, "step": 390 }, { "epoch": 0.1641068798653482, "eval_accuracy": 0.9859036207199097, "eval_loss": 0.04903951659798622, "eval_runtime": 555.1166, "eval_samples_per_second": 8.562, "eval_steps_per_second": 2.142, "step": 390 }, { "epoch": 0.1683147485798443, "grad_norm": 1.898078203201294, "learning_rate": 5.4545454545454546e-05, "loss": 0.0983, "step": 400 }, { "epoch": 0.1683147485798443, "eval_accuracy": 0.9831685423851013, "eval_loss": 0.045510243624448776, "eval_runtime": 558.1218, "eval_samples_per_second": 8.516, "eval_steps_per_second": 2.13, "step": 400 }, { "epoch": 0.17252261729434043, "grad_norm": 0.12654219567775726, "learning_rate": 5.090909090909091e-05, "loss": 0.0548, "step": 410 }, { "epoch": 0.17252261729434043, "eval_accuracy": 0.9888491630554199, "eval_loss": 0.04250233247876167, "eval_runtime": 558.6685, "eval_samples_per_second": 8.508, "eval_steps_per_second": 2.128, "step": 410 }, { "epoch": 0.17673048600883653, "grad_norm": 2.7145519256591797, "learning_rate": 4.7272727272727275e-05, "loss": 0.0505, "step": 420 }, { "epoch": 0.17673048600883653, "eval_accuracy": 0.9880075454711914, "eval_loss": 0.04300825670361519, "eval_runtime": 559.6635, "eval_samples_per_second": 8.493, "eval_steps_per_second": 2.124, "step": 420 }, { "epoch": 0.18093835472333264, "grad_norm": 0.08013765513896942, "learning_rate": 4.3636363636363636e-05, "loss": 0.2609, "step": 430 }, { "epoch": 0.18093835472333264, "eval_accuracy": 0.9899011254310608, "eval_loss": 0.03251320868730545, "eval_runtime": 560.0268, "eval_samples_per_second": 8.487, "eval_steps_per_second": 2.123, "step": 430 }, { "epoch": 0.18514622343782874, "grad_norm": 0.1592099815607071, "learning_rate": 4e-05, "loss": 0.058, "step": 440 }, { "epoch": 0.18514622343782874, "eval_accuracy": 0.9783294796943665, "eval_loss": 0.0525953434407711, "eval_runtime": 553.5191, "eval_samples_per_second": 8.587, "eval_steps_per_second": 2.148, "step": 440 }, { "epoch": 0.18935409215232485, "grad_norm": 0.064529187977314, "learning_rate": 3.6363636363636364e-05, "loss": 0.1082, "step": 450 }, { "epoch": 0.18935409215232485, "eval_accuracy": 0.9888491630554199, "eval_loss": 0.03384377807378769, "eval_runtime": 557.5032, "eval_samples_per_second": 8.526, "eval_steps_per_second": 2.133, "step": 450 }, { "epoch": 0.19356196086682095, "grad_norm": 0.35138818621635437, "learning_rate": 3.272727272727273e-05, "loss": 0.03, "step": 460 }, { "epoch": 0.19356196086682095, "eval_accuracy": 0.9924258589744568, "eval_loss": 0.029149439185857773, "eval_runtime": 559.8975, "eval_samples_per_second": 8.489, "eval_steps_per_second": 2.124, "step": 460 }, { "epoch": 0.19776982958131706, "grad_norm": 0.020687425509095192, "learning_rate": 2.909090909090909e-05, "loss": 0.0702, "step": 470 }, { "epoch": 0.19776982958131706, "eval_accuracy": 0.9930570125579834, "eval_loss": 0.027508797124028206, "eval_runtime": 558.5715, "eval_samples_per_second": 8.509, "eval_steps_per_second": 2.129, "step": 470 }, { "epoch": 0.20197769829581316, "grad_norm": 0.08153369277715683, "learning_rate": 2.5454545454545454e-05, "loss": 0.0412, "step": 480 }, { "epoch": 0.20197769829581316, "eval_accuracy": 0.9924258589744568, "eval_loss": 0.02858472615480423, "eval_runtime": 557.0232, "eval_samples_per_second": 8.533, "eval_steps_per_second": 2.135, "step": 480 }, { "epoch": 0.20618556701030927, "grad_norm": 0.027780650183558464, "learning_rate": 2.1818181818181818e-05, "loss": 0.0056, "step": 490 }, { "epoch": 0.20618556701030927, "eval_accuracy": 0.9922154545783997, "eval_loss": 0.028572723269462585, "eval_runtime": 557.9364, "eval_samples_per_second": 8.519, "eval_steps_per_second": 2.131, "step": 490 }, { "epoch": 0.2103934357248054, "grad_norm": 0.013943745754659176, "learning_rate": 1.8181818181818182e-05, "loss": 0.0202, "step": 500 }, { "epoch": 0.2103934357248054, "eval_accuracy": 0.9934778213500977, "eval_loss": 0.02832806296646595, "eval_runtime": 590.2498, "eval_samples_per_second": 8.053, "eval_steps_per_second": 2.014, "step": 500 }, { "epoch": 0.2146013044393015, "grad_norm": 0.06914212554693222, "learning_rate": 1.4545454545454545e-05, "loss": 0.0032, "step": 510 }, { "epoch": 0.2146013044393015, "eval_accuracy": 0.9932674169540405, "eval_loss": 0.02869958057999611, "eval_runtime": 574.9638, "eval_samples_per_second": 8.267, "eval_steps_per_second": 2.068, "step": 510 }, { "epoch": 0.2188091731537976, "grad_norm": 0.014089811593294144, "learning_rate": 1.0909090909090909e-05, "loss": 0.0932, "step": 520 }, { "epoch": 0.2188091731537976, "eval_accuracy": 0.9924258589744568, "eval_loss": 0.029511602595448494, "eval_runtime": 574.493, "eval_samples_per_second": 8.273, "eval_steps_per_second": 2.07, "step": 520 }, { "epoch": 0.22301704186829371, "grad_norm": 0.006211976986378431, "learning_rate": 7.272727272727272e-06, "loss": 0.0597, "step": 530 }, { "epoch": 0.22301704186829371, "eval_accuracy": 0.9926362037658691, "eval_loss": 0.029444053769111633, "eval_runtime": 566.9552, "eval_samples_per_second": 8.383, "eval_steps_per_second": 2.097, "step": 530 }, { "epoch": 0.22722491058278982, "grad_norm": 0.030956320464611053, "learning_rate": 3.636363636363636e-06, "loss": 0.0069, "step": 540 }, { "epoch": 0.22722491058278982, "eval_accuracy": 0.9926362037658691, "eval_loss": 0.02989383228123188, "eval_runtime": 560.5938, "eval_samples_per_second": 8.479, "eval_steps_per_second": 2.121, "step": 540 }, { "epoch": 0.23143277929728592, "grad_norm": 0.10614718496799469, "learning_rate": 0.0, "loss": 0.024, "step": 550 }, { "epoch": 0.23143277929728592, "eval_accuracy": 0.9924258589744568, "eval_loss": 0.030369577929377556, "eval_runtime": 551.1375, "eval_samples_per_second": 8.624, "eval_steps_per_second": 2.157, "step": 550 } ], "logging_steps": 10, "max_steps": 550, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.208452205931052e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }