{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.23143277929728592, "eval_steps": 10, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004207868714496108, "grad_norm": 3.2926599979400635, "learning_rate": 9.818181818181818e-05, "loss": 0.6744, "step": 10 }, { "epoch": 0.004207868714496108, "eval_accuracy": 0.6254996657371521, "eval_loss": 0.6731761693954468, "eval_runtime": 574.0292, "eval_samples_per_second": 8.28, "eval_steps_per_second": 2.071, "step": 10 }, { "epoch": 0.008415737428992216, "grad_norm": 3.0475454330444336, "learning_rate": 9.636363636363637e-05, "loss": 0.6978, "step": 20 }, { "epoch": 0.008415737428992216, "eval_accuracy": 0.6509572863578796, "eval_loss": 0.6372496485710144, "eval_runtime": 563.1409, "eval_samples_per_second": 8.44, "eval_steps_per_second": 2.111, "step": 20 }, { "epoch": 0.012623606143488323, "grad_norm": 13.133508682250977, "learning_rate": 9.454545454545455e-05, "loss": 0.6593, "step": 30 }, { "epoch": 0.012623606143488323, "eval_accuracy": 0.8199031949043274, "eval_loss": 0.5605600476264954, "eval_runtime": 570.6751, "eval_samples_per_second": 8.329, "eval_steps_per_second": 2.083, "step": 30 }, { "epoch": 0.016831474857984433, "grad_norm": 19.010900497436523, "learning_rate": 9.272727272727273e-05, "loss": 0.5335, "step": 40 }, { "epoch": 0.016831474857984433, "eval_accuracy": 0.7992846369743347, "eval_loss": 0.5063626766204834, "eval_runtime": 573.7613, "eval_samples_per_second": 8.284, "eval_steps_per_second": 2.072, "step": 40 }, { "epoch": 0.021039343572480537, "grad_norm": 8.88330078125, "learning_rate": 9.090909090909092e-05, "loss": 0.5275, "step": 50 }, { "epoch": 0.021039343572480537, "eval_accuracy": 0.8531453609466553, "eval_loss": 0.42206627130508423, "eval_runtime": 570.9439, "eval_samples_per_second": 8.325, "eval_steps_per_second": 2.083, "step": 50 }, { "epoch": 0.025247212286976645, "grad_norm": 7.115237236022949, "learning_rate": 8.90909090909091e-05, "loss": 0.332, "step": 60 }, { "epoch": 0.025247212286976645, "eval_accuracy": 0.8525142073631287, "eval_loss": 0.3472881615161896, "eval_runtime": 570.3915, "eval_samples_per_second": 8.333, "eval_steps_per_second": 2.085, "step": 60 }, { "epoch": 0.029455081001472753, "grad_norm": 4.491124629974365, "learning_rate": 8.727272727272727e-05, "loss": 0.3109, "step": 70 }, { "epoch": 0.029455081001472753, "eval_accuracy": 0.8998527526855469, "eval_loss": 0.24124681949615479, "eval_runtime": 571.9368, "eval_samples_per_second": 8.31, "eval_steps_per_second": 2.079, "step": 70 }, { "epoch": 0.033662949715968865, "grad_norm": 5.824629306793213, "learning_rate": 8.545454545454545e-05, "loss": 0.3167, "step": 80 }, { "epoch": 0.033662949715968865, "eval_accuracy": 0.9421418309211731, "eval_loss": 0.16658170521259308, "eval_runtime": 581.5044, "eval_samples_per_second": 8.174, "eval_steps_per_second": 2.045, "step": 80 }, { "epoch": 0.03787081843046497, "grad_norm": 14.90581226348877, "learning_rate": 8.363636363636364e-05, "loss": 0.441, "step": 90 }, { "epoch": 0.03787081843046497, "eval_accuracy": 0.9120555520057678, "eval_loss": 0.24102848768234253, "eval_runtime": 575.9749, "eval_samples_per_second": 8.252, "eval_steps_per_second": 2.064, "step": 90 }, { "epoch": 0.042078687144961074, "grad_norm": 5.160868167877197, "learning_rate": 8.181818181818183e-05, "loss": 0.3689, "step": 100 }, { "epoch": 0.042078687144961074, "eval_accuracy": 0.9105827808380127, "eval_loss": 0.24303115904331207, "eval_runtime": 566.896, "eval_samples_per_second": 8.384, "eval_steps_per_second": 2.097, "step": 100 }, { "epoch": 0.046286555859457186, "grad_norm": 4.731154918670654, "learning_rate": 8e-05, "loss": 0.2679, "step": 110 }, { "epoch": 0.046286555859457186, "eval_accuracy": 0.9086892604827881, "eval_loss": 0.261859655380249, "eval_runtime": 566.3825, "eval_samples_per_second": 8.392, "eval_steps_per_second": 2.099, "step": 110 }, { "epoch": 0.05049442457395329, "grad_norm": 3.6892170906066895, "learning_rate": 7.818181818181818e-05, "loss": 0.2681, "step": 120 }, { "epoch": 0.05049442457395329, "eval_accuracy": 0.8956448435783386, "eval_loss": 0.31176039576530457, "eval_runtime": 571.9255, "eval_samples_per_second": 8.311, "eval_steps_per_second": 2.079, "step": 120 }, { "epoch": 0.0547022932884494, "grad_norm": 0.8576626181602478, "learning_rate": 7.636363636363637e-05, "loss": 0.3362, "step": 130 }, { "epoch": 0.0547022932884494, "eval_accuracy": 0.9206816554069519, "eval_loss": 0.23810358345508575, "eval_runtime": 565.3779, "eval_samples_per_second": 8.407, "eval_steps_per_second": 2.103, "step": 130 }, { "epoch": 0.05891016200294551, "grad_norm": 0.5792973637580872, "learning_rate": 7.454545454545455e-05, "loss": 0.2112, "step": 140 }, { "epoch": 0.05891016200294551, "eval_accuracy": 0.9688617587089539, "eval_loss": 0.09994017332792282, "eval_runtime": 565.2596, "eval_samples_per_second": 8.409, "eval_steps_per_second": 2.103, "step": 140 }, { "epoch": 0.06311803071744161, "grad_norm": 13.7179594039917, "learning_rate": 7.272727272727273e-05, "loss": 0.2842, "step": 150 }, { "epoch": 0.06311803071744161, "eval_accuracy": 0.9562381505966187, "eval_loss": 0.13098464906215668, "eval_runtime": 568.883, "eval_samples_per_second": 8.355, "eval_steps_per_second": 2.09, "step": 150 }, { "epoch": 0.06732589943193773, "grad_norm": 13.58166217803955, "learning_rate": 7.090909090909092e-05, "loss": 0.2738, "step": 160 }, { "epoch": 0.06732589943193773, "eval_accuracy": 0.974121630191803, "eval_loss": 0.07404066622257233, "eval_runtime": 565.2694, "eval_samples_per_second": 8.408, "eval_steps_per_second": 2.103, "step": 160 }, { "epoch": 0.07153376814643383, "grad_norm": 1.1847424507141113, "learning_rate": 6.90909090909091e-05, "loss": 0.3975, "step": 170 }, { "epoch": 0.07153376814643383, "eval_accuracy": 0.9358299970626831, "eval_loss": 0.1807854324579239, "eval_runtime": 565.7049, "eval_samples_per_second": 8.402, "eval_steps_per_second": 2.102, "step": 170 }, { "epoch": 0.07574163686092994, "grad_norm": 0.5569546818733215, "learning_rate": 6.727272727272727e-05, "loss": 0.2441, "step": 180 }, { "epoch": 0.07574163686092994, "eval_accuracy": 0.9688617587089539, "eval_loss": 0.09184221178293228, "eval_runtime": 555.8532, "eval_samples_per_second": 8.551, "eval_steps_per_second": 2.139, "step": 180 }, { "epoch": 0.07994950557542604, "grad_norm": 5.223395347595215, "learning_rate": 6.545454545454546e-05, "loss": 0.15, "step": 190 }, { "epoch": 0.07994950557542604, "eval_accuracy": 0.9474016427993774, "eval_loss": 0.22705134749412537, "eval_runtime": 554.1797, "eval_samples_per_second": 8.577, "eval_steps_per_second": 2.146, "step": 190 }, { "epoch": 0.08415737428992215, "grad_norm": 19.633956909179688, "learning_rate": 6.363636363636364e-05, "loss": 0.4032, "step": 200 }, { "epoch": 0.08415737428992215, "eval_accuracy": 0.9152114391326904, "eval_loss": 0.3923904001712799, "eval_runtime": 554.483, "eval_samples_per_second": 8.572, "eval_steps_per_second": 2.144, "step": 200 }, { "epoch": 0.08836524300441827, "grad_norm": 0.19549815356731415, "learning_rate": 6.181818181818182e-05, "loss": 0.0958, "step": 210 }, { "epoch": 0.08836524300441827, "eval_accuracy": 0.9160529971122742, "eval_loss": 0.3585941791534424, "eval_runtime": 546.3632, "eval_samples_per_second": 8.699, "eval_steps_per_second": 2.176, "step": 210 }, { "epoch": 0.09257311171891437, "grad_norm": 1.1258805990219116, "learning_rate": 6e-05, "loss": 0.2203, "step": 220 }, { "epoch": 0.09257311171891437, "eval_accuracy": 0.895013689994812, "eval_loss": 0.3808138072490692, "eval_runtime": 549.8489, "eval_samples_per_second": 8.644, "eval_steps_per_second": 2.162, "step": 220 }, { "epoch": 0.09678098043341048, "grad_norm": 2.5008387565612793, "learning_rate": 5.818181818181818e-05, "loss": 0.0868, "step": 230 }, { "epoch": 0.09678098043341048, "eval_accuracy": 0.9440353512763977, "eval_loss": 0.21156248450279236, "eval_runtime": 554.8432, "eval_samples_per_second": 8.566, "eval_steps_per_second": 2.143, "step": 230 }, { "epoch": 0.10098884914790658, "grad_norm": 0.5708354115486145, "learning_rate": 5.636363636363636e-05, "loss": 0.1104, "step": 240 }, { "epoch": 0.10098884914790658, "eval_accuracy": 0.9168946146965027, "eval_loss": 0.3665996193885803, "eval_runtime": 557.2932, "eval_samples_per_second": 8.529, "eval_steps_per_second": 2.134, "step": 240 }, { "epoch": 0.1051967178624027, "grad_norm": 0.9385812878608704, "learning_rate": 5.4545454545454546e-05, "loss": 0.211, "step": 250 }, { "epoch": 0.1051967178624027, "eval_accuracy": 0.9593940377235413, "eval_loss": 0.1287224441766739, "eval_runtime": 553.8519, "eval_samples_per_second": 8.582, "eval_steps_per_second": 2.147, "step": 250 }, { "epoch": 0.1094045865768988, "grad_norm": 5.300781726837158, "learning_rate": 5.272727272727272e-05, "loss": 0.3359, "step": 260 }, { "epoch": 0.1094045865768988, "eval_accuracy": 0.9863244295120239, "eval_loss": 0.04394599795341492, "eval_runtime": 553.7785, "eval_samples_per_second": 8.583, "eval_steps_per_second": 2.147, "step": 260 }, { "epoch": 0.11361245529139491, "grad_norm": 6.374191761016846, "learning_rate": 5.090909090909091e-05, "loss": 0.2681, "step": 270 }, { "epoch": 0.11361245529139491, "eval_accuracy": 0.9440353512763977, "eval_loss": 0.16353961825370789, "eval_runtime": 553.8134, "eval_samples_per_second": 8.582, "eval_steps_per_second": 2.147, "step": 270 }, { "epoch": 0.11782032400589101, "grad_norm": 0.06790979206562042, "learning_rate": 4.909090909090909e-05, "loss": 0.0586, "step": 280 }, { "epoch": 0.11782032400589101, "eval_accuracy": 0.9688617587089539, "eval_loss": 0.08115343004465103, "eval_runtime": 552.432, "eval_samples_per_second": 8.604, "eval_steps_per_second": 2.152, "step": 280 }, { "epoch": 0.12202819272038712, "grad_norm": 2.017587423324585, "learning_rate": 4.7272727272727275e-05, "loss": 0.215, "step": 290 }, { "epoch": 0.12202819272038712, "eval_accuracy": 0.9255207180976868, "eval_loss": 0.22139500081539154, "eval_runtime": 552.6796, "eval_samples_per_second": 8.6, "eval_steps_per_second": 2.151, "step": 290 }, { "epoch": 0.12623606143488322, "grad_norm": 0.08541750907897949, "learning_rate": 4.545454545454546e-05, "loss": 0.1846, "step": 300 }, { "epoch": 0.12623606143488322, "eval_accuracy": 0.9837996959686279, "eval_loss": 0.049501266330480576, "eval_runtime": 551.6552, "eval_samples_per_second": 8.616, "eval_steps_per_second": 2.155, "step": 300 }, { "epoch": 0.13044393014937933, "grad_norm": 14.675045013427734, "learning_rate": 4.3636363636363636e-05, "loss": 0.2153, "step": 310 }, { "epoch": 0.13044393014937933, "eval_accuracy": 0.9835892915725708, "eval_loss": 0.05814049392938614, "eval_runtime": 553.8442, "eval_samples_per_second": 8.582, "eval_steps_per_second": 2.147, "step": 310 }, { "epoch": 0.13465179886387546, "grad_norm": 0.5145708918571472, "learning_rate": 4.181818181818182e-05, "loss": 0.0814, "step": 320 }, { "epoch": 0.13465179886387546, "eval_accuracy": 0.9575005173683167, "eval_loss": 0.1593790054321289, "eval_runtime": 554.2522, "eval_samples_per_second": 8.576, "eval_steps_per_second": 2.145, "step": 320 }, { "epoch": 0.13885966757837157, "grad_norm": 15.138551712036133, "learning_rate": 4e-05, "loss": 0.1322, "step": 330 }, { "epoch": 0.13885966757837157, "eval_accuracy": 0.9673890471458435, "eval_loss": 0.11336074024438858, "eval_runtime": 548.0702, "eval_samples_per_second": 8.672, "eval_steps_per_second": 2.169, "step": 330 }, { "epoch": 0.14306753629286767, "grad_norm": 0.017534606158733368, "learning_rate": 3.818181818181819e-05, "loss": 0.1172, "step": 340 }, { "epoch": 0.14306753629286767, "eval_accuracy": 0.9604460597038269, "eval_loss": 0.11512165516614914, "eval_runtime": 551.6579, "eval_samples_per_second": 8.616, "eval_steps_per_second": 2.155, "step": 340 }, { "epoch": 0.14727540500736377, "grad_norm": 0.008281617425382137, "learning_rate": 3.6363636363636364e-05, "loss": 0.0648, "step": 350 }, { "epoch": 0.14727540500736377, "eval_accuracy": 0.9758047461509705, "eval_loss": 0.06669025123119354, "eval_runtime": 553.29, "eval_samples_per_second": 8.59, "eval_steps_per_second": 2.149, "step": 350 }, { "epoch": 0.15148327372185988, "grad_norm": 0.17740336060523987, "learning_rate": 3.454545454545455e-05, "loss": 0.2261, "step": 360 }, { "epoch": 0.15148327372185988, "eval_accuracy": 0.967809796333313, "eval_loss": 0.09138375520706177, "eval_runtime": 561.1917, "eval_samples_per_second": 8.469, "eval_steps_per_second": 2.119, "step": 360 }, { "epoch": 0.15569114243635598, "grad_norm": 1.9274922609329224, "learning_rate": 3.272727272727273e-05, "loss": 0.1405, "step": 370 }, { "epoch": 0.15569114243635598, "eval_accuracy": 0.9215232729911804, "eval_loss": 0.294222891330719, "eval_runtime": 554.2437, "eval_samples_per_second": 8.576, "eval_steps_per_second": 2.145, "step": 370 }, { "epoch": 0.1598990111508521, "grad_norm": 20.162691116333008, "learning_rate": 3.090909090909091e-05, "loss": 0.2011, "step": 380 }, { "epoch": 0.1598990111508521, "eval_accuracy": 0.9770671129226685, "eval_loss": 0.06251883506774902, "eval_runtime": 556.1293, "eval_samples_per_second": 8.547, "eval_steps_per_second": 2.138, "step": 380 }, { "epoch": 0.1641068798653482, "grad_norm": 0.0279800184071064, "learning_rate": 2.909090909090909e-05, "loss": 0.1612, "step": 390 }, { "epoch": 0.1641068798653482, "eval_accuracy": 0.9840101003646851, "eval_loss": 0.040968313813209534, "eval_runtime": 554.0799, "eval_samples_per_second": 8.578, "eval_steps_per_second": 2.146, "step": 390 }, { "epoch": 0.1683147485798443, "grad_norm": 8.97774600982666, "learning_rate": 2.7272727272727273e-05, "loss": 0.1875, "step": 400 }, { "epoch": 0.1683147485798443, "eval_accuracy": 0.974121630191803, "eval_loss": 0.06667405366897583, "eval_runtime": 553.8832, "eval_samples_per_second": 8.581, "eval_steps_per_second": 2.147, "step": 400 }, { "epoch": 0.17252261729434043, "grad_norm": 0.09378823637962341, "learning_rate": 2.5454545454545454e-05, "loss": 0.0717, "step": 410 }, { "epoch": 0.17252261729434043, "eval_accuracy": 0.9829581379890442, "eval_loss": 0.04144088178873062, "eval_runtime": 554.6341, "eval_samples_per_second": 8.57, "eval_steps_per_second": 2.144, "step": 410 }, { "epoch": 0.17673048600883653, "grad_norm": 0.21639426052570343, "learning_rate": 2.3636363636363637e-05, "loss": 0.0524, "step": 420 }, { "epoch": 0.17673048600883653, "eval_accuracy": 0.9886387586593628, "eval_loss": 0.0340532623231411, "eval_runtime": 554.2365, "eval_samples_per_second": 8.576, "eval_steps_per_second": 2.145, "step": 420 }, { "epoch": 0.18093835472333264, "grad_norm": 0.07831648737192154, "learning_rate": 2.1818181818181818e-05, "loss": 0.0935, "step": 430 }, { "epoch": 0.18093835472333264, "eval_accuracy": 0.9842205047607422, "eval_loss": 0.0411146804690361, "eval_runtime": 557.1919, "eval_samples_per_second": 8.53, "eval_steps_per_second": 2.134, "step": 430 }, { "epoch": 0.18514622343782874, "grad_norm": 0.0247107595205307, "learning_rate": 2e-05, "loss": 0.0521, "step": 440 }, { "epoch": 0.18514622343782874, "eval_accuracy": 0.974121630191803, "eval_loss": 0.07654237747192383, "eval_runtime": 548.0314, "eval_samples_per_second": 8.673, "eval_steps_per_second": 2.17, "step": 440 }, { "epoch": 0.18935409215232485, "grad_norm": 0.10471628606319427, "learning_rate": 1.8181818181818182e-05, "loss": 0.0889, "step": 450 }, { "epoch": 0.18935409215232485, "eval_accuracy": 0.9608668088912964, "eval_loss": 0.1275644451379776, "eval_runtime": 558.2261, "eval_samples_per_second": 8.514, "eval_steps_per_second": 2.13, "step": 450 }, { "epoch": 0.19356196086682095, "grad_norm": 0.4881088435649872, "learning_rate": 1.6363636363636366e-05, "loss": 0.0387, "step": 460 }, { "epoch": 0.19356196086682095, "eval_accuracy": 0.9667578339576721, "eval_loss": 0.098667673766613, "eval_runtime": 556.0323, "eval_samples_per_second": 8.548, "eval_steps_per_second": 2.138, "step": 460 }, { "epoch": 0.19776982958131706, "grad_norm": 0.01689060404896736, "learning_rate": 1.4545454545454545e-05, "loss": 0.03, "step": 470 }, { "epoch": 0.19776982958131706, "eval_accuracy": 0.9734904170036316, "eval_loss": 0.07727943360805511, "eval_runtime": 554.2432, "eval_samples_per_second": 8.576, "eval_steps_per_second": 2.145, "step": 470 }, { "epoch": 0.20197769829581316, "grad_norm": 8.339999198913574, "learning_rate": 1.2727272727272727e-05, "loss": 0.0901, "step": 480 }, { "epoch": 0.20197769829581316, "eval_accuracy": 0.9829581379890442, "eval_loss": 0.04694819450378418, "eval_runtime": 556.8586, "eval_samples_per_second": 8.535, "eval_steps_per_second": 2.135, "step": 480 }, { "epoch": 0.20618556701030927, "grad_norm": 0.023077547550201416, "learning_rate": 1.0909090909090909e-05, "loss": 0.0873, "step": 490 }, { "epoch": 0.20618556701030927, "eval_accuracy": 0.9808542132377625, "eval_loss": 0.0580894760787487, "eval_runtime": 554.8362, "eval_samples_per_second": 8.566, "eval_steps_per_second": 2.143, "step": 490 }, { "epoch": 0.2103934357248054, "grad_norm": 0.014377394691109657, "learning_rate": 9.090909090909091e-06, "loss": 0.0201, "step": 500 }, { "epoch": 0.2103934357248054, "eval_accuracy": 0.9816957712173462, "eval_loss": 0.057598479092121124, "eval_runtime": 554.1331, "eval_samples_per_second": 8.577, "eval_steps_per_second": 2.146, "step": 500 }, { "epoch": 0.2146013044393015, "grad_norm": 0.12555623054504395, "learning_rate": 7.272727272727272e-06, "loss": 0.017, "step": 510 }, { "epoch": 0.2146013044393015, "eval_accuracy": 0.9802230000495911, "eval_loss": 0.06528125703334808, "eval_runtime": 553.9243, "eval_samples_per_second": 8.581, "eval_steps_per_second": 2.147, "step": 510 }, { "epoch": 0.2188091731537976, "grad_norm": 0.04903264343738556, "learning_rate": 5.4545454545454545e-06, "loss": 0.1464, "step": 520 }, { "epoch": 0.2188091731537976, "eval_accuracy": 0.9749631881713867, "eval_loss": 0.07879046350717545, "eval_runtime": 556.0819, "eval_samples_per_second": 8.547, "eval_steps_per_second": 2.138, "step": 520 }, { "epoch": 0.22301704186829371, "grad_norm": 0.016027677804231644, "learning_rate": 3.636363636363636e-06, "loss": 0.0105, "step": 530 }, { "epoch": 0.22301704186829371, "eval_accuracy": 0.9758047461509705, "eval_loss": 0.07757497578859329, "eval_runtime": 557.9097, "eval_samples_per_second": 8.519, "eval_steps_per_second": 2.131, "step": 530 }, { "epoch": 0.22722491058278982, "grad_norm": 0.08870552480220795, "learning_rate": 1.818181818181818e-06, "loss": 0.0124, "step": 540 }, { "epoch": 0.22722491058278982, "eval_accuracy": 0.9774879217147827, "eval_loss": 0.07116981595754623, "eval_runtime": 561.0259, "eval_samples_per_second": 8.472, "eval_steps_per_second": 2.119, "step": 540 }, { "epoch": 0.23143277929728592, "grad_norm": 7.751413822174072, "learning_rate": 0.0, "loss": 0.061, "step": 550 }, { "epoch": 0.23143277929728592, "eval_accuracy": 0.9779086709022522, "eval_loss": 0.06926773488521576, "eval_runtime": 562.0676, "eval_samples_per_second": 8.456, "eval_steps_per_second": 2.115, "step": 550 } ], "logging_steps": 10, "max_steps": 550, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.208452205931052e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }