{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.24509803921568626, "eval_steps": 10, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004456327985739751, "grad_norm": 0.7916810512542725, "learning_rate": 0.0002945454545454545, "loss": 0.3376, "step": 10 }, { "epoch": 0.004456327985739751, "eval_accuracy": 0.89683598279953, "eval_loss": 0.6079411506652832, "eval_runtime": 537.6759, "eval_samples_per_second": 8.347, "eval_steps_per_second": 2.087, "step": 10 }, { "epoch": 0.008912655971479501, "grad_norm": 0.1121700331568718, "learning_rate": 0.00028909090909090904, "loss": 0.3858, "step": 20 }, { "epoch": 0.008912655971479501, "eval_accuracy": 0.89683598279953, "eval_loss": 0.4899146854877472, "eval_runtime": 528.4079, "eval_samples_per_second": 8.493, "eval_steps_per_second": 2.123, "step": 20 }, { "epoch": 0.013368983957219251, "grad_norm": 0.4192979633808136, "learning_rate": 0.0002836363636363636, "loss": 0.4091, "step": 30 }, { "epoch": 0.013368983957219251, "eval_accuracy": 0.89683598279953, "eval_loss": 0.4288414418697357, "eval_runtime": 532.0654, "eval_samples_per_second": 8.435, "eval_steps_per_second": 2.109, "step": 30 }, { "epoch": 0.017825311942959002, "grad_norm": 0.6693852543830872, "learning_rate": 0.00027818181818181815, "loss": 0.3274, "step": 40 }, { "epoch": 0.017825311942959002, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3423554301261902, "eval_runtime": 534.016, "eval_samples_per_second": 8.404, "eval_steps_per_second": 2.101, "step": 40 }, { "epoch": 0.022281639928698752, "grad_norm": 2.928346633911133, "learning_rate": 0.0002727272727272727, "loss": 0.3434, "step": 50 }, { "epoch": 0.022281639928698752, "eval_accuracy": 0.89683598279953, "eval_loss": 0.32134178280830383, "eval_runtime": 540.6384, "eval_samples_per_second": 8.301, "eval_steps_per_second": 2.075, "step": 50 }, { "epoch": 0.026737967914438502, "grad_norm": 7.551202774047852, "learning_rate": 0.0002672727272727272, "loss": 0.3627, "step": 60 }, { "epoch": 0.026737967914438502, "eval_accuracy": 0.89683598279953, "eval_loss": 0.24254043400287628, "eval_runtime": 532.8983, "eval_samples_per_second": 8.422, "eval_steps_per_second": 2.105, "step": 60 }, { "epoch": 0.031194295900178252, "grad_norm": 5.043577671051025, "learning_rate": 0.0002618181818181818, "loss": 0.3062, "step": 70 }, { "epoch": 0.031194295900178252, "eval_accuracy": 0.8770053386688232, "eval_loss": 0.16930516064167023, "eval_runtime": 531.9662, "eval_samples_per_second": 8.437, "eval_steps_per_second": 2.109, "step": 70 }, { "epoch": 0.035650623885918005, "grad_norm": 0.5775203108787537, "learning_rate": 0.00025636363636363633, "loss": 0.2115, "step": 80 }, { "epoch": 0.035650623885918005, "eval_accuracy": 0.8687611222267151, "eval_loss": 0.2173369973897934, "eval_runtime": 536.5487, "eval_samples_per_second": 8.365, "eval_steps_per_second": 2.091, "step": 80 }, { "epoch": 0.040106951871657755, "grad_norm": 1.0835380554199219, "learning_rate": 0.00025090909090909086, "loss": 0.2757, "step": 90 }, { "epoch": 0.040106951871657755, "eval_accuracy": 0.8426916003227234, "eval_loss": 0.2951839566230774, "eval_runtime": 531.3931, "eval_samples_per_second": 8.446, "eval_steps_per_second": 2.111, "step": 90 }, { "epoch": 0.044563279857397504, "grad_norm": 0.8163406848907471, "learning_rate": 0.00024545454545454545, "loss": 0.2344, "step": 100 }, { "epoch": 0.044563279857397504, "eval_accuracy": 0.8957219123840332, "eval_loss": 0.17275306582450867, "eval_runtime": 539.8237, "eval_samples_per_second": 8.314, "eval_steps_per_second": 2.078, "step": 100 }, { "epoch": 0.049019607843137254, "grad_norm": 0.909106433391571, "learning_rate": 0.00023999999999999998, "loss": 0.3855, "step": 110 }, { "epoch": 0.049019607843137254, "eval_accuracy": 0.9086452722549438, "eval_loss": 0.22119583189487457, "eval_runtime": 532.7138, "eval_samples_per_second": 8.425, "eval_steps_per_second": 2.106, "step": 110 }, { "epoch": 0.053475935828877004, "grad_norm": 4.141552448272705, "learning_rate": 0.00023454545454545454, "loss": 0.3104, "step": 120 }, { "epoch": 0.053475935828877004, "eval_accuracy": 0.8426916003227234, "eval_loss": 0.2036559134721756, "eval_runtime": 539.9806, "eval_samples_per_second": 8.311, "eval_steps_per_second": 2.078, "step": 120 }, { "epoch": 0.057932263814616754, "grad_norm": 0.04542813077569008, "learning_rate": 0.00022909090909090907, "loss": 0.235, "step": 130 }, { "epoch": 0.057932263814616754, "eval_accuracy": 0.89683598279953, "eval_loss": 0.6018572449684143, "eval_runtime": 532.3019, "eval_samples_per_second": 8.431, "eval_steps_per_second": 2.108, "step": 130 }, { "epoch": 0.062388591800356503, "grad_norm": 0.20932789146900177, "learning_rate": 0.00022363636363636363, "loss": 0.383, "step": 140 }, { "epoch": 0.062388591800356503, "eval_accuracy": 0.89683598279953, "eval_loss": 0.1932043582201004, "eval_runtime": 534.2227, "eval_samples_per_second": 8.401, "eval_steps_per_second": 2.1, "step": 140 }, { "epoch": 0.06684491978609626, "grad_norm": 2.113372325897217, "learning_rate": 0.00021818181818181816, "loss": 0.3633, "step": 150 }, { "epoch": 0.06684491978609626, "eval_accuracy": 0.9097593426704407, "eval_loss": 0.27564167976379395, "eval_runtime": 533.5467, "eval_samples_per_second": 8.412, "eval_steps_per_second": 2.103, "step": 150 }, { "epoch": 0.07130124777183601, "grad_norm": 2.0779898166656494, "learning_rate": 0.00021272727272727272, "loss": 0.2534, "step": 160 }, { "epoch": 0.07130124777183601, "eval_accuracy": 0.9124331474304199, "eval_loss": 0.17877253890037537, "eval_runtime": 534.2313, "eval_samples_per_second": 8.401, "eval_steps_per_second": 2.1, "step": 160 }, { "epoch": 0.07575757575757576, "grad_norm": 2.8469862937927246, "learning_rate": 0.00020727272727272725, "loss": 0.1618, "step": 170 }, { "epoch": 0.07575757575757576, "eval_accuracy": 0.9001782536506653, "eval_loss": 0.21939000487327576, "eval_runtime": 539.3123, "eval_samples_per_second": 8.322, "eval_steps_per_second": 2.08, "step": 170 }, { "epoch": 0.08021390374331551, "grad_norm": 0.3544740378856659, "learning_rate": 0.0002018181818181818, "loss": 0.3204, "step": 180 }, { "epoch": 0.08021390374331551, "eval_accuracy": 0.7437611222267151, "eval_loss": 0.6187111735343933, "eval_runtime": 539.0642, "eval_samples_per_second": 8.326, "eval_steps_per_second": 2.081, "step": 180 }, { "epoch": 0.08467023172905526, "grad_norm": 0.08713312447071075, "learning_rate": 0.00019636363636363634, "loss": 0.4719, "step": 190 }, { "epoch": 0.08467023172905526, "eval_accuracy": 0.917557954788208, "eval_loss": 0.15933164954185486, "eval_runtime": 532.5075, "eval_samples_per_second": 8.428, "eval_steps_per_second": 2.107, "step": 190 }, { "epoch": 0.08912655971479501, "grad_norm": 0.10667263716459274, "learning_rate": 0.0001909090909090909, "loss": 0.2124, "step": 200 }, { "epoch": 0.08912655971479501, "eval_accuracy": 0.874331533908844, "eval_loss": 0.2731720209121704, "eval_runtime": 532.8068, "eval_samples_per_second": 8.423, "eval_steps_per_second": 2.106, "step": 200 }, { "epoch": 0.09358288770053476, "grad_norm": 0.09285986423492432, "learning_rate": 0.00018545454545454543, "loss": 0.1807, "step": 210 }, { "epoch": 0.09358288770053476, "eval_accuracy": 0.9458556175231934, "eval_loss": 0.15097761154174805, "eval_runtime": 534.5302, "eval_samples_per_second": 8.396, "eval_steps_per_second": 2.099, "step": 210 }, { "epoch": 0.09803921568627451, "grad_norm": 0.21800605952739716, "learning_rate": 0.00017999999999999998, "loss": 0.241, "step": 220 }, { "epoch": 0.09803921568627451, "eval_accuracy": 0.9117646813392639, "eval_loss": 0.20164452493190765, "eval_runtime": 534.0184, "eval_samples_per_second": 8.404, "eval_steps_per_second": 2.101, "step": 220 }, { "epoch": 0.10249554367201426, "grad_norm": 2.175548553466797, "learning_rate": 0.00017454545454545452, "loss": 0.3668, "step": 230 }, { "epoch": 0.10249554367201426, "eval_accuracy": 0.9079768061637878, "eval_loss": 0.22817277908325195, "eval_runtime": 533.4214, "eval_samples_per_second": 8.414, "eval_steps_per_second": 2.103, "step": 230 }, { "epoch": 0.10695187165775401, "grad_norm": 2.417444944381714, "learning_rate": 0.00016909090909090907, "loss": 0.2424, "step": 240 }, { "epoch": 0.10695187165775401, "eval_accuracy": 0.9364973306655884, "eval_loss": 0.1917591691017151, "eval_runtime": 539.0061, "eval_samples_per_second": 8.326, "eval_steps_per_second": 2.082, "step": 240 }, { "epoch": 0.11140819964349376, "grad_norm": 1.2514077425003052, "learning_rate": 0.0001636363636363636, "loss": 0.1934, "step": 250 }, { "epoch": 0.11140819964349376, "eval_accuracy": 0.934937596321106, "eval_loss": 0.14482024312019348, "eval_runtime": 535.7699, "eval_samples_per_second": 8.377, "eval_steps_per_second": 2.094, "step": 250 }, { "epoch": 0.11586452762923351, "grad_norm": 7.08188533782959, "learning_rate": 0.00015818181818181816, "loss": 0.3554, "step": 260 }, { "epoch": 0.11586452762923351, "eval_accuracy": 0.9601158499717712, "eval_loss": 0.09824846684932709, "eval_runtime": 532.0513, "eval_samples_per_second": 8.435, "eval_steps_per_second": 2.109, "step": 260 }, { "epoch": 0.12032085561497326, "grad_norm": 1.4493603706359863, "learning_rate": 0.0001527272727272727, "loss": 0.1619, "step": 270 }, { "epoch": 0.12032085561497326, "eval_accuracy": 0.9569964408874512, "eval_loss": 0.11313515901565552, "eval_runtime": 531.5383, "eval_samples_per_second": 8.443, "eval_steps_per_second": 2.111, "step": 270 }, { "epoch": 0.12477718360071301, "grad_norm": 0.9487653374671936, "learning_rate": 0.00014727272727272725, "loss": 0.0925, "step": 280 }, { "epoch": 0.12477718360071301, "eval_accuracy": 0.9748217463493347, "eval_loss": 0.06883005797863007, "eval_runtime": 533.9256, "eval_samples_per_second": 8.406, "eval_steps_per_second": 2.101, "step": 280 }, { "epoch": 0.12923351158645277, "grad_norm": 0.0604180209338665, "learning_rate": 0.0001418181818181818, "loss": 0.1223, "step": 290 }, { "epoch": 0.12923351158645277, "eval_accuracy": 0.9766042828559875, "eval_loss": 0.056954748928546906, "eval_runtime": 539.6876, "eval_samples_per_second": 8.316, "eval_steps_per_second": 2.079, "step": 290 }, { "epoch": 0.13368983957219252, "grad_norm": 2.264599561691284, "learning_rate": 0.00013636363636363634, "loss": 0.1144, "step": 300 }, { "epoch": 0.13368983957219252, "eval_accuracy": 0.9741532802581787, "eval_loss": 0.06963187456130981, "eval_runtime": 535.6675, "eval_samples_per_second": 8.378, "eval_steps_per_second": 2.095, "step": 300 }, { "epoch": 0.13814616755793227, "grad_norm": 3.3151047229766846, "learning_rate": 0.0001309090909090909, "loss": 0.0702, "step": 310 }, { "epoch": 0.13814616755793227, "eval_accuracy": 0.8979501128196716, "eval_loss": 0.3257082402706146, "eval_runtime": 539.7299, "eval_samples_per_second": 8.315, "eval_steps_per_second": 2.079, "step": 310 }, { "epoch": 0.14260249554367202, "grad_norm": 0.20427709817886353, "learning_rate": 0.00012545454545454543, "loss": 0.1778, "step": 320 }, { "epoch": 0.14260249554367202, "eval_accuracy": 0.9463012218475342, "eval_loss": 0.15733219683170319, "eval_runtime": 532.2728, "eval_samples_per_second": 8.432, "eval_steps_per_second": 2.108, "step": 320 }, { "epoch": 0.14705882352941177, "grad_norm": 0.29525595903396606, "learning_rate": 0.00011999999999999999, "loss": 0.1151, "step": 330 }, { "epoch": 0.14705882352941177, "eval_accuracy": 0.9881907105445862, "eval_loss": 0.04204181954264641, "eval_runtime": 532.9583, "eval_samples_per_second": 8.421, "eval_steps_per_second": 2.105, "step": 330 }, { "epoch": 0.15151515151515152, "grad_norm": 0.05631331354379654, "learning_rate": 0.00011454545454545453, "loss": 0.0784, "step": 340 }, { "epoch": 0.15151515151515152, "eval_accuracy": 0.9870766401290894, "eval_loss": 0.041727881878614426, "eval_runtime": 533.1617, "eval_samples_per_second": 8.418, "eval_steps_per_second": 2.104, "step": 340 }, { "epoch": 0.15597147950089127, "grad_norm": 1.579466700553894, "learning_rate": 0.00010909090909090908, "loss": 0.1782, "step": 350 }, { "epoch": 0.15597147950089127, "eval_accuracy": 0.981951892375946, "eval_loss": 0.04734932258725166, "eval_runtime": 530.6175, "eval_samples_per_second": 8.458, "eval_steps_per_second": 2.115, "step": 350 }, { "epoch": 0.16042780748663102, "grad_norm": 0.4887264370918274, "learning_rate": 0.00010363636363636362, "loss": 0.0328, "step": 360 }, { "epoch": 0.16042780748663102, "eval_accuracy": 0.9826202988624573, "eval_loss": 0.0771423876285553, "eval_runtime": 539.3929, "eval_samples_per_second": 8.32, "eval_steps_per_second": 2.08, "step": 360 }, { "epoch": 0.16488413547237077, "grad_norm": 0.05755012482404709, "learning_rate": 9.818181818181817e-05, "loss": 0.1689, "step": 370 }, { "epoch": 0.16488413547237077, "eval_accuracy": 0.9770498871803284, "eval_loss": 0.049294665455818176, "eval_runtime": 534.2657, "eval_samples_per_second": 8.4, "eval_steps_per_second": 2.1, "step": 370 }, { "epoch": 0.16934046345811052, "grad_norm": 0.2694285809993744, "learning_rate": 9.272727272727271e-05, "loss": 0.0219, "step": 380 }, { "epoch": 0.16934046345811052, "eval_accuracy": 0.9402852058410645, "eval_loss": 0.14428526163101196, "eval_runtime": 531.3442, "eval_samples_per_second": 8.447, "eval_steps_per_second": 2.112, "step": 380 }, { "epoch": 0.17379679144385027, "grad_norm": 0.12420541048049927, "learning_rate": 8.727272727272726e-05, "loss": 0.0951, "step": 390 }, { "epoch": 0.17379679144385027, "eval_accuracy": 0.9977718591690063, "eval_loss": 0.012917960062623024, "eval_runtime": 535.2048, "eval_samples_per_second": 8.386, "eval_steps_per_second": 2.096, "step": 390 }, { "epoch": 0.17825311942959002, "grad_norm": 0.25717079639434814, "learning_rate": 8.18181818181818e-05, "loss": 0.0244, "step": 400 }, { "epoch": 0.17825311942959002, "eval_accuracy": 0.9968805909156799, "eval_loss": 0.010928132571280003, "eval_runtime": 531.9808, "eval_samples_per_second": 8.436, "eval_steps_per_second": 2.109, "step": 400 }, { "epoch": 0.18270944741532977, "grad_norm": 8.522324562072754, "learning_rate": 7.636363636363635e-05, "loss": 0.0142, "step": 410 }, { "epoch": 0.18270944741532977, "eval_accuracy": 0.9942067861557007, "eval_loss": 0.01725505292415619, "eval_runtime": 536.5062, "eval_samples_per_second": 8.365, "eval_steps_per_second": 2.091, "step": 410 }, { "epoch": 0.18716577540106952, "grad_norm": 0.08865547925233841, "learning_rate": 7.09090909090909e-05, "loss": 0.0702, "step": 420 }, { "epoch": 0.18716577540106952, "eval_accuracy": 0.9772727489471436, "eval_loss": 0.060311682522296906, "eval_runtime": 532.5572, "eval_samples_per_second": 8.427, "eval_steps_per_second": 2.107, "step": 420 }, { "epoch": 0.19162210338680927, "grad_norm": 0.02239236794412136, "learning_rate": 6.545454545454545e-05, "loss": 0.0209, "step": 430 }, { "epoch": 0.19162210338680927, "eval_accuracy": 0.9904189109802246, "eval_loss": 0.02353021316230297, "eval_runtime": 538.0122, "eval_samples_per_second": 8.342, "eval_steps_per_second": 2.085, "step": 430 }, { "epoch": 0.19607843137254902, "grad_norm": 14.454584121704102, "learning_rate": 5.9999999999999995e-05, "loss": 0.0914, "step": 440 }, { "epoch": 0.19607843137254902, "eval_accuracy": 0.9939839839935303, "eval_loss": 0.01657612808048725, "eval_runtime": 532.1826, "eval_samples_per_second": 8.433, "eval_steps_per_second": 2.108, "step": 440 }, { "epoch": 0.20053475935828877, "grad_norm": 0.065692238509655, "learning_rate": 5.454545454545454e-05, "loss": 0.0645, "step": 450 }, { "epoch": 0.20053475935828877, "eval_accuracy": 0.9886363744735718, "eval_loss": 0.029221026226878166, "eval_runtime": 532.6751, "eval_samples_per_second": 8.425, "eval_steps_per_second": 2.106, "step": 450 }, { "epoch": 0.20499108734402852, "grad_norm": 0.03740346431732178, "learning_rate": 4.9090909090909084e-05, "loss": 0.0027, "step": 460 }, { "epoch": 0.20499108734402852, "eval_accuracy": 0.9848484992980957, "eval_loss": 0.04155493900179863, "eval_runtime": 531.2072, "eval_samples_per_second": 8.449, "eval_steps_per_second": 2.112, "step": 460 }, { "epoch": 0.20944741532976827, "grad_norm": 0.31382623314857483, "learning_rate": 4.363636363636363e-05, "loss": 0.1536, "step": 470 }, { "epoch": 0.20944741532976827, "eval_accuracy": 0.9879679083824158, "eval_loss": 0.03183320537209511, "eval_runtime": 531.0891, "eval_samples_per_second": 8.451, "eval_steps_per_second": 2.113, "step": 470 }, { "epoch": 0.21390374331550802, "grad_norm": 0.028712157160043716, "learning_rate": 3.8181818181818174e-05, "loss": 0.086, "step": 480 }, { "epoch": 0.21390374331550802, "eval_accuracy": 0.987522304058075, "eval_loss": 0.032387156039476395, "eval_runtime": 538.4057, "eval_samples_per_second": 8.336, "eval_steps_per_second": 2.084, "step": 480 }, { "epoch": 0.21836007130124777, "grad_norm": 0.016576524823904037, "learning_rate": 3.2727272727272725e-05, "loss": 0.0223, "step": 490 }, { "epoch": 0.21836007130124777, "eval_accuracy": 0.9955436587333679, "eval_loss": 0.011954777874052525, "eval_runtime": 530.6249, "eval_samples_per_second": 8.458, "eval_steps_per_second": 2.114, "step": 490 }, { "epoch": 0.22281639928698752, "grad_norm": 0.014118828810751438, "learning_rate": 2.727272727272727e-05, "loss": 0.0235, "step": 500 }, { "epoch": 0.22281639928698752, "eval_accuracy": 0.9962121248245239, "eval_loss": 0.010361210443079472, "eval_runtime": 539.7181, "eval_samples_per_second": 8.315, "eval_steps_per_second": 2.079, "step": 500 }, { "epoch": 0.22727272727272727, "grad_norm": 0.027222217991948128, "learning_rate": 2.1818181818181814e-05, "loss": 0.0025, "step": 510 }, { "epoch": 0.22727272727272727, "eval_accuracy": 0.9966577291488647, "eval_loss": 0.00987223070114851, "eval_runtime": 530.4418, "eval_samples_per_second": 8.461, "eval_steps_per_second": 2.115, "step": 510 }, { "epoch": 0.23172905525846701, "grad_norm": 9.495808601379395, "learning_rate": 1.6363636363636363e-05, "loss": 0.0254, "step": 520 }, { "epoch": 0.23172905525846701, "eval_accuracy": 0.9962121248245239, "eval_loss": 0.010062512941658497, "eval_runtime": 532.4144, "eval_samples_per_second": 8.43, "eval_steps_per_second": 2.107, "step": 520 }, { "epoch": 0.23618538324420676, "grad_norm": 0.02721397951245308, "learning_rate": 1.0909090909090907e-05, "loss": 0.0741, "step": 530 }, { "epoch": 0.23618538324420676, "eval_accuracy": 0.9944295883178711, "eval_loss": 0.014333564788103104, "eval_runtime": 533.0522, "eval_samples_per_second": 8.419, "eval_steps_per_second": 2.105, "step": 530 }, { "epoch": 0.24064171122994651, "grad_norm": 0.20209485292434692, "learning_rate": 5.454545454545454e-06, "loss": 0.0305, "step": 540 }, { "epoch": 0.24064171122994651, "eval_accuracy": 0.9935383200645447, "eval_loss": 0.016204852610826492, "eval_runtime": 531.1995, "eval_samples_per_second": 8.449, "eval_steps_per_second": 2.112, "step": 540 }, { "epoch": 0.24509803921568626, "grad_norm": 7.379052639007568, "learning_rate": 0.0, "loss": 0.007, "step": 550 }, { "epoch": 0.24509803921568626, "eval_accuracy": 0.9935383200645447, "eval_loss": 0.016523057594895363, "eval_runtime": 537.9402, "eval_samples_per_second": 8.343, "eval_steps_per_second": 2.086, "step": 550 } ], "logging_steps": 10, "max_steps": 550, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.326400520422712e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }