{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.24509803921568626, "eval_steps": 10, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004456327985739751, "grad_norm": 0.49718141555786133, "learning_rate": 0.00039272727272727273, "loss": 0.3714, "step": 10 }, { "epoch": 0.004456327985739751, "eval_accuracy": 0.89683598279953, "eval_loss": 0.4125003516674042, "eval_runtime": 535.6711, "eval_samples_per_second": 8.378, "eval_steps_per_second": 2.095, "step": 10 }, { "epoch": 0.008912655971479501, "grad_norm": 0.15621569752693176, "learning_rate": 0.0003854545454545455, "loss": 0.3392, "step": 20 }, { "epoch": 0.008912655971479501, "eval_accuracy": 0.89683598279953, "eval_loss": 0.445027619600296, "eval_runtime": 527.1231, "eval_samples_per_second": 8.514, "eval_steps_per_second": 2.129, "step": 20 }, { "epoch": 0.013368983957219251, "grad_norm": 0.836126446723938, "learning_rate": 0.0003781818181818182, "loss": 0.3419, "step": 30 }, { "epoch": 0.013368983957219251, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3319561779499054, "eval_runtime": 533.3212, "eval_samples_per_second": 8.415, "eval_steps_per_second": 2.104, "step": 30 }, { "epoch": 0.017825311942959002, "grad_norm": 0.5524694919586182, "learning_rate": 0.0003709090909090909, "loss": 0.2675, "step": 40 }, { "epoch": 0.017825311942959002, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3492385745048523, "eval_runtime": 534.7065, "eval_samples_per_second": 8.393, "eval_steps_per_second": 2.098, "step": 40 }, { "epoch": 0.022281639928698752, "grad_norm": 3.021937847137451, "learning_rate": 0.00036363636363636367, "loss": 0.3912, "step": 50 }, { "epoch": 0.022281639928698752, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3363156020641327, "eval_runtime": 537.4904, "eval_samples_per_second": 8.35, "eval_steps_per_second": 2.087, "step": 50 }, { "epoch": 0.026737967914438502, "grad_norm": 0.8108776807785034, "learning_rate": 0.0003563636363636364, "loss": 0.3462, "step": 60 }, { "epoch": 0.026737967914438502, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3350915014743805, "eval_runtime": 531.3591, "eval_samples_per_second": 8.446, "eval_steps_per_second": 2.112, "step": 60 }, { "epoch": 0.031194295900178252, "grad_norm": 0.2669970989227295, "learning_rate": 0.0003490909090909091, "loss": 0.3693, "step": 70 }, { "epoch": 0.031194295900178252, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3409470319747925, "eval_runtime": 529.3629, "eval_samples_per_second": 8.478, "eval_steps_per_second": 2.12, "step": 70 }, { "epoch": 0.035650623885918005, "grad_norm": 0.12394805252552032, "learning_rate": 0.0003418181818181818, "loss": 0.4736, "step": 80 }, { "epoch": 0.035650623885918005, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3342541754245758, "eval_runtime": 532.7547, "eval_samples_per_second": 8.424, "eval_steps_per_second": 2.106, "step": 80 }, { "epoch": 0.040106951871657755, "grad_norm": 2.5240843296051025, "learning_rate": 0.00033454545454545456, "loss": 0.3994, "step": 90 }, { "epoch": 0.040106951871657755, "eval_accuracy": 0.89683598279953, "eval_loss": 0.30716776847839355, "eval_runtime": 533.0712, "eval_samples_per_second": 8.419, "eval_steps_per_second": 2.105, "step": 90 }, { "epoch": 0.044563279857397504, "grad_norm": 0.5875090956687927, "learning_rate": 0.0003272727272727273, "loss": 0.3118, "step": 100 }, { "epoch": 0.044563279857397504, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3242183029651642, "eval_runtime": 538.0823, "eval_samples_per_second": 8.341, "eval_steps_per_second": 2.085, "step": 100 }, { "epoch": 0.049019607843137254, "grad_norm": 1.1133062839508057, "learning_rate": 0.00032, "loss": 0.4577, "step": 110 }, { "epoch": 0.049019607843137254, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3053763210773468, "eval_runtime": 532.813, "eval_samples_per_second": 8.423, "eval_steps_per_second": 2.106, "step": 110 }, { "epoch": 0.053475935828877004, "grad_norm": 1.3879055976867676, "learning_rate": 0.00031272727272727273, "loss": 0.3927, "step": 120 }, { "epoch": 0.053475935828877004, "eval_accuracy": 0.89683598279953, "eval_loss": 0.29549089074134827, "eval_runtime": 542.0362, "eval_samples_per_second": 8.28, "eval_steps_per_second": 2.07, "step": 120 }, { "epoch": 0.057932263814616754, "grad_norm": 0.12985749542713165, "learning_rate": 0.0003054545454545455, "loss": 0.229, "step": 130 }, { "epoch": 0.057932263814616754, "eval_accuracy": 0.8972816467285156, "eval_loss": 0.26039257645606995, "eval_runtime": 536.2206, "eval_samples_per_second": 8.37, "eval_steps_per_second": 2.092, "step": 130 }, { "epoch": 0.062388591800356503, "grad_norm": 1.7885103225708008, "learning_rate": 0.0002981818181818182, "loss": 0.2635, "step": 140 }, { "epoch": 0.062388591800356503, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3494693636894226, "eval_runtime": 538.5549, "eval_samples_per_second": 8.333, "eval_steps_per_second": 2.083, "step": 140 }, { "epoch": 0.06684491978609626, "grad_norm": 0.5339781641960144, "learning_rate": 0.0002909090909090909, "loss": 0.4063, "step": 150 }, { "epoch": 0.06684491978609626, "eval_accuracy": 0.89683598279953, "eval_loss": 0.2612840533256531, "eval_runtime": 538.3622, "eval_samples_per_second": 8.336, "eval_steps_per_second": 2.084, "step": 150 }, { "epoch": 0.07130124777183601, "grad_norm": 1.367612600326538, "learning_rate": 0.0002836363636363637, "loss": 0.2946, "step": 160 }, { "epoch": 0.07130124777183601, "eval_accuracy": 0.89683598279953, "eval_loss": 0.2859351336956024, "eval_runtime": 537.0937, "eval_samples_per_second": 8.356, "eval_steps_per_second": 2.089, "step": 160 }, { "epoch": 0.07575757575757576, "grad_norm": 1.688023567199707, "learning_rate": 0.0002763636363636364, "loss": 0.2653, "step": 170 }, { "epoch": 0.07575757575757576, "eval_accuracy": 0.89371657371521, "eval_loss": 0.20645655691623688, "eval_runtime": 544.7393, "eval_samples_per_second": 8.239, "eval_steps_per_second": 2.06, "step": 170 }, { "epoch": 0.08021390374331551, "grad_norm": 1.5258842706680298, "learning_rate": 0.0002690909090909091, "loss": 0.2861, "step": 180 }, { "epoch": 0.08021390374331551, "eval_accuracy": 0.8636363744735718, "eval_loss": 0.2519168555736542, "eval_runtime": 536.5715, "eval_samples_per_second": 8.364, "eval_steps_per_second": 2.091, "step": 180 }, { "epoch": 0.08467023172905526, "grad_norm": 0.10538855940103531, "learning_rate": 0.00026181818181818185, "loss": 0.2971, "step": 190 }, { "epoch": 0.08467023172905526, "eval_accuracy": 0.89683598279953, "eval_loss": 0.4605836868286133, "eval_runtime": 537.7296, "eval_samples_per_second": 8.346, "eval_steps_per_second": 2.087, "step": 190 }, { "epoch": 0.08912655971479501, "grad_norm": 1.2356140613555908, "learning_rate": 0.00025454545454545456, "loss": 0.3759, "step": 200 }, { "epoch": 0.08912655971479501, "eval_accuracy": 0.89683598279953, "eval_loss": 0.2962367832660675, "eval_runtime": 538.9507, "eval_samples_per_second": 8.327, "eval_steps_per_second": 2.082, "step": 200 }, { "epoch": 0.09358288770053476, "grad_norm": 0.343402624130249, "learning_rate": 0.00024727272727272727, "loss": 0.2156, "step": 210 }, { "epoch": 0.09358288770053476, "eval_accuracy": 0.89683598279953, "eval_loss": 0.2896555960178375, "eval_runtime": 537.3494, "eval_samples_per_second": 8.352, "eval_steps_per_second": 2.088, "step": 210 }, { "epoch": 0.09803921568627451, "grad_norm": 0.3443286120891571, "learning_rate": 0.00024, "loss": 0.2512, "step": 220 }, { "epoch": 0.09803921568627451, "eval_accuracy": 0.89683598279953, "eval_loss": 0.25072118639945984, "eval_runtime": 534.4338, "eval_samples_per_second": 8.398, "eval_steps_per_second": 2.099, "step": 220 }, { "epoch": 0.10249554367201426, "grad_norm": 1.4869568347930908, "learning_rate": 0.00023272727272727271, "loss": 0.2509, "step": 230 }, { "epoch": 0.10249554367201426, "eval_accuracy": 0.8954991102218628, "eval_loss": 0.19357898831367493, "eval_runtime": 535.513, "eval_samples_per_second": 8.381, "eval_steps_per_second": 2.095, "step": 230 }, { "epoch": 0.10695187165775401, "grad_norm": 2.3914287090301514, "learning_rate": 0.00022545454545454545, "loss": 0.2218, "step": 240 }, { "epoch": 0.10695187165775401, "eval_accuracy": 0.9015151262283325, "eval_loss": 0.21406562626361847, "eval_runtime": 534.6792, "eval_samples_per_second": 8.394, "eval_steps_per_second": 2.098, "step": 240 }, { "epoch": 0.11140819964349376, "grad_norm": 2.300201416015625, "learning_rate": 0.00021818181818181818, "loss": 0.3693, "step": 250 }, { "epoch": 0.11140819964349376, "eval_accuracy": 0.8792335391044617, "eval_loss": 0.21343877911567688, "eval_runtime": 530.968, "eval_samples_per_second": 8.452, "eval_steps_per_second": 2.113, "step": 250 }, { "epoch": 0.11586452762923351, "grad_norm": 2.3365836143493652, "learning_rate": 0.0002109090909090909, "loss": 0.3644, "step": 260 }, { "epoch": 0.11586452762923351, "eval_accuracy": 0.89683598279953, "eval_loss": 0.22058773040771484, "eval_runtime": 530.143, "eval_samples_per_second": 8.466, "eval_steps_per_second": 2.116, "step": 260 }, { "epoch": 0.12032085561497326, "grad_norm": 0.33425235748291016, "learning_rate": 0.00020363636363636363, "loss": 0.167, "step": 270 }, { "epoch": 0.12032085561497326, "eval_accuracy": 0.89683598279953, "eval_loss": 0.1814015805721283, "eval_runtime": 531.0237, "eval_samples_per_second": 8.452, "eval_steps_per_second": 2.113, "step": 270 }, { "epoch": 0.12477718360071301, "grad_norm": 1.2170227766036987, "learning_rate": 0.00019636363636363636, "loss": 0.1849, "step": 280 }, { "epoch": 0.12477718360071301, "eval_accuracy": 0.9012923240661621, "eval_loss": 0.21576102077960968, "eval_runtime": 530.0384, "eval_samples_per_second": 8.467, "eval_steps_per_second": 2.117, "step": 280 }, { "epoch": 0.12923351158645277, "grad_norm": 0.8702998161315918, "learning_rate": 0.0001890909090909091, "loss": 0.2305, "step": 290 }, { "epoch": 0.12923351158645277, "eval_accuracy": 0.9001782536506653, "eval_loss": 0.16198058426380157, "eval_runtime": 536.0043, "eval_samples_per_second": 8.373, "eval_steps_per_second": 2.093, "step": 290 }, { "epoch": 0.13368983957219252, "grad_norm": 2.6873793601989746, "learning_rate": 0.00018181818181818183, "loss": 0.2787, "step": 300 }, { "epoch": 0.13368983957219252, "eval_accuracy": 0.8970588445663452, "eval_loss": 0.4286949932575226, "eval_runtime": 533.2723, "eval_samples_per_second": 8.416, "eval_steps_per_second": 2.104, "step": 300 }, { "epoch": 0.13814616755793227, "grad_norm": 2.6352877616882324, "learning_rate": 0.00017454545454545454, "loss": 0.4769, "step": 310 }, { "epoch": 0.13814616755793227, "eval_accuracy": 0.89683598279953, "eval_loss": 0.2692972421646118, "eval_runtime": 537.214, "eval_samples_per_second": 8.354, "eval_steps_per_second": 2.089, "step": 310 }, { "epoch": 0.14260249554367202, "grad_norm": 7.490699291229248, "learning_rate": 0.00016727272727272728, "loss": 0.2615, "step": 320 }, { "epoch": 0.14260249554367202, "eval_accuracy": 0.8977272510528564, "eval_loss": 0.17833727598190308, "eval_runtime": 530.3846, "eval_samples_per_second": 8.462, "eval_steps_per_second": 2.115, "step": 320 }, { "epoch": 0.14705882352941177, "grad_norm": 0.2769690454006195, "learning_rate": 0.00016, "loss": 0.1976, "step": 330 }, { "epoch": 0.14705882352941177, "eval_accuracy": 0.897504448890686, "eval_loss": 0.1791936755180359, "eval_runtime": 529.6531, "eval_samples_per_second": 8.473, "eval_steps_per_second": 2.118, "step": 330 }, { "epoch": 0.15151515151515152, "grad_norm": 0.1449800729751587, "learning_rate": 0.00015272727272727275, "loss": 0.1132, "step": 340 }, { "epoch": 0.15151515151515152, "eval_accuracy": 0.8979501128196716, "eval_loss": 0.17950215935707092, "eval_runtime": 530.325, "eval_samples_per_second": 8.463, "eval_steps_per_second": 2.116, "step": 340 }, { "epoch": 0.15597147950089127, "grad_norm": 1.8357865810394287, "learning_rate": 0.00014545454545454546, "loss": 0.2649, "step": 350 }, { "epoch": 0.15597147950089127, "eval_accuracy": 0.8990641832351685, "eval_loss": 0.17018087208271027, "eval_runtime": 528.0517, "eval_samples_per_second": 8.499, "eval_steps_per_second": 2.125, "step": 350 }, { "epoch": 0.16042780748663102, "grad_norm": 1.703097939491272, "learning_rate": 0.0001381818181818182, "loss": 0.1975, "step": 360 }, { "epoch": 0.16042780748663102, "eval_accuracy": 0.8992869853973389, "eval_loss": 0.18035584688186646, "eval_runtime": 539.3037, "eval_samples_per_second": 8.322, "eval_steps_per_second": 2.08, "step": 360 }, { "epoch": 0.16488413547237077, "grad_norm": 1.1701892614364624, "learning_rate": 0.00013090909090909093, "loss": 0.143, "step": 370 }, { "epoch": 0.16488413547237077, "eval_accuracy": 0.897504448890686, "eval_loss": 0.1814272254705429, "eval_runtime": 531.4635, "eval_samples_per_second": 8.445, "eval_steps_per_second": 2.111, "step": 370 }, { "epoch": 0.16934046345811052, "grad_norm": 0.24161402881145477, "learning_rate": 0.00012363636363636364, "loss": 0.1982, "step": 380 }, { "epoch": 0.16934046345811052, "eval_accuracy": 0.89683598279953, "eval_loss": 0.17482873797416687, "eval_runtime": 532.8609, "eval_samples_per_second": 8.422, "eval_steps_per_second": 2.106, "step": 380 }, { "epoch": 0.17379679144385027, "grad_norm": 0.49488192796707153, "learning_rate": 0.00011636363636363636, "loss": 0.1935, "step": 390 }, { "epoch": 0.17379679144385027, "eval_accuracy": 0.89683598279953, "eval_loss": 0.18158306181430817, "eval_runtime": 531.1778, "eval_samples_per_second": 8.449, "eval_steps_per_second": 2.112, "step": 390 }, { "epoch": 0.17825311942959002, "grad_norm": 1.5082249641418457, "learning_rate": 0.00010909090909090909, "loss": 0.1893, "step": 400 }, { "epoch": 0.17825311942959002, "eval_accuracy": 0.8977272510528564, "eval_loss": 0.16474968194961548, "eval_runtime": 531.3613, "eval_samples_per_second": 8.446, "eval_steps_per_second": 2.112, "step": 400 }, { "epoch": 0.18270944741532977, "grad_norm": 0.7104145288467407, "learning_rate": 0.00010181818181818181, "loss": 0.1793, "step": 410 }, { "epoch": 0.18270944741532977, "eval_accuracy": 0.9039661288261414, "eval_loss": 0.16507278382778168, "eval_runtime": 532.5438, "eval_samples_per_second": 8.427, "eval_steps_per_second": 2.107, "step": 410 }, { "epoch": 0.18716577540106952, "grad_norm": 2.474039316177368, "learning_rate": 9.454545454545455e-05, "loss": 0.1832, "step": 420 }, { "epoch": 0.18716577540106952, "eval_accuracy": 0.9053030014038086, "eval_loss": 0.15766561031341553, "eval_runtime": 530.4857, "eval_samples_per_second": 8.46, "eval_steps_per_second": 2.115, "step": 420 }, { "epoch": 0.19162210338680927, "grad_norm": 0.4223766028881073, "learning_rate": 8.727272727272727e-05, "loss": 0.1833, "step": 430 }, { "epoch": 0.19162210338680927, "eval_accuracy": 0.9099822044372559, "eval_loss": 0.16656117141246796, "eval_runtime": 538.6394, "eval_samples_per_second": 8.332, "eval_steps_per_second": 2.083, "step": 430 }, { "epoch": 0.19607843137254902, "grad_norm": 1.0373200178146362, "learning_rate": 8e-05, "loss": 0.1436, "step": 440 }, { "epoch": 0.19607843137254902, "eval_accuracy": 0.9119875431060791, "eval_loss": 0.1528429239988327, "eval_runtime": 536.2551, "eval_samples_per_second": 8.369, "eval_steps_per_second": 2.092, "step": 440 }, { "epoch": 0.20053475935828877, "grad_norm": 2.6555793285369873, "learning_rate": 7.272727272727273e-05, "loss": 0.1856, "step": 450 }, { "epoch": 0.20053475935828877, "eval_accuracy": 0.9115418791770935, "eval_loss": 0.14956268668174744, "eval_runtime": 534.8893, "eval_samples_per_second": 8.391, "eval_steps_per_second": 2.098, "step": 450 }, { "epoch": 0.20499108734402852, "grad_norm": 0.36912649869918823, "learning_rate": 6.545454545454546e-05, "loss": 0.1281, "step": 460 }, { "epoch": 0.20499108734402852, "eval_accuracy": 0.9144384860992432, "eval_loss": 0.14639925956726074, "eval_runtime": 534.7579, "eval_samples_per_second": 8.393, "eval_steps_per_second": 2.098, "step": 460 }, { "epoch": 0.20944741532976827, "grad_norm": 0.9557098150253296, "learning_rate": 5.818181818181818e-05, "loss": 0.1894, "step": 470 }, { "epoch": 0.20944741532976827, "eval_accuracy": 0.9215686321258545, "eval_loss": 0.1419779509305954, "eval_runtime": 535.2427, "eval_samples_per_second": 8.385, "eval_steps_per_second": 2.096, "step": 470 }, { "epoch": 0.21390374331550802, "grad_norm": 0.6485058069229126, "learning_rate": 5.090909090909091e-05, "loss": 0.222, "step": 480 }, { "epoch": 0.21390374331550802, "eval_accuracy": 0.9057486653327942, "eval_loss": 0.16202405095100403, "eval_runtime": 539.6942, "eval_samples_per_second": 8.316, "eval_steps_per_second": 2.079, "step": 480 }, { "epoch": 0.21836007130124777, "grad_norm": 0.5341868996620178, "learning_rate": 4.3636363636363636e-05, "loss": 0.129, "step": 490 }, { "epoch": 0.21836007130124777, "eval_accuracy": 0.9226827025413513, "eval_loss": 0.1420886367559433, "eval_runtime": 539.6845, "eval_samples_per_second": 8.316, "eval_steps_per_second": 2.079, "step": 490 }, { "epoch": 0.22281639928698752, "grad_norm": 0.11955850571393967, "learning_rate": 3.6363636363636364e-05, "loss": 0.0887, "step": 500 }, { "epoch": 0.22281639928698752, "eval_accuracy": 0.9318181872367859, "eval_loss": 0.13754913210868835, "eval_runtime": 539.9321, "eval_samples_per_second": 8.312, "eval_steps_per_second": 2.078, "step": 500 }, { "epoch": 0.22727272727272727, "grad_norm": 0.711179256439209, "learning_rate": 2.909090909090909e-05, "loss": 0.0965, "step": 510 }, { "epoch": 0.22727272727272727, "eval_accuracy": 0.9202316999435425, "eval_loss": 0.16527850925922394, "eval_runtime": 536.1841, "eval_samples_per_second": 8.37, "eval_steps_per_second": 2.093, "step": 510 }, { "epoch": 0.23172905525846701, "grad_norm": 0.823353111743927, "learning_rate": 2.1818181818181818e-05, "loss": 0.2106, "step": 520 }, { "epoch": 0.23172905525846701, "eval_accuracy": 0.9280303120613098, "eval_loss": 0.14900797605514526, "eval_runtime": 535.1968, "eval_samples_per_second": 8.386, "eval_steps_per_second": 2.096, "step": 520 }, { "epoch": 0.23618538324420676, "grad_norm": 3.12156343460083, "learning_rate": 1.4545454545454545e-05, "loss": 0.2153, "step": 530 }, { "epoch": 0.23618538324420676, "eval_accuracy": 0.9347147941589355, "eval_loss": 0.13268369436264038, "eval_runtime": 533.2781, "eval_samples_per_second": 8.416, "eval_steps_per_second": 2.104, "step": 530 }, { "epoch": 0.24064171122994651, "grad_norm": 5.130224227905273, "learning_rate": 7.272727272727272e-06, "loss": 0.157, "step": 540 }, { "epoch": 0.24064171122994651, "eval_accuracy": 0.9318181872367859, "eval_loss": 0.13205085694789886, "eval_runtime": 537.2184, "eval_samples_per_second": 8.354, "eval_steps_per_second": 2.089, "step": 540 }, { "epoch": 0.24509803921568626, "grad_norm": 0.49416056275367737, "learning_rate": 0.0, "loss": 0.1583, "step": 550 }, { "epoch": 0.24509803921568626, "eval_accuracy": 0.9304812550544739, "eval_loss": 0.13242210447788239, "eval_runtime": 549.6193, "eval_samples_per_second": 8.166, "eval_steps_per_second": 2.041, "step": 550 } ], "logging_steps": 10, "max_steps": 550, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.326400520422712e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }