{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.24064171122994651, "eval_steps": 10, "global_step": 540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004456327985739751, "grad_norm": 1.5512940883636475, "learning_rate": 9.818181818181818e-05, "loss": 0.4724, "step": 10 }, { "epoch": 0.004456327985739751, "eval_accuracy": 0.89683598279953, "eval_loss": 0.34007999300956726, "eval_runtime": 549.9948, "eval_samples_per_second": 8.16, "eval_steps_per_second": 2.04, "step": 10 }, { "epoch": 0.008912655971479501, "grad_norm": 1.4623041152954102, "learning_rate": 9.636363636363637e-05, "loss": 0.2715, "step": 20 }, { "epoch": 0.008912655971479501, "eval_accuracy": 0.89683598279953, "eval_loss": 0.39523470401763916, "eval_runtime": 543.4137, "eval_samples_per_second": 8.259, "eval_steps_per_second": 2.065, "step": 20 }, { "epoch": 0.013368983957219251, "grad_norm": 0.6407843828201294, "learning_rate": 9.454545454545455e-05, "loss": 0.3566, "step": 30 }, { "epoch": 0.013368983957219251, "eval_accuracy": 0.89683598279953, "eval_loss": 0.38253363966941833, "eval_runtime": 536.8913, "eval_samples_per_second": 8.359, "eval_steps_per_second": 2.09, "step": 30 }, { "epoch": 0.017825311942959002, "grad_norm": 1.9206137657165527, "learning_rate": 9.272727272727273e-05, "loss": 0.2892, "step": 40 }, { "epoch": 0.017825311942959002, "eval_accuracy": 0.89683598279953, "eval_loss": 0.36841902136802673, "eval_runtime": 539.4459, "eval_samples_per_second": 8.32, "eval_steps_per_second": 2.08, "step": 40 }, { "epoch": 0.022281639928698752, "grad_norm": 1.0926023721694946, "learning_rate": 9.090909090909092e-05, "loss": 0.3157, "step": 50 }, { "epoch": 0.022281639928698752, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3572486639022827, "eval_runtime": 542.4121, "eval_samples_per_second": 8.274, "eval_steps_per_second": 2.069, "step": 50 }, { "epoch": 0.026737967914438502, "grad_norm": 3.161236524581909, "learning_rate": 8.90909090909091e-05, "loss": 0.3792, "step": 60 }, { "epoch": 0.026737967914438502, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3476051688194275, "eval_runtime": 538.7398, "eval_samples_per_second": 8.331, "eval_steps_per_second": 2.083, "step": 60 }, { "epoch": 0.031194295900178252, "grad_norm": 1.0513850450515747, "learning_rate": 8.727272727272727e-05, "loss": 0.3938, "step": 70 }, { "epoch": 0.031194295900178252, "eval_accuracy": 0.89683598279953, "eval_loss": 0.317058265209198, "eval_runtime": 540.0617, "eval_samples_per_second": 8.31, "eval_steps_per_second": 2.078, "step": 70 }, { "epoch": 0.035650623885918005, "grad_norm": 5.6621479988098145, "learning_rate": 8.545454545454545e-05, "loss": 0.3962, "step": 80 }, { "epoch": 0.035650623885918005, "eval_accuracy": 0.89683598279953, "eval_loss": 0.2908115088939667, "eval_runtime": 541.5124, "eval_samples_per_second": 8.288, "eval_steps_per_second": 2.072, "step": 80 }, { "epoch": 0.040106951871657755, "grad_norm": 11.542706489562988, "learning_rate": 8.363636363636364e-05, "loss": 0.3536, "step": 90 }, { "epoch": 0.040106951871657755, "eval_accuracy": 0.89683598279953, "eval_loss": 0.2727506160736084, "eval_runtime": 542.0266, "eval_samples_per_second": 8.28, "eval_steps_per_second": 2.07, "step": 90 }, { "epoch": 0.044563279857397504, "grad_norm": 0.7968679070472717, "learning_rate": 8.181818181818183e-05, "loss": 0.2338, "step": 100 }, { "epoch": 0.044563279857397504, "eval_accuracy": 0.898172914981842, "eval_loss": 0.21359723806381226, "eval_runtime": 550.439, "eval_samples_per_second": 8.153, "eval_steps_per_second": 2.038, "step": 100 }, { "epoch": 0.049019607843137254, "grad_norm": 13.235169410705566, "learning_rate": 8e-05, "loss": 0.2591, "step": 110 }, { "epoch": 0.049019607843137254, "eval_accuracy": 0.9035205245018005, "eval_loss": 0.17823095619678497, "eval_runtime": 543.3444, "eval_samples_per_second": 8.26, "eval_steps_per_second": 2.065, "step": 110 }, { "epoch": 0.053475935828877004, "grad_norm": 3.1415486335754395, "learning_rate": 7.818181818181818e-05, "loss": 0.261, "step": 120 }, { "epoch": 0.053475935828877004, "eval_accuracy": 0.8725489974021912, "eval_loss": 0.23705999553203583, "eval_runtime": 550.4034, "eval_samples_per_second": 8.154, "eval_steps_per_second": 2.039, "step": 120 }, { "epoch": 0.057932263814616754, "grad_norm": 0.12994514405727386, "learning_rate": 7.636363636363637e-05, "loss": 0.2626, "step": 130 }, { "epoch": 0.057932263814616754, "eval_accuracy": 0.89683598279953, "eval_loss": 0.4889169931411743, "eval_runtime": 545.9677, "eval_samples_per_second": 8.22, "eval_steps_per_second": 2.055, "step": 130 }, { "epoch": 0.062388591800356503, "grad_norm": 0.20874539017677307, "learning_rate": 7.454545454545455e-05, "loss": 0.3156, "step": 140 }, { "epoch": 0.062388591800356503, "eval_accuracy": 0.9021835923194885, "eval_loss": 0.21060487627983093, "eval_runtime": 543.8063, "eval_samples_per_second": 8.253, "eval_steps_per_second": 2.063, "step": 140 }, { "epoch": 0.06684491978609626, "grad_norm": 1.1314120292663574, "learning_rate": 7.272727272727273e-05, "loss": 0.3342, "step": 150 }, { "epoch": 0.06684491978609626, "eval_accuracy": 0.9217914342880249, "eval_loss": 0.19053570926189423, "eval_runtime": 543.9184, "eval_samples_per_second": 8.251, "eval_steps_per_second": 2.063, "step": 150 }, { "epoch": 0.07130124777183601, "grad_norm": 5.4050092697143555, "learning_rate": 7.090909090909092e-05, "loss": 0.2658, "step": 160 }, { "epoch": 0.07130124777183601, "eval_accuracy": 0.9498662948608398, "eval_loss": 0.12423694878816605, "eval_runtime": 542.375, "eval_samples_per_second": 8.275, "eval_steps_per_second": 2.069, "step": 160 }, { "epoch": 0.07575757575757576, "grad_norm": 6.958705425262451, "learning_rate": 6.90909090909091e-05, "loss": 0.2162, "step": 170 }, { "epoch": 0.07575757575757576, "eval_accuracy": 0.9211229681968689, "eval_loss": 0.1585531085729599, "eval_runtime": 550.3192, "eval_samples_per_second": 8.155, "eval_steps_per_second": 2.039, "step": 170 }, { "epoch": 0.08021390374331551, "grad_norm": 6.347085475921631, "learning_rate": 6.727272727272727e-05, "loss": 0.2457, "step": 180 }, { "epoch": 0.08021390374331551, "eval_accuracy": 0.8422459959983826, "eval_loss": 0.31698858737945557, "eval_runtime": 550.4639, "eval_samples_per_second": 8.153, "eval_steps_per_second": 2.038, "step": 180 }, { "epoch": 0.08467023172905526, "grad_norm": 0.05648183450102806, "learning_rate": 6.545454545454546e-05, "loss": 0.178, "step": 190 }, { "epoch": 0.08467023172905526, "eval_accuracy": 0.9663547277450562, "eval_loss": 0.08927226811647415, "eval_runtime": 548.6704, "eval_samples_per_second": 8.18, "eval_steps_per_second": 2.045, "step": 190 }, { "epoch": 0.08912655971479501, "grad_norm": 0.10795829445123672, "learning_rate": 6.363636363636364e-05, "loss": 0.2864, "step": 200 }, { "epoch": 0.08912655971479501, "eval_accuracy": 0.9079768061637878, "eval_loss": 0.1990886628627777, "eval_runtime": 547.327, "eval_samples_per_second": 8.2, "eval_steps_per_second": 2.05, "step": 200 }, { "epoch": 0.09358288770053476, "grad_norm": 0.10935252159833908, "learning_rate": 6.181818181818182e-05, "loss": 0.0852, "step": 210 }, { "epoch": 0.09358288770053476, "eval_accuracy": 0.9777183532714844, "eval_loss": 0.06789236515760422, "eval_runtime": 546.4384, "eval_samples_per_second": 8.213, "eval_steps_per_second": 2.053, "step": 210 }, { "epoch": 0.09803921568627451, "grad_norm": 0.724420428276062, "learning_rate": 6e-05, "loss": 0.2227, "step": 220 }, { "epoch": 0.09803921568627451, "eval_accuracy": 0.9676916003227234, "eval_loss": 0.08267948776483536, "eval_runtime": 543.5229, "eval_samples_per_second": 8.257, "eval_steps_per_second": 2.064, "step": 220 }, { "epoch": 0.10249554367201426, "grad_norm": 3.5009875297546387, "learning_rate": 5.818181818181818e-05, "loss": 0.0894, "step": 230 }, { "epoch": 0.10249554367201426, "eval_accuracy": 0.9799465537071228, "eval_loss": 0.05659499019384384, "eval_runtime": 547.0317, "eval_samples_per_second": 8.204, "eval_steps_per_second": 2.051, "step": 230 }, { "epoch": 0.10695187165775401, "grad_norm": 13.977231979370117, "learning_rate": 5.636363636363636e-05, "loss": 0.1766, "step": 240 }, { "epoch": 0.10695187165775401, "eval_accuracy": 0.9275846481323242, "eval_loss": 0.17180828750133514, "eval_runtime": 554.5064, "eval_samples_per_second": 8.094, "eval_steps_per_second": 2.023, "step": 240 }, { "epoch": 0.11140819964349376, "grad_norm": 1.5852386951446533, "learning_rate": 5.4545454545454546e-05, "loss": 0.1133, "step": 250 }, { "epoch": 0.11140819964349376, "eval_accuracy": 0.9625668525695801, "eval_loss": 0.09958070516586304, "eval_runtime": 546.2195, "eval_samples_per_second": 8.216, "eval_steps_per_second": 2.054, "step": 250 }, { "epoch": 0.11586452762923351, "grad_norm": 4.984052658081055, "learning_rate": 5.272727272727272e-05, "loss": 0.1581, "step": 260 }, { "epoch": 0.11586452762923351, "eval_accuracy": 0.9616755843162537, "eval_loss": 0.09552862495183945, "eval_runtime": 542.5097, "eval_samples_per_second": 8.273, "eval_steps_per_second": 2.068, "step": 260 }, { "epoch": 0.12032085561497326, "grad_norm": 0.06418484449386597, "learning_rate": 5.090909090909091e-05, "loss": 0.1164, "step": 270 }, { "epoch": 0.12032085561497326, "eval_accuracy": 0.9962121248245239, "eval_loss": 0.017139658331871033, "eval_runtime": 545.1464, "eval_samples_per_second": 8.233, "eval_steps_per_second": 2.058, "step": 270 }, { "epoch": 0.12477718360071301, "grad_norm": 1.2460925579071045, "learning_rate": 4.909090909090909e-05, "loss": 0.0199, "step": 280 }, { "epoch": 0.12477718360071301, "eval_accuracy": 0.9884135723114014, "eval_loss": 0.04466737061738968, "eval_runtime": 545.2406, "eval_samples_per_second": 8.231, "eval_steps_per_second": 2.058, "step": 280 }, { "epoch": 0.12923351158645277, "grad_norm": 0.021525170654058456, "learning_rate": 4.7272727272727275e-05, "loss": 0.0358, "step": 290 }, { "epoch": 0.12923351158645277, "eval_accuracy": 0.9625668525695801, "eval_loss": 0.08889108896255493, "eval_runtime": 551.6453, "eval_samples_per_second": 8.136, "eval_steps_per_second": 2.034, "step": 290 }, { "epoch": 0.13368983957219252, "grad_norm": 10.565823554992676, "learning_rate": 4.545454545454546e-05, "loss": 0.0134, "step": 300 }, { "epoch": 0.13368983957219252, "eval_accuracy": 0.9607843160629272, "eval_loss": 0.11075662821531296, "eval_runtime": 546.8222, "eval_samples_per_second": 8.207, "eval_steps_per_second": 2.052, "step": 300 }, { "epoch": 0.13814616755793227, "grad_norm": 2.769604444503784, "learning_rate": 4.3636363636363636e-05, "loss": 0.0085, "step": 310 }, { "epoch": 0.13814616755793227, "eval_accuracy": 0.9204545617103577, "eval_loss": 0.24255433678627014, "eval_runtime": 551.4065, "eval_samples_per_second": 8.139, "eval_steps_per_second": 2.035, "step": 310 }, { "epoch": 0.14260249554367202, "grad_norm": 0.027551617473363876, "learning_rate": 4.181818181818182e-05, "loss": 0.0691, "step": 320 }, { "epoch": 0.14260249554367202, "eval_accuracy": 0.9496434926986694, "eval_loss": 0.15619446337223053, "eval_runtime": 545.5513, "eval_samples_per_second": 8.227, "eval_steps_per_second": 2.057, "step": 320 }, { "epoch": 0.14705882352941177, "grad_norm": 0.010140771977603436, "learning_rate": 4e-05, "loss": 0.2242, "step": 330 }, { "epoch": 0.14705882352941177, "eval_accuracy": 0.9968805909156799, "eval_loss": 0.01314464956521988, "eval_runtime": 545.9537, "eval_samples_per_second": 8.22, "eval_steps_per_second": 2.055, "step": 330 }, { "epoch": 0.15151515151515152, "grad_norm": 0.011543634347617626, "learning_rate": 3.818181818181819e-05, "loss": 0.1593, "step": 340 }, { "epoch": 0.15151515151515152, "eval_accuracy": 0.9890819787979126, "eval_loss": 0.03110310062766075, "eval_runtime": 547.8599, "eval_samples_per_second": 8.192, "eval_steps_per_second": 2.048, "step": 340 }, { "epoch": 0.15597147950089127, "grad_norm": 0.06019105017185211, "learning_rate": 3.6363636363636364e-05, "loss": 0.0065, "step": 350 }, { "epoch": 0.15597147950089127, "eval_accuracy": 0.9643493890762329, "eval_loss": 0.11227227747440338, "eval_runtime": 546.9694, "eval_samples_per_second": 8.205, "eval_steps_per_second": 2.051, "step": 350 }, { "epoch": 0.16042780748663102, "grad_norm": 0.028353577479720116, "learning_rate": 3.454545454545455e-05, "loss": 0.0626, "step": 360 }, { "epoch": 0.16042780748663102, "eval_accuracy": 0.9817290306091309, "eval_loss": 0.061965711414813995, "eval_runtime": 552.3776, "eval_samples_per_second": 8.125, "eval_steps_per_second": 2.031, "step": 360 }, { "epoch": 0.16488413547237077, "grad_norm": 0.4727762043476105, "learning_rate": 3.272727272727273e-05, "loss": 0.0281, "step": 370 }, { "epoch": 0.16488413547237077, "eval_accuracy": 0.9596702456474304, "eval_loss": 0.12898869812488556, "eval_runtime": 546.7154, "eval_samples_per_second": 8.209, "eval_steps_per_second": 2.052, "step": 370 }, { "epoch": 0.16934046345811052, "grad_norm": 0.013356081210076809, "learning_rate": 3.090909090909091e-05, "loss": 0.0189, "step": 380 }, { "epoch": 0.16934046345811052, "eval_accuracy": 0.991310179233551, "eval_loss": 0.02557438611984253, "eval_runtime": 546.8803, "eval_samples_per_second": 8.207, "eval_steps_per_second": 2.052, "step": 380 }, { "epoch": 0.17379679144385027, "grad_norm": 0.056645121425390244, "learning_rate": 2.909090909090909e-05, "loss": 0.1307, "step": 390 }, { "epoch": 0.17379679144385027, "eval_accuracy": 0.9783868193626404, "eval_loss": 0.07275046408176422, "eval_runtime": 548.2067, "eval_samples_per_second": 8.187, "eval_steps_per_second": 2.047, "step": 390 }, { "epoch": 0.17825311942959002, "grad_norm": 0.0406530387699604, "learning_rate": 2.7272727272727273e-05, "loss": 0.0061, "step": 400 }, { "epoch": 0.17825311942959002, "eval_accuracy": 0.9471924901008606, "eval_loss": 0.17399340867996216, "eval_runtime": 542.8312, "eval_samples_per_second": 8.268, "eval_steps_per_second": 2.067, "step": 400 }, { "epoch": 0.18270944741532977, "grad_norm": 0.07212503999471664, "learning_rate": 2.5454545454545454e-05, "loss": 0.0739, "step": 410 }, { "epoch": 0.18270944741532977, "eval_accuracy": 0.9500890970230103, "eval_loss": 0.16757053136825562, "eval_runtime": 542.9814, "eval_samples_per_second": 8.265, "eval_steps_per_second": 2.066, "step": 410 }, { "epoch": 0.18716577540106952, "grad_norm": 0.10422785580158234, "learning_rate": 2.3636363636363637e-05, "loss": 0.0028, "step": 420 }, { "epoch": 0.18716577540106952, "eval_accuracy": 0.9783868193626404, "eval_loss": 0.07298260927200317, "eval_runtime": 539.9794, "eval_samples_per_second": 8.311, "eval_steps_per_second": 2.078, "step": 420 }, { "epoch": 0.19162210338680927, "grad_norm": 0.007547458633780479, "learning_rate": 2.1818181818181818e-05, "loss": 0.0011, "step": 430 }, { "epoch": 0.19162210338680927, "eval_accuracy": 0.977495551109314, "eval_loss": 0.07658497989177704, "eval_runtime": 549.3337, "eval_samples_per_second": 8.17, "eval_steps_per_second": 2.042, "step": 430 }, { "epoch": 0.19607843137254902, "grad_norm": 0.007210019510239363, "learning_rate": 2e-05, "loss": 0.0019, "step": 440 }, { "epoch": 0.19607843137254902, "eval_accuracy": 0.9636809229850769, "eval_loss": 0.11826927214860916, "eval_runtime": 543.2619, "eval_samples_per_second": 8.261, "eval_steps_per_second": 2.065, "step": 440 }, { "epoch": 0.20053475935828877, "grad_norm": 0.013209226541221142, "learning_rate": 1.8181818181818182e-05, "loss": 0.0388, "step": 450 }, { "epoch": 0.20053475935828877, "eval_accuracy": 0.9576649069786072, "eval_loss": 0.14545659720897675, "eval_runtime": 544.4489, "eval_samples_per_second": 8.243, "eval_steps_per_second": 2.061, "step": 450 }, { "epoch": 0.20499108734402852, "grad_norm": 0.007217989303171635, "learning_rate": 1.6363636363636366e-05, "loss": 0.0041, "step": 460 }, { "epoch": 0.20499108734402852, "eval_accuracy": 0.9523172974586487, "eval_loss": 0.16972462832927704, "eval_runtime": 545.2894, "eval_samples_per_second": 8.23, "eval_steps_per_second": 2.058, "step": 460 }, { "epoch": 0.20944741532976827, "grad_norm": 0.8174325227737427, "learning_rate": 1.4545454545454545e-05, "loss": 0.0064, "step": 470 }, { "epoch": 0.20944741532976827, "eval_accuracy": 0.9509803652763367, "eval_loss": 0.17997007071971893, "eval_runtime": 543.7245, "eval_samples_per_second": 8.254, "eval_steps_per_second": 2.064, "step": 470 }, { "epoch": 0.21390374331550802, "grad_norm": 0.032331835478544235, "learning_rate": 1.2727272727272727e-05, "loss": 0.0008, "step": 480 }, { "epoch": 0.21390374331550802, "eval_accuracy": 0.9610071182250977, "eval_loss": 0.1397457718849182, "eval_runtime": 546.6811, "eval_samples_per_second": 8.21, "eval_steps_per_second": 2.052, "step": 480 }, { "epoch": 0.21836007130124777, "grad_norm": 6.01271915435791, "learning_rate": 1.0909090909090909e-05, "loss": 0.0688, "step": 490 }, { "epoch": 0.21836007130124777, "eval_accuracy": 0.9694741368293762, "eval_loss": 0.10627125203609467, "eval_runtime": 544.0334, "eval_samples_per_second": 8.249, "eval_steps_per_second": 2.062, "step": 490 }, { "epoch": 0.22281639928698752, "grad_norm": 0.004174210596829653, "learning_rate": 9.090909090909091e-06, "loss": 0.0008, "step": 500 }, { "epoch": 0.22281639928698752, "eval_accuracy": 0.9844028353691101, "eval_loss": 0.05713631212711334, "eval_runtime": 551.9061, "eval_samples_per_second": 8.132, "eval_steps_per_second": 2.033, "step": 500 }, { "epoch": 0.22727272727272727, "grad_norm": 0.006048465613275766, "learning_rate": 7.272727272727272e-06, "loss": 0.0018, "step": 510 }, { "epoch": 0.22727272727272727, "eval_accuracy": 0.9870766401290894, "eval_loss": 0.048317644745111465, "eval_runtime": 541.5344, "eval_samples_per_second": 8.288, "eval_steps_per_second": 2.072, "step": 510 }, { "epoch": 0.23172905525846701, "grad_norm": 13.882183074951172, "learning_rate": 5.4545454545454545e-06, "loss": 0.1531, "step": 520 }, { "epoch": 0.23172905525846701, "eval_accuracy": 0.9870766401290894, "eval_loss": 0.04793470725417137, "eval_runtime": 540.1145, "eval_samples_per_second": 8.309, "eval_steps_per_second": 2.077, "step": 520 }, { "epoch": 0.23618538324420676, "grad_norm": 0.21645870804786682, "learning_rate": 3.636363636363636e-06, "loss": 0.0027, "step": 530 }, { "epoch": 0.23618538324420676, "eval_accuracy": 0.9848484992980957, "eval_loss": 0.055766720324754715, "eval_runtime": 545.2691, "eval_samples_per_second": 8.231, "eval_steps_per_second": 2.058, "step": 530 }, { "epoch": 0.24064171122994651, "grad_norm": 0.27186936140060425, "learning_rate": 1.818181818181818e-06, "loss": 0.001, "step": 540 }, { "epoch": 0.24064171122994651, "eval_accuracy": 0.9844028353691101, "eval_loss": 0.059176359325647354, "eval_runtime": 537.6305, "eval_samples_per_second": 8.348, "eval_steps_per_second": 2.087, "step": 540 } ], "logging_steps": 10, "max_steps": 550, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.207949973692707e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }