{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.24509803921568626, "eval_steps": 10, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004456327985739751, "grad_norm": 0.7227747440338135, "learning_rate": 0.00019636363636363636, "loss": 0.364, "step": 10 }, { "epoch": 0.004456327985739751, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3638981878757477, "eval_runtime": 537.6403, "eval_samples_per_second": 8.348, "eval_steps_per_second": 2.087, "step": 10 }, { "epoch": 0.008912655971479501, "grad_norm": 0.0924869254231453, "learning_rate": 0.00019272727272727274, "loss": 0.3247, "step": 20 }, { "epoch": 0.008912655971479501, "eval_accuracy": 0.89683598279953, "eval_loss": 0.5152587890625, "eval_runtime": 533.0372, "eval_samples_per_second": 8.42, "eval_steps_per_second": 2.105, "step": 20 }, { "epoch": 0.013368983957219251, "grad_norm": 0.23558691143989563, "learning_rate": 0.0001890909090909091, "loss": 0.4074, "step": 30 }, { "epoch": 0.013368983957219251, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3966202735900879, "eval_runtime": 532.3437, "eval_samples_per_second": 8.431, "eval_steps_per_second": 2.108, "step": 30 }, { "epoch": 0.017825311942959002, "grad_norm": 0.7453870177268982, "learning_rate": 0.00018545454545454545, "loss": 0.2989, "step": 40 }, { "epoch": 0.017825311942959002, "eval_accuracy": 0.89683598279953, "eval_loss": 0.33440741896629333, "eval_runtime": 532.0041, "eval_samples_per_second": 8.436, "eval_steps_per_second": 2.109, "step": 40 }, { "epoch": 0.022281639928698752, "grad_norm": 6.811055660247803, "learning_rate": 0.00018181818181818183, "loss": 0.3399, "step": 50 }, { "epoch": 0.022281639928698752, "eval_accuracy": 0.89683598279953, "eval_loss": 0.32954567670822144, "eval_runtime": 536.9612, "eval_samples_per_second": 8.358, "eval_steps_per_second": 2.09, "step": 50 }, { "epoch": 0.026737967914438502, "grad_norm": 1.3061631917953491, "learning_rate": 0.0001781818181818182, "loss": 0.3437, "step": 60 }, { "epoch": 0.026737967914438502, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3265528976917267, "eval_runtime": 533.3224, "eval_samples_per_second": 8.415, "eval_steps_per_second": 2.104, "step": 60 }, { "epoch": 0.031194295900178252, "grad_norm": 1.7616266012191772, "learning_rate": 0.00017454545454545454, "loss": 0.3749, "step": 70 }, { "epoch": 0.031194295900178252, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3122697174549103, "eval_runtime": 530.7411, "eval_samples_per_second": 8.456, "eval_steps_per_second": 2.114, "step": 70 }, { "epoch": 0.035650623885918005, "grad_norm": 1.1275932788848877, "learning_rate": 0.0001709090909090909, "loss": 0.4267, "step": 80 }, { "epoch": 0.035650623885918005, "eval_accuracy": 0.89683598279953, "eval_loss": 0.23432780802249908, "eval_runtime": 533.9058, "eval_samples_per_second": 8.406, "eval_steps_per_second": 2.101, "step": 80 }, { "epoch": 0.040106951871657755, "grad_norm": 1.387789249420166, "learning_rate": 0.00016727272727272728, "loss": 0.4221, "step": 90 }, { "epoch": 0.040106951871657755, "eval_accuracy": 0.89683598279953, "eval_loss": 0.21733854711055756, "eval_runtime": 533.5436, "eval_samples_per_second": 8.412, "eval_steps_per_second": 2.103, "step": 90 }, { "epoch": 0.044563279857397504, "grad_norm": 9.95677375793457, "learning_rate": 0.00016363636363636366, "loss": 0.2292, "step": 100 }, { "epoch": 0.044563279857397504, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3393175005912781, "eval_runtime": 537.6972, "eval_samples_per_second": 8.347, "eval_steps_per_second": 2.087, "step": 100 }, { "epoch": 0.049019607843137254, "grad_norm": 1.0864927768707275, "learning_rate": 0.00016, "loss": 0.4339, "step": 110 }, { "epoch": 0.049019607843137254, "eval_accuracy": 0.89683598279953, "eval_loss": 0.28164002299308777, "eval_runtime": 531.492, "eval_samples_per_second": 8.444, "eval_steps_per_second": 2.111, "step": 110 }, { "epoch": 0.053475935828877004, "grad_norm": 1.7070204019546509, "learning_rate": 0.00015636363636363637, "loss": 0.4143, "step": 120 }, { "epoch": 0.053475935828877004, "eval_accuracy": 0.8963903784751892, "eval_loss": 0.20752401649951935, "eval_runtime": 538.1828, "eval_samples_per_second": 8.339, "eval_steps_per_second": 2.085, "step": 120 }, { "epoch": 0.057932263814616754, "grad_norm": 0.08961692452430725, "learning_rate": 0.00015272727272727275, "loss": 0.2757, "step": 130 }, { "epoch": 0.057932263814616754, "eval_accuracy": 0.89683598279953, "eval_loss": 0.1715579330921173, "eval_runtime": 534.1189, "eval_samples_per_second": 8.403, "eval_steps_per_second": 2.101, "step": 130 }, { "epoch": 0.062388591800356503, "grad_norm": 1.3324838876724243, "learning_rate": 0.0001490909090909091, "loss": 0.1588, "step": 140 }, { "epoch": 0.062388591800356503, "eval_accuracy": 0.89683598279953, "eval_loss": 0.149879589676857, "eval_runtime": 530.7418, "eval_samples_per_second": 8.456, "eval_steps_per_second": 2.114, "step": 140 }, { "epoch": 0.06684491978609626, "grad_norm": 2.0198991298675537, "learning_rate": 0.00014545454545454546, "loss": 0.4453, "step": 150 }, { "epoch": 0.06684491978609626, "eval_accuracy": 0.89683598279953, "eval_loss": 0.2379792034626007, "eval_runtime": 530.3365, "eval_samples_per_second": 8.463, "eval_steps_per_second": 2.116, "step": 150 }, { "epoch": 0.07130124777183601, "grad_norm": 0.3087630867958069, "learning_rate": 0.00014181818181818184, "loss": 0.2505, "step": 160 }, { "epoch": 0.07130124777183601, "eval_accuracy": 0.89683598279953, "eval_loss": 0.3300795257091522, "eval_runtime": 529.0534, "eval_samples_per_second": 8.483, "eval_steps_per_second": 2.121, "step": 160 }, { "epoch": 0.07575757575757576, "grad_norm": 2.6436047554016113, "learning_rate": 0.0001381818181818182, "loss": 0.2654, "step": 170 }, { "epoch": 0.07575757575757576, "eval_accuracy": 0.89683598279953, "eval_loss": 0.22237823903560638, "eval_runtime": 536.9554, "eval_samples_per_second": 8.358, "eval_steps_per_second": 2.09, "step": 170 }, { "epoch": 0.08021390374331551, "grad_norm": 0.9066676497459412, "learning_rate": 0.00013454545454545455, "loss": 0.3018, "step": 180 }, { "epoch": 0.08021390374331551, "eval_accuracy": 0.89683598279953, "eval_loss": 0.2531951069831848, "eval_runtime": 532.2386, "eval_samples_per_second": 8.432, "eval_steps_per_second": 2.108, "step": 180 }, { "epoch": 0.08467023172905526, "grad_norm": 0.10262551158666611, "learning_rate": 0.00013090909090909093, "loss": 0.2325, "step": 190 }, { "epoch": 0.08467023172905526, "eval_accuracy": 0.89683598279953, "eval_loss": 0.22432851791381836, "eval_runtime": 531.5973, "eval_samples_per_second": 8.442, "eval_steps_per_second": 2.111, "step": 190 }, { "epoch": 0.08912655971479501, "grad_norm": 0.20511843264102936, "learning_rate": 0.00012727272727272728, "loss": 0.2641, "step": 200 }, { "epoch": 0.08912655971479501, "eval_accuracy": 0.89683598279953, "eval_loss": 0.14137917757034302, "eval_runtime": 528.9957, "eval_samples_per_second": 8.484, "eval_steps_per_second": 2.121, "step": 200 }, { "epoch": 0.09358288770053476, "grad_norm": 0.9824792742729187, "learning_rate": 0.00012363636363636364, "loss": 0.1497, "step": 210 }, { "epoch": 0.09358288770053476, "eval_accuracy": 0.89683598279953, "eval_loss": 0.12781496345996857, "eval_runtime": 532.3045, "eval_samples_per_second": 8.431, "eval_steps_per_second": 2.108, "step": 210 }, { "epoch": 0.09803921568627451, "grad_norm": 0.29042425751686096, "learning_rate": 0.00012, "loss": 0.1536, "step": 220 }, { "epoch": 0.09803921568627451, "eval_accuracy": 0.9380570650100708, "eval_loss": 0.14640431106090546, "eval_runtime": 535.3538, "eval_samples_per_second": 8.383, "eval_steps_per_second": 2.096, "step": 220 }, { "epoch": 0.10249554367201426, "grad_norm": 0.5411375761032104, "learning_rate": 0.00011636363636363636, "loss": 0.1801, "step": 230 }, { "epoch": 0.10249554367201426, "eval_accuracy": 0.9358288645744324, "eval_loss": 0.1414562165737152, "eval_runtime": 533.0939, "eval_samples_per_second": 8.419, "eval_steps_per_second": 2.105, "step": 230 }, { "epoch": 0.10695187165775401, "grad_norm": 5.630716323852539, "learning_rate": 0.00011272727272727272, "loss": 0.1344, "step": 240 }, { "epoch": 0.10695187165775401, "eval_accuracy": 0.9844028353691101, "eval_loss": 0.08324988931417465, "eval_runtime": 536.1639, "eval_samples_per_second": 8.371, "eval_steps_per_second": 2.093, "step": 240 }, { "epoch": 0.11140819964349376, "grad_norm": 0.27143993973731995, "learning_rate": 0.00010909090909090909, "loss": 0.1722, "step": 250 }, { "epoch": 0.11140819964349376, "eval_accuracy": 0.9861853718757629, "eval_loss": 0.060878392308950424, "eval_runtime": 533.7467, "eval_samples_per_second": 8.408, "eval_steps_per_second": 2.102, "step": 250 }, { "epoch": 0.11586452762923351, "grad_norm": 0.6671731472015381, "learning_rate": 0.00010545454545454545, "loss": 0.0684, "step": 260 }, { "epoch": 0.11586452762923351, "eval_accuracy": 0.8997326493263245, "eval_loss": 0.19059808552265167, "eval_runtime": 537.3352, "eval_samples_per_second": 8.352, "eval_steps_per_second": 2.088, "step": 260 }, { "epoch": 0.12032085561497326, "grad_norm": 0.007157750893384218, "learning_rate": 0.00010181818181818181, "loss": 0.2185, "step": 270 }, { "epoch": 0.12032085561497326, "eval_accuracy": 0.9703654050827026, "eval_loss": 0.07430911809206009, "eval_runtime": 534.3595, "eval_samples_per_second": 8.399, "eval_steps_per_second": 2.1, "step": 270 }, { "epoch": 0.12477718360071301, "grad_norm": 0.10907144099473953, "learning_rate": 9.818181818181818e-05, "loss": 0.0562, "step": 280 }, { "epoch": 0.12477718360071301, "eval_accuracy": 0.9839572310447693, "eval_loss": 0.046000149101018906, "eval_runtime": 532.5534, "eval_samples_per_second": 8.427, "eval_steps_per_second": 2.107, "step": 280 }, { "epoch": 0.12923351158645277, "grad_norm": 0.004431632813066244, "learning_rate": 9.454545454545455e-05, "loss": 0.007, "step": 290 }, { "epoch": 0.12923351158645277, "eval_accuracy": 0.9895276427268982, "eval_loss": 0.03365661948919296, "eval_runtime": 539.0883, "eval_samples_per_second": 8.325, "eval_steps_per_second": 2.081, "step": 290 }, { "epoch": 0.13368983957219252, "grad_norm": 0.14239944517612457, "learning_rate": 9.090909090909092e-05, "loss": 0.1054, "step": 300 }, { "epoch": 0.13368983957219252, "eval_accuracy": 0.9890819787979126, "eval_loss": 0.03497695177793503, "eval_runtime": 530.7114, "eval_samples_per_second": 8.457, "eval_steps_per_second": 2.114, "step": 300 }, { "epoch": 0.13814616755793227, "grad_norm": 0.19534932076931, "learning_rate": 8.727272727272727e-05, "loss": 0.0074, "step": 310 }, { "epoch": 0.13814616755793227, "eval_accuracy": 0.9815062284469604, "eval_loss": 0.055771518498659134, "eval_runtime": 540.1438, "eval_samples_per_second": 8.309, "eval_steps_per_second": 2.077, "step": 310 }, { "epoch": 0.14260249554367202, "grad_norm": 0.14106573164463043, "learning_rate": 8.363636363636364e-05, "loss": 0.0069, "step": 320 }, { "epoch": 0.14260249554367202, "eval_accuracy": 0.9752673506736755, "eval_loss": 0.07411307096481323, "eval_runtime": 530.0836, "eval_samples_per_second": 8.467, "eval_steps_per_second": 2.117, "step": 320 }, { "epoch": 0.14705882352941177, "grad_norm": 0.04756563529372215, "learning_rate": 8e-05, "loss": 0.0881, "step": 330 }, { "epoch": 0.14705882352941177, "eval_accuracy": 0.9884135723114014, "eval_loss": 0.03824571892619133, "eval_runtime": 529.2936, "eval_samples_per_second": 8.479, "eval_steps_per_second": 2.12, "step": 330 }, { "epoch": 0.15151515151515152, "grad_norm": 0.008182384073734283, "learning_rate": 7.636363636363637e-05, "loss": 0.047, "step": 340 }, { "epoch": 0.15151515151515152, "eval_accuracy": 0.9779411554336548, "eval_loss": 0.07092902809381485, "eval_runtime": 530.9951, "eval_samples_per_second": 8.452, "eval_steps_per_second": 2.113, "step": 340 }, { "epoch": 0.15597147950089127, "grad_norm": 6.055085182189941, "learning_rate": 7.272727272727273e-05, "loss": 0.1135, "step": 350 }, { "epoch": 0.15597147950089127, "eval_accuracy": 0.9935383200645447, "eval_loss": 0.02391628548502922, "eval_runtime": 531.8853, "eval_samples_per_second": 8.438, "eval_steps_per_second": 2.109, "step": 350 }, { "epoch": 0.16042780748663102, "grad_norm": 2.0424458980560303, "learning_rate": 6.90909090909091e-05, "loss": 0.1708, "step": 360 }, { "epoch": 0.16042780748663102, "eval_accuracy": 0.9844028353691101, "eval_loss": 0.0482899434864521, "eval_runtime": 538.9554, "eval_samples_per_second": 8.327, "eval_steps_per_second": 2.082, "step": 360 }, { "epoch": 0.16488413547237077, "grad_norm": 0.008309995755553246, "learning_rate": 6.545454545454546e-05, "loss": 0.0053, "step": 370 }, { "epoch": 0.16488413547237077, "eval_accuracy": 0.9977718591690063, "eval_loss": 0.011557623744010925, "eval_runtime": 536.7458, "eval_samples_per_second": 8.361, "eval_steps_per_second": 2.09, "step": 370 }, { "epoch": 0.16934046345811052, "grad_norm": 0.0617559477686882, "learning_rate": 6.181818181818182e-05, "loss": 0.0257, "step": 380 }, { "epoch": 0.16934046345811052, "eval_accuracy": 0.9979946613311768, "eval_loss": 0.011457420885562897, "eval_runtime": 538.0907, "eval_samples_per_second": 8.341, "eval_steps_per_second": 2.085, "step": 380 }, { "epoch": 0.17379679144385027, "grad_norm": 0.1585519164800644, "learning_rate": 5.818181818181818e-05, "loss": 0.0978, "step": 390 }, { "epoch": 0.17379679144385027, "eval_accuracy": 0.9986631274223328, "eval_loss": 0.009089282713830471, "eval_runtime": 539.9652, "eval_samples_per_second": 8.312, "eval_steps_per_second": 2.078, "step": 390 }, { "epoch": 0.17825311942959002, "grad_norm": 0.13566212356090546, "learning_rate": 5.4545454545454546e-05, "loss": 0.0467, "step": 400 }, { "epoch": 0.17825311942959002, "eval_accuracy": 0.9982174634933472, "eval_loss": 0.011012092232704163, "eval_runtime": 539.0037, "eval_samples_per_second": 8.326, "eval_steps_per_second": 2.082, "step": 400 }, { "epoch": 0.18270944741532977, "grad_norm": 0.054975979030132294, "learning_rate": 5.090909090909091e-05, "loss": 0.0266, "step": 410 }, { "epoch": 0.18270944741532977, "eval_accuracy": 0.9988859295845032, "eval_loss": 0.008680622093379498, "eval_runtime": 536.4536, "eval_samples_per_second": 8.366, "eval_steps_per_second": 2.092, "step": 410 }, { "epoch": 0.18716577540106952, "grad_norm": 9.600521087646484, "learning_rate": 4.7272727272727275e-05, "loss": 0.0118, "step": 420 }, { "epoch": 0.18716577540106952, "eval_accuracy": 0.9988859295845032, "eval_loss": 0.008196841925382614, "eval_runtime": 529.3899, "eval_samples_per_second": 8.478, "eval_steps_per_second": 2.119, "step": 420 }, { "epoch": 0.19162210338680927, "grad_norm": 0.00836893916130066, "learning_rate": 4.3636363636363636e-05, "loss": 0.0689, "step": 430 }, { "epoch": 0.19162210338680927, "eval_accuracy": 0.9984402656555176, "eval_loss": 0.009529507718980312, "eval_runtime": 537.9471, "eval_samples_per_second": 8.343, "eval_steps_per_second": 2.086, "step": 430 }, { "epoch": 0.19607843137254902, "grad_norm": 0.05235498398542404, "learning_rate": 4e-05, "loss": 0.0785, "step": 440 }, { "epoch": 0.19607843137254902, "eval_accuracy": 0.9975489974021912, "eval_loss": 0.01092607993632555, "eval_runtime": 533.7576, "eval_samples_per_second": 8.408, "eval_steps_per_second": 2.102, "step": 440 }, { "epoch": 0.20053475935828877, "grad_norm": 7.876805782318115, "learning_rate": 3.6363636363636364e-05, "loss": 0.0749, "step": 450 }, { "epoch": 0.20053475935828877, "eval_accuracy": 0.9971033930778503, "eval_loss": 0.013591339811682701, "eval_runtime": 532.153, "eval_samples_per_second": 8.434, "eval_steps_per_second": 2.108, "step": 450 }, { "epoch": 0.20499108734402852, "grad_norm": 0.055418092757463455, "learning_rate": 3.272727272727273e-05, "loss": 0.0038, "step": 460 }, { "epoch": 0.20499108734402852, "eval_accuracy": 0.9930927157402039, "eval_loss": 0.025715434923768044, "eval_runtime": 531.9415, "eval_samples_per_second": 8.437, "eval_steps_per_second": 2.109, "step": 460 }, { "epoch": 0.20944741532976827, "grad_norm": 26.78069305419922, "learning_rate": 2.909090909090909e-05, "loss": 0.1342, "step": 470 }, { "epoch": 0.20944741532976827, "eval_accuracy": 0.987522304058075, "eval_loss": 0.040662843734025955, "eval_runtime": 530.1651, "eval_samples_per_second": 8.465, "eval_steps_per_second": 2.116, "step": 470 }, { "epoch": 0.21390374331550802, "grad_norm": 13.968317031860352, "learning_rate": 2.5454545454545454e-05, "loss": 0.1816, "step": 480 }, { "epoch": 0.21390374331550802, "eval_accuracy": 0.9803921580314636, "eval_loss": 0.06171296164393425, "eval_runtime": 536.4017, "eval_samples_per_second": 8.367, "eval_steps_per_second": 2.092, "step": 480 }, { "epoch": 0.21836007130124777, "grad_norm": 0.009305565617978573, "learning_rate": 2.1818181818181818e-05, "loss": 0.0735, "step": 490 }, { "epoch": 0.21836007130124777, "eval_accuracy": 0.9734848737716675, "eval_loss": 0.08175662159919739, "eval_runtime": 530.5285, "eval_samples_per_second": 8.459, "eval_steps_per_second": 2.115, "step": 490 }, { "epoch": 0.22281639928698752, "grad_norm": 0.00870052631944418, "learning_rate": 1.8181818181818182e-05, "loss": 0.0535, "step": 500 }, { "epoch": 0.22281639928698752, "eval_accuracy": 0.9748217463493347, "eval_loss": 0.07746395468711853, "eval_runtime": 534.4791, "eval_samples_per_second": 8.397, "eval_steps_per_second": 2.099, "step": 500 }, { "epoch": 0.22727272727272727, "grad_norm": 0.05970863625407219, "learning_rate": 1.4545454545454545e-05, "loss": 0.0088, "step": 510 }, { "epoch": 0.22727272727272727, "eval_accuracy": 0.9846256971359253, "eval_loss": 0.04695257917046547, "eval_runtime": 532.8984, "eval_samples_per_second": 8.422, "eval_steps_per_second": 2.105, "step": 510 }, { "epoch": 0.23172905525846701, "grad_norm": 11.049286842346191, "learning_rate": 1.0909090909090909e-05, "loss": 0.0836, "step": 520 }, { "epoch": 0.23172905525846701, "eval_accuracy": 0.9897504448890686, "eval_loss": 0.03497246652841568, "eval_runtime": 534.5562, "eval_samples_per_second": 8.396, "eval_steps_per_second": 2.099, "step": 520 }, { "epoch": 0.23618538324420676, "grad_norm": 0.18157783150672913, "learning_rate": 7.272727272727272e-06, "loss": 0.0086, "step": 530 }, { "epoch": 0.23618538324420676, "eval_accuracy": 0.9888591766357422, "eval_loss": 0.03722322732210159, "eval_runtime": 532.0239, "eval_samples_per_second": 8.436, "eval_steps_per_second": 2.109, "step": 530 }, { "epoch": 0.24064171122994651, "grad_norm": 0.35108181834220886, "learning_rate": 3.636363636363636e-06, "loss": 0.0046, "step": 540 }, { "epoch": 0.24064171122994651, "eval_accuracy": 0.9888591766357422, "eval_loss": 0.0368054136633873, "eval_runtime": 531.8153, "eval_samples_per_second": 8.439, "eval_steps_per_second": 2.11, "step": 540 }, { "epoch": 0.24509803921568626, "grad_norm": 0.008579758927226067, "learning_rate": 0.0, "loss": 0.042, "step": 550 }, { "epoch": 0.24509803921568626, "eval_accuracy": 0.9890819787979126, "eval_loss": 0.03674088791012764, "eval_runtime": 537.7421, "eval_samples_per_second": 8.346, "eval_steps_per_second": 2.087, "step": 550 } ], "logging_steps": 10, "max_steps": 550, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.326400520422712e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }