|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.24509803921568626,
  "eval_steps": 10,
  "global_step": 550,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004456327985739751,
      "grad_norm": 0.7227747440338135,
      "learning_rate": 0.00019636363636363636,
      "loss": 0.364,
      "step": 10
    },
    {
      "epoch": 0.004456327985739751,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.3638981878757477,
      "eval_runtime": 537.6403,
      "eval_samples_per_second": 8.348,
      "eval_steps_per_second": 2.087,
      "step": 10
    },
    {
      "epoch": 0.008912655971479501,
      "grad_norm": 0.0924869254231453,
      "learning_rate": 0.00019272727272727274,
      "loss": 0.3247,
      "step": 20
    },
    {
      "epoch": 0.008912655971479501,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.5152587890625,
      "eval_runtime": 533.0372,
      "eval_samples_per_second": 8.42,
      "eval_steps_per_second": 2.105,
      "step": 20
    },
    {
      "epoch": 0.013368983957219251,
      "grad_norm": 0.23558691143989563,
      "learning_rate": 0.0001890909090909091,
      "loss": 0.4074,
      "step": 30
    },
    {
      "epoch": 0.013368983957219251,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.3966202735900879,
      "eval_runtime": 532.3437,
      "eval_samples_per_second": 8.431,
      "eval_steps_per_second": 2.108,
      "step": 30
    },
    {
      "epoch": 0.017825311942959002,
      "grad_norm": 0.7453870177268982,
      "learning_rate": 0.00018545454545454545,
      "loss": 0.2989,
      "step": 40
    },
    {
      "epoch": 0.017825311942959002,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.33440741896629333,
      "eval_runtime": 532.0041,
      "eval_samples_per_second": 8.436,
      "eval_steps_per_second": 2.109,
      "step": 40
    },
    {
      "epoch": 0.022281639928698752,
      "grad_norm": 6.811055660247803,
      "learning_rate": 0.00018181818181818183,
      "loss": 0.3399,
      "step": 50
    },
    {
      "epoch": 0.022281639928698752,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.32954567670822144,
      "eval_runtime": 536.9612,
      "eval_samples_per_second": 8.358,
      "eval_steps_per_second": 2.09,
      "step": 50
    },
    {
      "epoch": 0.026737967914438502,
      "grad_norm": 1.3061631917953491,
      "learning_rate": 0.0001781818181818182,
      "loss": 0.3437,
      "step": 60
    },
    {
      "epoch": 0.026737967914438502,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.3265528976917267,
      "eval_runtime": 533.3224,
      "eval_samples_per_second": 8.415,
      "eval_steps_per_second": 2.104,
      "step": 60
    },
    {
      "epoch": 0.031194295900178252,
      "grad_norm": 1.7616266012191772,
      "learning_rate": 0.00017454545454545454,
      "loss": 0.3749,
      "step": 70
    },
    {
      "epoch": 0.031194295900178252,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.3122697174549103,
      "eval_runtime": 530.7411,
      "eval_samples_per_second": 8.456,
      "eval_steps_per_second": 2.114,
      "step": 70
    },
    {
      "epoch": 0.035650623885918005,
      "grad_norm": 1.1275932788848877,
      "learning_rate": 0.0001709090909090909,
      "loss": 0.4267,
      "step": 80
    },
    {
      "epoch": 0.035650623885918005,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.23432780802249908,
      "eval_runtime": 533.9058,
      "eval_samples_per_second": 8.406,
      "eval_steps_per_second": 2.101,
      "step": 80
    },
    {
      "epoch": 0.040106951871657755,
      "grad_norm": 1.387789249420166,
      "learning_rate": 0.00016727272727272728,
      "loss": 0.4221,
      "step": 90
    },
    {
      "epoch": 0.040106951871657755,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.21733854711055756,
      "eval_runtime": 533.5436,
      "eval_samples_per_second": 8.412,
      "eval_steps_per_second": 2.103,
      "step": 90
    },
    {
      "epoch": 0.044563279857397504,
      "grad_norm": 9.95677375793457,
      "learning_rate": 0.00016363636363636366,
      "loss": 0.2292,
      "step": 100
    },
    {
      "epoch": 0.044563279857397504,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.3393175005912781,
      "eval_runtime": 537.6972,
      "eval_samples_per_second": 8.347,
      "eval_steps_per_second": 2.087,
      "step": 100
    },
    {
      "epoch": 0.049019607843137254,
      "grad_norm": 1.0864927768707275,
      "learning_rate": 0.00016,
      "loss": 0.4339,
      "step": 110
    },
    {
      "epoch": 0.049019607843137254,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.28164002299308777,
      "eval_runtime": 531.492,
      "eval_samples_per_second": 8.444,
      "eval_steps_per_second": 2.111,
      "step": 110
    },
    {
      "epoch": 0.053475935828877004,
      "grad_norm": 1.7070204019546509,
      "learning_rate": 0.00015636363636363637,
      "loss": 0.4143,
      "step": 120
    },
    {
      "epoch": 0.053475935828877004,
      "eval_accuracy": 0.8963903784751892,
      "eval_loss": 0.20752401649951935,
      "eval_runtime": 538.1828,
      "eval_samples_per_second": 8.339,
      "eval_steps_per_second": 2.085,
      "step": 120
    },
    {
      "epoch": 0.057932263814616754,
      "grad_norm": 0.08961692452430725,
      "learning_rate": 0.00015272727272727275,
      "loss": 0.2757,
      "step": 130
    },
    {
      "epoch": 0.057932263814616754,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.1715579330921173,
      "eval_runtime": 534.1189,
      "eval_samples_per_second": 8.403,
      "eval_steps_per_second": 2.101,
      "step": 130
    },
    {
      "epoch": 0.062388591800356503,
      "grad_norm": 1.3324838876724243,
      "learning_rate": 0.0001490909090909091,
      "loss": 0.1588,
      "step": 140
    },
    {
      "epoch": 0.062388591800356503,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.149879589676857,
      "eval_runtime": 530.7418,
      "eval_samples_per_second": 8.456,
      "eval_steps_per_second": 2.114,
      "step": 140
    },
    {
      "epoch": 0.06684491978609626,
      "grad_norm": 2.0198991298675537,
      "learning_rate": 0.00014545454545454546,
      "loss": 0.4453,
      "step": 150
    },
    {
      "epoch": 0.06684491978609626,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.2379792034626007,
      "eval_runtime": 530.3365,
      "eval_samples_per_second": 8.463,
      "eval_steps_per_second": 2.116,
      "step": 150
    },
    {
      "epoch": 0.07130124777183601,
      "grad_norm": 0.3087630867958069,
      "learning_rate": 0.00014181818181818184,
      "loss": 0.2505,
      "step": 160
    },
    {
      "epoch": 0.07130124777183601,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.3300795257091522,
      "eval_runtime": 529.0534,
      "eval_samples_per_second": 8.483,
      "eval_steps_per_second": 2.121,
      "step": 160
    },
    {
      "epoch": 0.07575757575757576,
      "grad_norm": 2.6436047554016113,
      "learning_rate": 0.0001381818181818182,
      "loss": 0.2654,
      "step": 170
    },
    {
      "epoch": 0.07575757575757576,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.22237823903560638,
      "eval_runtime": 536.9554,
      "eval_samples_per_second": 8.358,
      "eval_steps_per_second": 2.09,
      "step": 170
    },
    {
      "epoch": 0.08021390374331551,
      "grad_norm": 0.9066676497459412,
      "learning_rate": 0.00013454545454545455,
      "loss": 0.3018,
      "step": 180
    },
    {
      "epoch": 0.08021390374331551,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.2531951069831848,
      "eval_runtime": 532.2386,
      "eval_samples_per_second": 8.432,
      "eval_steps_per_second": 2.108,
      "step": 180
    },
    {
      "epoch": 0.08467023172905526,
      "grad_norm": 0.10262551158666611,
      "learning_rate": 0.00013090909090909093,
      "loss": 0.2325,
      "step": 190
    },
    {
      "epoch": 0.08467023172905526,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.22432851791381836,
      "eval_runtime": 531.5973,
      "eval_samples_per_second": 8.442,
      "eval_steps_per_second": 2.111,
      "step": 190
    },
    {
      "epoch": 0.08912655971479501,
      "grad_norm": 0.20511843264102936,
      "learning_rate": 0.00012727272727272728,
      "loss": 0.2641,
      "step": 200
    },
    {
      "epoch": 0.08912655971479501,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.14137917757034302,
      "eval_runtime": 528.9957,
      "eval_samples_per_second": 8.484,
      "eval_steps_per_second": 2.121,
      "step": 200
    },
    {
      "epoch": 0.09358288770053476,
      "grad_norm": 0.9824792742729187,
      "learning_rate": 0.00012363636363636364,
      "loss": 0.1497,
      "step": 210
    },
    {
      "epoch": 0.09358288770053476,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.12781496345996857,
      "eval_runtime": 532.3045,
      "eval_samples_per_second": 8.431,
      "eval_steps_per_second": 2.108,
      "step": 210
    },
    {
      "epoch": 0.09803921568627451,
      "grad_norm": 0.29042425751686096,
      "learning_rate": 0.00012,
      "loss": 0.1536,
      "step": 220
    },
    {
      "epoch": 0.09803921568627451,
      "eval_accuracy": 0.9380570650100708,
      "eval_loss": 0.14640431106090546,
      "eval_runtime": 535.3538,
      "eval_samples_per_second": 8.383,
      "eval_steps_per_second": 2.096,
      "step": 220
    },
    {
      "epoch": 0.10249554367201426,
      "grad_norm": 0.5411375761032104,
      "learning_rate": 0.00011636363636363636,
      "loss": 0.1801,
      "step": 230
    },
    {
      "epoch": 0.10249554367201426,
      "eval_accuracy": 0.9358288645744324,
      "eval_loss": 0.1414562165737152,
      "eval_runtime": 533.0939,
      "eval_samples_per_second": 8.419,
      "eval_steps_per_second": 2.105,
      "step": 230
    },
    {
      "epoch": 0.10695187165775401,
      "grad_norm": 5.630716323852539,
      "learning_rate": 0.00011272727272727272,
      "loss": 0.1344,
      "step": 240
    },
    {
      "epoch": 0.10695187165775401,
      "eval_accuracy": 0.9844028353691101,
      "eval_loss": 0.08324988931417465,
      "eval_runtime": 536.1639,
      "eval_samples_per_second": 8.371,
      "eval_steps_per_second": 2.093,
      "step": 240
    },
    {
      "epoch": 0.11140819964349376,
      "grad_norm": 0.27143993973731995,
      "learning_rate": 0.00010909090909090909,
      "loss": 0.1722,
      "step": 250
    },
    {
      "epoch": 0.11140819964349376,
      "eval_accuracy": 0.9861853718757629,
      "eval_loss": 0.060878392308950424,
      "eval_runtime": 533.7467,
      "eval_samples_per_second": 8.408,
      "eval_steps_per_second": 2.102,
      "step": 250
    },
    {
      "epoch": 0.11586452762923351,
      "grad_norm": 0.6671731472015381,
      "learning_rate": 0.00010545454545454545,
      "loss": 0.0684,
      "step": 260
    },
    {
      "epoch": 0.11586452762923351,
      "eval_accuracy": 0.8997326493263245,
      "eval_loss": 0.19059808552265167,
      "eval_runtime": 537.3352,
      "eval_samples_per_second": 8.352,
      "eval_steps_per_second": 2.088,
      "step": 260
    },
    {
      "epoch": 0.12032085561497326,
      "grad_norm": 0.007157750893384218,
      "learning_rate": 0.00010181818181818181,
      "loss": 0.2185,
      "step": 270
    },
    {
      "epoch": 0.12032085561497326,
      "eval_accuracy": 0.9703654050827026,
      "eval_loss": 0.07430911809206009,
      "eval_runtime": 534.3595,
      "eval_samples_per_second": 8.399,
      "eval_steps_per_second": 2.1,
      "step": 270
    },
    {
      "epoch": 0.12477718360071301,
      "grad_norm": 0.10907144099473953,
      "learning_rate": 9.818181818181818e-05,
      "loss": 0.0562,
      "step": 280
    },
    {
      "epoch": 0.12477718360071301,
      "eval_accuracy": 0.9839572310447693,
      "eval_loss": 0.046000149101018906,
      "eval_runtime": 532.5534,
      "eval_samples_per_second": 8.427,
      "eval_steps_per_second": 2.107,
      "step": 280
    },
    {
      "epoch": 0.12923351158645277,
      "grad_norm": 0.004431632813066244,
      "learning_rate": 9.454545454545455e-05,
      "loss": 0.007,
      "step": 290
    },
    {
      "epoch": 0.12923351158645277,
      "eval_accuracy": 0.9895276427268982,
      "eval_loss": 0.03365661948919296,
      "eval_runtime": 539.0883,
      "eval_samples_per_second": 8.325,
      "eval_steps_per_second": 2.081,
      "step": 290
    },
    {
      "epoch": 0.13368983957219252,
      "grad_norm": 0.14239944517612457,
      "learning_rate": 9.090909090909092e-05,
      "loss": 0.1054,
      "step": 300
    },
    {
      "epoch": 0.13368983957219252,
      "eval_accuracy": 0.9890819787979126,
      "eval_loss": 0.03497695177793503,
      "eval_runtime": 530.7114,
      "eval_samples_per_second": 8.457,
      "eval_steps_per_second": 2.114,
      "step": 300
    },
    {
      "epoch": 0.13814616755793227,
      "grad_norm": 0.19534932076931,
      "learning_rate": 8.727272727272727e-05,
      "loss": 0.0074,
      "step": 310
    },
    {
      "epoch": 0.13814616755793227,
      "eval_accuracy": 0.9815062284469604,
      "eval_loss": 0.055771518498659134,
      "eval_runtime": 540.1438,
      "eval_samples_per_second": 8.309,
      "eval_steps_per_second": 2.077,
      "step": 310
    },
    {
      "epoch": 0.14260249554367202,
      "grad_norm": 0.14106573164463043,
      "learning_rate": 8.363636363636364e-05,
      "loss": 0.0069,
      "step": 320
    },
    {
      "epoch": 0.14260249554367202,
      "eval_accuracy": 0.9752673506736755,
      "eval_loss": 0.07411307096481323,
      "eval_runtime": 530.0836,
      "eval_samples_per_second": 8.467,
      "eval_steps_per_second": 2.117,
      "step": 320
    },
    {
      "epoch": 0.14705882352941177,
      "grad_norm": 0.04756563529372215,
      "learning_rate": 8e-05,
      "loss": 0.0881,
      "step": 330
    },
    {
      "epoch": 0.14705882352941177,
      "eval_accuracy": 0.9884135723114014,
      "eval_loss": 0.03824571892619133,
      "eval_runtime": 529.2936,
      "eval_samples_per_second": 8.479,
      "eval_steps_per_second": 2.12,
      "step": 330
    },
    {
      "epoch": 0.15151515151515152,
      "grad_norm": 0.008182384073734283,
      "learning_rate": 7.636363636363637e-05,
      "loss": 0.047,
      "step": 340
    },
    {
      "epoch": 0.15151515151515152,
      "eval_accuracy": 0.9779411554336548,
      "eval_loss": 0.07092902809381485,
      "eval_runtime": 530.9951,
      "eval_samples_per_second": 8.452,
      "eval_steps_per_second": 2.113,
      "step": 340
    },
    {
      "epoch": 0.15597147950089127,
      "grad_norm": 6.055085182189941,
      "learning_rate": 7.272727272727273e-05,
      "loss": 0.1135,
      "step": 350
    },
    {
      "epoch": 0.15597147950089127,
      "eval_accuracy": 0.9935383200645447,
      "eval_loss": 0.02391628548502922,
      "eval_runtime": 531.8853,
      "eval_samples_per_second": 8.438,
      "eval_steps_per_second": 2.109,
      "step": 350
    },
    {
      "epoch": 0.16042780748663102,
      "grad_norm": 2.0424458980560303,
      "learning_rate": 6.90909090909091e-05,
      "loss": 0.1708,
      "step": 360
    },
    {
      "epoch": 0.16042780748663102,
      "eval_accuracy": 0.9844028353691101,
      "eval_loss": 0.0482899434864521,
      "eval_runtime": 538.9554,
      "eval_samples_per_second": 8.327,
      "eval_steps_per_second": 2.082,
      "step": 360
    },
    {
      "epoch": 0.16488413547237077,
      "grad_norm": 0.008309995755553246,
      "learning_rate": 6.545454545454546e-05,
      "loss": 0.0053,
      "step": 370
    },
    {
      "epoch": 0.16488413547237077,
      "eval_accuracy": 0.9977718591690063,
      "eval_loss": 0.011557623744010925,
      "eval_runtime": 536.7458,
      "eval_samples_per_second": 8.361,
      "eval_steps_per_second": 2.09,
      "step": 370
    },
    {
      "epoch": 0.16934046345811052,
      "grad_norm": 0.0617559477686882,
      "learning_rate": 6.181818181818182e-05,
      "loss": 0.0257,
      "step": 380
    },
    {
      "epoch": 0.16934046345811052,
      "eval_accuracy": 0.9979946613311768,
      "eval_loss": 0.011457420885562897,
      "eval_runtime": 538.0907,
      "eval_samples_per_second": 8.341,
      "eval_steps_per_second": 2.085,
      "step": 380
    },
    {
      "epoch": 0.17379679144385027,
      "grad_norm": 0.1585519164800644,
      "learning_rate": 5.818181818181818e-05,
      "loss": 0.0978,
      "step": 390
    },
    {
      "epoch": 0.17379679144385027,
      "eval_accuracy": 0.9986631274223328,
      "eval_loss": 0.009089282713830471,
      "eval_runtime": 539.9652,
      "eval_samples_per_second": 8.312,
      "eval_steps_per_second": 2.078,
      "step": 390
    },
    {
      "epoch": 0.17825311942959002,
      "grad_norm": 0.13566212356090546,
      "learning_rate": 5.4545454545454546e-05,
      "loss": 0.0467,
      "step": 400
    },
    {
      "epoch": 0.17825311942959002,
      "eval_accuracy": 0.9982174634933472,
      "eval_loss": 0.011012092232704163,
      "eval_runtime": 539.0037,
      "eval_samples_per_second": 8.326,
      "eval_steps_per_second": 2.082,
      "step": 400
    },
    {
      "epoch": 0.18270944741532977,
      "grad_norm": 0.054975979030132294,
      "learning_rate": 5.090909090909091e-05,
      "loss": 0.0266,
      "step": 410
    },
    {
      "epoch": 0.18270944741532977,
      "eval_accuracy": 0.9988859295845032,
      "eval_loss": 0.008680622093379498,
      "eval_runtime": 536.4536,
      "eval_samples_per_second": 8.366,
      "eval_steps_per_second": 2.092,
      "step": 410
    },
    {
      "epoch": 0.18716577540106952,
      "grad_norm": 9.600521087646484,
      "learning_rate": 4.7272727272727275e-05,
      "loss": 0.0118,
      "step": 420
    },
    {
      "epoch": 0.18716577540106952,
      "eval_accuracy": 0.9988859295845032,
      "eval_loss": 0.008196841925382614,
      "eval_runtime": 529.3899,
      "eval_samples_per_second": 8.478,
      "eval_steps_per_second": 2.119,
      "step": 420
    },
    {
      "epoch": 0.19162210338680927,
      "grad_norm": 0.00836893916130066,
      "learning_rate": 4.3636363636363636e-05,
      "loss": 0.0689,
      "step": 430
    },
    {
      "epoch": 0.19162210338680927,
      "eval_accuracy": 0.9984402656555176,
      "eval_loss": 0.009529507718980312,
      "eval_runtime": 537.9471,
      "eval_samples_per_second": 8.343,
      "eval_steps_per_second": 2.086,
      "step": 430
    },
    {
      "epoch": 0.19607843137254902,
      "grad_norm": 0.05235498398542404,
      "learning_rate": 4e-05,
      "loss": 0.0785,
      "step": 440
    },
    {
      "epoch": 0.19607843137254902,
      "eval_accuracy": 0.9975489974021912,
      "eval_loss": 0.01092607993632555,
      "eval_runtime": 533.7576,
      "eval_samples_per_second": 8.408,
      "eval_steps_per_second": 2.102,
      "step": 440
    },
    {
      "epoch": 0.20053475935828877,
      "grad_norm": 7.876805782318115,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 0.0749,
      "step": 450
    },
    {
      "epoch": 0.20053475935828877,
      "eval_accuracy": 0.9971033930778503,
      "eval_loss": 0.013591339811682701,
      "eval_runtime": 532.153,
      "eval_samples_per_second": 8.434,
      "eval_steps_per_second": 2.108,
      "step": 450
    },
    {
      "epoch": 0.20499108734402852,
      "grad_norm": 0.055418092757463455,
      "learning_rate": 3.272727272727273e-05,
      "loss": 0.0038,
      "step": 460
    },
    {
      "epoch": 0.20499108734402852,
      "eval_accuracy": 0.9930927157402039,
      "eval_loss": 0.025715434923768044,
      "eval_runtime": 531.9415,
      "eval_samples_per_second": 8.437,
      "eval_steps_per_second": 2.109,
      "step": 460
    },
    {
      "epoch": 0.20944741532976827,
      "grad_norm": 26.78069305419922,
      "learning_rate": 2.909090909090909e-05,
      "loss": 0.1342,
      "step": 470
    },
    {
      "epoch": 0.20944741532976827,
      "eval_accuracy": 0.987522304058075,
      "eval_loss": 0.040662843734025955,
      "eval_runtime": 530.1651,
      "eval_samples_per_second": 8.465,
      "eval_steps_per_second": 2.116,
      "step": 470
    },
    {
      "epoch": 0.21390374331550802,
      "grad_norm": 13.968317031860352,
      "learning_rate": 2.5454545454545454e-05,
      "loss": 0.1816,
      "step": 480
    },
    {
      "epoch": 0.21390374331550802,
      "eval_accuracy": 0.9803921580314636,
      "eval_loss": 0.06171296164393425,
      "eval_runtime": 536.4017,
      "eval_samples_per_second": 8.367,
      "eval_steps_per_second": 2.092,
      "step": 480
    },
    {
      "epoch": 0.21836007130124777,
      "grad_norm": 0.009305565617978573,
      "learning_rate": 2.1818181818181818e-05,
      "loss": 0.0735,
      "step": 490
    },
    {
      "epoch": 0.21836007130124777,
      "eval_accuracy": 0.9734848737716675,
      "eval_loss": 0.08175662159919739,
      "eval_runtime": 530.5285,
      "eval_samples_per_second": 8.459,
      "eval_steps_per_second": 2.115,
      "step": 490
    },
    {
      "epoch": 0.22281639928698752,
      "grad_norm": 0.00870052631944418,
      "learning_rate": 1.8181818181818182e-05,
      "loss": 0.0535,
      "step": 500
    },
    {
      "epoch": 0.22281639928698752,
      "eval_accuracy": 0.9748217463493347,
      "eval_loss": 0.07746395468711853,
      "eval_runtime": 534.4791,
      "eval_samples_per_second": 8.397,
      "eval_steps_per_second": 2.099,
      "step": 500
    },
    {
      "epoch": 0.22727272727272727,
      "grad_norm": 0.05970863625407219,
      "learning_rate": 1.4545454545454545e-05,
      "loss": 0.0088,
      "step": 510
    },
    {
      "epoch": 0.22727272727272727,
      "eval_accuracy": 0.9846256971359253,
      "eval_loss": 0.04695257917046547,
      "eval_runtime": 532.8984,
      "eval_samples_per_second": 8.422,
      "eval_steps_per_second": 2.105,
      "step": 510
    },
    {
      "epoch": 0.23172905525846701,
      "grad_norm": 11.049286842346191,
      "learning_rate": 1.0909090909090909e-05,
      "loss": 0.0836,
      "step": 520
    },
    {
      "epoch": 0.23172905525846701,
      "eval_accuracy": 0.9897504448890686,
      "eval_loss": 0.03497246652841568,
      "eval_runtime": 534.5562,
      "eval_samples_per_second": 8.396,
      "eval_steps_per_second": 2.099,
      "step": 520
    },
    {
      "epoch": 0.23618538324420676,
      "grad_norm": 0.18157783150672913,
      "learning_rate": 7.272727272727272e-06,
      "loss": 0.0086,
      "step": 530
    },
    {
      "epoch": 0.23618538324420676,
      "eval_accuracy": 0.9888591766357422,
      "eval_loss": 0.03722322732210159,
      "eval_runtime": 532.0239,
      "eval_samples_per_second": 8.436,
      "eval_steps_per_second": 2.109,
      "step": 530
    },
    {
      "epoch": 0.24064171122994651,
      "grad_norm": 0.35108181834220886,
      "learning_rate": 3.636363636363636e-06,
      "loss": 0.0046,
      "step": 540
    },
    {
      "epoch": 0.24064171122994651,
      "eval_accuracy": 0.9888591766357422,
      "eval_loss": 0.0368054136633873,
      "eval_runtime": 531.8153,
      "eval_samples_per_second": 8.439,
      "eval_steps_per_second": 2.11,
      "step": 540
    },
    {
      "epoch": 0.24509803921568626,
      "grad_norm": 0.008579758927226067,
      "learning_rate": 0.0,
      "loss": 0.042,
      "step": 550
    },
    {
      "epoch": 0.24509803921568626,
      "eval_accuracy": 0.9890819787979126,
      "eval_loss": 0.03674088791012764,
      "eval_runtime": 537.7421,
      "eval_samples_per_second": 8.346,
      "eval_steps_per_second": 2.087,
      "step": 550
    }
  ],
  "logging_steps": 10,
  "max_steps": 550,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.326400520422712e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}
|
|