{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.24509803921568626,
  "eval_steps": 10,
  "global_step": 550,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004456327985739751,
      "grad_norm": 0.7916810512542725,
      "learning_rate": 0.0002945454545454545,
      "loss": 0.3376,
      "step": 10
    },
    {
      "epoch": 0.004456327985739751,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.6079411506652832,
      "eval_runtime": 537.6759,
      "eval_samples_per_second": 8.347,
      "eval_steps_per_second": 2.087,
      "step": 10
    },
    {
      "epoch": 0.008912655971479501,
      "grad_norm": 0.1121700331568718,
      "learning_rate": 0.00028909090909090904,
      "loss": 0.3858,
      "step": 20
    },
    {
      "epoch": 0.008912655971479501,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.4899146854877472,
      "eval_runtime": 528.4079,
      "eval_samples_per_second": 8.493,
      "eval_steps_per_second": 2.123,
      "step": 20
    },
    {
      "epoch": 0.013368983957219251,
      "grad_norm": 0.4192979633808136,
      "learning_rate": 0.0002836363636363636,
      "loss": 0.4091,
      "step": 30
    },
    {
      "epoch": 0.013368983957219251,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.4288414418697357,
      "eval_runtime": 532.0654,
      "eval_samples_per_second": 8.435,
      "eval_steps_per_second": 2.109,
      "step": 30
    },
    {
      "epoch": 0.017825311942959002,
      "grad_norm": 0.6693852543830872,
      "learning_rate": 0.00027818181818181815,
      "loss": 0.3274,
      "step": 40
    },
    {
      "epoch": 0.017825311942959002,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.3423554301261902,
      "eval_runtime": 534.016,
      "eval_samples_per_second": 8.404,
      "eval_steps_per_second": 2.101,
      "step": 40
    },
    {
      "epoch": 0.022281639928698752,
      "grad_norm": 2.928346633911133,
      "learning_rate": 0.0002727272727272727,
      "loss": 0.3434,
      "step": 50
    },
    {
      "epoch": 0.022281639928698752,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.32134178280830383,
      "eval_runtime": 540.6384,
      "eval_samples_per_second": 8.301,
      "eval_steps_per_second": 2.075,
      "step": 50
    },
    {
      "epoch": 0.026737967914438502,
      "grad_norm": 7.551202774047852,
      "learning_rate": 0.0002672727272727272,
      "loss": 0.3627,
      "step": 60
    },
    {
      "epoch": 0.026737967914438502,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.24254043400287628,
      "eval_runtime": 532.8983,
      "eval_samples_per_second": 8.422,
      "eval_steps_per_second": 2.105,
      "step": 60
    },
    {
      "epoch": 0.031194295900178252,
      "grad_norm": 5.043577671051025,
      "learning_rate": 0.0002618181818181818,
      "loss": 0.3062,
      "step": 70
    },
    {
      "epoch": 0.031194295900178252,
      "eval_accuracy": 0.8770053386688232,
      "eval_loss": 0.16930516064167023,
      "eval_runtime": 531.9662,
      "eval_samples_per_second": 8.437,
      "eval_steps_per_second": 2.109,
      "step": 70
    },
    {
      "epoch": 0.035650623885918005,
      "grad_norm": 0.5775203108787537,
      "learning_rate": 0.00025636363636363633,
      "loss": 0.2115,
      "step": 80
    },
    {
      "epoch": 0.035650623885918005,
      "eval_accuracy": 0.8687611222267151,
      "eval_loss": 0.2173369973897934,
      "eval_runtime": 536.5487,
      "eval_samples_per_second": 8.365,
      "eval_steps_per_second": 2.091,
      "step": 80
    },
    {
      "epoch": 0.040106951871657755,
      "grad_norm": 1.0835380554199219,
      "learning_rate": 0.00025090909090909086,
      "loss": 0.2757,
      "step": 90
    },
    {
      "epoch": 0.040106951871657755,
      "eval_accuracy": 0.8426916003227234,
      "eval_loss": 0.2951839566230774,
      "eval_runtime": 531.3931,
      "eval_samples_per_second": 8.446,
      "eval_steps_per_second": 2.111,
      "step": 90
    },
    {
      "epoch": 0.044563279857397504,
      "grad_norm": 0.8163406848907471,
      "learning_rate": 0.00024545454545454545,
      "loss": 0.2344,
      "step": 100
    },
    {
      "epoch": 0.044563279857397504,
      "eval_accuracy": 0.8957219123840332,
      "eval_loss": 0.17275306582450867,
      "eval_runtime": 539.8237,
      "eval_samples_per_second": 8.314,
      "eval_steps_per_second": 2.078,
      "step": 100
    },
    {
      "epoch": 0.049019607843137254,
      "grad_norm": 0.909106433391571,
      "learning_rate": 0.00023999999999999998,
      "loss": 0.3855,
      "step": 110
    },
    {
      "epoch": 0.049019607843137254,
      "eval_accuracy": 0.9086452722549438,
      "eval_loss": 0.22119583189487457,
      "eval_runtime": 532.7138,
      "eval_samples_per_second": 8.425,
      "eval_steps_per_second": 2.106,
      "step": 110
    },
    {
      "epoch": 0.053475935828877004,
      "grad_norm": 4.141552448272705,
      "learning_rate": 0.00023454545454545454,
      "loss": 0.3104,
      "step": 120
    },
    {
      "epoch": 0.053475935828877004,
      "eval_accuracy": 0.8426916003227234,
      "eval_loss": 0.2036559134721756,
      "eval_runtime": 539.9806,
      "eval_samples_per_second": 8.311,
      "eval_steps_per_second": 2.078,
      "step": 120
    },
    {
      "epoch": 0.057932263814616754,
      "grad_norm": 0.04542813077569008,
      "learning_rate": 0.00022909090909090907,
      "loss": 0.235,
      "step": 130
    },
    {
      "epoch": 0.057932263814616754,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.6018572449684143,
      "eval_runtime": 532.3019,
      "eval_samples_per_second": 8.431,
      "eval_steps_per_second": 2.108,
      "step": 130
    },
    {
      "epoch": 0.062388591800356503,
      "grad_norm": 0.20932789146900177,
      "learning_rate": 0.00022363636363636363,
      "loss": 0.383,
      "step": 140
    },
    {
      "epoch": 0.062388591800356503,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.1932043582201004,
      "eval_runtime": 534.2227,
      "eval_samples_per_second": 8.401,
      "eval_steps_per_second": 2.1,
      "step": 140
    },
    {
      "epoch": 0.06684491978609626,
      "grad_norm": 2.113372325897217,
      "learning_rate": 0.00021818181818181816,
      "loss": 0.3633,
      "step": 150
    },
    {
      "epoch": 0.06684491978609626,
      "eval_accuracy": 0.9097593426704407,
      "eval_loss": 0.27564167976379395,
      "eval_runtime": 533.5467,
      "eval_samples_per_second": 8.412,
      "eval_steps_per_second": 2.103,
      "step": 150
    },
    {
      "epoch": 0.07130124777183601,
      "grad_norm": 2.0779898166656494,
      "learning_rate": 0.00021272727272727272,
      "loss": 0.2534,
      "step": 160
    },
    {
      "epoch": 0.07130124777183601,
      "eval_accuracy": 0.9124331474304199,
      "eval_loss": 0.17877253890037537,
      "eval_runtime": 534.2313,
      "eval_samples_per_second": 8.401,
      "eval_steps_per_second": 2.1,
      "step": 160
    },
    {
      "epoch": 0.07575757575757576,
      "grad_norm": 2.8469862937927246,
      "learning_rate": 0.00020727272727272725,
      "loss": 0.1618,
      "step": 170
    },
    {
      "epoch": 0.07575757575757576,
      "eval_accuracy": 0.9001782536506653,
      "eval_loss": 0.21939000487327576,
      "eval_runtime": 539.3123,
      "eval_samples_per_second": 8.322,
      "eval_steps_per_second": 2.08,
      "step": 170
    },
    {
      "epoch": 0.08021390374331551,
      "grad_norm": 0.3544740378856659,
      "learning_rate": 0.0002018181818181818,
      "loss": 0.3204,
      "step": 180
    },
    {
      "epoch": 0.08021390374331551,
      "eval_accuracy": 0.7437611222267151,
      "eval_loss": 0.6187111735343933,
      "eval_runtime": 539.0642,
      "eval_samples_per_second": 8.326,
      "eval_steps_per_second": 2.081,
      "step": 180
    },
    {
      "epoch": 0.08467023172905526,
      "grad_norm": 0.08713312447071075,
      "learning_rate": 0.00019636363636363634,
      "loss": 0.4719,
      "step": 190
    },
    {
      "epoch": 0.08467023172905526,
      "eval_accuracy": 0.917557954788208,
      "eval_loss": 0.15933164954185486,
      "eval_runtime": 532.5075,
      "eval_samples_per_second": 8.428,
      "eval_steps_per_second": 2.107,
      "step": 190
    },
    {
      "epoch": 0.08912655971479501,
      "grad_norm": 0.10667263716459274,
      "learning_rate": 0.0001909090909090909,
      "loss": 0.2124,
      "step": 200
    },
    {
      "epoch": 0.08912655971479501,
      "eval_accuracy": 0.874331533908844,
      "eval_loss": 0.2731720209121704,
      "eval_runtime": 532.8068,
      "eval_samples_per_second": 8.423,
      "eval_steps_per_second": 2.106,
      "step": 200
    },
    {
      "epoch": 0.09358288770053476,
      "grad_norm": 0.09285986423492432,
      "learning_rate": 0.00018545454545454543,
      "loss": 0.1807,
      "step": 210
    },
    {
      "epoch": 0.09358288770053476,
      "eval_accuracy": 0.9458556175231934,
      "eval_loss": 0.15097761154174805,
      "eval_runtime": 534.5302,
      "eval_samples_per_second": 8.396,
      "eval_steps_per_second": 2.099,
      "step": 210
    },
    {
      "epoch": 0.09803921568627451,
      "grad_norm": 0.21800605952739716,
      "learning_rate": 0.00017999999999999998,
      "loss": 0.241,
      "step": 220
    },
    {
      "epoch": 0.09803921568627451,
      "eval_accuracy": 0.9117646813392639,
      "eval_loss": 0.20164452493190765,
      "eval_runtime": 534.0184,
      "eval_samples_per_second": 8.404,
      "eval_steps_per_second": 2.101,
      "step": 220
    },
    {
      "epoch": 0.10249554367201426,
      "grad_norm": 2.175548553466797,
      "learning_rate": 0.00017454545454545452,
      "loss": 0.3668,
      "step": 230
    },
    {
      "epoch": 0.10249554367201426,
      "eval_accuracy": 0.9079768061637878,
      "eval_loss": 0.22817277908325195,
      "eval_runtime": 533.4214,
      "eval_samples_per_second": 8.414,
      "eval_steps_per_second": 2.103,
      "step": 230
    },
    {
      "epoch": 0.10695187165775401,
      "grad_norm": 2.417444944381714,
      "learning_rate": 0.00016909090909090907,
      "loss": 0.2424,
      "step": 240
    },
    {
      "epoch": 0.10695187165775401,
      "eval_accuracy": 0.9364973306655884,
      "eval_loss": 0.1917591691017151,
      "eval_runtime": 539.0061,
      "eval_samples_per_second": 8.326,
      "eval_steps_per_second": 2.082,
      "step": 240
    },
    {
      "epoch": 0.11140819964349376,
      "grad_norm": 1.2514077425003052,
      "learning_rate": 0.0001636363636363636,
      "loss": 0.1934,
      "step": 250
    },
    {
      "epoch": 0.11140819964349376,
      "eval_accuracy": 0.934937596321106,
      "eval_loss": 0.14482024312019348,
      "eval_runtime": 535.7699,
      "eval_samples_per_second": 8.377,
      "eval_steps_per_second": 2.094,
      "step": 250
    },
    {
      "epoch": 0.11586452762923351,
      "grad_norm": 7.08188533782959,
      "learning_rate": 0.00015818181818181816,
      "loss": 0.3554,
      "step": 260
    },
    {
      "epoch": 0.11586452762923351,
      "eval_accuracy": 0.9601158499717712,
      "eval_loss": 0.09824846684932709,
      "eval_runtime": 532.0513,
      "eval_samples_per_second": 8.435,
      "eval_steps_per_second": 2.109,
      "step": 260
    },
    {
      "epoch": 0.12032085561497326,
      "grad_norm": 1.4493603706359863,
      "learning_rate": 0.0001527272727272727,
      "loss": 0.1619,
      "step": 270
    },
    {
      "epoch": 0.12032085561497326,
      "eval_accuracy": 0.9569964408874512,
      "eval_loss": 0.11313515901565552,
      "eval_runtime": 531.5383,
      "eval_samples_per_second": 8.443,
      "eval_steps_per_second": 2.111,
      "step": 270
    },
    {
      "epoch": 0.12477718360071301,
      "grad_norm": 0.9487653374671936,
      "learning_rate": 0.00014727272727272725,
      "loss": 0.0925,
      "step": 280
    },
    {
      "epoch": 0.12477718360071301,
      "eval_accuracy": 0.9748217463493347,
      "eval_loss": 0.06883005797863007,
      "eval_runtime": 533.9256,
      "eval_samples_per_second": 8.406,
      "eval_steps_per_second": 2.101,
      "step": 280
    },
    {
      "epoch": 0.12923351158645277,
      "grad_norm": 0.0604180209338665,
      "learning_rate": 0.0001418181818181818,
      "loss": 0.1223,
      "step": 290
    },
    {
      "epoch": 0.12923351158645277,
      "eval_accuracy": 0.9766042828559875,
      "eval_loss": 0.056954748928546906,
      "eval_runtime": 539.6876,
      "eval_samples_per_second": 8.316,
      "eval_steps_per_second": 2.079,
      "step": 290
    },
    {
      "epoch": 0.13368983957219252,
      "grad_norm": 2.264599561691284,
      "learning_rate": 0.00013636363636363634,
      "loss": 0.1144,
      "step": 300
    },
    {
      "epoch": 0.13368983957219252,
      "eval_accuracy": 0.9741532802581787,
      "eval_loss": 0.06963187456130981,
      "eval_runtime": 535.6675,
      "eval_samples_per_second": 8.378,
      "eval_steps_per_second": 2.095,
      "step": 300
    },
    {
      "epoch": 0.13814616755793227,
      "grad_norm": 3.3151047229766846,
      "learning_rate": 0.0001309090909090909,
      "loss": 0.0702,
      "step": 310
    },
    {
      "epoch": 0.13814616755793227,
      "eval_accuracy": 0.8979501128196716,
      "eval_loss": 0.3257082402706146,
      "eval_runtime": 539.7299,
      "eval_samples_per_second": 8.315,
      "eval_steps_per_second": 2.079,
      "step": 310
    },
    {
      "epoch": 0.14260249554367202,
      "grad_norm": 0.20427709817886353,
      "learning_rate": 0.00012545454545454543,
      "loss": 0.1778,
      "step": 320
    },
    {
      "epoch": 0.14260249554367202,
      "eval_accuracy": 0.9463012218475342,
      "eval_loss": 0.15733219683170319,
      "eval_runtime": 532.2728,
      "eval_samples_per_second": 8.432,
      "eval_steps_per_second": 2.108,
      "step": 320
    },
    {
      "epoch": 0.14705882352941177,
      "grad_norm": 0.29525595903396606,
      "learning_rate": 0.00011999999999999999,
      "loss": 0.1151,
      "step": 330
    },
    {
      "epoch": 0.14705882352941177,
      "eval_accuracy": 0.9881907105445862,
      "eval_loss": 0.04204181954264641,
      "eval_runtime": 532.9583,
      "eval_samples_per_second": 8.421,
      "eval_steps_per_second": 2.105,
      "step": 330
    },
    {
      "epoch": 0.15151515151515152,
      "grad_norm": 0.05631331354379654,
      "learning_rate": 0.00011454545454545453,
      "loss": 0.0784,
      "step": 340
    },
    {
      "epoch": 0.15151515151515152,
      "eval_accuracy": 0.9870766401290894,
      "eval_loss": 0.041727881878614426,
      "eval_runtime": 533.1617,
      "eval_samples_per_second": 8.418,
      "eval_steps_per_second": 2.104,
      "step": 340
    },
    {
      "epoch": 0.15597147950089127,
      "grad_norm": 1.579466700553894,
      "learning_rate": 0.00010909090909090908,
      "loss": 0.1782,
      "step": 350
    },
    {
      "epoch": 0.15597147950089127,
      "eval_accuracy": 0.981951892375946,
      "eval_loss": 0.04734932258725166,
      "eval_runtime": 530.6175,
      "eval_samples_per_second": 8.458,
      "eval_steps_per_second": 2.115,
      "step": 350
    },
    {
      "epoch": 0.16042780748663102,
      "grad_norm": 0.4887264370918274,
      "learning_rate": 0.00010363636363636362,
      "loss": 0.0328,
      "step": 360
    },
    {
      "epoch": 0.16042780748663102,
      "eval_accuracy": 0.9826202988624573,
      "eval_loss": 0.0771423876285553,
      "eval_runtime": 539.3929,
      "eval_samples_per_second": 8.32,
      "eval_steps_per_second": 2.08,
      "step": 360
    },
    {
      "epoch": 0.16488413547237077,
      "grad_norm": 0.05755012482404709,
      "learning_rate": 9.818181818181817e-05,
      "loss": 0.1689,
      "step": 370
    },
    {
      "epoch": 0.16488413547237077,
      "eval_accuracy": 0.9770498871803284,
      "eval_loss": 0.049294665455818176,
      "eval_runtime": 534.2657,
      "eval_samples_per_second": 8.4,
      "eval_steps_per_second": 2.1,
      "step": 370
    },
    {
      "epoch": 0.16934046345811052,
      "grad_norm": 0.2694285809993744,
      "learning_rate": 9.272727272727271e-05,
      "loss": 0.0219,
      "step": 380
    },
    {
      "epoch": 0.16934046345811052,
      "eval_accuracy": 0.9402852058410645,
      "eval_loss": 0.14428526163101196,
      "eval_runtime": 531.3442,
      "eval_samples_per_second": 8.447,
      "eval_steps_per_second": 2.112,
      "step": 380
    },
    {
      "epoch": 0.17379679144385027,
      "grad_norm": 0.12420541048049927,
      "learning_rate": 8.727272727272726e-05,
      "loss": 0.0951,
      "step": 390
    },
    {
      "epoch": 0.17379679144385027,
      "eval_accuracy": 0.9977718591690063,
      "eval_loss": 0.012917960062623024,
      "eval_runtime": 535.2048,
      "eval_samples_per_second": 8.386,
      "eval_steps_per_second": 2.096,
      "step": 390
    },
    {
      "epoch": 0.17825311942959002,
      "grad_norm": 0.25717079639434814,
      "learning_rate": 8.18181818181818e-05,
      "loss": 0.0244,
      "step": 400
    },
    {
      "epoch": 0.17825311942959002,
      "eval_accuracy": 0.9968805909156799,
      "eval_loss": 0.010928132571280003,
      "eval_runtime": 531.9808,
      "eval_samples_per_second": 8.436,
      "eval_steps_per_second": 2.109,
      "step": 400
    },
    {
      "epoch": 0.18270944741532977,
      "grad_norm": 8.522324562072754,
      "learning_rate": 7.636363636363635e-05,
      "loss": 0.0142,
      "step": 410
    },
    {
      "epoch": 0.18270944741532977,
      "eval_accuracy": 0.9942067861557007,
      "eval_loss": 0.01725505292415619,
      "eval_runtime": 536.5062,
      "eval_samples_per_second": 8.365,
      "eval_steps_per_second": 2.091,
      "step": 410
    },
    {
      "epoch": 0.18716577540106952,
      "grad_norm": 0.08865547925233841,
      "learning_rate": 7.09090909090909e-05,
      "loss": 0.0702,
      "step": 420
    },
    {
      "epoch": 0.18716577540106952,
      "eval_accuracy": 0.9772727489471436,
      "eval_loss": 0.060311682522296906,
      "eval_runtime": 532.5572,
      "eval_samples_per_second": 8.427,
      "eval_steps_per_second": 2.107,
      "step": 420
    },
    {
      "epoch": 0.19162210338680927,
      "grad_norm": 0.02239236794412136,
      "learning_rate": 6.545454545454545e-05,
      "loss": 0.0209,
      "step": 430
    },
    {
      "epoch": 0.19162210338680927,
      "eval_accuracy": 0.9904189109802246,
      "eval_loss": 0.02353021316230297,
      "eval_runtime": 538.0122,
      "eval_samples_per_second": 8.342,
      "eval_steps_per_second": 2.085,
      "step": 430
    },
    {
      "epoch": 0.19607843137254902,
      "grad_norm": 14.454584121704102,
      "learning_rate": 5.9999999999999995e-05,
      "loss": 0.0914,
      "step": 440
    },
    {
      "epoch": 0.19607843137254902,
      "eval_accuracy": 0.9939839839935303,
      "eval_loss": 0.01657612808048725,
      "eval_runtime": 532.1826,
      "eval_samples_per_second": 8.433,
      "eval_steps_per_second": 2.108,
      "step": 440
    },
    {
      "epoch": 0.20053475935828877,
      "grad_norm": 0.065692238509655,
      "learning_rate": 5.454545454545454e-05,
      "loss": 0.0645,
      "step": 450
    },
    {
      "epoch": 0.20053475935828877,
      "eval_accuracy": 0.9886363744735718,
      "eval_loss": 0.029221026226878166,
      "eval_runtime": 532.6751,
      "eval_samples_per_second": 8.425,
      "eval_steps_per_second": 2.106,
      "step": 450
    },
    {
      "epoch": 0.20499108734402852,
      "grad_norm": 0.03740346431732178,
      "learning_rate": 4.9090909090909084e-05,
      "loss": 0.0027,
      "step": 460
    },
    {
      "epoch": 0.20499108734402852,
      "eval_accuracy": 0.9848484992980957,
      "eval_loss": 0.04155493900179863,
      "eval_runtime": 531.2072,
      "eval_samples_per_second": 8.449,
      "eval_steps_per_second": 2.112,
      "step": 460
    },
    {
      "epoch": 0.20944741532976827,
      "grad_norm": 0.31382623314857483,
      "learning_rate": 4.363636363636363e-05,
      "loss": 0.1536,
      "step": 470
    },
    {
      "epoch": 0.20944741532976827,
      "eval_accuracy": 0.9879679083824158,
      "eval_loss": 0.03183320537209511,
      "eval_runtime": 531.0891,
      "eval_samples_per_second": 8.451,
      "eval_steps_per_second": 2.113,
      "step": 470
    },
    {
      "epoch": 0.21390374331550802,
      "grad_norm": 0.028712157160043716,
      "learning_rate": 3.8181818181818174e-05,
      "loss": 0.086,
      "step": 480
    },
    {
      "epoch": 0.21390374331550802,
      "eval_accuracy": 0.987522304058075,
      "eval_loss": 0.032387156039476395,
      "eval_runtime": 538.4057,
      "eval_samples_per_second": 8.336,
      "eval_steps_per_second": 2.084,
      "step": 480
    },
    {
      "epoch": 0.21836007130124777,
      "grad_norm": 0.016576524823904037,
      "learning_rate": 3.2727272727272725e-05,
      "loss": 0.0223,
      "step": 490
    },
    {
      "epoch": 0.21836007130124777,
      "eval_accuracy": 0.9955436587333679,
      "eval_loss": 0.011954777874052525,
      "eval_runtime": 530.6249,
      "eval_samples_per_second": 8.458,
      "eval_steps_per_second": 2.114,
      "step": 490
    },
    {
      "epoch": 0.22281639928698752,
      "grad_norm": 0.014118828810751438,
      "learning_rate": 2.727272727272727e-05,
      "loss": 0.0235,
      "step": 500
    },
    {
      "epoch": 0.22281639928698752,
      "eval_accuracy": 0.9962121248245239,
      "eval_loss": 0.010361210443079472,
      "eval_runtime": 539.7181,
      "eval_samples_per_second": 8.315,
      "eval_steps_per_second": 2.079,
      "step": 500
    },
    {
      "epoch": 0.22727272727272727,
      "grad_norm": 0.027222217991948128,
      "learning_rate": 2.1818181818181814e-05,
      "loss": 0.0025,
      "step": 510
    },
    {
      "epoch": 0.22727272727272727,
      "eval_accuracy": 0.9966577291488647,
      "eval_loss": 0.00987223070114851,
      "eval_runtime": 530.4418,
      "eval_samples_per_second": 8.461,
      "eval_steps_per_second": 2.115,
      "step": 510
    },
    {
      "epoch": 0.23172905525846701,
      "grad_norm": 9.495808601379395,
      "learning_rate": 1.6363636363636363e-05,
      "loss": 0.0254,
      "step": 520
    },
    {
      "epoch": 0.23172905525846701,
      "eval_accuracy": 0.9962121248245239,
      "eval_loss": 0.010062512941658497,
      "eval_runtime": 532.4144,
      "eval_samples_per_second": 8.43,
      "eval_steps_per_second": 2.107,
      "step": 520
    },
    {
      "epoch": 0.23618538324420676,
      "grad_norm": 0.02721397951245308,
      "learning_rate": 1.0909090909090907e-05,
      "loss": 0.0741,
      "step": 530
    },
    {
      "epoch": 0.23618538324420676,
      "eval_accuracy": 0.9944295883178711,
      "eval_loss": 0.014333564788103104,
      "eval_runtime": 533.0522,
      "eval_samples_per_second": 8.419,
      "eval_steps_per_second": 2.105,
      "step": 530
    },
    {
      "epoch": 0.24064171122994651,
      "grad_norm": 0.20209485292434692,
      "learning_rate": 5.454545454545454e-06,
      "loss": 0.0305,
      "step": 540
    },
    {
      "epoch": 0.24064171122994651,
      "eval_accuracy": 0.9935383200645447,
      "eval_loss": 0.016204852610826492,
      "eval_runtime": 531.1995,
      "eval_samples_per_second": 8.449,
      "eval_steps_per_second": 2.112,
      "step": 540
    },
    {
      "epoch": 0.24509803921568626,
      "grad_norm": 7.379052639007568,
      "learning_rate": 0.0,
      "loss": 0.007,
      "step": 550
    },
    {
      "epoch": 0.24509803921568626,
      "eval_accuracy": 0.9935383200645447,
      "eval_loss": 0.016523057594895363,
      "eval_runtime": 537.9402,
      "eval_samples_per_second": 8.343,
      "eval_steps_per_second": 2.086,
      "step": 550
    }
  ],
  "logging_steps": 10,
  "max_steps": 550,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.326400520422712e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}