{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.24064171122994651,
  "eval_steps": 10,
  "global_step": 540,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004456327985739751,
      "grad_norm": 1.5512940883636475,
      "learning_rate": 9.818181818181818e-05,
      "loss": 0.4724,
      "step": 10
    },
    {
      "epoch": 0.004456327985739751,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.34007999300956726,
      "eval_runtime": 549.9948,
      "eval_samples_per_second": 8.16,
      "eval_steps_per_second": 2.04,
      "step": 10
    },
    {
      "epoch": 0.008912655971479501,
      "grad_norm": 1.4623041152954102,
      "learning_rate": 9.636363636363637e-05,
      "loss": 0.2715,
      "step": 20
    },
    {
      "epoch": 0.008912655971479501,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.39523470401763916,
      "eval_runtime": 543.4137,
      "eval_samples_per_second": 8.259,
      "eval_steps_per_second": 2.065,
      "step": 20
    },
    {
      "epoch": 0.013368983957219251,
      "grad_norm": 0.6407843828201294,
      "learning_rate": 9.454545454545455e-05,
      "loss": 0.3566,
      "step": 30
    },
    {
      "epoch": 0.013368983957219251,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.38253363966941833,
      "eval_runtime": 536.8913,
      "eval_samples_per_second": 8.359,
      "eval_steps_per_second": 2.09,
      "step": 30
    },
    {
      "epoch": 0.017825311942959002,
      "grad_norm": 1.9206137657165527,
      "learning_rate": 9.272727272727273e-05,
      "loss": 0.2892,
      "step": 40
    },
    {
      "epoch": 0.017825311942959002,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.36841902136802673,
      "eval_runtime": 539.4459,
      "eval_samples_per_second": 8.32,
      "eval_steps_per_second": 2.08,
      "step": 40
    },
    {
      "epoch": 0.022281639928698752,
      "grad_norm": 1.0926023721694946,
      "learning_rate": 9.090909090909092e-05,
      "loss": 0.3157,
      "step": 50
    },
    {
      "epoch": 0.022281639928698752,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.3572486639022827,
      "eval_runtime": 542.4121,
      "eval_samples_per_second": 8.274,
      "eval_steps_per_second": 2.069,
      "step": 50
    },
    {
      "epoch": 0.026737967914438502,
      "grad_norm": 3.161236524581909,
      "learning_rate": 8.90909090909091e-05,
      "loss": 0.3792,
      "step": 60
    },
    {
      "epoch": 0.026737967914438502,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.3476051688194275,
      "eval_runtime": 538.7398,
      "eval_samples_per_second": 8.331,
      "eval_steps_per_second": 2.083,
      "step": 60
    },
    {
      "epoch": 0.031194295900178252,
      "grad_norm": 1.0513850450515747,
      "learning_rate": 8.727272727272727e-05,
      "loss": 0.3938,
      "step": 70
    },
    {
      "epoch": 0.031194295900178252,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.317058265209198,
      "eval_runtime": 540.0617,
      "eval_samples_per_second": 8.31,
      "eval_steps_per_second": 2.078,
      "step": 70
    },
    {
      "epoch": 0.035650623885918005,
      "grad_norm": 5.6621479988098145,
      "learning_rate": 8.545454545454545e-05,
      "loss": 0.3962,
      "step": 80
    },
    {
      "epoch": 0.035650623885918005,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.2908115088939667,
      "eval_runtime": 541.5124,
      "eval_samples_per_second": 8.288,
      "eval_steps_per_second": 2.072,
      "step": 80
    },
    {
      "epoch": 0.040106951871657755,
      "grad_norm": 11.542706489562988,
      "learning_rate": 8.363636363636364e-05,
      "loss": 0.3536,
      "step": 90
    },
    {
      "epoch": 0.040106951871657755,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.2727506160736084,
      "eval_runtime": 542.0266,
      "eval_samples_per_second": 8.28,
      "eval_steps_per_second": 2.07,
      "step": 90
    },
    {
      "epoch": 0.044563279857397504,
      "grad_norm": 0.7968679070472717,
      "learning_rate": 8.181818181818183e-05,
      "loss": 0.2338,
      "step": 100
    },
    {
      "epoch": 0.044563279857397504,
      "eval_accuracy": 0.898172914981842,
      "eval_loss": 0.21359723806381226,
      "eval_runtime": 550.439,
      "eval_samples_per_second": 8.153,
      "eval_steps_per_second": 2.038,
      "step": 100
    },
    {
      "epoch": 0.049019607843137254,
      "grad_norm": 13.235169410705566,
      "learning_rate": 8e-05,
      "loss": 0.2591,
      "step": 110
    },
    {
      "epoch": 0.049019607843137254,
      "eval_accuracy": 0.9035205245018005,
      "eval_loss": 0.17823095619678497,
      "eval_runtime": 543.3444,
      "eval_samples_per_second": 8.26,
      "eval_steps_per_second": 2.065,
      "step": 110
    },
    {
      "epoch": 0.053475935828877004,
      "grad_norm": 3.1415486335754395,
      "learning_rate": 7.818181818181818e-05,
      "loss": 0.261,
      "step": 120
    },
    {
      "epoch": 0.053475935828877004,
      "eval_accuracy": 0.8725489974021912,
      "eval_loss": 0.23705999553203583,
      "eval_runtime": 550.4034,
      "eval_samples_per_second": 8.154,
      "eval_steps_per_second": 2.039,
      "step": 120
    },
    {
      "epoch": 0.057932263814616754,
      "grad_norm": 0.12994514405727386,
      "learning_rate": 7.636363636363637e-05,
      "loss": 0.2626,
      "step": 130
    },
    {
      "epoch": 0.057932263814616754,
      "eval_accuracy": 0.89683598279953,
      "eval_loss": 0.4889169931411743,
      "eval_runtime": 545.9677,
      "eval_samples_per_second": 8.22,
      "eval_steps_per_second": 2.055,
      "step": 130
    },
    {
      "epoch": 0.062388591800356503,
      "grad_norm": 0.20874539017677307,
      "learning_rate": 7.454545454545455e-05,
      "loss": 0.3156,
      "step": 140
    },
    {
      "epoch": 0.062388591800356503,
      "eval_accuracy": 0.9021835923194885,
      "eval_loss": 0.21060487627983093,
      "eval_runtime": 543.8063,
      "eval_samples_per_second": 8.253,
      "eval_steps_per_second": 2.063,
      "step": 140
    },
    {
      "epoch": 0.06684491978609626,
      "grad_norm": 1.1314120292663574,
      "learning_rate": 7.272727272727273e-05,
      "loss": 0.3342,
      "step": 150
    },
    {
      "epoch": 0.06684491978609626,
      "eval_accuracy": 0.9217914342880249,
      "eval_loss": 0.19053570926189423,
      "eval_runtime": 543.9184,
      "eval_samples_per_second": 8.251,
      "eval_steps_per_second": 2.063,
      "step": 150
    },
    {
      "epoch": 0.07130124777183601,
      "grad_norm": 5.4050092697143555,
      "learning_rate": 7.090909090909092e-05,
      "loss": 0.2658,
      "step": 160
    },
    {
      "epoch": 0.07130124777183601,
      "eval_accuracy": 0.9498662948608398,
      "eval_loss": 0.12423694878816605,
      "eval_runtime": 542.375,
      "eval_samples_per_second": 8.275,
      "eval_steps_per_second": 2.069,
      "step": 160
    },
    {
      "epoch": 0.07575757575757576,
      "grad_norm": 6.958705425262451,
      "learning_rate": 6.90909090909091e-05,
      "loss": 0.2162,
      "step": 170
    },
    {
      "epoch": 0.07575757575757576,
      "eval_accuracy": 0.9211229681968689,
      "eval_loss": 0.1585531085729599,
      "eval_runtime": 550.3192,
      "eval_samples_per_second": 8.155,
      "eval_steps_per_second": 2.039,
      "step": 170
    },
    {
      "epoch": 0.08021390374331551,
      "grad_norm": 6.347085475921631,
      "learning_rate": 6.727272727272727e-05,
      "loss": 0.2457,
      "step": 180
    },
    {
      "epoch": 0.08021390374331551,
      "eval_accuracy": 0.8422459959983826,
      "eval_loss": 0.31698858737945557,
      "eval_runtime": 550.4639,
      "eval_samples_per_second": 8.153,
      "eval_steps_per_second": 2.038,
      "step": 180
    },
    {
      "epoch": 0.08467023172905526,
      "grad_norm": 0.05648183450102806,
      "learning_rate": 6.545454545454546e-05,
      "loss": 0.178,
      "step": 190
    },
    {
      "epoch": 0.08467023172905526,
      "eval_accuracy": 0.9663547277450562,
      "eval_loss": 0.08927226811647415,
      "eval_runtime": 548.6704,
      "eval_samples_per_second": 8.18,
      "eval_steps_per_second": 2.045,
      "step": 190
    },
    {
      "epoch": 0.08912655971479501,
      "grad_norm": 0.10795829445123672,
      "learning_rate": 6.363636363636364e-05,
      "loss": 0.2864,
      "step": 200
    },
    {
      "epoch": 0.08912655971479501,
      "eval_accuracy": 0.9079768061637878,
      "eval_loss": 0.1990886628627777,
      "eval_runtime": 547.327,
      "eval_samples_per_second": 8.2,
      "eval_steps_per_second": 2.05,
      "step": 200
    },
    {
      "epoch": 0.09358288770053476,
      "grad_norm": 0.10935252159833908,
      "learning_rate": 6.181818181818182e-05,
      "loss": 0.0852,
      "step": 210
    },
    {
      "epoch": 0.09358288770053476,
      "eval_accuracy": 0.9777183532714844,
      "eval_loss": 0.06789236515760422,
      "eval_runtime": 546.4384,
      "eval_samples_per_second": 8.213,
      "eval_steps_per_second": 2.053,
      "step": 210
    },
    {
      "epoch": 0.09803921568627451,
      "grad_norm": 0.724420428276062,
      "learning_rate": 6e-05,
      "loss": 0.2227,
      "step": 220
    },
    {
      "epoch": 0.09803921568627451,
      "eval_accuracy": 0.9676916003227234,
      "eval_loss": 0.08267948776483536,
      "eval_runtime": 543.5229,
      "eval_samples_per_second": 8.257,
      "eval_steps_per_second": 2.064,
      "step": 220
    },
    {
      "epoch": 0.10249554367201426,
      "grad_norm": 3.5009875297546387,
      "learning_rate": 5.818181818181818e-05,
      "loss": 0.0894,
      "step": 230
    },
    {
      "epoch": 0.10249554367201426,
      "eval_accuracy": 0.9799465537071228,
      "eval_loss": 0.05659499019384384,
      "eval_runtime": 547.0317,
      "eval_samples_per_second": 8.204,
      "eval_steps_per_second": 2.051,
      "step": 230
    },
    {
      "epoch": 0.10695187165775401,
      "grad_norm": 13.977231979370117,
      "learning_rate": 5.636363636363636e-05,
      "loss": 0.1766,
      "step": 240
    },
    {
      "epoch": 0.10695187165775401,
      "eval_accuracy": 0.9275846481323242,
      "eval_loss": 0.17180828750133514,
      "eval_runtime": 554.5064,
      "eval_samples_per_second": 8.094,
      "eval_steps_per_second": 2.023,
      "step": 240
    },
    {
      "epoch": 0.11140819964349376,
      "grad_norm": 1.5852386951446533,
      "learning_rate": 5.4545454545454546e-05,
      "loss": 0.1133,
      "step": 250
    },
    {
      "epoch": 0.11140819964349376,
      "eval_accuracy": 0.9625668525695801,
      "eval_loss": 0.09958070516586304,
      "eval_runtime": 546.2195,
      "eval_samples_per_second": 8.216,
      "eval_steps_per_second": 2.054,
      "step": 250
    },
    {
      "epoch": 0.11586452762923351,
      "grad_norm": 4.984052658081055,
      "learning_rate": 5.272727272727272e-05,
      "loss": 0.1581,
      "step": 260
    },
    {
      "epoch": 0.11586452762923351,
      "eval_accuracy": 0.9616755843162537,
      "eval_loss": 0.09552862495183945,
      "eval_runtime": 542.5097,
      "eval_samples_per_second": 8.273,
      "eval_steps_per_second": 2.068,
      "step": 260
    },
    {
      "epoch": 0.12032085561497326,
      "grad_norm": 0.06418484449386597,
      "learning_rate": 5.090909090909091e-05,
      "loss": 0.1164,
      "step": 270
    },
    {
      "epoch": 0.12032085561497326,
      "eval_accuracy": 0.9962121248245239,
      "eval_loss": 0.017139658331871033,
      "eval_runtime": 545.1464,
      "eval_samples_per_second": 8.233,
      "eval_steps_per_second": 2.058,
      "step": 270
    },
    {
      "epoch": 0.12477718360071301,
      "grad_norm": 1.2460925579071045,
      "learning_rate": 4.909090909090909e-05,
      "loss": 0.0199,
      "step": 280
    },
    {
      "epoch": 0.12477718360071301,
      "eval_accuracy": 0.9884135723114014,
      "eval_loss": 0.04466737061738968,
      "eval_runtime": 545.2406,
      "eval_samples_per_second": 8.231,
      "eval_steps_per_second": 2.058,
      "step": 280
    },
    {
      "epoch": 0.12923351158645277,
      "grad_norm": 0.021525170654058456,
      "learning_rate": 4.7272727272727275e-05,
      "loss": 0.0358,
      "step": 290
    },
    {
      "epoch": 0.12923351158645277,
      "eval_accuracy": 0.9625668525695801,
      "eval_loss": 0.08889108896255493,
      "eval_runtime": 551.6453,
      "eval_samples_per_second": 8.136,
      "eval_steps_per_second": 2.034,
      "step": 290
    },
    {
      "epoch": 0.13368983957219252,
      "grad_norm": 10.565823554992676,
      "learning_rate": 4.545454545454546e-05,
      "loss": 0.0134,
      "step": 300
    },
    {
      "epoch": 0.13368983957219252,
      "eval_accuracy": 0.9607843160629272,
      "eval_loss": 0.11075662821531296,
      "eval_runtime": 546.8222,
      "eval_samples_per_second": 8.207,
      "eval_steps_per_second": 2.052,
      "step": 300
    },
    {
      "epoch": 0.13814616755793227,
      "grad_norm": 2.769604444503784,
      "learning_rate": 4.3636363636363636e-05,
      "loss": 0.0085,
      "step": 310
    },
    {
      "epoch": 0.13814616755793227,
      "eval_accuracy": 0.9204545617103577,
      "eval_loss": 0.24255433678627014,
      "eval_runtime": 551.4065,
      "eval_samples_per_second": 8.139,
      "eval_steps_per_second": 2.035,
      "step": 310
    },
    {
      "epoch": 0.14260249554367202,
      "grad_norm": 0.027551617473363876,
      "learning_rate": 4.181818181818182e-05,
      "loss": 0.0691,
      "step": 320
    },
    {
      "epoch": 0.14260249554367202,
      "eval_accuracy": 0.9496434926986694,
      "eval_loss": 0.15619446337223053,
      "eval_runtime": 545.5513,
      "eval_samples_per_second": 8.227,
      "eval_steps_per_second": 2.057,
      "step": 320
    },
    {
      "epoch": 0.14705882352941177,
      "grad_norm": 0.010140771977603436,
      "learning_rate": 4e-05,
      "loss": 0.2242,
      "step": 330
    },
    {
      "epoch": 0.14705882352941177,
      "eval_accuracy": 0.9968805909156799,
      "eval_loss": 0.01314464956521988,
      "eval_runtime": 545.9537,
      "eval_samples_per_second": 8.22,
      "eval_steps_per_second": 2.055,
      "step": 330
    },
    {
      "epoch": 0.15151515151515152,
      "grad_norm": 0.011543634347617626,
      "learning_rate": 3.818181818181819e-05,
      "loss": 0.1593,
      "step": 340
    },
    {
      "epoch": 0.15151515151515152,
      "eval_accuracy": 0.9890819787979126,
      "eval_loss": 0.03110310062766075,
      "eval_runtime": 547.8599,
      "eval_samples_per_second": 8.192,
      "eval_steps_per_second": 2.048,
      "step": 340
    },
    {
      "epoch": 0.15597147950089127,
      "grad_norm": 0.06019105017185211,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 0.0065,
      "step": 350
    },
    {
      "epoch": 0.15597147950089127,
      "eval_accuracy": 0.9643493890762329,
      "eval_loss": 0.11227227747440338,
      "eval_runtime": 546.9694,
      "eval_samples_per_second": 8.205,
      "eval_steps_per_second": 2.051,
      "step": 350
    },
    {
      "epoch": 0.16042780748663102,
      "grad_norm": 0.028353577479720116,
      "learning_rate": 3.454545454545455e-05,
      "loss": 0.0626,
      "step": 360
    },
    {
      "epoch": 0.16042780748663102,
      "eval_accuracy": 0.9817290306091309,
      "eval_loss": 0.061965711414813995,
      "eval_runtime": 552.3776,
      "eval_samples_per_second": 8.125,
      "eval_steps_per_second": 2.031,
      "step": 360
    },
    {
      "epoch": 0.16488413547237077,
      "grad_norm": 0.4727762043476105,
      "learning_rate": 3.272727272727273e-05,
      "loss": 0.0281,
      "step": 370
    },
    {
      "epoch": 0.16488413547237077,
      "eval_accuracy": 0.9596702456474304,
      "eval_loss": 0.12898869812488556,
      "eval_runtime": 546.7154,
      "eval_samples_per_second": 8.209,
      "eval_steps_per_second": 2.052,
      "step": 370
    },
    {
      "epoch": 0.16934046345811052,
      "grad_norm": 0.013356081210076809,
      "learning_rate": 3.090909090909091e-05,
      "loss": 0.0189,
      "step": 380
    },
    {
      "epoch": 0.16934046345811052,
      "eval_accuracy": 0.991310179233551,
      "eval_loss": 0.02557438611984253,
      "eval_runtime": 546.8803,
      "eval_samples_per_second": 8.207,
      "eval_steps_per_second": 2.052,
      "step": 380
    },
    {
      "epoch": 0.17379679144385027,
      "grad_norm": 0.056645121425390244,
      "learning_rate": 2.909090909090909e-05,
      "loss": 0.1307,
      "step": 390
    },
    {
      "epoch": 0.17379679144385027,
      "eval_accuracy": 0.9783868193626404,
      "eval_loss": 0.07275046408176422,
      "eval_runtime": 548.2067,
      "eval_samples_per_second": 8.187,
      "eval_steps_per_second": 2.047,
      "step": 390
    },
    {
      "epoch": 0.17825311942959002,
      "grad_norm": 0.0406530387699604,
      "learning_rate": 2.7272727272727273e-05,
      "loss": 0.0061,
      "step": 400
    },
    {
      "epoch": 0.17825311942959002,
      "eval_accuracy": 0.9471924901008606,
      "eval_loss": 0.17399340867996216,
      "eval_runtime": 542.8312,
      "eval_samples_per_second": 8.268,
      "eval_steps_per_second": 2.067,
      "step": 400
    },
    {
      "epoch": 0.18270944741532977,
      "grad_norm": 0.07212503999471664,
      "learning_rate": 2.5454545454545454e-05,
      "loss": 0.0739,
      "step": 410
    },
    {
      "epoch": 0.18270944741532977,
      "eval_accuracy": 0.9500890970230103,
      "eval_loss": 0.16757053136825562,
      "eval_runtime": 542.9814,
      "eval_samples_per_second": 8.265,
      "eval_steps_per_second": 2.066,
      "step": 410
    },
    {
      "epoch": 0.18716577540106952,
      "grad_norm": 0.10422785580158234,
      "learning_rate": 2.3636363636363637e-05,
      "loss": 0.0028,
      "step": 420
    },
    {
      "epoch": 0.18716577540106952,
      "eval_accuracy": 0.9783868193626404,
      "eval_loss": 0.07298260927200317,
      "eval_runtime": 539.9794,
      "eval_samples_per_second": 8.311,
      "eval_steps_per_second": 2.078,
      "step": 420
    },
    {
      "epoch": 0.19162210338680927,
      "grad_norm": 0.007547458633780479,
      "learning_rate": 2.1818181818181818e-05,
      "loss": 0.0011,
      "step": 430
    },
    {
      "epoch": 0.19162210338680927,
      "eval_accuracy": 0.977495551109314,
      "eval_loss": 0.07658497989177704,
      "eval_runtime": 549.3337,
      "eval_samples_per_second": 8.17,
      "eval_steps_per_second": 2.042,
      "step": 430
    },
    {
      "epoch": 0.19607843137254902,
      "grad_norm": 0.007210019510239363,
      "learning_rate": 2e-05,
      "loss": 0.0019,
      "step": 440
    },
    {
      "epoch": 0.19607843137254902,
      "eval_accuracy": 0.9636809229850769,
      "eval_loss": 0.11826927214860916,
      "eval_runtime": 543.2619,
      "eval_samples_per_second": 8.261,
      "eval_steps_per_second": 2.065,
      "step": 440
    },
    {
      "epoch": 0.20053475935828877,
      "grad_norm": 0.013209226541221142,
      "learning_rate": 1.8181818181818182e-05,
      "loss": 0.0388,
      "step": 450
    },
    {
      "epoch": 0.20053475935828877,
      "eval_accuracy": 0.9576649069786072,
      "eval_loss": 0.14545659720897675,
      "eval_runtime": 544.4489,
      "eval_samples_per_second": 8.243,
      "eval_steps_per_second": 2.061,
      "step": 450
    },
    {
      "epoch": 0.20499108734402852,
      "grad_norm": 0.007217989303171635,
      "learning_rate": 1.6363636363636366e-05,
      "loss": 0.0041,
      "step": 460
    },
    {
      "epoch": 0.20499108734402852,
      "eval_accuracy": 0.9523172974586487,
      "eval_loss": 0.16972462832927704,
      "eval_runtime": 545.2894,
      "eval_samples_per_second": 8.23,
      "eval_steps_per_second": 2.058,
      "step": 460
    },
    {
      "epoch": 0.20944741532976827,
      "grad_norm": 0.8174325227737427,
      "learning_rate": 1.4545454545454545e-05,
      "loss": 0.0064,
      "step": 470
    },
    {
      "epoch": 0.20944741532976827,
      "eval_accuracy": 0.9509803652763367,
      "eval_loss": 0.17997007071971893,
      "eval_runtime": 543.7245,
      "eval_samples_per_second": 8.254,
      "eval_steps_per_second": 2.064,
      "step": 470
    },
    {
      "epoch": 0.21390374331550802,
      "grad_norm": 0.032331835478544235,
      "learning_rate": 1.2727272727272727e-05,
      "loss": 0.0008,
      "step": 480
    },
    {
      "epoch": 0.21390374331550802,
      "eval_accuracy": 0.9610071182250977,
      "eval_loss": 0.1397457718849182,
      "eval_runtime": 546.6811,
      "eval_samples_per_second": 8.21,
      "eval_steps_per_second": 2.052,
      "step": 480
    },
    {
      "epoch": 0.21836007130124777,
      "grad_norm": 6.01271915435791,
      "learning_rate": 1.0909090909090909e-05,
      "loss": 0.0688,
      "step": 490
    },
    {
      "epoch": 0.21836007130124777,
      "eval_accuracy": 0.9694741368293762,
      "eval_loss": 0.10627125203609467,
      "eval_runtime": 544.0334,
      "eval_samples_per_second": 8.249,
      "eval_steps_per_second": 2.062,
      "step": 490
    },
    {
      "epoch": 0.22281639928698752,
      "grad_norm": 0.004174210596829653,
      "learning_rate": 9.090909090909091e-06,
      "loss": 0.0008,
      "step": 500
    },
    {
      "epoch": 0.22281639928698752,
      "eval_accuracy": 0.9844028353691101,
      "eval_loss": 0.05713631212711334,
      "eval_runtime": 551.9061,
      "eval_samples_per_second": 8.132,
      "eval_steps_per_second": 2.033,
      "step": 500
    },
    {
      "epoch": 0.22727272727272727,
      "grad_norm": 0.006048465613275766,
      "learning_rate": 7.272727272727272e-06,
      "loss": 0.0018,
      "step": 510
    },
    {
      "epoch": 0.22727272727272727,
      "eval_accuracy": 0.9870766401290894,
      "eval_loss": 0.048317644745111465,
      "eval_runtime": 541.5344,
      "eval_samples_per_second": 8.288,
      "eval_steps_per_second": 2.072,
      "step": 510
    },
    {
      "epoch": 0.23172905525846701,
      "grad_norm": 13.882183074951172,
      "learning_rate": 5.4545454545454545e-06,
      "loss": 0.1531,
      "step": 520
    },
    {
      "epoch": 0.23172905525846701,
      "eval_accuracy": 0.9870766401290894,
      "eval_loss": 0.04793470725417137,
      "eval_runtime": 540.1145,
      "eval_samples_per_second": 8.309,
      "eval_steps_per_second": 2.077,
      "step": 520
    },
    {
      "epoch": 0.23618538324420676,
      "grad_norm": 0.21645870804786682,
      "learning_rate": 3.636363636363636e-06,
      "loss": 0.0027,
      "step": 530
    },
    {
      "epoch": 0.23618538324420676,
      "eval_accuracy": 0.9848484992980957,
      "eval_loss": 0.055766720324754715,
      "eval_runtime": 545.2691,
      "eval_samples_per_second": 8.231,
      "eval_steps_per_second": 2.058,
      "step": 530
    },
    {
      "epoch": 0.24064171122994651,
      "grad_norm": 0.27186936140060425,
      "learning_rate": 1.818181818181818e-06,
      "loss": 0.001,
      "step": 540
    },
    {
      "epoch": 0.24064171122994651,
      "eval_accuracy": 0.9844028353691101,
      "eval_loss": 0.059176359325647354,
      "eval_runtime": 537.6305,
      "eval_samples_per_second": 8.348,
      "eval_steps_per_second": 2.087,
      "step": 540
    }
  ],
  "logging_steps": 10,
  "max_steps": 550,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.207949973692707e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}