|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.999929740743343, |
|
"eval_steps": 100, |
|
"global_step": 1779, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 30.728321433017737, |
|
"learning_rate": 2.9915682967959526e-07, |
|
"loss": 0.7843, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 36.77551634614557, |
|
"learning_rate": 2.9831365935919053e-07, |
|
"loss": 0.7146, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 35.07871745362434, |
|
"learning_rate": 2.9747048903878585e-07, |
|
"loss": 0.6938, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 22.884080905805792, |
|
"learning_rate": 2.9662731871838107e-07, |
|
"loss": 0.6587, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 20.00517219363236, |
|
"learning_rate": 2.957841483979764e-07, |
|
"loss": 0.653, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 18.36397890422735, |
|
"learning_rate": 2.9494097807757167e-07, |
|
"loss": 0.6644, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 25.380353671575364, |
|
"learning_rate": 2.9409780775716694e-07, |
|
"loss": 0.6591, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 22.96117971805064, |
|
"learning_rate": 2.932546374367622e-07, |
|
"loss": 0.6649, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 18.570972155544734, |
|
"learning_rate": 2.924114671163575e-07, |
|
"loss": 0.6461, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 31.463777178706607, |
|
"learning_rate": 2.915682967959528e-07, |
|
"loss": 0.6402, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_accuracy": 0.71716621253406, |
|
"eval_loss": 0.6716727614402771, |
|
"eval_runtime": 81.6137, |
|
"eval_samples_per_second": 22.484, |
|
"eval_steps_per_second": 0.711, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 16.52215127003667, |
|
"learning_rate": 2.90725126475548e-07, |
|
"loss": 0.6428, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 19.497709198702577, |
|
"learning_rate": 2.8988195615514335e-07, |
|
"loss": 0.6215, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 18.350616329827186, |
|
"learning_rate": 2.890387858347386e-07, |
|
"loss": 0.6408, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 21.31149136639458, |
|
"learning_rate": 2.881956155143339e-07, |
|
"loss": 0.6435, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 18.652897776189004, |
|
"learning_rate": 2.8735244519392916e-07, |
|
"loss": 0.6429, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 15.905534553134705, |
|
"learning_rate": 2.8650927487352443e-07, |
|
"loss": 0.6406, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 16.811536357450365, |
|
"learning_rate": 2.856661045531197e-07, |
|
"loss": 0.6433, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 18.105305932357698, |
|
"learning_rate": 2.84822934232715e-07, |
|
"loss": 0.6305, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 17.99993276337699, |
|
"learning_rate": 2.839797639123103e-07, |
|
"loss": 0.6339, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 19.99711675781162, |
|
"learning_rate": 2.8313659359190557e-07, |
|
"loss": 0.6293, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"eval_accuracy": 0.740599455040872, |
|
"eval_loss": 0.6711810231208801, |
|
"eval_runtime": 81.6564, |
|
"eval_samples_per_second": 22.472, |
|
"eval_steps_per_second": 0.71, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 16.424824162884995, |
|
"learning_rate": 2.8229342327150084e-07, |
|
"loss": 0.6053, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 27.512486316457252, |
|
"learning_rate": 2.814502529510961e-07, |
|
"loss": 0.6109, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 22.320535945754912, |
|
"learning_rate": 2.806070826306914e-07, |
|
"loss": 0.6358, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 13.83846516782303, |
|
"learning_rate": 2.7976391231028666e-07, |
|
"loss": 0.6261, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 16.27927043935017, |
|
"learning_rate": 2.7892074198988193e-07, |
|
"loss": 0.6395, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 21.72712116315618, |
|
"learning_rate": 2.780775716694772e-07, |
|
"loss": 0.6273, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 19.022167698615288, |
|
"learning_rate": 2.7723440134907247e-07, |
|
"loss": 0.6206, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 16.64363952350825, |
|
"learning_rate": 2.763912310286678e-07, |
|
"loss": 0.6304, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 20.294764983153055, |
|
"learning_rate": 2.7554806070826307e-07, |
|
"loss": 0.6225, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 11.467691935838706, |
|
"learning_rate": 2.7470489038785834e-07, |
|
"loss": 0.6297, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"eval_accuracy": 0.7444141689373297, |
|
"eval_loss": 0.658985435962677, |
|
"eval_runtime": 81.6887, |
|
"eval_samples_per_second": 22.463, |
|
"eval_steps_per_second": 0.71, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 21.910401668214213, |
|
"learning_rate": 2.738617200674536e-07, |
|
"loss": 0.6182, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 24.007473629091503, |
|
"learning_rate": 2.730185497470489e-07, |
|
"loss": 0.6038, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 21.290646545312395, |
|
"learning_rate": 2.7217537942664415e-07, |
|
"loss": 0.6195, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 23.68987516909537, |
|
"learning_rate": 2.713322091062394e-07, |
|
"loss": 0.6308, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 30.80192565983983, |
|
"learning_rate": 2.704890387858347e-07, |
|
"loss": 0.6341, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 21.8133450286374, |
|
"learning_rate": 2.6964586846543e-07, |
|
"loss": 0.6106, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 20.299330239483883, |
|
"learning_rate": 2.688026981450253e-07, |
|
"loss": 0.64, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 20.648521192709566, |
|
"learning_rate": 2.6795952782462056e-07, |
|
"loss": 0.6169, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 30.447858291866996, |
|
"learning_rate": 2.6711635750421584e-07, |
|
"loss": 0.6416, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 15.933419590444712, |
|
"learning_rate": 2.662731871838111e-07, |
|
"loss": 0.6112, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"eval_accuracy": 0.7667574931880109, |
|
"eval_loss": 0.6638922095298767, |
|
"eval_runtime": 81.6313, |
|
"eval_samples_per_second": 22.479, |
|
"eval_steps_per_second": 0.711, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 23.46830344261005, |
|
"learning_rate": 2.654300168634064e-07, |
|
"loss": 0.6277, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 27.935995699348798, |
|
"learning_rate": 2.6458684654300165e-07, |
|
"loss": 0.6221, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 17.973218257874134, |
|
"learning_rate": 2.63743676222597e-07, |
|
"loss": 0.6477, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 18.376434311461416, |
|
"learning_rate": 2.629005059021922e-07, |
|
"loss": 0.6309, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 18.526800694282237, |
|
"learning_rate": 2.620573355817875e-07, |
|
"loss": 0.6316, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 15.15241254088161, |
|
"learning_rate": 2.612141652613828e-07, |
|
"loss": 0.6201, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 14.410194781468793, |
|
"learning_rate": 2.6037099494097806e-07, |
|
"loss": 0.6218, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 40.740369918242074, |
|
"learning_rate": 2.5952782462057333e-07, |
|
"loss": 0.609, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 17.442343289782034, |
|
"learning_rate": 2.586846543001686e-07, |
|
"loss": 0.6031, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 14.525640800311784, |
|
"learning_rate": 2.5784148397976393e-07, |
|
"loss": 0.6065, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"eval_accuracy": 0.7623978201634878, |
|
"eval_loss": 0.6653993725776672, |
|
"eval_runtime": 81.6511, |
|
"eval_samples_per_second": 22.474, |
|
"eval_steps_per_second": 0.71, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 16.37698400765353, |
|
"learning_rate": 2.5699831365935915e-07, |
|
"loss": 0.6044, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 18.46191451753819, |
|
"learning_rate": 2.5615514333895447e-07, |
|
"loss": 0.6026, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 16.60110193637955, |
|
"learning_rate": 2.5531197301854974e-07, |
|
"loss": 0.6117, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 12.90641142874894, |
|
"learning_rate": 2.54468802698145e-07, |
|
"loss": 0.6218, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 12.782297037944778, |
|
"learning_rate": 2.536256323777403e-07, |
|
"loss": 0.6094, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 13.132450106152808, |
|
"learning_rate": 2.5278246205733556e-07, |
|
"loss": 0.6065, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 15.505668610650245, |
|
"learning_rate": 2.519392917369309e-07, |
|
"loss": 0.6141, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 10.877242291946278, |
|
"learning_rate": 2.510961214165261e-07, |
|
"loss": 0.6132, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 16.260577656983788, |
|
"learning_rate": 2.502529510961214e-07, |
|
"loss": 0.618, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 17.74551723827062, |
|
"learning_rate": 2.494097807757167e-07, |
|
"loss": 0.6011, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"eval_accuracy": 0.7656675749318801, |
|
"eval_loss": 0.6725944876670837, |
|
"eval_runtime": 81.6007, |
|
"eval_samples_per_second": 22.488, |
|
"eval_steps_per_second": 0.711, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 13.489006632275235, |
|
"learning_rate": 2.4856661045531197e-07, |
|
"loss": 0.5972, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 11.884271769605707, |
|
"learning_rate": 2.4772344013490724e-07, |
|
"loss": 0.62, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 13.261788153937319, |
|
"learning_rate": 2.468802698145025e-07, |
|
"loss": 0.6167, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 14.172098159046573, |
|
"learning_rate": 2.4603709949409783e-07, |
|
"loss": 0.6153, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 21.378776702080216, |
|
"learning_rate": 2.4519392917369305e-07, |
|
"loss": 0.619, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 12.09294701146547, |
|
"learning_rate": 2.443507588532884e-07, |
|
"loss": 0.6282, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 14.012823513290494, |
|
"learning_rate": 2.4350758853288365e-07, |
|
"loss": 0.6329, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 12.10834170031786, |
|
"learning_rate": 2.426644182124789e-07, |
|
"loss": 0.6302, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 18.577060775724657, |
|
"learning_rate": 2.418212478920742e-07, |
|
"loss": 0.6311, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 13.914957035063992, |
|
"learning_rate": 2.4097807757166946e-07, |
|
"loss": 0.6188, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"eval_accuracy": 0.7716621253405994, |
|
"eval_loss": 0.6559094190597534, |
|
"eval_runtime": 81.6864, |
|
"eval_samples_per_second": 22.464, |
|
"eval_steps_per_second": 0.71, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 18.235947830015988, |
|
"learning_rate": 2.4013490725126473e-07, |
|
"loss": 0.6333, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 18.663534626617114, |
|
"learning_rate": 2.3929173693086e-07, |
|
"loss": 0.6277, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 15.048399094387358, |
|
"learning_rate": 2.3844856661045533e-07, |
|
"loss": 0.6209, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 14.114525067135927, |
|
"learning_rate": 2.3760539629005057e-07, |
|
"loss": 0.6192, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 19.517752622102307, |
|
"learning_rate": 2.3676222596964587e-07, |
|
"loss": 0.6016, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 11.012158965376576, |
|
"learning_rate": 2.3591905564924112e-07, |
|
"loss": 0.619, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 12.289099272322433, |
|
"learning_rate": 2.3507588532883641e-07, |
|
"loss": 0.625, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 11.949697745076817, |
|
"learning_rate": 2.3423271500843169e-07, |
|
"loss": 0.6147, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 20.873971042525024, |
|
"learning_rate": 2.3338954468802698e-07, |
|
"loss": 0.6059, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 23.749605034362084, |
|
"learning_rate": 2.3254637436762223e-07, |
|
"loss": 0.5964, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_accuracy": 0.7623978201634878, |
|
"eval_loss": 0.6648799180984497, |
|
"eval_runtime": 81.6698, |
|
"eval_samples_per_second": 22.469, |
|
"eval_steps_per_second": 0.71, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 17.378412987521198, |
|
"learning_rate": 2.3170320404721753e-07, |
|
"loss": 0.5999, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 14.09219874944554, |
|
"learning_rate": 2.3086003372681282e-07, |
|
"loss": 0.6123, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 15.751831833203504, |
|
"learning_rate": 2.3001686340640807e-07, |
|
"loss": 0.6173, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 10.58472297190868, |
|
"learning_rate": 2.2917369308600337e-07, |
|
"loss": 0.6107, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 16.766833417242065, |
|
"learning_rate": 2.2833052276559864e-07, |
|
"loss": 0.5977, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 10.191079428211454, |
|
"learning_rate": 2.2748735244519394e-07, |
|
"loss": 0.6249, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 10.892435339015812, |
|
"learning_rate": 2.2664418212478918e-07, |
|
"loss": 0.6071, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 14.857284393951572, |
|
"learning_rate": 2.2580101180438448e-07, |
|
"loss": 0.6277, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 16.280040840533015, |
|
"learning_rate": 2.2495784148397975e-07, |
|
"loss": 0.6117, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 15.402943403313595, |
|
"learning_rate": 2.2411467116357502e-07, |
|
"loss": 0.6263, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"eval_accuracy": 0.7596730245231608, |
|
"eval_loss": 0.6583617329597473, |
|
"eval_runtime": 81.6999, |
|
"eval_samples_per_second": 22.46, |
|
"eval_steps_per_second": 0.71, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 16.324857723183708, |
|
"learning_rate": 2.2327150084317032e-07, |
|
"loss": 0.6189, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 10.98662853747513, |
|
"learning_rate": 2.224283305227656e-07, |
|
"loss": 0.6097, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 17.67105325082076, |
|
"learning_rate": 2.215851602023609e-07, |
|
"loss": 0.6122, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 20.41264564438468, |
|
"learning_rate": 2.2074198988195613e-07, |
|
"loss": 0.6133, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 17.79087613152158, |
|
"learning_rate": 2.1989881956155143e-07, |
|
"loss": 0.6211, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 14.606633495853552, |
|
"learning_rate": 2.1905564924114668e-07, |
|
"loss": 0.6127, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 15.658044929958105, |
|
"learning_rate": 2.1821247892074197e-07, |
|
"loss": 0.613, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 10.939022037806527, |
|
"learning_rate": 2.1736930860033725e-07, |
|
"loss": 0.611, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 17.074626515502477, |
|
"learning_rate": 2.1652613827993254e-07, |
|
"loss": 0.6235, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 9.873729304084089, |
|
"learning_rate": 2.1568296795952782e-07, |
|
"loss": 0.6173, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_accuracy": 0.7787465940054495, |
|
"eval_loss": 0.6586682796478271, |
|
"eval_runtime": 81.6576, |
|
"eval_samples_per_second": 22.472, |
|
"eval_steps_per_second": 0.71, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 17.977983053245335, |
|
"learning_rate": 2.148397976391231e-07, |
|
"loss": 0.6085, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 22.720038161057953, |
|
"learning_rate": 2.1399662731871838e-07, |
|
"loss": 0.6077, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 14.934140128371691, |
|
"learning_rate": 2.1315345699831363e-07, |
|
"loss": 0.5936, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 17.025807021499876, |
|
"learning_rate": 2.1231028667790893e-07, |
|
"loss": 0.6077, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 13.012674899332776, |
|
"learning_rate": 2.114671163575042e-07, |
|
"loss": 0.6143, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 12.997561334592964, |
|
"learning_rate": 2.106239460370995e-07, |
|
"loss": 0.6079, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 12.442902629191648, |
|
"learning_rate": 2.0978077571669474e-07, |
|
"loss": 0.6027, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 16.45898926071221, |
|
"learning_rate": 2.0893760539629004e-07, |
|
"loss": 0.6027, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 21.037037529928906, |
|
"learning_rate": 2.0809443507588534e-07, |
|
"loss": 0.6048, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 9.922931220178954, |
|
"learning_rate": 2.0725126475548058e-07, |
|
"loss": 0.6133, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"eval_accuracy": 0.7754768392370572, |
|
"eval_loss": 0.6589598655700684, |
|
"eval_runtime": 81.5819, |
|
"eval_samples_per_second": 22.493, |
|
"eval_steps_per_second": 0.711, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 16.07724841249849, |
|
"learning_rate": 2.0640809443507588e-07, |
|
"loss": 0.6202, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 12.964684643299604, |
|
"learning_rate": 2.0556492411467115e-07, |
|
"loss": 0.6114, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 11.819826038425457, |
|
"learning_rate": 2.0472175379426645e-07, |
|
"loss": 0.6042, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 10.985202176776713, |
|
"learning_rate": 2.038785834738617e-07, |
|
"loss": 0.5989, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 15.914126285773381, |
|
"learning_rate": 2.03035413153457e-07, |
|
"loss": 0.6111, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 20.198502391005317, |
|
"learning_rate": 2.0219224283305226e-07, |
|
"loss": 0.6193, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 16.375353496861376, |
|
"learning_rate": 2.0134907251264754e-07, |
|
"loss": 0.6124, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 14.424027706264562, |
|
"learning_rate": 2.0050590219224283e-07, |
|
"loss": 0.6095, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 17.448118657270804, |
|
"learning_rate": 1.996627318718381e-07, |
|
"loss": 0.594, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 15.835051950720354, |
|
"learning_rate": 1.988195615514334e-07, |
|
"loss": 0.5902, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"eval_accuracy": 0.7673024523160763, |
|
"eval_loss": 0.6717323660850525, |
|
"eval_runtime": 81.1264, |
|
"eval_samples_per_second": 22.619, |
|
"eval_steps_per_second": 0.715, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 10.104802089341579, |
|
"learning_rate": 1.9797639123102865e-07, |
|
"loss": 0.6083, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 16.88055370044505, |
|
"learning_rate": 1.9713322091062395e-07, |
|
"loss": 0.6208, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 13.25377180665782, |
|
"learning_rate": 1.962900505902192e-07, |
|
"loss": 0.6195, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 12.03529826522156, |
|
"learning_rate": 1.954468802698145e-07, |
|
"loss": 0.6013, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 21.966805810887724, |
|
"learning_rate": 1.9460370994940976e-07, |
|
"loss": 0.5955, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 25.150403183144306, |
|
"learning_rate": 1.9376053962900506e-07, |
|
"loss": 0.6123, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 28.00330027046741, |
|
"learning_rate": 1.9291736930860033e-07, |
|
"loss": 0.6242, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 10.71380749264463, |
|
"learning_rate": 1.920741989881956e-07, |
|
"loss": 0.6288, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 14.346837709212238, |
|
"learning_rate": 1.912310286677909e-07, |
|
"loss": 0.6212, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 11.295633948457253, |
|
"learning_rate": 1.9038785834738614e-07, |
|
"loss": 0.6027, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"eval_accuracy": 0.7787465940054495, |
|
"eval_loss": 0.6629257202148438, |
|
"eval_runtime": 81.1053, |
|
"eval_samples_per_second": 22.625, |
|
"eval_steps_per_second": 0.715, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 11.81270801886205, |
|
"learning_rate": 1.8954468802698144e-07, |
|
"loss": 0.6127, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 11.458987921580261, |
|
"learning_rate": 1.887015177065767e-07, |
|
"loss": 0.6027, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 14.114816619174277, |
|
"learning_rate": 1.87858347386172e-07, |
|
"loss": 0.617, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 9.445103600368194, |
|
"learning_rate": 1.8701517706576726e-07, |
|
"loss": 0.6016, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 16.378432231814056, |
|
"learning_rate": 1.8617200674536255e-07, |
|
"loss": 0.607, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 18.105605603170623, |
|
"learning_rate": 1.8532883642495785e-07, |
|
"loss": 0.6059, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 25.98754135538099, |
|
"learning_rate": 1.844856661045531e-07, |
|
"loss": 0.6079, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 13.387903869057888, |
|
"learning_rate": 1.836424957841484e-07, |
|
"loss": 0.6248, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 15.072500301887933, |
|
"learning_rate": 1.8279932546374367e-07, |
|
"loss": 0.6059, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 28.195360534555086, |
|
"learning_rate": 1.8195615514333896e-07, |
|
"loss": 0.6094, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_accuracy": 0.7825613079019074, |
|
"eval_loss": 0.6670619249343872, |
|
"eval_runtime": 81.1088, |
|
"eval_samples_per_second": 22.624, |
|
"eval_steps_per_second": 0.715, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 16.78097475585066, |
|
"learning_rate": 1.811129848229342e-07, |
|
"loss": 0.6026, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 9.27565083160915, |
|
"learning_rate": 1.802698145025295e-07, |
|
"loss": 0.6061, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 15.151917102254139, |
|
"learning_rate": 1.7942664418212478e-07, |
|
"loss": 0.6121, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 15.628107133180718, |
|
"learning_rate": 1.7858347386172005e-07, |
|
"loss": 0.6136, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 11.41747344754936, |
|
"learning_rate": 1.7774030354131535e-07, |
|
"loss": 0.6013, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 17.94793616613114, |
|
"learning_rate": 1.7689713322091062e-07, |
|
"loss": 0.6094, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 16.847404718639655, |
|
"learning_rate": 1.7605396290050592e-07, |
|
"loss": 0.6006, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 19.943006958068334, |
|
"learning_rate": 1.7521079258010116e-07, |
|
"loss": 0.6203, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 17.544121262898, |
|
"learning_rate": 1.7436762225969646e-07, |
|
"loss": 0.6118, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 16.348862133230483, |
|
"learning_rate": 1.735244519392917e-07, |
|
"loss": 0.606, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"eval_accuracy": 0.7771117166212534, |
|
"eval_loss": 0.6631556749343872, |
|
"eval_runtime": 81.1021, |
|
"eval_samples_per_second": 22.626, |
|
"eval_steps_per_second": 0.715, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 15.065882473200872, |
|
"learning_rate": 1.72681281618887e-07, |
|
"loss": 0.6105, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 10.334423134734958, |
|
"learning_rate": 1.718381112984823e-07, |
|
"loss": 0.603, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 13.630372232978868, |
|
"learning_rate": 1.7099494097807757e-07, |
|
"loss": 0.6177, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 21.60021911475766, |
|
"learning_rate": 1.7015177065767284e-07, |
|
"loss": 0.6138, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 27.660710692587415, |
|
"learning_rate": 1.6930860033726811e-07, |
|
"loss": 0.6058, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 15.6224127860944, |
|
"learning_rate": 1.684654300168634e-07, |
|
"loss": 0.6275, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 9.768689869942213, |
|
"learning_rate": 1.6762225969645866e-07, |
|
"loss": 0.6181, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 29.86529261497021, |
|
"learning_rate": 1.6677908937605395e-07, |
|
"loss": 0.6175, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 12.971455009975848, |
|
"learning_rate": 1.6593591905564923e-07, |
|
"loss": 0.6177, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 15.387843128502691, |
|
"learning_rate": 1.6509274873524452e-07, |
|
"loss": 0.6119, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_accuracy": 0.7754768392370572, |
|
"eval_loss": 0.6606820225715637, |
|
"eval_runtime": 81.096, |
|
"eval_samples_per_second": 22.627, |
|
"eval_steps_per_second": 0.715, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 14.683073977357795, |
|
"learning_rate": 1.642495784148398e-07, |
|
"loss": 0.6132, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 20.101674008953353, |
|
"learning_rate": 1.6340640809443507e-07, |
|
"loss": 0.6129, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 9.565771214700746, |
|
"learning_rate": 1.6256323777403036e-07, |
|
"loss": 0.6032, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 19.63652279943903, |
|
"learning_rate": 1.617200674536256e-07, |
|
"loss": 0.6044, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 11.798398386935885, |
|
"learning_rate": 1.608768971332209e-07, |
|
"loss": 0.6104, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 8.45308349951647, |
|
"learning_rate": 1.6003372681281618e-07, |
|
"loss": 0.5946, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 6.748385507096111, |
|
"learning_rate": 1.5919055649241148e-07, |
|
"loss": 0.6124, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 11.792848227458215, |
|
"learning_rate": 1.5834738617200672e-07, |
|
"loss": 0.6118, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 15.049508549188333, |
|
"learning_rate": 1.5750421585160202e-07, |
|
"loss": 0.6085, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 10.01303081102771, |
|
"learning_rate": 1.5666104553119732e-07, |
|
"loss": 0.5992, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_accuracy": 0.7798365122615804, |
|
"eval_loss": 0.6598241925239563, |
|
"eval_runtime": 81.0778, |
|
"eval_samples_per_second": 22.633, |
|
"eval_steps_per_second": 0.715, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 13.816105839597155, |
|
"learning_rate": 1.5581787521079256e-07, |
|
"loss": 0.614, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 15.916990016500973, |
|
"learning_rate": 1.5497470489038786e-07, |
|
"loss": 0.6071, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 14.639137907706433, |
|
"learning_rate": 1.5413153456998313e-07, |
|
"loss": 0.6069, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 15.234658834038223, |
|
"learning_rate": 1.5328836424957843e-07, |
|
"loss": 0.5919, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 13.325766641648226, |
|
"learning_rate": 1.5244519392917367e-07, |
|
"loss": 0.6179, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 9.90933302447297, |
|
"learning_rate": 1.5160202360876897e-07, |
|
"loss": 0.6017, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 18.624449453795865, |
|
"learning_rate": 1.5075885328836422e-07, |
|
"loss": 0.6055, |
|
"step": 1770 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3558, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|