{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1521,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01972386587771203,
      "grad_norm": 8.260985845744274,
      "learning_rate": 6.493506493506493e-07,
      "loss": 0.7556,
      "step": 10
    },
    {
      "epoch": 0.03944773175542406,
      "grad_norm": 3.285262290141762,
      "learning_rate": 1.2987012987012986e-06,
      "loss": 0.6243,
      "step": 20
    },
    {
      "epoch": 0.05917159763313609,
      "grad_norm": 1.8240152669785012,
      "learning_rate": 1.9480519480519483e-06,
      "loss": 0.527,
      "step": 30
    },
    {
      "epoch": 0.07889546351084813,
      "grad_norm": 2.484918347587673,
      "learning_rate": 2.597402597402597e-06,
      "loss": 0.4858,
      "step": 40
    },
    {
      "epoch": 0.09861932938856016,
      "grad_norm": 1.6071106659008887,
      "learning_rate": 3.246753246753247e-06,
      "loss": 0.4616,
      "step": 50
    },
    {
      "epoch": 0.11834319526627218,
      "grad_norm": 1.7753595146102916,
      "learning_rate": 3.896103896103897e-06,
      "loss": 0.4467,
      "step": 60
    },
    {
      "epoch": 0.13806706114398423,
      "grad_norm": 1.9952784096044014,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 0.4355,
      "step": 70
    },
    {
      "epoch": 0.15779092702169625,
      "grad_norm": 2.0090088525329057,
      "learning_rate": 4.999952075361122e-06,
      "loss": 0.4303,
      "step": 80
    },
    {
      "epoch": 0.17751479289940827,
      "grad_norm": 1.9429866993343683,
      "learning_rate": 4.99910013857428e-06,
      "loss": 0.4213,
      "step": 90
    },
    {
      "epoch": 0.19723865877712032,
      "grad_norm": 2.2150406308730166,
      "learning_rate": 4.997183673954895e-06,
      "loss": 0.4205,
      "step": 100
    },
    {
      "epoch": 0.21696252465483234,
      "grad_norm": 2.3280715715799105,
      "learning_rate": 4.994203588590157e-06,
      "loss": 0.4132,
      "step": 110
    },
    {
      "epoch": 0.23668639053254437,
      "grad_norm": 2.0514718162160617,
      "learning_rate": 4.9901612929925455e-06,
      "loss": 0.4097,
      "step": 120
    },
    {
      "epoch": 0.2564102564102564,
      "grad_norm": 2.248051724393392,
      "learning_rate": 4.985058700432217e-06,
      "loss": 0.4078,
      "step": 130
    },
    {
      "epoch": 0.27613412228796846,
      "grad_norm": 2.4477065193392114,
      "learning_rate": 4.978898226031426e-06,
      "loss": 0.4035,
      "step": 140
    },
    {
      "epoch": 0.2958579881656805,
      "grad_norm": 2.3530821376592317,
      "learning_rate": 4.97168278562142e-06,
      "loss": 0.3988,
      "step": 150
    },
    {
      "epoch": 0.3155818540433925,
      "grad_norm": 2.0658208779463796,
      "learning_rate": 4.9634157943623345e-06,
      "loss": 0.4008,
      "step": 160
    },
    {
      "epoch": 0.33530571992110453,
      "grad_norm": 1.6308701318103827,
      "learning_rate": 4.954101165126764e-06,
      "loss": 0.3955,
      "step": 170
    },
    {
      "epoch": 0.35502958579881655,
      "grad_norm": 1.8767575875235638,
      "learning_rate": 4.943743306647738e-06,
      "loss": 0.3964,
      "step": 180
    },
    {
      "epoch": 0.3747534516765286,
      "grad_norm": 2.158851334024998,
      "learning_rate": 4.932347121432018e-06,
      "loss": 0.3955,
      "step": 190
    },
    {
      "epoch": 0.39447731755424065,
      "grad_norm": 2.2424601067528367,
      "learning_rate": 4.919918003439677e-06,
      "loss": 0.3929,
      "step": 200
    },
    {
      "epoch": 0.41420118343195267,
      "grad_norm": 1.4704562127782181,
      "learning_rate": 4.9064618355310694e-06,
      "loss": 0.3951,
      "step": 210
    },
    {
      "epoch": 0.4339250493096647,
      "grad_norm": 1.5325962055467024,
      "learning_rate": 4.8919849866823955e-06,
      "loss": 0.3936,
      "step": 220
    },
    {
      "epoch": 0.4536489151873767,
      "grad_norm": 1.752553432251344,
      "learning_rate": 4.8764943089711876e-06,
      "loss": 0.3894,
      "step": 230
    },
    {
      "epoch": 0.47337278106508873,
      "grad_norm": 3.582185649197669,
      "learning_rate": 4.859997134333133e-06,
      "loss": 0.39,
      "step": 240
    },
    {
      "epoch": 0.4930966469428008,
      "grad_norm": 2.283623608488685,
      "learning_rate": 4.842501271091773e-06,
      "loss": 0.3845,
      "step": 250
    },
    {
      "epoch": 0.5128205128205128,
      "grad_norm": 2.954635543178996,
      "learning_rate": 4.8240150002627285e-06,
      "loss": 0.3853,
      "step": 260
    },
    {
      "epoch": 0.5325443786982249,
      "grad_norm": 2.621411991175976,
      "learning_rate": 4.80454707163418e-06,
      "loss": 0.3802,
      "step": 270
    },
    {
      "epoch": 0.5522682445759369,
      "grad_norm": 3.0076538937186554,
      "learning_rate": 4.784106699625493e-06,
      "loss": 0.3778,
      "step": 280
    },
    {
      "epoch": 0.571992110453649,
      "grad_norm": 2.620788244299813,
      "learning_rate": 4.762703558925907e-06,
      "loss": 0.381,
      "step": 290
    },
    {
      "epoch": 0.591715976331361,
      "grad_norm": 2.600774288511616,
      "learning_rate": 4.740347779915384e-06,
      "loss": 0.3795,
      "step": 300
    },
    {
      "epoch": 0.611439842209073,
      "grad_norm": 2.825934593188172,
      "learning_rate": 4.717049943869774e-06,
      "loss": 0.3754,
      "step": 310
    },
    {
      "epoch": 0.631163708086785,
      "grad_norm": 1.9636455738063043,
      "learning_rate": 4.692821077952556e-06,
      "loss": 0.3709,
      "step": 320
    },
    {
      "epoch": 0.650887573964497,
      "grad_norm": 1.465934093555826,
      "learning_rate": 4.667672649995539e-06,
      "loss": 0.3686,
      "step": 330
    },
    {
      "epoch": 0.6706114398422091,
      "grad_norm": 1.6730733158146738,
      "learning_rate": 4.641616563071003e-06,
      "loss": 0.374,
      "step": 340
    },
    {
      "epoch": 0.6903353057199211,
      "grad_norm": 1.6420981338152472,
      "learning_rate": 4.6146651498578095e-06,
      "loss": 0.3725,
      "step": 350
    },
    {
      "epoch": 0.7100591715976331,
      "grad_norm": 1.7081311753490396,
      "learning_rate": 4.586831166804191e-06,
      "loss": 0.3723,
      "step": 360
    },
    {
      "epoch": 0.7297830374753451,
      "grad_norm": 1.7004420091082517,
      "learning_rate": 4.558127788089966e-06,
      "loss": 0.3685,
      "step": 370
    },
    {
      "epoch": 0.7495069033530573,
      "grad_norm": 1.813129623101683,
      "learning_rate": 4.5285685993910246e-06,
      "loss": 0.3693,
      "step": 380
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 1.441392302489358,
      "learning_rate": 4.49816759144906e-06,
      "loss": 0.3672,
      "step": 390
    },
    {
      "epoch": 0.7889546351084813,
      "grad_norm": 1.743528342139816,
      "learning_rate": 4.466939153449565e-06,
      "loss": 0.3629,
      "step": 400
    },
    {
      "epoch": 0.8086785009861933,
      "grad_norm": 1.5505480061250534,
      "learning_rate": 4.434898066211255e-06,
      "loss": 0.3647,
      "step": 410
    },
    {
      "epoch": 0.8284023668639053,
      "grad_norm": 1.748134152515452,
      "learning_rate": 4.402059495190112e-06,
      "loss": 0.3687,
      "step": 420
    },
    {
      "epoch": 0.8481262327416174,
      "grad_norm": 1.888131474531523,
      "learning_rate": 4.368438983301382e-06,
      "loss": 0.368,
      "step": 430
    },
    {
      "epoch": 0.8678500986193294,
      "grad_norm": 1.3077877777100417,
      "learning_rate": 4.334052443562914e-06,
      "loss": 0.364,
      "step": 440
    },
    {
      "epoch": 0.8875739644970414,
      "grad_norm": 1.7143497390643974,
      "learning_rate": 4.298916151563324e-06,
      "loss": 0.3662,
      "step": 450
    },
    {
      "epoch": 0.9072978303747534,
      "grad_norm": 1.2650560376490414,
      "learning_rate": 4.263046737758557e-06,
      "loss": 0.3634,
      "step": 460
    },
    {
      "epoch": 0.9270216962524654,
      "grad_norm": 1.325272234023546,
      "learning_rate": 4.226461179600474e-06,
      "loss": 0.3647,
      "step": 470
    },
    {
      "epoch": 0.9467455621301775,
      "grad_norm": 1.7799396783443953,
      "learning_rate": 4.189176793501208e-06,
      "loss": 0.3601,
      "step": 480
    },
    {
      "epoch": 0.9664694280078896,
      "grad_norm": 1.6138030010077298,
      "learning_rate": 4.151211226637083e-06,
      "loss": 0.3639,
      "step": 490
    },
    {
      "epoch": 0.9861932938856016,
      "grad_norm": 1.6475058606657829,
      "learning_rate": 4.112582448595989e-06,
      "loss": 0.3631,
      "step": 500
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.3610161542892456,
      "eval_runtime": 46.5378,
      "eval_samples_per_second": 293.095,
      "eval_steps_per_second": 1.16,
      "step": 507
    },
    {
      "epoch": 1.0059171597633136,
      "grad_norm": 2.318083617694004,
      "learning_rate": 4.073308742872136e-06,
      "loss": 0.339,
      "step": 510
    },
    {
      "epoch": 1.0256410256410255,
      "grad_norm": 2.26507527796031,
      "learning_rate": 4.033408698212244e-06,
      "loss": 0.2904,
      "step": 520
    },
    {
      "epoch": 1.0453648915187377,
      "grad_norm": 2.129210352759771,
      "learning_rate": 3.99290119981726e-06,
      "loss": 0.2845,
      "step": 530
    },
    {
      "epoch": 1.0650887573964498,
      "grad_norm": 2.0458511034566897,
      "learning_rate": 3.95180542040374e-06,
      "loss": 0.2826,
      "step": 540
    },
    {
      "epoch": 1.0848126232741617,
      "grad_norm": 2.34540520465628,
      "learning_rate": 3.910140811129166e-06,
      "loss": 0.2817,
      "step": 550
    },
    {
      "epoch": 1.1045364891518739,
      "grad_norm": 1.5731137478504271,
      "learning_rate": 3.8679270923854596e-06,
      "loss": 0.2816,
      "step": 560
    },
    {
      "epoch": 1.1242603550295858,
      "grad_norm": 1.9641564243584235,
      "learning_rate": 3.825184244465071e-06,
      "loss": 0.2833,
      "step": 570
    },
    {
      "epoch": 1.143984220907298,
      "grad_norm": 1.5653763677552233,
      "learning_rate": 3.7819324981040517e-06,
      "loss": 0.2835,
      "step": 580
    },
    {
      "epoch": 1.1637080867850098,
      "grad_norm": 1.4455902546137582,
      "learning_rate": 3.7381923249065838e-06,
      "loss": 0.2806,
      "step": 590
    },
    {
      "epoch": 1.183431952662722,
      "grad_norm": 1.4589441051909717,
      "learning_rate": 3.6939844276555146e-06,
      "loss": 0.2842,
      "step": 600
    },
    {
      "epoch": 1.2031558185404339,
      "grad_norm": 1.4737079619190827,
      "learning_rate": 3.649329730513461e-06,
      "loss": 0.2818,
      "step": 610
    },
    {
      "epoch": 1.222879684418146,
      "grad_norm": 1.424470321783783,
      "learning_rate": 3.6042493691191377e-06,
      "loss": 0.2835,
      "step": 620
    },
    {
      "epoch": 1.242603550295858,
      "grad_norm": 1.43822809638539,
      "learning_rate": 3.558764680583589e-06,
      "loss": 0.2829,
      "step": 630
    },
    {
      "epoch": 1.26232741617357,
      "grad_norm": 1.4491877471048427,
      "learning_rate": 3.51289719339106e-06,
      "loss": 0.2823,
      "step": 640
    },
    {
      "epoch": 1.282051282051282,
      "grad_norm": 1.4979353903583295,
      "learning_rate": 3.4666686172092927e-06,
      "loss": 0.2859,
      "step": 650
    },
    {
      "epoch": 1.301775147928994,
      "grad_norm": 1.4793881592613725,
      "learning_rate": 3.4201008326140596e-06,
      "loss": 0.2849,
      "step": 660
    },
    {
      "epoch": 1.3214990138067062,
      "grad_norm": 1.6343693105840815,
      "learning_rate": 3.3732158807328116e-06,
      "loss": 0.2875,
      "step": 670
    },
    {
      "epoch": 1.3412228796844181,
      "grad_norm": 1.5638318327999918,
      "learning_rate": 3.3260359528123266e-06,
      "loss": 0.2877,
      "step": 680
    },
    {
      "epoch": 1.3609467455621302,
      "grad_norm": 1.434550639059279,
      "learning_rate": 3.2785833797153115e-06,
      "loss": 0.2817,
      "step": 690
    },
    {
      "epoch": 1.3806706114398422,
      "grad_norm": 1.3783604211664602,
      "learning_rate": 3.2308806213509204e-06,
      "loss": 0.2809,
      "step": 700
    },
    {
      "epoch": 1.4003944773175543,
      "grad_norm": 1.7104337243982326,
      "learning_rate": 3.182950256044188e-06,
      "loss": 0.2825,
      "step": 710
    },
    {
      "epoch": 1.4201183431952662,
      "grad_norm": 1.9527331404429782,
      "learning_rate": 3.1348149698494233e-06,
      "loss": 0.2827,
      "step": 720
    },
    {
      "epoch": 1.4398422090729783,
      "grad_norm": 1.5082040480125063,
      "learning_rate": 3.0864975458126158e-06,
      "loss": 0.2857,
      "step": 730
    },
    {
      "epoch": 1.4595660749506902,
      "grad_norm": 1.5939434329404958,
      "learning_rate": 3.038020853187914e-06,
      "loss": 0.2831,
      "step": 740
    },
    {
      "epoch": 1.4792899408284024,
      "grad_norm": 1.425454732201556,
      "learning_rate": 2.98940783661333e-06,
      "loss": 0.2802,
      "step": 750
    },
    {
      "epoch": 1.4990138067061145,
      "grad_norm": 1.4324944544127631,
      "learning_rate": 2.940681505250742e-06,
      "loss": 0.2848,
      "step": 760
    },
    {
      "epoch": 1.5187376725838264,
      "grad_norm": 1.4082984304420074,
      "learning_rate": 2.8918649218953624e-06,
      "loss": 0.2801,
      "step": 770
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 1.5895657718154816,
      "learning_rate": 2.84298119205983e-06,
      "loss": 0.2807,
      "step": 780
    },
    {
      "epoch": 1.5581854043392505,
      "grad_norm": 1.6080440377232041,
      "learning_rate": 2.7940534530380666e-06,
      "loss": 0.2835,
      "step": 790
    },
    {
      "epoch": 1.5779092702169626,
      "grad_norm": 1.404915797241871,
      "learning_rate": 2.7451048629541045e-06,
      "loss": 0.2808,
      "step": 800
    },
    {
      "epoch": 1.5976331360946747,
      "grad_norm": 1.4879672080505235,
      "learning_rate": 2.6961585898010523e-06,
      "loss": 0.2806,
      "step": 810
    },
    {
      "epoch": 1.6173570019723866,
      "grad_norm": 1.3888602093522253,
      "learning_rate": 2.647237800475384e-06,
      "loss": 0.2832,
      "step": 820
    },
    {
      "epoch": 1.6370808678500985,
      "grad_norm": 1.3670120148082392,
      "learning_rate": 2.5983656498117525e-06,
      "loss": 0.2825,
      "step": 830
    },
    {
      "epoch": 1.6568047337278107,
      "grad_norm": 1.2812642080517738,
      "learning_rate": 2.54956526962351e-06,
      "loss": 0.279,
      "step": 840
    },
    {
      "epoch": 1.6765285996055228,
      "grad_norm": 1.252430854449729,
      "learning_rate": 2.5008597577541288e-06,
      "loss": 0.2814,
      "step": 850
    },
    {
      "epoch": 1.6962524654832347,
      "grad_norm": 1.2750427994477165,
      "learning_rate": 2.45227216714469e-06,
      "loss": 0.2792,
      "step": 860
    },
    {
      "epoch": 1.7159763313609466,
      "grad_norm": 1.354377403404739,
      "learning_rate": 2.403825494922636e-06,
      "loss": 0.282,
      "step": 870
    },
    {
      "epoch": 1.7357001972386588,
      "grad_norm": 1.4267990848182481,
      "learning_rate": 2.3555426715169396e-06,
      "loss": 0.2791,
      "step": 880
    },
    {
      "epoch": 1.755424063116371,
      "grad_norm": 1.252857555239978,
      "learning_rate": 2.3074465498048303e-06,
      "loss": 0.2826,
      "step": 890
    },
    {
      "epoch": 1.7751479289940828,
      "grad_norm": 1.2876786054611615,
      "learning_rate": 2.259559894295244e-06,
      "loss": 0.2789,
      "step": 900
    },
    {
      "epoch": 1.7948717948717947,
      "grad_norm": 1.2629901820145135,
      "learning_rate": 2.2119053703540866e-06,
      "loss": 0.2791,
      "step": 910
    },
    {
      "epoch": 1.8145956607495068,
      "grad_norm": 1.3562733049556417,
      "learning_rate": 2.1645055334764237e-06,
      "loss": 0.2807,
      "step": 920
    },
    {
      "epoch": 1.834319526627219,
      "grad_norm": 1.3132542320273741,
      "learning_rate": 2.1173828186106828e-06,
      "loss": 0.2782,
      "step": 930
    },
    {
      "epoch": 1.854043392504931,
      "grad_norm": 1.372645351488049,
      "learning_rate": 2.0705595295399e-06,
      "loss": 0.28,
      "step": 940
    },
    {
      "epoch": 1.873767258382643,
      "grad_norm": 1.286506818666612,
      "learning_rate": 2.0240578283250596e-06,
      "loss": 0.2788,
      "step": 950
    },
    {
      "epoch": 1.893491124260355,
      "grad_norm": 1.343985774681719,
      "learning_rate": 1.9778997248155013e-06,
      "loss": 0.2779,
      "step": 960
    },
    {
      "epoch": 1.913214990138067,
      "grad_norm": 1.3873943864064089,
      "learning_rate": 1.9321070662313824e-06,
      "loss": 0.2768,
      "step": 970
    },
    {
      "epoch": 1.9329388560157792,
      "grad_norm": 1.3822544572854645,
      "learning_rate": 1.88670152682311e-06,
      "loss": 0.2753,
      "step": 980
    },
    {
      "epoch": 1.952662721893491,
      "grad_norm": 1.3724554338840655,
      "learning_rate": 1.8417045976126347e-06,
      "loss": 0.274,
      "step": 990
    },
    {
      "epoch": 1.972386587771203,
      "grad_norm": 1.428387339598408,
      "learning_rate": 1.797137576221482e-06,
      "loss": 0.2775,
      "step": 1000
    },
    {
      "epoch": 1.9921104536489151,
      "grad_norm": 1.2370547509299645,
      "learning_rate": 1.753021556790314e-06,
      "loss": 0.2746,
      "step": 1010
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.3482723832130432,
      "eval_runtime": 46.4255,
      "eval_samples_per_second": 293.804,
      "eval_steps_per_second": 1.163,
      "step": 1014
    },
    {
      "epoch": 2.0118343195266273,
      "grad_norm": 1.9502351693684774,
      "learning_rate": 1.7093774199948004e-06,
      "loss": 0.2309,
      "step": 1020
    },
    {
      "epoch": 2.0315581854043394,
      "grad_norm": 1.5862323859503984,
      "learning_rate": 1.6662258231625331e-06,
      "loss": 0.2026,
      "step": 1030
    },
    {
      "epoch": 2.051282051282051,
      "grad_norm": 1.3292614459089434,
      "learning_rate": 1.6235871904956431e-06,
      "loss": 0.2034,
      "step": 1040
    },
    {
      "epoch": 2.0710059171597632,
      "grad_norm": 1.2370582334736997,
      "learning_rate": 1.5814817034037715e-06,
      "loss": 0.2008,
      "step": 1050
    },
    {
      "epoch": 2.0907297830374754,
      "grad_norm": 1.325897622024457,
      "learning_rate": 1.5399292909519422e-06,
      "loss": 0.2042,
      "step": 1060
    },
    {
      "epoch": 2.1104536489151875,
      "grad_norm": 1.4548395791353137,
      "learning_rate": 1.4989496204278897e-06,
      "loss": 0.2025,
      "step": 1070
    },
    {
      "epoch": 2.1301775147928996,
      "grad_norm": 1.36179677292465,
      "learning_rate": 1.458562088033273e-06,
      "loss": 0.1978,
      "step": 1080
    },
    {
      "epoch": 2.1499013806706113,
      "grad_norm": 1.4589926591648759,
      "learning_rate": 1.4187858097032086e-06,
      "loss": 0.2024,
      "step": 1090
    },
    {
      "epoch": 2.1696252465483234,
      "grad_norm": 1.3095440667780154,
      "learning_rate": 1.3796396120584576e-06,
      "loss": 0.2032,
      "step": 1100
    },
    {
      "epoch": 2.1893491124260356,
      "grad_norm": 1.3522834520399176,
      "learning_rate": 1.341142023494537e-06,
      "loss": 0.1992,
      "step": 1110
    },
    {
      "epoch": 2.2090729783037477,
      "grad_norm": 1.3914925068585928,
      "learning_rate": 1.3033112654120032e-06,
      "loss": 0.2029,
      "step": 1120
    },
    {
      "epoch": 2.2287968441814594,
      "grad_norm": 1.2392072409116117,
      "learning_rate": 1.266165243592024e-06,
      "loss": 0.2019,
      "step": 1130
    },
    {
      "epoch": 2.2485207100591715,
      "grad_norm": 1.450828785906611,
      "learning_rate": 1.2297215397213442e-06,
      "loss": 0.2029,
      "step": 1140
    },
    {
      "epoch": 2.2682445759368837,
      "grad_norm": 1.3539897715774756,
      "learning_rate": 1.1939974030706499e-06,
      "loss": 0.1989,
      "step": 1150
    },
    {
      "epoch": 2.287968441814596,
      "grad_norm": 1.3124427663284721,
      "learning_rate": 1.1590097423302681e-06,
      "loss": 0.2013,
      "step": 1160
    },
    {
      "epoch": 2.3076923076923075,
      "grad_norm": 1.2751387286158546,
      "learning_rate": 1.1247751176070688e-06,
      "loss": 0.2003,
      "step": 1170
    },
    {
      "epoch": 2.3274161735700196,
      "grad_norm": 1.2826788452929796,
      "learning_rate": 1.0913097325863526e-06,
      "loss": 0.2013,
      "step": 1180
    },
    {
      "epoch": 2.3471400394477318,
      "grad_norm": 1.3449233167779666,
      "learning_rate": 1.0586294268624391e-06,
      "loss": 0.2031,
      "step": 1190
    },
    {
      "epoch": 2.366863905325444,
      "grad_norm": 1.3034368496811286,
      "learning_rate": 1.026749668441587e-06,
      "loss": 0.1994,
      "step": 1200
    },
    {
      "epoch": 2.386587771203156,
      "grad_norm": 1.3565807097213252,
      "learning_rate": 9.956855464207873e-07,
      "loss": 0.2,
      "step": 1210
    },
    {
      "epoch": 2.4063116370808677,
      "grad_norm": 1.451004027193357,
      "learning_rate": 9.654517638459015e-07,
      "loss": 0.1996,
      "step": 1220
    },
    {
      "epoch": 2.42603550295858,
      "grad_norm": 1.3107553476519733,
      "learning_rate": 9.360626307525231e-07,
      "loss": 0.2004,
      "step": 1230
    },
    {
      "epoch": 2.445759368836292,
      "grad_norm": 1.2866100592193557,
      "learning_rate": 9.075320573928513e-07,
      "loss": 0.2026,
      "step": 1240
    },
    {
      "epoch": 2.465483234714004,
      "grad_norm": 1.3169876215045113,
      "learning_rate": 8.798735476517964e-07,
      "loss": 0.2027,
      "step": 1250
    },
    {
      "epoch": 2.485207100591716,
      "grad_norm": 1.2821201625196061,
      "learning_rate": 8.531001926554134e-07,
      "loss": 0.2011,
      "step": 1260
    },
    {
      "epoch": 2.504930966469428,
      "grad_norm": 1.315132765819279,
      "learning_rate": 8.272246645747072e-07,
      "loss": 0.199,
      "step": 1270
    },
    {
      "epoch": 2.52465483234714,
      "grad_norm": 1.276154658164099,
      "learning_rate": 8.022592106277332e-07,
      "loss": 0.2008,
      "step": 1280
    },
    {
      "epoch": 2.544378698224852,
      "grad_norm": 1.2274421062761773,
      "learning_rate": 7.782156472828299e-07,
      "loss": 0.1998,
      "step": 1290
    },
    {
      "epoch": 2.564102564102564,
      "grad_norm": 1.2435720383981574,
      "learning_rate": 7.551053546657356e-07,
      "loss": 0.1995,
      "step": 1300
    },
    {
      "epoch": 2.583826429980276,
      "grad_norm": 1.2327909078947592,
      "learning_rate": 7.329392711732278e-07,
      "loss": 0.2024,
      "step": 1310
    },
    {
      "epoch": 2.603550295857988,
      "grad_norm": 1.1783489485507048,
      "learning_rate": 7.117278882958421e-07,
      "loss": 0.2003,
      "step": 1320
    },
    {
      "epoch": 2.6232741617357003,
      "grad_norm": 1.2687230261577986,
      "learning_rate": 6.914812456521138e-07,
      "loss": 0.2006,
      "step": 1330
    },
    {
      "epoch": 2.6429980276134124,
      "grad_norm": 1.2646158919927277,
      "learning_rate": 6.722089262366993e-07,
      "loss": 0.1982,
      "step": 1340
    },
    {
      "epoch": 2.662721893491124,
      "grad_norm": 1.2236131305338422,
      "learning_rate": 6.539200518846226e-07,
      "loss": 0.2001,
      "step": 1350
    },
    {
      "epoch": 2.6824457593688362,
      "grad_norm": 1.2428023457207789,
      "learning_rate": 6.366232789537923e-07,
      "loss": 0.2048,
      "step": 1360
    },
    {
      "epoch": 2.7021696252465484,
      "grad_norm": 1.2559417256017682,
      "learning_rate": 6.203267942278395e-07,
      "loss": 0.2012,
      "step": 1370
    },
    {
      "epoch": 2.7218934911242605,
      "grad_norm": 1.2572564112264348,
      "learning_rate": 6.050383110412069e-07,
      "loss": 0.1994,
      "step": 1380
    },
    {
      "epoch": 2.7416173570019726,
      "grad_norm": 1.1764889460619852,
      "learning_rate": 5.907650656283289e-07,
      "loss": 0.2002,
      "step": 1390
    },
    {
      "epoch": 2.7613412228796843,
      "grad_norm": 1.2804661059833917,
      "learning_rate": 5.775138136986298e-07,
      "loss": 0.2002,
      "step": 1400
    },
    {
      "epoch": 2.7810650887573964,
      "grad_norm": 1.3077263435732718,
      "learning_rate": 5.652908272389604e-07,
      "loss": 0.1995,
      "step": 1410
    },
    {
      "epoch": 2.8007889546351086,
      "grad_norm": 1.231137370296971,
      "learning_rate": 5.541018915449863e-07,
      "loss": 0.1989,
      "step": 1420
    },
    {
      "epoch": 2.8205128205128203,
      "grad_norm": 1.3443797697665705,
      "learning_rate": 5.439523024829335e-07,
      "loss": 0.1983,
      "step": 1430
    },
    {
      "epoch": 2.8402366863905324,
      "grad_norm": 1.2092638219767884,
      "learning_rate": 5.348468639829871e-07,
      "loss": 0.2007,
      "step": 1440
    },
    {
      "epoch": 2.8599605522682445,
      "grad_norm": 1.2392545674361426,
      "learning_rate": 5.267898857655307e-07,
      "loss": 0.201,
      "step": 1450
    },
    {
      "epoch": 2.8796844181459567,
      "grad_norm": 1.255507262390408,
      "learning_rate": 5.19785181301299e-07,
      "loss": 0.2008,
      "step": 1460
    },
    {
      "epoch": 2.899408284023669,
      "grad_norm": 1.2545629120536586,
      "learning_rate": 5.138360660064146e-07,
      "loss": 0.1979,
      "step": 1470
    },
    {
      "epoch": 2.9191321499013805,
      "grad_norm": 1.2279624795193589,
      "learning_rate": 5.08945355673159e-07,
      "loss": 0.201,
      "step": 1480
    },
    {
      "epoch": 2.9388560157790926,
      "grad_norm": 1.2395946923655343,
      "learning_rate": 5.05115365137222e-07,
      "loss": 0.1999,
      "step": 1490
    },
    {
      "epoch": 2.9585798816568047,
      "grad_norm": 1.2212433583596156,
      "learning_rate": 5.023479071820607e-07,
      "loss": 0.1989,
      "step": 1500
    },
    {
      "epoch": 2.978303747534517,
      "grad_norm": 1.298954785136158,
      "learning_rate": 5.006442916808849e-07,
      "loss": 0.2019,
      "step": 1510
    },
    {
      "epoch": 2.998027613412229,
      "grad_norm": 1.3586461216494594,
      "learning_rate": 5.000053249766787e-07,
      "loss": 0.1999,
      "step": 1520
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.37075862288475037,
      "eval_runtime": 53.9042,
      "eval_samples_per_second": 253.041,
      "eval_steps_per_second": 1.002,
      "step": 1521
    },
    {
      "epoch": 3.0,
      "step": 1521,
      "total_flos": 2547731650314240.0,
      "train_loss": 0.2954053143131192,
      "train_runtime": 9131.3778,
      "train_samples_per_second": 85.143,
      "train_steps_per_second": 0.167
    }
  ],
  "logging_steps": 10,
  "max_steps": 1521,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2547731650314240.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}