{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.995245641838352,
  "eval_steps": 500,
  "global_step": 945,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03169572107765452,
      "grad_norm": 5.002077210650163,
      "learning_rate": 5e-06,
      "loss": 0.8641,
      "step": 10
    },
    {
      "epoch": 0.06339144215530904,
      "grad_norm": 1.0041823875277287,
      "learning_rate": 5e-06,
      "loss": 0.7817,
      "step": 20
    },
    {
      "epoch": 0.09508716323296355,
      "grad_norm": 0.7267029948833413,
      "learning_rate": 5e-06,
      "loss": 0.7429,
      "step": 30
    },
    {
      "epoch": 0.12678288431061807,
      "grad_norm": 0.624153716022588,
      "learning_rate": 5e-06,
      "loss": 0.7173,
      "step": 40
    },
    {
      "epoch": 0.15847860538827258,
      "grad_norm": 0.566309592028478,
      "learning_rate": 5e-06,
      "loss": 0.7097,
      "step": 50
    },
    {
      "epoch": 0.1901743264659271,
      "grad_norm": 0.8668636299502676,
      "learning_rate": 5e-06,
      "loss": 0.7121,
      "step": 60
    },
    {
      "epoch": 0.2218700475435816,
      "grad_norm": 0.684958377999424,
      "learning_rate": 5e-06,
      "loss": 0.6997,
      "step": 70
    },
    {
      "epoch": 0.25356576862123614,
      "grad_norm": 0.5215403890931859,
      "learning_rate": 5e-06,
      "loss": 0.6898,
      "step": 80
    },
    {
      "epoch": 0.28526148969889065,
      "grad_norm": 0.4816858922190366,
      "learning_rate": 5e-06,
      "loss": 0.6867,
      "step": 90
    },
    {
      "epoch": 0.31695721077654515,
      "grad_norm": 0.7154671815551356,
      "learning_rate": 5e-06,
      "loss": 0.6748,
      "step": 100
    },
    {
      "epoch": 0.3486529318541997,
      "grad_norm": 0.4058042716028271,
      "learning_rate": 5e-06,
      "loss": 0.6765,
      "step": 110
    },
    {
      "epoch": 0.3803486529318542,
      "grad_norm": 0.39453817862211543,
      "learning_rate": 5e-06,
      "loss": 0.6734,
      "step": 120
    },
    {
      "epoch": 0.4120443740095087,
      "grad_norm": 0.6699185012363967,
      "learning_rate": 5e-06,
      "loss": 0.6736,
      "step": 130
    },
    {
      "epoch": 0.4437400950871632,
      "grad_norm": 0.7047289653585949,
      "learning_rate": 5e-06,
      "loss": 0.6675,
      "step": 140
    },
    {
      "epoch": 0.4754358161648177,
      "grad_norm": 0.5651557636719974,
      "learning_rate": 5e-06,
      "loss": 0.67,
      "step": 150
    },
    {
      "epoch": 0.5071315372424723,
      "grad_norm": 0.7485262569132396,
      "learning_rate": 5e-06,
      "loss": 0.6702,
      "step": 160
    },
    {
      "epoch": 0.5388272583201268,
      "grad_norm": 0.5937798065160911,
      "learning_rate": 5e-06,
      "loss": 0.6684,
      "step": 170
    },
    {
      "epoch": 0.5705229793977813,
      "grad_norm": 0.42087472284785377,
      "learning_rate": 5e-06,
      "loss": 0.6696,
      "step": 180
    },
    {
      "epoch": 0.6022187004754358,
      "grad_norm": 0.6838376914191113,
      "learning_rate": 5e-06,
      "loss": 0.6736,
      "step": 190
    },
    {
      "epoch": 0.6339144215530903,
      "grad_norm": 0.6779376025706788,
      "learning_rate": 5e-06,
      "loss": 0.6568,
      "step": 200
    },
    {
      "epoch": 0.6656101426307448,
      "grad_norm": 0.5461606311887525,
      "learning_rate": 5e-06,
      "loss": 0.6574,
      "step": 210
    },
    {
      "epoch": 0.6973058637083994,
      "grad_norm": 0.4896365938443348,
      "learning_rate": 5e-06,
      "loss": 0.6661,
      "step": 220
    },
    {
      "epoch": 0.7290015847860539,
      "grad_norm": 0.5061952793250585,
      "learning_rate": 5e-06,
      "loss": 0.6568,
      "step": 230
    },
    {
      "epoch": 0.7606973058637084,
      "grad_norm": 0.5463009018381003,
      "learning_rate": 5e-06,
      "loss": 0.6542,
      "step": 240
    },
    {
      "epoch": 0.7923930269413629,
      "grad_norm": 0.4203063093835648,
      "learning_rate": 5e-06,
      "loss": 0.6528,
      "step": 250
    },
    {
      "epoch": 0.8240887480190174,
      "grad_norm": 0.5364831104473663,
      "learning_rate": 5e-06,
      "loss": 0.6596,
      "step": 260
    },
    {
      "epoch": 0.8557844690966719,
      "grad_norm": 0.5836930133680968,
      "learning_rate": 5e-06,
      "loss": 0.6588,
      "step": 270
    },
    {
      "epoch": 0.8874801901743264,
      "grad_norm": 0.4587582257779609,
      "learning_rate": 5e-06,
      "loss": 0.6569,
      "step": 280
    },
    {
      "epoch": 0.919175911251981,
      "grad_norm": 0.5211716841949762,
      "learning_rate": 5e-06,
      "loss": 0.6483,
      "step": 290
    },
    {
      "epoch": 0.9508716323296355,
      "grad_norm": 0.6487858529916467,
      "learning_rate": 5e-06,
      "loss": 0.6492,
      "step": 300
    },
    {
      "epoch": 0.9825673534072901,
      "grad_norm": 0.4204562899057241,
      "learning_rate": 5e-06,
      "loss": 0.6466,
      "step": 310
    },
    {
      "epoch": 0.9984152139461173,
      "eval_loss": 0.6489410996437073,
      "eval_runtime": 170.8766,
      "eval_samples_per_second": 49.72,
      "eval_steps_per_second": 0.392,
      "step": 315
    },
    {
      "epoch": 1.0142630744849446,
      "grad_norm": 0.7909658922802345,
      "learning_rate": 5e-06,
      "loss": 0.6309,
      "step": 320
    },
    {
      "epoch": 1.045958795562599,
      "grad_norm": 0.5926743835661953,
      "learning_rate": 5e-06,
      "loss": 0.6133,
      "step": 330
    },
    {
      "epoch": 1.0776545166402536,
      "grad_norm": 0.5086080623346543,
      "learning_rate": 5e-06,
      "loss": 0.6184,
      "step": 340
    },
    {
      "epoch": 1.109350237717908,
      "grad_norm": 0.5244697888828465,
      "learning_rate": 5e-06,
      "loss": 0.6175,
      "step": 350
    },
    {
      "epoch": 1.1410459587955626,
      "grad_norm": 0.5658308922949424,
      "learning_rate": 5e-06,
      "loss": 0.6194,
      "step": 360
    },
    {
      "epoch": 1.172741679873217,
      "grad_norm": 0.503634416974372,
      "learning_rate": 5e-06,
      "loss": 0.6139,
      "step": 370
    },
    {
      "epoch": 1.2044374009508716,
      "grad_norm": 0.5134026739487886,
      "learning_rate": 5e-06,
      "loss": 0.6145,
      "step": 380
    },
    {
      "epoch": 1.236133122028526,
      "grad_norm": 0.6504374930809814,
      "learning_rate": 5e-06,
      "loss": 0.6106,
      "step": 390
    },
    {
      "epoch": 1.2678288431061806,
      "grad_norm": 0.5488394685032496,
      "learning_rate": 5e-06,
      "loss": 0.6178,
      "step": 400
    },
    {
      "epoch": 1.299524564183835,
      "grad_norm": 0.5637848064827178,
      "learning_rate": 5e-06,
      "loss": 0.6139,
      "step": 410
    },
    {
      "epoch": 1.3312202852614896,
      "grad_norm": 0.4929677114414056,
      "learning_rate": 5e-06,
      "loss": 0.6115,
      "step": 420
    },
    {
      "epoch": 1.3629160063391441,
      "grad_norm": 0.5260476600034447,
      "learning_rate": 5e-06,
      "loss": 0.6097,
      "step": 430
    },
    {
      "epoch": 1.3946117274167986,
      "grad_norm": 0.44727265179318326,
      "learning_rate": 5e-06,
      "loss": 0.6092,
      "step": 440
    },
    {
      "epoch": 1.4263074484944531,
      "grad_norm": 0.601618414738924,
      "learning_rate": 5e-06,
      "loss": 0.6163,
      "step": 450
    },
    {
      "epoch": 1.4580031695721076,
      "grad_norm": 0.49038888044450907,
      "learning_rate": 5e-06,
      "loss": 0.6095,
      "step": 460
    },
    {
      "epoch": 1.4896988906497624,
      "grad_norm": 0.5097913207122167,
      "learning_rate": 5e-06,
      "loss": 0.6136,
      "step": 470
    },
    {
      "epoch": 1.5213946117274166,
      "grad_norm": 0.6535026695551385,
      "learning_rate": 5e-06,
      "loss": 0.6131,
      "step": 480
    },
    {
      "epoch": 1.5530903328050714,
      "grad_norm": 0.5478546695070423,
      "learning_rate": 5e-06,
      "loss": 0.6117,
      "step": 490
    },
    {
      "epoch": 1.5847860538827259,
      "grad_norm": 0.47440853445302006,
      "learning_rate": 5e-06,
      "loss": 0.6275,
      "step": 500
    },
    {
      "epoch": 1.6164817749603804,
      "grad_norm": 0.47242075964062596,
      "learning_rate": 5e-06,
      "loss": 0.6185,
      "step": 510
    },
    {
      "epoch": 1.6481774960380349,
      "grad_norm": 0.4932550522259784,
      "learning_rate": 5e-06,
      "loss": 0.6188,
      "step": 520
    },
    {
      "epoch": 1.6798732171156894,
      "grad_norm": 0.49155732750425185,
      "learning_rate": 5e-06,
      "loss": 0.618,
      "step": 530
    },
    {
      "epoch": 1.7115689381933439,
      "grad_norm": 0.3852759456359732,
      "learning_rate": 5e-06,
      "loss": 0.6108,
      "step": 540
    },
    {
      "epoch": 1.7432646592709984,
      "grad_norm": 0.45204407263371044,
      "learning_rate": 5e-06,
      "loss": 0.6128,
      "step": 550
    },
    {
      "epoch": 1.7749603803486529,
      "grad_norm": 0.45233482856711393,
      "learning_rate": 5e-06,
      "loss": 0.6129,
      "step": 560
    },
    {
      "epoch": 1.8066561014263076,
      "grad_norm": 0.40723601066321924,
      "learning_rate": 5e-06,
      "loss": 0.6141,
      "step": 570
    },
    {
      "epoch": 1.8383518225039621,
      "grad_norm": 0.49021896910219537,
      "learning_rate": 5e-06,
      "loss": 0.6048,
      "step": 580
    },
    {
      "epoch": 1.8700475435816166,
      "grad_norm": 0.4137149099162348,
      "learning_rate": 5e-06,
      "loss": 0.6069,
      "step": 590
    },
    {
      "epoch": 1.9017432646592711,
      "grad_norm": 0.43600015726361796,
      "learning_rate": 5e-06,
      "loss": 0.6127,
      "step": 600
    },
    {
      "epoch": 1.9334389857369256,
      "grad_norm": 0.5983745339259906,
      "learning_rate": 5e-06,
      "loss": 0.6119,
      "step": 610
    },
    {
      "epoch": 1.9651347068145801,
      "grad_norm": 0.44191449750213707,
      "learning_rate": 5e-06,
      "loss": 0.6105,
      "step": 620
    },
    {
      "epoch": 1.9968304278922346,
      "grad_norm": 0.41833944166046033,
      "learning_rate": 5e-06,
      "loss": 0.612,
      "step": 630
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.6384235620498657,
      "eval_runtime": 172.0031,
      "eval_samples_per_second": 49.394,
      "eval_steps_per_second": 0.39,
      "step": 631
    },
    {
      "epoch": 2.028526148969889,
      "grad_norm": 0.502856771702954,
      "learning_rate": 5e-06,
      "loss": 0.5748,
      "step": 640
    },
    {
      "epoch": 2.0602218700475436,
      "grad_norm": 0.47143025065214356,
      "learning_rate": 5e-06,
      "loss": 0.573,
      "step": 650
    },
    {
      "epoch": 2.091917591125198,
      "grad_norm": 0.4378418090102208,
      "learning_rate": 5e-06,
      "loss": 0.573,
      "step": 660
    },
    {
      "epoch": 2.1236133122028527,
      "grad_norm": 0.6058133875665858,
      "learning_rate": 5e-06,
      "loss": 0.5783,
      "step": 670
    },
    {
      "epoch": 2.155309033280507,
      "grad_norm": 0.47371009100624967,
      "learning_rate": 5e-06,
      "loss": 0.5698,
      "step": 680
    },
    {
      "epoch": 2.1870047543581617,
      "grad_norm": 0.4991361347258838,
      "learning_rate": 5e-06,
      "loss": 0.5732,
      "step": 690
    },
    {
      "epoch": 2.218700475435816,
      "grad_norm": 0.5359340720470293,
      "learning_rate": 5e-06,
      "loss": 0.5699,
      "step": 700
    },
    {
      "epoch": 2.2503961965134707,
      "grad_norm": 0.4893107425446941,
      "learning_rate": 5e-06,
      "loss": 0.5751,
      "step": 710
    },
    {
      "epoch": 2.282091917591125,
      "grad_norm": 0.44259860353438574,
      "learning_rate": 5e-06,
      "loss": 0.57,
      "step": 720
    },
    {
      "epoch": 2.3137876386687797,
      "grad_norm": 0.7444991766460172,
      "learning_rate": 5e-06,
      "loss": 0.5714,
      "step": 730
    },
    {
      "epoch": 2.345483359746434,
      "grad_norm": 0.39951135529059456,
      "learning_rate": 5e-06,
      "loss": 0.577,
      "step": 740
    },
    {
      "epoch": 2.3771790808240887,
      "grad_norm": 0.4742761127635493,
      "learning_rate": 5e-06,
      "loss": 0.5782,
      "step": 750
    },
    {
      "epoch": 2.408874801901743,
      "grad_norm": 0.585898905502293,
      "learning_rate": 5e-06,
      "loss": 0.5755,
      "step": 760
    },
    {
      "epoch": 2.4405705229793977,
      "grad_norm": 0.6170117233151285,
      "learning_rate": 5e-06,
      "loss": 0.5752,
      "step": 770
    },
    {
      "epoch": 2.472266244057052,
      "grad_norm": 0.45913749051577313,
      "learning_rate": 5e-06,
      "loss": 0.5723,
      "step": 780
    },
    {
      "epoch": 2.5039619651347067,
      "grad_norm": 0.4584152935889332,
      "learning_rate": 5e-06,
      "loss": 0.5679,
      "step": 790
    },
    {
      "epoch": 2.535657686212361,
      "grad_norm": 0.42856693615532904,
      "learning_rate": 5e-06,
      "loss": 0.5744,
      "step": 800
    },
    {
      "epoch": 2.5673534072900157,
      "grad_norm": 0.5273756256360088,
      "learning_rate": 5e-06,
      "loss": 0.5794,
      "step": 810
    },
    {
      "epoch": 2.59904912836767,
      "grad_norm": 0.46899833087451076,
      "learning_rate": 5e-06,
      "loss": 0.5709,
      "step": 820
    },
    {
      "epoch": 2.6307448494453247,
      "grad_norm": 0.484076993070708,
      "learning_rate": 5e-06,
      "loss": 0.5759,
      "step": 830
    },
    {
      "epoch": 2.662440570522979,
      "grad_norm": 0.4349705497114564,
      "learning_rate": 5e-06,
      "loss": 0.5752,
      "step": 840
    },
    {
      "epoch": 2.6941362916006337,
      "grad_norm": 0.5194459175036173,
      "learning_rate": 5e-06,
      "loss": 0.5724,
      "step": 850
    },
    {
      "epoch": 2.7258320126782882,
      "grad_norm": 0.5337441754498483,
      "learning_rate": 5e-06,
      "loss": 0.574,
      "step": 860
    },
    {
      "epoch": 2.7575277337559427,
      "grad_norm": 0.5009650532937617,
      "learning_rate": 5e-06,
      "loss": 0.5772,
      "step": 870
    },
    {
      "epoch": 2.7892234548335972,
      "grad_norm": 0.46031841411878416,
      "learning_rate": 5e-06,
      "loss": 0.5817,
      "step": 880
    },
    {
      "epoch": 2.8209191759112517,
      "grad_norm": 0.5673120105732073,
      "learning_rate": 5e-06,
      "loss": 0.5808,
      "step": 890
    },
    {
      "epoch": 2.8526148969889062,
      "grad_norm": 0.614359093779175,
      "learning_rate": 5e-06,
      "loss": 0.5801,
      "step": 900
    },
    {
      "epoch": 2.8843106180665607,
      "grad_norm": 0.5019306735450663,
      "learning_rate": 5e-06,
      "loss": 0.5813,
      "step": 910
    },
    {
      "epoch": 2.9160063391442153,
      "grad_norm": 0.43396332531042875,
      "learning_rate": 5e-06,
      "loss": 0.576,
      "step": 920
    },
    {
      "epoch": 2.94770206022187,
      "grad_norm": 0.4457359128495874,
      "learning_rate": 5e-06,
      "loss": 0.5798,
      "step": 930
    },
    {
      "epoch": 2.9793977812995247,
      "grad_norm": 0.4575875895533821,
      "learning_rate": 5e-06,
      "loss": 0.5816,
      "step": 940
    },
    {
      "epoch": 2.995245641838352,
      "eval_loss": 0.637653648853302,
      "eval_runtime": 171.7653,
      "eval_samples_per_second": 49.463,
      "eval_steps_per_second": 0.39,
      "step": 945
    },
    {
      "epoch": 2.995245641838352,
      "step": 945,
      "total_flos": 1582491437629440.0,
      "train_loss": 0.6238129633444327,
      "train_runtime": 28593.492,
      "train_samples_per_second": 16.935,
      "train_steps_per_second": 0.033
    }
  ],
"logging_steps": 10, |
|
"max_steps": 945, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1582491437629440.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |