{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 951,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.031545741324921134,
      "grad_norm": 4.99587251526554,
      "learning_rate": 5e-06,
      "loss": 0.7944,
      "step": 10
    },
    {
      "epoch": 0.06309148264984227,
      "grad_norm": 4.271464380765346,
      "learning_rate": 5e-06,
      "loss": 0.6536,
      "step": 20
    },
    {
      "epoch": 0.0946372239747634,
      "grad_norm": 1.1461490048117564,
      "learning_rate": 5e-06,
      "loss": 0.6254,
      "step": 30
    },
    {
      "epoch": 0.12618296529968454,
      "grad_norm": 0.981971583786276,
      "learning_rate": 5e-06,
      "loss": 0.6012,
      "step": 40
    },
    {
      "epoch": 0.15772870662460567,
      "grad_norm": 0.9522065058873813,
      "learning_rate": 5e-06,
      "loss": 0.573,
      "step": 50
    },
    {
      "epoch": 0.1892744479495268,
      "grad_norm": 0.8622864722897896,
      "learning_rate": 5e-06,
      "loss": 0.5621,
      "step": 60
    },
    {
      "epoch": 0.22082018927444794,
      "grad_norm": 0.6621715224692672,
      "learning_rate": 5e-06,
      "loss": 0.5434,
      "step": 70
    },
    {
      "epoch": 0.25236593059936907,
      "grad_norm": 0.5858137328472353,
      "learning_rate": 5e-06,
      "loss": 0.5307,
      "step": 80
    },
    {
      "epoch": 0.28391167192429023,
      "grad_norm": 0.6162490995497564,
      "learning_rate": 5e-06,
      "loss": 0.5299,
      "step": 90
    },
    {
      "epoch": 0.31545741324921134,
      "grad_norm": 0.7263871575915639,
      "learning_rate": 5e-06,
      "loss": 0.5234,
      "step": 100
    },
    {
      "epoch": 0.3470031545741325,
      "grad_norm": 0.7173526345718089,
      "learning_rate": 5e-06,
      "loss": 0.5149,
      "step": 110
    },
    {
      "epoch": 0.3785488958990536,
      "grad_norm": 0.5402852521407366,
      "learning_rate": 5e-06,
      "loss": 0.5158,
      "step": 120
    },
    {
      "epoch": 0.41009463722397477,
      "grad_norm": 0.5272811866546335,
      "learning_rate": 5e-06,
      "loss": 0.5101,
      "step": 130
    },
    {
      "epoch": 0.4416403785488959,
      "grad_norm": 0.8081780735356934,
      "learning_rate": 5e-06,
      "loss": 0.5067,
      "step": 140
    },
    {
      "epoch": 0.47318611987381703,
      "grad_norm": 0.7298094681047617,
      "learning_rate": 5e-06,
      "loss": 0.5042,
      "step": 150
    },
    {
      "epoch": 0.5047318611987381,
      "grad_norm": 0.5246448731879264,
      "learning_rate": 5e-06,
      "loss": 0.5022,
      "step": 160
    },
    {
      "epoch": 0.5362776025236593,
      "grad_norm": 0.642083232942298,
      "learning_rate": 5e-06,
      "loss": 0.5002,
      "step": 170
    },
    {
      "epoch": 0.5678233438485805,
      "grad_norm": 0.6059370520825695,
      "learning_rate": 5e-06,
      "loss": 0.4969,
      "step": 180
    },
    {
      "epoch": 0.5993690851735016,
      "grad_norm": 0.48161916454770276,
      "learning_rate": 5e-06,
      "loss": 0.4953,
      "step": 190
    },
    {
      "epoch": 0.6309148264984227,
      "grad_norm": 0.6810369029107554,
      "learning_rate": 5e-06,
      "loss": 0.4893,
      "step": 200
    },
    {
      "epoch": 0.6624605678233438,
      "grad_norm": 0.5503177401173632,
      "learning_rate": 5e-06,
      "loss": 0.4897,
      "step": 210
    },
    {
      "epoch": 0.694006309148265,
      "grad_norm": 0.6046372979618468,
      "learning_rate": 5e-06,
      "loss": 0.4874,
      "step": 220
    },
    {
      "epoch": 0.7255520504731862,
      "grad_norm": 0.5662755567010622,
      "learning_rate": 5e-06,
      "loss": 0.4846,
      "step": 230
    },
    {
      "epoch": 0.7570977917981072,
      "grad_norm": 0.4846646736673193,
      "learning_rate": 5e-06,
      "loss": 0.4863,
      "step": 240
    },
    {
      "epoch": 0.7886435331230284,
      "grad_norm": 0.4875951863435466,
      "learning_rate": 5e-06,
      "loss": 0.4909,
      "step": 250
    },
    {
      "epoch": 0.8201892744479495,
      "grad_norm": 0.5146296136980277,
      "learning_rate": 5e-06,
      "loss": 0.4826,
      "step": 260
    },
    {
      "epoch": 0.8517350157728707,
      "grad_norm": 0.4654893350187648,
      "learning_rate": 5e-06,
      "loss": 0.4722,
      "step": 270
    },
    {
      "epoch": 0.8832807570977917,
      "grad_norm": 0.4681820584043545,
      "learning_rate": 5e-06,
      "loss": 0.4787,
      "step": 280
    },
    {
      "epoch": 0.9148264984227129,
      "grad_norm": 0.4735053003732251,
      "learning_rate": 5e-06,
      "loss": 0.4769,
      "step": 290
    },
    {
      "epoch": 0.9463722397476341,
      "grad_norm": 0.5851272182332038,
      "learning_rate": 5e-06,
      "loss": 0.479,
      "step": 300
    },
    {
      "epoch": 0.9779179810725552,
      "grad_norm": 0.6125017656961282,
      "learning_rate": 5e-06,
      "loss": 0.4745,
      "step": 310
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.47562769055366516,
      "eval_runtime": 170.473,
      "eval_samples_per_second": 50.084,
      "eval_steps_per_second": 0.393,
      "step": 317
    },
    {
      "epoch": 1.0094637223974763,
      "grad_norm": 0.7703225863849302,
      "learning_rate": 5e-06,
      "loss": 0.4658,
      "step": 320
    },
    {
      "epoch": 1.0410094637223974,
      "grad_norm": 0.5658238656624024,
      "learning_rate": 5e-06,
      "loss": 0.4369,
      "step": 330
    },
    {
      "epoch": 1.0725552050473186,
      "grad_norm": 0.6051214020937586,
      "learning_rate": 5e-06,
      "loss": 0.4401,
      "step": 340
    },
    {
      "epoch": 1.1041009463722398,
      "grad_norm": 0.7172129607524986,
      "learning_rate": 5e-06,
      "loss": 0.4359,
      "step": 350
    },
    {
      "epoch": 1.135646687697161,
      "grad_norm": 0.5578022584419556,
      "learning_rate": 5e-06,
      "loss": 0.4349,
      "step": 360
    },
    {
      "epoch": 1.167192429022082,
      "grad_norm": 0.49543679997471873,
      "learning_rate": 5e-06,
      "loss": 0.4394,
      "step": 370
    },
    {
      "epoch": 1.1987381703470033,
      "grad_norm": 0.6068336570614361,
      "learning_rate": 5e-06,
      "loss": 0.4383,
      "step": 380
    },
    {
      "epoch": 1.2302839116719242,
      "grad_norm": 0.5054022564611206,
      "learning_rate": 5e-06,
      "loss": 0.4311,
      "step": 390
    },
    {
      "epoch": 1.2618296529968454,
      "grad_norm": 0.5339101451875964,
      "learning_rate": 5e-06,
      "loss": 0.4313,
      "step": 400
    },
    {
      "epoch": 1.2933753943217665,
      "grad_norm": 0.4854999161353839,
      "learning_rate": 5e-06,
      "loss": 0.4337,
      "step": 410
    },
    {
      "epoch": 1.3249211356466877,
      "grad_norm": 0.4776808074651913,
      "learning_rate": 5e-06,
      "loss": 0.4284,
      "step": 420
    },
    {
      "epoch": 1.3564668769716088,
      "grad_norm": 0.571848423035293,
      "learning_rate": 5e-06,
      "loss": 0.4333,
      "step": 430
    },
    {
      "epoch": 1.38801261829653,
      "grad_norm": 0.5673583129436841,
      "learning_rate": 5e-06,
      "loss": 0.4323,
      "step": 440
    },
    {
      "epoch": 1.4195583596214512,
      "grad_norm": 0.5703308592011652,
      "learning_rate": 5e-06,
      "loss": 0.4284,
      "step": 450
    },
    {
      "epoch": 1.4511041009463723,
      "grad_norm": 0.5235306886956671,
      "learning_rate": 5e-06,
      "loss": 0.4285,
      "step": 460
    },
    {
      "epoch": 1.4826498422712935,
      "grad_norm": 0.5621748063203227,
      "learning_rate": 5e-06,
      "loss": 0.4256,
      "step": 470
    },
    {
      "epoch": 1.5141955835962144,
      "grad_norm": 0.5709758620722418,
      "learning_rate": 5e-06,
      "loss": 0.4229,
      "step": 480
    },
    {
      "epoch": 1.5457413249211358,
      "grad_norm": 0.7440083314774995,
      "learning_rate": 5e-06,
      "loss": 0.4223,
      "step": 490
    },
    {
      "epoch": 1.5772870662460567,
      "grad_norm": 0.5274171243956395,
      "learning_rate": 5e-06,
      "loss": 0.4203,
      "step": 500
    },
    {
      "epoch": 1.608832807570978,
      "grad_norm": 0.5573889566155682,
      "learning_rate": 5e-06,
      "loss": 0.4246,
      "step": 510
    },
    {
      "epoch": 1.640378548895899,
      "grad_norm": 0.5089409762486478,
      "learning_rate": 5e-06,
      "loss": 0.422,
      "step": 520
    },
    {
      "epoch": 1.6719242902208202,
      "grad_norm": 0.5487128479946805,
      "learning_rate": 5e-06,
      "loss": 0.4327,
      "step": 530
    },
    {
      "epoch": 1.7034700315457414,
      "grad_norm": 0.6447340995233556,
      "learning_rate": 5e-06,
      "loss": 0.4223,
      "step": 540
    },
    {
      "epoch": 1.7350157728706623,
      "grad_norm": 0.4778520303774798,
      "learning_rate": 5e-06,
      "loss": 0.4249,
      "step": 550
    },
    {
      "epoch": 1.7665615141955837,
      "grad_norm": 0.5277561759622408,
      "learning_rate": 5e-06,
      "loss": 0.4231,
      "step": 560
    },
    {
      "epoch": 1.7981072555205047,
      "grad_norm": 0.507228211837066,
      "learning_rate": 5e-06,
      "loss": 0.4208,
      "step": 570
    },
    {
      "epoch": 1.8296529968454258,
      "grad_norm": 0.573778603365111,
      "learning_rate": 5e-06,
      "loss": 0.4246,
      "step": 580
    },
    {
      "epoch": 1.861198738170347,
      "grad_norm": 0.554076788906782,
      "learning_rate": 5e-06,
      "loss": 0.4178,
      "step": 590
    },
    {
      "epoch": 1.8927444794952681,
      "grad_norm": 0.48057226381097573,
      "learning_rate": 5e-06,
      "loss": 0.4225,
      "step": 600
    },
    {
      "epoch": 1.9242902208201893,
      "grad_norm": 0.5626542258956102,
      "learning_rate": 5e-06,
      "loss": 0.4241,
      "step": 610
    },
    {
      "epoch": 1.9558359621451105,
      "grad_norm": 0.49935383563605856,
      "learning_rate": 5e-06,
      "loss": 0.4199,
      "step": 620
    },
    {
      "epoch": 1.9873817034700316,
      "grad_norm": 0.5226498960785609,
      "learning_rate": 5e-06,
      "loss": 0.4181,
      "step": 630
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.44654515385627747,
      "eval_runtime": 170.7254,
      "eval_samples_per_second": 50.01,
      "eval_steps_per_second": 0.392,
      "step": 634
    },
    {
      "epoch": 2.0189274447949526,
      "grad_norm": 0.6088532387627132,
      "learning_rate": 5e-06,
      "loss": 0.3953,
      "step": 640
    },
    {
      "epoch": 2.050473186119874,
      "grad_norm": 0.6170163085689537,
      "learning_rate": 5e-06,
      "loss": 0.38,
      "step": 650
    },
    {
      "epoch": 2.082018927444795,
      "grad_norm": 0.6044774577568809,
      "learning_rate": 5e-06,
      "loss": 0.3821,
      "step": 660
    },
    {
      "epoch": 2.1135646687697163,
      "grad_norm": 0.7001653617975729,
      "learning_rate": 5e-06,
      "loss": 0.386,
      "step": 670
    },
    {
      "epoch": 2.145110410094637,
      "grad_norm": 0.5560077663575217,
      "learning_rate": 5e-06,
      "loss": 0.3787,
      "step": 680
    },
    {
      "epoch": 2.176656151419558,
      "grad_norm": 0.5213047969133517,
      "learning_rate": 5e-06,
      "loss": 0.3838,
      "step": 690
    },
    {
      "epoch": 2.2082018927444795,
      "grad_norm": 0.5147380409958507,
      "learning_rate": 5e-06,
      "loss": 0.3851,
      "step": 700
    },
    {
      "epoch": 2.2397476340694005,
      "grad_norm": 0.5497461937255633,
      "learning_rate": 5e-06,
      "loss": 0.3864,
      "step": 710
    },
    {
      "epoch": 2.271293375394322,
      "grad_norm": 0.620646304852149,
      "learning_rate": 5e-06,
      "loss": 0.3814,
      "step": 720
    },
    {
      "epoch": 2.302839116719243,
      "grad_norm": 0.6036500927422149,
      "learning_rate": 5e-06,
      "loss": 0.3776,
      "step": 730
    },
    {
      "epoch": 2.334384858044164,
      "grad_norm": 0.5481492981732653,
      "learning_rate": 5e-06,
      "loss": 0.3858,
      "step": 740
    },
    {
      "epoch": 2.365930599369085,
      "grad_norm": 0.5654790601381019,
      "learning_rate": 5e-06,
      "loss": 0.3824,
      "step": 750
    },
    {
      "epoch": 2.3974763406940065,
      "grad_norm": 0.5811999163309192,
      "learning_rate": 5e-06,
      "loss": 0.3831,
      "step": 760
    },
    {
      "epoch": 2.4290220820189274,
      "grad_norm": 0.5244470729272699,
      "learning_rate": 5e-06,
      "loss": 0.3776,
      "step": 770
    },
    {
      "epoch": 2.4605678233438484,
      "grad_norm": 0.48851134130848284,
      "learning_rate": 5e-06,
      "loss": 0.3791,
      "step": 780
    },
    {
      "epoch": 2.4921135646687698,
      "grad_norm": 0.5371797697436069,
      "learning_rate": 5e-06,
      "loss": 0.3797,
      "step": 790
    },
    {
      "epoch": 2.5236593059936907,
      "grad_norm": 0.49066553328169765,
      "learning_rate": 5e-06,
      "loss": 0.3815,
      "step": 800
    },
    {
      "epoch": 2.555205047318612,
      "grad_norm": 0.5984827997828263,
      "learning_rate": 5e-06,
      "loss": 0.3756,
      "step": 810
    },
    {
      "epoch": 2.586750788643533,
      "grad_norm": 0.6179347432660328,
      "learning_rate": 5e-06,
      "loss": 0.3806,
      "step": 820
    },
    {
      "epoch": 2.6182965299684544,
      "grad_norm": 0.6394278462521181,
      "learning_rate": 5e-06,
      "loss": 0.3834,
      "step": 830
    },
    {
      "epoch": 2.6498422712933754,
      "grad_norm": 0.579789988236478,
      "learning_rate": 5e-06,
      "loss": 0.3869,
      "step": 840
    },
    {
      "epoch": 2.6813880126182967,
      "grad_norm": 0.5366527627741544,
      "learning_rate": 5e-06,
      "loss": 0.3848,
      "step": 850
    },
    {
      "epoch": 2.7129337539432177,
      "grad_norm": 0.6067239997780901,
      "learning_rate": 5e-06,
      "loss": 0.3768,
      "step": 860
    },
    {
      "epoch": 2.7444794952681386,
      "grad_norm": 0.5441160362260907,
      "learning_rate": 5e-06,
      "loss": 0.3883,
      "step": 870
    },
    {
      "epoch": 2.77602523659306,
      "grad_norm": 0.5231719514823525,
      "learning_rate": 5e-06,
      "loss": 0.3795,
      "step": 880
    },
    {
      "epoch": 2.807570977917981,
      "grad_norm": 0.6141826745417515,
      "learning_rate": 5e-06,
      "loss": 0.3771,
      "step": 890
    },
    {
      "epoch": 2.8391167192429023,
      "grad_norm": 0.542559242674366,
      "learning_rate": 5e-06,
      "loss": 0.3782,
      "step": 900
    },
    {
      "epoch": 2.8706624605678233,
      "grad_norm": 0.5043528521109601,
      "learning_rate": 5e-06,
      "loss": 0.3779,
      "step": 910
    },
    {
      "epoch": 2.9022082018927446,
      "grad_norm": 0.5168292506779991,
      "learning_rate": 5e-06,
      "loss": 0.3782,
      "step": 920
    },
    {
      "epoch": 2.9337539432176656,
      "grad_norm": 0.556675998633135,
      "learning_rate": 5e-06,
      "loss": 0.3802,
      "step": 930
    },
    {
      "epoch": 2.965299684542587,
      "grad_norm": 0.5534284489693292,
      "learning_rate": 5e-06,
      "loss": 0.3806,
      "step": 940
    },
    {
      "epoch": 2.996845425867508,
      "grad_norm": 0.5746396382994269,
      "learning_rate": 5e-06,
      "loss": 0.3794,
      "step": 950
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.4385061264038086,
      "eval_runtime": 170.7773,
      "eval_samples_per_second": 49.995,
      "eval_steps_per_second": 0.392,
      "step": 951
    },
    {
      "epoch": 3.0,
      "step": 951,
      "total_flos": 1592541661102080.0,
      "train_loss": 0.4442997165220142,
      "train_runtime": 28675.07,
      "train_samples_per_second": 16.971,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 10,
  "max_steps": 951,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1592541661102080.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}