|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9993044284720614, |
|
"eval_steps": 500, |
|
"global_step": 1617, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.018548574078367727, |
|
"grad_norm": 2.947240343866928, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9092, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.037097148156735454, |
|
"grad_norm": 4.807938505196804, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8033, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.055645722235103175, |
|
"grad_norm": 3.003116346754329, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7597, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07419429631347091, |
|
"grad_norm": 1.4718449165697798, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7382, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09274287039183862, |
|
"grad_norm": 1.5160653638400317, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7179, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.11129144447020635, |
|
"grad_norm": 1.2376263065355404, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6967, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.12984001854857408, |
|
"grad_norm": 1.2615134334932208, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6892, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.14838859262694182, |
|
"grad_norm": 0.6297064773701695, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6645, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.16693716670530953, |
|
"grad_norm": 0.8430227369470438, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6739, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.18548574078367724, |
|
"grad_norm": 0.5602106795501006, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6596, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.20403431486204499, |
|
"grad_norm": 0.5414728553841807, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6419, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2225828889404127, |
|
"grad_norm": 0.581237350923138, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6475, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.24113146301878044, |
|
"grad_norm": 0.9110329106881332, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6394, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.25968003709714815, |
|
"grad_norm": 0.7117787694171418, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6349, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2782286111755159, |
|
"grad_norm": 0.9097641559079228, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6307, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.29677718525388364, |
|
"grad_norm": 0.925373877659069, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6221, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3153257593322513, |
|
"grad_norm": 0.56437914746603, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6321, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.33387433341061906, |
|
"grad_norm": 0.8197823974891302, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6306, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.3524229074889868, |
|
"grad_norm": 0.7402190785036176, |
|
"learning_rate": 5e-06, |
|
"loss": 0.632, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3709714815673545, |
|
"grad_norm": 0.4858062774844838, |
|
"learning_rate": 5e-06, |
|
"loss": 0.633, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.38952005564572223, |
|
"grad_norm": 0.5126564143958117, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6406, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.40806862972408997, |
|
"grad_norm": 0.48094170267698244, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6246, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.4266172038024577, |
|
"grad_norm": 0.6524447128350448, |
|
"learning_rate": 5e-06, |
|
"loss": 0.607, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.4451657778808254, |
|
"grad_norm": 0.5518493275013766, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6243, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.46371435195919314, |
|
"grad_norm": 0.6558097677670804, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6171, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.4822629260375609, |
|
"grad_norm": 0.7137291594764235, |
|
"learning_rate": 5e-06, |
|
"loss": 0.606, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5008115001159286, |
|
"grad_norm": 0.5275248289780116, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6102, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5193600741942963, |
|
"grad_norm": 0.8348067083552699, |
|
"learning_rate": 5e-06, |
|
"loss": 0.616, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.537908648272664, |
|
"grad_norm": 0.6106750334627583, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6142, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5564572223510318, |
|
"grad_norm": 0.6769490656578356, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6079, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5750057964293995, |
|
"grad_norm": 0.9615740472605508, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6055, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5935543705077673, |
|
"grad_norm": 0.5778708733971128, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6028, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.6121029445861349, |
|
"grad_norm": 0.4960676395424829, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6086, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6306515186645026, |
|
"grad_norm": 0.5758649696479989, |
|
"learning_rate": 5e-06, |
|
"loss": 0.603, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6492000927428704, |
|
"grad_norm": 0.6291485309740135, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6058, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6677486668212381, |
|
"grad_norm": 0.6179917359269285, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5953, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6862972408996059, |
|
"grad_norm": 0.552294533130827, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6024, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.7048458149779736, |
|
"grad_norm": 0.5774022034655474, |
|
"learning_rate": 5e-06, |
|
"loss": 0.607, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7233943890563413, |
|
"grad_norm": 0.7214292693401807, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5974, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.741942963134709, |
|
"grad_norm": 0.6477577613968241, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5972, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7604915372130767, |
|
"grad_norm": 0.6340600580365108, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5816, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.7790401112914445, |
|
"grad_norm": 0.5680706241521782, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5757, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.7975886853698122, |
|
"grad_norm": 0.6389171777087581, |
|
"learning_rate": 5e-06, |
|
"loss": 0.595, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.8161372594481799, |
|
"grad_norm": 0.6107276360180472, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5936, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8346858335265477, |
|
"grad_norm": 0.5997875839762729, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5856, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.8532344076049154, |
|
"grad_norm": 0.5683771733599212, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5943, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.871782981683283, |
|
"grad_norm": 0.7010536508894063, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5833, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.8903315557616508, |
|
"grad_norm": 0.5908990476203293, |
|
"learning_rate": 5e-06, |
|
"loss": 0.599, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.9088801298400185, |
|
"grad_norm": 0.6071694127371104, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5768, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.9274287039183863, |
|
"grad_norm": 0.592858345014562, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5839, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.945977277996754, |
|
"grad_norm": 0.800429521567568, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5797, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.9645258520751218, |
|
"grad_norm": 0.7189609643326995, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5833, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.9830744261534895, |
|
"grad_norm": 0.6891202812176819, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5897, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.9997681428240204, |
|
"eval_loss": 0.5783212780952454, |
|
"eval_runtime": 568.0594, |
|
"eval_samples_per_second": 25.571, |
|
"eval_steps_per_second": 0.4, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 1.0016230002318571, |
|
"grad_norm": 1.508531876731451, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6241, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.0201715743102249, |
|
"grad_norm": 0.6966255000443538, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5375, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.0387201483885926, |
|
"grad_norm": 0.890924162169634, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5428, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.0572687224669604, |
|
"grad_norm": 0.8758158076398361, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5265, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.075817296545328, |
|
"grad_norm": 0.5756897797693913, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5341, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.0943658706236958, |
|
"grad_norm": 0.6406486451101234, |
|
"learning_rate": 5e-06, |
|
"loss": 0.525, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.1129144447020636, |
|
"grad_norm": 0.6180252143784474, |
|
"learning_rate": 5e-06, |
|
"loss": 0.538, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.1314630187804313, |
|
"grad_norm": 0.5607170765041661, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5298, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.150011592858799, |
|
"grad_norm": 0.5660990160046208, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5234, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.1685601669371668, |
|
"grad_norm": 0.7049943454448735, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5314, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.1871087410155345, |
|
"grad_norm": 0.6322757671168315, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5314, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.205657315093902, |
|
"grad_norm": 0.6315492733495492, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5359, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.2242058891722698, |
|
"grad_norm": 0.5824676673790568, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5314, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.2427544632506375, |
|
"grad_norm": 0.6054188119886005, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5305, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.2613030373290053, |
|
"grad_norm": 0.7111463099630733, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5427, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.279851611407373, |
|
"grad_norm": 0.5473676666520529, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5304, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.2984001854857408, |
|
"grad_norm": 0.5677624386493364, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5301, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.3169487595641085, |
|
"grad_norm": 0.5554522651946812, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5216, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.3354973336424762, |
|
"grad_norm": 0.5796555010460428, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5465, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.354045907720844, |
|
"grad_norm": 0.7811681513039581, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5432, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.3725944817992117, |
|
"grad_norm": 0.5405617867913124, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5318, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.3911430558775795, |
|
"grad_norm": 0.6158407644675764, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5282, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.4096916299559472, |
|
"grad_norm": 0.5535023339764817, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5204, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.428240204034315, |
|
"grad_norm": 0.5811059240915003, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5262, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.4467887781126825, |
|
"grad_norm": 0.5321304586809559, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5264, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.4653373521910504, |
|
"grad_norm": 0.5552151667844475, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5264, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.483885926269418, |
|
"grad_norm": 0.5361161307842528, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5268, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.502434500347786, |
|
"grad_norm": 0.547926650755314, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5301, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.5209830744261534, |
|
"grad_norm": 0.542860060621558, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5193, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.5395316485045212, |
|
"grad_norm": 0.5952263875330506, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5216, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.558080222582889, |
|
"grad_norm": 0.6060219873288911, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5259, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.5766287966612567, |
|
"grad_norm": 0.5526009181452085, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5241, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.5951773707396244, |
|
"grad_norm": 0.7655982897227043, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5267, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.6137259448179921, |
|
"grad_norm": 0.584300099468923, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5317, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.6322745188963599, |
|
"grad_norm": 0.643161799376352, |
|
"learning_rate": 5e-06, |
|
"loss": 0.527, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.6508230929747274, |
|
"grad_norm": 0.5972896345510548, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5194, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.6693716670530954, |
|
"grad_norm": 0.5653434512662245, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5206, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.6879202411314629, |
|
"grad_norm": 0.6357358361838954, |
|
"learning_rate": 5e-06, |
|
"loss": 0.515, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.7064688152098308, |
|
"grad_norm": 0.4604990510970151, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5304, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.7250173892881984, |
|
"grad_norm": 0.546451119916009, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5228, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.7435659633665663, |
|
"grad_norm": 0.6039459271117709, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5066, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.7621145374449338, |
|
"grad_norm": 0.5296235201454201, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5219, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.7806631115233018, |
|
"grad_norm": 0.7121437475875894, |
|
"learning_rate": 5e-06, |
|
"loss": 0.519, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.7992116856016693, |
|
"grad_norm": 0.6767258868529336, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5155, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.817760259680037, |
|
"grad_norm": 0.6347972264963303, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5056, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.8363088337584048, |
|
"grad_norm": 0.5014537431107586, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5178, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.8548574078367726, |
|
"grad_norm": 0.5154954740926758, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5235, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.8734059819151403, |
|
"grad_norm": 0.7714296144853684, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5291, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.891954555993508, |
|
"grad_norm": 0.5251823522650411, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5136, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.9105031300718758, |
|
"grad_norm": 0.5661699899829179, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5174, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.9290517041502433, |
|
"grad_norm": 0.7742483564691807, |
|
"learning_rate": 5e-06, |
|
"loss": 0.524, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.9476002782286113, |
|
"grad_norm": 0.6651979093977842, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5117, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.9661488523069788, |
|
"grad_norm": 0.4966107646307623, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5101, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.9846974263853467, |
|
"grad_norm": 0.5477414666916758, |
|
"learning_rate": 5e-06, |
|
"loss": 0.519, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.9995362856480408, |
|
"eval_loss": 0.5572099685668945, |
|
"eval_runtime": 571.2353, |
|
"eval_samples_per_second": 25.429, |
|
"eval_steps_per_second": 0.397, |
|
"step": 1078 |
|
}, |
|
{ |
|
"epoch": 2.0032460004637143, |
|
"grad_norm": 0.9550000425676828, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5612, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.0217945745420822, |
|
"grad_norm": 0.7749538423166364, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4796, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.0403431486204497, |
|
"grad_norm": 0.6196120958317889, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4717, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.0588917226988177, |
|
"grad_norm": 0.7503113770111682, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4705, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.0774402967771852, |
|
"grad_norm": 0.8195010086999319, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4661, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.095988870855553, |
|
"grad_norm": 0.7634806874449389, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4674, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.1145374449339207, |
|
"grad_norm": 0.5282535148779405, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4645, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.1330860190122882, |
|
"grad_norm": 0.8193534272594567, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4676, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.151634593090656, |
|
"grad_norm": 0.6337110972085372, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4657, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.1701831671690237, |
|
"grad_norm": 0.5914383150324656, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4656, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.1887317412473917, |
|
"grad_norm": 0.5943506604962782, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4583, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.207280315325759, |
|
"grad_norm": 0.5899210327294049, |
|
"learning_rate": 5e-06, |
|
"loss": 0.469, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.225828889404127, |
|
"grad_norm": 0.611916980187844, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4715, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.2443774634824947, |
|
"grad_norm": 0.5355848691260124, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4584, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.2629260375608626, |
|
"grad_norm": 0.9219025948142408, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4772, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.28147461163923, |
|
"grad_norm": 0.6356945379257924, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4701, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.300023185717598, |
|
"grad_norm": 0.5513716882387442, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4619, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.3185717597959656, |
|
"grad_norm": 0.5749300964975611, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4734, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.3371203338743336, |
|
"grad_norm": 0.536668517924369, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4746, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.355668907952701, |
|
"grad_norm": 0.6278038505879705, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4647, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.374217482031069, |
|
"grad_norm": 0.7026457321857001, |
|
"learning_rate": 5e-06, |
|
"loss": 0.459, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.3927660561094366, |
|
"grad_norm": 0.4965239092186108, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4687, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.411314630187804, |
|
"grad_norm": 0.6026630222667435, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4636, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.429863204266172, |
|
"grad_norm": 0.5323286663903507, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4798, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.4484117783445396, |
|
"grad_norm": 0.5602695467114859, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4743, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.4669603524229076, |
|
"grad_norm": 0.5102675731281984, |
|
"learning_rate": 5e-06, |
|
"loss": 0.479, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.485508926501275, |
|
"grad_norm": 0.6117163824572253, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4667, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.504057500579643, |
|
"grad_norm": 0.5650168400314336, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4756, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.5226060746580106, |
|
"grad_norm": 0.5983202563411003, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4656, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.5411546487363785, |
|
"grad_norm": 0.5753074854215314, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4769, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.559703222814746, |
|
"grad_norm": 0.5728210847879419, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4715, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.578251796893114, |
|
"grad_norm": 0.6211225492010692, |
|
"learning_rate": 5e-06, |
|
"loss": 0.466, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.5968003709714815, |
|
"grad_norm": 0.6343921659926295, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4719, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.615348945049849, |
|
"grad_norm": 0.5369714001226162, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4657, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.633897519128217, |
|
"grad_norm": 0.6567365723948313, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4649, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.652446093206585, |
|
"grad_norm": 0.540386802693628, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4734, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.6709946672849525, |
|
"grad_norm": 0.5879368969729212, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4696, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.68954324136332, |
|
"grad_norm": 0.6738162707613969, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4626, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.708091815441688, |
|
"grad_norm": 0.536400620430425, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4683, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.7266403895200555, |
|
"grad_norm": 0.6155879189493075, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4724, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.7451889635984235, |
|
"grad_norm": 0.549109259484142, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4756, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.763737537676791, |
|
"grad_norm": 0.6863931463768982, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4765, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.782286111755159, |
|
"grad_norm": 0.7188901458634751, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4798, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.8008346858335265, |
|
"grad_norm": 0.5414397122063939, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4694, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.8193832599118944, |
|
"grad_norm": 0.5257691036404979, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4672, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.837931833990262, |
|
"grad_norm": 0.5415229151538448, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4704, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.85648040806863, |
|
"grad_norm": 0.5702613917862038, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4795, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.8750289821469974, |
|
"grad_norm": 0.6269031796704044, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4626, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.893577556225365, |
|
"grad_norm": 0.6589343281148584, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4767, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.912126130303733, |
|
"grad_norm": 0.7261065025479309, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4681, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.930674704382101, |
|
"grad_norm": 0.5636416585164459, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4732, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.9492232784604684, |
|
"grad_norm": 0.5677677401790827, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4732, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.967771852538836, |
|
"grad_norm": 0.6434371938896902, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4713, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.986320426617204, |
|
"grad_norm": 0.6181729556001256, |
|
"learning_rate": 5e-06, |
|
"loss": 0.469, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.9993044284720614, |
|
"eval_loss": 0.5541288256645203, |
|
"eval_runtime": 569.4978, |
|
"eval_samples_per_second": 25.507, |
|
"eval_steps_per_second": 0.399, |
|
"step": 1617 |
|
}, |
|
{ |
|
"epoch": 2.9993044284720614, |
|
"step": 1617, |
|
"total_flos": 2708325846220800.0, |
|
"train_loss": 0.542901256162588, |
|
"train_runtime": 94751.0453, |
|
"train_samples_per_second": 8.738, |
|
"train_steps_per_second": 0.017 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1617, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2708325846220800.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|