{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9984552576409578,
  "eval_steps": 500,
  "global_step": 1698,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017654198389054396,
      "grad_norm": 11.075085595262578,
      "learning_rate": 3.92156862745098e-06,
      "loss": 1.3624,
      "step": 10
    },
    {
      "epoch": 0.03530839677810879,
      "grad_norm": 0.8268832619967722,
      "learning_rate": 7.84313725490196e-06,
      "loss": 1.0988,
      "step": 20
    },
    {
      "epoch": 0.052962595167163194,
      "grad_norm": 0.6592198895247919,
      "learning_rate": 1.1764705882352942e-05,
      "loss": 1.0279,
      "step": 30
    },
    {
      "epoch": 0.07061679355621758,
      "grad_norm": 0.7345400249936768,
      "learning_rate": 1.568627450980392e-05,
      "loss": 1.0346,
      "step": 40
    },
    {
      "epoch": 0.08827099194527198,
      "grad_norm": 0.6520880282674657,
      "learning_rate": 1.9607843137254903e-05,
      "loss": 1.0161,
      "step": 50
    },
    {
      "epoch": 0.10592519033432639,
      "grad_norm": 0.6267314186354508,
      "learning_rate": 1.999852647705027e-05,
      "loss": 0.9827,
      "step": 60
    },
    {
      "epoch": 0.12357938872338078,
      "grad_norm": 0.5990775699580297,
      "learning_rate": 1.9993433374984987e-05,
      "loss": 0.9727,
      "step": 70
    },
    {
      "epoch": 0.14123358711243517,
      "grad_norm": 0.7072632521466412,
      "learning_rate": 1.9984704354748582e-05,
      "loss": 0.9887,
      "step": 80
    },
    {
      "epoch": 0.15888778550148958,
      "grad_norm": 0.6302011892410732,
      "learning_rate": 1.9972342592226873e-05,
      "loss": 0.9639,
      "step": 90
    },
    {
      "epoch": 0.17654198389054396,
      "grad_norm": 0.6228020691113849,
      "learning_rate": 1.9956352585008946e-05,
      "loss": 0.9584,
      "step": 100
    },
    {
      "epoch": 0.19419618227959837,
      "grad_norm": 0.6537080367025939,
      "learning_rate": 1.9936740150750825e-05,
      "loss": 0.9517,
      "step": 110
    },
    {
      "epoch": 0.21185038066865278,
      "grad_norm": 0.7111863399924561,
      "learning_rate": 1.9913512425058803e-05,
      "loss": 0.9408,
      "step": 120
    },
    {
      "epoch": 0.22950457905770716,
      "grad_norm": 0.6710168558236342,
      "learning_rate": 1.9886677858893303e-05,
      "loss": 0.9374,
      "step": 130
    },
    {
      "epoch": 0.24715877744676157,
      "grad_norm": 0.6547246997110331,
      "learning_rate": 1.9856246215494147e-05,
      "loss": 0.9353,
      "step": 140
    },
    {
      "epoch": 0.264812975835816,
      "grad_norm": 0.5595831632061861,
      "learning_rate": 1.982222856682841e-05,
      "loss": 0.9508,
      "step": 150
    },
    {
      "epoch": 0.28246717422487033,
      "grad_norm": 0.6617569180053982,
      "learning_rate": 1.9784637289562067e-05,
      "loss": 0.9385,
      "step": 160
    },
    {
      "epoch": 0.30012137261392474,
      "grad_norm": 0.7260939928287287,
      "learning_rate": 1.9743486060557015e-05,
      "loss": 0.9226,
      "step": 170
    },
    {
      "epoch": 0.31777557100297915,
      "grad_norm": 0.6291820154599896,
      "learning_rate": 1.9698789851894986e-05,
      "loss": 0.9262,
      "step": 180
    },
    {
      "epoch": 0.33542976939203356,
      "grad_norm": 0.5974647680183809,
      "learning_rate": 1.9650564925430257e-05,
      "loss": 0.9207,
      "step": 190
    },
    {
      "epoch": 0.3530839677810879,
      "grad_norm": 0.5295497383864004,
      "learning_rate": 1.9598828826873085e-05,
      "loss": 0.9259,
      "step": 200
    },
    {
      "epoch": 0.3707381661701423,
      "grad_norm": 0.5999369621481067,
      "learning_rate": 1.9543600379406027e-05,
      "loss": 0.931,
      "step": 210
    },
    {
      "epoch": 0.38839236455919673,
      "grad_norm": 0.7118598829666988,
      "learning_rate": 1.9484899676835504e-05,
      "loss": 0.9132,
      "step": 220
    },
    {
      "epoch": 0.40604656294825114,
      "grad_norm": 0.5244866233662618,
      "learning_rate": 1.9422748076281054e-05,
      "loss": 0.9165,
      "step": 230
    },
    {
      "epoch": 0.42370076133730555,
      "grad_norm": 0.5981442443509013,
      "learning_rate": 1.9357168190404937e-05,
      "loss": 0.9145,
      "step": 240
    },
    {
      "epoch": 0.4413549597263599,
      "grad_norm": 0.5814732591931184,
      "learning_rate": 1.9288183879184986e-05,
      "loss": 0.9108,
      "step": 250
    },
    {
      "epoch": 0.4590091581154143,
      "grad_norm": 0.6328453186407658,
      "learning_rate": 1.9215820241233585e-05,
      "loss": 0.9256,
      "step": 260
    },
    {
      "epoch": 0.4766633565044687,
      "grad_norm": 0.6253638836302401,
      "learning_rate": 1.9140103604666035e-05,
      "loss": 0.903,
      "step": 270
    },
    {
      "epoch": 0.49431755489352314,
      "grad_norm": 0.6443153938386799,
      "learning_rate": 1.9061061517521575e-05,
      "loss": 0.9085,
      "step": 280
    },
    {
      "epoch": 0.5119717532825775,
      "grad_norm": 0.5794297240032266,
      "learning_rate": 1.897872273774056e-05,
      "loss": 0.9142,
      "step": 290
    },
    {
      "epoch": 0.529625951671632,
      "grad_norm": 0.5506943177514773,
      "learning_rate": 1.8893117222701435e-05,
      "loss": 0.8941,
      "step": 300
    },
    {
      "epoch": 0.5472801500606863,
      "grad_norm": 0.7158873616368372,
      "learning_rate": 1.8804276118321328e-05,
      "loss": 0.912,
      "step": 310
    },
    {
      "epoch": 0.5649343484497407,
      "grad_norm": 0.6761743351092112,
      "learning_rate": 1.8712231747724194e-05,
      "loss": 0.909,
      "step": 320
    },
    {
      "epoch": 0.5825885468387951,
      "grad_norm": 0.5834711748627983,
      "learning_rate": 1.861701759948068e-05,
      "loss": 0.8963,
      "step": 330
    },
    {
      "epoch": 0.6002427452278495,
      "grad_norm": 0.5603155829035662,
      "learning_rate": 1.8518668315423962e-05,
      "loss": 0.8995,
      "step": 340
    },
    {
      "epoch": 0.617896943616904,
      "grad_norm": 0.522797804575778,
      "learning_rate": 1.8417219678045953e-05,
      "loss": 0.8962,
      "step": 350
    },
    {
      "epoch": 0.6355511420059583,
      "grad_norm": 0.6835351293213303,
      "learning_rate": 1.831270859747857e-05,
      "loss": 0.8916,
      "step": 360
    },
    {
      "epoch": 0.6532053403950127,
      "grad_norm": 0.5938476091687838,
      "learning_rate": 1.8205173098064656e-05,
      "loss": 0.8726,
      "step": 370
    },
    {
      "epoch": 0.6708595387840671,
      "grad_norm": 0.634802092406171,
      "learning_rate": 1.8094652304523584e-05,
      "loss": 0.8841,
      "step": 380
    },
    {
      "epoch": 0.6885137371731215,
      "grad_norm": 0.6742253195098137,
      "learning_rate": 1.7981186427716478e-05,
      "loss": 0.8817,
      "step": 390
    },
    {
      "epoch": 0.7061679355621758,
      "grad_norm": 0.5772838179810406,
      "learning_rate": 1.7864816750016246e-05,
      "loss": 0.8803,
      "step": 400
    },
    {
      "epoch": 0.7238221339512303,
      "grad_norm": 0.6030744898202103,
      "learning_rate": 1.7745585610287812e-05,
      "loss": 0.876,
      "step": 410
    },
    {
      "epoch": 0.7414763323402846,
      "grad_norm": 0.5955540196255147,
      "learning_rate": 1.7623536388483902e-05,
      "loss": 0.879,
      "step": 420
    },
    {
      "epoch": 0.7591305307293391,
      "grad_norm": 0.6933702635166431,
      "learning_rate": 1.7498713489862133e-05,
      "loss": 0.8917,
      "step": 430
    },
    {
      "epoch": 0.7767847291183935,
      "grad_norm": 0.5817510021566383,
      "learning_rate": 1.737116232882895e-05,
      "loss": 0.8731,
      "step": 440
    },
    {
      "epoch": 0.7944389275074478,
      "grad_norm": 0.6017117570992859,
      "learning_rate": 1.7240929312416545e-05,
      "loss": 0.8758,
      "step": 450
    },
    {
      "epoch": 0.8120931258965023,
      "grad_norm": 0.52374499191387,
      "learning_rate": 1.710806182339848e-05,
      "loss": 0.8728,
      "step": 460
    },
    {
      "epoch": 0.8297473242855566,
      "grad_norm": 0.5995068247811851,
      "learning_rate": 1.697260820305044e-05,
      "loss": 0.8867,
      "step": 470
    },
    {
      "epoch": 0.8474015226746111,
      "grad_norm": 0.5610337196661641,
      "learning_rate": 1.683461773356213e-05,
      "loss": 0.865,
      "step": 480
    },
    {
      "epoch": 0.8650557210636655,
      "grad_norm": 0.5740324652987965,
      "learning_rate": 1.669414062010696e-05,
      "loss": 0.8795,
      "step": 490
    },
    {
      "epoch": 0.8827099194527198,
      "grad_norm": 0.6580353279100991,
      "learning_rate": 1.6551227972575823e-05,
      "loss": 0.8642,
      "step": 500
    },
    {
      "epoch": 0.9003641178417743,
      "grad_norm": 0.5609839098459638,
      "learning_rate": 1.6405931786981753e-05,
      "loss": 0.8666,
      "step": 510
    },
    {
      "epoch": 0.9180183162308286,
      "grad_norm": 0.606711772111839,
      "learning_rate": 1.6258304926542183e-05,
      "loss": 0.85,
      "step": 520
    },
    {
      "epoch": 0.935672514619883,
      "grad_norm": 0.5282629393908134,
      "learning_rate": 1.610840110244568e-05,
      "loss": 0.8703,
      "step": 530
    },
    {
      "epoch": 0.9533267130089375,
      "grad_norm": 0.5716697646332912,
      "learning_rate": 1.5956274854310157e-05,
      "loss": 0.8878,
      "step": 540
    },
    {
      "epoch": 0.9709809113979918,
      "grad_norm": 0.6293891083073122,
      "learning_rate": 1.5801981530339695e-05,
      "loss": 0.8505,
      "step": 550
    },
    {
      "epoch": 0.9886351097870463,
      "grad_norm": 0.5032178564593078,
      "learning_rate": 1.5645577267187163e-05,
      "loss": 0.8625,
      "step": 560
    },
    {
      "epoch": 0.9992276288204789,
      "eval_loss": 0.7792695760726929,
      "eval_runtime": 577.071,
      "eval_samples_per_second": 9.642,
      "eval_steps_per_second": 2.41,
      "step": 566
    },
    {
      "epoch": 1.0070616793556217,
      "grad_norm": 0.565136350859829,
      "learning_rate": 1.5487118969529973e-05,
      "loss": 0.7727,
      "step": 570
    },
    {
      "epoch": 1.024715877744676,
      "grad_norm": 0.4954225833969141,
      "learning_rate": 1.5326664289366406e-05,
      "loss": 0.7606,
      "step": 580
    },
    {
      "epoch": 1.0423700761337304,
      "grad_norm": 0.523592237844004,
      "learning_rate": 1.516427160504006e-05,
      "loss": 0.7597,
      "step": 590
    },
    {
      "epoch": 1.060024274522785,
      "grad_norm": 0.5408055640730495,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.7536,
      "step": 600
    },
    {
      "epoch": 1.0776784729118394,
      "grad_norm": 0.5246834404536742,
      "learning_rate": 1.4833909241304391e-05,
      "loss": 0.7562,
      "step": 610
    },
    {
      "epoch": 1.0953326713008937,
      "grad_norm": 0.5643207747274573,
      "learning_rate": 1.4666059757875397e-05,
      "loss": 0.7612,
      "step": 620
    },
    {
      "epoch": 1.112986869689948,
      "grad_norm": 0.517010952574902,
      "learning_rate": 1.4496512618513289e-05,
      "loss": 0.7575,
      "step": 630
    },
    {
      "epoch": 1.1306410680790024,
      "grad_norm": 0.5261650605787747,
      "learning_rate": 1.4325329509677743e-05,
      "loss": 0.7473,
      "step": 640
    },
    {
      "epoch": 1.148295266468057,
      "grad_norm": 0.5342217209976181,
      "learning_rate": 1.4152572713044397e-05,
      "loss": 0.7645,
      "step": 650
    },
    {
      "epoch": 1.1659494648571114,
      "grad_norm": 0.5394360986026229,
      "learning_rate": 1.3978305082844876e-05,
      "loss": 0.7456,
      "step": 660
    },
    {
      "epoch": 1.1836036632461657,
      "grad_norm": 0.5251341793704711,
      "learning_rate": 1.3802590022998483e-05,
      "loss": 0.7564,
      "step": 670
    },
    {
      "epoch": 1.20125786163522,
      "grad_norm": 0.5365883139849793,
      "learning_rate": 1.3625491464043909e-05,
      "loss": 0.7662,
      "step": 680
    },
    {
      "epoch": 1.2189120600242744,
      "grad_norm": 0.5851452149462395,
      "learning_rate": 1.3447073839879339e-05,
      "loss": 0.7467,
      "step": 690
    },
    {
      "epoch": 1.236566258413329,
      "grad_norm": 0.49698239669222605,
      "learning_rate": 1.3267402064319415e-05,
      "loss": 0.7468,
      "step": 700
    },
    {
      "epoch": 1.2542204568023834,
      "grad_norm": 0.5490853297859661,
      "learning_rate": 1.3086541507477598e-05,
      "loss": 0.7496,
      "step": 710
    },
    {
      "epoch": 1.2718746551914377,
      "grad_norm": 0.49838225273553083,
      "learning_rate": 1.2904557971982514e-05,
      "loss": 0.7465,
      "step": 720
    },
    {
      "epoch": 1.289528853580492,
      "grad_norm": 0.5406712310299336,
      "learning_rate": 1.2721517669036929e-05,
      "loss": 0.7473,
      "step": 730
    },
    {
      "epoch": 1.3071830519695464,
      "grad_norm": 0.5048219957782835,
      "learning_rate": 1.253748719432809e-05,
      "loss": 0.734,
      "step": 740
    },
    {
      "epoch": 1.324837250358601,
      "grad_norm": 0.47951222482587946,
      "learning_rate": 1.2352533503798156e-05,
      "loss": 0.7431,
      "step": 750
    },
    {
      "epoch": 1.3424914487476554,
      "grad_norm": 0.5851936786168687,
      "learning_rate": 1.2166723889283574e-05,
      "loss": 0.7472,
      "step": 760
    },
    {
      "epoch": 1.3601456471367097,
      "grad_norm": 0.485279585177607,
      "learning_rate": 1.1980125954032239e-05,
      "loss": 0.7366,
      "step": 770
    },
    {
      "epoch": 1.377799845525764,
      "grad_norm": 0.5453687928417636,
      "learning_rate": 1.1792807588107358e-05,
      "loss": 0.746,
      "step": 780
    },
    {
      "epoch": 1.3954540439148184,
      "grad_norm": 0.5866620790608368,
      "learning_rate": 1.1604836943686957e-05,
      "loss": 0.7487,
      "step": 790
    },
    {
      "epoch": 1.4131082423038728,
      "grad_norm": 0.5056556922549863,
      "learning_rate": 1.141628241026802e-05,
      "loss": 0.7384,
      "step": 800
    },
    {
      "epoch": 1.4307624406929274,
      "grad_norm": 0.5141576590110138,
      "learning_rate": 1.1227212589784297e-05,
      "loss": 0.7398,
      "step": 810
    },
    {
      "epoch": 1.4484166390819817,
      "grad_norm": 0.48402830116887263,
      "learning_rate": 1.1037696271646805e-05,
      "loss": 0.7369,
      "step": 820
    },
    {
      "epoch": 1.466070837471036,
      "grad_norm": 0.5165402289756648,
      "learning_rate": 1.0847802407716128e-05,
      "loss": 0.7482,
      "step": 830
    },
    {
      "epoch": 1.4837250358600904,
      "grad_norm": 0.4874182996900902,
      "learning_rate": 1.0657600087215618e-05,
      "loss": 0.7375,
      "step": 840
    },
    {
      "epoch": 1.501379234249145,
      "grad_norm": 0.49738455424585243,
      "learning_rate": 1.0467158511594595e-05,
      "loss": 0.7397,
      "step": 850
    },
    {
      "epoch": 1.5190334326381993,
      "grad_norm": 0.49991861283295536,
      "learning_rate": 1.0276546969350757e-05,
      "loss": 0.7278,
      "step": 860
    },
    {
      "epoch": 1.5366876310272537,
      "grad_norm": 0.5667368371430999,
      "learning_rate": 1.0085834810820871e-05,
      "loss": 0.7301,
      "step": 870
    },
    {
      "epoch": 1.554341829416308,
      "grad_norm": 0.5144068476500945,
      "learning_rate": 9.89509142294901e-06,
      "loss": 0.7378,
      "step": 880
    },
    {
      "epoch": 1.5719960278053624,
      "grad_norm": 0.5137627935948821,
      "learning_rate": 9.704386204041438e-06,
      "loss": 0.7517,
      "step": 890
    },
    {
      "epoch": 1.5896502261944168,
      "grad_norm": 0.52904044022737,
      "learning_rate": 9.513788538517375e-06,
      "loss": 0.7407,
      "step": 900
    },
    {
      "epoch": 1.6073044245834711,
      "grad_norm": 0.5115595110168406,
      "learning_rate": 9.323367771664819e-06,
      "loss": 0.7338,
      "step": 910
    },
    {
      "epoch": 1.6249586229725257,
      "grad_norm": 0.48147125209335884,
      "learning_rate": 9.133193184410589e-06,
      "loss": 0.7362,
      "step": 920
    },
    {
      "epoch": 1.64261282136158,
      "grad_norm": 0.49180548035958394,
      "learning_rate": 8.943333968113808e-06,
      "loss": 0.7398,
      "step": 930
    },
    {
      "epoch": 1.6602670197506344,
      "grad_norm": 0.4867013851446762,
      "learning_rate": 8.753859199391951e-06,
      "loss": 0.7483,
      "step": 940
    },
    {
      "epoch": 1.677921218139689,
      "grad_norm": 0.4804051538514883,
      "learning_rate": 8.564837814988638e-06,
      "loss": 0.7307,
      "step": 950
    },
    {
      "epoch": 1.6955754165287433,
      "grad_norm": 0.4863063091894713,
      "learning_rate": 8.376338586692367e-06,
      "loss": 0.7311,
      "step": 960
    },
    {
      "epoch": 1.7132296149177977,
      "grad_norm": 0.5169386880097873,
      "learning_rate": 8.188430096315168e-06,
      "loss": 0.7327,
      "step": 970
    },
    {
      "epoch": 1.730883813306852,
      "grad_norm": 0.5153036409881078,
      "learning_rate": 8.00118071074049e-06,
      "loss": 0.7328,
      "step": 980
    },
    {
      "epoch": 1.7485380116959064,
      "grad_norm": 0.5006840768446958,
      "learning_rate": 7.814658557049175e-06,
      "loss": 0.746,
      "step": 990
    },
    {
      "epoch": 1.7661922100849607,
      "grad_norm": 0.5193653515770323,
      "learning_rate": 7.62893149773278e-06,
      "loss": 0.738,
      "step": 1000
    },
    {
      "epoch": 1.783846408474015,
      "grad_norm": 0.5162239370053442,
      "learning_rate": 7.4440671060030725e-06,
      "loss": 0.73,
      "step": 1010
    },
    {
      "epoch": 1.8015006068630697,
      "grad_norm": 0.48852121302241314,
      "learning_rate": 7.260132641206861e-06,
      "loss": 0.7394,
      "step": 1020
    },
    {
      "epoch": 1.819154805252124,
      "grad_norm": 0.5463945098209841,
      "learning_rate": 7.077195024354939e-06,
      "loss": 0.722,
      "step": 1030
    },
    {
      "epoch": 1.8368090036411784,
      "grad_norm": 0.49951753268549337,
      "learning_rate": 6.895320813774206e-06,
      "loss": 0.7133,
      "step": 1040
    },
    {
      "epoch": 1.854463202030233,
      "grad_norm": 0.5119500277521447,
      "learning_rate": 6.714576180891653e-06,
      "loss": 0.7184,
      "step": 1050
    },
    {
      "epoch": 1.8721174004192873,
      "grad_norm": 0.6424312472729824,
      "learning_rate": 6.535026886159221e-06,
      "loss": 0.7186,
      "step": 1060
    },
    {
      "epoch": 1.8897715988083417,
      "grad_norm": 0.5373842851342149,
      "learning_rate": 6.356738255128068e-06,
      "loss": 0.7211,
      "step": 1070
    },
    {
      "epoch": 1.907425797197396,
      "grad_norm": 0.4982902687166508,
      "learning_rate": 6.179775154681184e-06,
      "loss": 0.7161,
      "step": 1080
    },
    {
      "epoch": 1.9250799955864504,
      "grad_norm": 0.5414062516129651,
      "learning_rate": 6.004201969432771e-06,
      "loss": 0.7303,
      "step": 1090
    },
    {
      "epoch": 1.9427341939755047,
      "grad_norm": 0.462895367066655,
      "learning_rate": 5.830082578303193e-06,
      "loss": 0.7249,
      "step": 1100
    },
    {
      "epoch": 1.960388392364559,
      "grad_norm": 0.49592685426373223,
      "learning_rate": 5.6574803312778196e-06,
      "loss": 0.7253,
      "step": 1110
    },
    {
      "epoch": 1.9780425907536134,
      "grad_norm": 0.4758478775588506,
      "learning_rate": 5.486458026358381e-06,
      "loss": 0.7126,
      "step": 1120
    },
    {
      "epoch": 1.995696789142668,
      "grad_norm": 0.4969466321267014,
      "learning_rate": 5.317077886715105e-06,
      "loss": 0.724,
      "step": 1130
    },
    {
      "epoch": 1.9992276288204789,
      "eval_loss": 0.6677735447883606,
      "eval_runtime": 527.1683,
      "eval_samples_per_second": 10.555,
      "eval_steps_per_second": 2.639,
      "step": 1132
    },
    {
      "epoch": 2.0133509875317226,
      "grad_norm": 0.5185737619412808,
      "learning_rate": 5.14940153804804e-06,
      "loss": 0.6582,
      "step": 1140
    },
    {
      "epoch": 2.031005185920777,
      "grad_norm": 0.447637858983293,
      "learning_rate": 4.983489986165708e-06,
      "loss": 0.6365,
      "step": 1150
    },
    {
      "epoch": 2.0486593843098313,
      "grad_norm": 0.5152236160209434,
      "learning_rate": 4.819403594789335e-06,
      "loss": 0.6351,
      "step": 1160
    },
    {
      "epoch": 2.0663135826988857,
      "grad_norm": 0.6273558161173919,
      "learning_rate": 4.6572020635906535e-06,
      "loss": 0.6414,
      "step": 1170
    },
    {
      "epoch": 2.08396778108794,
      "grad_norm": 0.4926770781993292,
      "learning_rate": 4.4969444064713506e-06,
      "loss": 0.6335,
      "step": 1180
    },
    {
      "epoch": 2.1016219794769944,
      "grad_norm": 0.48793300929617467,
      "learning_rate": 4.338688930091982e-06,
      "loss": 0.6397,
      "step": 1190
    },
    {
      "epoch": 2.1192761778660487,
      "grad_norm": 0.48663482440046746,
      "learning_rate": 4.182493212658224e-06,
      "loss": 0.652,
      "step": 1200
    },
    {
      "epoch": 2.136930376255103,
      "grad_norm": 0.4760935997684315,
      "learning_rate": 4.028414082972141e-06,
      "loss": 0.6416,
      "step": 1210
    },
    {
      "epoch": 2.1545845746441574,
      "grad_norm": 0.4582415888721039,
      "learning_rate": 3.876507599756136e-06,
      "loss": 0.6295,
      "step": 1220
    },
    {
      "epoch": 2.172238773033212,
      "grad_norm": 0.4464518146441672,
      "learning_rate": 3.7268290312570622e-06,
      "loss": 0.6518,
      "step": 1230
    },
    {
      "epoch": 2.1898929714222666,
      "grad_norm": 0.4796659798369155,
      "learning_rate": 3.579432835137928e-06,
      "loss": 0.6376,
      "step": 1240
    },
    {
      "epoch": 2.207547169811321,
      "grad_norm": 0.4613709158191507,
      "learning_rate": 3.434372638664526e-06,
      "loss": 0.6418,
      "step": 1250
    },
    {
      "epoch": 2.2252013682003753,
      "grad_norm": 0.4620560070335567,
      "learning_rate": 3.2917012191941955e-06,
      "loss": 0.6331,
      "step": 1260
    },
    {
      "epoch": 2.2428555665894296,
      "grad_norm": 0.4403307799157334,
      "learning_rate": 3.151470484973792e-06,
      "loss": 0.6448,
      "step": 1270
    },
    {
      "epoch": 2.260509764978484,
      "grad_norm": 0.46979962256612806,
      "learning_rate": 3.0137314562538742e-06,
      "loss": 0.6333,
      "step": 1280
    },
    {
      "epoch": 2.2781639633675383,
      "grad_norm": 0.47605403770220545,
      "learning_rate": 2.8785342467259568e-06,
      "loss": 0.6372,
      "step": 1290
    },
    {
      "epoch": 2.2958181617565927,
      "grad_norm": 0.44577983389984577,
      "learning_rate": 2.745928045289631e-06,
      "loss": 0.6464,
      "step": 1300
    },
    {
      "epoch": 2.313472360145647,
      "grad_norm": 0.44706989287780297,
      "learning_rate": 2.6159610981561134e-06,
      "loss": 0.626,
      "step": 1310
    },
    {
      "epoch": 2.3311265585347014,
      "grad_norm": 0.44473280016435734,
      "learning_rate": 2.4886806912948034e-06,
      "loss": 0.6362,
      "step": 1320
    },
    {
      "epoch": 2.3487807569237558,
      "grad_norm": 0.4674495930967707,
      "learning_rate": 2.3641331332291793e-06,
      "loss": 0.6453,
      "step": 1330
    },
    {
      "epoch": 2.36643495531281,
      "grad_norm": 0.4455415304277737,
      "learning_rate": 2.2423637381883533e-06,
      "loss": 0.6289,
      "step": 1340
    },
    {
      "epoch": 2.384089153701865,
      "grad_norm": 0.4544071638805097,
      "learning_rate": 2.123416809620351e-06,
      "loss": 0.6331,
      "step": 1350
    },
    {
      "epoch": 2.4017433520909193,
      "grad_norm": 0.4723689800288618,
      "learning_rate": 2.007335624073157e-06,
      "loss": 0.645,
      "step": 1360
    },
    {
      "epoch": 2.4193975504799736,
      "grad_norm": 0.45220258933192226,
      "learning_rate": 1.8941624154493731e-06,
      "loss": 0.6451,
      "step": 1370
    },
    {
      "epoch": 2.437051748869028,
      "grad_norm": 0.4387363718825142,
      "learning_rate": 1.7839383596402382e-06,
      "loss": 0.6339,
      "step": 1380
    },
    {
      "epoch": 2.4547059472580823,
      "grad_norm": 0.52315736570108,
      "learning_rate": 1.6767035595445614e-06,
      "loss": 0.6289,
      "step": 1390
    },
    {
      "epoch": 2.4723601456471367,
      "grad_norm": 0.44143122222775955,
      "learning_rate": 1.5724970304780662e-06,
      "loss": 0.6307,
      "step": 1400
    },
    {
      "epoch": 2.490014344036191,
      "grad_norm": 0.49564787712258485,
      "learning_rate": 1.4713566859784045e-06,
      "loss": 0.6426,
      "step": 1410
    },
    {
      "epoch": 2.5076685424252454,
      "grad_norm": 0.44402047508915227,
      "learning_rate": 1.373319324011061e-06,
      "loss": 0.6368,
      "step": 1420
    },
    {
      "epoch": 2.5253227408142997,
      "grad_norm": 0.44130611579940193,
      "learning_rate": 1.2784206135811184e-06,
      "loss": 0.6398,
      "step": 1430
    },
    {
      "epoch": 2.5429769392033545,
      "grad_norm": 0.4764351411669087,
      "learning_rate": 1.1866950817557743e-06,
      "loss": 0.6349,
      "step": 1440
    },
    {
      "epoch": 2.5606311375924085,
      "grad_norm": 0.4398014393212803,
      "learning_rate": 1.0981761011023317e-06,
      "loss": 0.6218,
      "step": 1450
    },
    {
      "epoch": 2.5782853359814633,
      "grad_norm": 0.4470462981839341,
      "learning_rate": 1.0128958775462393e-06,
      "loss": 0.6241,
      "step": 1460
    },
    {
      "epoch": 2.5959395343705176,
      "grad_norm": 0.43673398026523563,
      "learning_rate": 9.308854386535849e-07,
      "loss": 0.6408,
      "step": 1470
    },
    {
      "epoch": 2.613593732759572,
      "grad_norm": 0.4505573168779551,
      "learning_rate": 8.521746223423088e-07,
      "loss": 0.6283,
      "step": 1480
    },
    {
      "epoch": 2.6312479311486263,
      "grad_norm": 0.4440648392714403,
      "learning_rate": 7.767920660262529e-07,
      "loss": 0.6285,
      "step": 1490
    },
    {
      "epoch": 2.6489021295376807,
      "grad_norm": 0.4414194215856468,
      "learning_rate": 7.047651961959978e-07,
      "loss": 0.6244,
      "step": 1500
    },
    {
      "epoch": 2.666556327926735,
      "grad_norm": 0.4440446143190118,
      "learning_rate": 6.361202184402515e-07,
      "loss": 0.6332,
      "step": 1510
    },
    {
      "epoch": 2.6842105263157894,
      "grad_norm": 0.4706364746939079,
      "learning_rate": 5.708821079114612e-07,
      "loss": 0.6362,
      "step": 1520
    },
    {
      "epoch": 2.7018647247048437,
      "grad_norm": 0.43340031061371953,
      "learning_rate": 5.090746002390734e-07,
      "loss": 0.6349,
      "step": 1530
    },
    {
      "epoch": 2.719518923093898,
      "grad_norm": 0.4246335933712338,
      "learning_rate": 4.507201828937935e-07,
      "loss": 0.6383,
      "step": 1540
    },
    {
      "epoch": 2.737173121482953,
      "grad_norm": 0.44029888263426625,
      "learning_rate": 3.958400870059476e-07,
      "loss": 0.6353,
      "step": 1550
    },
    {
      "epoch": 2.754827319872007,
      "grad_norm": 0.4317011108249921,
      "learning_rate": 3.444542796409478e-07,
      "loss": 0.632,
      "step": 1560
    },
    {
      "epoch": 2.7724815182610616,
      "grad_norm": 0.43100429391620887,
      "learning_rate": 2.965814565346548e-07,
      "loss": 0.6382,
      "step": 1570
    },
    {
      "epoch": 2.790135716650116,
      "grad_norm": 0.4769391216396139,
      "learning_rate": 2.522390352912985e-07,
      "loss": 0.6281,
      "step": 1580
    },
    {
      "epoch": 2.8077899150391703,
      "grad_norm": 0.44400518164711406,
      "learning_rate": 2.1144314904642194e-07,
      "loss": 0.6312,
      "step": 1590
    },
    {
      "epoch": 2.8254441134282247,
      "grad_norm": 0.4518349510084993,
      "learning_rate": 1.7420864059714215e-07,
      "loss": 0.6319,
      "step": 1600
    },
    {
      "epoch": 2.843098311817279,
      "grad_norm": 0.4353149820484635,
      "learning_rate": 1.405490570018908e-07,
      "loss": 0.6352,
      "step": 1610
    },
    {
      "epoch": 2.8607525102063334,
      "grad_norm": 0.4249015575790441,
      "learning_rate": 1.1047664465157592e-07,
      "loss": 0.6471,
      "step": 1620
    },
    {
      "epoch": 2.8784067085953877,
      "grad_norm": 0.42688299650964856,
      "learning_rate": 8.400234481397041e-08,
      "loss": 0.6232,
      "step": 1630
    },
    {
      "epoch": 2.896060906984442,
      "grad_norm": 0.4550228428201464,
      "learning_rate": 6.113578965293854e-08,
      "loss": 0.6426,
      "step": 1640
    },
    {
      "epoch": 2.9137151053734964,
      "grad_norm": 0.4424051249216907,
      "learning_rate": 4.188529872396374e-08,
      "loss": 0.6162,
      "step": 1650
    },
    {
      "epoch": 2.9313693037625512,
      "grad_norm": 0.43719698480253905,
      "learning_rate": 2.625787594723428e-08,
      "loss": 0.6353,
      "step": 1660
    },
    {
      "epoch": 2.9490235021516056,
      "grad_norm": 0.4552786221630045,
      "learning_rate": 1.4259207059403868e-08,
      "loss": 0.6355,
      "step": 1670
    },
    {
      "epoch": 2.96667770054066,
      "grad_norm": 0.4300462034554104,
      "learning_rate": 5.8936575449475284e-09,
      "loss": 0.6366,
      "step": 1680
    },
    {
      "epoch": 2.9843318989297143,
      "grad_norm": 0.4321948802854605,
      "learning_rate": 1.1642710478598772e-09,
      "loss": 0.6383,
      "step": 1690
    },
    {
      "epoch": 2.9984552576409578,
      "step": 1698,
      "total_flos": 2844841381724160.0,
      "train_loss": 0.0,
      "train_runtime": 8.0551,
      "train_samples_per_second": 27002.356,
      "train_steps_per_second": 210.799
    }
  ],
  "logging_steps": 10,
  "max_steps": 1698,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2844841381724160.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}