{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1497,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02004008016032064,
      "grad_norm": 11.386945789449237,
      "learning_rate": 6.666666666666667e-07,
      "loss": 0.914,
      "step": 10
    },
    {
      "epoch": 0.04008016032064128,
      "grad_norm": 3.686249022148282,
      "learning_rate": 1.3333333333333334e-06,
      "loss": 0.7761,
      "step": 20
    },
    {
      "epoch": 0.06012024048096192,
      "grad_norm": 1.797608399405574,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.6765,
      "step": 30
    },
    {
      "epoch": 0.08016032064128256,
      "grad_norm": 2.53598544592724,
      "learning_rate": 2.666666666666667e-06,
      "loss": 0.6319,
      "step": 40
    },
    {
      "epoch": 0.10020040080160321,
      "grad_norm": 1.8597121806159485,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.6028,
      "step": 50
    },
    {
      "epoch": 0.12024048096192384,
      "grad_norm": 1.8378706472462616,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.5846,
      "step": 60
    },
    {
      "epoch": 0.1402805611222445,
      "grad_norm": 4.6666512736387356,
      "learning_rate": 4.666666666666667e-06,
      "loss": 0.5761,
      "step": 70
    },
    {
      "epoch": 0.16032064128256512,
      "grad_norm": 2.67064642378821,
      "learning_rate": 4.999862725880242e-06,
      "loss": 0.5758,
      "step": 80
    },
    {
      "epoch": 0.18036072144288579,
      "grad_norm": 2.6004134353948736,
      "learning_rate": 4.998764633422446e-06,
      "loss": 0.5667,
      "step": 90
    },
    {
      "epoch": 0.20040080160320642,
      "grad_norm": 2.0205968573263413,
      "learning_rate": 4.996568984453794e-06,
      "loss": 0.5651,
      "step": 100
    },
    {
      "epoch": 0.22044088176352705,
      "grad_norm": 1.7372715757375135,
      "learning_rate": 4.993276850606589e-06,
      "loss": 0.5593,
      "step": 110
    },
    {
      "epoch": 0.24048096192384769,
      "grad_norm": 2.3534996172184846,
      "learning_rate": 4.988889838675462e-06,
      "loss": 0.5566,
      "step": 120
    },
    {
      "epoch": 0.2605210420841683,
      "grad_norm": 2.1712051144800846,
      "learning_rate": 4.9834100898331425e-06,
      "loss": 0.554,
      "step": 130
    },
    {
      "epoch": 0.280561122244489,
      "grad_norm": 2.988740945183358,
      "learning_rate": 4.976840278585413e-06,
      "loss": 0.5412,
      "step": 140
    },
    {
      "epoch": 0.30060120240480964,
      "grad_norm": 1.7513590918597028,
      "learning_rate": 4.969183611465766e-06,
      "loss": 0.5494,
      "step": 150
    },
    {
      "epoch": 0.32064128256513025,
      "grad_norm": 1.911642443149266,
      "learning_rate": 4.9604438254703845e-06,
      "loss": 0.5342,
      "step": 160
    },
    {
      "epoch": 0.3406813627254509,
      "grad_norm": 1.7028624872037597,
      "learning_rate": 4.95062518623422e-06,
      "loss": 0.5427,
      "step": 170
    },
    {
      "epoch": 0.36072144288577157,
      "grad_norm": 1.89229572623421,
      "learning_rate": 4.939732485949066e-06,
      "loss": 0.5379,
      "step": 180
    },
    {
      "epoch": 0.3807615230460922,
      "grad_norm": 1.458083413953667,
      "learning_rate": 4.927771041024627e-06,
      "loss": 0.5355,
      "step": 190
    },
    {
      "epoch": 0.40080160320641284,
      "grad_norm": 2.1624686975408336,
      "learning_rate": 4.914746689493731e-06,
      "loss": 0.54,
      "step": 200
    },
    {
      "epoch": 0.42084168336673344,
      "grad_norm": 1.841839502355017,
      "learning_rate": 4.900665788162962e-06,
      "loss": 0.5306,
      "step": 210
    },
    {
      "epoch": 0.4408817635270541,
      "grad_norm": 1.5623403972638494,
      "learning_rate": 4.885535209510081e-06,
      "loss": 0.531,
      "step": 220
    },
    {
      "epoch": 0.46092184368737477,
      "grad_norm": 2.0959167807238672,
      "learning_rate": 4.869362338329773e-06,
      "loss": 0.528,
      "step": 230
    },
    {
      "epoch": 0.48096192384769537,
      "grad_norm": 2.877452797335848,
      "learning_rate": 4.852155068129343e-06,
      "loss": 0.53,
      "step": 240
    },
    {
      "epoch": 0.501002004008016,
      "grad_norm": 2.406716714855783,
      "learning_rate": 4.8339217972761274e-06,
      "loss": 0.5242,
      "step": 250
    },
    {
      "epoch": 0.5210420841683366,
      "grad_norm": 2.6180161170191956,
      "learning_rate": 4.814671424898493e-06,
      "loss": 0.5218,
      "step": 260
    },
    {
      "epoch": 0.5410821643286573,
      "grad_norm": 1.9325754069366325,
      "learning_rate": 4.794413346542432e-06,
      "loss": 0.516,
      "step": 270
    },
    {
      "epoch": 0.561122244488978,
      "grad_norm": 1.6202018849080733,
      "learning_rate": 4.773157449585871e-06,
      "loss": 0.519,
      "step": 280
    },
    {
      "epoch": 0.5811623246492986,
      "grad_norm": 2.5386712058137846,
      "learning_rate": 4.750914108412927e-06,
      "loss": 0.5168,
      "step": 290
    },
    {
      "epoch": 0.6012024048096193,
      "grad_norm": 1.6204343506402092,
      "learning_rate": 4.727694179350476e-06,
      "loss": 0.5125,
      "step": 300
    },
    {
      "epoch": 0.6212424849699398,
      "grad_norm": 1.6404070884775066,
      "learning_rate": 4.703508995369497e-06,
      "loss": 0.513,
      "step": 310
    },
    {
      "epoch": 0.6412825651302605,
      "grad_norm": 1.6188579894187152,
      "learning_rate": 4.6783703605537715e-06,
      "loss": 0.5147,
      "step": 320
    },
    {
      "epoch": 0.6613226452905812,
      "grad_norm": 1.4143375173363102,
      "learning_rate": 4.652290544338663e-06,
      "loss": 0.5173,
      "step": 330
    },
    {
      "epoch": 0.6813627254509018,
      "grad_norm": 2.468479009235932,
      "learning_rate": 4.6252822755227605e-06,
      "loss": 0.5108,
      "step": 340
    },
    {
      "epoch": 0.7014028056112225,
      "grad_norm": 2.1271900619659854,
      "learning_rate": 4.5973587360553185e-06,
      "loss": 0.5131,
      "step": 350
    },
    {
      "epoch": 0.7214428857715431,
      "grad_norm": 1.5534652608564319,
      "learning_rate": 4.568533554602535e-06,
      "loss": 0.5083,
      "step": 360
    },
    {
      "epoch": 0.7414829659318637,
      "grad_norm": 1.6027013883759695,
      "learning_rate": 4.5388207998957986e-06,
      "loss": 0.5133,
      "step": 370
    },
    {
      "epoch": 0.7615230460921844,
      "grad_norm": 1.530068712838613,
      "learning_rate": 4.508234973865145e-06,
      "loss": 0.5157,
      "step": 380
    },
    {
      "epoch": 0.781563126252505,
      "grad_norm": 1.8195363489136889,
      "learning_rate": 4.476791004561293e-06,
      "loss": 0.512,
      "step": 390
    },
    {
      "epoch": 0.8016032064128257,
      "grad_norm": 1.637599921361452,
      "learning_rate": 4.4445042388696935e-06,
      "loss": 0.5095,
      "step": 400
    },
    {
      "epoch": 0.8216432865731463,
      "grad_norm": 1.5903329614208361,
      "learning_rate": 4.411390435020164e-06,
      "loss": 0.5131,
      "step": 410
    },
    {
      "epoch": 0.8416833667334669,
      "grad_norm": 1.5488153255590384,
      "learning_rate": 4.377465754895757e-06,
      "loss": 0.5052,
      "step": 420
    },
    {
      "epoch": 0.8617234468937875,
      "grad_norm": 1.8543008230812608,
      "learning_rate": 4.342746756144609e-06,
      "loss": 0.5124,
      "step": 430
    },
    {
      "epoch": 0.8817635270541082,
      "grad_norm": 1.5858389816681295,
      "learning_rate": 4.307250384098645e-06,
      "loss": 0.5094,
      "step": 440
    },
    {
      "epoch": 0.9018036072144289,
      "grad_norm": 1.6693267301390629,
      "learning_rate": 4.270993963503048e-06,
      "loss": 0.5055,
      "step": 450
    },
    {
      "epoch": 0.9218436873747495,
      "grad_norm": 1.4123291299839993,
      "learning_rate": 4.233995190060559e-06,
      "loss": 0.511,
      "step": 460
    },
    {
      "epoch": 0.9418837675350702,
      "grad_norm": 1.4294865023874812,
      "learning_rate": 4.196272121794714e-06,
      "loss": 0.5122,
      "step": 470
    },
    {
      "epoch": 0.9619238476953907,
      "grad_norm": 1.4747480031112472,
      "learning_rate": 4.15784317023626e-06,
      "loss": 0.5068,
      "step": 480
    },
    {
      "epoch": 0.9819639278557114,
      "grad_norm": 1.6382019572026507,
      "learning_rate": 4.118727091437013e-06,
      "loss": 0.5002,
      "step": 490
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.5069113373756409,
      "eval_runtime": 45.884,
      "eval_samples_per_second": 292.695,
      "eval_steps_per_second": 1.155,
      "step": 499
    },
    {
      "epoch": 1.002004008016032,
      "grad_norm": 3.2862336764098803,
      "learning_rate": 4.078942976815581e-06,
      "loss": 0.4942,
      "step": 500
    },
    {
      "epoch": 1.0220440881763526,
      "grad_norm": 1.7759252107171999,
      "learning_rate": 4.038510243839396e-06,
      "loss": 0.418,
      "step": 510
    },
    {
      "epoch": 1.0420841683366733,
      "grad_norm": 1.7448507912514557,
      "learning_rate": 3.997448626547613e-06,
      "loss": 0.4136,
      "step": 520
    },
    {
      "epoch": 1.062124248496994,
      "grad_norm": 1.4081466868123829,
      "learning_rate": 3.955778165919502e-06,
      "loss": 0.4131,
      "step": 530
    },
    {
      "epoch": 1.0821643286573146,
      "grad_norm": 1.4737485149847567,
      "learning_rate": 3.913519200093035e-06,
      "loss": 0.412,
      "step": 540
    },
    {
      "epoch": 1.1022044088176353,
      "grad_norm": 1.2635878378102765,
      "learning_rate": 3.870692354438423e-06,
      "loss": 0.4142,
      "step": 550
    },
    {
      "epoch": 1.122244488977956,
      "grad_norm": 1.553384214976449,
      "learning_rate": 3.827318531491478e-06,
      "loss": 0.4115,
      "step": 560
    },
    {
      "epoch": 1.1422845691382766,
      "grad_norm": 1.5870294004406946,
      "learning_rate": 3.7834189007517002e-06,
      "loss": 0.4108,
      "step": 570
    },
    {
      "epoch": 1.1623246492985972,
      "grad_norm": 1.6029534467749516,
      "learning_rate": 3.739014888350053e-06,
      "loss": 0.4116,
      "step": 580
    },
    {
      "epoch": 1.182364729458918,
      "grad_norm": 1.6194120236052227,
      "learning_rate": 3.694128166591494e-06,
      "loss": 0.4136,
      "step": 590
    },
    {
      "epoch": 1.2024048096192386,
      "grad_norm": 1.436406692065181,
      "learning_rate": 3.6487806433773615e-06,
      "loss": 0.4134,
      "step": 600
    },
    {
      "epoch": 1.2224448897795592,
      "grad_norm": 1.4163845777049167,
      "learning_rate": 3.6029944515127585e-06,
      "loss": 0.4091,
      "step": 610
    },
    {
      "epoch": 1.2424849699398797,
      "grad_norm": 1.5714505381282495,
      "learning_rate": 3.5567919379041783e-06,
      "loss": 0.4142,
      "step": 620
    },
    {
      "epoch": 1.2625250501002003,
      "grad_norm": 1.4088322992815865,
      "learning_rate": 3.510195652652629e-06,
      "loss": 0.4161,
      "step": 630
    },
    {
      "epoch": 1.282565130260521,
      "grad_norm": 1.6997351047753615,
      "learning_rate": 3.463228338047589e-06,
      "loss": 0.4059,
      "step": 640
    },
    {
      "epoch": 1.3026052104208417,
      "grad_norm": 1.5683685074334606,
      "learning_rate": 3.4159129174671534e-06,
      "loss": 0.4065,
      "step": 650
    },
    {
      "epoch": 1.3226452905811623,
      "grad_norm": 1.6253512472033467,
      "learning_rate": 3.3682724841898067e-06,
      "loss": 0.4108,
      "step": 660
    },
    {
      "epoch": 1.342685370741483,
      "grad_norm": 1.759831168192353,
      "learning_rate": 3.320330290123261e-06,
      "loss": 0.4103,
      "step": 670
    },
    {
      "epoch": 1.3627254509018036,
      "grad_norm": 1.7378970495344055,
      "learning_rate": 3.2721097344558794e-06,
      "loss": 0.4133,
      "step": 680
    },
    {
      "epoch": 1.3827655310621243,
      "grad_norm": 1.5427360104428394,
      "learning_rate": 3.223634352236213e-06,
      "loss": 0.4103,
      "step": 690
    },
    {
      "epoch": 1.402805611222445,
      "grad_norm": 1.4316640042632263,
      "learning_rate": 3.1749278028862325e-06,
      "loss": 0.4078,
      "step": 700
    },
    {
      "epoch": 1.4228456913827654,
      "grad_norm": 1.951831574889627,
      "learning_rate": 3.1260138586538483e-06,
      "loss": 0.4108,
      "step": 710
    },
    {
      "epoch": 1.4428857715430863,
      "grad_norm": 1.5703450558182246,
      "learning_rate": 3.076916393010373e-06,
      "loss": 0.4091,
      "step": 720
    },
    {
      "epoch": 1.4629258517034067,
      "grad_norm": 1.4374885381526987,
      "learning_rate": 3.0276593689985737e-06,
      "loss": 0.4056,
      "step": 730
    },
    {
      "epoch": 1.4829659318637274,
      "grad_norm": 1.2643689737278019,
      "learning_rate": 2.9782668275370095e-06,
      "loss": 0.4139,
      "step": 740
    },
    {
      "epoch": 1.503006012024048,
      "grad_norm": 1.570266686325652,
      "learning_rate": 2.928762875686358e-06,
      "loss": 0.4109,
      "step": 750
    },
    {
      "epoch": 1.5230460921843687,
      "grad_norm": 1.5743968516846465,
      "learning_rate": 2.879171674883462e-06,
      "loss": 0.4103,
      "step": 760
    },
    {
      "epoch": 1.5430861723446894,
      "grad_norm": 1.5844459981668437,
      "learning_rate": 2.8295174291488383e-06,
      "loss": 0.4109,
      "step": 770
    },
    {
      "epoch": 1.56312625250501,
      "grad_norm": 1.504354777149479,
      "learning_rate": 2.779824373273397e-06,
      "loss": 0.4118,
      "step": 780
    },
    {
      "epoch": 1.5831663326653307,
      "grad_norm": 1.3787147420821566,
      "learning_rate": 2.7301167609901474e-06,
      "loss": 0.4135,
      "step": 790
    },
    {
      "epoch": 1.6032064128256514,
      "grad_norm": 1.4777711687133863,
      "learning_rate": 2.680418853136659e-06,
      "loss": 0.4095,
      "step": 800
    },
    {
      "epoch": 1.623246492985972,
      "grad_norm": 1.468500779993122,
      "learning_rate": 2.630754905814048e-06,
      "loss": 0.4079,
      "step": 810
    },
    {
      "epoch": 1.6432865731462925,
      "grad_norm": 1.6134930461454124,
      "learning_rate": 2.58114915854829e-06,
      "loss": 0.4083,
      "step": 820
    },
    {
      "epoch": 1.6633266533066133,
      "grad_norm": 1.4588892173060275,
      "learning_rate": 2.531625822459603e-06,
      "loss": 0.4087,
      "step": 830
    },
    {
      "epoch": 1.6833667334669338,
      "grad_norm": 1.4497521826737287,
      "learning_rate": 2.4822090684457147e-06,
      "loss": 0.4113,
      "step": 840
    },
    {
      "epoch": 1.7034068136272547,
      "grad_norm": 1.4372338693007887,
      "learning_rate": 2.432923015384743e-06,
      "loss": 0.4047,
      "step": 850
    },
    {
      "epoch": 1.723446893787575,
      "grad_norm": 1.3561893796274846,
      "learning_rate": 2.3837917183634815e-06,
      "loss": 0.4085,
      "step": 860
    },
    {
      "epoch": 1.7434869739478958,
      "grad_norm": 1.3339197747345177,
      "learning_rate": 2.3348391569368064e-06,
      "loss": 0.411,
      "step": 870
    },
    {
      "epoch": 1.7635270541082164,
      "grad_norm": 1.2878261372500692,
      "learning_rate": 2.2860892234239565e-06,
      "loss": 0.4145,
      "step": 880
    },
    {
      "epoch": 1.783567134268537,
      "grad_norm": 1.3852796037018127,
      "learning_rate": 2.237565711247391e-06,
      "loss": 0.4109,
      "step": 890
    },
    {
      "epoch": 1.8036072144288577,
      "grad_norm": 1.343576505451422,
      "learning_rate": 2.1892923033199075e-06,
      "loss": 0.4078,
      "step": 900
    },
    {
      "epoch": 1.8236472945891784,
      "grad_norm": 1.4910844256604878,
      "learning_rate": 2.141292560485708e-06,
      "loss": 0.4064,
      "step": 910
    },
    {
      "epoch": 1.843687374749499,
      "grad_norm": 1.452632980241925,
      "learning_rate": 2.0935899100210316e-06,
      "loss": 0.4059,
      "step": 920
    },
    {
      "epoch": 1.8637274549098195,
      "grad_norm": 1.3416125112736146,
      "learning_rate": 2.046207634199989e-06,
      "loss": 0.405,
      "step": 930
    },
    {
      "epoch": 1.8837675350701404,
      "grad_norm": 1.3346098098736352,
      "learning_rate": 1.9991688589311575e-06,
      "loss": 0.4067,
      "step": 940
    },
    {
      "epoch": 1.9038076152304608,
      "grad_norm": 1.2974237604973982,
      "learning_rate": 1.9524965424705026e-06,
      "loss": 0.4024,
      "step": 950
    },
    {
      "epoch": 1.9238476953907817,
      "grad_norm": 1.471473412639078,
      "learning_rate": 1.9062134642161198e-06,
      "loss": 0.4091,
      "step": 960
    },
    {
      "epoch": 1.9438877755511021,
      "grad_norm": 1.357694886526054,
      "learning_rate": 1.8603422135902737e-06,
      "loss": 0.4057,
      "step": 970
    },
    {
      "epoch": 1.9639278557114228,
      "grad_norm": 1.2865807531711169,
      "learning_rate": 1.8149051790141628e-06,
      "loss": 0.4066,
      "step": 980
    },
    {
      "epoch": 1.9839679358717435,
      "grad_norm": 1.297389613332899,
      "learning_rate": 1.7699245369807778e-06,
      "loss": 0.4045,
      "step": 990
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.4974905848503113,
      "eval_runtime": 51.1615,
      "eval_samples_per_second": 262.502,
      "eval_steps_per_second": 1.036,
      "step": 998
    },
    {
      "epoch": 2.004008016032064,
      "grad_norm": 2.171277446089175,
      "learning_rate": 1.725422241231205e-06,
      "loss": 0.3859,
      "step": 1000
    },
    {
      "epoch": 2.024048096192385,
      "grad_norm": 1.7817202202370321,
      "learning_rate": 1.6814200120396438e-06,
      "loss": 0.3244,
      "step": 1010
    },
    {
      "epoch": 2.0440881763527052,
      "grad_norm": 1.619953212777582,
      "learning_rate": 1.6379393256123737e-06,
      "loss": 0.3211,
      "step": 1020
    },
    {
      "epoch": 2.064128256513026,
      "grad_norm": 1.40219382175259,
      "learning_rate": 1.595001403605844e-06,
      "loss": 0.3158,
      "step": 1030
    },
    {
      "epoch": 2.0841683366733466,
      "grad_norm": 1.4533085140998225,
      "learning_rate": 1.552627202769e-06,
      "loss": 0.318,
      "step": 1040
    },
    {
      "epoch": 2.1042084168336674,
      "grad_norm": 1.4269652142649647,
      "learning_rate": 1.5108374047149061e-06,
      "loss": 0.3197,
      "step": 1050
    },
    {
      "epoch": 2.124248496993988,
      "grad_norm": 1.537954200218747,
      "learning_rate": 1.4696524058266516e-06,
      "loss": 0.3196,
      "step": 1060
    },
    {
      "epoch": 2.1442885771543088,
      "grad_norm": 1.6449193889553193,
      "learning_rate": 1.4290923073024712e-06,
      "loss": 0.3214,
      "step": 1070
    },
    {
      "epoch": 2.164328657314629,
      "grad_norm": 1.3598251114998061,
      "learning_rate": 1.3891769053449355e-06,
      "loss": 0.321,
      "step": 1080
    },
    {
      "epoch": 2.18436873747495,
      "grad_norm": 1.551617792162964,
      "learning_rate": 1.3499256814990003e-06,
      "loss": 0.3172,
      "step": 1090
    },
    {
      "epoch": 2.2044088176352705,
      "grad_norm": 1.487413687096442,
      "learning_rate": 1.3113577931436332e-06,
      "loss": 0.319,
      "step": 1100
    },
    {
      "epoch": 2.2244488977955914,
      "grad_norm": 1.379093893132986,
      "learning_rate": 1.2734920641416556e-06,
      "loss": 0.3201,
      "step": 1110
    },
    {
      "epoch": 2.244488977955912,
      "grad_norm": 1.3525463916397538,
      "learning_rate": 1.236346975652358e-06,
      "loss": 0.32,
      "step": 1120
    },
    {
      "epoch": 2.2645290581162323,
      "grad_norm": 1.4572131883779704,
      "learning_rate": 1.1999406571113962e-06,
      "loss": 0.3171,
      "step": 1130
    },
    {
      "epoch": 2.284569138276553,
      "grad_norm": 1.4042689852309367,
      "learning_rate": 1.1642908773823274e-06,
      "loss": 0.3166,
      "step": 1140
    },
    {
      "epoch": 2.3046092184368736,
      "grad_norm": 1.3000414229131876,
      "learning_rate": 1.1294150360841577e-06,
      "loss": 0.3211,
      "step": 1150
    },
    {
      "epoch": 2.3246492985971945,
      "grad_norm": 1.3811842558284932,
      "learning_rate": 1.0953301550990882e-06,
      "loss": 0.3213,
      "step": 1160
    },
    {
      "epoch": 2.344689378757515,
      "grad_norm": 1.3259540554288098,
      "learning_rate": 1.0620528702646312e-06,
      "loss": 0.3193,
      "step": 1170
    },
    {
      "epoch": 2.364729458917836,
      "grad_norm": 1.4720294318649223,
      "learning_rate": 1.0295994232541454e-06,
      "loss": 0.32,
      "step": 1180
    },
    {
      "epoch": 2.3847695390781563,
      "grad_norm": 1.3734699044137106,
      "learning_rate": 9.979856536497435e-07,
      "loss": 0.3178,
      "step": 1190
    },
    {
      "epoch": 2.404809619238477,
      "grad_norm": 1.3553774328605976,
      "learning_rate": 9.672269912114582e-07,
      "loss": 0.3134,
      "step": 1200
    },
    {
      "epoch": 2.4248496993987976,
      "grad_norm": 1.350392284981299,
      "learning_rate": 9.373384483464223e-07,
      "loss": 0.3218,
      "step": 1210
    },
    {
      "epoch": 2.4448897795591185,
      "grad_norm": 1.4402104791397123,
      "learning_rate": 9.08334612781753e-07,
      "loss": 0.3205,
      "step": 1220
    },
    {
      "epoch": 2.464929859719439,
      "grad_norm": 1.3646686545789422,
      "learning_rate": 8.80229640444705e-07,
      "loss": 0.3216,
      "step": 1230
    },
    {
      "epoch": 2.4849699398797593,
      "grad_norm": 1.3499126545275342,
      "learning_rate": 8.530372485535751e-07,
      "loss": 0.3197,
      "step": 1240
    },
    {
      "epoch": 2.50501002004008,
      "grad_norm": 1.4013520657650143,
      "learning_rate": 8.267707089227288e-07,
      "loss": 0.3174,
      "step": 1250
    },
    {
      "epoch": 2.5250501002004007,
      "grad_norm": 1.3543143703928378,
      "learning_rate": 8.01442841485013e-07,
      "loss": 0.3237,
      "step": 1260
    },
    {
      "epoch": 2.5450901803607215,
      "grad_norm": 1.3528190597320051,
      "learning_rate": 7.770660080347213e-07,
      "loss": 0.3226,
      "step": 1270
    },
    {
      "epoch": 2.565130260521042,
      "grad_norm": 1.316933143733213,
      "learning_rate": 7.536521061941613e-07,
      "loss": 0.3178,
      "step": 1280
    },
    {
      "epoch": 2.585170340681363,
      "grad_norm": 1.2849910299224803,
      "learning_rate": 7.312125636067723e-07,
      "loss": 0.3225,
      "step": 1290
    },
    {
      "epoch": 2.6052104208416833,
      "grad_norm": 1.290228394422378,
      "learning_rate": 7.097583323596257e-07,
      "loss": 0.3222,
      "step": 1300
    },
    {
      "epoch": 2.6252505010020037,
      "grad_norm": 1.426441845965089,
      "learning_rate": 6.892998836380285e-07,
      "loss": 0.3173,
      "step": 1310
    },
    {
      "epoch": 2.6452905811623246,
      "grad_norm": 1.2801979621928115,
      "learning_rate": 6.698472026148459e-07,
      "loss": 0.3234,
      "step": 1320
    },
    {
      "epoch": 2.6653306613226455,
      "grad_norm": 1.3220972865520855,
      "learning_rate": 6.514097835770269e-07,
      "loss": 0.3175,
      "step": 1330
    },
    {
      "epoch": 2.685370741482966,
      "grad_norm": 1.3667409352018853,
      "learning_rate": 6.3399662529172e-07,
      "loss": 0.3184,
      "step": 1340
    },
    {
      "epoch": 2.7054108216432864,
      "grad_norm": 1.3528891456726284,
      "learning_rate": 6.176162266142376e-07,
      "loss": 0.323,
      "step": 1350
    },
    {
      "epoch": 2.7254509018036073,
      "grad_norm": 1.327321152396037,
      "learning_rate": 6.022765823400116e-07,
      "loss": 0.3215,
      "step": 1360
    },
    {
      "epoch": 2.7454909819639277,
      "grad_norm": 1.277409240062218,
      "learning_rate": 5.879851793025669e-07,
      "loss": 0.3182,
      "step": 1370
    },
    {
      "epoch": 2.7655310621242486,
      "grad_norm": 1.336386040490389,
      "learning_rate": 5.747489927194165e-07,
      "loss": 0.3202,
      "step": 1380
    },
    {
      "epoch": 2.785571142284569,
      "grad_norm": 1.2533448631397004,
      "learning_rate": 5.625744827876601e-07,
      "loss": 0.319,
      "step": 1390
    },
    {
      "epoch": 2.80561122244489,
      "grad_norm": 1.290234810317301,
      "learning_rate": 5.514675915309507e-07,
      "loss": 0.3185,
      "step": 1400
    },
    {
      "epoch": 2.8256513026052104,
      "grad_norm": 1.311881104807796,
      "learning_rate": 5.414337398993661e-07,
      "loss": 0.3156,
      "step": 1410
    },
    {
      "epoch": 2.845691382765531,
      "grad_norm": 1.3245343587549412,
      "learning_rate": 5.324778251236008e-07,
      "loss": 0.3191,
      "step": 1420
    },
    {
      "epoch": 2.8657314629258517,
      "grad_norm": 1.318165777680666,
      "learning_rate": 5.246042183247698e-07,
      "loss": 0.3141,
      "step": 1430
    },
    {
      "epoch": 2.8857715430861726,
      "grad_norm": 1.3833599862766017,
      "learning_rate": 5.178167623809932e-07,
      "loss": 0.3199,
      "step": 1440
    },
    {
      "epoch": 2.905811623246493,
      "grad_norm": 1.3572877385241544,
      "learning_rate": 5.121187700517977e-07,
      "loss": 0.3213,
      "step": 1450
    },
    {
      "epoch": 2.9258517034068134,
      "grad_norm": 1.289697605126265,
      "learning_rate": 5.075130223612563e-07,
      "loss": 0.3209,
      "step": 1460
    },
    {
      "epoch": 2.9458917835671343,
      "grad_norm": 1.3713215738345175,
      "learning_rate": 5.040017672406508e-07,
      "loss": 0.3152,
      "step": 1470
    },
    {
      "epoch": 2.9659318637274548,
      "grad_norm": 1.3065474715872867,
      "learning_rate": 5.015867184313224e-07,
      "loss": 0.3152,
      "step": 1480
    },
    {
      "epoch": 2.9859719438877756,
      "grad_norm": 1.29789945819745,
      "learning_rate": 5.002690546482441e-07,
      "loss": 0.3184,
      "step": 1490
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.5188336968421936,
      "eval_runtime": 52.0293,
      "eval_samples_per_second": 258.124,
      "eval_steps_per_second": 1.019,
      "step": 1497
    },
    {
      "epoch": 3.0,
      "step": 1497,
      "total_flos": 2507530756423680.0,
      "train_loss": 0.4253670667598625,
      "train_runtime": 9374.4269,
      "train_samples_per_second": 81.656,
      "train_steps_per_second": 0.16
    }
  ],
  "logging_steps": 10,
  "max_steps": 1497,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2507530756423680.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}