{
  "best_metric": 11.208358764648438,
  "best_model_checkpoint": "./FT_models/[LDH]0219_all_llama31_docs_nodocs/checkpoint-1000",
  "epoch": 2.716904276985743,
  "eval_steps": 500,
  "global_step": 3000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.009051821679112922, "grad_norm": 0.874797523021698, "learning_rate": 0.0001999995055317446, "loss": 1.9293, "step": 10},
    {"epoch": 0.018103643358225844, "grad_norm": 0.5239140391349792, "learning_rate": 0.0001999955498150411, "loss": 1.1773, "step": 20},
    {"epoch": 0.027155465037338764, "grad_norm": 0.49293628334999084, "learning_rate": 0.00019998763853811184, "loss": 1.1369, "step": 30},
    {"epoch": 0.03620728671645169, "grad_norm": 0.4514484405517578, "learning_rate": 0.00019997577201390606, "loss": 1.1049, "step": 40},
    {"epoch": 0.04525910839556461, "grad_norm": 0.480444073677063, "learning_rate": 0.0001999599507118322, "loss": 1.0472, "step": 50},
    {"epoch": 0.05431093007467753, "grad_norm": 0.6258535385131836, "learning_rate": 0.00019994017525773913, "loss": 1.0656, "step": 60},
    {"epoch": 0.06336275175379046, "grad_norm": 0.5886121392250061, "learning_rate": 0.0001999164464338918, "loss": 0.9311, "step": 70},
    {"epoch": 0.07241457343290338, "grad_norm": 0.6107162833213806, "learning_rate": 0.0001998887651789398, "loss": 1.0476, "step": 80},
    {"epoch": 0.0814663951120163, "grad_norm": 0.5811272859573364, "learning_rate": 0.0001998571325878806, "loss": 0.9568, "step": 90},
    {"epoch": 0.09051821679112922, "grad_norm": 0.6300623416900635, "learning_rate": 0.00019982154991201608, "loss": 0.9121, "step": 100},
    {"epoch": 0.09957003847024214, "grad_norm": 0.6835796236991882, "learning_rate": 0.00019978201855890308, "loss": 0.8682, "step": 110},
    {"epoch": 0.10862186014935506, "grad_norm": 0.5953958034515381, "learning_rate": 0.00019973854009229763, "loss": 0.9708, "step": 120},
    {"epoch": 0.11767368182846798, "grad_norm": 0.6399095058441162, "learning_rate": 0.00019969111623209323, "loss": 0.9063, "step": 130},
    {"epoch": 0.1267255035075809, "grad_norm": 0.5774116516113281, "learning_rate": 0.00019963974885425266, "loss": 0.9571, "step": 140},
    {"epoch": 0.13577732518669383, "grad_norm": 0.6394885182380676, "learning_rate": 0.00019958443999073397, "loss": 0.9187, "step": 150},
    {"epoch": 0.14482914686580675, "grad_norm": 0.6276863813400269, "learning_rate": 0.00019952519182940993, "loss": 0.9531, "step": 160},
    {"epoch": 0.15388096854491967, "grad_norm": 0.6715622544288635, "learning_rate": 0.0001994620067139815, "loss": 0.8558, "step": 170},
    {"epoch": 0.1629327902240326, "grad_norm": 0.7086506485939026, "learning_rate": 0.00019939488714388524, "loss": 0.8611, "step": 180},
    {"epoch": 0.1719846119031455, "grad_norm": 0.614281177520752, "learning_rate": 0.00019932383577419432, "loss": 0.7985, "step": 190},
    {"epoch": 0.18103643358225843, "grad_norm": 0.6501333117485046, "learning_rate": 0.0001992488554155135, "loss": 0.8663, "step": 200},
    {"epoch": 0.19008825526137135, "grad_norm": 0.59181809425354, "learning_rate": 0.0001991699490338681, "loss": 0.7611, "step": 210},
    {"epoch": 0.19914007694048427, "grad_norm": 0.6414404511451721, "learning_rate": 0.00019908711975058637, "loss": 0.7952, "step": 220},
    {"epoch": 0.2081918986195972, "grad_norm": 0.5972408056259155, "learning_rate": 0.00019900037084217637, "loss": 0.7415, "step": 230},
    {"epoch": 0.2172437202987101, "grad_norm": 0.6564370393753052, "learning_rate": 0.00019890970574019617, "loss": 0.7951, "step": 240},
    {"epoch": 0.22629554197782303, "grad_norm": 0.7463496327400208, "learning_rate": 0.00019881512803111796, "loss": 0.7421, "step": 250},
    {"epoch": 0.23534736365693595, "grad_norm": 0.6890644431114197, "learning_rate": 0.00019871664145618657, "loss": 0.7929, "step": 260},
    {"epoch": 0.24439918533604887, "grad_norm": 0.5937248468399048, "learning_rate": 0.00019861424991127115, "loss": 0.7801, "step": 270},
    {"epoch": 0.2534510070151618, "grad_norm": 0.7962441444396973, "learning_rate": 0.00019850795744671116, "loss": 0.7889, "step": 280},
    {"epoch": 0.2625028286942747, "grad_norm": 0.5598044991493225, "learning_rate": 0.00019839776826715614, "loss": 0.773, "step": 290},
    {"epoch": 0.27155465037338766, "grad_norm": 0.743926465511322, "learning_rate": 0.00019828368673139947, "loss": 0.7739, "step": 300},
    {"epoch": 0.28060647205250056, "grad_norm": 0.5907182693481445, "learning_rate": 0.00019816571735220583, "loss": 0.7472, "step": 310},
    {"epoch": 0.2896582937316135, "grad_norm": 0.7143293619155884, "learning_rate": 0.0001980438647961327, "loss": 0.723, "step": 320},
    {"epoch": 0.2987101154107264, "grad_norm": 0.6375603079795837, "learning_rate": 0.00019791813388334581, "loss": 0.7454, "step": 330},
    {"epoch": 0.30776193708983934, "grad_norm": 0.7762064933776855, "learning_rate": 0.00019778852958742853, "loss": 0.6882, "step": 340},
    {"epoch": 0.31681375876895224, "grad_norm": 0.5525858998298645, "learning_rate": 0.00019765505703518496, "loss": 0.6642, "step": 350},
    {"epoch": 0.3258655804480652, "grad_norm": 0.6317685842514038, "learning_rate": 0.00019751772150643722, "loss": 0.6965, "step": 360},
    {"epoch": 0.3349174021271781, "grad_norm": 0.664470374584198, "learning_rate": 0.0001973765284338167, "loss": 0.7218, "step": 370},
    {"epoch": 0.343969223806291, "grad_norm": 0.587965190410614, "learning_rate": 0.00019723148340254892, "loss": 0.7628, "step": 380},
    {"epoch": 0.3530210454854039, "grad_norm": 0.6755478978157043, "learning_rate": 0.0001970825921502328, "loss": 0.6277, "step": 390},
    {"epoch": 0.36207286716451687, "grad_norm": 0.6129139065742493, "learning_rate": 0.00019692986056661356, "loss": 0.7238, "step": 400},
    {"epoch": 0.37112468884362976, "grad_norm": 0.6757403612136841, "learning_rate": 0.0001967732946933499, "loss": 0.6568, "step": 410},
    {"epoch": 0.3801765105227427, "grad_norm": 0.7164187431335449, "learning_rate": 0.00019661290072377482, "loss": 0.6963, "step": 420},
    {"epoch": 0.3892283322018556, "grad_norm": 0.6669524908065796, "learning_rate": 0.0001964486850026507, "loss": 0.6931, "step": 430},
    {"epoch": 0.39828015388096855, "grad_norm": 0.6036201119422913, "learning_rate": 0.00019628065402591845, "loss": 0.6989, "step": 440},
    {"epoch": 0.4073319755600815, "grad_norm": 0.6022247076034546, "learning_rate": 0.0001961088144404403, "loss": 0.7669, "step": 450},
    {"epoch": 0.4163837972391944, "grad_norm": 0.6327004432678223, "learning_rate": 0.00019593317304373705, "loss": 0.6673, "step": 460},
    {"epoch": 0.42543561891830733, "grad_norm": 0.6826880574226379, "learning_rate": 0.00019575373678371909, "loss": 0.6965, "step": 470},
    {"epoch": 0.4344874405974202, "grad_norm": 0.6268287897109985, "learning_rate": 0.0001955705127584117, "loss": 0.6468, "step": 480},
    {"epoch": 0.4435392622765332, "grad_norm": 0.6102762818336487, "learning_rate": 0.00019538350821567404, "loss": 0.6302, "step": 490},
    {"epoch": 0.45259108395564607, "grad_norm": 0.672012448310852, "learning_rate": 0.00019519273055291266, "loss": 0.6529, "step": 500},
    {"epoch": 0.45259108395564607, "eval_loss": 11.249969482421875, "eval_runtime": 42.473, "eval_samples_per_second": 7.464, "eval_steps_per_second": 3.744, "step": 500},
    {"epoch": 0.461642905634759, "grad_norm": 0.6281114220619202, "learning_rate": 0.00019499818731678873, "loss": 0.705, "step": 510},
    {"epoch": 0.4706947273138719, "grad_norm": 0.6248394250869751, "learning_rate": 0.00019479988620291956, "loss": 0.6999, "step": 520},
    {"epoch": 0.47974654899298486, "grad_norm": 0.5792989134788513, "learning_rate": 0.00019459783505557424, "loss": 0.6166, "step": 530},
    {"epoch": 0.48879837067209775, "grad_norm": 0.6465005278587341, "learning_rate": 0.0001943920418673633, "loss": 0.5985, "step": 540},
    {"epoch": 0.4978501923512107, "grad_norm": 0.7002309560775757, "learning_rate": 0.0001941825147789225, "loss": 0.7327, "step": 550},
    {"epoch": 0.5069020140303236, "grad_norm": 0.6550771594047546, "learning_rate": 0.00019396926207859084, "loss": 0.6097, "step": 560},
    {"epoch": 0.5159538357094365, "grad_norm": 0.6645638942718506, "learning_rate": 0.00019375229220208276, "loss": 0.6032, "step": 570},
    {"epoch": 0.5250056573885494, "grad_norm": 0.6332526803016663, "learning_rate": 0.0001935316137321543, "loss": 0.647, "step": 580},
    {"epoch": 0.5340574790676623, "grad_norm": 0.6539055705070496, "learning_rate": 0.00019330723539826375, "loss": 0.5964, "step": 590},
    {"epoch": 0.5431093007467753, "grad_norm": 0.616939902305603, "learning_rate": 0.0001930791660762262, "loss": 0.6331, "step": 600},
    {"epoch": 0.5521611224258882, "grad_norm": 0.6726529598236084, "learning_rate": 0.0001928474147878626, "loss": 0.6395, "step": 610},
    {"epoch": 0.5612129441050011, "grad_norm": 0.6520217061042786, "learning_rate": 0.0001926119907006426, "loss": 0.5835, "step": 620},
    {"epoch": 0.570264765784114, "grad_norm": 0.6048600077629089, "learning_rate": 0.00019237290312732226, "loss": 0.5824, "step": 630},
    {"epoch": 0.579316587463227, "grad_norm": 0.6220108270645142, "learning_rate": 0.0001921301615255754, "loss": 0.638, "step": 640},
    {"epoch": 0.5883684091423399, "grad_norm": 0.6199471354484558, "learning_rate": 0.00019188377549761963, "loss": 0.591, "step": 650},
    {"epoch": 0.5974202308214528, "grad_norm": 0.6387749910354614, "learning_rate": 0.00019163375478983632, "loss": 0.5934, "step": 660},
    {"epoch": 0.6064720525005657, "grad_norm": 0.6121918559074402, "learning_rate": 0.00019138010929238534, "loss": 0.5164, "step": 670},
    {"epoch": 0.6155238741796787, "grad_norm": 0.5336770415306091, "learning_rate": 0.0001911228490388136, "loss": 0.5644, "step": 680},
    {"epoch": 0.6245756958587916, "grad_norm": 0.6676989793777466, "learning_rate": 0.00019086198420565823, "loss": 0.6162, "step": 690},
    {"epoch": 0.6336275175379045, "grad_norm": 0.7787268161773682, "learning_rate": 0.000190597525112044, "loss": 0.5458, "step": 700},
    {"epoch": 0.6426793392170175, "grad_norm": 0.5786814093589783, "learning_rate": 0.00019032948221927524, "loss": 0.6159, "step": 710},
    {"epoch": 0.6517311608961304, "grad_norm": 0.5182745456695557, "learning_rate": 0.00019005786613042185, "loss": 0.5528, "step": 720},
    {"epoch": 0.6607829825752433, "grad_norm": 0.6106360554695129, "learning_rate": 0.00018978268758989991, "loss": 0.5792, "step": 730},
    {"epoch": 0.6698348042543562, "grad_norm": 0.6614301800727844, "learning_rate": 0.00018950395748304678, "loss": 0.5251, "step": 740},
    {"epoch": 0.6788866259334692, "grad_norm": 0.4879720211029053, "learning_rate": 0.0001892216868356904, "loss": 0.5596, "step": 750},
    {"epoch": 0.687938447612582, "grad_norm": 0.7061878442764282, "learning_rate": 0.00018893588681371303, "loss": 0.5517, "step": 760},
    {"epoch": 0.6969902692916949, "grad_norm": 0.7787359952926636, "learning_rate": 0.00018864656872260985, "loss": 0.5813, "step": 770},
    {"epoch": 0.7060420909708078, "grad_norm": 0.5743793845176697, "learning_rate": 0.00018835374400704154, "loss": 0.56, "step": 780},
    {"epoch": 0.7150939126499208, "grad_norm": 0.5919637680053711, "learning_rate": 0.00018805742425038145, "loss": 0.5429, "step": 790},
    {"epoch": 0.7241457343290337, "grad_norm": 0.5750169157981873, "learning_rate": 0.00018775762117425777, "loss": 0.5303, "step": 800},
    {"epoch": 0.7331975560081466, "grad_norm": 0.5034388899803162, "learning_rate": 0.00018745434663808942, "loss": 0.554, "step": 810},
    {"epoch": 0.7422493776872595, "grad_norm": 0.6673147678375244, "learning_rate": 0.00018714761263861728, "loss": 0.5769, "step": 820},
    {"epoch": 0.7513011993663725, "grad_norm": 0.6599574089050293, "learning_rate": 0.00018683743130942928, "loss": 0.5316, "step": 830},
    {"epoch": 0.7603530210454854, "grad_norm": 0.6396362781524658, "learning_rate": 0.00018652381492048083, "loss": 0.5061, "step": 840},
    {"epoch": 0.7694048427245983, "grad_norm": 0.5517466068267822, "learning_rate": 0.00018620677587760916, "loss": 0.5951, "step": 850},
    {"epoch": 0.7784566644037112, "grad_norm": 0.5412144064903259, "learning_rate": 0.00018588632672204264, "loss": 0.4929, "step": 860},
    {"epoch": 0.7875084860828242, "grad_norm": 0.5431721210479736, "learning_rate": 0.00018556248012990468, "loss": 0.5358, "step": 870},
    {"epoch": 0.7965603077619371, "grad_norm": 0.5506939888000488, "learning_rate": 0.0001852352489117124, "loss": 0.4487, "step": 880},
    {"epoch": 0.80561212944105, "grad_norm": 0.5743805766105652, "learning_rate": 0.0001849046460118698, "loss": 0.5725, "step": 890},
    {"epoch": 0.814663951120163, "grad_norm": 0.573427677154541, "learning_rate": 0.00018457068450815562, "loss": 0.5078, "step": 900},
    {"epoch": 0.8237157727992759, "grad_norm": 0.6854224801063538, "learning_rate": 0.00018423337761120618, "loss": 0.4688, "step": 910},
    {"epoch": 0.8327675944783888, "grad_norm": 0.6372252702713013, "learning_rate": 0.00018389273866399275, "loss": 0.5169, "step": 920},
    {"epoch": 0.8418194161575017, "grad_norm": 0.6130049228668213, "learning_rate": 0.00018354878114129367, "loss": 0.5089, "step": 930},
    {"epoch": 0.8508712378366147, "grad_norm": 0.5654692649841309, "learning_rate": 0.00018320151864916135, "loss": 0.5492, "step": 940},
    {"epoch": 0.8599230595157276, "grad_norm": 0.548478901386261, "learning_rate": 0.00018285096492438424, "loss": 0.5218, "step": 950},
    {"epoch": 0.8689748811948405, "grad_norm": 0.5212107300758362, "learning_rate": 0.00018249713383394303, "loss": 0.5803, "step": 960},
    {"epoch": 0.8780267028739533, "grad_norm": 0.5414544939994812, "learning_rate": 0.00018214003937446253, "loss": 0.5677, "step": 970},
    {"epoch": 0.8870785245530663, "grad_norm": 0.58649742603302, "learning_rate": 0.0001817796956716578, "loss": 0.5331, "step": 980},
    {"epoch": 0.8961303462321792, "grad_norm": 0.5166681408882141, "learning_rate": 0.00018141611697977529, "loss": 0.5032, "step": 990},
    {"epoch": 0.9051821679112921, "grad_norm": 0.6174758076667786, "learning_rate": 0.0001810493176810292, "loss": 0.5148, "step": 1000},
    {"epoch": 0.9051821679112921, "eval_loss": 11.208358764648438, "eval_runtime": 42.0152, "eval_samples_per_second": 7.545, "eval_steps_per_second": 3.784, "step": 1000},
    {"epoch": 0.914233989590405, "grad_norm": 0.6219255924224854, "learning_rate": 0.00018067931228503246, "loss": 0.5486, "step": 1010},
    {"epoch": 0.923285811269518, "grad_norm": 0.5557869672775269, "learning_rate": 0.00018030611542822257, "loss": 0.4645, "step": 1020},
    {"epoch": 0.9323376329486309, "grad_norm": 0.6376787424087524, "learning_rate": 0.00017992974187328305, "loss": 0.5582, "step": 1030},
    {"epoch": 0.9413894546277438, "grad_norm": 0.6127185821533203, "learning_rate": 0.000179550206508559, "loss": 0.5878, "step": 1040},
    {"epoch": 0.9504412763068567, "grad_norm": 1.0581947565078735, "learning_rate": 0.00017916752434746856, "loss": 0.5113, "step": 1050},
    {"epoch": 0.9594930979859697, "grad_norm": 0.6159986257553101, "learning_rate": 0.00017878171052790868, "loss": 0.5226, "step": 1060},
    {"epoch": 0.9685449196650826, "grad_norm": 0.6697051525115967, "learning_rate": 0.00017839278031165658, "loss": 0.5011, "step": 1070},
    {"epoch": 0.9775967413441955, "grad_norm": 0.6310513019561768, "learning_rate": 0.00017800074908376584, "loss": 0.4599, "step": 1080},
    {"epoch": 0.9866485630233084, "grad_norm": 0.5671334266662598, "learning_rate": 0.0001776056323519579, "loss": 0.4455, "step": 1090},
    {"epoch": 0.9957003847024214, "grad_norm": 0.5935373902320862, "learning_rate": 0.00017720744574600863, "loss": 0.472, "step": 1100},
    {"epoch": 1.0054310930074677, "grad_norm": 0.4842943251132965, "learning_rate": 0.00017680620501712996, "loss": 0.4619, "step": 1110},
    {"epoch": 1.0144829146865806, "grad_norm": 0.5976426601409912, "learning_rate": 0.00017640192603734692, "loss": 0.4535, "step": 1120},
    {"epoch": 1.0235347363656937, "grad_norm": 0.5157873630523682, "learning_rate": 0.00017599462479886974, "loss": 0.3832, "step": 1130},
    {"epoch": 1.0325865580448066, "grad_norm": 0.6044872403144836, "learning_rate": 0.00017558431741346122, "loss": 0.4133, "step": 1140},
    {"epoch": 1.0416383797239195, "grad_norm": 0.6488142013549805, "learning_rate": 0.00017517102011179933, "loss": 0.4391, "step": 1150},
    {"epoch": 1.0506902014030324, "grad_norm": 0.5446122884750366, "learning_rate": 0.00017475474924283536, "loss": 0.3626, "step": 1160},
    {"epoch": 1.0597420230821453, "grad_norm": 0.4834959805011749, "learning_rate": 0.000174335521273147, "loss": 0.3712, "step": 1170},
    {"epoch": 1.0687938447612582, "grad_norm": 0.506747305393219, "learning_rate": 0.00017391335278628712, "loss": 0.3985, "step": 1180},
    {"epoch": 1.077845666440371, "grad_norm": 0.5550024509429932, "learning_rate": 0.0001734882604821276, "loss": 0.4201, "step": 1190},
    {"epoch": 1.086897488119484, "grad_norm": 0.5207591652870178, "learning_rate": 0.00017306026117619889, "loss": 0.3782, "step": 1200},
    {"epoch": 1.095949309798597, "grad_norm": 0.6619957089424133, "learning_rate": 0.00017262937179902472, "loss": 0.4203, "step": 1210},
    {"epoch": 1.10500113147771, "grad_norm": 0.6261160969734192, "learning_rate": 0.00017219560939545246, "loss": 0.3877, "step": 1220},
    {"epoch": 1.1140529531568228, "grad_norm": 0.6254423260688782, "learning_rate": 0.0001717589911239788, "loss": 0.4245, "step": 1230},
    {"epoch": 1.1231047748359357, "grad_norm": 0.5006200671195984, "learning_rate": 0.00017131953425607104, "loss": 0.399, "step": 1240},
    {"epoch": 1.1321565965150486, "grad_norm": 0.5162988305091858, "learning_rate": 0.00017087725617548385, "loss": 0.4325, "step": 1250},
    {"epoch": 1.1412084181941615, "grad_norm": 0.5567941665649414, "learning_rate": 0.00017043217437757164, "loss": 0.4391, "step": 1260},
    {"epoch": 1.1502602398732744, "grad_norm": 0.6780401468276978, "learning_rate": 0.00016998430646859654, "loss": 0.4388, "step": 1270},
    {"epoch": 1.1593120615523875, "grad_norm": 0.6121058464050293, "learning_rate": 0.00016953367016503182, "loss": 0.4216, "step": 1280},
    {"epoch": 1.1683638832315004, "grad_norm": 0.6004344820976257, "learning_rate": 0.00016908028329286112, "loss": 0.4171, "step": 1290},
    {"epoch": 1.1774157049106133, "grad_norm": 0.46892327070236206, "learning_rate": 0.0001686241637868734, "loss": 0.3628, "step": 1300},
    {"epoch": 1.1864675265897262, "grad_norm": 0.6038176417350769, "learning_rate": 0.00016816532968995328, "loss": 0.3771, "step": 1310},
    {"epoch": 1.195519348268839, "grad_norm": 0.5433237552642822, "learning_rate": 0.00016770379915236766, "loss": 0.4029, "step": 1320},
    {"epoch": 1.204571169947952, "grad_norm": 0.6447364091873169, "learning_rate": 0.00016723959043104728, "loss": 0.4154, "step": 1330},
    {"epoch": 1.2136229916270649, "grad_norm": 0.45961570739746094, "learning_rate": 0.00016677272188886483, "loss": 0.412, "step": 1340},
    {"epoch": 1.2226748133061778, "grad_norm": 0.5007642507553101, "learning_rate": 0.00016630321199390867, "loss": 0.4137, "step": 1350},
    {"epoch": 1.2317266349852907, "grad_norm": 0.4964968264102936, "learning_rate": 0.00016583107931875192, "loss": 0.3922, "step": 1360},
    {"epoch": 1.2407784566644038, "grad_norm": 0.573707640171051, "learning_rate": 0.00016535634253971794, "loss": 0.3778, "step": 1370},
    {"epoch": 1.2498302783435167, "grad_norm": 0.5992773771286011, "learning_rate": 0.00016487902043614173, "loss": 0.4528, "step": 1380},
    {"epoch": 1.2588821000226296, "grad_norm": 0.650651216506958, "learning_rate": 0.00016439913188962685, "loss": 0.3915, "step": 1390},
    {"epoch": 1.2679339217017425, "grad_norm": 0.5636417865753174, "learning_rate": 0.0001639166958832985, "loss": 0.3947, "step": 1400},
    {"epoch": 1.2769857433808554, "grad_norm": 0.5544995665550232, "learning_rate": 0.00016343173150105278, "loss": 0.4252, "step": 1410},
    {"epoch": 1.2860375650599682, "grad_norm": 0.5746600031852722, "learning_rate": 0.0001629442579268016, "loss": 0.4127, "step": 1420},
    {"epoch": 1.2950893867390811, "grad_norm": 0.5972291231155396, "learning_rate": 0.0001624542944437139, "loss": 0.3918, "step": 1430},
    {"epoch": 1.3041412084181943, "grad_norm": 0.6181298494338989, "learning_rate": 0.00016196186043345288, "loss": 0.3615, "step": 1440},
    {"epoch": 1.3131930300973071, "grad_norm": 0.5114730000495911, "learning_rate": 0.00016146697537540924, "loss": 0.3966, "step": 1450},
    {"epoch": 1.32224485177642, "grad_norm": 0.5740589499473572, "learning_rate": 0.0001609696588459307, "loss": 0.4061, "step": 1460},
    {"epoch": 1.331296673455533, "grad_norm": 0.6494064927101135, "learning_rate": 0.00016046993051754756, "loss": 0.3979, "step": 1470},
    {"epoch": 1.3403484951346458, "grad_norm": 0.6532962322235107, "learning_rate": 0.0001599678101581945, "loss": 0.4531, "step": 1480},
    {"epoch": 1.3494003168137587, "grad_norm": 0.7263498306274414, "learning_rate": 0.00015946331763042867, "loss": 0.3788, "step": 1490},
    {"epoch": 1.3584521384928716, "grad_norm": 0.8316817879676819, "learning_rate": 0.00015895647289064396, "loss": 0.4298, "step": 1500},
    {"epoch": 1.3584521384928716, "eval_loss": 11.852625846862793, "eval_runtime": 42.2271, "eval_samples_per_second": 7.507, "eval_steps_per_second": 3.765, "step": 1500},
    {"epoch": 1.3675039601719847, "grad_norm": 0.4995158314704895, "learning_rate": 0.0001584472959882815, "loss": 0.4021, "step": 1510},
    {"epoch": 1.3765557818510976, "grad_norm": 0.6390697360038757, "learning_rate": 0.0001579358070650367, "loss": 0.3633, "step": 1520},
    {"epoch": 1.3856076035302105, "grad_norm": 0.5589749217033386, "learning_rate": 0.00015742202635406235, "loss": 0.3524, "step": 1530},
    {"epoch": 1.3946594252093234, "grad_norm": 0.5728365778923035, "learning_rate": 0.0001569059741791684, "loss": 0.3456, "step": 1540},
    {"epoch": 1.4037112468884363, "grad_norm": 0.45577678084373474, "learning_rate": 0.0001563876709540178, "loss": 0.3706, "step": 1550},
    {"epoch": 1.4127630685675492, "grad_norm": 0.7161752581596375, "learning_rate": 0.00015586713718131922, "loss": 0.362, "step": 1560},
    {"epoch": 1.421814890246662, "grad_norm": 0.530432403087616, "learning_rate": 0.0001553443934520159, "loss": 0.3798, "step": 1570},
    {"epoch": 1.4308667119257752, "grad_norm": 0.5991781949996948, "learning_rate": 0.00015481946044447099, "loss": 0.3965, "step": 1580},
    {"epoch": 1.4399185336048879, "grad_norm": 0.5994631052017212, "learning_rate": 0.00015429235892364994, "loss": 0.3855, "step": 1590},
    {"epoch": 1.448970355284001, "grad_norm": 0.5677673816680908, "learning_rate": 0.00015376310974029873, "loss": 0.3773, "step": 1600},
    {"epoch": 1.4580221769631139, "grad_norm": 0.4799586236476898, "learning_rate": 0.0001532317338301192, "loss": 0.3466, "step": 1610},
    {"epoch": 1.4670739986422268, "grad_norm": 0.5448312163352966, "learning_rate": 0.00015269825221294098, "loss": 0.3664, "step": 1620},
    {"epoch": 1.4761258203213397, "grad_norm": 0.5199277400970459, "learning_rate": 0.0001521626859918898, "loss": 0.3838, "step": 1630},
    {"epoch": 1.4851776420004525, "grad_norm": 0.4976363778114319, "learning_rate": 0.00015162505635255287, "loss": 0.3811, "step": 1640},
    {"epoch": 1.4942294636795654, "grad_norm": 0.6075541377067566, "learning_rate": 0.0001510853845621409, "loss": 0.3756, "step": 1650},
    {"epoch": 1.5032812853586783, "grad_norm": 0.5979759097099304, "learning_rate": 0.00015054369196864644, "loss": 0.3973, "step": 1660},
    {"epoch": 1.5123331070377914, "grad_norm": 0.9770258665084839, "learning_rate": 0.00015000000000000001, "loss": 0.4124, "step": 1670},
    {"epoch": 1.5213849287169041, "grad_norm": 0.580491840839386, "learning_rate": 0.0001494543301632219, "loss": 0.3827, "step": 1680},
    {"epoch": 1.5304367503960172, "grad_norm": 0.6543178558349609, "learning_rate": 0.0001489067040435717, "loss": 0.4009, "step": 1690},
    {"epoch": 1.5394885720751301, "grad_norm": 0.5909605622291565, "learning_rate": 0.00014835714330369446, "loss": 0.4272, "step": 1700},
    {"epoch": 1.548540393754243, "grad_norm": 0.4959102272987366, "learning_rate": 0.0001478056696827636, "loss": 0.3536, "step": 1710},
    {"epoch": 1.557592215433356, "grad_norm": 0.5864228010177612, "learning_rate": 0.00014725230499562119, "loss": 0.3657, "step": 1720},
    {"epoch": 1.5666440371124688, "grad_norm": 0.6094350814819336, "learning_rate": 0.00014669707113191483, "loss": 0.3889, "step": 1730},
    {"epoch": 1.575695858791582, "grad_norm": 0.45741668343544006, "learning_rate": 0.00014613999005523174, "loss": 0.37, "step": 1740},
    {"epoch": 1.5847476804706946, "grad_norm": 0.5069059729576111, "learning_rate": 0.00014558108380223012, "loss": 0.3992, "step": 1750},
    {"epoch": 1.5937995021498077, "grad_norm": 0.535524845123291, "learning_rate": 0.00014502037448176734, "loss": 0.3763, "step": 1760},
    {"epoch": 1.6028513238289206, "grad_norm": 0.5067459940910339, "learning_rate": 0.00014445788427402528, "loss": 0.3409, "step": 1770},
    {"epoch": 1.6119031455080335, "grad_norm": 0.5375223159790039, "learning_rate": 0.00014389363542963306, "loss": 0.3778, "step": 1780},
    {"epoch": 1.6209549671871464, "grad_norm": 0.5057679414749146, "learning_rate": 0.00014332765026878687, "loss": 0.362, "step": 1790},
    {"epoch": 1.6300067888662593, "grad_norm": 0.5510945320129395, "learning_rate": 0.00014275995118036693, "loss": 0.374, "step": 1800},
    {"epoch": 1.6390586105453724, "grad_norm": 0.619204580783844, "learning_rate": 0.00014219056062105193, "loss": 0.4279, "step": 1810},
    {"epoch": 1.648110432224485, "grad_norm": 0.5340638160705566, "learning_rate": 0.00014161950111443077, "loss": 0.3675, "step": 1820},
    {"epoch": 1.6571622539035982, "grad_norm": 0.6182312965393066, "learning_rate": 0.0001410467952501114, "loss": 0.3686, "step": 1830},
    {"epoch": 1.666214075582711, "grad_norm": 0.5214246511459351, "learning_rate": 0.00014047246568282736, "loss": 0.4001, "step": 1840},
    {"epoch": 1.675265897261824, "grad_norm": 0.4987814128398895, "learning_rate": 0.00013989653513154165, "loss": 0.357, "step": 1850},
    {"epoch": 1.6843177189409368, "grad_norm": 0.6497951149940491, "learning_rate": 0.0001393190263785479, "loss": 0.3722, "step": 1860},
    {"epoch": 1.6933695406200497, "grad_norm": 0.4140976667404175, "learning_rate": 0.00013873996226856933, "loss": 0.316, "step": 1870},
    {"epoch": 1.7024213622991629, "grad_norm": 0.5191941857337952, "learning_rate": 0.00013815936570785487, "loss": 0.3865, "step": 1880},
    {"epoch": 1.7114731839782755, "grad_norm": 0.5386790037155151, "learning_rate": 0.00013757725966327322, "loss": 0.3499, "step": 1890},
    {"epoch": 1.7205250056573886, "grad_norm": 0.5060738921165466, "learning_rate": 0.00013699366716140435, "loss": 0.3481, "step": 1900},
    {"epoch": 1.7295768273365013, "grad_norm": 0.5718989968299866, "learning_rate": 0.0001364086112876284, "loss": 0.3713, "step": 1910},
    {"epoch": 1.7386286490156144, "grad_norm": 0.5812616348266602, "learning_rate": 0.00013582211518521273, "loss": 0.3651, "step": 1920},
    {"epoch": 1.7476804706947273, "grad_norm": 0.42051294445991516, "learning_rate": 0.00013523420205439646, "loss": 0.3581, "step": 1930},
    {"epoch": 1.7567322923738402, "grad_norm": 0.6224139928817749, "learning_rate": 0.00013464489515147238, "loss": 0.3663, "step": 1940},
    {"epoch": 1.765784114052953, "grad_norm": 0.5418313145637512, "learning_rate": 0.00013405421778786737, "loss": 0.3521, "step": 1950},
    {"epoch": 1.774835935732066, "grad_norm": 0.5429657101631165, "learning_rate": 0.00013346219332922016, "loss": 0.3946, "step": 1960},
    {"epoch": 1.783887757411179, "grad_norm": 0.5962104797363281, "learning_rate": 0.0001328688451944569, "loss": 0.4121, "step": 1970},
    {"epoch": 1.7929395790902918, "grad_norm": 0.5818737745285034, "learning_rate": 0.00013227419685486492, "loss": 0.3209, "step": 1980},
    {"epoch": 1.801991400769405, "grad_norm": 0.4606368839740753, "learning_rate": 0.0001316782718331643, "loss": 0.316, "step": 1990},
    {"epoch": 1.8110432224485178, "grad_norm": 0.49896928668022156, "learning_rate": 0.00013108109370257712, "loss": 0.3538, "step": 2000},
    {"epoch": 1.8110432224485178, "eval_loss": 11.89608383178711, "eval_runtime": 41.6812, "eval_samples_per_second": 7.605, "eval_steps_per_second": 3.815, "step": 2000},
    {"epoch": 1.8200950441276307, "grad_norm": 0.4855138659477234, "learning_rate": 0.00013048268608589533, "loss": 0.3578, "step": 2010},
    {"epoch": 1.8291468658067436, "grad_norm": 0.577150285243988, "learning_rate": 0.00012988307265454597, "loss": 0.3644, "step": 2020},
    {"epoch": 1.8381986874858565, "grad_norm": 0.5089125037193298, "learning_rate": 0.00012928227712765504, "loss": 0.398, "step": 2030},
    {"epoch": 1.8472505091649696, "grad_norm": 0.44641053676605225, "learning_rate": 0.00012868032327110904, "loss": 0.3656, "step": 2040},
    {"epoch": 1.8563023308440822, "grad_norm": 0.5212085247039795, "learning_rate": 0.00012807723489661495, "loss": 0.3396, "step": 2050},
    {"epoch": 1.8653541525231954, "grad_norm": 0.48269546031951904, "learning_rate": 0.0001274730358607583, "loss": 0.3649, "step": 2060},
    {"epoch": 1.8744059742023083, "grad_norm": 0.5938083529472351, "learning_rate": 0.00012686775006405946, "loss": 0.3318, "step": 2070},
    {"epoch": 1.8834577958814211, "grad_norm": 0.46941766142845154, "learning_rate": 0.0001262614014500282, "loss": 0.3625, "step": 2080},
    {"epoch": 1.892509617560534, "grad_norm": 0.6015097498893738, "learning_rate": 0.00012565401400421651, "loss": 0.4174, "step": 2090},
    {"epoch": 1.901561439239647, "grad_norm": 0.6819751858711243, "learning_rate": 0.00012504561175326985, "loss": 0.351, "step": 2100},
    {"epoch": 1.91061326091876, "grad_norm": 0.4607352018356323, "learning_rate": 0.0001244362187639767, "loss": 0.3488, "step": 2110},
    {"epoch": 1.9196650825978727, "grad_norm": 0.7872518301010132, "learning_rate": 0.0001238258591423165, "loss": 0.3719, "step": 2120},
    {"epoch": 1.9287169042769858, "grad_norm": 0.49948009848594666, "learning_rate": 0.00012321455703250616, "loss": 0.3415, "step": 2130},
    {"epoch": 1.9377687259560985, "grad_norm": 0.513294517993927, "learning_rate": 0.0001226023366160449, "loss": 0.3575, "step": 2140},
    {"epoch": 1.9468205476352116, "grad_norm": 0.7481987476348877, "learning_rate": 0.00012198922211075778, "loss": 0.3465, "step": 2150},
    {"epoch": 1.9558723693143245, "grad_norm": 0.4865647852420807, "learning_rate": 0.00012137523776983757, "loss": 0.3246, "step": 2160},
    {"epoch": 1.9649241909934374, "grad_norm": 0.35988450050354004, "learning_rate": 0.00012076040788088554, "loss": 0.3506, "step": 2170},
    {"epoch": 1.9739760126725503, "grad_norm": 0.46516749262809753, "learning_rate": 0.00012014475676495052, "loss": 0.3077, "step": 2180},
    {"epoch": 1.9830278343516632, "grad_norm": 0.5179293751716614, "learning_rate": 0.000119528308775567, "loss": 0.375, "step": 2190},
    {"epoch": 1.9920796560307763, "grad_norm": 0.5721428394317627, "learning_rate": 0.00011891108829779165, "loss": 0.3503, "step": 2200},
    {"epoch": 2.0018103643358227, "grad_norm": 0.4291672706604004, "learning_rate": 0.00011829311974723867, "loss": 0.3745, "step": 2210},
    {"epoch": 2.0108621860149354, "grad_norm": 0.4879133701324463, "learning_rate": 0.00011767442756911417, "loss": 0.2479, "step": 2220},
    {"epoch": 2.0199140076940485, "grad_norm": 0.5393707752227783, "learning_rate": 0.00011705503623724898, "loss": 0.222, "step": 2230},
    {"epoch": 2.028965829373161, "grad_norm": 0.5438636541366577, "learning_rate": 0.00011643497025313061, "loss": 0.2726, "step": 2240},
    {"epoch": 2.0380176510522743, "grad_norm": 0.5499648451805115, "learning_rate": 0.0001158142541449341, "loss": 0.2583, "step": 2250},
    {"epoch": 2.0470694727313874, "grad_norm": 0.5354553461074829, "learning_rate": 0.0001151929124665516, "loss": 0.2365, "step": 2260},
    {"epoch": 2.0561212944105, "grad_norm": 0.586413562297821, "learning_rate": 0.00011457096979662114, "loss": 0.243, "step": 2270},
    {"epoch": 2.065173116089613, "grad_norm": 0.48880791664123535, "learning_rate": 0.00011394845073755455, "loss": 0.2534, "step": 2280},
    {"epoch": 2.074224937768726, "grad_norm": 0.5162414908409119, "learning_rate": 0.00011332537991456398, "loss": 0.2279, "step": 2290},
    {"epoch": 2.083276759447839, "grad_norm": 0.6083577871322632, "learning_rate": 0.00011270178197468789, "loss": 0.2353, "step": 2300},
    {"epoch": 2.0923285811269516, "grad_norm": 0.5396526455879211, "learning_rate": 0.00011207768158581613, "loss": 0.2396, "step": 2310},
    {"epoch": 2.1013804028060648, "grad_norm": 0.5191901326179504, "learning_rate": 0.00011145310343571411, "loss": 0.2557, "step": 2320},
    {"epoch": 2.1104322244851774, "grad_norm": 0.4944523572921753, "learning_rate": 0.0001108280722310462, "loss": 0.2332, "step": 2330},
    {"epoch": 2.1194840461642905, "grad_norm": 0.49368587136268616, "learning_rate": 0.00011020261269639842, "loss": 0.2395, "step": 2340},
    {"epoch": 2.1285358678434037, "grad_norm": 0.5858927369117737, "learning_rate": 0.00010957674957330042, "loss": 0.27, "step": 2350},
    {"epoch": 2.1375876895225163, "grad_norm": 0.5289851427078247, "learning_rate": 0.00010895050761924668, "loss": 0.2777, "step": 2360},
    {"epoch": 2.1466395112016294, "grad_norm": 0.5512372851371765, "learning_rate": 0.00010832391160671729, "loss": 0.257, "step": 2370},
    {"epoch": 2.155691332880742, "grad_norm": 0.48882365226745605, "learning_rate": 0.00010769698632219794, "loss": 0.2513, "step": 2380},
    {"epoch": 2.164743154559855, "grad_norm": 0.4982316792011261, "learning_rate": 0.00010706975656519946, "loss": 0.2265, "step": 2390},
    {"epoch": 2.173794976238968, "grad_norm": 0.5512099266052246, "learning_rate": 0.00010644224714727681, "loss": 0.2586, "step": 2400},
    {"epoch": 2.182846797918081, "grad_norm": 0.5546220541000366, "learning_rate": 0.00010581448289104758, "loss": 0.2494, "step": 2410},
    {"epoch": 2.191898619597194, "grad_norm": 0.6402847170829773, "learning_rate": 0.00010518648862921012, "loss": 0.2467, "step": 2420},
    {"epoch": 2.200950441276307, "grad_norm": 0.4461919665336609, "learning_rate": 0.00010455828920356115, "loss": 0.2248, "step": 2430},
    {"epoch": 2.21000226295542, "grad_norm": 0.562235951423645, "learning_rate": 0.00010392990946401313, "loss": 0.2196, "step": 2440},
    {"epoch": 2.2190540846345326, "grad_norm": 0.49623265862464905, "learning_rate": 0.00010330137426761135, "loss": 0.2572, "step": 2450},
    {"epoch": 2.2281059063136457, "grad_norm": 0.5660530924797058, "learning_rate": 0.00010267270847755048, "loss": 0.2523, "step": 2460},
    {"epoch": 2.2371577279927584, "grad_norm": 0.5315167307853699, "learning_rate": 0.00010204393696219117, "loss": 0.2489, "step": 2470},
    {"epoch": 2.2462095496718715, "grad_norm": 0.530099093914032, "learning_rate": 0.00010141508459407623, "loss": 0.2603, "step": 2480},
    {"epoch": 2.2552613713509846, "grad_norm": 0.6275547742843628, "learning_rate": 0.00010078617624894684, "loss": 0.2338, "step": 2490},
    {"epoch": 2.2643131930300973, "grad_norm": 0.5999734997749329, "learning_rate": 0.00010015723680475846, "loss": 0.2418, "step": 2500},
    {"epoch": 2.2643131930300973, "eval_loss": 12.012301445007324, "eval_runtime": 42.3048, "eval_samples_per_second": 7.493, "eval_steps_per_second": 3.758, "step": 2500},
    {"epoch": 2.2733650147092104, "grad_norm": 0.5600557327270508, "learning_rate": 9.95282911406968e-05, "loss": 0.2394, "step": 2510},
    {"epoch": 2.282416836388323, "grad_norm": 0.383777379989624, "learning_rate": 9.889936413619356e-05, "loss": 0.2331, "step": 2520},
    {"epoch": 2.291468658067436, "grad_norm": 0.5634308457374573, "learning_rate": 9.827048066994225e-05, "loss": 0.2492, "step": 2530},
    {"epoch": 2.300520479746549, "grad_norm": 0.5783660411834717, "learning_rate": 9.764166561891432e-05, "loss": 0.2524, "step": 2540},
    {"epoch": 2.309572301425662, "grad_norm": 0.6156514286994934, "learning_rate": 9.70129438573747e-05, "loss": 0.2655, "step": 2550},
    {"epoch": 2.318624123104775, "grad_norm": 0.3963329493999481, "learning_rate": 9.63843402558981e-05, "loss": 0.2459, "step": 2560},
    {"epoch": 2.3276759447838877, "grad_norm": 0.4392152726650238, "learning_rate": 9.57558796803852e-05, "loss": 0.2579, "step": 2570},
    {"epoch": 2.336727766463001, "grad_norm": 0.5545589327812195, "learning_rate": 9.512758699107879e-05, "loss": 0.2716, "step": 2580},
    {"epoch": 2.3457795881421135, "grad_norm": 0.6256219744682312, "learning_rate": 9.449948704158071e-05, "loss": 0.2644, "step": 2590},
    {"epoch": 2.3548314098212266, "grad_norm": 0.5015664100646973, "learning_rate": 9.38716046778684e-05, "loss": 0.2603, "step": 2600},
    {"epoch": 2.3638832315003393, "grad_norm": 0.5921751260757446, "learning_rate": 9.324396473731217e-05, "loss": 0.2713, "step": 2610},
    {"epoch": 2.3729350531794524, "grad_norm": 0.4421112835407257, "learning_rate": 9.261659204769284e-05, "loss": 0.2663, "step": 2620},
    {"epoch": 2.3819868748585655, "grad_norm": 0.5723668336868286, "learning_rate": 9.198951142621929e-05, "loss": 0.2757, "step": 2630},
    {"epoch": 2.391038696537678, "grad_norm": 0.5875786542892456, "learning_rate": 9.136274767854716e-05, "loss": 0.2764, "step": 2640},
    {"epoch": 2.4000905182167913, "grad_norm": 0.6496031284332275, "learning_rate": 9.07363255977973e-05, "loss": 0.273, "step": 2650},
    {"epoch": 2.409142339895904, "grad_norm": 0.39376866817474365, "learning_rate": 9.011026996357503e-05, "loss": 0.23, "step": 2660},
    {"epoch": 2.418194161575017, "grad_norm": 0.6112829446792603, "learning_rate": 8.948460554099018e-05, "loss": 0.2605, "step": 2670},
    {"epoch": 2.4272459832541298, "grad_norm": 0.519059419631958, "learning_rate": 8.885935707967716e-05, "loss": 0.2501, "step": 2680},
    {"epoch": 2.436297804933243, "grad_norm": 0.8110079765319824, "learning_rate": 8.823454931281616e-05, "loss": 0.2387, "step": 2690},
    {"epoch": 2.4453496266123556, "grad_norm": 0.7474923133850098, "learning_rate": 8.76102069561545e-05, "loss": 0.2655, "step": 2700},
    {"epoch": 2.4544014482914687, "grad_norm": 0.44778284430503845, "learning_rate": 8.698635470702923e-05, "loss": 0.2285, "step": 2710},
    {"epoch": 2.4634532699705813, "grad_norm": 0.5677033066749573, "learning_rate": 8.636301724339004e-05, "loss": 0.2433, "step": 2720},
    {"epoch": 2.4725050916496945, "grad_norm": 0.5416902303695679, "learning_rate": 8.574021922282292e-05, "loss": 0.238, "step": 2730},
    {"epoch": 2.4815569133288076, "grad_norm": 0.5404213666915894, "learning_rate": 8.511798528157512e-05, "loss": 0.2682, "step": 2740},
    {"epoch": 2.4906087350079202, "grad_norm": 0.4115196168422699, "learning_rate": 8.449634003358022e-05, "loss": 0.2486, "step": 2750},
    {"epoch": 2.4996605566870334, "grad_norm": 0.40759924054145813, "learning_rate": 8.387530806948476e-05, "loss": 0.2381, "step": 2760},
    {"epoch": 2.508712378366146, "grad_norm": 0.5902087092399597, "learning_rate": 8.325491395567541e-05, "loss": 0.2364, "step": 2770},
    {"epoch": 2.517764200045259, "grad_norm": 0.5643983483314514, "learning_rate": 8.263518223330697e-05, "loss": 0.2322, "step": 2780},
    {"epoch": 2.526816021724372, "grad_norm": 0.55036860704422, "learning_rate": 8.201613741733203e-05, "loss": 0.2697, "step": 2790},
    {"epoch": 2.535867843403485, "grad_norm": 0.5846819877624512, "learning_rate": 8.13978039955308e-05, "loss": 0.2648, "step": 2800},
    {"epoch": 2.544919665082598, "grad_norm": 0.6239003539085388, "learning_rate": 8.078020642754274e-05, "loss": 0.244, "step": 2810},
    {"epoch": 2.5539714867617107, "grad_norm": 0.49254196882247925, "learning_rate": 8.016336914389874e-05, "loss": 0.2475, "step": 2820},
    {"epoch": 2.563023308440824, "grad_norm": 0.737869381904602, "learning_rate": 7.954731654505491e-05, "loss": 0.2694, "step": 2830},
    {"epoch": 2.5720751301199365, "grad_norm": 0.5183305144309998, "learning_rate": 7.89320730004274e-05, "loss": 0.234, "step": 2840},
    {"epoch": 2.5811269517990496, "grad_norm": 0.4777659773826599, "learning_rate": 7.831766284742807e-05, "loss": 0.2552, "step": 2850},
    {"epoch": 2.5901787734781623, "grad_norm": 0.553449809551239, "learning_rate": 7.77041103905023e-05, "loss": 0.2275, "step": 2860},
    {"epoch": 2.5992305951572754, "grad_norm": 0.542242169380188, "learning_rate": 7.709143990016702e-05, "loss": 0.2736, "step": 2870},
    {"epoch": 2.6082824168363885, "grad_norm": 0.564593493938446, "learning_rate": 7.6479675612051e-05, "loss": 0.2469, "step": 2880},
    {"epoch": 2.617334238515501, "grad_norm": 0.37938451766967773, "learning_rate": 7.586884172593609e-05, "loss": 0.2362, "step": 2890},
    {"epoch": 2.6263860601946143, "grad_norm": 0.6128523945808411, "learning_rate": 7.525896240479976e-05, "loss": 0.2456, "step": 2900},
    {"epoch": 2.635437881873727, "grad_norm": 0.6073201894760132, "learning_rate": 7.465006177385953e-05, "loss": 0.2413, "step": 2910},
    {"epoch": 2.64448970355284, "grad_norm": 0.4320588707923889, "learning_rate": 7.404216391961847e-05, "loss": 0.243, "step": 2920},
    {"epoch": 2.6535415252319527, "grad_norm": 0.40451350808143616, "learning_rate": 7.343529288891239e-05, "loss": 0.265, "step": 2930},
    {"epoch": 2.662593346911066, "grad_norm": 0.43473196029663086, "learning_rate": 7.282947268795877e-05, "loss": 0.2267, "step": 2940},
    {"epoch": 2.671645168590179, "grad_norm": 0.6388351917266846, "learning_rate": 7.222472728140695e-05, "loss": 0.2745, "step": 2950},
    {"epoch": 2.6806969902692916, "grad_norm": 0.5615083575248718, "learning_rate": 7.162108059139032e-05, "loss": 0.25, "step": 2960},
    {"epoch": 2.6897488119484048, "grad_norm": 0.6018364429473877, "learning_rate": 7.101855649657991e-05, "loss": 0.2601, "step": 2970},
    {"epoch": 2.6988006336275174, "grad_norm": 0.5931141376495361, "learning_rate": 7.041717883123977e-05, "loss": 0.2455, "step": 2980},
    {"epoch": 2.7078524553066305, "grad_norm": 0.4409467875957489, "learning_rate": 6.981697138428434e-05, "loss": 0.2584, "step": 2990},
    {"epoch": 2.716904276985743, "grad_norm": 0.6161777377128601, "learning_rate": 6.921795789833723e-05, "loss": 0.2364, "step": 3000},
    {"epoch": 2.716904276985743, "eval_loss": 11.997015953063965, "eval_runtime": 42.4537, "eval_samples_per_second": 7.467, "eval_steps_per_second": 3.745, "step": 3000}
  ],
  "logging_steps": 10,
  "max_steps": 5000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.6779958650393395e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}