{ "best_metric": 2.395761489868164, "best_model_checkpoint": "../../saves/Baichuan2-7B-Chat/lora/sft/checkpoint-2000", "epoch": 7.901234567901234, "eval_steps": 400, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 1.1405200958251953, "learning_rate": 2.25e-05, "loss": 3.6308, "step": 10 }, { "epoch": 0.08, "grad_norm": 1.766953945159912, "learning_rate": 4.75e-05, "loss": 3.4926, "step": 20 }, { "epoch": 0.12, "grad_norm": 1.2588557004928589, "learning_rate": 4.99984138555282e-05, "loss": 3.2621, "step": 30 }, { "epoch": 0.16, "grad_norm": 0.9718258380889893, "learning_rate": 4.999293114538139e-05, "loss": 3.0924, "step": 40 }, { "epoch": 0.2, "grad_norm": 0.9004219770431519, "learning_rate": 4.998353314622318e-05, "loss": 3.0325, "step": 50 }, { "epoch": 0.24, "grad_norm": 0.7595831751823425, "learning_rate": 4.997022133030516e-05, "loss": 2.9351, "step": 60 }, { "epoch": 0.28, "grad_norm": 0.8930522799491882, "learning_rate": 4.9952997783001254e-05, "loss": 2.8068, "step": 70 }, { "epoch": 0.32, "grad_norm": 0.7985192537307739, "learning_rate": 4.9931865202480996e-05, "loss": 2.8503, "step": 80 }, { "epoch": 0.36, "grad_norm": 0.9129031896591187, "learning_rate": 4.990682689928687e-05, "loss": 2.7241, "step": 90 }, { "epoch": 0.4, "grad_norm": 0.8816404342651367, "learning_rate": 4.9877886795815685e-05, "loss": 2.8525, "step": 100 }, { "epoch": 0.43, "grad_norm": 0.8212659358978271, "learning_rate": 4.98450494257041e-05, "loss": 2.7173, "step": 110 }, { "epoch": 0.47, "grad_norm": 0.9286770224571228, "learning_rate": 4.980831993311844e-05, "loss": 2.7857, "step": 120 }, { "epoch": 0.51, "grad_norm": 0.95149165391922, "learning_rate": 4.976770407194877e-05, "loss": 2.6764, "step": 130 }, { "epoch": 0.55, "grad_norm": 1.1459342241287231, "learning_rate": 4.972320820490759e-05, "loss": 2.7001, "step": 140 }, { "epoch": 0.59, "grad_norm": 1.1330541372299194, "learning_rate": 4.967483930253302e-05, "loss": 2.7024, "step": 150 }, { "epoch": 0.63, "grad_norm": 0.9277874827384949, "learning_rate": 4.962260494209683e-05, "loss": 2.7039, "step": 160 }, { "epoch": 0.67, "grad_norm": 1.0230640172958374, "learning_rate": 4.9566513306417444e-05, "loss": 2.7423, "step": 170 }, { "epoch": 0.71, "grad_norm": 0.9915482997894287, "learning_rate": 4.950657318257805e-05, "loss": 2.7303, "step": 180 }, { "epoch": 0.75, "grad_norm": 1.105600357055664, "learning_rate": 4.944279396055003e-05, "loss": 2.6616, "step": 190 }, { "epoch": 0.79, "grad_norm": 1.1231801509857178, "learning_rate": 4.937518563172196e-05, "loss": 2.655, "step": 200 }, { "epoch": 0.83, "grad_norm": 0.908206582069397, "learning_rate": 4.930375878733445e-05, "loss": 2.6541, "step": 210 }, { "epoch": 0.87, "grad_norm": 1.087323546409607, "learning_rate": 4.922852461682093e-05, "loss": 2.5646, "step": 220 }, { "epoch": 0.91, "grad_norm": 1.0399665832519531, "learning_rate": 4.9149494906054716e-05, "loss": 2.6036, "step": 230 }, { "epoch": 0.95, "grad_norm": 0.9571551084518433, "learning_rate": 4.906668203550279e-05, "loss": 2.6212, "step": 240 }, { "epoch": 0.99, "grad_norm": 0.9485632181167603, "learning_rate": 4.8980098978286215e-05, "loss": 2.6717, "step": 250 }, { "epoch": 1.03, "grad_norm": 0.9359139204025269, "learning_rate": 4.888975929814792e-05, "loss": 2.5967, "step": 260 }, { "epoch": 1.07, "grad_norm": 1.2552564144134521, "learning_rate": 4.8795677147327776e-05, "loss": 2.5608, "step": 270 }, { "epoch": 1.11, "grad_norm": 0.9426449537277222, "learning_rate": 4.8697867264345616e-05, "loss": 2.5731, "step": 280 }, { "epoch": 1.15, "grad_norm": 1.132430076599121, "learning_rate": 4.859634497169233e-05, "loss": 2.5884, "step": 290 }, { "epoch": 1.19, "grad_norm": 0.9066994786262512, "learning_rate": 4.849112617342955e-05, "loss": 2.5888, "step": 300 }, { "epoch": 1.22, "grad_norm": 1.0188608169555664, "learning_rate": 4.8382227352698115e-05, "loss": 2.5849, "step": 310 }, { "epoch": 1.26, "grad_norm": 1.3850712776184082, "learning_rate": 4.826966556913597e-05, "loss": 2.485, "step": 320 }, { "epoch": 1.3, "grad_norm": 1.1342747211456299, "learning_rate": 4.815345845620563e-05, "loss": 2.5624, "step": 330 }, { "epoch": 1.34, "grad_norm": 1.0687206983566284, "learning_rate": 4.803362421843177e-05, "loss": 2.5051, "step": 340 }, { "epoch": 1.38, "grad_norm": 1.5436629056930542, "learning_rate": 4.7910181628549454e-05, "loss": 2.5185, "step": 350 }, { "epoch": 1.42, "grad_norm": 1.2030800580978394, "learning_rate": 4.77831500245632e-05, "loss": 2.5122, "step": 360 }, { "epoch": 1.46, "grad_norm": 1.2365000247955322, "learning_rate": 4.765254930671762e-05, "loss": 2.5704, "step": 370 }, { "epoch": 1.5, "grad_norm": 1.1403887271881104, "learning_rate": 4.75183999343799e-05, "loss": 2.5605, "step": 380 }, { "epoch": 1.54, "grad_norm": 1.2193725109100342, "learning_rate": 4.738072292283473e-05, "loss": 2.569, "step": 390 }, { "epoch": 1.58, "grad_norm": 1.4231560230255127, "learning_rate": 4.723953983999215e-05, "loss": 2.4809, "step": 400 }, { "epoch": 1.58, "eval_loss": 2.4551122188568115, "eval_runtime": 134.6274, "eval_samples_per_second": 6.685, "eval_steps_per_second": 3.343, "step": 400 }, { "epoch": 1.62, "grad_norm": 1.26221764087677, "learning_rate": 4.70948728030088e-05, "loss": 2.6339, "step": 410 }, { "epoch": 1.66, "grad_norm": 1.2207887172698975, "learning_rate": 4.694674447482312e-05, "loss": 2.5877, "step": 420 }, { "epoch": 1.7, "grad_norm": 1.2746591567993164, "learning_rate": 4.679517806060509e-05, "loss": 2.5866, "step": 430 }, { "epoch": 1.74, "grad_norm": 1.774005651473999, "learning_rate": 4.664019730412101e-05, "loss": 2.5073, "step": 440 }, { "epoch": 1.78, "grad_norm": 1.4896618127822876, "learning_rate": 4.648182648401389e-05, "loss": 2.4688, "step": 450 }, { "epoch": 1.82, "grad_norm": 1.3457367420196533, "learning_rate": 4.6320090410000027e-05, "loss": 2.527, "step": 460 }, { "epoch": 1.86, "grad_norm": 1.2498492002487183, "learning_rate": 4.615501441898248e-05, "loss": 2.625, "step": 470 }, { "epoch": 1.9, "grad_norm": 1.3643558025360107, "learning_rate": 4.598662437108186e-05, "loss": 2.4755, "step": 480 }, { "epoch": 1.94, "grad_norm": 1.198166847229004, "learning_rate": 4.581494664558518e-05, "loss": 2.5688, "step": 490 }, { "epoch": 1.98, "grad_norm": 3.3917434215545654, "learning_rate": 4.564000813681342e-05, "loss": 2.5182, "step": 500 }, { "epoch": 2.01, "grad_norm": 1.562139630317688, "learning_rate": 4.546183624990832e-05, "loss": 2.4533, "step": 510 }, { "epoch": 2.05, "grad_norm": 1.1284795999526978, "learning_rate": 4.528045889653927e-05, "loss": 2.4901, "step": 520 }, { "epoch": 2.09, "grad_norm": 1.7664827108383179, "learning_rate": 4.509590449053074e-05, "loss": 2.5075, "step": 530 }, { "epoch": 2.13, "grad_norm": 1.6162073612213135, "learning_rate": 4.49082019434111e-05, "loss": 2.4769, "step": 540 }, { "epoch": 2.17, "grad_norm": 1.3468163013458252, "learning_rate": 4.471738065988347e-05, "loss": 2.4979, "step": 550 }, { "epoch": 2.21, "grad_norm": 1.0762629508972168, "learning_rate": 4.452347053321926e-05, "loss": 2.5436, "step": 560 }, { "epoch": 2.25, "grad_norm": 1.1567480564117432, "learning_rate": 4.432650194057527e-05, "loss": 2.5454, "step": 570 }, { "epoch": 2.29, "grad_norm": 1.419041395187378, "learning_rate": 4.412650573823489e-05, "loss": 2.4681, "step": 580 }, { "epoch": 2.33, "grad_norm": 1.2923465967178345, "learning_rate": 4.392351325677433e-05, "loss": 2.565, "step": 590 }, { "epoch": 2.37, "grad_norm": 1.2892262935638428, "learning_rate": 4.371755629615442e-05, "loss": 2.5258, "step": 600 }, { "epoch": 2.41, "grad_norm": 1.467966914176941, "learning_rate": 4.3508667120739046e-05, "loss": 2.5776, "step": 610 }, { "epoch": 2.45, "grad_norm": 1.2278165817260742, "learning_rate": 4.329687845424069e-05, "loss": 2.4175, "step": 620 }, { "epoch": 2.49, "grad_norm": 1.3225311040878296, "learning_rate": 4.308222347459411e-05, "loss": 2.4561, "step": 630 }, { "epoch": 2.53, "grad_norm": 1.2582958936691284, "learning_rate": 4.286473580875878e-05, "loss": 2.3885, "step": 640 }, { "epoch": 2.57, "grad_norm": 1.206189751625061, "learning_rate": 4.264444952745108e-05, "loss": 2.5041, "step": 650 }, { "epoch": 2.61, "grad_norm": 1.9777090549468994, "learning_rate": 4.242139913980686e-05, "loss": 2.4763, "step": 660 }, { "epoch": 2.65, "grad_norm": 1.91414475440979, "learning_rate": 4.219561958797543e-05, "loss": 2.37, "step": 670 }, { "epoch": 2.69, "grad_norm": 1.0806653499603271, "learning_rate": 4.196714624164565e-05, "loss": 2.5985, "step": 680 }, { "epoch": 2.73, "grad_norm": 1.2435009479522705, "learning_rate": 4.1736014892505064e-05, "loss": 2.4765, "step": 690 }, { "epoch": 2.77, "grad_norm": 1.3920471668243408, "learning_rate": 4.150226174863292e-05, "loss": 2.4446, "step": 700 }, { "epoch": 2.8, "grad_norm": 1.949141263961792, "learning_rate": 4.126592342882795e-05, "loss": 2.4979, "step": 710 }, { "epoch": 2.84, "grad_norm": 1.1306403875350952, "learning_rate": 4.1027036956871854e-05, "loss": 2.4096, "step": 720 }, { "epoch": 2.88, "grad_norm": 0.9906802773475647, "learning_rate": 4.078563975572928e-05, "loss": 2.5409, "step": 730 }, { "epoch": 2.92, "grad_norm": 1.4917031526565552, "learning_rate": 4.054176964168528e-05, "loss": 2.4508, "step": 740 }, { "epoch": 2.96, "grad_norm": 1.554909110069275, "learning_rate": 4.029546481842123e-05, "loss": 2.4673, "step": 750 }, { "epoch": 3.0, "grad_norm": 1.2943602800369263, "learning_rate": 4.004676387102995e-05, "loss": 2.4801, "step": 760 }, { "epoch": 3.04, "grad_norm": 1.301687240600586, "learning_rate": 3.9795705759971116e-05, "loss": 2.4779, "step": 770 }, { "epoch": 3.08, "grad_norm": 1.2175750732421875, "learning_rate": 3.9542329814967914e-05, "loss": 2.3964, "step": 780 }, { "epoch": 3.12, "grad_norm": 2.502758502960205, "learning_rate": 3.92866757288458e-05, "loss": 2.4044, "step": 790 }, { "epoch": 3.16, "grad_norm": 1.508583664894104, "learning_rate": 3.9028783551314347e-05, "loss": 2.5229, "step": 800 }, { "epoch": 3.16, "eval_loss": 2.413785696029663, "eval_runtime": 133.2078, "eval_samples_per_second": 6.756, "eval_steps_per_second": 3.378, "step": 800 }, { "epoch": 3.2, "grad_norm": 1.3288512229919434, "learning_rate": 3.876869368269327e-05, "loss": 2.4517, "step": 810 }, { "epoch": 3.24, "grad_norm": 1.4469561576843262, "learning_rate": 3.850644686758346e-05, "loss": 2.5377, "step": 820 }, { "epoch": 3.28, "grad_norm": 1.560353398323059, "learning_rate": 3.82420841884841e-05, "loss": 2.3569, "step": 830 }, { "epoch": 3.32, "grad_norm": 1.9207900762557983, "learning_rate": 3.7975647059356875e-05, "loss": 2.4131, "step": 840 }, { "epoch": 3.36, "grad_norm": 1.685535192489624, "learning_rate": 3.770717721913819e-05, "loss": 2.5124, "step": 850 }, { "epoch": 3.4, "grad_norm": 1.3592054843902588, "learning_rate": 3.743671672520054e-05, "loss": 2.3343, "step": 860 }, { "epoch": 3.44, "grad_norm": 1.9445059299468994, "learning_rate": 3.716430794676402e-05, "loss": 2.4614, "step": 870 }, { "epoch": 3.48, "grad_norm": 1.6313419342041016, "learning_rate": 3.688999355825887e-05, "loss": 2.4678, "step": 880 }, { "epoch": 3.52, "grad_norm": 2.071474313735962, "learning_rate": 3.661381653264031e-05, "loss": 2.4016, "step": 890 }, { "epoch": 3.56, "grad_norm": 6.210580825805664, "learning_rate": 3.633582013465658e-05, "loss": 2.3772, "step": 900 }, { "epoch": 3.6, "grad_norm": 1.459627628326416, "learning_rate": 3.605604791407124e-05, "loss": 2.4438, "step": 910 }, { "epoch": 3.63, "grad_norm": 1.3812425136566162, "learning_rate": 3.577454369884086e-05, "loss": 2.4352, "step": 920 }, { "epoch": 3.67, "grad_norm": 1.443032145500183, "learning_rate": 3.549135158824913e-05, "loss": 2.3374, "step": 930 }, { "epoch": 3.71, "grad_norm": 2.8968636989593506, "learning_rate": 3.520651594599842e-05, "loss": 2.3911, "step": 940 }, { "epoch": 3.75, "grad_norm": 1.7020437717437744, "learning_rate": 3.4920081393259955e-05, "loss": 2.5022, "step": 950 }, { "epoch": 3.79, "grad_norm": 1.4983431100845337, "learning_rate": 3.463209280168365e-05, "loss": 2.4919, "step": 960 }, { "epoch": 3.83, "grad_norm": 1.527735948562622, "learning_rate": 3.434259528636872e-05, "loss": 2.423, "step": 970 }, { "epoch": 3.87, "grad_norm": 1.3608715534210205, "learning_rate": 3.405163419879611e-05, "loss": 2.4668, "step": 980 }, { "epoch": 3.91, "grad_norm": 1.6936486959457397, "learning_rate": 3.37592551197239e-05, "loss": 2.4736, "step": 990 }, { "epoch": 3.95, "grad_norm": 1.6318974494934082, "learning_rate": 3.34655038520469e-05, "loss": 2.4683, "step": 1000 }, { "epoch": 3.99, "grad_norm": 1.3295326232910156, "learning_rate": 3.317042641362126e-05, "loss": 2.3889, "step": 1010 }, { "epoch": 4.03, "grad_norm": 1.5521697998046875, "learning_rate": 3.2874069030055534e-05, "loss": 2.4913, "step": 1020 }, { "epoch": 4.07, "grad_norm": 1.2893122434616089, "learning_rate": 3.257647812746922e-05, "loss": 2.4289, "step": 1030 }, { "epoch": 4.11, "grad_norm": 1.4011497497558594, "learning_rate": 3.227770032521975e-05, "loss": 2.4604, "step": 1040 }, { "epoch": 4.15, "grad_norm": 1.7100721597671509, "learning_rate": 3.1977782428599364e-05, "loss": 2.3778, "step": 1050 }, { "epoch": 4.19, "grad_norm": 1.4909169673919678, "learning_rate": 3.1676771421502746e-05, "loss": 2.4634, "step": 1060 }, { "epoch": 4.23, "grad_norm": 2.009910821914673, "learning_rate": 3.137471445906675e-05, "loss": 2.4035, "step": 1070 }, { "epoch": 4.27, "grad_norm": 1.4564893245697021, "learning_rate": 3.107165886028326e-05, "loss": 2.4581, "step": 1080 }, { "epoch": 4.31, "grad_norm": 1.6162135601043701, "learning_rate": 3.076765210058638e-05, "loss": 2.4216, "step": 1090 }, { "epoch": 4.35, "grad_norm": 1.469684362411499, "learning_rate": 3.046274180441512e-05, "loss": 2.3395, "step": 1100 }, { "epoch": 4.39, "grad_norm": 2.3828556537628174, "learning_rate": 3.015697573775283e-05, "loss": 2.4602, "step": 1110 }, { "epoch": 4.42, "grad_norm": 1.5302035808563232, "learning_rate": 2.9850401800644257e-05, "loss": 2.4116, "step": 1120 }, { "epoch": 4.46, "grad_norm": 2.1008236408233643, "learning_rate": 2.9543068019691833e-05, "loss": 2.2545, "step": 1130 }, { "epoch": 4.5, "grad_norm": 1.4228670597076416, "learning_rate": 2.923502254053193e-05, "loss": 2.4589, "step": 1140 }, { "epoch": 4.54, "grad_norm": 1.4719305038452148, "learning_rate": 2.892631362029265e-05, "loss": 2.3918, "step": 1150 }, { "epoch": 4.58, "grad_norm": 1.771802544593811, "learning_rate": 2.8616989620034013e-05, "loss": 2.3929, "step": 1160 }, { "epoch": 4.62, "grad_norm": 1.5566627979278564, "learning_rate": 2.83070989971719e-05, "loss": 2.3442, "step": 1170 }, { "epoch": 4.66, "grad_norm": 1.8499693870544434, "learning_rate": 2.7996690297886995e-05, "loss": 2.4422, "step": 1180 }, { "epoch": 4.7, "grad_norm": 1.5866152048110962, "learning_rate": 2.768581214951964e-05, "loss": 2.4489, "step": 1190 }, { "epoch": 4.74, "grad_norm": 1.5571675300598145, "learning_rate": 2.737451325295214e-05, "loss": 2.3453, "step": 1200 }, { "epoch": 4.74, "eval_loss": 2.4050841331481934, "eval_runtime": 133.8041, "eval_samples_per_second": 6.726, "eval_steps_per_second": 3.363, "step": 1200 }, { "epoch": 4.78, "grad_norm": 1.371382474899292, "learning_rate": 2.706284237497948e-05, "loss": 2.3094, "step": 1210 }, { "epoch": 4.82, "grad_norm": 1.5894430875778198, "learning_rate": 2.675084834066968e-05, "loss": 2.352, "step": 1220 }, { "epoch": 4.86, "grad_norm": 1.9093360900878906, "learning_rate": 2.6438580025715138e-05, "loss": 2.3941, "step": 1230 }, { "epoch": 4.9, "grad_norm": 1.7057812213897705, "learning_rate": 2.612608634877588e-05, "loss": 2.408, "step": 1240 }, { "epoch": 4.94, "grad_norm": 4.448972225189209, "learning_rate": 2.5813416263816227e-05, "loss": 2.4234, "step": 1250 }, { "epoch": 4.98, "grad_norm": 1.4726636409759521, "learning_rate": 2.550061875243584e-05, "loss": 2.4223, "step": 1260 }, { "epoch": 5.02, "grad_norm": 1.3479527235031128, "learning_rate": 2.5187742816196487e-05, "loss": 2.3444, "step": 1270 }, { "epoch": 5.06, "grad_norm": 1.584678292274475, "learning_rate": 2.487483746894563e-05, "loss": 2.4881, "step": 1280 }, { "epoch": 5.1, "grad_norm": 1.5328477621078491, "learning_rate": 2.4561951729138167e-05, "loss": 2.3752, "step": 1290 }, { "epoch": 5.14, "grad_norm": 2.0441181659698486, "learning_rate": 2.4249134612157346e-05, "loss": 2.4605, "step": 1300 }, { "epoch": 5.18, "grad_norm": 1.5883582830429077, "learning_rate": 2.393643512263627e-05, "loss": 2.3095, "step": 1310 }, { "epoch": 5.21, "grad_norm": 1.6504632234573364, "learning_rate": 2.3623902246780994e-05, "loss": 2.3773, "step": 1320 }, { "epoch": 5.25, "grad_norm": 2.101841926574707, "learning_rate": 2.331158494469657e-05, "loss": 2.3966, "step": 1330 }, { "epoch": 5.29, "grad_norm": 1.5765920877456665, "learning_rate": 2.2999532142717174e-05, "loss": 2.4361, "step": 1340 }, { "epoch": 5.33, "grad_norm": 2.0858278274536133, "learning_rate": 2.268779272574146e-05, "loss": 2.3576, "step": 1350 }, { "epoch": 5.37, "grad_norm": 1.7046364545822144, "learning_rate": 2.2376415529574525e-05, "loss": 2.4298, "step": 1360 }, { "epoch": 5.41, "grad_norm": 1.7461296319961548, "learning_rate": 2.206544933327742e-05, "loss": 2.3175, "step": 1370 }, { "epoch": 5.45, "grad_norm": 2.0052788257598877, "learning_rate": 2.1754942851525677e-05, "loss": 2.3432, "step": 1380 }, { "epoch": 5.49, "grad_norm": 1.8527193069458008, "learning_rate": 2.1444944726977857e-05, "loss": 2.2937, "step": 1390 }, { "epoch": 5.53, "grad_norm": 1.8431612253189087, "learning_rate": 2.1135503522655374e-05, "loss": 2.3031, "step": 1400 }, { "epoch": 5.57, "grad_norm": 1.8249716758728027, "learning_rate": 2.082666771433484e-05, "loss": 2.4171, "step": 1410 }, { "epoch": 5.61, "grad_norm": 1.6596335172653198, "learning_rate": 2.0518485682954025e-05, "loss": 2.4917, "step": 1420 }, { "epoch": 5.65, "grad_norm": 1.8855317831039429, "learning_rate": 2.0211005707032733e-05, "loss": 2.3648, "step": 1430 }, { "epoch": 5.69, "grad_norm": 1.6180534362792969, "learning_rate": 1.9904275955109652e-05, "loss": 2.4083, "step": 1440 }, { "epoch": 5.73, "grad_norm": 1.5273176431655884, "learning_rate": 1.959834447819649e-05, "loss": 2.4187, "step": 1450 }, { "epoch": 5.77, "grad_norm": 1.8004169464111328, "learning_rate": 1.9293259202250517e-05, "loss": 2.4147, "step": 1460 }, { "epoch": 5.81, "grad_norm": 1.641048550605774, "learning_rate": 1.8989067920666633e-05, "loss": 2.3738, "step": 1470 }, { "epoch": 5.85, "grad_norm": 1.586946964263916, "learning_rate": 1.8685818286790325e-05, "loss": 2.4126, "step": 1480 }, { "epoch": 5.89, "grad_norm": 2.0825576782226562, "learning_rate": 1.8383557806452433e-05, "loss": 2.3781, "step": 1490 }, { "epoch": 5.93, "grad_norm": 1.7725000381469727, "learning_rate": 1.808233383052709e-05, "loss": 2.2956, "step": 1500 }, { "epoch": 5.97, "grad_norm": 1.7009029388427734, "learning_rate": 1.7782193547513974e-05, "loss": 2.3416, "step": 1510 }, { "epoch": 6.0, "grad_norm": 1.7379595041275024, "learning_rate": 1.7483183976145894e-05, "loss": 2.3466, "step": 1520 }, { "epoch": 6.04, "grad_norm": 1.678911805152893, "learning_rate": 1.7185351958023082e-05, "loss": 2.4167, "step": 1530 }, { "epoch": 6.08, "grad_norm": 1.663160800933838, "learning_rate": 1.6888744150275148e-05, "loss": 2.4156, "step": 1540 }, { "epoch": 6.12, "grad_norm": 1.5950766801834106, "learning_rate": 1.6593407018251973e-05, "loss": 2.3795, "step": 1550 }, { "epoch": 6.16, "grad_norm": 1.5608184337615967, "learning_rate": 1.6299386828244645e-05, "loss": 2.3945, "step": 1560 }, { "epoch": 6.2, "grad_norm": 2.6302921772003174, "learning_rate": 1.60067296402376e-05, "loss": 2.3195, "step": 1570 }, { "epoch": 6.24, "grad_norm": 1.7563198804855347, "learning_rate": 1.5715481300692993e-05, "loss": 2.3551, "step": 1580 }, { "epoch": 6.28, "grad_norm": 2.2081732749938965, "learning_rate": 1.5425687435368648e-05, "loss": 2.3597, "step": 1590 }, { "epoch": 6.32, "grad_norm": 2.120513916015625, "learning_rate": 1.5137393442170461e-05, "loss": 2.3758, "step": 1600 }, { "epoch": 6.32, "eval_loss": 2.400667428970337, "eval_runtime": 133.4489, "eval_samples_per_second": 6.744, "eval_steps_per_second": 3.372, "step": 1600 }, { "epoch": 6.36, "grad_norm": 1.9258661270141602, "learning_rate": 1.4850644484040584e-05, "loss": 2.3852, "step": 1610 }, { "epoch": 6.4, "grad_norm": 1.749426007270813, "learning_rate": 1.4565485481882396e-05, "loss": 2.3067, "step": 1620 }, { "epoch": 6.44, "grad_norm": 1.9953992366790771, "learning_rate": 1.4281961107523336e-05, "loss": 2.3013, "step": 1630 }, { "epoch": 6.48, "grad_norm": 2.156952381134033, "learning_rate": 1.4000115776716849e-05, "loss": 2.3504, "step": 1640 }, { "epoch": 6.52, "grad_norm": 2.4170098304748535, "learning_rate": 1.371999364218437e-05, "loss": 2.3035, "step": 1650 }, { "epoch": 6.56, "grad_norm": 2.338102340698242, "learning_rate": 1.3441638586698527e-05, "loss": 2.2753, "step": 1660 }, { "epoch": 6.6, "grad_norm": 2.286085605621338, "learning_rate": 1.3165094216208696e-05, "loss": 2.3644, "step": 1670 }, { "epoch": 6.64, "grad_norm": 2.505244016647339, "learning_rate": 1.2890403853009847e-05, "loss": 2.371, "step": 1680 }, { "epoch": 6.68, "grad_norm": 1.636423110961914, "learning_rate": 1.2617610528955814e-05, "loss": 2.3602, "step": 1690 }, { "epoch": 6.72, "grad_norm": 1.6253471374511719, "learning_rate": 1.234675697871818e-05, "loss": 2.3858, "step": 1700 }, { "epoch": 6.76, "grad_norm": 1.9490761756896973, "learning_rate": 1.2077885633091595e-05, "loss": 2.2864, "step": 1710 }, { "epoch": 6.8, "grad_norm": 1.7611408233642578, "learning_rate": 1.1811038612346728e-05, "loss": 2.2646, "step": 1720 }, { "epoch": 6.83, "grad_norm": 1.9415556192398071, "learning_rate": 1.154625771963192e-05, "loss": 2.311, "step": 1730 }, { "epoch": 6.87, "grad_norm": 2.0429086685180664, "learning_rate": 1.1283584434424455e-05, "loss": 2.3504, "step": 1740 }, { "epoch": 6.91, "grad_norm": 2.0815227031707764, "learning_rate": 1.102305990603257e-05, "loss": 2.3426, "step": 1750 }, { "epoch": 6.95, "grad_norm": 1.8559825420379639, "learning_rate": 1.0764724947149132e-05, "loss": 2.3183, "step": 1760 }, { "epoch": 6.99, "grad_norm": 2.6576716899871826, "learning_rate": 1.0508620027458158e-05, "loss": 2.378, "step": 1770 }, { "epoch": 7.03, "grad_norm": 1.9085129499435425, "learning_rate": 1.0254785267294958e-05, "loss": 2.3286, "step": 1780 }, { "epoch": 7.07, "grad_norm": 1.899032711982727, "learning_rate": 1.0003260431361039e-05, "loss": 2.3615, "step": 1790 }, { "epoch": 7.11, "grad_norm": 1.8750344514846802, "learning_rate": 9.75408492249478e-06, "loss": 2.3459, "step": 1800 }, { "epoch": 7.15, "grad_norm": 2.1118319034576416, "learning_rate": 9.507297775498707e-06, "loss": 2.4204, "step": 1810 }, { "epoch": 7.19, "grad_norm": 1.971839189529419, "learning_rate": 9.262937651024462e-06, "loss": 2.3497, "step": 1820 }, { "epoch": 7.23, "grad_norm": 2.0775558948516846, "learning_rate": 9.02104282951641e-06, "loss": 2.3027, "step": 1830 }, { "epoch": 7.27, "grad_norm": 2.3251700401306152, "learning_rate": 8.781651205214775e-06, "loss": 2.3317, "step": 1840 }, { "epoch": 7.31, "grad_norm": 2.0580742359161377, "learning_rate": 8.544800280219282e-06, "loss": 2.3516, "step": 1850 }, { "epoch": 7.35, "grad_norm": 2.2532641887664795, "learning_rate": 8.310527158614204e-06, "loss": 2.2712, "step": 1860 }, { "epoch": 7.39, "grad_norm": 2.1395344734191895, "learning_rate": 8.07886854065585e-06, "loss": 2.3357, "step": 1870 }, { "epoch": 7.43, "grad_norm": 1.6818287372589111, "learning_rate": 7.849860717023217e-06, "loss": 2.3414, "step": 1880 }, { "epoch": 7.47, "grad_norm": 2.29758882522583, "learning_rate": 7.62353956313284e-06, "loss": 2.2435, "step": 1890 }, { "epoch": 7.51, "grad_norm": 2.3084988594055176, "learning_rate": 7.3999405335187124e-06, "loss": 2.3185, "step": 1900 }, { "epoch": 7.55, "grad_norm": 2.1061959266662598, "learning_rate": 7.17909865627813e-06, "loss": 2.3499, "step": 1910 }, { "epoch": 7.59, "grad_norm": 1.7624549865722656, "learning_rate": 6.961048527584296e-06, "loss": 2.3895, "step": 1920 }, { "epoch": 7.62, "grad_norm": 2.2806477546691895, "learning_rate": 6.745824306266685e-06, "loss": 2.3313, "step": 1930 }, { "epoch": 7.66, "grad_norm": 1.7848331928253174, "learning_rate": 6.533459708459827e-06, "loss": 2.4686, "step": 1940 }, { "epoch": 7.7, "grad_norm": 1.8504621982574463, "learning_rate": 6.323988002321471e-06, "loss": 2.2985, "step": 1950 }, { "epoch": 7.74, "grad_norm": 2.1124908924102783, "learning_rate": 6.1174420028209585e-06, "loss": 2.3432, "step": 1960 }, { "epoch": 7.78, "grad_norm": 1.7486050128936768, "learning_rate": 5.9138540665985595e-06, "loss": 2.3414, "step": 1970 }, { "epoch": 7.82, "grad_norm": 2.394221782684326, "learning_rate": 5.713256086896604e-06, "loss": 2.3297, "step": 1980 }, { "epoch": 7.86, "grad_norm": 2.157335042953491, "learning_rate": 5.5156794885632165e-06, "loss": 2.2748, "step": 1990 }, { "epoch": 7.9, "grad_norm": 2.8654778003692627, "learning_rate": 5.3211552231294485e-06, "loss": 2.3073, "step": 2000 }, { "epoch": 7.9, "eval_loss": 2.395761489868164, "eval_runtime": 134.6111, "eval_samples_per_second": 6.686, "eval_steps_per_second": 3.343, "step": 2000 } ], "logging_steps": 10, "max_steps": 2530, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 400, "total_flos": 2.2040949247338086e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }