{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994666666666666, "eval_steps": 500, "global_step": 937, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.319148936170213e-08, "logits/chosen": 0.015347272157669067, "logits/rejected": -0.12729741632938385, "logps/chosen": -550.8414916992188, "logps/rejected": -492.32574462890625, "loss": 0.2285, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 5.319148936170213e-07, "logits/chosen": 0.08696222305297852, "logits/rejected": 0.14597061276435852, "logps/chosen": -464.8576965332031, "logps/rejected": -456.1559753417969, "loss": 0.21, "rewards/accuracies": 0.3611111044883728, "rewards/chosen": 0.000442314165411517, "rewards/margins": -2.51087640208425e-05, "rewards/rejected": 0.00046742294216528535, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.0638297872340427e-06, "logits/chosen": 0.053336001932621, "logits/rejected": 0.027396252378821373, "logps/chosen": -463.5480041503906, "logps/rejected": -488.28094482421875, "loss": 0.2026, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.0002625098277349025, "rewards/margins": -8.491716289427131e-05, "rewards/rejected": 0.00034742700518108904, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.595744680851064e-06, "logits/chosen": 0.08084158599376678, "logits/rejected": 0.09464940428733826, "logps/chosen": -506.58782958984375, "logps/rejected": -511.69696044921875, "loss": 0.2155, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0002810861624311656, "rewards/margins": 8.317727770190686e-05, "rewards/rejected": -0.00036426345468498766, "step": 30 }, { "epoch": 0.04, "learning_rate": 2.1276595744680853e-06, "logits/chosen": 0.13661757111549377, "logits/rejected": 0.13296189904212952, "logps/chosen": -555.6744995117188, "logps/rejected": -553.1989135742188, "loss": 0.2092, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.000944433850236237, "rewards/margins": 0.00023557464010082185, "rewards/rejected": -0.0011800084030255675, "step": 40 }, { "epoch": 0.05, "learning_rate": 2.6595744680851065e-06, "logits/chosen": 0.11821017414331436, "logits/rejected": 0.09106048941612244, "logps/chosen": -486.61151123046875, "logps/rejected": -502.881591796875, "loss": 0.2104, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.0019579054787755013, "rewards/margins": 0.00025602790992707014, "rewards/rejected": -0.002213933737948537, "step": 50 }, { "epoch": 0.06, "learning_rate": 3.191489361702128e-06, "logits/chosen": 0.02751517854630947, "logits/rejected": 0.14413723349571228, "logps/chosen": -522.82177734375, "logps/rejected": -522.5377197265625, "loss": 0.2126, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.0038118392694741488, "rewards/margins": 0.0003993002756033093, "rewards/rejected": -0.004211139865219593, "step": 60 }, { "epoch": 0.07, "learning_rate": 3.723404255319149e-06, "logits/chosen": 0.1060861125588417, "logits/rejected": 0.10927315801382065, "logps/chosen": -507.0030822753906, "logps/rejected": -500.82489013671875, "loss": 0.2142, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.004309800453484058, "rewards/margins": 0.0013471845304593444, "rewards/rejected": -0.005656985100358725, "step": 70 }, { "epoch": 0.09, "learning_rate": 4.255319148936171e-06, "logits/chosen": 0.10468300431966782, "logits/rejected": 0.06107153370976448, "logps/chosen": -479.45989990234375, "logps/rejected": -488.6297302246094, "loss": 0.2109, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.004880805965512991, "rewards/margins": 0.0016875596484169364, "rewards/rejected": -0.0065683661960065365, "step": 80 }, { "epoch": 0.1, "learning_rate": 4.787234042553192e-06, "logits/chosen": 0.031617164611816406, "logits/rejected": 0.03786861151456833, "logps/chosen": -457.5731506347656, "logps/rejected": -467.76806640625, "loss": 0.2108, "rewards/accuracies": 0.5, "rewards/chosen": -0.006191219203174114, "rewards/margins": 0.003313812892884016, "rewards/rejected": -0.009505031630396843, "step": 90 }, { "epoch": 0.11, "learning_rate": 4.999375059004058e-06, "logits/chosen": 0.0778881087899208, "logits/rejected": -0.01506942231208086, "logps/chosen": -491.6114196777344, "logps/rejected": -491.8241271972656, "loss": 0.1988, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.010530633851885796, "rewards/margins": 0.003181540174409747, "rewards/rejected": -0.013712175190448761, "step": 100 }, { "epoch": 0.12, "learning_rate": 4.9955571065548795e-06, "logits/chosen": -0.0275646410882473, "logits/rejected": -0.05159539729356766, "logps/chosen": -512.9607543945312, "logps/rejected": -539.0792846679688, "loss": 0.2013, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.01575169898569584, "rewards/margins": 0.009278899058699608, "rewards/rejected": -0.025030598044395447, "step": 110 }, { "epoch": 0.13, "learning_rate": 4.9882736864879e-06, "logits/chosen": -0.037609659135341644, "logits/rejected": -0.007756671402603388, "logps/chosen": -528.1864624023438, "logps/rejected": -521.6083984375, "loss": 0.2006, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.016717851161956787, "rewards/margins": 0.00735442852601409, "rewards/rejected": -0.024072280153632164, "step": 120 }, { "epoch": 0.14, "learning_rate": 4.977534912960124e-06, "logits/chosen": 0.020100779831409454, "logits/rejected": 8.137784607242793e-05, "logps/chosen": -486.9697265625, "logps/rejected": -506.7543029785156, "loss": 0.1979, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.024340303614735603, "rewards/margins": 0.011229689233005047, "rewards/rejected": -0.03556998819112778, "step": 130 }, { "epoch": 0.15, "learning_rate": 4.963355698422092e-06, "logits/chosen": -0.014675384387373924, "logits/rejected": -0.04730897396802902, "logps/chosen": -488.8623046875, "logps/rejected": -493.58087158203125, "loss": 0.198, "rewards/accuracies": 0.5, "rewards/chosen": -0.018940208479762077, "rewards/margins": 0.01975865848362446, "rewards/rejected": -0.038698866963386536, "step": 140 }, { "epoch": 0.16, "learning_rate": 4.945755732909625e-06, "logits/chosen": -0.11544609069824219, "logits/rejected": -0.014225010760128498, "logps/chosen": -516.4327392578125, "logps/rejected": -552.8570556640625, "loss": 0.1998, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.026035359129309654, "rewards/margins": 0.02031863108277321, "rewards/rejected": -0.04635399580001831, "step": 150 }, { "epoch": 0.17, "learning_rate": 4.924759456701167e-06, "logits/chosen": -0.06522537022829056, "logits/rejected": -0.0837598517537117, "logps/chosen": -467.21722412109375, "logps/rejected": -517.3309326171875, "loss": 0.2005, "rewards/accuracies": 0.46875, "rewards/chosen": -0.02084728702902794, "rewards/margins": 0.025486458092927933, "rewards/rejected": -0.04633374512195587, "step": 160 }, { "epoch": 0.18, "learning_rate": 4.900396026378671e-06, "logits/chosen": -0.18057578802108765, "logits/rejected": -0.12668588757514954, "logps/chosen": -534.2403564453125, "logps/rejected": -615.2245483398438, "loss": 0.1981, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.017506513744592667, "rewards/margins": 0.03957374021410942, "rewards/rejected": -0.057080257683992386, "step": 170 }, { "epoch": 0.19, "learning_rate": 4.872699274339169e-06, "logits/chosen": -0.1534993201494217, "logits/rejected": -0.03731600195169449, "logps/chosen": -496.35247802734375, "logps/rejected": -547.9083862304688, "loss": 0.2, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.014407465234398842, "rewards/margins": 0.03549625352025032, "rewards/rejected": -0.049903713166713715, "step": 180 }, { "epoch": 0.2, "learning_rate": 4.8417076618132434e-06, "logits/chosen": -0.0889858677983284, "logits/rejected": -0.18024688959121704, "logps/chosen": -534.732421875, "logps/rejected": -549.1685791015625, "loss": 0.1997, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.02401977963745594, "rewards/margins": 0.026353713124990463, "rewards/rejected": -0.05037349462509155, "step": 190 }, { "epoch": 0.21, "learning_rate": 4.807464225455655e-06, "logits/chosen": -0.13999292254447937, "logits/rejected": -0.20869961380958557, "logps/chosen": -501.455810546875, "logps/rejected": -529.4840087890625, "loss": 0.1994, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.037735745310783386, "rewards/margins": 0.02034623734652996, "rewards/rejected": -0.0580819770693779, "step": 200 }, { "epoch": 0.22, "learning_rate": 4.770016517582283e-06, "logits/chosen": -0.16457560658454895, "logits/rejected": -0.16379991173744202, "logps/chosen": -577.7967529296875, "logps/rejected": -628.288818359375, "loss": 0.1953, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.05010377615690231, "rewards/margins": 0.03183088079094887, "rewards/rejected": -0.08193466067314148, "step": 210 }, { "epoch": 0.23, "learning_rate": 4.7294165401363616e-06, "logits/chosen": -0.19137120246887207, "logits/rejected": -0.0707189291715622, "logps/chosen": -590.2723999023438, "logps/rejected": -624.1236572265625, "loss": 0.1885, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0555083341896534, "rewards/margins": 0.029821401461958885, "rewards/rejected": -0.08532973378896713, "step": 220 }, { "epoch": 0.25, "learning_rate": 4.68572067247573e-06, "logits/chosen": -0.19102320075035095, "logits/rejected": -0.18087339401245117, "logps/chosen": -493.77886962890625, "logps/rejected": -550.4117431640625, "loss": 0.2015, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.06666065752506256, "rewards/margins": 0.018180230632424355, "rewards/rejected": -0.08484089374542236, "step": 230 }, { "epoch": 0.26, "learning_rate": 4.638989593081364e-06, "logits/chosen": -0.14345607161521912, "logits/rejected": -0.14746864140033722, "logps/chosen": -489.48248291015625, "logps/rejected": -535.217041015625, "loss": 0.1916, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.04723743349313736, "rewards/margins": 0.03517382964491844, "rewards/rejected": -0.0824112594127655, "step": 240 }, { "epoch": 0.27, "learning_rate": 4.5892881952959015e-06, "logits/chosen": -0.11015196144580841, "logits/rejected": -0.07484224438667297, "logps/chosen": -516.731689453125, "logps/rejected": -559.3098754882812, "loss": 0.187, "rewards/accuracies": 0.5, "rewards/chosen": -0.03826197609305382, "rewards/margins": 0.031822480261325836, "rewards/rejected": -0.07008445262908936, "step": 250 }, { "epoch": 0.28, "learning_rate": 4.536685497209182e-06, "logits/chosen": -0.11327183246612549, "logits/rejected": -0.16368862986564636, "logps/chosen": -511.823974609375, "logps/rejected": -565.7365112304688, "loss": 0.1924, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.04240923374891281, "rewards/margins": 0.04024555906653404, "rewards/rejected": -0.08265479654073715, "step": 260 }, { "epoch": 0.29, "learning_rate": 4.481254545815943e-06, "logits/chosen": -0.11171416938304901, "logits/rejected": -0.09737102687358856, "logps/chosen": -594.6912231445312, "logps/rejected": -623.1627807617188, "loss": 0.2014, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.04938292130827904, "rewards/margins": 0.02359354868531227, "rewards/rejected": -0.07297646999359131, "step": 270 }, { "epoch": 0.3, "learning_rate": 4.42307231557875e-06, "logits/chosen": -0.02691875956952572, "logits/rejected": -0.15112504363059998, "logps/chosen": -557.1254272460938, "logps/rejected": -591.5187377929688, "loss": 0.1912, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.03798414021730423, "rewards/margins": 0.02616792358458042, "rewards/rejected": -0.0641520619392395, "step": 280 }, { "epoch": 0.31, "learning_rate": 4.3622196015370305e-06, "logits/chosen": -0.06971077620983124, "logits/rejected": -0.1268663853406906, "logps/chosen": -498.1439514160156, "logps/rejected": -574.2821044921875, "loss": 0.195, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.044213201850652695, "rewards/margins": 0.02420051395893097, "rewards/rejected": -0.06841371208429337, "step": 290 }, { "epoch": 0.32, "learning_rate": 4.298780907110648e-06, "logits/chosen": -0.03687559440732002, "logits/rejected": -0.08016426861286163, "logps/chosen": -526.9334716796875, "logps/rejected": -581.825439453125, "loss": 0.1907, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.046329911798238754, "rewards/margins": 0.03305746242403984, "rewards/rejected": -0.079387366771698, "step": 300 }, { "epoch": 0.33, "learning_rate": 4.23284432675381e-06, "logits/chosen": -0.10335829108953476, "logits/rejected": -0.12378430366516113, "logps/chosen": -504.7950134277344, "logps/rejected": -553.68115234375, "loss": 0.1905, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.053347356617450714, "rewards/margins": 0.03879328817129135, "rewards/rejected": -0.09214064478874207, "step": 310 }, { "epoch": 0.34, "learning_rate": 4.164501423622277e-06, "logits/chosen": -0.1206481009721756, "logits/rejected": -0.052752863615751266, "logps/chosen": -516.8829345703125, "logps/rejected": -491.26849365234375, "loss": 0.2077, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0364055298268795, "rewards/margins": 0.00497065857052803, "rewards/rejected": -0.04137618839740753, "step": 320 }, { "epoch": 0.35, "learning_rate": 4.0938471024237355e-06, "logits/chosen": -0.058399152010679245, "logits/rejected": -0.004985150881111622, "logps/chosen": -496.40020751953125, "logps/rejected": -557.3847045898438, "loss": 0.1882, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.013055197894573212, "rewards/margins": 0.04152555763721466, "rewards/rejected": -0.05458075553178787, "step": 330 }, { "epoch": 0.36, "learning_rate": 4.020979477627907e-06, "logits/chosen": -0.00046800225391052663, "logits/rejected": -0.04824506491422653, "logps/chosen": -511.21728515625, "logps/rejected": -525.9166259765625, "loss": 0.196, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.004664528649300337, "rewards/margins": 0.02710866369307041, "rewards/rejected": -0.03177319094538689, "step": 340 }, { "epoch": 0.37, "learning_rate": 3.9459997372194105e-06, "logits/chosen": -0.01971629448235035, "logits/rejected": 0.02092793397605419, "logps/chosen": -464.59808349609375, "logps/rejected": -518.8754272460938, "loss": 0.196, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.011982052586972713, "rewards/margins": 0.03715554624795914, "rewards/rejected": -0.049137599766254425, "step": 350 }, { "epoch": 0.38, "learning_rate": 3.869012002182573e-06, "logits/chosen": -0.13931025564670563, "logits/rejected": -0.10487548261880875, "logps/chosen": -488.48443603515625, "logps/rejected": -540.0478515625, "loss": 0.1955, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.007292003836482763, "rewards/margins": 0.028780395165085793, "rewards/rejected": -0.03607239946722984, "step": 360 }, { "epoch": 0.39, "learning_rate": 3.7901231819133104e-06, "logits/chosen": -0.0676584541797638, "logits/rejected": 0.000223781171371229, "logps/chosen": -515.3255615234375, "logps/rejected": -534.4259643554688, "loss": 0.1928, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.003454460995271802, "rewards/margins": 0.024057697504758835, "rewards/rejected": -0.02751215733587742, "step": 370 }, { "epoch": 0.41, "learning_rate": 3.709442825758875e-06, "logits/chosen": -0.13946941494941711, "logits/rejected": -0.10190458595752716, "logps/chosen": -506.9981994628906, "logps/rejected": -556.169921875, "loss": 0.1991, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.011531559750437737, "rewards/margins": 0.044275928288698196, "rewards/rejected": -0.055807482451200485, "step": 380 }, { "epoch": 0.42, "learning_rate": 3.6270829708916113e-06, "logits/chosen": -0.19370055198669434, "logits/rejected": -0.13238494098186493, "logps/chosen": -493.8172302246094, "logps/rejected": -526.5756225585938, "loss": 0.1965, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.01078014075756073, "rewards/margins": 0.024004068225622177, "rewards/rejected": -0.03478420898318291, "step": 390 }, { "epoch": 0.43, "learning_rate": 3.543157986727991e-06, "logits/chosen": -0.07623752951622009, "logits/rejected": -0.13485901057720184, "logps/chosen": -493.17303466796875, "logps/rejected": -547.02099609375, "loss": 0.2009, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.013095049187541008, "rewards/margins": 0.04162532463669777, "rewards/rejected": -0.05472037196159363, "step": 400 }, { "epoch": 0.44, "learning_rate": 3.4577844161089614e-06, "logits/chosen": 0.006638138089329004, "logits/rejected": -0.16132843494415283, "logps/chosen": -505.0772399902344, "logps/rejected": -525.45849609375, "loss": 0.2036, "rewards/accuracies": 0.4375, "rewards/chosen": -9.345449507236481e-05, "rewards/margins": 0.02224581316113472, "rewards/rejected": -0.022339265793561935, "step": 410 }, { "epoch": 0.45, "learning_rate": 3.3710808134621577e-06, "logits/chosen": -0.07844109833240509, "logits/rejected": -0.09653671830892563, "logps/chosen": -464.03411865234375, "logps/rejected": -497.51910400390625, "loss": 0.2023, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.0025765378959476948, "rewards/margins": 0.03173336759209633, "rewards/rejected": -0.02915683016180992, "step": 420 }, { "epoch": 0.46, "learning_rate": 3.2831675801707126e-06, "logits/chosen": -0.12853233516216278, "logits/rejected": -0.06648223102092743, "logps/chosen": -487.64410400390625, "logps/rejected": -543.6026611328125, "loss": 0.1898, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0037212104070931673, "rewards/margins": 0.044964469969272614, "rewards/rejected": -0.041243262588977814, "step": 430 }, { "epoch": 0.47, "learning_rate": 3.194166797377289e-06, "logits/chosen": -0.13361698389053345, "logits/rejected": -0.0924164280295372, "logps/chosen": -488.75225830078125, "logps/rejected": -554.2010498046875, "loss": 0.1913, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.007956433109939098, "rewards/margins": 0.04044501110911369, "rewards/rejected": -0.03248857706785202, "step": 440 }, { "epoch": 0.48, "learning_rate": 3.104202056455501e-06, "logits/chosen": -0.05866111442446709, "logits/rejected": -0.11707846075296402, "logps/chosen": -475.849609375, "logps/rejected": -527.5254516601562, "loss": 0.1889, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0031708565074950457, "rewards/margins": 0.04169551655650139, "rewards/rejected": -0.03852466121315956, "step": 450 }, { "epoch": 0.49, "learning_rate": 3.013398287384144e-06, "logits/chosen": -0.03628942370414734, "logits/rejected": -0.05839689448475838, "logps/chosen": -450.5067443847656, "logps/rejected": -480.0916442871094, "loss": 0.1959, "rewards/accuracies": 0.46875, "rewards/chosen": 0.007704668678343296, "rewards/margins": 0.020786713808774948, "rewards/rejected": -0.013082042336463928, "step": 460 }, { "epoch": 0.5, "learning_rate": 2.9218815852625717e-06, "logits/chosen": -0.13268591463565826, "logits/rejected": -0.039528947323560715, "logps/chosen": -506.5321350097656, "logps/rejected": -551.9951782226562, "loss": 0.1954, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.0014046819414943457, "rewards/margins": 0.021756207570433617, "rewards/rejected": -0.020351527258753777, "step": 470 }, { "epoch": 0.51, "learning_rate": 2.829779035208113e-06, "logits/chosen": -0.022976398468017578, "logits/rejected": -0.12432871758937836, "logps/chosen": -508.6607360839844, "logps/rejected": -550.5410766601562, "loss": 0.1858, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.002696055918931961, "rewards/margins": 0.03165871649980545, "rewards/rejected": -0.03435477241873741, "step": 480 }, { "epoch": 0.52, "learning_rate": 2.737218535878705e-06, "logits/chosen": -0.011603012681007385, "logits/rejected": -0.056485723704099655, "logps/chosen": -468.14764404296875, "logps/rejected": -571.5040283203125, "loss": 0.1853, "rewards/accuracies": 0.5625, "rewards/chosen": 0.005876007489860058, "rewards/margins": 0.05560145527124405, "rewards/rejected": -0.049725450575351715, "step": 490 }, { "epoch": 0.53, "learning_rate": 2.64432862186579e-06, "logits/chosen": -0.09491895884275436, "logits/rejected": -0.0038899630308151245, "logps/chosen": -498.3922424316406, "logps/rejected": -521.9002075195312, "loss": 0.1883, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0021348330192267895, "rewards/margins": 0.0486973337829113, "rewards/rejected": -0.050832170993089676, "step": 500 }, { "epoch": 0.54, "learning_rate": 2.551238285204126e-06, "logits/chosen": -0.15483462810516357, "logits/rejected": 0.049260228872299194, "logps/chosen": -476.8915100097656, "logps/rejected": -518.4007568359375, "loss": 0.187, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.006897159852087498, "rewards/margins": 0.04596192017197609, "rewards/rejected": -0.05285907909274101, "step": 510 }, { "epoch": 0.55, "learning_rate": 2.4580767962463688e-06, "logits/chosen": -0.11445822566747665, "logits/rejected": -0.12616074085235596, "logps/chosen": -445.810302734375, "logps/rejected": -483.44439697265625, "loss": 0.1997, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.011665165424346924, "rewards/margins": 0.030944203957915306, "rewards/rejected": -0.04260937124490738, "step": 520 }, { "epoch": 0.57, "learning_rate": 2.3649735241511546e-06, "logits/chosen": -0.08440948277711868, "logits/rejected": -0.15135635435581207, "logps/chosen": -458.70513916015625, "logps/rejected": -531.6619262695312, "loss": 0.1857, "rewards/accuracies": 0.5, "rewards/chosen": -0.007908688858151436, "rewards/margins": 0.04392402246594429, "rewards/rejected": -0.051832713186740875, "step": 530 }, { "epoch": 0.58, "learning_rate": 2.2720577572339914e-06, "logits/chosen": -0.11203843355178833, "logits/rejected": -0.14960184693336487, "logps/chosen": -450.1597595214844, "logps/rejected": -564.0913696289062, "loss": 0.1827, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.019688036292791367, "rewards/margins": 0.04803295060992241, "rewards/rejected": -0.06772098690271378, "step": 540 }, { "epoch": 0.59, "learning_rate": 2.1794585234303995e-06, "logits/chosen": -0.1444821059703827, "logits/rejected": -0.093144990503788, "logps/chosen": -526.9356689453125, "logps/rejected": -576.2337646484375, "loss": 0.199, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0206059031188488, "rewards/margins": 0.031863369047641754, "rewards/rejected": -0.05246926471590996, "step": 550 }, { "epoch": 0.6, "learning_rate": 2.0873044111206407e-06, "logits/chosen": -0.02479735016822815, "logits/rejected": -0.17415586113929749, "logps/chosen": -460.03057861328125, "logps/rejected": -571.5958251953125, "loss": 0.19, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.021069908514618874, "rewards/margins": 0.052955545485019684, "rewards/rejected": -0.07402545213699341, "step": 560 }, { "epoch": 0.61, "learning_rate": 1.9957233905648293e-06, "logits/chosen": -0.1578993797302246, "logits/rejected": -0.2234770804643631, "logps/chosen": -494.98492431640625, "logps/rejected": -540.7052001953125, "loss": 0.1871, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.012826258316636086, "rewards/margins": 0.03663797304034233, "rewards/rejected": -0.049464233219623566, "step": 570 }, { "epoch": 0.62, "learning_rate": 1.904842636196402e-06, "logits/chosen": -0.13111546635627747, "logits/rejected": -0.05531447380781174, "logps/chosen": -508.1211853027344, "logps/rejected": -585.5679931640625, "loss": 0.1889, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.0070688240230083466, "rewards/margins": 0.04199820011854172, "rewards/rejected": -0.04906702786684036, "step": 580 }, { "epoch": 0.63, "learning_rate": 1.814788350020726e-06, "logits/chosen": -0.05076460912823677, "logits/rejected": -0.09190233051776886, "logps/chosen": -486.60882568359375, "logps/rejected": -594.5232543945312, "loss": 0.1931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008244060911238194, "rewards/margins": 0.04668049514293671, "rewards/rejected": -0.054924555122852325, "step": 590 }, { "epoch": 0.64, "learning_rate": 1.725685586364051e-06, "logits/chosen": -0.09060011804103851, "logits/rejected": -0.03806861490011215, "logps/chosen": -504.2727966308594, "logps/rejected": -519.7486572265625, "loss": 0.1944, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.011549371294677258, "rewards/margins": 0.027325350791215897, "rewards/rejected": -0.03887472301721573, "step": 600 }, { "epoch": 0.65, "learning_rate": 1.6376580782162172e-06, "logits/chosen": -0.11662360280752182, "logits/rejected": -0.13336268067359924, "logps/chosen": -536.3695678710938, "logps/rejected": -606.1983032226562, "loss": 0.1908, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.020137406885623932, "rewards/margins": 0.03434290364384651, "rewards/rejected": -0.054480306804180145, "step": 610 }, { "epoch": 0.66, "learning_rate": 1.550828065408227e-06, "logits/chosen": -0.14947877824306488, "logits/rejected": -0.03148087114095688, "logps/chosen": -485.51934814453125, "logps/rejected": -529.2047119140625, "loss": 0.1913, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.014703591354191303, "rewards/margins": 0.03080858290195465, "rewards/rejected": -0.04551216959953308, "step": 620 }, { "epoch": 0.67, "learning_rate": 1.4653161248633053e-06, "logits/chosen": -0.10968085378408432, "logits/rejected": -0.01607862487435341, "logps/chosen": -470.38006591796875, "logps/rejected": -511.1072692871094, "loss": 0.1858, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.014205960556864738, "rewards/margins": 0.03790050745010376, "rewards/rejected": -0.05210646986961365, "step": 630 }, { "epoch": 0.68, "learning_rate": 1.381241003157162e-06, "logits/chosen": -0.10932781547307968, "logits/rejected": -0.12227501720190048, "logps/chosen": -479.05859375, "logps/rejected": -543.1046752929688, "loss": 0.1972, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.024690503254532814, "rewards/margins": 0.03049684688448906, "rewards/rejected": -0.05518735572695732, "step": 640 }, { "epoch": 0.69, "learning_rate": 1.298719451619979e-06, "logits/chosen": -0.04668913036584854, "logits/rejected": -0.13933394849300385, "logps/chosen": -487.51971435546875, "logps/rejected": -560.5028076171875, "loss": 0.1887, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.009937921538949013, "rewards/margins": 0.03267529979348183, "rewards/rejected": -0.04261321574449539, "step": 650 }, { "epoch": 0.7, "learning_rate": 1.2178660642091036e-06, "logits/chosen": -0.09352131187915802, "logits/rejected": -0.10257798433303833, "logps/chosen": -509.4551696777344, "logps/rejected": -567.7073364257812, "loss": 0.1825, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.012187512591481209, "rewards/margins": 0.05223285034298897, "rewards/rejected": -0.06442036479711533, "step": 660 }, { "epoch": 0.71, "learning_rate": 1.1387931183775821e-06, "logits/chosen": -0.07275299727916718, "logits/rejected": -0.16395212709903717, "logps/chosen": -499.53277587890625, "logps/rejected": -567.4371337890625, "loss": 0.1844, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.008052601478993893, "rewards/margins": 0.05070211738348007, "rewards/rejected": -0.05875472351908684, "step": 670 }, { "epoch": 0.73, "learning_rate": 1.061610419159532e-06, "logits/chosen": -0.03610026463866234, "logits/rejected": -0.07686237245798111, "logps/chosen": -508.24285888671875, "logps/rejected": -545.7764892578125, "loss": 0.1885, "rewards/accuracies": 0.5625, "rewards/chosen": -0.011298848316073418, "rewards/margins": 0.04110453277826309, "rewards/rejected": -0.05240337923169136, "step": 680 }, { "epoch": 0.74, "learning_rate": 9.864251466888364e-07, "logits/chosen": -0.14515052735805511, "logits/rejected": -0.05630829930305481, "logps/chosen": -514.5619506835938, "logps/rejected": -557.375732421875, "loss": 0.1796, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.015279670245945454, "rewards/margins": 0.037416163831949234, "rewards/rejected": -0.052695829421281815, "step": 690 }, { "epoch": 0.75, "learning_rate": 9.133417073629288e-07, "logits/chosen": -0.13727428019046783, "logits/rejected": -0.10859493911266327, "logps/chosen": -461.16961669921875, "logps/rejected": -528.3992309570312, "loss": 0.189, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.01616724021732807, "rewards/margins": 0.049376003444194794, "rewards/rejected": -0.06554324924945831, "step": 700 }, { "epoch": 0.76, "learning_rate": 8.424615888583332e-07, "logits/chosen": -0.13368161022663116, "logits/rejected": -0.07306365668773651, "logps/chosen": -490.7369689941406, "logps/rejected": -558.0841674804688, "loss": 0.1911, "rewards/accuracies": 0.53125, "rewards/chosen": -0.014125635847449303, "rewards/margins": 0.04241669178009033, "rewards/rejected": -0.056542325764894485, "step": 710 }, { "epoch": 0.77, "learning_rate": 7.738832191993092e-07, "logits/chosen": -0.10794935375452042, "logits/rejected": -0.10061631351709366, "logps/chosen": -505.11273193359375, "logps/rejected": -576.4747314453125, "loss": 0.1859, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.016501661390066147, "rewards/margins": 0.03631531447172165, "rewards/rejected": -0.052816975861787796, "step": 720 }, { "epoch": 0.78, "learning_rate": 7.077018300752917e-07, "logits/chosen": -0.19430354237556458, "logits/rejected": -0.09076061099767685, "logps/chosen": -483.529052734375, "logps/rejected": -520.256103515625, "loss": 0.1955, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.01530246902257204, "rewards/margins": 0.03641772270202637, "rewards/rejected": -0.05172019079327583, "step": 730 }, { "epoch": 0.79, "learning_rate": 6.440093245969342e-07, "logits/chosen": -0.2117353230714798, "logits/rejected": -0.00878197606652975, "logps/chosen": -498.167724609375, "logps/rejected": -512.6229248046875, "loss": 0.1946, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.028363991528749466, "rewards/margins": 0.02800668217241764, "rewards/rejected": -0.05637066811323166, "step": 740 }, { "epoch": 0.8, "learning_rate": 5.828941496744075e-07, "logits/chosen": -0.1706351935863495, "logits/rejected": -0.09497860074043274, "logps/chosen": -449.25640869140625, "logps/rejected": -503.36346435546875, "loss": 0.1914, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.021301377564668655, "rewards/margins": 0.03773494437336922, "rewards/rejected": -0.05903632566332817, "step": 750 }, { "epoch": 0.81, "learning_rate": 5.244411731951671e-07, "logits/chosen": -0.16339917480945587, "logits/rejected": -0.09793440252542496, "logps/chosen": -485.3291931152344, "logps/rejected": -541.0707397460938, "loss": 0.1932, "rewards/accuracies": 0.5, "rewards/chosen": -0.015980413183569908, "rewards/margins": 0.036792390048503876, "rewards/rejected": -0.052772797644138336, "step": 760 }, { "epoch": 0.82, "learning_rate": 4.6873156617173594e-07, "logits/chosen": -0.16512815654277802, "logits/rejected": -0.04607289656996727, "logps/chosen": -482.732421875, "logps/rejected": -537.2395629882812, "loss": 0.189, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.017042802646756172, "rewards/margins": 0.030301541090011597, "rewards/rejected": -0.04734434187412262, "step": 770 }, { "epoch": 0.83, "learning_rate": 4.1584269002318653e-07, "logits/chosen": -0.06346157193183899, "logits/rejected": -0.17497694492340088, "logps/chosen": -448.33721923828125, "logps/rejected": -536.0673217773438, "loss": 0.1874, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.021191086620092392, "rewards/margins": 0.04755834862589836, "rewards/rejected": -0.06874943524599075, "step": 780 }, { "epoch": 0.84, "learning_rate": 3.658479891468258e-07, "logits/chosen": -0.17253567278385162, "logits/rejected": -0.11398845911026001, "logps/chosen": -448.2945861816406, "logps/rejected": -549.4234008789062, "loss": 0.1922, "rewards/accuracies": 0.5, "rewards/chosen": -0.014737410470843315, "rewards/margins": 0.061607301235198975, "rewards/rejected": -0.07634472101926804, "step": 790 }, { "epoch": 0.85, "learning_rate": 3.18816888929272e-07, "logits/chosen": -0.12382335960865021, "logits/rejected": -0.1283954679965973, "logps/chosen": -486.486572265625, "logps/rejected": -524.1173706054688, "loss": 0.1883, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.01756083406507969, "rewards/margins": 0.035057444125413895, "rewards/rejected": -0.052618276327848434, "step": 800 }, { "epoch": 0.86, "learning_rate": 2.748146993385484e-07, "logits/chosen": -0.09039425849914551, "logits/rejected": -0.14174523949623108, "logps/chosen": -496.9384765625, "logps/rejected": -547.9468994140625, "loss": 0.1865, "rewards/accuracies": 0.53125, "rewards/chosen": -0.02103576436638832, "rewards/margins": 0.04283936321735382, "rewards/rejected": -0.06387512385845184, "step": 810 }, { "epoch": 0.87, "learning_rate": 2.3390252423108077e-07, "logits/chosen": -0.10462252050638199, "logits/rejected": -0.11660999059677124, "logps/chosen": -477.86163330078125, "logps/rejected": -558.4508056640625, "loss": 0.1865, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.01919230818748474, "rewards/margins": 0.046876341104507446, "rewards/rejected": -0.06606864929199219, "step": 820 }, { "epoch": 0.89, "learning_rate": 1.961371764995243e-07, "logits/chosen": -0.10605908930301666, "logits/rejected": -0.1372717022895813, "logps/chosen": -453.75067138671875, "logps/rejected": -494.4928283691406, "loss": 0.1788, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.013546602800488472, "rewards/margins": 0.03696237877011299, "rewards/rejected": -0.05050898343324661, "step": 830 }, { "epoch": 0.9, "learning_rate": 1.61571099179261e-07, "logits/chosen": -0.12508642673492432, "logits/rejected": -0.13528813421726227, "logps/chosen": -527.0031127929688, "logps/rejected": -544.302001953125, "loss": 0.1908, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.02414495311677456, "rewards/margins": 0.02833731472492218, "rewards/rejected": -0.05248226970434189, "step": 840 }, { "epoch": 0.91, "learning_rate": 1.3025229262312367e-07, "logits/chosen": -0.11270849406719208, "logits/rejected": -0.13684390485286713, "logps/chosen": -481.3531799316406, "logps/rejected": -518.9368896484375, "loss": 0.1929, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.02105192095041275, "rewards/margins": 0.031041234731674194, "rewards/rejected": -0.05209314823150635, "step": 850 }, { "epoch": 0.92, "learning_rate": 1.0222424784546853e-07, "logits/chosen": -0.1346197873353958, "logits/rejected": -0.07176433503627777, "logps/chosen": -523.1410522460938, "logps/rejected": -589.2576904296875, "loss": 0.1997, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.025201961398124695, "rewards/margins": 0.04333069920539856, "rewards/rejected": -0.06853266805410385, "step": 860 }, { "epoch": 0.93, "learning_rate": 7.752588612816553e-08, "logits/chosen": -0.10653670132160187, "logits/rejected": -0.09153258800506592, "logps/chosen": -498.6484375, "logps/rejected": -578.7772216796875, "loss": 0.19, "rewards/accuracies": 0.5, "rewards/chosen": -0.022535882890224457, "rewards/margins": 0.05137655884027481, "rewards/rejected": -0.07391244173049927, "step": 870 }, { "epoch": 0.94, "learning_rate": 5.619150497236991e-08, "logits/chosen": -0.06052190810441971, "logits/rejected": -0.1323552131652832, "logps/chosen": -515.4520263671875, "logps/rejected": -532.12353515625, "loss": 0.184, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.018582861870527267, "rewards/margins": 0.03698521479964256, "rewards/rejected": -0.05556807667016983, "step": 880 }, { "epoch": 0.95, "learning_rate": 3.825073047112743e-08, "logits/chosen": -0.0825682058930397, "logits/rejected": -0.08825576305389404, "logps/chosen": -547.6183471679688, "logps/rejected": -580.6526489257812, "loss": 0.2039, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.02423805184662342, "rewards/margins": 0.04648369550704956, "rewards/rejected": -0.07072174549102783, "step": 890 }, { "epoch": 0.96, "learning_rate": 2.372847616895685e-08, "logits/chosen": -0.24079068005084991, "logits/rejected": -0.1403379738330841, "logps/chosen": -467.0528869628906, "logps/rejected": -511.5205078125, "loss": 0.1853, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.023756807669997215, "rewards/margins": 0.03743208199739456, "rewards/rejected": -0.061188895255327225, "step": 900 }, { "epoch": 0.97, "learning_rate": 1.264490846553279e-08, "logits/chosen": -0.15193654596805573, "logits/rejected": -0.08774205297231674, "logps/chosen": -493.47113037109375, "logps/rejected": -540.6048583984375, "loss": 0.1914, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.019282512366771698, "rewards/margins": 0.04396004602313042, "rewards/rejected": -0.06324255466461182, "step": 910 }, { "epoch": 0.98, "learning_rate": 5.015418611516165e-09, "logits/chosen": -0.05882133170962334, "logits/rejected": -0.18861576914787292, "logps/chosen": -478.2850646972656, "logps/rejected": -563.0004272460938, "loss": 0.1882, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.012600253336131573, "rewards/margins": 0.05376917123794556, "rewards/rejected": -0.0663694217801094, "step": 920 }, { "epoch": 0.99, "learning_rate": 8.506013354186993e-10, "logits/chosen": -0.08989032357931137, "logits/rejected": -0.17091888189315796, "logps/chosen": -487.1897888183594, "logps/rejected": -533.8787231445312, "loss": 0.1867, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.01984177902340889, "rewards/margins": 0.038224250078201294, "rewards/rejected": -0.058066029101610184, "step": 930 }, { "epoch": 1.0, "step": 937, "total_flos": 0.0, "train_loss": 0.1943182722290654, "train_runtime": 7996.3001, "train_samples_per_second": 3.752, "train_steps_per_second": 0.117 } ], "logging_steps": 10, "max_steps": 937, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }