{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1875, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.6595744680851065e-08, "logits/chosen": -1.755455493927002, "logits/rejected": -1.2333192825317383, "logps/chosen": -764.8233032226562, "logps/rejected": -1520.939208984375, "loss": 0.3704, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 2.6595744680851066e-07, "logits/chosen": -1.6312116384506226, "logits/rejected": -0.958756148815155, "logps/chosen": -542.477294921875, "logps/rejected": -1220.0614013671875, "loss": 0.3744, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.00022326533508021384, "rewards/margins": 0.00027514772955328226, "rewards/rejected": -5.1882434490835294e-05, "step": 10 }, { "epoch": 0.01, "learning_rate": 5.319148936170213e-07, "logits/chosen": -1.584674596786499, "logits/rejected": -0.943401038646698, "logps/chosen": -586.7803344726562, "logps/rejected": -1201.489501953125, "loss": 0.3553, "rewards/accuracies": 0.625, "rewards/chosen": 0.0002584285684861243, "rewards/margins": 0.00131960678845644, "rewards/rejected": -0.0010611782781779766, "step": 20 }, { "epoch": 0.02, "learning_rate": 7.97872340425532e-07, "logits/chosen": -1.378525972366333, "logits/rejected": -0.8355947732925415, "logps/chosen": -618.3253173828125, "logps/rejected": -1216.331298828125, "loss": 0.3435, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.003100637812167406, "rewards/margins": 0.004315788391977549, "rewards/rejected": -0.0012151505798101425, "step": 30 }, { "epoch": 0.02, "learning_rate": 1.0638297872340427e-06, "logits/chosen": -1.4447519779205322, "logits/rejected": -0.8539841771125793, "logps/chosen": -536.504638671875, "logps/rejected": -1172.6549072265625, "loss": 0.332, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.00205782288685441, "rewards/margins": 0.010990055277943611, "rewards/rejected": -0.008932231925427914, "step": 40 }, { "epoch": 0.03, "learning_rate": 1.3297872340425533e-06, "logits/chosen": -1.7163407802581787, "logits/rejected": -1.1005187034606934, "logps/chosen": -618.8663940429688, "logps/rejected": -1187.5242919921875, "loss": 0.3116, "rewards/accuracies": 0.625, "rewards/chosen": -0.00023196739493869245, "rewards/margins": 0.028941016644239426, "rewards/rejected": -0.029172983020544052, "step": 50 }, { "epoch": 0.03, "learning_rate": 1.595744680851064e-06, "logits/chosen": -1.7239458560943604, "logits/rejected": -0.5242995023727417, "logps/chosen": -738.6119384765625, "logps/rejected": -1377.3330078125, "loss": 0.2459, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.06412681937217712, "rewards/margins": 0.0683993324637413, "rewards/rejected": -0.13252617418766022, "step": 60 }, { "epoch": 0.04, "learning_rate": 1.8617021276595745e-06, "logits/chosen": -1.3319450616836548, "logits/rejected": -0.8398796916007996, "logps/chosen": -707.7894897460938, "logps/rejected": -1520.5997314453125, "loss": 0.198, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.16563495993614197, "rewards/margins": 0.17894069850444794, "rewards/rejected": -0.3445756733417511, "step": 70 }, { "epoch": 0.04, "learning_rate": 2.1276595744680853e-06, "logits/chosen": -1.4981276988983154, "logits/rejected": -0.41109055280685425, "logps/chosen": -937.0779418945312, "logps/rejected": -2068.221923828125, "loss": 0.2151, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.28319233655929565, "rewards/margins": 0.40604227781295776, "rewards/rejected": -0.6892346143722534, "step": 80 }, { "epoch": 0.05, "learning_rate": 2.393617021276596e-06, "logits/chosen": -1.4437072277069092, "logits/rejected": -0.4041782021522522, "logps/chosen": -608.6187744140625, "logps/rejected": -1449.1143798828125, "loss": 0.2497, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05350279062986374, "rewards/margins": 0.12850181758403778, "rewards/rejected": -0.18200460076332092, "step": 90 }, { "epoch": 0.05, "learning_rate": 2.6595744680851065e-06, "logits/chosen": -1.4004117250442505, "logits/rejected": -0.3795866072177887, "logps/chosen": -688.6510009765625, "logps/rejected": -1484.7867431640625, "loss": 0.2483, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07589573413133621, "rewards/margins": 0.10643702745437622, "rewards/rejected": -0.18233272433280945, "step": 100 }, { "epoch": 0.06, "learning_rate": 2.9255319148936174e-06, "logits/chosen": -1.4726046323776245, "logits/rejected": -0.5685114860534668, "logps/chosen": -926.9375, "logps/rejected": -1812.751220703125, "loss": 0.2093, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.21614733338356018, "rewards/margins": 0.26327216625213623, "rewards/rejected": -0.47941941022872925, "step": 110 }, { "epoch": 0.06, "learning_rate": 3.191489361702128e-06, "logits/chosen": -1.2898015975952148, "logits/rejected": -0.20202858746051788, "logps/chosen": -857.4860229492188, "logps/rejected": -1566.881591796875, "loss": 0.2628, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2838597893714905, "rewards/margins": 0.172959566116333, "rewards/rejected": -0.4568193852901459, "step": 120 }, { "epoch": 0.07, "learning_rate": 3.457446808510639e-06, "logits/chosen": -1.3372188806533813, "logits/rejected": -0.5857396125793457, "logps/chosen": -738.6064453125, "logps/rejected": -1646.12890625, "loss": 0.21, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1405295431613922, "rewards/margins": 0.23650164902210236, "rewards/rejected": -0.377031147480011, "step": 130 }, { "epoch": 0.07, "learning_rate": 3.723404255319149e-06, "logits/chosen": -1.397938847541809, "logits/rejected": -0.5400180220603943, "logps/chosen": -723.1727294921875, "logps/rejected": -1678.752197265625, "loss": 0.2137, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.15040160715579987, "rewards/margins": 0.28295016288757324, "rewards/rejected": -0.4333517551422119, "step": 140 }, { "epoch": 0.08, "learning_rate": 3.98936170212766e-06, "logits/chosen": -1.2215166091918945, "logits/rejected": -0.5187689065933228, "logps/chosen": -913.1868286132812, "logps/rejected": -2170.439697265625, "loss": 0.1472, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.33379653096199036, "rewards/margins": 0.4125414788722992, "rewards/rejected": -0.7463380098342896, "step": 150 }, { "epoch": 0.09, "learning_rate": 4.255319148936171e-06, "logits/chosen": -1.2553513050079346, "logits/rejected": -0.6222286224365234, "logps/chosen": -644.9315795898438, "logps/rejected": -1344.428466796875, "loss": 0.2055, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.16092197597026825, "rewards/margins": 0.18404754996299744, "rewards/rejected": -0.34496957063674927, "step": 160 }, { "epoch": 0.09, "learning_rate": 4.521276595744681e-06, "logits/chosen": -1.1327978372573853, "logits/rejected": -0.43405628204345703, "logps/chosen": -704.5630493164062, "logps/rejected": -1811.0263671875, "loss": 0.1446, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.1263386309146881, "rewards/margins": 0.3248198926448822, "rewards/rejected": -0.4511585235595703, "step": 170 }, { "epoch": 0.1, "learning_rate": 4.787234042553192e-06, "logits/chosen": -1.479241967201233, "logits/rejected": -0.397353857755661, "logps/chosen": -708.5093994140625, "logps/rejected": -1561.05078125, "loss": 0.1997, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.15144303441047668, "rewards/margins": 0.2386520355939865, "rewards/rejected": -0.3900950849056244, "step": 180 }, { "epoch": 0.1, "learning_rate": 4.999982660399688e-06, "logits/chosen": -1.1840440034866333, "logits/rejected": -0.4053446650505066, "logps/chosen": -822.2158203125, "logps/rejected": -1771.717529296875, "loss": 0.2122, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2102956473827362, "rewards/margins": 0.2532525956630707, "rewards/rejected": -0.4635482430458069, "step": 190 }, { "epoch": 0.11, "learning_rate": 4.99937579964398e-06, "logits/chosen": -1.3526380062103271, "logits/rejected": -0.5665386915206909, "logps/chosen": -773.0138549804688, "logps/rejected": -1730.4918212890625, "loss": 0.2076, "rewards/accuracies": 0.75, "rewards/chosen": -0.1651298850774765, "rewards/margins": 0.24976544082164764, "rewards/rejected": -0.41489535570144653, "step": 200 }, { "epoch": 0.11, "learning_rate": 4.9979021993870645e-06, "logits/chosen": -1.4962358474731445, "logits/rejected": -0.5578527450561523, "logps/chosen": -795.4863891601562, "logps/rejected": -1807.668212890625, "loss": 0.1634, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.15764664113521576, "rewards/margins": 0.2677456736564636, "rewards/rejected": -0.4253923296928406, "step": 210 }, { "epoch": 0.12, "learning_rate": 4.995562370647553e-06, "logits/chosen": -0.9666210412979126, "logits/rejected": -0.4786294102668762, "logps/chosen": -786.662353515625, "logps/rejected": -1778.3988037109375, "loss": 0.2532, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.24228081107139587, "rewards/margins": 0.3355843126773834, "rewards/rejected": -0.5778651833534241, "step": 220 }, { "epoch": 0.12, "learning_rate": 4.992357124836838e-06, "logits/chosen": -1.409147024154663, "logits/rejected": -0.689926028251648, "logps/chosen": -877.3094482421875, "logps/rejected": -1852.2314453125, "loss": 0.1728, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.2839181125164032, "rewards/margins": 0.23368731141090393, "rewards/rejected": -0.5176054239273071, "step": 230 }, { "epoch": 0.13, "learning_rate": 4.9882875734777044e-06, "logits/chosen": -1.1925780773162842, "logits/rejected": 0.03326777368783951, "logps/chosen": -831.4713134765625, "logps/rejected": -1737.231689453125, "loss": 0.2304, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.27825552225112915, "rewards/margins": 0.28924351930618286, "rewards/rejected": -0.567499041557312, "step": 240 }, { "epoch": 0.13, "learning_rate": 4.983355127818882e-06, "logits/chosen": -1.4935053586959839, "logits/rejected": -0.0941208004951477, "logps/chosen": -759.3983154296875, "logps/rejected": -1830.8707275390625, "loss": 0.2091, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.13234801590442657, "rewards/margins": 0.29578104615211487, "rewards/rejected": -0.428129106760025, "step": 250 }, { "epoch": 0.14, "learning_rate": 4.977561498345639e-06, "logits/chosen": -1.4366835355758667, "logits/rejected": -0.848114013671875, "logps/chosen": -756.9669799804688, "logps/rejected": -1844.3541259765625, "loss": 0.1589, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2339516133069992, "rewards/margins": 0.3356839716434479, "rewards/rejected": -0.5696356296539307, "step": 260 }, { "epoch": 0.14, "learning_rate": 4.970908694186624e-06, "logits/chosen": -1.433019995689392, "logits/rejected": -0.6539562940597534, "logps/chosen": -852.9888916015625, "logps/rejected": -1876.3004150390625, "loss": 0.1709, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2885374426841736, "rewards/margins": 0.3317956030368805, "rewards/rejected": -0.6203330755233765, "step": 270 }, { "epoch": 0.15, "learning_rate": 4.9633990224171305e-06, "logits/chosen": -1.1937869787216187, "logits/rejected": -0.09999962896108627, "logps/chosen": -1032.603515625, "logps/rejected": -2027.5093994140625, "loss": 0.2166, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.36031240224838257, "rewards/margins": 0.3707783818244934, "rewards/rejected": -0.7310907244682312, "step": 280 }, { "epoch": 0.15, "learning_rate": 4.955035087259046e-06, "logits/chosen": -1.2412487268447876, "logits/rejected": -0.664631724357605, "logps/chosen": -734.98681640625, "logps/rejected": -1704.20703125, "loss": 0.1807, "rewards/accuracies": 0.875, "rewards/chosen": -0.14173075556755066, "rewards/margins": 0.25538477301597595, "rewards/rejected": -0.3971155285835266, "step": 290 }, { "epoch": 0.16, "learning_rate": 4.945819789177756e-06, "logits/chosen": -1.1638051271438599, "logits/rejected": 0.49103036522865295, "logps/chosen": -755.9884643554688, "logps/rejected": -1692.7734375, "loss": 0.1976, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16941963136196136, "rewards/margins": 0.23666927218437195, "rewards/rejected": -0.4060889184474945, "step": 300 }, { "epoch": 0.17, "learning_rate": 4.935756323876306e-06, "logits/chosen": -1.1552793979644775, "logits/rejected": 0.35435959696769714, "logps/chosen": -895.009765625, "logps/rejected": -1757.65234375, "loss": 0.1955, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2875288426876068, "rewards/margins": 0.24231036007404327, "rewards/rejected": -0.5298392176628113, "step": 310 }, { "epoch": 0.17, "learning_rate": 4.924848181187199e-06, "logits/chosen": -1.3235706090927124, "logits/rejected": -0.7385076284408569, "logps/chosen": -643.3860473632812, "logps/rejected": -1514.3936767578125, "loss": 0.2013, "rewards/accuracies": 0.75, "rewards/chosen": -0.1353076845407486, "rewards/margins": 0.21510222554206848, "rewards/rejected": -0.3504098951816559, "step": 320 }, { "epoch": 0.18, "learning_rate": 4.913099143862173e-06, "logits/chosen": -1.42311429977417, "logits/rejected": -0.19915446639060974, "logps/chosen": -806.025390625, "logps/rejected": -1748.1644287109375, "loss": 0.1834, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16057676076889038, "rewards/margins": 0.29953283071517944, "rewards/rejected": -0.4601096212863922, "step": 330 }, { "epoch": 0.18, "learning_rate": 4.900513286260416e-06, "logits/chosen": -1.4523063898086548, "logits/rejected": -0.7241362929344177, "logps/chosen": -726.6815185546875, "logps/rejected": -1756.1654052734375, "loss": 0.1909, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1610650271177292, "rewards/margins": 0.32662302255630493, "rewards/rejected": -0.48768800497055054, "step": 340 }, { "epoch": 0.19, "learning_rate": 4.887094972935645e-06, "logits/chosen": -1.505131721496582, "logits/rejected": -0.7010393738746643, "logps/chosen": -756.3118896484375, "logps/rejected": -1657.638427734375, "loss": 0.1529, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.19148796796798706, "rewards/margins": 0.26837441325187683, "rewards/rejected": -0.4598624110221863, "step": 350 }, { "epoch": 0.19, "learning_rate": 4.87284885712256e-06, "logits/chosen": -1.400520920753479, "logits/rejected": -0.24144461750984192, "logps/chosen": -773.6839599609375, "logps/rejected": -1939.297119140625, "loss": 0.1325, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.20408935844898224, "rewards/margins": 0.379780113697052, "rewards/rejected": -0.583869457244873, "step": 360 }, { "epoch": 0.2, "learning_rate": 4.857779879123181e-06, "logits/chosen": -1.3075954914093018, "logits/rejected": -0.6242688894271851, "logps/chosen": -830.1097412109375, "logps/rejected": -1820.2388916015625, "loss": 0.214, "rewards/accuracies": 0.75, "rewards/chosen": -0.3089172840118408, "rewards/margins": 0.2943463921546936, "rewards/rejected": -0.6032636761665344, "step": 370 }, { "epoch": 0.2, "learning_rate": 4.841893264593643e-06, "logits/chosen": -1.3646913766860962, "logits/rejected": -0.4420408606529236, "logps/chosen": -739.6143188476562, "logps/rejected": -1830.6683349609375, "loss": 0.1772, "rewards/accuracies": 0.875, "rewards/chosen": -0.1528651863336563, "rewards/margins": 0.3289165496826172, "rewards/rejected": -0.4817817211151123, "step": 380 }, { "epoch": 0.21, "learning_rate": 4.825194522732023e-06, "logits/chosen": -1.394213318824768, "logits/rejected": -0.41561856865882874, "logps/chosen": -848.2108154296875, "logps/rejected": -1736.9312744140625, "loss": 0.1866, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.17288437485694885, "rewards/margins": 0.2597029209136963, "rewards/rejected": -0.43258723616600037, "step": 390 }, { "epoch": 0.21, "learning_rate": 4.807689444367853e-06, "logits/chosen": -1.4238224029541016, "logits/rejected": -0.2904614210128784, "logps/chosen": -766.7713623046875, "logps/rejected": -1548.4310302734375, "loss": 0.1551, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.18976250290870667, "rewards/margins": 0.24860653281211853, "rewards/rejected": -0.4383690357208252, "step": 400 }, { "epoch": 0.22, "learning_rate": 4.78938409995396e-06, "logits/chosen": -1.5596516132354736, "logits/rejected": -0.19419638812541962, "logps/chosen": -894.0138549804688, "logps/rejected": -1879.2919921875, "loss": 0.1543, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.21909482777118683, "rewards/margins": 0.3173079490661621, "rewards/rejected": -0.5364028215408325, "step": 410 }, { "epoch": 0.22, "learning_rate": 4.770284837461342e-06, "logits/chosen": -1.468480110168457, "logits/rejected": -0.7513076663017273, "logps/chosen": -791.5577392578125, "logps/rejected": -1696.2337646484375, "loss": 0.1794, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.16900746524333954, "rewards/margins": 0.270784854888916, "rewards/rejected": -0.43979233503341675, "step": 420 }, { "epoch": 0.23, "learning_rate": 4.7503982801778015e-06, "logits/chosen": -1.4179691076278687, "logits/rejected": -0.012906777672469616, "logps/chosen": -816.0245361328125, "logps/rejected": -1743.844482421875, "loss": 0.1472, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1716371476650238, "rewards/margins": 0.29810577630996704, "rewards/rejected": -0.46974295377731323, "step": 430 }, { "epoch": 0.23, "learning_rate": 4.729731324411104e-06, "logits/chosen": -1.447142481803894, "logits/rejected": -0.6321523189544678, "logps/chosen": -683.079833984375, "logps/rejected": -1682.686767578125, "loss": 0.1643, "rewards/accuracies": 0.75, "rewards/chosen": -0.1491968333721161, "rewards/margins": 0.3515523076057434, "rewards/rejected": -0.5007491111755371, "step": 440 }, { "epoch": 0.24, "learning_rate": 4.7082911370974645e-06, "logits/chosen": -0.9937525987625122, "logits/rejected": -0.6763631105422974, "logps/chosen": -770.6049194335938, "logps/rejected": -1687.3837890625, "loss": 0.1541, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.17300094664096832, "rewards/margins": 0.26958489418029785, "rewards/rejected": -0.44258585572242737, "step": 450 }, { "epoch": 0.25, "learning_rate": 4.68608515331618e-06, "logits/chosen": -1.2351852655410767, "logits/rejected": -0.5260038375854492, "logps/chosen": -727.1993408203125, "logps/rejected": -1712.5172119140625, "loss": 0.1596, "rewards/accuracies": 0.875, "rewards/chosen": -0.10707534849643707, "rewards/margins": 0.32161325216293335, "rewards/rejected": -0.4286886155605316, "step": 460 }, { "epoch": 0.25, "learning_rate": 4.663121073711269e-06, "logits/chosen": -1.4281326532363892, "logits/rejected": -0.11814825236797333, "logps/chosen": -732.6151123046875, "logps/rejected": -1736.462158203125, "loss": 0.1742, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.10931523144245148, "rewards/margins": 0.2831416428089142, "rewards/rejected": -0.39245691895484924, "step": 470 }, { "epoch": 0.26, "learning_rate": 4.63940686182103e-06, "logits/chosen": -1.366811990737915, "logits/rejected": -0.15237736701965332, "logps/chosen": -920.6950073242188, "logps/rejected": -1818.9697265625, "loss": 0.1964, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.18746745586395264, "rewards/margins": 0.3069246709346771, "rewards/rejected": -0.4943920969963074, "step": 480 }, { "epoch": 0.26, "learning_rate": 4.614950741316425e-06, "logits/chosen": -1.2442119121551514, "logits/rejected": -0.27264174818992615, "logps/chosen": -577.3939208984375, "logps/rejected": -1474.958984375, "loss": 0.1644, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.06222609430551529, "rewards/margins": 0.20860283076763153, "rewards/rejected": -0.27082890272140503, "step": 490 }, { "epoch": 0.27, "learning_rate": 4.589761193149254e-06, "logits/chosen": -1.2685329914093018, "logits/rejected": -0.32444876432418823, "logps/chosen": -788.1159057617188, "logps/rejected": -1656.401123046875, "loss": 0.2214, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10539279133081436, "rewards/margins": 0.28618016839027405, "rewards/rejected": -0.3915729224681854, "step": 500 }, { "epoch": 0.27, "learning_rate": 4.563846952611112e-06, "logits/chosen": -1.1497045755386353, "logits/rejected": -0.3311644494533539, "logps/chosen": -789.111328125, "logps/rejected": -1705.7972412109375, "loss": 0.173, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.14451605081558228, "rewards/margins": 0.2676469087600708, "rewards/rejected": -0.4121629595756531, "step": 510 }, { "epoch": 0.28, "learning_rate": 4.537217006304141e-06, "logits/chosen": -1.1343199014663696, "logits/rejected": 0.12552905082702637, "logps/chosen": -782.1867065429688, "logps/rejected": -1627.5347900390625, "loss": 0.1644, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.15862300992012024, "rewards/margins": 0.2545243203639984, "rewards/rejected": -0.41314736008644104, "step": 520 }, { "epoch": 0.28, "learning_rate": 4.50988058902464e-06, "logits/chosen": -1.36593496799469, "logits/rejected": -0.3004814088344574, "logps/chosen": -754.1193237304688, "logps/rejected": -1613.5533447265625, "loss": 0.1722, "rewards/accuracies": 0.75, "rewards/chosen": -0.13889679312705994, "rewards/margins": 0.2828737497329712, "rewards/rejected": -0.4217705726623535, "step": 530 }, { "epoch": 0.29, "learning_rate": 4.481847180560593e-06, "logits/chosen": -1.5362759828567505, "logits/rejected": -0.10081689059734344, "logps/chosen": -763.066162109375, "logps/rejected": -1727.967529296875, "loss": 0.2263, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11022988706827164, "rewards/margins": 0.2953043580055237, "rewards/rejected": -0.40553420782089233, "step": 540 }, { "epoch": 0.29, "learning_rate": 4.453126502404253e-06, "logits/chosen": -1.2658751010894775, "logits/rejected": -0.15121665596961975, "logps/chosen": -676.2111206054688, "logps/rejected": -1652.478515625, "loss": 0.1659, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.15621882677078247, "rewards/margins": 0.28227540850639343, "rewards/rejected": -0.4384942650794983, "step": 550 }, { "epoch": 0.3, "learning_rate": 4.423728514380892e-06, "logits/chosen": -1.1152678728103638, "logits/rejected": -0.4836166501045227, "logps/chosen": -772.0936279296875, "logps/rejected": -2011.451904296875, "loss": 0.2331, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.19823512434959412, "rewards/margins": 0.4028354585170746, "rewards/rejected": -0.6010705828666687, "step": 560 }, { "epoch": 0.3, "learning_rate": 4.393663411194918e-06, "logits/chosen": -1.2210283279418945, "logits/rejected": -0.5110222101211548, "logps/chosen": -661.4767456054688, "logps/rejected": -1744.774658203125, "loss": 0.195, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09630689024925232, "rewards/margins": 0.3629889488220215, "rewards/rejected": -0.4592958390712738, "step": 570 }, { "epoch": 0.31, "learning_rate": 4.362941618894523e-06, "logits/chosen": -1.4232590198516846, "logits/rejected": -0.6951449513435364, "logps/chosen": -777.7971801757812, "logps/rejected": -1753.598388671875, "loss": 0.1906, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1481999158859253, "rewards/margins": 0.3005923628807068, "rewards/rejected": -0.4487922787666321, "step": 580 }, { "epoch": 0.31, "learning_rate": 4.331573791256116e-06, "logits/chosen": -1.2691911458969116, "logits/rejected": -0.21553485095500946, "logps/chosen": -850.1456298828125, "logps/rejected": -1807.684814453125, "loss": 0.1987, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.23677225410938263, "rewards/margins": 0.27481013536453247, "rewards/rejected": -0.5115823745727539, "step": 590 }, { "epoch": 0.32, "learning_rate": 4.299570806089786e-06, "logits/chosen": -1.2055295705795288, "logits/rejected": -0.07821548730134964, "logps/chosen": -801.0753173828125, "logps/rejected": -1668.043212890625, "loss": 0.1633, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.21256402134895325, "rewards/margins": 0.29289668798446655, "rewards/rejected": -0.5054606795310974, "step": 600 }, { "epoch": 0.33, "learning_rate": 4.266943761467057e-06, "logits/chosen": -1.290138840675354, "logits/rejected": 0.16648904979228973, "logps/chosen": -796.8035888671875, "logps/rejected": -1771.1650390625, "loss": 0.1665, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.11760182678699493, "rewards/margins": 0.294794499874115, "rewards/rejected": -0.41239628195762634, "step": 610 }, { "epoch": 0.33, "learning_rate": 4.233703971872287e-06, "logits/chosen": -1.53248131275177, "logits/rejected": 0.212439626455307, "logps/chosen": -613.8381958007812, "logps/rejected": -1503.5120849609375, "loss": 0.1547, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.053207941353321075, "rewards/margins": 0.2783101201057434, "rewards/rejected": -0.3315180242061615, "step": 620 }, { "epoch": 0.34, "learning_rate": 4.1998629642789925e-06, "logits/chosen": -1.56017005443573, "logits/rejected": -0.071937195956707, "logps/chosen": -674.8516845703125, "logps/rejected": -1349.5697021484375, "loss": 0.181, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.056226201355457306, "rewards/margins": 0.21242019534111023, "rewards/rejected": -0.26864638924598694, "step": 630 }, { "epoch": 0.34, "learning_rate": 4.165432474152505e-06, "logits/chosen": -1.2165509462356567, "logits/rejected": -0.5167722702026367, "logps/chosen": -677.8233032226562, "logps/rejected": -1581.596923828125, "loss": 0.1799, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.10262588411569595, "rewards/margins": 0.2893064618110657, "rewards/rejected": -0.39193230867385864, "step": 640 }, { "epoch": 0.35, "learning_rate": 4.130424441380308e-06, "logits/chosen": -1.5206298828125, "logits/rejected": -0.4772413372993469, "logps/chosen": -721.204345703125, "logps/rejected": -1608.91845703125, "loss": 0.1771, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1459272801876068, "rewards/margins": 0.2629932761192322, "rewards/rejected": -0.4089205861091614, "step": 650 }, { "epoch": 0.35, "learning_rate": 4.09485100613151e-06, "logits/chosen": -1.1552391052246094, "logits/rejected": -0.5302366614341736, "logps/chosen": -844.49560546875, "logps/rejected": -1849.5101318359375, "loss": 0.1755, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.20096397399902344, "rewards/margins": 0.2851325273513794, "rewards/rejected": -0.4860965609550476, "step": 660 }, { "epoch": 0.36, "learning_rate": 4.058724504646834e-06, "logits/chosen": -1.205714225769043, "logits/rejected": -0.6423812508583069, "logps/chosen": -707.0100708007812, "logps/rejected": -1621.2158203125, "loss": 0.141, "rewards/accuracies": 0.75, "rewards/chosen": -0.09242536127567291, "rewards/margins": 0.3333635926246643, "rewards/rejected": -0.4257889688014984, "step": 670 }, { "epoch": 0.36, "learning_rate": 4.022057464960632e-06, "logits/chosen": -1.365877389907837, "logits/rejected": 0.25049906969070435, "logps/chosen": -719.96630859375, "logps/rejected": -1713.641845703125, "loss": 0.1621, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.0692489892244339, "rewards/margins": 0.3098606467247009, "rewards/rejected": -0.3791096806526184, "step": 680 }, { "epoch": 0.37, "learning_rate": 3.984862602556383e-06, "logits/chosen": -1.2444730997085571, "logits/rejected": 0.32205188274383545, "logps/chosen": -701.1004638671875, "logps/rejected": -1603.306884765625, "loss": 0.1723, "rewards/accuracies": 0.875, "rewards/chosen": -0.0724688395857811, "rewards/margins": 0.2886015772819519, "rewards/rejected": -0.3610703945159912, "step": 690 }, { "epoch": 0.37, "learning_rate": 3.947152815957187e-06, "logits/chosen": -1.2809414863586426, "logits/rejected": 0.01291370578110218, "logps/chosen": -631.6290893554688, "logps/rejected": -1685.641357421875, "loss": 0.1675, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05924375727772713, "rewards/margins": 0.32539594173431396, "rewards/rejected": -0.3846396803855896, "step": 700 }, { "epoch": 0.38, "learning_rate": 3.908941182252785e-06, "logits/chosen": -1.20944344997406, "logits/rejected": 0.5900853276252747, "logps/chosen": -768.8225708007812, "logps/rejected": -1766.4296875, "loss": 0.1247, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0808175802230835, "rewards/margins": 0.33207273483276367, "rewards/rejected": -0.4128902852535248, "step": 710 }, { "epoch": 0.38, "learning_rate": 3.8702409525646535e-06, "logits/chosen": -1.496953010559082, "logits/rejected": -0.36526980996131897, "logps/chosen": -677.8695068359375, "logps/rejected": -1590.8409423828125, "loss": 0.1838, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.02380838617682457, "rewards/margins": 0.26771286129951477, "rewards/rejected": -0.29152125120162964, "step": 720 }, { "epoch": 0.39, "learning_rate": 3.8310655474507495e-06, "logits/chosen": -1.2399176359176636, "logits/rejected": -0.07380921393632889, "logps/chosen": -684.2032470703125, "logps/rejected": -1637.0625, "loss": 0.1776, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.05182964727282524, "rewards/margins": 0.2697630226612091, "rewards/rejected": -0.32159262895584106, "step": 730 }, { "epoch": 0.39, "learning_rate": 3.7914285522515002e-06, "logits/chosen": -1.3389846086502075, "logits/rejected": -0.04501932114362717, "logps/chosen": -659.2589721679688, "logps/rejected": -1660.969482421875, "loss": 0.1753, "rewards/accuracies": 0.875, "rewards/chosen": -0.06584084033966064, "rewards/margins": 0.3323908746242523, "rewards/rejected": -0.39823174476623535, "step": 740 }, { "epoch": 0.4, "learning_rate": 3.751343712378639e-06, "logits/chosen": -1.3522380590438843, "logits/rejected": 0.06499899923801422, "logps/chosen": -580.7006225585938, "logps/rejected": -1741.8189697265625, "loss": 0.1372, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04593845084309578, "rewards/margins": 0.39734122157096863, "rewards/rejected": -0.4432796835899353, "step": 750 }, { "epoch": 0.41, "learning_rate": 3.710824928548546e-06, "logits/chosen": -1.3140994310379028, "logits/rejected": -0.617447555065155, "logps/chosen": -633.9112548828125, "logps/rejected": -1493.082763671875, "loss": 0.1866, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.09161052852869034, "rewards/margins": 0.26011547446250916, "rewards/rejected": -0.3517259955406189, "step": 760 }, { "epoch": 0.41, "learning_rate": 3.6698862519617225e-06, "logits/chosen": -1.2595350742340088, "logits/rejected": -0.20695531368255615, "logps/chosen": -593.3395385742188, "logps/rejected": -1490.5479736328125, "loss": 0.1646, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.06257372349500656, "rewards/margins": 0.2620692849159241, "rewards/rejected": -0.3246430456638336, "step": 770 }, { "epoch": 0.42, "learning_rate": 3.6285418794300793e-06, "logits/chosen": -1.4119771718978882, "logits/rejected": -0.5417270064353943, "logps/chosen": -610.9232788085938, "logps/rejected": -1522.27197265625, "loss": 0.2589, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.08761414885520935, "rewards/margins": 0.2841811180114746, "rewards/rejected": -0.37179526686668396, "step": 780 }, { "epoch": 0.42, "learning_rate": 3.5868061484537365e-06, "logits/chosen": -1.333315134048462, "logits/rejected": -0.016486238688230515, "logps/chosen": -770.37744140625, "logps/rejected": -1697.8402099609375, "loss": 0.236, "rewards/accuracies": 0.875, "rewards/chosen": -0.11275408416986465, "rewards/margins": 0.25441592931747437, "rewards/rejected": -0.3671700060367584, "step": 790 }, { "epoch": 0.43, "learning_rate": 3.5446935322490285e-06, "logits/chosen": -1.4592931270599365, "logits/rejected": -0.8714092373847961, "logps/chosen": -645.5399169921875, "logps/rejected": -1329.577392578125, "loss": 0.2498, "rewards/accuracies": 0.625, "rewards/chosen": -0.08806028217077255, "rewards/margins": 0.12212536484003067, "rewards/rejected": -0.21018561720848083, "step": 800 }, { "epoch": 0.43, "learning_rate": 3.502218634729447e-06, "logits/chosen": -1.4128929376602173, "logits/rejected": -0.7669699788093567, "logps/chosen": -649.4080810546875, "logps/rejected": -1620.560302734375, "loss": 0.1874, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.11496981233358383, "rewards/margins": 0.2872191071510315, "rewards/rejected": -0.40218886733055115, "step": 810 }, { "epoch": 0.44, "learning_rate": 3.459396185441265e-06, "logits/chosen": -1.3515876531600952, "logits/rejected": -0.6728461980819702, "logps/chosen": -762.8712158203125, "logps/rejected": -1834.1533203125, "loss": 0.2323, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.14702202379703522, "rewards/margins": 0.354480117559433, "rewards/rejected": -0.5015021562576294, "step": 820 }, { "epoch": 0.44, "learning_rate": 3.4162410344555834e-06, "logits/chosen": -1.4009459018707275, "logits/rejected": -0.9621168375015259, "logps/chosen": -657.5804443359375, "logps/rejected": -1689.560302734375, "loss": 0.1928, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.056845568120479584, "rewards/margins": 0.3001772463321686, "rewards/rejected": -0.35702282190322876, "step": 830 }, { "epoch": 0.45, "learning_rate": 3.3727681472185937e-06, "logits/chosen": -1.2048084735870361, "logits/rejected": -0.543747067451477, "logps/chosen": -639.1177978515625, "logps/rejected": -1738.9322509765625, "loss": 0.1711, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.02559160254895687, "rewards/margins": 0.27681928873062134, "rewards/rejected": -0.30241090059280396, "step": 840 }, { "epoch": 0.45, "learning_rate": 3.3289925993618217e-06, "logits/chosen": -1.1008461713790894, "logits/rejected": -0.47523483633995056, "logps/chosen": -638.2371215820312, "logps/rejected": -1568.861328125, "loss": 0.1779, "rewards/accuracies": 0.75, "rewards/chosen": -0.07939572632312775, "rewards/margins": 0.26904281973838806, "rewards/rejected": -0.3484385311603546, "step": 850 }, { "epoch": 0.46, "learning_rate": 3.2849295714741643e-06, "logits/chosen": -1.2790766954421997, "logits/rejected": -0.019085492938756943, "logps/chosen": -567.5126342773438, "logps/rejected": -1681.409912109375, "loss": 0.176, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.059426773339509964, "rewards/margins": 0.36287856101989746, "rewards/rejected": -0.4223053455352783, "step": 860 }, { "epoch": 0.46, "learning_rate": 3.2405943438375287e-06, "logits/chosen": -1.3036001920700073, "logits/rejected": -0.434285968542099, "logps/chosen": -641.4450073242188, "logps/rejected": -1748.319091796875, "loss": 0.124, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.03710533678531647, "rewards/margins": 0.3532947897911072, "rewards/rejected": -0.39040011167526245, "step": 870 }, { "epoch": 0.47, "learning_rate": 3.1960022911279036e-06, "logits/chosen": -1.302273154258728, "logits/rejected": -0.5197567939758301, "logps/chosen": -562.3380737304688, "logps/rejected": -1495.635498046875, "loss": 0.1667, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.03037446178495884, "rewards/margins": 0.2802720069885254, "rewards/rejected": -0.3106464743614197, "step": 880 }, { "epoch": 0.47, "learning_rate": 3.1511688770836844e-06, "logits/chosen": -1.4215189218521118, "logits/rejected": -0.5714216828346252, "logps/chosen": -665.7767333984375, "logps/rejected": -1658.6048583984375, "loss": 0.1316, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.05627693980932236, "rewards/margins": 0.32271087169647217, "rewards/rejected": -0.3789878189563751, "step": 890 }, { "epoch": 0.48, "learning_rate": 3.1061096491431307e-06, "logits/chosen": -1.3552201986312866, "logits/rejected": -0.9116779565811157, "logps/chosen": -690.0846557617188, "logps/rejected": -1613.9110107421875, "loss": 0.1754, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.04472965747117996, "rewards/margins": 0.2647218108177185, "rewards/rejected": -0.3094514310359955, "step": 900 }, { "epoch": 0.49, "learning_rate": 3.0608402330527796e-06, "logits/chosen": -1.3723796606063843, "logits/rejected": -0.6000717282295227, "logps/chosen": -692.8043212890625, "logps/rejected": -1629.4808349609375, "loss": 0.2028, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.13592025637626648, "rewards/margins": 0.29076087474823, "rewards/rejected": -0.42668113112449646, "step": 910 }, { "epoch": 0.49, "learning_rate": 3.0153763274487176e-06, "logits/chosen": -1.3126561641693115, "logits/rejected": -0.3268551230430603, "logps/chosen": -676.1267700195312, "logps/rejected": -1620.5989990234375, "loss": 0.1766, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.09026195108890533, "rewards/margins": 0.3179401159286499, "rewards/rejected": -0.40820208191871643, "step": 920 }, { "epoch": 0.5, "learning_rate": 2.9697336984125683e-06, "logits/chosen": -1.306850552558899, "logits/rejected": -0.35740819573402405, "logps/chosen": -662.0093994140625, "logps/rejected": -1545.4691162109375, "loss": 0.1835, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.05962755158543587, "rewards/margins": 0.2719072997570038, "rewards/rejected": -0.33153486251831055, "step": 930 }, { "epoch": 0.5, "learning_rate": 2.923928174004094e-06, "logits/chosen": -1.454861044883728, "logits/rejected": -0.696769654750824, "logps/chosen": -648.5043334960938, "logps/rejected": -1763.5390625, "loss": 0.1541, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.021979842334985733, "rewards/margins": 0.3547331690788269, "rewards/rejected": -0.3767130672931671, "step": 940 }, { "epoch": 0.51, "learning_rate": 2.8779756387723036e-06, "logits/chosen": -1.4002293348312378, "logits/rejected": -0.8346809148788452, "logps/chosen": -564.9570922851562, "logps/rejected": -1489.9595947265625, "loss": 0.2194, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.02880738116800785, "rewards/margins": 0.2632240653038025, "rewards/rejected": -0.292031466960907, "step": 950 }, { "epoch": 0.51, "learning_rate": 2.831892028246968e-06, "logits/chosen": -1.215348482131958, "logits/rejected": -0.574180006980896, "logps/chosen": -637.7142944335938, "logps/rejected": -1575.6964111328125, "loss": 0.1602, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.034471701830625534, "rewards/margins": 0.2788967490196228, "rewards/rejected": -0.31336846947669983, "step": 960 }, { "epoch": 0.52, "learning_rate": 2.7856933234124617e-06, "logits/chosen": -1.4842756986618042, "logits/rejected": -0.5266194343566895, "logps/chosen": -667.3348388671875, "logps/rejected": -1674.873779296875, "loss": 0.1523, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.04780828207731247, "rewards/margins": 0.32556313276290894, "rewards/rejected": -0.3733713626861572, "step": 970 }, { "epoch": 0.52, "learning_rate": 2.7393955451658387e-06, "logits/chosen": -0.9660801887512207, "logits/rejected": -0.038875680416822433, "logps/chosen": -644.0047607421875, "logps/rejected": -1645.885498046875, "loss": 0.1848, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.07426423579454422, "rewards/margins": 0.31580421328544617, "rewards/rejected": -0.3900684714317322, "step": 980 }, { "epoch": 0.53, "learning_rate": 2.6930147487610667e-06, "logits/chosen": -1.2562506198883057, "logits/rejected": -0.5583636164665222, "logps/chosen": -687.5775146484375, "logps/rejected": -1682.3447265625, "loss": 0.2442, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.10189177095890045, "rewards/margins": 0.29264310002326965, "rewards/rejected": -0.3945348560810089, "step": 990 }, { "epoch": 0.53, "learning_rate": 2.6465670182413487e-06, "logits/chosen": -1.0773265361785889, "logits/rejected": -0.38139039278030396, "logps/chosen": -669.3234252929688, "logps/rejected": -1580.733642578125, "loss": 0.1672, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.07045495510101318, "rewards/margins": 0.24894073605537415, "rewards/rejected": -0.31939566135406494, "step": 1000 }, { "epoch": 0.54, "learning_rate": 2.6000684608614594e-06, "logits/chosen": -1.3887989521026611, "logits/rejected": -0.6506294012069702, "logps/chosen": -653.4199829101562, "logps/rejected": -1556.509521484375, "loss": 0.1917, "rewards/accuracies": 0.75, "rewards/chosen": -0.016438838094472885, "rewards/margins": 0.27959805727005005, "rewards/rejected": -0.29603689908981323, "step": 1010 }, { "epoch": 0.54, "learning_rate": 2.5535352015020338e-06, "logits/chosen": -1.2708961963653564, "logits/rejected": -0.4829919934272766, "logps/chosen": -708.9461669921875, "logps/rejected": -1660.2349853515625, "loss": 0.1956, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.02399434708058834, "rewards/margins": 0.2889823317527771, "rewards/rejected": -0.3129766881465912, "step": 1020 }, { "epoch": 0.55, "learning_rate": 2.506983377077741e-06, "logits/chosen": -1.2446025609970093, "logits/rejected": -0.7467063665390015, "logps/chosen": -516.4700927734375, "logps/rejected": -1463.983642578125, "loss": 0.1882, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.015911739319562912, "rewards/margins": 0.294134259223938, "rewards/rejected": -0.3100460171699524, "step": 1030 }, { "epoch": 0.55, "learning_rate": 2.460429130941289e-06, "logits/chosen": -1.4046454429626465, "logits/rejected": -0.6258004903793335, "logps/chosen": -548.9246826171875, "logps/rejected": -1328.09716796875, "loss": 0.1558, "rewards/accuracies": 0.75, "rewards/chosen": -0.007907375693321228, "rewards/margins": 0.23490925133228302, "rewards/rejected": -0.24281664192676544, "step": 1040 }, { "epoch": 0.56, "learning_rate": 2.413888607285192e-06, "logits/chosen": -1.351510763168335, "logits/rejected": -0.3396194279193878, "logps/chosen": -716.7518310546875, "logps/rejected": -1452.3531494140625, "loss": 0.1761, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.030713701620697975, "rewards/margins": 0.24616649746894836, "rewards/rejected": -0.2768802046775818, "step": 1050 }, { "epoch": 0.57, "learning_rate": 2.367377945543249e-06, "logits/chosen": -1.2682628631591797, "logits/rejected": -0.39214056730270386, "logps/chosen": -745.8023681640625, "logps/rejected": -1684.0126953125, "loss": 0.1596, "rewards/accuracies": 0.875, "rewards/chosen": -0.05257143825292587, "rewards/margins": 0.3037104606628418, "rewards/rejected": -0.35628193616867065, "step": 1060 }, { "epoch": 0.57, "learning_rate": 2.320913274793676e-06, "logits/chosen": -1.3436534404754639, "logits/rejected": -0.21577882766723633, "logps/chosen": -625.4033813476562, "logps/rejected": -1678.371826171875, "loss": 0.178, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.026138927787542343, "rewards/margins": 0.3423928916454315, "rewards/rejected": -0.36853182315826416, "step": 1070 }, { "epoch": 0.58, "learning_rate": 2.27451070816582e-06, "logits/chosen": -1.5014312267303467, "logits/rejected": -0.43270865082740784, "logps/chosen": -686.2655029296875, "logps/rejected": -1728.8896484375, "loss": 0.183, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.019897693768143654, "rewards/margins": 0.35252150893211365, "rewards/rejected": -0.37241920828819275, "step": 1080 }, { "epoch": 0.58, "learning_rate": 2.228186337252414e-06, "logits/chosen": -1.3439433574676514, "logits/rejected": -0.20307889580726624, "logps/chosen": -795.5742797851562, "logps/rejected": -1796.8831787109375, "loss": 0.1734, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08263162523508072, "rewards/margins": 0.32303547859191895, "rewards/rejected": -0.40566712617874146, "step": 1090 }, { "epoch": 0.59, "learning_rate": 2.1819562265292946e-06, "logits/chosen": -1.2423124313354492, "logits/rejected": -0.54206782579422, "logps/chosen": -570.5196533203125, "logps/rejected": -1636.732421875, "loss": 0.1307, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.07189210504293442, "rewards/margins": 0.3438221514225006, "rewards/rejected": -0.41571420431137085, "step": 1100 }, { "epoch": 0.59, "learning_rate": 2.1358364077845236e-06, "logits/chosen": -1.2860864400863647, "logits/rejected": -0.5983974933624268, "logps/chosen": -659.2459716796875, "logps/rejected": -1549.3529052734375, "loss": 0.1609, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.08437564969062805, "rewards/margins": 0.2931548058986664, "rewards/rejected": -0.3775304853916168, "step": 1110 }, { "epoch": 0.6, "learning_rate": 2.089842874558849e-06, "logits/chosen": -1.3163573741912842, "logits/rejected": -0.6514892578125, "logps/chosen": -619.4735107421875, "logps/rejected": -1633.3419189453125, "loss": 0.1827, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05838247388601303, "rewards/margins": 0.3051317036151886, "rewards/rejected": -0.3635141849517822, "step": 1120 }, { "epoch": 0.6, "learning_rate": 2.0439915765994242e-06, "logits/chosen": -1.4336564540863037, "logits/rejected": -0.5167166590690613, "logps/chosen": -519.189453125, "logps/rejected": -1454.1304931640625, "loss": 0.154, "rewards/accuracies": 0.75, "rewards/chosen": -0.05485490709543228, "rewards/margins": 0.2894749343395233, "rewards/rejected": -0.3443298935890198, "step": 1130 }, { "epoch": 0.61, "learning_rate": 1.9982984143287186e-06, "logits/chosen": -0.9816936254501343, "logits/rejected": -0.26896458864212036, "logps/chosen": -681.84765625, "logps/rejected": -1698.229736328125, "loss": 0.1722, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.09818267822265625, "rewards/margins": 0.25163981318473816, "rewards/rejected": -0.3498225212097168, "step": 1140 }, { "epoch": 0.61, "learning_rate": 1.95277923333053e-06, "logits/chosen": -1.5167888402938843, "logits/rejected": -0.49634304642677307, "logps/chosen": -805.359130859375, "logps/rejected": -1700.7835693359375, "loss": 0.1245, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09041927754878998, "rewards/margins": 0.3279864192008972, "rewards/rejected": -0.418405681848526, "step": 1150 }, { "epoch": 0.62, "learning_rate": 1.9074498188550156e-06, "logits/chosen": -1.519516110420227, "logits/rejected": -0.12074364721775055, "logps/chosen": -685.3983154296875, "logps/rejected": -1531.295166015625, "loss": 0.1808, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.056040991097688675, "rewards/margins": 0.315580278635025, "rewards/rejected": -0.3716212511062622, "step": 1160 }, { "epoch": 0.62, "learning_rate": 1.862325890344643e-06, "logits/chosen": -1.232160210609436, "logits/rejected": -0.23375025391578674, "logps/chosen": -698.9664916992188, "logps/rejected": -1915.4918212890625, "loss": 0.1419, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.09169380366802216, "rewards/margins": 0.40644773840904236, "rewards/rejected": -0.49814146757125854, "step": 1170 }, { "epoch": 0.63, "learning_rate": 1.817423095982972e-06, "logits/chosen": -1.3982770442962646, "logits/rejected": -0.8638380765914917, "logps/chosen": -595.23291015625, "logps/rejected": -1435.40966796875, "loss": 0.1757, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.07611984014511108, "rewards/margins": 0.268936425447464, "rewards/rejected": -0.34505629539489746, "step": 1180 }, { "epoch": 0.63, "learning_rate": 1.7727570072681293e-06, "logits/chosen": -1.4742844104766846, "logits/rejected": -0.3091976046562195, "logps/chosen": -600.8450317382812, "logps/rejected": -1698.4534912109375, "loss": 0.109, "rewards/accuracies": 0.875, "rewards/chosen": -0.05584261566400528, "rewards/margins": 0.35010969638824463, "rewards/rejected": -0.4059523046016693, "step": 1190 }, { "epoch": 0.64, "learning_rate": 1.7283431136128961e-06, "logits/chosen": -1.4247504472732544, "logits/rejected": -0.4096221327781677, "logps/chosen": -717.885498046875, "logps/rejected": -1856.927734375, "loss": 0.1427, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0902978777885437, "rewards/margins": 0.3808268904685974, "rewards/rejected": -0.4711247384548187, "step": 1200 }, { "epoch": 0.65, "learning_rate": 1.6841968169732478e-06, "logits/chosen": -1.285744071006775, "logits/rejected": -0.08921636641025543, "logps/chosen": -761.4056396484375, "logps/rejected": -1614.3876953125, "loss": 0.1334, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10277809947729111, "rewards/margins": 0.29823797941207886, "rewards/rejected": -0.4010160565376282, "step": 1210 }, { "epoch": 0.65, "learning_rate": 1.6403334265072284e-06, "logits/chosen": -1.2995268106460571, "logits/rejected": -0.6406716704368591, "logps/chosen": -857.0849609375, "logps/rejected": -1790.50390625, "loss": 0.1581, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.14391708374023438, "rewards/margins": 0.3202173411846161, "rewards/rejected": -0.4641345143318176, "step": 1220 }, { "epoch": 0.66, "learning_rate": 1.5967681532660066e-06, "logits/chosen": -1.2067750692367554, "logits/rejected": -0.40620866417884827, "logps/chosen": -689.6841430664062, "logps/rejected": -1506.9180908203125, "loss": 0.1781, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08755698055028915, "rewards/margins": 0.24655339121818542, "rewards/rejected": -0.33411040902137756, "step": 1230 }, { "epoch": 0.66, "learning_rate": 1.5535161049189463e-06, "logits/chosen": -1.388139009475708, "logits/rejected": -0.28742751479148865, "logps/chosen": -702.9202270507812, "logps/rejected": -1619.192138671875, "loss": 0.1155, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11033840477466583, "rewards/margins": 0.27864348888397217, "rewards/rejected": -0.3889819085597992, "step": 1240 }, { "epoch": 0.67, "learning_rate": 1.5105922805145356e-06, "logits/chosen": -1.4737704992294312, "logits/rejected": -0.7333568334579468, "logps/chosen": -611.47314453125, "logps/rejected": -1699.2509765625, "loss": 0.1432, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.06176890805363655, "rewards/margins": 0.35512489080429077, "rewards/rejected": -0.4168938100337982, "step": 1250 }, { "epoch": 0.67, "learning_rate": 1.4680115652789823e-06, "logits/chosen": -1.1875636577606201, "logits/rejected": -0.24240991473197937, "logps/chosen": -745.5838012695312, "logps/rejected": -1765.468505859375, "loss": 0.1855, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10248545557260513, "rewards/margins": 0.34968844056129456, "rewards/rejected": -0.4521738588809967, "step": 1260 }, { "epoch": 0.68, "learning_rate": 1.4257887254542767e-06, "logits/chosen": -1.2042195796966553, "logits/rejected": -0.4282516837120056, "logps/chosen": -745.48291015625, "logps/rejected": -1836.1507568359375, "loss": 0.2098, "rewards/accuracies": 0.875, "rewards/chosen": -0.08774339407682419, "rewards/margins": 0.3781694173812866, "rewards/rejected": -0.4659128785133362, "step": 1270 }, { "epoch": 0.68, "learning_rate": 1.3839384031775227e-06, "logits/chosen": -1.3200931549072266, "logits/rejected": -0.23144832253456116, "logps/chosen": -582.4103393554688, "logps/rejected": -1639.1998291015625, "loss": 0.1594, "rewards/accuracies": 0.875, "rewards/chosen": -0.03387508541345596, "rewards/margins": 0.3454126715660095, "rewards/rejected": -0.3792877793312073, "step": 1280 }, { "epoch": 0.69, "learning_rate": 1.342475111403298e-06, "logits/chosen": -1.4199360609054565, "logits/rejected": -0.5141012072563171, "logps/chosen": -664.3624877929688, "logps/rejected": -1757.888427734375, "loss": 0.1348, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.03387322276830673, "rewards/margins": 0.35872942209243774, "rewards/rejected": -0.3926026523113251, "step": 1290 }, { "epoch": 0.69, "learning_rate": 1.3014132288708209e-06, "logits/chosen": -1.2313156127929688, "logits/rejected": -0.6418328881263733, "logps/chosen": -680.6788330078125, "logps/rejected": -1728.441162109375, "loss": 0.1366, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.06790607422590256, "rewards/margins": 0.3306724429130554, "rewards/rejected": -0.39857858419418335, "step": 1300 }, { "epoch": 0.7, "learning_rate": 1.2607669951176549e-06, "logits/chosen": -1.322935700416565, "logits/rejected": -0.7819048166275024, "logps/chosen": -642.0144653320312, "logps/rejected": -1622.2706298828125, "loss": 0.1403, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.03408288210630417, "rewards/margins": 0.3696083426475525, "rewards/rejected": -0.40369120240211487, "step": 1310 }, { "epoch": 0.7, "learning_rate": 1.2205505055416891e-06, "logits/chosen": -1.2069556713104248, "logits/rejected": -0.599054217338562, "logps/chosen": -657.8556518554688, "logps/rejected": -1849.9466552734375, "loss": 0.1322, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.06479531526565552, "rewards/margins": 0.36816781759262085, "rewards/rejected": -0.4329630732536316, "step": 1320 }, { "epoch": 0.71, "learning_rate": 1.1807777065131002e-06, "logits/chosen": -1.1746324300765991, "logits/rejected": -0.04525896906852722, "logps/chosen": -619.5230712890625, "logps/rejected": -1747.1986083984375, "loss": 0.1161, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.041824158281087875, "rewards/margins": 0.345977246761322, "rewards/rejected": -0.3878014385700226, "step": 1330 }, { "epoch": 0.71, "learning_rate": 1.1414623905380012e-06, "logits/chosen": -1.1227614879608154, "logits/rejected": 0.07286291569471359, "logps/chosen": -683.210693359375, "logps/rejected": -1760.2294921875, "loss": 0.1307, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.06253410875797272, "rewards/margins": 0.33463478088378906, "rewards/rejected": -0.39716893434524536, "step": 1340 }, { "epoch": 0.72, "learning_rate": 1.1026181914754388e-06, "logits/chosen": -1.2235952615737915, "logits/rejected": -0.23109273612499237, "logps/chosen": -533.2498168945312, "logps/rejected": -1668.765380859375, "loss": 0.1519, "rewards/accuracies": 0.875, "rewards/chosen": -0.01653262972831726, "rewards/margins": 0.36776840686798096, "rewards/rejected": -0.3843010663986206, "step": 1350 }, { "epoch": 0.73, "learning_rate": 1.0642585798094136e-06, "logits/chosen": -1.1692348718643188, "logits/rejected": -0.5899300575256348, "logps/chosen": -686.9459228515625, "logps/rejected": -1795.8177490234375, "loss": 0.1209, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.07367046922445297, "rewards/margins": 0.3583151698112488, "rewards/rejected": -0.43198567628860474, "step": 1360 }, { "epoch": 0.73, "learning_rate": 1.0263968579775522e-06, "logits/chosen": -1.2771265506744385, "logits/rejected": -0.38393262028694153, "logps/chosen": -683.27490234375, "logps/rejected": -1695.8990478515625, "loss": 0.1321, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09097830951213837, "rewards/margins": 0.35342246294021606, "rewards/rejected": -0.4444007873535156, "step": 1370 }, { "epoch": 0.74, "learning_rate": 9.89046155758058e-07, "logits/chosen": -1.1900537014007568, "logits/rejected": -0.1559818983078003, "logps/chosen": -743.1553955078125, "logps/rejected": -1642.6292724609375, "loss": 0.1577, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05998997762799263, "rewards/margins": 0.35595935583114624, "rewards/rejected": -0.41594934463500977, "step": 1380 }, { "epoch": 0.74, "learning_rate": 9.52219425716534e-07, "logits/chosen": -1.4094737768173218, "logits/rejected": -0.359419047832489, "logps/chosen": -764.5060424804688, "logps/rejected": -1663.7763671875, "loss": 0.1389, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08220493793487549, "rewards/margins": 0.2807352542877197, "rewards/rejected": -0.3629401922225952, "step": 1390 }, { "epoch": 0.75, "learning_rate": 9.15929438714262e-07, "logits/chosen": -1.2259761095046997, "logits/rejected": -0.3568953275680542, "logps/chosen": -684.1424560546875, "logps/rejected": -1751.624267578125, "loss": 0.1703, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.07558954507112503, "rewards/margins": 0.3323276937007904, "rewards/rejected": -0.4079172611236572, "step": 1400 }, { "epoch": 0.75, "learning_rate": 8.801887794794911e-07, "logits/chosen": -1.2949384450912476, "logits/rejected": -0.26310306787490845, "logps/chosen": -527.4078979492188, "logps/rejected": -1489.334716796875, "loss": 0.167, "rewards/accuracies": 0.75, "rewards/chosen": -0.029276112094521523, "rewards/margins": 0.3588224947452545, "rewards/rejected": -0.3880985677242279, "step": 1410 }, { "epoch": 0.76, "learning_rate": 8.450098422432787e-07, "logits/chosen": -1.084120512008667, "logits/rejected": -0.5709729790687561, "logps/chosen": -519.4564208984375, "logps/rejected": -1378.505615234375, "loss": 0.1475, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.043223101645708084, "rewards/margins": 0.2802775204181671, "rewards/rejected": -0.3235006332397461, "step": 1420 }, { "epoch": 0.76, "learning_rate": 8.104048264413858e-07, "logits/chosen": -1.3592941761016846, "logits/rejected": -0.6775336265563965, "logps/chosen": -672.0179443359375, "logps/rejected": -1647.7578125, "loss": 0.1632, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.042347002774477005, "rewards/margins": 0.309231162071228, "rewards/rejected": -0.3515781760215759, "step": 1430 }, { "epoch": 0.77, "learning_rate": 7.763857324837321e-07, "logits/chosen": -1.3305068016052246, "logits/rejected": -0.422370046377182, "logps/chosen": -578.7777099609375, "logps/rejected": -1635.9923095703125, "loss": 0.1643, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.013389256782829762, "rewards/margins": 0.3833867907524109, "rewards/rejected": -0.3967760503292084, "step": 1440 }, { "epoch": 0.77, "learning_rate": 7.429643575928605e-07, "logits/chosen": -1.1784260272979736, "logits/rejected": -0.4220113754272461, "logps/chosen": -715.4688720703125, "logps/rejected": -1668.013916015625, "loss": 0.1387, "rewards/accuracies": 0.875, "rewards/chosen": -0.04753159359097481, "rewards/margins": 0.34793567657470703, "rewards/rejected": -0.3954673409461975, "step": 1450 }, { "epoch": 0.78, "learning_rate": 7.101522917128709e-07, "logits/chosen": -1.1453496217727661, "logits/rejected": -0.23903509974479675, "logps/chosen": -688.1276245117188, "logps/rejected": -1582.7860107421875, "loss": 0.1475, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.03351983800530434, "rewards/margins": 0.31615251302719116, "rewards/rejected": -0.3496723771095276, "step": 1460 }, { "epoch": 0.78, "learning_rate": 6.779609134902312e-07, "logits/chosen": -1.5365946292877197, "logits/rejected": -0.83519446849823, "logps/chosen": -617.84912109375, "logps/rejected": -1601.636474609375, "loss": 0.1358, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.02726108208298683, "rewards/margins": 0.3341112434864044, "rewards/rejected": -0.36137229204177856, "step": 1470 }, { "epoch": 0.79, "learning_rate": 6.464013863278629e-07, "logits/chosen": -1.3226994276046753, "logits/rejected": -0.2582075297832489, "logps/chosen": -690.5620727539062, "logps/rejected": -1618.515380859375, "loss": 0.1555, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.04355646297335625, "rewards/margins": 0.32091113924980164, "rewards/rejected": -0.364467591047287, "step": 1480 }, { "epoch": 0.79, "learning_rate": 6.154846545138696e-07, "logits/chosen": -1.1359361410140991, "logits/rejected": -0.28622904419898987, "logps/chosen": -643.8432006835938, "logps/rejected": -1722.578125, "loss": 0.1235, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.031076664105057716, "rewards/margins": 0.38383063673973083, "rewards/rejected": -0.414907306432724, "step": 1490 }, { "epoch": 0.8, "learning_rate": 5.852214394262515e-07, "logits/chosen": -1.4020814895629883, "logits/rejected": -0.29137012362480164, "logps/chosen": -722.795654296875, "logps/rejected": -1755.2005615234375, "loss": 0.145, "rewards/accuracies": 0.875, "rewards/chosen": -0.05530545115470886, "rewards/margins": 0.3759249448776245, "rewards/rejected": -0.431230366230011, "step": 1500 }, { "epoch": 0.81, "learning_rate": 5.556222358149191e-07, "logits/chosen": -1.082096815109253, "logits/rejected": -0.2843102216720581, "logps/chosen": -636.3153076171875, "logps/rejected": -1611.7149658203125, "loss": 0.1259, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.05015719681978226, "rewards/margins": 0.33069074153900146, "rewards/rejected": -0.3808479905128479, "step": 1510 }, { "epoch": 0.81, "learning_rate": 5.266973081622992e-07, "logits/chosen": -1.190901756286621, "logits/rejected": -0.3367752134799957, "logps/chosen": -654.8163452148438, "logps/rejected": -1681.1123046875, "loss": 0.1334, "rewards/accuracies": 0.875, "rewards/chosen": -0.05181831866502762, "rewards/margins": 0.37140342593193054, "rewards/rejected": -0.42322173714637756, "step": 1520 }, { "epoch": 0.82, "learning_rate": 4.984566871237942e-07, "logits/chosen": -1.569336175918579, "logits/rejected": -0.28200453519821167, "logps/chosen": -558.4755859375, "logps/rejected": -1696.124267578125, "loss": 0.1255, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.019404267892241478, "rewards/margins": 0.42805781960487366, "rewards/rejected": -0.4474620819091797, "step": 1530 }, { "epoch": 0.82, "learning_rate": 4.709101660493251e-07, "logits/chosen": -1.3863935470581055, "logits/rejected": -0.5687643885612488, "logps/chosen": -742.7957153320312, "logps/rejected": -1831.429931640625, "loss": 0.1213, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.082461878657341, "rewards/margins": 0.3909619450569153, "rewards/rejected": -0.4734238088130951, "step": 1540 }, { "epoch": 0.83, "learning_rate": 4.440672975871743e-07, "logits/chosen": -1.4774668216705322, "logits/rejected": -0.9906954765319824, "logps/chosen": -623.5938720703125, "logps/rejected": -1661.2242431640625, "loss": 0.1343, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.0459056980907917, "rewards/margins": 0.38650375604629517, "rewards/rejected": -0.43240952491760254, "step": 1550 }, { "epoch": 0.83, "learning_rate": 4.1793739037129134e-07, "logits/chosen": -1.197632074356079, "logits/rejected": -0.7537062168121338, "logps/chosen": -547.0568237304688, "logps/rejected": -1868.6796875, "loss": 0.1353, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.03627854213118553, "rewards/margins": 0.4777587950229645, "rewards/rejected": -0.5140373110771179, "step": 1560 }, { "epoch": 0.84, "learning_rate": 3.9252950579322405e-07, "logits/chosen": -1.4480640888214111, "logits/rejected": -0.6620736718177795, "logps/chosen": -862.5847778320312, "logps/rejected": -1864.031005859375, "loss": 0.1868, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1275833547115326, "rewards/margins": 0.371966689825058, "rewards/rejected": -0.4995500147342682, "step": 1570 }, { "epoch": 0.84, "learning_rate": 3.6785245485978864e-07, "logits/chosen": -1.5964170694351196, "logits/rejected": -0.689164400100708, "logps/chosen": -763.082763671875, "logps/rejected": -1659.866943359375, "loss": 0.177, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.1088029146194458, "rewards/margins": 0.3148026764392853, "rewards/rejected": -0.4236055910587311, "step": 1580 }, { "epoch": 0.85, "learning_rate": 3.43914795137566e-07, "logits/chosen": -1.4386413097381592, "logits/rejected": -0.14032740890979767, "logps/chosen": -754.867431640625, "logps/rejected": -1875.2099609375, "loss": 0.1129, "rewards/accuracies": 0.875, "rewards/chosen": -0.08381593227386475, "rewards/margins": 0.4074801504611969, "rewards/rejected": -0.49129611253738403, "step": 1590 }, { "epoch": 0.85, "learning_rate": 3.207248277852901e-07, "logits/chosen": -1.4534538984298706, "logits/rejected": -0.4819762110710144, "logps/chosen": -752.8095703125, "logps/rejected": -1707.563232421875, "loss": 0.1529, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10240931808948517, "rewards/margins": 0.3559865653514862, "rewards/rejected": -0.45839595794677734, "step": 1600 }, { "epoch": 0.86, "learning_rate": 2.9829059467515074e-07, "logits/chosen": -1.6940540075302124, "logits/rejected": -0.40000471472740173, "logps/chosen": -623.6875610351562, "logps/rejected": -1585.805908203125, "loss": 0.1576, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.046424802392721176, "rewards/margins": 0.34341102838516235, "rewards/rejected": -0.38983583450317383, "step": 1610 }, { "epoch": 0.86, "learning_rate": 2.766198756040153e-07, "logits/chosen": -1.2030726671218872, "logits/rejected": -0.2915817201137543, "logps/chosen": -833.3904418945312, "logps/rejected": -1901.398681640625, "loss": 0.1341, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.07304056733846664, "rewards/margins": 0.4044952392578125, "rewards/rejected": -0.47753578424453735, "step": 1620 }, { "epoch": 0.87, "learning_rate": 2.5572018559553155e-07, "logits/chosen": -1.4762271642684937, "logits/rejected": -0.5102109909057617, "logps/chosen": -739.6577758789062, "logps/rejected": -1855.5865478515625, "loss": 0.0915, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.08025506138801575, "rewards/margins": 0.4267922043800354, "rewards/rejected": -0.5070472955703735, "step": 1630 }, { "epoch": 0.87, "learning_rate": 2.3559877229404864e-07, "logits/chosen": -1.327376127243042, "logits/rejected": -0.7707889080047607, "logps/chosen": -711.1609497070312, "logps/rejected": -1628.607421875, "loss": 0.1191, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.07269873470067978, "rewards/margins": 0.33979684114456177, "rewards/rejected": -0.41249552369117737, "step": 1640 }, { "epoch": 0.88, "learning_rate": 2.1626261345126576e-07, "logits/chosen": -1.2067131996154785, "logits/rejected": -0.12356331199407578, "logps/chosen": -640.2079467773438, "logps/rejected": -1719.388427734375, "loss": 0.1247, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.05858426168560982, "rewards/margins": 0.4315997064113617, "rewards/rejected": -0.4901839792728424, "step": 1650 }, { "epoch": 0.89, "learning_rate": 1.9771841450646505e-07, "logits/chosen": -1.2794129848480225, "logits/rejected": -0.40672388672828674, "logps/chosen": -641.7750244140625, "logps/rejected": -1771.903564453125, "loss": 0.1566, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.08581944555044174, "rewards/margins": 0.3960307240486145, "rewards/rejected": -0.4818502366542816, "step": 1660 }, { "epoch": 0.89, "learning_rate": 1.7997260626118758e-07, "logits/chosen": -1.116964340209961, "logits/rejected": -0.30027687549591064, "logps/chosen": -827.5680541992188, "logps/rejected": -1756.0361328125, "loss": 0.1818, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.13185694813728333, "rewards/margins": 0.34328708052635193, "rewards/rejected": -0.47514405846595764, "step": 1670 }, { "epoch": 0.9, "learning_rate": 1.6303134264914365e-07, "logits/chosen": -1.050196647644043, "logits/rejected": -0.43848666548728943, "logps/chosen": -749.1686401367188, "logps/rejected": -1560.6253662109375, "loss": 0.2079, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.1045556515455246, "rewards/margins": 0.2498161792755127, "rewards/rejected": -0.3543718457221985, "step": 1680 }, { "epoch": 0.9, "learning_rate": 1.469004986021355e-07, "logits/chosen": -1.199964165687561, "logits/rejected": -0.08608709275722504, "logps/chosen": -697.6527099609375, "logps/rejected": -1817.315673828125, "loss": 0.1241, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.10202528536319733, "rewards/margins": 0.3813837170600891, "rewards/rejected": -0.48340901732444763, "step": 1690 }, { "epoch": 0.91, "learning_rate": 1.315856680127367e-07, "logits/chosen": -1.1728847026824951, "logits/rejected": -0.7924209833145142, "logps/chosen": -860.1814575195312, "logps/rejected": -2008.0902099609375, "loss": 0.1263, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.09493857622146606, "rewards/margins": 0.3988199830055237, "rewards/rejected": -0.49375852942466736, "step": 1700 }, { "epoch": 0.91, "learning_rate": 1.1709216179442817e-07, "logits/chosen": -1.3747029304504395, "logits/rejected": -0.3935699760913849, "logps/chosen": -706.3656005859375, "logps/rejected": -1836.0609130859375, "loss": 0.1349, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.062370091676712036, "rewards/margins": 0.4179624915122986, "rewards/rejected": -0.480332612991333, "step": 1710 }, { "epoch": 0.92, "learning_rate": 1.0342500603986421e-07, "logits/chosen": -1.3793084621429443, "logits/rejected": -0.1095462292432785, "logps/chosen": -771.80810546875, "logps/rejected": -1763.7025146484375, "loss": 0.1259, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.061395853757858276, "rewards/margins": 0.4151260256767273, "rewards/rejected": -0.47652187943458557, "step": 1720 }, { "epoch": 0.92, "learning_rate": 9.058894027791643e-08, "logits/chosen": -1.151395559310913, "logits/rejected": 0.09823288023471832, "logps/chosen": -677.2811889648438, "logps/rejected": -1689.9427490234375, "loss": 0.1347, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.054931480437517166, "rewards/margins": 0.3742934763431549, "rewards/rejected": -0.4292249083518982, "step": 1730 }, { "epoch": 0.93, "learning_rate": 7.858841583008592e-08, "logits/chosen": -1.3676621913909912, "logits/rejected": -0.6429020166397095, "logps/chosen": -617.450927734375, "logps/rejected": -1714.8131103515625, "loss": 0.124, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.05651504918932915, "rewards/margins": 0.4177074432373047, "rewards/rejected": -0.4742225110530853, "step": 1740 }, { "epoch": 0.93, "learning_rate": 6.742759426686313e-08, "logits/chosen": -1.183172583580017, "logits/rejected": -0.5100821256637573, "logps/chosen": -752.4456787109375, "logps/rejected": -1788.185302734375, "loss": 0.1374, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.09715384244918823, "rewards/margins": 0.3738781809806824, "rewards/rejected": -0.471032053232193, "step": 1750 }, { "epoch": 0.94, "learning_rate": 5.7110345964571104e-08, "logits/chosen": -1.42495596408844, "logits/rejected": -0.7022902369499207, "logps/chosen": -604.3352661132812, "logps/rejected": -1618.843017578125, "loss": 0.1543, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.03320794552564621, "rewards/margins": 0.35301321744918823, "rewards/rejected": -0.38622111082077026, "step": 1760 }, { "epoch": 0.94, "learning_rate": 4.764024876318357e-08, "logits/chosen": -1.4258668422698975, "logits/rejected": -0.1909623146057129, "logps/chosen": -670.0120849609375, "logps/rejected": -1743.475830078125, "loss": 0.077, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.05731179565191269, "rewards/margins": 0.4044620394706726, "rewards/rejected": -0.4617738723754883, "step": 1770 }, { "epoch": 0.95, "learning_rate": 3.902058672559633e-08, "logits/chosen": -1.355021595954895, "logits/rejected": -0.29654452204704285, "logps/chosen": -578.4010009765625, "logps/rejected": -1676.61328125, "loss": 0.175, "rewards/accuracies": 0.875, "rewards/chosen": -0.03146610036492348, "rewards/margins": 0.42508259415626526, "rewards/rejected": -0.45654863119125366, "step": 1780 }, { "epoch": 0.95, "learning_rate": 3.125434899876933e-08, "logits/chosen": -1.3393023014068604, "logits/rejected": -0.4936142861843109, "logps/chosen": -564.572509765625, "logps/rejected": -1514.8624267578125, "loss": 0.1283, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.05059174820780754, "rewards/margins": 0.33098191022872925, "rewards/rejected": -0.3815736770629883, "step": 1790 }, { "epoch": 0.96, "learning_rate": 2.4344228777145873e-08, "logits/chosen": -1.2701542377471924, "logits/rejected": -0.29851651191711426, "logps/chosen": -603.8160400390625, "logps/rejected": -1694.4417724609375, "loss": 0.15, "rewards/accuracies": 0.875, "rewards/chosen": -0.03015061281621456, "rewards/margins": 0.3750046491622925, "rewards/rejected": -0.4051552712917328, "step": 1800 }, { "epoch": 0.97, "learning_rate": 1.829262236869772e-08, "logits/chosen": -1.1179814338684082, "logits/rejected": -0.27846160531044006, "logps/chosen": -722.6538696289062, "logps/rejected": -1610.6104736328125, "loss": 0.153, "rewards/accuracies": 0.75, "rewards/chosen": -0.06969080120325089, "rewards/margins": 0.3620641827583313, "rewards/rejected": -0.4317549169063568, "step": 1810 }, { "epoch": 0.97, "learning_rate": 1.3101628363929586e-08, "logits/chosen": -1.2972911596298218, "logits/rejected": -0.3146804869174957, "logps/chosen": -702.9611206054688, "logps/rejected": -1858.8841552734375, "loss": 0.1325, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.0757565051317215, "rewards/margins": 0.4196552634239197, "rewards/rejected": -0.4954117238521576, "step": 1820 }, { "epoch": 0.98, "learning_rate": 8.773046908123195e-09, "logits/chosen": -1.2295328378677368, "logits/rejected": -0.45538797974586487, "logps/chosen": -609.9122314453125, "logps/rejected": -1732.151611328125, "loss": 0.0956, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.047846533358097076, "rewards/margins": 0.40931883454322815, "rewards/rejected": -0.457165390253067, "step": 1830 }, { "epoch": 0.98, "learning_rate": 5.308379077080817e-09, "logits/chosen": -1.3349180221557617, "logits/rejected": -0.37136349081993103, "logps/chosen": -741.32763671875, "logps/rejected": -1853.3609619140625, "loss": 0.1685, "rewards/accuracies": 0.875, "rewards/chosen": -0.10401256382465363, "rewards/margins": 0.3636459708213806, "rewards/rejected": -0.4676585793495178, "step": 1840 }, { "epoch": 0.99, "learning_rate": 2.7088263565760996e-09, "logits/chosen": -1.2106130123138428, "logits/rejected": -0.03263585641980171, "logps/chosen": -627.7078857421875, "logps/rejected": -1669.301025390625, "loss": 0.167, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.06294982880353928, "rewards/margins": 0.3623204827308655, "rewards/rejected": -0.4252702593803406, "step": 1850 }, { "epoch": 0.99, "learning_rate": 9.752902257023633e-10, "logits/chosen": -0.9112818837165833, "logits/rejected": -0.16677840054035187, "logps/chosen": -745.0077514648438, "logps/rejected": -1581.1097412109375, "loss": 0.1235, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10941288620233536, "rewards/margins": 0.2992158532142639, "rewards/rejected": -0.40862876176834106, "step": 1860 }, { "epoch": 1.0, "learning_rate": 1.083718442532189e-10, "logits/chosen": -1.027451992034912, "logits/rejected": 0.37703937292099, "logps/chosen": -782.1871337890625, "logps/rejected": -1805.238037109375, "loss": 0.1393, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11385799944400787, "rewards/margins": 0.39328035712242126, "rewards/rejected": -0.5071383714675903, "step": 1870 }, { "epoch": 1.0, "step": 1875, "total_flos": 0.0, "train_loss": 0.17237209237416584, "train_runtime": 10470.2127, "train_samples_per_second": 1.433, "train_steps_per_second": 0.179 } ], "logging_steps": 10, "max_steps": 1875, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }