{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999297541394882, "eval_steps": 400, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002676032781401572, "grad_norm": 13.877074032368714, "learning_rate": 8.9126559714795e-09, "logits/chosen": -0.07003673166036606, "logits/rejected": 0.1360929310321808, "logps/chosen": -1.7158482074737549, "logps/rejected": -1.8893811702728271, "loss": 1.7158, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.7158482074737549, "rewards/margins": 0.17353308200836182, "rewards/rejected": -1.8893811702728271, "step": 5 }, { "epoch": 0.005352065562803144, "grad_norm": 26.688365008041714, "learning_rate": 1.7825311942959e-08, "logits/chosen": 0.009085097350180149, "logits/rejected": 0.12916086614131927, "logps/chosen": -1.802496314048767, "logps/rejected": -1.8462358713150024, "loss": 1.8025, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.802496314048767, "rewards/margins": 0.043739236891269684, "rewards/rejected": -1.8462358713150024, "step": 10 }, { "epoch": 0.008028098344204716, "grad_norm": 25.038336781178028, "learning_rate": 2.67379679144385e-08, "logits/chosen": -0.022359276190400124, "logits/rejected": 0.07573962211608887, "logps/chosen": -1.634742021560669, "logps/rejected": -1.7647755146026611, "loss": 1.6347, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.634742021560669, "rewards/margins": 0.1300334334373474, "rewards/rejected": -1.7647755146026611, "step": 15 }, { "epoch": 0.010704131125606288, "grad_norm": 15.665415570561908, "learning_rate": 3.5650623885918e-08, "logits/chosen": -0.027004677802324295, "logits/rejected": 0.05891682952642441, "logps/chosen": -1.7257192134857178, "logps/rejected": -1.8068325519561768, "loss": 1.7257, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.7257192134857178, "rewards/margins": 0.08111341297626495, "rewards/rejected": -1.8068325519561768, "step": 20 }, { "epoch": 0.013380163907007862, "grad_norm": 32.92088881343983, "learning_rate": 4.45632798573975e-08, "logits/chosen": -0.04839114099740982, "logits/rejected": 0.0362510085105896, "logps/chosen": -1.8681186437606812, "logps/rejected": -1.7786155939102173, "loss": 1.8681, "rewards/accuracies": 0.375, "rewards/chosen": -1.8681186437606812, "rewards/margins": -0.0895029604434967, "rewards/rejected": -1.7786155939102173, "step": 25 }, { "epoch": 0.016056196688409432, "grad_norm": 29.30033211825193, "learning_rate": 5.3475935828877e-08, "logits/chosen": -0.09242594242095947, "logits/rejected": -0.0012742519611492753, "logps/chosen": -1.9084527492523193, "logps/rejected": -1.8328012228012085, "loss": 1.9085, "rewards/accuracies": 0.4375, "rewards/chosen": -1.9084527492523193, "rewards/margins": -0.07565152645111084, "rewards/rejected": -1.8328012228012085, "step": 30 }, { "epoch": 0.018732229469811006, "grad_norm": 22.818341930000393, "learning_rate": 6.23885918003565e-08, "logits/chosen": -0.04134119674563408, "logits/rejected": 0.1202901229262352, "logps/chosen": -1.84597909450531, "logps/rejected": -1.9966322183609009, "loss": 1.846, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.84597909450531, "rewards/margins": 0.15065297484397888, "rewards/rejected": -1.9966322183609009, "step": 35 }, { "epoch": 0.021408262251212576, "grad_norm": 28.57410938668619, "learning_rate": 7.1301247771836e-08, "logits/chosen": 0.0422447994351387, "logits/rejected": 0.21637730300426483, "logps/chosen": -1.878265142440796, "logps/rejected": -1.7413383722305298, "loss": 1.8783, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.878265142440796, "rewards/margins": -0.13692685961723328, "rewards/rejected": -1.7413383722305298, "step": 40 }, { "epoch": 0.02408429503261415, "grad_norm": 26.628886397692057, "learning_rate": 8.021390374331551e-08, "logits/chosen": 0.02598525583744049, "logits/rejected": 0.22334837913513184, "logps/chosen": -1.8331044912338257, "logps/rejected": -1.8683526515960693, "loss": 1.8331, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.8331044912338257, "rewards/margins": 0.03524838760495186, "rewards/rejected": -1.8683526515960693, "step": 45 }, { "epoch": 0.026760327814015723, "grad_norm": 30.75542553272999, "learning_rate": 8.9126559714795e-08, "logits/chosen": -0.040215782821178436, "logits/rejected": 0.10817620903253555, "logps/chosen": -1.8912649154663086, "logps/rejected": -1.772831678390503, "loss": 1.8913, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.8912649154663086, "rewards/margins": -0.11843331903219223, "rewards/rejected": -1.772831678390503, "step": 50 }, { "epoch": 0.029436360595417294, "grad_norm": 24.898534668361208, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.09529398381710052, "logits/rejected": 0.1271449625492096, "logps/chosen": -1.8221250772476196, "logps/rejected": -1.8581008911132812, "loss": 1.8221, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8221250772476196, "rewards/margins": 0.03597600385546684, "rewards/rejected": -1.8581008911132812, "step": 55 }, { "epoch": 0.032112393376818864, "grad_norm": 28.56766481311408, "learning_rate": 1.06951871657754e-07, "logits/chosen": -0.08029714971780777, "logits/rejected": 0.10841427743434906, "logps/chosen": -1.7789138555526733, "logps/rejected": -1.8844283819198608, "loss": 1.7789, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.7789138555526733, "rewards/margins": 0.10551442950963974, "rewards/rejected": -1.8844283819198608, "step": 60 }, { "epoch": 0.03478842615822044, "grad_norm": 26.85765689893232, "learning_rate": 1.158645276292335e-07, "logits/chosen": -0.03247509151697159, "logits/rejected": 0.1137252077460289, "logps/chosen": -1.6273759603500366, "logps/rejected": -1.7559734582901, "loss": 1.6274, "rewards/accuracies": 0.53125, "rewards/chosen": -1.6273759603500366, "rewards/margins": 0.1285974532365799, "rewards/rejected": -1.7559734582901, "step": 65 }, { "epoch": 0.03746445893962201, "grad_norm": 29.14076255553297, "learning_rate": 1.24777183600713e-07, "logits/chosen": -0.06084463745355606, "logits/rejected": 0.0928875282406807, "logps/chosen": -1.7509572505950928, "logps/rejected": -1.7954699993133545, "loss": 1.751, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -1.7509572505950928, "rewards/margins": 0.04451284557580948, "rewards/rejected": -1.7954699993133545, "step": 70 }, { "epoch": 0.04014049172102358, "grad_norm": 22.99651063732658, "learning_rate": 1.3368983957219251e-07, "logits/chosen": -0.06009561941027641, "logits/rejected": 0.11752082407474518, "logps/chosen": -1.7421964406967163, "logps/rejected": -1.997208595275879, "loss": 1.7422, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.7421964406967163, "rewards/margins": 0.2550122141838074, "rewards/rejected": -1.997208595275879, "step": 75 }, { "epoch": 0.04281652450242515, "grad_norm": 21.48262289418791, "learning_rate": 1.42602495543672e-07, "logits/chosen": 4.8729776608524844e-05, "logits/rejected": 0.10217728465795517, "logps/chosen": -1.6768267154693604, "logps/rejected": -1.711584448814392, "loss": 1.6768, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.6768267154693604, "rewards/margins": 0.03475777432322502, "rewards/rejected": -1.711584448814392, "step": 80 }, { "epoch": 0.04549255728382673, "grad_norm": 16.997902386991708, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -0.16405907273292542, "logits/rejected": 0.07780761271715164, "logps/chosen": -1.7426893711090088, "logps/rejected": -1.9093055725097656, "loss": 1.7427, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.7426893711090088, "rewards/margins": 0.1666160374879837, "rewards/rejected": -1.9093055725097656, "step": 85 }, { "epoch": 0.0481685900652283, "grad_norm": 27.913333052583468, "learning_rate": 1.6042780748663102e-07, "logits/chosen": 0.07619290053844452, "logits/rejected": 0.04206910729408264, "logps/chosen": -1.6917682886123657, "logps/rejected": -1.7319209575653076, "loss": 1.6918, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.6917682886123657, "rewards/margins": 0.040152657777071, "rewards/rejected": -1.7319209575653076, "step": 90 }, { "epoch": 0.05084462284662987, "grad_norm": 33.63182602787541, "learning_rate": 1.693404634581105e-07, "logits/chosen": -0.09333391487598419, "logits/rejected": 0.049796827137470245, "logps/chosen": -1.7271881103515625, "logps/rejected": -1.851801872253418, "loss": 1.7272, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.7271881103515625, "rewards/margins": 0.12461371719837189, "rewards/rejected": -1.851801872253418, "step": 95 }, { "epoch": 0.05352065562803145, "grad_norm": 16.440223876821317, "learning_rate": 1.7825311942959e-07, "logits/chosen": -0.06018573045730591, "logits/rejected": 0.0011752948630601168, "logps/chosen": -1.5971167087554932, "logps/rejected": -1.7004066705703735, "loss": 1.5971, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.5971167087554932, "rewards/margins": 0.10329004377126694, "rewards/rejected": -1.7004066705703735, "step": 100 }, { "epoch": 0.05619668840943302, "grad_norm": 23.391597905016095, "learning_rate": 1.8716577540106952e-07, "logits/chosen": 0.04256182909011841, "logits/rejected": 0.06666605174541473, "logps/chosen": -1.519822597503662, "logps/rejected": -1.687696099281311, "loss": 1.5198, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.519822597503662, "rewards/margins": 0.16787350177764893, "rewards/rejected": -1.687696099281311, "step": 105 }, { "epoch": 0.05887272119083459, "grad_norm": 22.99918599445662, "learning_rate": 1.96078431372549e-07, "logits/chosen": -0.018882427364587784, "logits/rejected": 0.07456149160861969, "logps/chosen": -1.5076581239700317, "logps/rejected": -1.5687804222106934, "loss": 1.5077, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5076581239700317, "rewards/margins": 0.061122190207242966, "rewards/rejected": -1.5687804222106934, "step": 110 }, { "epoch": 0.06154875397223616, "grad_norm": 24.231384496473968, "learning_rate": 2.049910873440285e-07, "logits/chosen": 0.017508655786514282, "logits/rejected": 0.22213265299797058, "logps/chosen": -1.5098849534988403, "logps/rejected": -1.7426191568374634, "loss": 1.5099, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5098849534988403, "rewards/margins": 0.23273427784442902, "rewards/rejected": -1.7426191568374634, "step": 115 }, { "epoch": 0.06422478675363773, "grad_norm": 26.85965409557547, "learning_rate": 2.13903743315508e-07, "logits/chosen": -0.1017683893442154, "logits/rejected": 0.06346691399812698, "logps/chosen": -1.5396220684051514, "logps/rejected": -1.6413103342056274, "loss": 1.5396, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.5396220684051514, "rewards/margins": 0.10168834775686264, "rewards/rejected": -1.6413103342056274, "step": 120 }, { "epoch": 0.0669008195350393, "grad_norm": 13.644850214873141, "learning_rate": 2.2281639928698751e-07, "logits/chosen": -0.10048311948776245, "logits/rejected": 0.02676692046225071, "logps/chosen": -1.4938478469848633, "logps/rejected": -1.4698207378387451, "loss": 1.4938, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.4938478469848633, "rewards/margins": -0.02402721904218197, "rewards/rejected": -1.4698207378387451, "step": 125 }, { "epoch": 0.06957685231644088, "grad_norm": 31.577688896942636, "learning_rate": 2.31729055258467e-07, "logits/chosen": 0.0158475823700428, "logits/rejected": 0.1465541571378708, "logps/chosen": -1.5026793479919434, "logps/rejected": -1.6143089532852173, "loss": 1.5027, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.5026793479919434, "rewards/margins": 0.11162944883108139, "rewards/rejected": -1.6143089532852173, "step": 130 }, { "epoch": 0.07225288509784245, "grad_norm": 17.60750148017886, "learning_rate": 2.406417112299465e-07, "logits/chosen": -0.08708186447620392, "logits/rejected": 0.025114428251981735, "logps/chosen": -1.496269941329956, "logps/rejected": -1.5489060878753662, "loss": 1.4963, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.496269941329956, "rewards/margins": 0.05263596773147583, "rewards/rejected": -1.5489060878753662, "step": 135 }, { "epoch": 0.07492891787924402, "grad_norm": 10.21004547956768, "learning_rate": 2.49554367201426e-07, "logits/chosen": -0.09615747630596161, "logits/rejected": 0.055442679673433304, "logps/chosen": -1.4498670101165771, "logps/rejected": -1.505812168121338, "loss": 1.4499, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.4498670101165771, "rewards/margins": 0.05594513565301895, "rewards/rejected": -1.505812168121338, "step": 140 }, { "epoch": 0.0776049506606456, "grad_norm": 14.565766048557224, "learning_rate": 2.5846702317290554e-07, "logits/chosen": -0.09362263232469559, "logits/rejected": 0.04528983309864998, "logps/chosen": -1.3516747951507568, "logps/rejected": -1.4457252025604248, "loss": 1.3517, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3516747951507568, "rewards/margins": 0.09405046701431274, "rewards/rejected": -1.4457252025604248, "step": 145 }, { "epoch": 0.08028098344204716, "grad_norm": 9.083612085610536, "learning_rate": 2.6737967914438503e-07, "logits/chosen": -0.12748458981513977, "logits/rejected": 0.015873895958065987, "logps/chosen": -1.2856031656265259, "logps/rejected": -1.2975358963012695, "loss": 1.2856, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2856031656265259, "rewards/margins": 0.011932793073356152, "rewards/rejected": -1.2975358963012695, "step": 150 }, { "epoch": 0.08295701622344874, "grad_norm": 10.321433869303315, "learning_rate": 2.762923351158645e-07, "logits/chosen": -0.12852779030799866, "logits/rejected": -0.08174242824316025, "logps/chosen": -1.300156831741333, "logps/rejected": -1.416930913925171, "loss": 1.3002, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.300156831741333, "rewards/margins": 0.11677402257919312, "rewards/rejected": -1.416930913925171, "step": 155 }, { "epoch": 0.0856330490048503, "grad_norm": 7.346088759816516, "learning_rate": 2.85204991087344e-07, "logits/chosen": -0.20748431980609894, "logits/rejected": -0.08024214208126068, "logps/chosen": -1.385986089706421, "logps/rejected": -1.3663543462753296, "loss": 1.386, "rewards/accuracies": 0.46875, "rewards/chosen": -1.385986089706421, "rewards/margins": -0.019631829112768173, "rewards/rejected": -1.3663543462753296, "step": 160 }, { "epoch": 0.08830908178625188, "grad_norm": 7.460968010012942, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.09018120169639587, "logits/rejected": 0.06941888481378555, "logps/chosen": -1.3056321144104004, "logps/rejected": -1.3763902187347412, "loss": 1.3056, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.3056321144104004, "rewards/margins": 0.07075806707143784, "rewards/rejected": -1.3763902187347412, "step": 165 }, { "epoch": 0.09098511456765346, "grad_norm": 9.613430163876068, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -0.11957720667123795, "logits/rejected": -0.07055063545703888, "logps/chosen": -1.4190304279327393, "logps/rejected": -1.4897412061691284, "loss": 1.419, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.4190304279327393, "rewards/margins": 0.07071077823638916, "rewards/rejected": -1.4897412061691284, "step": 170 }, { "epoch": 0.09366114734905502, "grad_norm": 9.113473158398628, "learning_rate": 3.1194295900178254e-07, "logits/chosen": -0.015224168077111244, "logits/rejected": -0.020561030134558678, "logps/chosen": -1.3062463998794556, "logps/rejected": -1.3919000625610352, "loss": 1.3062, "rewards/accuracies": 0.5, "rewards/chosen": -1.3062463998794556, "rewards/margins": 0.08565353602170944, "rewards/rejected": -1.3919000625610352, "step": 175 }, { "epoch": 0.0963371801304566, "grad_norm": 7.4535875304242385, "learning_rate": 3.2085561497326203e-07, "logits/chosen": -0.04268120601773262, "logits/rejected": -0.04308682680130005, "logps/chosen": -1.3249297142028809, "logps/rejected": -1.5278981924057007, "loss": 1.3249, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3249297142028809, "rewards/margins": 0.2029685080051422, "rewards/rejected": -1.5278981924057007, "step": 180 }, { "epoch": 0.09901321291185818, "grad_norm": 7.861004748563658, "learning_rate": 3.297682709447415e-07, "logits/chosen": -0.17761638760566711, "logits/rejected": -0.09988296031951904, "logps/chosen": -1.3216025829315186, "logps/rejected": -1.3682224750518799, "loss": 1.3216, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3216025829315186, "rewards/margins": 0.04661983251571655, "rewards/rejected": -1.3682224750518799, "step": 185 }, { "epoch": 0.10168924569325974, "grad_norm": 8.106758020816505, "learning_rate": 3.38680926916221e-07, "logits/chosen": -0.08136186748743057, "logits/rejected": 0.029784444719552994, "logps/chosen": -1.2388192415237427, "logps/rejected": -1.3705213069915771, "loss": 1.2388, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2388192415237427, "rewards/margins": 0.1317019909620285, "rewards/rejected": -1.3705213069915771, "step": 190 }, { "epoch": 0.10436527847466132, "grad_norm": 8.31954125754412, "learning_rate": 3.475935828877005e-07, "logits/chosen": -0.011146044358611107, "logits/rejected": 0.13384239375591278, "logps/chosen": -1.2321065664291382, "logps/rejected": -1.39292311668396, "loss": 1.2321, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2321065664291382, "rewards/margins": 0.16081640124320984, "rewards/rejected": -1.39292311668396, "step": 195 }, { "epoch": 0.1070413112560629, "grad_norm": 20.400737511413453, "learning_rate": 3.5650623885918e-07, "logits/chosen": -0.09487776458263397, "logits/rejected": 0.03397997468709946, "logps/chosen": -1.3455091714859009, "logps/rejected": -1.391821265220642, "loss": 1.3455, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3455091714859009, "rewards/margins": 0.04631196707487106, "rewards/rejected": -1.391821265220642, "step": 200 }, { "epoch": 0.10971734403746446, "grad_norm": 12.485998344278803, "learning_rate": 3.654188948306595e-07, "logits/chosen": -0.07200519740581512, "logits/rejected": 0.06073974817991257, "logps/chosen": -1.2566862106323242, "logps/rejected": -1.3336622714996338, "loss": 1.2567, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2566862106323242, "rewards/margins": 0.07697612047195435, "rewards/rejected": -1.3336622714996338, "step": 205 }, { "epoch": 0.11239337681886603, "grad_norm": 9.29504457775058, "learning_rate": 3.7433155080213904e-07, "logits/chosen": -0.1525542438030243, "logits/rejected": 0.023893093690276146, "logps/chosen": -1.3387010097503662, "logps/rejected": -1.4427589178085327, "loss": 1.3387, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3387010097503662, "rewards/margins": 0.1040579304099083, "rewards/rejected": -1.4427589178085327, "step": 210 }, { "epoch": 0.1150694096002676, "grad_norm": 7.19844372891334, "learning_rate": 3.8324420677361853e-07, "logits/chosen": -0.18078181147575378, "logits/rejected": 0.05429055169224739, "logps/chosen": -1.365750789642334, "logps/rejected": -1.4182298183441162, "loss": 1.3658, "rewards/accuracies": 0.53125, "rewards/chosen": -1.365750789642334, "rewards/margins": 0.05247907713055611, "rewards/rejected": -1.4182298183441162, "step": 215 }, { "epoch": 0.11774544238166917, "grad_norm": 17.184198140054523, "learning_rate": 3.92156862745098e-07, "logits/chosen": 0.045220356434583664, "logits/rejected": 0.13736529648303986, "logps/chosen": -1.283913016319275, "logps/rejected": -1.421570062637329, "loss": 1.2839, "rewards/accuracies": 0.5625, "rewards/chosen": -1.283913016319275, "rewards/margins": 0.1376567780971527, "rewards/rejected": -1.421570062637329, "step": 220 }, { "epoch": 0.12042147516307075, "grad_norm": 6.806030998963231, "learning_rate": 4.010695187165775e-07, "logits/chosen": -0.0893852561712265, "logits/rejected": 0.07129459083080292, "logps/chosen": -1.2963159084320068, "logps/rejected": -1.41970694065094, "loss": 1.2963, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2963159084320068, "rewards/margins": 0.1233908087015152, "rewards/rejected": -1.41970694065094, "step": 225 }, { "epoch": 0.12309750794447231, "grad_norm": 7.03813016540103, "learning_rate": 4.09982174688057e-07, "logits/chosen": -0.022974932566285133, "logits/rejected": 0.04702044650912285, "logps/chosen": -1.2921230792999268, "logps/rejected": -1.445629358291626, "loss": 1.2921, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2921230792999268, "rewards/margins": 0.15350636839866638, "rewards/rejected": -1.445629358291626, "step": 230 }, { "epoch": 0.1257735407258739, "grad_norm": 8.050933722900224, "learning_rate": 4.188948306595365e-07, "logits/chosen": 0.009233033284544945, "logits/rejected": 0.13117334246635437, "logps/chosen": -1.2665636539459229, "logps/rejected": -1.4279484748840332, "loss": 1.2666, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2665636539459229, "rewards/margins": 0.1613847017288208, "rewards/rejected": -1.4279484748840332, "step": 235 }, { "epoch": 0.12844957350727546, "grad_norm": 5.888950264201518, "learning_rate": 4.27807486631016e-07, "logits/chosen": -0.02489379420876503, "logits/rejected": 0.09373176842927933, "logps/chosen": -1.2830955982208252, "logps/rejected": -1.4589357376098633, "loss": 1.2831, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2830955982208252, "rewards/margins": 0.17584006488323212, "rewards/rejected": -1.4589357376098633, "step": 240 }, { "epoch": 0.13112560628867703, "grad_norm": 7.795351470327405, "learning_rate": 4.3672014260249554e-07, "logits/chosen": 0.02167614921927452, "logits/rejected": 0.12904587388038635, "logps/chosen": -1.3994932174682617, "logps/rejected": -1.4206799268722534, "loss": 1.3995, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3994932174682617, "rewards/margins": 0.021186893805861473, "rewards/rejected": -1.4206799268722534, "step": 245 }, { "epoch": 0.1338016390700786, "grad_norm": 9.489355692862285, "learning_rate": 4.4563279857397503e-07, "logits/chosen": -0.06139354780316353, "logits/rejected": 0.08913201093673706, "logps/chosen": -1.2768274545669556, "logps/rejected": -1.3327828645706177, "loss": 1.2768, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2768274545669556, "rewards/margins": 0.05595552921295166, "rewards/rejected": -1.3327828645706177, "step": 250 }, { "epoch": 0.1364776718514802, "grad_norm": 7.134851158237539, "learning_rate": 4.545454545454545e-07, "logits/chosen": -0.03687070310115814, "logits/rejected": 0.09339624643325806, "logps/chosen": -1.2503478527069092, "logps/rejected": -1.3490577936172485, "loss": 1.2503, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2503478527069092, "rewards/margins": 0.09870986640453339, "rewards/rejected": -1.3490577936172485, "step": 255 }, { "epoch": 0.13915370463288176, "grad_norm": 7.6770789390928105, "learning_rate": 4.63458110516934e-07, "logits/chosen": -0.23246333003044128, "logits/rejected": -0.13467907905578613, "logps/chosen": -1.3338881731033325, "logps/rejected": -1.4934184551239014, "loss": 1.3339, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3338881731033325, "rewards/margins": 0.15953023731708527, "rewards/rejected": -1.4934184551239014, "step": 260 }, { "epoch": 0.1418297374142833, "grad_norm": 9.575847198025533, "learning_rate": 4.723707664884135e-07, "logits/chosen": -0.08749864995479584, "logits/rejected": -0.010343274101614952, "logps/chosen": -1.3098180294036865, "logps/rejected": -1.4829431772232056, "loss": 1.3098, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3098180294036865, "rewards/margins": 0.17312507331371307, "rewards/rejected": -1.4829431772232056, "step": 265 }, { "epoch": 0.1445057701956849, "grad_norm": 7.774415144807768, "learning_rate": 4.81283422459893e-07, "logits/chosen": -0.0915318951010704, "logits/rejected": 0.027320241555571556, "logps/chosen": -1.3016685247421265, "logps/rejected": -1.3920867443084717, "loss": 1.3017, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3016685247421265, "rewards/margins": 0.0904182717204094, "rewards/rejected": -1.3920867443084717, "step": 270 }, { "epoch": 0.14718180297708647, "grad_norm": 7.942388678056595, "learning_rate": 4.901960784313725e-07, "logits/chosen": -0.038298558443784714, "logits/rejected": 0.05102933198213577, "logps/chosen": -1.2482672929763794, "logps/rejected": -1.3972370624542236, "loss": 1.2483, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2482672929763794, "rewards/margins": 0.14896973967552185, "rewards/rejected": -1.3972370624542236, "step": 275 }, { "epoch": 0.14985783575848804, "grad_norm": 9.15178921616395, "learning_rate": 4.99108734402852e-07, "logits/chosen": -0.087051622569561, "logits/rejected": 0.05738597363233566, "logps/chosen": -1.2919450998306274, "logps/rejected": -1.3905742168426514, "loss": 1.2919, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2919450998306274, "rewards/margins": 0.09862911701202393, "rewards/rejected": -1.3905742168426514, "step": 280 }, { "epoch": 0.15253386853988962, "grad_norm": 6.9963492099931, "learning_rate": 5.080213903743315e-07, "logits/chosen": -0.09403569996356964, "logits/rejected": 0.028331885114312172, "logps/chosen": -1.3331395387649536, "logps/rejected": -1.4057984352111816, "loss": 1.3331, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3331395387649536, "rewards/margins": 0.07265901565551758, "rewards/rejected": -1.4057984352111816, "step": 285 }, { "epoch": 0.1552099013212912, "grad_norm": 6.660403391265759, "learning_rate": 5.169340463458111e-07, "logits/chosen": -0.1353941559791565, "logits/rejected": 0.1352883279323578, "logps/chosen": -1.3498847484588623, "logps/rejected": -1.4572992324829102, "loss": 1.3499, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3498847484588623, "rewards/margins": 0.10741442441940308, "rewards/rejected": -1.4572992324829102, "step": 290 }, { "epoch": 0.15788593410269275, "grad_norm": 9.840932998012113, "learning_rate": 5.258467023172905e-07, "logits/chosen": -0.08095011860132217, "logits/rejected": -0.032910920679569244, "logps/chosen": -1.2481619119644165, "logps/rejected": -1.3819684982299805, "loss": 1.2482, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2481619119644165, "rewards/margins": 0.13380658626556396, "rewards/rejected": -1.3819684982299805, "step": 295 }, { "epoch": 0.16056196688409433, "grad_norm": 7.672376959379642, "learning_rate": 5.347593582887701e-07, "logits/chosen": -0.09300623089075089, "logits/rejected": 0.05453777313232422, "logps/chosen": -1.281254529953003, "logps/rejected": -1.3553723096847534, "loss": 1.2813, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.281254529953003, "rewards/margins": 0.07411777973175049, "rewards/rejected": -1.3553723096847534, "step": 300 }, { "epoch": 0.1632379996654959, "grad_norm": 5.299392818784395, "learning_rate": 5.436720142602496e-07, "logits/chosen": -0.043602801859378815, "logits/rejected": 0.02074519917368889, "logps/chosen": -1.3871448040008545, "logps/rejected": -1.3950326442718506, "loss": 1.3871, "rewards/accuracies": 0.5, "rewards/chosen": -1.3871448040008545, "rewards/margins": 0.007887746207416058, "rewards/rejected": -1.3950326442718506, "step": 305 }, { "epoch": 0.16591403244689748, "grad_norm": 6.76380154372784, "learning_rate": 5.52584670231729e-07, "logits/chosen": -0.22940301895141602, "logits/rejected": -0.1509525030851364, "logps/chosen": -1.35076904296875, "logps/rejected": -1.4272911548614502, "loss": 1.3508, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.35076904296875, "rewards/margins": 0.07652216404676437, "rewards/rejected": -1.4272911548614502, "step": 310 }, { "epoch": 0.16859006522829906, "grad_norm": 8.971997561259, "learning_rate": 5.614973262032086e-07, "logits/chosen": -0.011641120538115501, "logits/rejected": 0.13592663407325745, "logps/chosen": -1.3441739082336426, "logps/rejected": -1.4828517436981201, "loss": 1.3442, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3441739082336426, "rewards/margins": 0.13867780566215515, "rewards/rejected": -1.4828517436981201, "step": 315 }, { "epoch": 0.1712660980097006, "grad_norm": 5.1692998466911915, "learning_rate": 5.70409982174688e-07, "logits/chosen": -0.07499165832996368, "logits/rejected": 0.04556746408343315, "logps/chosen": -1.3048075437545776, "logps/rejected": -1.3530696630477905, "loss": 1.3048, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3048075437545776, "rewards/margins": 0.04826224967837334, "rewards/rejected": -1.3530696630477905, "step": 320 }, { "epoch": 0.17394213079110218, "grad_norm": 8.423417843987572, "learning_rate": 5.793226381461676e-07, "logits/chosen": -0.13727346062660217, "logits/rejected": -0.03581923618912697, "logps/chosen": -1.2965418100357056, "logps/rejected": -1.5266648530960083, "loss": 1.2965, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2965418100357056, "rewards/margins": 0.23012292385101318, "rewards/rejected": -1.5266648530960083, "step": 325 }, { "epoch": 0.17661816357250376, "grad_norm": 5.750552391383174, "learning_rate": 5.88235294117647e-07, "logits/chosen": -0.021421056240797043, "logits/rejected": 0.11484203487634659, "logps/chosen": -1.3183469772338867, "logps/rejected": -1.4761561155319214, "loss": 1.3183, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3183469772338867, "rewards/margins": 0.15780894458293915, "rewards/rejected": -1.4761561155319214, "step": 330 }, { "epoch": 0.17929419635390534, "grad_norm": 9.953554506406007, "learning_rate": 5.971479500891266e-07, "logits/chosen": 0.0065179141238331795, "logits/rejected": 0.09631967544555664, "logps/chosen": -1.3238192796707153, "logps/rejected": -1.3529859781265259, "loss": 1.3238, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3238192796707153, "rewards/margins": 0.02916662022471428, "rewards/rejected": -1.3529859781265259, "step": 335 }, { "epoch": 0.18197022913530692, "grad_norm": 9.004114968696026, "learning_rate": 6.060606060606061e-07, "logits/chosen": -0.04617312550544739, "logits/rejected": 0.0823989063501358, "logps/chosen": -1.3793723583221436, "logps/rejected": -1.457774043083191, "loss": 1.3794, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3793723583221436, "rewards/margins": 0.07840146124362946, "rewards/rejected": -1.457774043083191, "step": 340 }, { "epoch": 0.1846462619167085, "grad_norm": 7.427110379929363, "learning_rate": 6.149732620320855e-07, "logits/chosen": 0.037537943571805954, "logits/rejected": 0.06238919496536255, "logps/chosen": -1.2823779582977295, "logps/rejected": -1.4166972637176514, "loss": 1.2824, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2823779582977295, "rewards/margins": 0.13431932032108307, "rewards/rejected": -1.4166972637176514, "step": 345 }, { "epoch": 0.18732229469811004, "grad_norm": 8.43182387693383, "learning_rate": 6.238859180035651e-07, "logits/chosen": 0.018564339727163315, "logits/rejected": 0.10249503701925278, "logps/chosen": -1.264279842376709, "logps/rejected": -1.38966965675354, "loss": 1.2643, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.264279842376709, "rewards/margins": 0.12538984417915344, "rewards/rejected": -1.38966965675354, "step": 350 }, { "epoch": 0.18999832747951162, "grad_norm": 7.722730030073891, "learning_rate": 6.327985739750445e-07, "logits/chosen": -0.08012780547142029, "logits/rejected": 0.12899991869926453, "logps/chosen": -1.3699625730514526, "logps/rejected": -1.3968147039413452, "loss": 1.37, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3699625730514526, "rewards/margins": 0.026852110400795937, "rewards/rejected": -1.3968147039413452, "step": 355 }, { "epoch": 0.1926743602609132, "grad_norm": 8.942240294467505, "learning_rate": 6.417112299465241e-07, "logits/chosen": -0.061219774186611176, "logits/rejected": 0.009085068479180336, "logps/chosen": -1.2970527410507202, "logps/rejected": -1.415627360343933, "loss": 1.2971, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2970527410507202, "rewards/margins": 0.11857465654611588, "rewards/rejected": -1.415627360343933, "step": 360 }, { "epoch": 0.19535039304231477, "grad_norm": 8.083028425722574, "learning_rate": 6.506238859180035e-07, "logits/chosen": 0.0008977234247140586, "logits/rejected": 0.07092685997486115, "logps/chosen": -1.2770949602127075, "logps/rejected": -1.3818106651306152, "loss": 1.2771, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2770949602127075, "rewards/margins": 0.10471560806035995, "rewards/rejected": -1.3818106651306152, "step": 365 }, { "epoch": 0.19802642582371635, "grad_norm": 5.419938098221415, "learning_rate": 6.59536541889483e-07, "logits/chosen": -0.0014151900541037321, "logits/rejected": 0.08144277334213257, "logps/chosen": -1.2733829021453857, "logps/rejected": -1.3123538494110107, "loss": 1.2734, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2733829021453857, "rewards/margins": 0.03897082805633545, "rewards/rejected": -1.3123538494110107, "step": 370 }, { "epoch": 0.2007024586051179, "grad_norm": 5.9063450749536885, "learning_rate": 6.684491978609626e-07, "logits/chosen": -0.04876967892050743, "logits/rejected": 0.09632103145122528, "logps/chosen": -1.2545446157455444, "logps/rejected": -1.3884865045547485, "loss": 1.2545, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2545446157455444, "rewards/margins": 0.13394179940223694, "rewards/rejected": -1.3884865045547485, "step": 375 }, { "epoch": 0.20337849138651948, "grad_norm": 7.174280522631382, "learning_rate": 6.77361853832442e-07, "logits/chosen": -0.022835582494735718, "logits/rejected": 0.05138144642114639, "logps/chosen": -1.2716599702835083, "logps/rejected": -1.4244883060455322, "loss": 1.2717, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2716599702835083, "rewards/margins": 0.1528284102678299, "rewards/rejected": -1.4244883060455322, "step": 380 }, { "epoch": 0.20605452416792105, "grad_norm": 5.777574972974421, "learning_rate": 6.862745098039216e-07, "logits/chosen": 0.040655769407749176, "logits/rejected": 0.11681176722049713, "logps/chosen": -1.3768361806869507, "logps/rejected": -1.358905553817749, "loss": 1.3768, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3768361806869507, "rewards/margins": -0.017930690199136734, "rewards/rejected": -1.358905553817749, "step": 385 }, { "epoch": 0.20873055694932263, "grad_norm": 8.989319251787503, "learning_rate": 6.95187165775401e-07, "logits/chosen": 0.07632546126842499, "logits/rejected": 0.2360633909702301, "logps/chosen": -1.3573204278945923, "logps/rejected": -1.415759801864624, "loss": 1.3573, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3573204278945923, "rewards/margins": 0.05843930318951607, "rewards/rejected": -1.415759801864624, "step": 390 }, { "epoch": 0.2114065897307242, "grad_norm": 6.374214069648872, "learning_rate": 7.040998217468806e-07, "logits/chosen": -0.03284325823187828, "logits/rejected": 0.11839799582958221, "logps/chosen": -1.307308316230774, "logps/rejected": -1.3137998580932617, "loss": 1.3073, "rewards/accuracies": 0.5, "rewards/chosen": -1.307308316230774, "rewards/margins": 0.006491594947874546, "rewards/rejected": -1.3137998580932617, "step": 395 }, { "epoch": 0.2140826225121258, "grad_norm": 5.529059609819132, "learning_rate": 7.1301247771836e-07, "logits/chosen": 0.09079950302839279, "logits/rejected": 0.17978014051914215, "logps/chosen": -1.2908601760864258, "logps/rejected": -1.3756639957427979, "loss": 1.2909, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2908601760864258, "rewards/margins": 0.08480380475521088, "rewards/rejected": -1.3756639957427979, "step": 400 }, { "epoch": 0.2140826225121258, "eval_logits/chosen": 0.2751438617706299, "eval_logits/rejected": 0.35816526412963867, "eval_logps/chosen": -1.3229039907455444, "eval_logps/rejected": -1.4422959089279175, "eval_loss": 1.32322096824646, "eval_rewards/accuracies": 0.5548961162567139, "eval_rewards/chosen": -1.3229039907455444, "eval_rewards/margins": 0.11939182877540588, "eval_rewards/rejected": -1.4422959089279175, "eval_runtime": 40.9346, "eval_samples_per_second": 32.857, "eval_steps_per_second": 8.233, "step": 400 }, { "epoch": 0.21675865529352734, "grad_norm": 7.904302844958224, "learning_rate": 7.219251336898395e-07, "logits/chosen": 0.009388813748955727, "logits/rejected": 0.09948752820491791, "logps/chosen": -1.2882788181304932, "logps/rejected": -1.3506300449371338, "loss": 1.2883, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2882788181304932, "rewards/margins": 0.0623510368168354, "rewards/rejected": -1.3506300449371338, "step": 405 }, { "epoch": 0.2194346880749289, "grad_norm": 6.31455386641144, "learning_rate": 7.30837789661319e-07, "logits/chosen": 0.03647914528846741, "logits/rejected": 0.16083259880542755, "logps/chosen": -1.2622774839401245, "logps/rejected": -1.3449453115463257, "loss": 1.2623, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2622774839401245, "rewards/margins": 0.0826677680015564, "rewards/rejected": -1.3449453115463257, "step": 410 }, { "epoch": 0.2221107208563305, "grad_norm": 5.329681776445978, "learning_rate": 7.397504456327985e-07, "logits/chosen": 0.014023147523403168, "logits/rejected": 0.04526161029934883, "logps/chosen": -1.2626899480819702, "logps/rejected": -1.415295124053955, "loss": 1.2627, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2626899480819702, "rewards/margins": 0.15260501205921173, "rewards/rejected": -1.415295124053955, "step": 415 }, { "epoch": 0.22478675363773207, "grad_norm": 5.681650537156772, "learning_rate": 7.486631016042781e-07, "logits/chosen": -0.0212759580463171, "logits/rejected": 0.15063078701496124, "logps/chosen": -1.2497800588607788, "logps/rejected": -1.3348307609558105, "loss": 1.2498, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2497800588607788, "rewards/margins": 0.085050567984581, "rewards/rejected": -1.3348307609558105, "step": 420 }, { "epoch": 0.22746278641913364, "grad_norm": 5.435984549539804, "learning_rate": 7.575757575757575e-07, "logits/chosen": -0.04126739501953125, "logits/rejected": 0.14835788309574127, "logps/chosen": -1.2885547876358032, "logps/rejected": -1.4358973503112793, "loss": 1.2886, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2885547876358032, "rewards/margins": 0.147342711687088, "rewards/rejected": -1.4358973503112793, "step": 425 }, { "epoch": 0.2301388192005352, "grad_norm": 6.415426386813661, "learning_rate": 7.664884135472371e-07, "logits/chosen": -0.06486807018518448, "logits/rejected": 0.11717679351568222, "logps/chosen": -1.3271749019622803, "logps/rejected": -1.435565710067749, "loss": 1.3272, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3271749019622803, "rewards/margins": 0.10839088261127472, "rewards/rejected": -1.435565710067749, "step": 430 }, { "epoch": 0.23281485198193677, "grad_norm": 8.388479044034469, "learning_rate": 7.754010695187165e-07, "logits/chosen": 0.00952795147895813, "logits/rejected": 0.09027308225631714, "logps/chosen": -1.189346432685852, "logps/rejected": -1.3179917335510254, "loss": 1.1893, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.189346432685852, "rewards/margins": 0.1286453753709793, "rewards/rejected": -1.3179917335510254, "step": 435 }, { "epoch": 0.23549088476333835, "grad_norm": 5.769285202818066, "learning_rate": 7.84313725490196e-07, "logits/chosen": 0.011318689212203026, "logits/rejected": 0.09357856959104538, "logps/chosen": -1.269335389137268, "logps/rejected": -1.3506667613983154, "loss": 1.2693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.269335389137268, "rewards/margins": 0.08133126050233841, "rewards/rejected": -1.3506667613983154, "step": 440 }, { "epoch": 0.23816691754473993, "grad_norm": 6.3873421726868935, "learning_rate": 7.932263814616755e-07, "logits/chosen": -0.03760265186429024, "logits/rejected": 0.06436518579721451, "logps/chosen": -1.2952880859375, "logps/rejected": -1.4144254922866821, "loss": 1.2953, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2952880859375, "rewards/margins": 0.119137242436409, "rewards/rejected": -1.4144254922866821, "step": 445 }, { "epoch": 0.2408429503261415, "grad_norm": 9.503508122180477, "learning_rate": 8.02139037433155e-07, "logits/chosen": 0.03547479212284088, "logits/rejected": 0.1539306938648224, "logps/chosen": -1.2753428220748901, "logps/rejected": -1.410194993019104, "loss": 1.2753, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2753428220748901, "rewards/margins": 0.13485196232795715, "rewards/rejected": -1.410194993019104, "step": 450 }, { "epoch": 0.24351898310754308, "grad_norm": 7.369610373327497, "learning_rate": 8.110516934046346e-07, "logits/chosen": 0.04161831736564636, "logits/rejected": 0.12756311893463135, "logps/chosen": -1.2255194187164307, "logps/rejected": -1.3936083316802979, "loss": 1.2255, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2255194187164307, "rewards/margins": 0.16808901727199554, "rewards/rejected": -1.3936083316802979, "step": 455 }, { "epoch": 0.24619501588894463, "grad_norm": 7.260792563486717, "learning_rate": 8.19964349376114e-07, "logits/chosen": -0.0774279236793518, "logits/rejected": 0.0425356850028038, "logps/chosen": -1.350956678390503, "logps/rejected": -1.3944776058197021, "loss": 1.351, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.350956678390503, "rewards/margins": 0.04352092370390892, "rewards/rejected": -1.3944776058197021, "step": 460 }, { "epoch": 0.2488710486703462, "grad_norm": 7.267434141647265, "learning_rate": 8.288770053475936e-07, "logits/chosen": 0.15314233303070068, "logits/rejected": 0.1659504622220993, "logps/chosen": -1.251481294631958, "logps/rejected": -1.4039957523345947, "loss": 1.2515, "rewards/accuracies": 0.5625, "rewards/chosen": -1.251481294631958, "rewards/margins": 0.15251459181308746, "rewards/rejected": -1.4039957523345947, "step": 465 }, { "epoch": 0.2515470814517478, "grad_norm": 7.690257409673891, "learning_rate": 8.37789661319073e-07, "logits/chosen": 0.1611669957637787, "logits/rejected": 0.11271514743566513, "logps/chosen": -1.2085187435150146, "logps/rejected": -1.3847599029541016, "loss": 1.2085, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2085187435150146, "rewards/margins": 0.17624124884605408, "rewards/rejected": -1.3847599029541016, "step": 470 }, { "epoch": 0.25422311423314936, "grad_norm": 5.334997827405659, "learning_rate": 8.467023172905525e-07, "logits/chosen": -0.04921787604689598, "logits/rejected": 0.07611466944217682, "logps/chosen": -1.2834465503692627, "logps/rejected": -1.4538166522979736, "loss": 1.2834, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2834465503692627, "rewards/margins": 0.1703699678182602, "rewards/rejected": -1.4538166522979736, "step": 475 }, { "epoch": 0.2568991470145509, "grad_norm": 7.487604558465863, "learning_rate": 8.55614973262032e-07, "logits/chosen": -0.03445490822196007, "logits/rejected": 0.15312166512012482, "logps/chosen": -1.2471004724502563, "logps/rejected": -1.3064740896224976, "loss": 1.2471, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2471004724502563, "rewards/margins": 0.05937368422746658, "rewards/rejected": -1.3064740896224976, "step": 480 }, { "epoch": 0.2595751797959525, "grad_norm": 8.780771857281671, "learning_rate": 8.645276292335115e-07, "logits/chosen": 0.005399066023528576, "logits/rejected": 0.036442916840314865, "logps/chosen": -1.339504599571228, "logps/rejected": -1.408872365951538, "loss": 1.3395, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.339504599571228, "rewards/margins": 0.06936774402856827, "rewards/rejected": -1.408872365951538, "step": 485 }, { "epoch": 0.26225121257735406, "grad_norm": 7.461453714196461, "learning_rate": 8.734402852049911e-07, "logits/chosen": 0.030108636245131493, "logits/rejected": 0.09481552243232727, "logps/chosen": -1.2826862335205078, "logps/rejected": -1.339806318283081, "loss": 1.2827, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2826862335205078, "rewards/margins": 0.05712023377418518, "rewards/rejected": -1.339806318283081, "step": 490 }, { "epoch": 0.26492724535875567, "grad_norm": 7.073283918502272, "learning_rate": 8.823529411764705e-07, "logits/chosen": -0.034579455852508545, "logits/rejected": -0.017912039533257484, "logps/chosen": -1.2872518301010132, "logps/rejected": -1.3923848867416382, "loss": 1.2873, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2872518301010132, "rewards/margins": 0.10513293743133545, "rewards/rejected": -1.3923848867416382, "step": 495 }, { "epoch": 0.2676032781401572, "grad_norm": 6.396619255940083, "learning_rate": 8.912655971479501e-07, "logits/chosen": -0.03260781615972519, "logits/rejected": 0.060619182884693146, "logps/chosen": -1.20059072971344, "logps/rejected": -1.346470594406128, "loss": 1.2006, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.20059072971344, "rewards/margins": 0.14587993919849396, "rewards/rejected": -1.346470594406128, "step": 500 }, { "epoch": 0.27027931092155877, "grad_norm": 7.118410291644456, "learning_rate": 9.001782531194295e-07, "logits/chosen": -0.04373949021100998, "logits/rejected": 0.0871054008603096, "logps/chosen": -1.3186986446380615, "logps/rejected": -1.3556714057922363, "loss": 1.3187, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3186986446380615, "rewards/margins": 0.036972709000110626, "rewards/rejected": -1.3556714057922363, "step": 505 }, { "epoch": 0.2729553437029604, "grad_norm": 7.294032697839244, "learning_rate": 9.09090909090909e-07, "logits/chosen": 0.0903499498963356, "logits/rejected": 0.1466381549835205, "logps/chosen": -1.2782901525497437, "logps/rejected": -1.4305799007415771, "loss": 1.2783, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2782901525497437, "rewards/margins": 0.15228979289531708, "rewards/rejected": -1.4305799007415771, "step": 510 }, { "epoch": 0.2756313764843619, "grad_norm": 6.922533902914839, "learning_rate": 9.180035650623885e-07, "logits/chosen": 0.04225022345781326, "logits/rejected": 0.1283971220254898, "logps/chosen": -1.2199143171310425, "logps/rejected": -1.3619848489761353, "loss": 1.2199, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2199143171310425, "rewards/margins": 0.14207051694393158, "rewards/rejected": -1.3619848489761353, "step": 515 }, { "epoch": 0.27830740926576353, "grad_norm": 5.599244579473142, "learning_rate": 9.26916221033868e-07, "logits/chosen": -0.06423594802618027, "logits/rejected": 0.06038806587457657, "logps/chosen": -1.2639728784561157, "logps/rejected": -1.3484299182891846, "loss": 1.264, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2639728784561157, "rewards/margins": 0.08445700258016586, "rewards/rejected": -1.3484299182891846, "step": 520 }, { "epoch": 0.2809834420471651, "grad_norm": 10.561094467782869, "learning_rate": 9.358288770053476e-07, "logits/chosen": 0.1416289210319519, "logits/rejected": 0.20395240187644958, "logps/chosen": -1.235233187675476, "logps/rejected": -1.3876780271530151, "loss": 1.2352, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.235233187675476, "rewards/margins": 0.15244479477405548, "rewards/rejected": -1.3876780271530151, "step": 525 }, { "epoch": 0.2836594748285666, "grad_norm": 4.944873940326493, "learning_rate": 9.44741532976827e-07, "logits/chosen": 0.07820623368024826, "logits/rejected": 0.15809687972068787, "logps/chosen": -1.2142632007598877, "logps/rejected": -1.3032758235931396, "loss": 1.2143, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2142632007598877, "rewards/margins": 0.08901272714138031, "rewards/rejected": -1.3032758235931396, "step": 530 }, { "epoch": 0.28633550760996823, "grad_norm": 5.995639984960049, "learning_rate": 9.536541889483066e-07, "logits/chosen": -0.08460856974124908, "logits/rejected": 0.1567671000957489, "logps/chosen": -1.2356255054473877, "logps/rejected": -1.2847849130630493, "loss": 1.2356, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2356255054473877, "rewards/margins": 0.04915950819849968, "rewards/rejected": -1.2847849130630493, "step": 535 }, { "epoch": 0.2890115403913698, "grad_norm": 7.4018164930208545, "learning_rate": 9.62566844919786e-07, "logits/chosen": 0.023123014718294144, "logits/rejected": 0.08488713949918747, "logps/chosen": -1.3596012592315674, "logps/rejected": -1.403643250465393, "loss": 1.3596, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3596012592315674, "rewards/margins": 0.044041894376277924, "rewards/rejected": -1.403643250465393, "step": 540 }, { "epoch": 0.2916875731727714, "grad_norm": 5.335136428484519, "learning_rate": 9.714795008912655e-07, "logits/chosen": -0.08957906812429428, "logits/rejected": 0.09729144722223282, "logps/chosen": -1.2579071521759033, "logps/rejected": -1.3570783138275146, "loss": 1.2579, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2579071521759033, "rewards/margins": 0.09917120635509491, "rewards/rejected": -1.3570783138275146, "step": 545 }, { "epoch": 0.29436360595417294, "grad_norm": 6.937408064770952, "learning_rate": 9.80392156862745e-07, "logits/chosen": 0.041137952357530594, "logits/rejected": 0.0991501659154892, "logps/chosen": -1.2612898349761963, "logps/rejected": -1.3849916458129883, "loss": 1.2613, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2612898349761963, "rewards/margins": 0.12370163202285767, "rewards/rejected": -1.3849916458129883, "step": 550 }, { "epoch": 0.2970396387355745, "grad_norm": 7.868833781340775, "learning_rate": 9.893048128342244e-07, "logits/chosen": -0.07089059054851532, "logits/rejected": 0.04633297771215439, "logps/chosen": -1.3335317373275757, "logps/rejected": -1.3853155374526978, "loss": 1.3335, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3335317373275757, "rewards/margins": 0.051783740520477295, "rewards/rejected": -1.3853155374526978, "step": 555 }, { "epoch": 0.2997156715169761, "grad_norm": 5.394430566009185, "learning_rate": 9.98217468805704e-07, "logits/chosen": 0.0365101620554924, "logits/rejected": 0.045991528779268265, "logps/chosen": -1.1875207424163818, "logps/rejected": -1.3089485168457031, "loss": 1.1875, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1875207424163818, "rewards/margins": 0.12142777442932129, "rewards/rejected": -1.3089485168457031, "step": 560 }, { "epoch": 0.30239170429837764, "grad_norm": 5.967557978146947, "learning_rate": 9.999984476788462e-07, "logits/chosen": 0.024489600211381912, "logits/rejected": 0.06346601247787476, "logps/chosen": -1.304334282875061, "logps/rejected": -1.4136617183685303, "loss": 1.3043, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.304334282875061, "rewards/margins": 0.109327532351017, "rewards/rejected": -1.4136617183685303, "step": 565 }, { "epoch": 0.30506773707977924, "grad_norm": 7.41445248957904, "learning_rate": 9.999921413906797e-07, "logits/chosen": -0.0665111243724823, "logits/rejected": 0.12914595007896423, "logps/chosen": -1.2841336727142334, "logps/rejected": -1.347597360610962, "loss": 1.2841, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2841336727142334, "rewards/margins": 0.06346355378627777, "rewards/rejected": -1.347597360610962, "step": 570 }, { "epoch": 0.3077437698611808, "grad_norm": 6.346489411716735, "learning_rate": 9.999809841765644e-07, "logits/chosen": -0.036432672291994095, "logits/rejected": 0.01531162392348051, "logps/chosen": -1.2162864208221436, "logps/rejected": -1.311760663986206, "loss": 1.2163, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2162864208221436, "rewards/margins": 0.09547445923089981, "rewards/rejected": -1.311760663986206, "step": 575 }, { "epoch": 0.3104198026425824, "grad_norm": 6.493107028731201, "learning_rate": 9.999649761447477e-07, "logits/chosen": -0.04070242494344711, "logits/rejected": 0.09171311557292938, "logps/chosen": -1.2101483345031738, "logps/rejected": -1.363893985748291, "loss": 1.2101, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2101483345031738, "rewards/margins": 0.15374557673931122, "rewards/rejected": -1.363893985748291, "step": 580 }, { "epoch": 0.31309583542398395, "grad_norm": 6.757987276546191, "learning_rate": 9.999441174505398e-07, "logits/chosen": -0.06237734109163284, "logits/rejected": 0.025224661454558372, "logps/chosen": -1.3467481136322021, "logps/rejected": -1.3772515058517456, "loss": 1.3467, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3467481136322021, "rewards/margins": 0.030503535643219948, "rewards/rejected": -1.3772515058517456, "step": 585 }, { "epoch": 0.3157718682053855, "grad_norm": 7.3659596292557925, "learning_rate": 9.999184082963116e-07, "logits/chosen": -0.040310338139534, "logits/rejected": 0.07738013565540314, "logps/chosen": -1.3280792236328125, "logps/rejected": -1.3462533950805664, "loss": 1.3281, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3280792236328125, "rewards/margins": 0.01817408949136734, "rewards/rejected": -1.3462533950805664, "step": 590 }, { "epoch": 0.3184479009867871, "grad_norm": 6.197853704604656, "learning_rate": 9.998878489314937e-07, "logits/chosen": 0.008287662640213966, "logits/rejected": 0.11875814199447632, "logps/chosen": -1.2589359283447266, "logps/rejected": -1.2972991466522217, "loss": 1.2589, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.2589359283447266, "rewards/margins": 0.03836316615343094, "rewards/rejected": -1.2972991466522217, "step": 595 }, { "epoch": 0.32112393376818865, "grad_norm": 5.255300213088934, "learning_rate": 9.99852439652573e-07, "logits/chosen": -0.04999683052301407, "logits/rejected": 0.07973314821720123, "logps/chosen": -1.2507792711257935, "logps/rejected": -1.2541742324829102, "loss": 1.2508, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2507792711257935, "rewards/margins": 0.003394791856408119, "rewards/rejected": -1.2541742324829102, "step": 600 }, { "epoch": 0.32379996654959026, "grad_norm": 6.480754646360136, "learning_rate": 9.998121808030904e-07, "logits/chosen": -0.05619609355926514, "logits/rejected": 0.02443588152527809, "logps/chosen": -1.3057987689971924, "logps/rejected": -1.434415340423584, "loss": 1.3058, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3057987689971924, "rewards/margins": 0.12861666083335876, "rewards/rejected": -1.434415340423584, "step": 605 }, { "epoch": 0.3264759993309918, "grad_norm": 9.852908500368562, "learning_rate": 9.997670727736379e-07, "logits/chosen": 0.04269368201494217, "logits/rejected": 0.17667509615421295, "logps/chosen": -1.28101646900177, "logps/rejected": -1.3278926610946655, "loss": 1.281, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.28101646900177, "rewards/margins": 0.04687615856528282, "rewards/rejected": -1.3278926610946655, "step": 610 }, { "epoch": 0.32915203211239336, "grad_norm": 5.298485587290335, "learning_rate": 9.99717116001853e-07, "logits/chosen": -0.0611281581223011, "logits/rejected": 0.03383522480726242, "logps/chosen": -1.2874058485031128, "logps/rejected": -1.4022152423858643, "loss": 1.2874, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2874058485031128, "rewards/margins": 0.11480961740016937, "rewards/rejected": -1.4022152423858643, "step": 615 }, { "epoch": 0.33182806489379496, "grad_norm": 5.889986163714016, "learning_rate": 9.996623109724173e-07, "logits/chosen": 0.013762956485152245, "logits/rejected": 0.06487260013818741, "logps/chosen": -1.3426588773727417, "logps/rejected": -1.4397079944610596, "loss": 1.3427, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3426588773727417, "rewards/margins": 0.09704907238483429, "rewards/rejected": -1.4397079944610596, "step": 620 }, { "epoch": 0.3345040976751965, "grad_norm": 6.682053109887849, "learning_rate": 9.996026582170488e-07, "logits/chosen": 0.048189274966716766, "logits/rejected": 0.14720122516155243, "logps/chosen": -1.2645246982574463, "logps/rejected": -1.3666555881500244, "loss": 1.2645, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2645246982574463, "rewards/margins": 0.1021309643983841, "rewards/rejected": -1.3666555881500244, "step": 625 }, { "epoch": 0.3371801304565981, "grad_norm": 6.759249312203628, "learning_rate": 9.995381583144996e-07, "logits/chosen": -0.026995714753866196, "logits/rejected": 0.06602618843317032, "logps/chosen": -1.297309160232544, "logps/rejected": -1.4209636449813843, "loss": 1.2973, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.297309160232544, "rewards/margins": 0.12365458160638809, "rewards/rejected": -1.4209636449813843, "step": 630 }, { "epoch": 0.33985616323799966, "grad_norm": 5.274310361493564, "learning_rate": 9.994688118905471e-07, "logits/chosen": -0.01724885404109955, "logits/rejected": 0.1988321989774704, "logps/chosen": -1.3644448518753052, "logps/rejected": -1.3817636966705322, "loss": 1.3644, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.3644448518753052, "rewards/margins": 0.017318878322839737, "rewards/rejected": -1.3817636966705322, "step": 635 }, { "epoch": 0.3425321960194012, "grad_norm": 7.319261921195394, "learning_rate": 9.993946196179912e-07, "logits/chosen": -0.088899627327919, "logits/rejected": 0.09549451619386673, "logps/chosen": -1.2999989986419678, "logps/rejected": -1.3770735263824463, "loss": 1.3, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.2999989986419678, "rewards/margins": 0.07707444578409195, "rewards/rejected": -1.3770735263824463, "step": 640 }, { "epoch": 0.3452082288008028, "grad_norm": 6.72024424512918, "learning_rate": 9.993155822166455e-07, "logits/chosen": -0.0812041312456131, "logits/rejected": -0.007278320379555225, "logps/chosen": -1.2057610750198364, "logps/rejected": -1.3553898334503174, "loss": 1.2058, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2057610750198364, "rewards/margins": 0.14962883293628693, "rewards/rejected": -1.3553898334503174, "step": 645 }, { "epoch": 0.34788426158220437, "grad_norm": 8.755165477298165, "learning_rate": 9.992317004533313e-07, "logits/chosen": -0.03246548771858215, "logits/rejected": 0.09493082016706467, "logps/chosen": -1.3577139377593994, "logps/rejected": -1.4846954345703125, "loss": 1.3577, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3577139377593994, "rewards/margins": 0.12698160111904144, "rewards/rejected": -1.4846954345703125, "step": 650 }, { "epoch": 0.350560294363606, "grad_norm": 10.832638186656437, "learning_rate": 9.991429751418696e-07, "logits/chosen": 0.04678253456950188, "logits/rejected": 0.050963062793016434, "logps/chosen": -1.285603642463684, "logps/rejected": -1.4623228311538696, "loss": 1.2856, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.285603642463684, "rewards/margins": 0.1767192780971527, "rewards/rejected": -1.4623228311538696, "step": 655 }, { "epoch": 0.3532363271450075, "grad_norm": 5.2654930767574175, "learning_rate": 9.99049407143074e-07, "logits/chosen": 0.009529873728752136, "logits/rejected": 0.1266329437494278, "logps/chosen": -1.241877555847168, "logps/rejected": -1.2610584497451782, "loss": 1.2419, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.241877555847168, "rewards/margins": 0.01918083056807518, "rewards/rejected": -1.2610584497451782, "step": 660 }, { "epoch": 0.35591235992640907, "grad_norm": 5.7145004158291295, "learning_rate": 9.989509973647416e-07, "logits/chosen": 0.006330925039947033, "logits/rejected": 0.13812440633773804, "logps/chosen": -1.2166540622711182, "logps/rejected": -1.330609917640686, "loss": 1.2167, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2166540622711182, "rewards/margins": 0.1139557808637619, "rewards/rejected": -1.330609917640686, "step": 665 }, { "epoch": 0.3585883927078107, "grad_norm": 5.310939391265897, "learning_rate": 9.988477467616445e-07, "logits/chosen": -0.035627175122499466, "logits/rejected": 0.1577729433774948, "logps/chosen": -1.2475389242172241, "logps/rejected": -1.2807823419570923, "loss": 1.2475, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2475389242172241, "rewards/margins": 0.033243320882320404, "rewards/rejected": -1.2807823419570923, "step": 670 }, { "epoch": 0.3612644254892122, "grad_norm": 6.958894189304468, "learning_rate": 9.987396563355205e-07, "logits/chosen": -0.06037105247378349, "logits/rejected": 0.009147820994257927, "logps/chosen": -1.2450675964355469, "logps/rejected": -1.449371099472046, "loss": 1.2451, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2450675964355469, "rewards/margins": 0.2043035328388214, "rewards/rejected": -1.449371099472046, "step": 675 }, { "epoch": 0.36394045827061383, "grad_norm": 4.757804989182487, "learning_rate": 9.986267271350631e-07, "logits/chosen": 0.035208623856306076, "logits/rejected": 0.18331514298915863, "logps/chosen": -1.2884360551834106, "logps/rejected": -1.3269561529159546, "loss": 1.2884, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2884360551834106, "rewards/margins": 0.038520049303770065, "rewards/rejected": -1.3269561529159546, "step": 680 }, { "epoch": 0.3666164910520154, "grad_norm": 8.220137109189386, "learning_rate": 9.985089602559123e-07, "logits/chosen": -0.007437742315232754, "logits/rejected": 0.130922332406044, "logps/chosen": -1.2707639932632446, "logps/rejected": -1.3124730587005615, "loss": 1.2708, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2707639932632446, "rewards/margins": 0.04170908033847809, "rewards/rejected": -1.3124730587005615, "step": 685 }, { "epoch": 0.369292523833417, "grad_norm": 5.840258194722364, "learning_rate": 9.983863568406428e-07, "logits/chosen": 0.011055896990001202, "logits/rejected": 0.03522082790732384, "logps/chosen": -1.2747104167938232, "logps/rejected": -1.3744704723358154, "loss": 1.2747, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2747104167938232, "rewards/margins": 0.09976005554199219, "rewards/rejected": -1.3744704723358154, "step": 690 }, { "epoch": 0.37196855661481854, "grad_norm": 6.712304280580629, "learning_rate": 9.982589180787532e-07, "logits/chosen": -0.029823308810591698, "logits/rejected": 0.05099375918507576, "logps/chosen": -1.1533950567245483, "logps/rejected": -1.315091848373413, "loss": 1.1534, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1533950567245483, "rewards/margins": 0.16169673204421997, "rewards/rejected": -1.315091848373413, "step": 695 }, { "epoch": 0.3746445893962201, "grad_norm": 4.912769018056659, "learning_rate": 9.981266452066553e-07, "logits/chosen": -0.1283794492483139, "logits/rejected": -0.01094023697078228, "logps/chosen": -1.3359637260437012, "logps/rejected": -1.3930314779281616, "loss": 1.336, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3359637260437012, "rewards/margins": 0.05706765130162239, "rewards/rejected": -1.3930314779281616, "step": 700 }, { "epoch": 0.3773206221776217, "grad_norm": 5.512745884334865, "learning_rate": 9.979895395076608e-07, "logits/chosen": -0.09660854935646057, "logits/rejected": 0.057076118886470795, "logps/chosen": -1.2839070558547974, "logps/rejected": -1.4205552339553833, "loss": 1.2839, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2839070558547974, "rewards/margins": 0.13664817810058594, "rewards/rejected": -1.4205552339553833, "step": 705 }, { "epoch": 0.37999665495902324, "grad_norm": 6.897670908014209, "learning_rate": 9.9784760231197e-07, "logits/chosen": 0.03875797614455223, "logits/rejected": 0.12368787825107574, "logps/chosen": -1.2253133058547974, "logps/rejected": -1.3387963771820068, "loss": 1.2253, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2253133058547974, "rewards/margins": 0.11348292976617813, "rewards/rejected": -1.3387963771820068, "step": 710 }, { "epoch": 0.38267268774042484, "grad_norm": 7.6941940155290585, "learning_rate": 9.97700834996658e-07, "logits/chosen": -0.03735549375414848, "logits/rejected": 0.10646843910217285, "logps/chosen": -1.2923686504364014, "logps/rejected": -1.4052579402923584, "loss": 1.2924, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2923686504364014, "rewards/margins": 0.11288927495479584, "rewards/rejected": -1.4052579402923584, "step": 715 }, { "epoch": 0.3853487205218264, "grad_norm": 5.960646914320945, "learning_rate": 9.97549238985662e-07, "logits/chosen": 0.021368350833654404, "logits/rejected": 0.18767794966697693, "logps/chosen": -1.338719367980957, "logps/rejected": -1.3847020864486694, "loss": 1.3387, "rewards/accuracies": 0.53125, "rewards/chosen": -1.338719367980957, "rewards/margins": 0.04598287492990494, "rewards/rejected": -1.3847020864486694, "step": 720 }, { "epoch": 0.38802475330322794, "grad_norm": 6.7753462839630005, "learning_rate": 9.973928157497674e-07, "logits/chosen": -0.09927035868167877, "logits/rejected": 0.01873573660850525, "logps/chosen": -1.1787363290786743, "logps/rejected": -1.4264137744903564, "loss": 1.1787, "rewards/accuracies": 0.625, "rewards/chosen": -1.1787363290786743, "rewards/margins": 0.24767740070819855, "rewards/rejected": -1.4264137744903564, "step": 725 }, { "epoch": 0.39070078608462955, "grad_norm": 6.64424882166517, "learning_rate": 9.972315668065927e-07, "logits/chosen": -0.14807018637657166, "logits/rejected": 0.0016284100711345673, "logps/chosen": -1.2914100885391235, "logps/rejected": -1.3569614887237549, "loss": 1.2914, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2914100885391235, "rewards/margins": 0.06555143743753433, "rewards/rejected": -1.3569614887237549, "step": 730 }, { "epoch": 0.3933768188660311, "grad_norm": 5.2647675863791985, "learning_rate": 9.97065493720576e-07, "logits/chosen": -0.09722632169723511, "logits/rejected": -0.008571791462600231, "logps/chosen": -1.3076422214508057, "logps/rejected": -1.3516309261322021, "loss": 1.3076, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3076422214508057, "rewards/margins": 0.043988607823848724, "rewards/rejected": -1.3516309261322021, "step": 735 }, { "epoch": 0.3960528516474327, "grad_norm": 8.713060560739894, "learning_rate": 9.968945981029594e-07, "logits/chosen": -0.06519746035337448, "logits/rejected": 0.08618789911270142, "logps/chosen": -1.3548696041107178, "logps/rejected": -1.377759337425232, "loss": 1.3549, "rewards/accuracies": 0.46875, "rewards/chosen": -1.3548696041107178, "rewards/margins": 0.022889886051416397, "rewards/rejected": -1.377759337425232, "step": 740 }, { "epoch": 0.39872888442883425, "grad_norm": 5.816268063966353, "learning_rate": 9.967188816117726e-07, "logits/chosen": 0.04248436540365219, "logits/rejected": 0.10522858798503876, "logps/chosen": -1.335460901260376, "logps/rejected": -1.5103785991668701, "loss": 1.3355, "rewards/accuracies": 0.5625, "rewards/chosen": -1.335460901260376, "rewards/margins": 0.17491783201694489, "rewards/rejected": -1.5103785991668701, "step": 745 }, { "epoch": 0.4014049172102358, "grad_norm": 5.499461041307087, "learning_rate": 9.965383459518179e-07, "logits/chosen": -0.02196449786424637, "logits/rejected": 0.1288202852010727, "logps/chosen": -1.271683692932129, "logps/rejected": -1.417433500289917, "loss": 1.2717, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.271683692932129, "rewards/margins": 0.1457497477531433, "rewards/rejected": -1.417433500289917, "step": 750 }, { "epoch": 0.4040809499916374, "grad_norm": 5.077277957380026, "learning_rate": 9.963529928746533e-07, "logits/chosen": -0.0011986896861344576, "logits/rejected": 0.10022608190774918, "logps/chosen": -1.3058421611785889, "logps/rejected": -1.3685485124588013, "loss": 1.3058, "rewards/accuracies": 0.5, "rewards/chosen": -1.3058421611785889, "rewards/margins": 0.06270639598369598, "rewards/rejected": -1.3685485124588013, "step": 755 }, { "epoch": 0.40675698277303896, "grad_norm": 5.54378108109114, "learning_rate": 9.961628241785746e-07, "logits/chosen": -0.08130250126123428, "logits/rejected": -0.022916577756404877, "logps/chosen": -1.3253090381622314, "logps/rejected": -1.4402134418487549, "loss": 1.3253, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3253090381622314, "rewards/margins": 0.11490444093942642, "rewards/rejected": -1.4402134418487549, "step": 760 }, { "epoch": 0.40943301555444056, "grad_norm": 11.763119708677307, "learning_rate": 9.959678417085998e-07, "logits/chosen": -0.030821245163679123, "logits/rejected": 0.052153147757053375, "logps/chosen": -1.2782700061798096, "logps/rejected": -1.3619083166122437, "loss": 1.2783, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2782700061798096, "rewards/margins": 0.0836382657289505, "rewards/rejected": -1.3619083166122437, "step": 765 }, { "epoch": 0.4121090483358421, "grad_norm": 6.345391894260758, "learning_rate": 9.957680473564493e-07, "logits/chosen": 0.0759972482919693, "logits/rejected": 0.18403425812721252, "logps/chosen": -1.2332137823104858, "logps/rejected": -1.4189693927764893, "loss": 1.2332, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2332137823104858, "rewards/margins": 0.18575584888458252, "rewards/rejected": -1.4189693927764893, "step": 770 }, { "epoch": 0.41478508111724366, "grad_norm": 7.081538563928381, "learning_rate": 9.95563443060529e-07, "logits/chosen": -0.07288991659879684, "logits/rejected": 0.0761675089597702, "logps/chosen": -1.3166158199310303, "logps/rejected": -1.4624823331832886, "loss": 1.3166, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3166158199310303, "rewards/margins": 0.1458665281534195, "rewards/rejected": -1.4624823331832886, "step": 775 }, { "epoch": 0.41746111389864526, "grad_norm": 4.369709686057515, "learning_rate": 9.95354030805911e-07, "logits/chosen": -0.12510672211647034, "logits/rejected": 0.010861729271709919, "logps/chosen": -1.2385467290878296, "logps/rejected": -1.380101203918457, "loss": 1.2385, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2385467290878296, "rewards/margins": 0.14155444502830505, "rewards/rejected": -1.380101203918457, "step": 780 }, { "epoch": 0.4201371466800468, "grad_norm": 6.00903062414085, "learning_rate": 9.951398126243133e-07, "logits/chosen": 0.023474574089050293, "logits/rejected": 0.13239595293998718, "logps/chosen": -1.2108378410339355, "logps/rejected": -1.399218201637268, "loss": 1.2108, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2108378410339355, "rewards/margins": 0.18838027119636536, "rewards/rejected": -1.399218201637268, "step": 785 }, { "epoch": 0.4228131794614484, "grad_norm": 5.232794523103333, "learning_rate": 9.94920790594082e-07, "logits/chosen": -0.05090903490781784, "logits/rejected": 0.05644839257001877, "logps/chosen": -1.2877293825149536, "logps/rejected": -1.3221710920333862, "loss": 1.2877, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2877293825149536, "rewards/margins": 0.03444188833236694, "rewards/rejected": -1.3221710920333862, "step": 790 }, { "epoch": 0.42548921224284997, "grad_norm": 4.886449950005038, "learning_rate": 9.946969668401696e-07, "logits/chosen": -0.0986107736825943, "logits/rejected": 0.06995345652103424, "logps/chosen": -1.2593374252319336, "logps/rejected": -1.3589516878128052, "loss": 1.2593, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2593374252319336, "rewards/margins": 0.09961430728435516, "rewards/rejected": -1.3589516878128052, "step": 795 }, { "epoch": 0.4281652450242516, "grad_norm": 7.147801730169023, "learning_rate": 9.944683435341155e-07, "logits/chosen": -0.060142964124679565, "logits/rejected": 0.005943029187619686, "logps/chosen": -1.2586958408355713, "logps/rejected": -1.2933197021484375, "loss": 1.2587, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2586958408355713, "rewards/margins": 0.034623872488737106, "rewards/rejected": -1.2933197021484375, "step": 800 }, { "epoch": 0.4281652450242516, "eval_logits/chosen": 0.2541988492012024, "eval_logits/rejected": 0.33021602034568787, "eval_logps/chosen": -1.2924518585205078, "eval_logps/rejected": -1.4166163206100464, "eval_loss": 1.2927271127700806, "eval_rewards/accuracies": 0.5504450798034668, "eval_rewards/chosen": -1.2924518585205078, "eval_rewards/margins": 0.12416449934244156, "eval_rewards/rejected": -1.4166163206100464, "eval_runtime": 40.1803, "eval_samples_per_second": 33.474, "eval_steps_per_second": 8.387, "step": 800 }, { "epoch": 0.4308412778056531, "grad_norm": 6.250040620120328, "learning_rate": 9.942349228940236e-07, "logits/chosen": -0.07929084450006485, "logits/rejected": 0.04928407445549965, "logps/chosen": -1.3074005842208862, "logps/rejected": -1.4313322305679321, "loss": 1.3074, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3074005842208862, "rewards/margins": 0.12393154948949814, "rewards/rejected": -1.4313322305679321, "step": 805 }, { "epoch": 0.43351731058705467, "grad_norm": 5.9693846802412605, "learning_rate": 9.939967071845424e-07, "logits/chosen": -0.001812008791603148, "logits/rejected": 0.064687080681324, "logps/chosen": -1.2090251445770264, "logps/rejected": -1.3379465341567993, "loss": 1.209, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2090251445770264, "rewards/margins": 0.12892156839370728, "rewards/rejected": -1.3379465341567993, "step": 810 }, { "epoch": 0.4361933433684563, "grad_norm": 6.017443368903702, "learning_rate": 9.937536987168413e-07, "logits/chosen": -0.014696076512336731, "logits/rejected": 0.09366099536418915, "logps/chosen": -1.2019816637039185, "logps/rejected": -1.3922555446624756, "loss": 1.202, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2019816637039185, "rewards/margins": 0.19027389585971832, "rewards/rejected": -1.3922555446624756, "step": 815 }, { "epoch": 0.4388693761498578, "grad_norm": 6.501460648386699, "learning_rate": 9.935058998485896e-07, "logits/chosen": 0.007221733685582876, "logits/rejected": 0.04142792895436287, "logps/chosen": -1.2468233108520508, "logps/rejected": -1.4046034812927246, "loss": 1.2468, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2468233108520508, "rewards/margins": 0.1577802449464798, "rewards/rejected": -1.4046034812927246, "step": 820 }, { "epoch": 0.44154540893125943, "grad_norm": 5.531452994207765, "learning_rate": 9.932533129839333e-07, "logits/chosen": -0.060831218957901, "logits/rejected": 0.04515770450234413, "logps/chosen": -1.192328691482544, "logps/rejected": -1.2691091299057007, "loss": 1.1923, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.192328691482544, "rewards/margins": 0.07678046822547913, "rewards/rejected": -1.2691091299057007, "step": 825 }, { "epoch": 0.444221441712661, "grad_norm": 6.051552438045931, "learning_rate": 9.929959405734711e-07, "logits/chosen": 0.007145153824239969, "logits/rejected": 0.14981447160243988, "logps/chosen": -1.3201632499694824, "logps/rejected": -1.3599677085876465, "loss": 1.3202, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3201632499694824, "rewards/margins": 0.039804406464099884, "rewards/rejected": -1.3599677085876465, "step": 830 }, { "epoch": 0.44689747449406253, "grad_norm": 6.850648421851919, "learning_rate": 9.927337851142314e-07, "logits/chosen": -0.019765306264162064, "logits/rejected": 0.09608928114175797, "logps/chosen": -1.221375584602356, "logps/rejected": -1.3349218368530273, "loss": 1.2214, "rewards/accuracies": 0.5625, "rewards/chosen": -1.221375584602356, "rewards/margins": 0.11354625225067139, "rewards/rejected": -1.3349218368530273, "step": 835 }, { "epoch": 0.44957350727546413, "grad_norm": 4.443556880168312, "learning_rate": 9.924668491496474e-07, "logits/chosen": -0.05690440535545349, "logits/rejected": 0.0669245719909668, "logps/chosen": -1.2684918642044067, "logps/rejected": -1.4209693670272827, "loss": 1.2685, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2684918642044067, "rewards/margins": 0.15247756242752075, "rewards/rejected": -1.4209693670272827, "step": 840 }, { "epoch": 0.4522495400568657, "grad_norm": 4.25933925609248, "learning_rate": 9.92195135269533e-07, "logits/chosen": 0.021819986402988434, "logits/rejected": 0.08051449805498123, "logps/chosen": -1.2653038501739502, "logps/rejected": -1.3116405010223389, "loss": 1.2653, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2653038501739502, "rewards/margins": 0.046336762607097626, "rewards/rejected": -1.3116405010223389, "step": 845 }, { "epoch": 0.4549255728382673, "grad_norm": 7.257647890653314, "learning_rate": 9.919186461100574e-07, "logits/chosen": -0.031674690544605255, "logits/rejected": 0.012904509902000427, "logps/chosen": -1.2281488180160522, "logps/rejected": -1.3482401371002197, "loss": 1.2281, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2281488180160522, "rewards/margins": 0.12009123712778091, "rewards/rejected": -1.3482401371002197, "step": 850 }, { "epoch": 0.45760160561966884, "grad_norm": 5.680118152330347, "learning_rate": 9.9163738435372e-07, "logits/chosen": -0.0627240538597107, "logits/rejected": 0.0528237447142601, "logps/chosen": -1.2900902032852173, "logps/rejected": -1.4480512142181396, "loss": 1.2901, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2900902032852173, "rewards/margins": 0.15796101093292236, "rewards/rejected": -1.4480512142181396, "step": 855 }, { "epoch": 0.4602776384010704, "grad_norm": 4.88454716252413, "learning_rate": 9.913513527293234e-07, "logits/chosen": -0.09382763504981995, "logits/rejected": 0.03705960512161255, "logps/chosen": -1.324931025505066, "logps/rejected": -1.4851025342941284, "loss": 1.3249, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.324931025505066, "rewards/margins": 0.16017137467861176, "rewards/rejected": -1.4851025342941284, "step": 860 }, { "epoch": 0.462953671182472, "grad_norm": 10.688749898954267, "learning_rate": 9.910605540119474e-07, "logits/chosen": -0.04146295413374901, "logits/rejected": 0.03848208487033844, "logps/chosen": -1.2178617715835571, "logps/rejected": -1.3843746185302734, "loss": 1.2179, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2178617715835571, "rewards/margins": 0.16651277244091034, "rewards/rejected": -1.3843746185302734, "step": 865 }, { "epoch": 0.46562970396387354, "grad_norm": 5.857606404280885, "learning_rate": 9.907649910229227e-07, "logits/chosen": -0.1165323406457901, "logits/rejected": 0.09269308298826218, "logps/chosen": -1.274859070777893, "logps/rejected": -1.3464224338531494, "loss": 1.2749, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.274859070777893, "rewards/margins": 0.07156340777873993, "rewards/rejected": -1.3464224338531494, "step": 870 }, { "epoch": 0.46830573674527515, "grad_norm": 6.436970952571557, "learning_rate": 9.90464666629803e-07, "logits/chosen": -0.0186513252556324, "logits/rejected": 0.03855866193771362, "logps/chosen": -1.2999582290649414, "logps/rejected": -1.4133074283599854, "loss": 1.3, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2999582290649414, "rewards/margins": 0.11334912478923798, "rewards/rejected": -1.4133074283599854, "step": 875 }, { "epoch": 0.4709817695266767, "grad_norm": 5.099064887205044, "learning_rate": 9.901595837463363e-07, "logits/chosen": 0.02764453925192356, "logits/rejected": 0.16220436990261078, "logps/chosen": -1.3585113286972046, "logps/rejected": -1.4470500946044922, "loss": 1.3585, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3585113286972046, "rewards/margins": 0.08853866159915924, "rewards/rejected": -1.4470500946044922, "step": 880 }, { "epoch": 0.47365780230807825, "grad_norm": 6.249254301994275, "learning_rate": 9.898497453324384e-07, "logits/chosen": -0.06601719558238983, "logits/rejected": -0.002524456474930048, "logps/chosen": -1.238088846206665, "logps/rejected": -1.4163563251495361, "loss": 1.2381, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.238088846206665, "rewards/margins": 0.17826753854751587, "rewards/rejected": -1.4163563251495361, "step": 885 }, { "epoch": 0.47633383508947985, "grad_norm": 7.7936893140845385, "learning_rate": 9.895351543941628e-07, "logits/chosen": -0.17241403460502625, "logits/rejected": -0.07531337440013885, "logps/chosen": -1.2960336208343506, "logps/rejected": -1.407441258430481, "loss": 1.296, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2960336208343506, "rewards/margins": 0.11140771210193634, "rewards/rejected": -1.407441258430481, "step": 890 }, { "epoch": 0.4790098678708814, "grad_norm": 3.9847703128242316, "learning_rate": 9.892158139836724e-07, "logits/chosen": 0.018056729808449745, "logits/rejected": 0.11381411552429199, "logps/chosen": -1.193030834197998, "logps/rejected": -1.281029224395752, "loss": 1.193, "rewards/accuracies": 0.53125, "rewards/chosen": -1.193030834197998, "rewards/margins": 0.08799833059310913, "rewards/rejected": -1.281029224395752, "step": 895 }, { "epoch": 0.481685900652283, "grad_norm": 5.6762578559564565, "learning_rate": 9.88891727199209e-07, "logits/chosen": -0.10110030323266983, "logits/rejected": -0.05598319694399834, "logps/chosen": -1.169980764389038, "logps/rejected": -1.4114658832550049, "loss": 1.17, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.169980764389038, "rewards/margins": 0.24148491024971008, "rewards/rejected": -1.4114658832550049, "step": 900 }, { "epoch": 0.48436193343368455, "grad_norm": 6.566211495815069, "learning_rate": 9.885628971850641e-07, "logits/chosen": 0.020869866013526917, "logits/rejected": 0.17610354721546173, "logps/chosen": -1.2580798864364624, "logps/rejected": -1.4387552738189697, "loss": 1.2581, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2580798864364624, "rewards/margins": 0.1806752234697342, "rewards/rejected": -1.4387552738189697, "step": 905 }, { "epoch": 0.48703796621508616, "grad_norm": 4.630206999267872, "learning_rate": 9.882293271315481e-07, "logits/chosen": -0.041457295417785645, "logits/rejected": 0.056789856404066086, "logps/chosen": -1.3156230449676514, "logps/rejected": -1.3740203380584717, "loss": 1.3156, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3156230449676514, "rewards/margins": 0.05839718133211136, "rewards/rejected": -1.3740203380584717, "step": 910 }, { "epoch": 0.4897139989964877, "grad_norm": 5.743796034998247, "learning_rate": 9.878910202749589e-07, "logits/chosen": -0.03750307485461235, "logits/rejected": 0.10805882513523102, "logps/chosen": -1.2369829416275024, "logps/rejected": -1.3601778745651245, "loss": 1.237, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2369829416275024, "rewards/margins": 0.12319488823413849, "rewards/rejected": -1.3601778745651245, "step": 915 }, { "epoch": 0.49239003177788926, "grad_norm": 6.890133876154185, "learning_rate": 9.875479798975512e-07, "logits/chosen": 0.06264761090278625, "logits/rejected": 0.16222670674324036, "logps/chosen": -1.1973307132720947, "logps/rejected": -1.331861138343811, "loss": 1.1973, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1973307132720947, "rewards/margins": 0.13453027606010437, "rewards/rejected": -1.331861138343811, "step": 920 }, { "epoch": 0.49506606455929086, "grad_norm": 5.415906491540621, "learning_rate": 9.87200209327504e-07, "logits/chosen": -0.08010564744472504, "logits/rejected": 0.052248239517211914, "logps/chosen": -1.2885383367538452, "logps/rejected": -1.3219974040985107, "loss": 1.2885, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2885383367538452, "rewards/margins": 0.03345906734466553, "rewards/rejected": -1.3219974040985107, "step": 925 }, { "epoch": 0.4977420973406924, "grad_norm": 8.550093075927247, "learning_rate": 9.868477119388894e-07, "logits/chosen": -0.0704786628484726, "logits/rejected": 0.03059127926826477, "logps/chosen": -1.2714080810546875, "logps/rejected": -1.444076418876648, "loss": 1.2714, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2714080810546875, "rewards/margins": 0.17266830801963806, "rewards/rejected": -1.444076418876648, "step": 930 }, { "epoch": 0.500418130122094, "grad_norm": 6.448828186817751, "learning_rate": 9.864904911516383e-07, "logits/chosen": -0.0013090405846014619, "logits/rejected": 0.02456684410572052, "logps/chosen": -1.1876220703125, "logps/rejected": -1.3606204986572266, "loss": 1.1876, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1876220703125, "rewards/margins": 0.1729983538389206, "rewards/rejected": -1.3606204986572266, "step": 935 }, { "epoch": 0.5030941629034956, "grad_norm": 7.864083622997477, "learning_rate": 9.861285504315084e-07, "logits/chosen": -0.06199083477258682, "logits/rejected": 0.03203614801168442, "logps/chosen": -1.254618525505066, "logps/rejected": -1.2897098064422607, "loss": 1.2546, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.254618525505066, "rewards/margins": 0.03509129583835602, "rewards/rejected": -1.2897098064422607, "step": 940 }, { "epoch": 0.5057701956848971, "grad_norm": 6.977481084998237, "learning_rate": 9.857618932900502e-07, "logits/chosen": -0.06937488913536072, "logits/rejected": 0.031760793179273605, "logps/chosen": -1.2346205711364746, "logps/rejected": -1.373268485069275, "loss": 1.2346, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2346205711364746, "rewards/margins": 0.1386478841304779, "rewards/rejected": -1.373268485069275, "step": 945 }, { "epoch": 0.5084462284662987, "grad_norm": 4.800302379677818, "learning_rate": 9.853905232845727e-07, "logits/chosen": -0.05746585130691528, "logits/rejected": 0.0736013650894165, "logps/chosen": -1.333167314529419, "logps/rejected": -1.3427143096923828, "loss": 1.3332, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.333167314529419, "rewards/margins": 0.00954707432538271, "rewards/rejected": -1.3427143096923828, "step": 950 }, { "epoch": 0.5111222612477003, "grad_norm": 4.916150631660698, "learning_rate": 9.850144440181095e-07, "logits/chosen": -0.02261519804596901, "logits/rejected": 0.16062672436237335, "logps/chosen": -1.3167396783828735, "logps/rejected": -1.3585777282714844, "loss": 1.3167, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3167396783828735, "rewards/margins": 0.04183819890022278, "rewards/rejected": -1.3585777282714844, "step": 955 }, { "epoch": 0.5137982940291018, "grad_norm": 6.030554526432209, "learning_rate": 9.846336591393832e-07, "logits/chosen": -0.07319796830415726, "logits/rejected": 0.04862436652183533, "logps/chosen": -1.27349853515625, "logps/rejected": -1.3457261323928833, "loss": 1.2735, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.27349853515625, "rewards/margins": 0.07222755998373032, "rewards/rejected": -1.3457261323928833, "step": 960 }, { "epoch": 0.5164743268105034, "grad_norm": 6.416763753699872, "learning_rate": 9.842481723427704e-07, "logits/chosen": 0.00729437917470932, "logits/rejected": -0.011470029130578041, "logps/chosen": -1.3153356313705444, "logps/rejected": -1.4611105918884277, "loss": 1.3153, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3153356313705444, "rewards/margins": 0.14577490091323853, "rewards/rejected": -1.4611105918884277, "step": 965 }, { "epoch": 0.519150359591905, "grad_norm": 5.624644487971093, "learning_rate": 9.838579873682658e-07, "logits/chosen": 0.010949225164949894, "logits/rejected": 0.0016755014657974243, "logps/chosen": -1.1993709802627563, "logps/rejected": -1.2719544172286987, "loss": 1.1994, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1993709802627563, "rewards/margins": 0.07258348166942596, "rewards/rejected": -1.2719544172286987, "step": 970 }, { "epoch": 0.5218263923733065, "grad_norm": 4.780016462941374, "learning_rate": 9.834631080014457e-07, "logits/chosen": -0.13335485756397247, "logits/rejected": 0.006332355551421642, "logps/chosen": -1.2721645832061768, "logps/rejected": -1.3479435443878174, "loss": 1.2722, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2721645832061768, "rewards/margins": 0.075779028236866, "rewards/rejected": -1.3479435443878174, "step": 975 }, { "epoch": 0.5245024251547081, "grad_norm": 6.24696474293684, "learning_rate": 9.830635380734312e-07, "logits/chosen": -0.1300913542509079, "logits/rejected": 0.032060496509075165, "logps/chosen": -1.2956159114837646, "logps/rejected": -1.384390950202942, "loss": 1.2956, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2956159114837646, "rewards/margins": 0.08877500146627426, "rewards/rejected": -1.384390950202942, "step": 980 }, { "epoch": 0.5271784579361097, "grad_norm": 5.3667564557057, "learning_rate": 9.826592814608517e-07, "logits/chosen": -0.010035110637545586, "logits/rejected": 0.14016814529895782, "logps/chosen": -1.270155906677246, "logps/rejected": -1.3871012926101685, "loss": 1.2702, "rewards/accuracies": 0.53125, "rewards/chosen": -1.270155906677246, "rewards/margins": 0.11694546788930893, "rewards/rejected": -1.3871012926101685, "step": 985 }, { "epoch": 0.5298544907175113, "grad_norm": 6.193768086701726, "learning_rate": 9.822503420858067e-07, "logits/chosen": -0.0026545911096036434, "logits/rejected": 0.030388375744223595, "logps/chosen": -1.1527667045593262, "logps/rejected": -1.355188250541687, "loss": 1.1528, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1527667045593262, "rewards/margins": 0.2024216651916504, "rewards/rejected": -1.355188250541687, "step": 990 }, { "epoch": 0.5325305234989128, "grad_norm": 5.714757351576312, "learning_rate": 9.818367239158277e-07, "logits/chosen": 0.003025206970050931, "logits/rejected": 0.05063074827194214, "logps/chosen": -1.2739830017089844, "logps/rejected": -1.2768166065216064, "loss": 1.274, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -1.2739830017089844, "rewards/margins": 0.0028335279785096645, "rewards/rejected": -1.2768166065216064, "step": 995 }, { "epoch": 0.5352065562803144, "grad_norm": 6.08080386360039, "learning_rate": 9.8141843096384e-07, "logits/chosen": -0.010089387185871601, "logits/rejected": 0.062300264835357666, "logps/chosen": -1.3028050661087036, "logps/rejected": -1.396297812461853, "loss": 1.3028, "rewards/accuracies": 0.5, "rewards/chosen": -1.3028050661087036, "rewards/margins": 0.09349276125431061, "rewards/rejected": -1.396297812461853, "step": 1000 }, { "epoch": 0.537882589061716, "grad_norm": 6.370839810431373, "learning_rate": 9.809954672881237e-07, "logits/chosen": -0.041650157421827316, "logits/rejected": 0.09287434071302414, "logps/chosen": -1.2844218015670776, "logps/rejected": -1.3536185026168823, "loss": 1.2844, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2844218015670776, "rewards/margins": 0.06919676065444946, "rewards/rejected": -1.3536185026168823, "step": 1005 }, { "epoch": 0.5405586218431175, "grad_norm": 5.396415180638087, "learning_rate": 9.80567836992274e-07, "logits/chosen": -0.018757149577140808, "logits/rejected": 0.13368038833141327, "logps/chosen": -1.1520249843597412, "logps/rejected": -1.3400964736938477, "loss": 1.152, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1520249843597412, "rewards/margins": 0.18807151913642883, "rewards/rejected": -1.3400964736938477, "step": 1010 }, { "epoch": 0.5432346546245191, "grad_norm": 8.672179176992044, "learning_rate": 9.801355442251625e-07, "logits/chosen": -0.05544118955731392, "logits/rejected": 0.08116774260997772, "logps/chosen": -1.225459337234497, "logps/rejected": -1.372436761856079, "loss": 1.2255, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.225459337234497, "rewards/margins": 0.1469774842262268, "rewards/rejected": -1.372436761856079, "step": 1015 }, { "epoch": 0.5459106874059207, "grad_norm": 6.512830744816172, "learning_rate": 9.796985931808949e-07, "logits/chosen": -0.046632058918476105, "logits/rejected": 0.058753471821546555, "logps/chosen": -1.274216651916504, "logps/rejected": -1.4193313121795654, "loss": 1.2742, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.274216651916504, "rewards/margins": 0.1451147049665451, "rewards/rejected": -1.4193313121795654, "step": 1020 }, { "epoch": 0.5485867201873222, "grad_norm": 5.746711471741546, "learning_rate": 9.792569880987724e-07, "logits/chosen": -0.09305243194103241, "logits/rejected": -0.008764969184994698, "logps/chosen": -1.1756685972213745, "logps/rejected": -1.3979036808013916, "loss": 1.1757, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1756685972213745, "rewards/margins": 0.22223511338233948, "rewards/rejected": -1.3979036808013916, "step": 1025 }, { "epoch": 0.5512627529687238, "grad_norm": 5.561724494736985, "learning_rate": 9.788107332632493e-07, "logits/chosen": -0.07967124879360199, "logits/rejected": -0.0089653879404068, "logps/chosen": -1.2514511346817017, "logps/rejected": -1.3071626424789429, "loss": 1.2515, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2514511346817017, "rewards/margins": 0.055711530148983, "rewards/rejected": -1.3071626424789429, "step": 1030 }, { "epoch": 0.5539387857501255, "grad_norm": 6.619257778581214, "learning_rate": 9.783598330038924e-07, "logits/chosen": -0.06873577833175659, "logits/rejected": 0.020540963858366013, "logps/chosen": -1.3454402685165405, "logps/rejected": -1.3635761737823486, "loss": 1.3454, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3454402685165405, "rewards/margins": 0.018136069178581238, "rewards/rejected": -1.3635761737823486, "step": 1035 }, { "epoch": 0.5566148185315271, "grad_norm": 6.507868445708673, "learning_rate": 9.779042916953376e-07, "logits/chosen": -0.05023226886987686, "logits/rejected": 0.056683819741010666, "logps/chosen": -1.2657454013824463, "logps/rejected": -1.3638733625411987, "loss": 1.2657, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2657454013824463, "rewards/margins": 0.09812800586223602, "rewards/rejected": -1.3638733625411987, "step": 1040 }, { "epoch": 0.5592908513129285, "grad_norm": 4.860565649165002, "learning_rate": 9.774441137572487e-07, "logits/chosen": -0.10371844470500946, "logits/rejected": -0.0017913610208779573, "logps/chosen": -1.2362658977508545, "logps/rejected": -1.3865690231323242, "loss": 1.2363, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2362658977508545, "rewards/margins": 0.15030322968959808, "rewards/rejected": -1.3865690231323242, "step": 1045 }, { "epoch": 0.5619668840943302, "grad_norm": 6.1881907628230755, "learning_rate": 9.76979303654274e-07, "logits/chosen": -0.13047316670417786, "logits/rejected": -0.06020767614245415, "logps/chosen": -1.281745195388794, "logps/rejected": -1.3792763948440552, "loss": 1.2817, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.281745195388794, "rewards/margins": 0.09753124415874481, "rewards/rejected": -1.3792763948440552, "step": 1050 }, { "epoch": 0.5646429168757318, "grad_norm": 7.006049699623025, "learning_rate": 9.765098658960035e-07, "logits/chosen": -0.031607579439878464, "logits/rejected": 0.024964239448308945, "logps/chosen": -1.270918607711792, "logps/rejected": -1.3933523893356323, "loss": 1.2709, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.270918607711792, "rewards/margins": 0.12243392318487167, "rewards/rejected": -1.3933523893356323, "step": 1055 }, { "epoch": 0.5673189496571333, "grad_norm": 5.1002226094667895, "learning_rate": 9.76035805036924e-07, "logits/chosen": -0.0232310201972723, "logits/rejected": 0.11482405662536621, "logps/chosen": -1.341477632522583, "logps/rejected": -1.4297873973846436, "loss": 1.3415, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.341477632522583, "rewards/margins": 0.08830976486206055, "rewards/rejected": -1.4297873973846436, "step": 1060 }, { "epoch": 0.5699949824385349, "grad_norm": 4.240887871424516, "learning_rate": 9.755571256763764e-07, "logits/chosen": 0.008359956555068493, "logits/rejected": 0.10747476667165756, "logps/chosen": -1.195178747177124, "logps/rejected": -1.3821508884429932, "loss": 1.1952, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.195178747177124, "rewards/margins": 0.186971977353096, "rewards/rejected": -1.3821508884429932, "step": 1065 }, { "epoch": 0.5726710152199365, "grad_norm": 6.376623267438161, "learning_rate": 9.750738324585097e-07, "logits/chosen": -0.1394558995962143, "logits/rejected": 0.05875846743583679, "logps/chosen": -1.261845350265503, "logps/rejected": -1.4287128448486328, "loss": 1.2618, "rewards/accuracies": 0.5625, "rewards/chosen": -1.261845350265503, "rewards/margins": 0.16686758399009705, "rewards/rejected": -1.4287128448486328, "step": 1070 }, { "epoch": 0.5753470480013381, "grad_norm": 5.302979158056712, "learning_rate": 9.74585930072237e-07, "logits/chosen": -0.061800550669431686, "logits/rejected": 0.03794045001268387, "logps/chosen": -1.2102464437484741, "logps/rejected": -1.3929535150527954, "loss": 1.2102, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2102464437484741, "rewards/margins": 0.18270698189735413, "rewards/rejected": -1.3929535150527954, "step": 1075 }, { "epoch": 0.5780230807827396, "grad_norm": 5.690756886090038, "learning_rate": 9.740934232511892e-07, "logits/chosen": -0.11699533462524414, "logits/rejected": -0.03350159153342247, "logps/chosen": -1.3242558240890503, "logps/rejected": -1.3421905040740967, "loss": 1.3243, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.3242558240890503, "rewards/margins": 0.017934467643499374, "rewards/rejected": -1.3421905040740967, "step": 1080 }, { "epoch": 0.5806991135641412, "grad_norm": 6.023767652215243, "learning_rate": 9.735963167736698e-07, "logits/chosen": -0.05129870027303696, "logits/rejected": 0.07583866268396378, "logps/chosen": -1.2824286222457886, "logps/rejected": -1.307401180267334, "loss": 1.2824, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2824286222457886, "rewards/margins": 0.024972468614578247, "rewards/rejected": -1.307401180267334, "step": 1085 }, { "epoch": 0.5833751463455428, "grad_norm": 4.110839923266719, "learning_rate": 9.730946154626078e-07, "logits/chosen": -0.06485255062580109, "logits/rejected": 0.008718984201550484, "logps/chosen": -1.2682318687438965, "logps/rejected": -1.2764408588409424, "loss": 1.2682, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.2682318687438965, "rewards/margins": 0.008208973333239555, "rewards/rejected": -1.2764408588409424, "step": 1090 }, { "epoch": 0.5860511791269443, "grad_norm": 6.827856136502948, "learning_rate": 9.725883241855117e-07, "logits/chosen": -0.15399453043937683, "logits/rejected": -0.044305093586444855, "logps/chosen": -1.2090466022491455, "logps/rejected": -1.3276557922363281, "loss": 1.209, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2090466022491455, "rewards/margins": 0.11860903352499008, "rewards/rejected": -1.3276557922363281, "step": 1095 }, { "epoch": 0.5887272119083459, "grad_norm": 7.517511261131081, "learning_rate": 9.720774478544218e-07, "logits/chosen": -0.022174442186951637, "logits/rejected": 0.06240835040807724, "logps/chosen": -1.1176999807357788, "logps/rejected": -1.4069253206253052, "loss": 1.1177, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1176999807357788, "rewards/margins": 0.28922539949417114, "rewards/rejected": -1.4069253206253052, "step": 1100 }, { "epoch": 0.5914032446897475, "grad_norm": 5.191285573260489, "learning_rate": 9.715619914258624e-07, "logits/chosen": -0.07206861674785614, "logits/rejected": -0.021900208666920662, "logps/chosen": -1.2515678405761719, "logps/rejected": -1.3431819677352905, "loss": 1.2516, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2515678405761719, "rewards/margins": 0.09161397069692612, "rewards/rejected": -1.3431819677352905, "step": 1105 }, { "epoch": 0.594079277471149, "grad_norm": 4.95733869294476, "learning_rate": 9.710419599007937e-07, "logits/chosen": -0.06788556277751923, "logits/rejected": 0.027228718623518944, "logps/chosen": -1.2281213998794556, "logps/rejected": -1.2513644695281982, "loss": 1.2281, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.2281213998794556, "rewards/margins": 0.0232431348413229, "rewards/rejected": -1.2513644695281982, "step": 1110 }, { "epoch": 0.5967553102525506, "grad_norm": 5.128257254685448, "learning_rate": 9.705173583245643e-07, "logits/chosen": 0.04475606977939606, "logits/rejected": 0.13561873137950897, "logps/chosen": -1.1499310731887817, "logps/rejected": -1.3159959316253662, "loss": 1.1499, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1499310731887817, "rewards/margins": 0.16606482863426208, "rewards/rejected": -1.3159959316253662, "step": 1115 }, { "epoch": 0.5994313430339522, "grad_norm": 7.673229380231191, "learning_rate": 9.699881917868609e-07, "logits/chosen": -0.17572034895420074, "logits/rejected": -0.09525928646326065, "logps/chosen": -1.2241933345794678, "logps/rejected": -1.3603832721710205, "loss": 1.2242, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2241933345794678, "rewards/margins": 0.13618981838226318, "rewards/rejected": -1.3603832721710205, "step": 1120 }, { "epoch": 0.6021073758153538, "grad_norm": 7.423787633388482, "learning_rate": 9.694544654216594e-07, "logits/chosen": -0.1515866219997406, "logits/rejected": 9.545013017486781e-05, "logps/chosen": -1.2245181798934937, "logps/rejected": -1.329463243484497, "loss": 1.2245, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2245181798934937, "rewards/margins": 0.10494516044855118, "rewards/rejected": -1.329463243484497, "step": 1125 }, { "epoch": 0.6047834085967553, "grad_norm": 5.168508577375761, "learning_rate": 9.689161844071755e-07, "logits/chosen": -0.0010724887251853943, "logits/rejected": 0.03993191570043564, "logps/chosen": -1.2693973779678345, "logps/rejected": -1.3974542617797852, "loss": 1.2694, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2693973779678345, "rewards/margins": 0.12805666029453278, "rewards/rejected": -1.3974542617797852, "step": 1130 }, { "epoch": 0.6074594413781569, "grad_norm": 9.132837428821281, "learning_rate": 9.683733539658138e-07, "logits/chosen": -0.07440435886383057, "logits/rejected": 0.05557713657617569, "logps/chosen": -1.277978539466858, "logps/rejected": -1.4476351737976074, "loss": 1.278, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.277978539466858, "rewards/margins": 0.1696566939353943, "rewards/rejected": -1.4476351737976074, "step": 1135 }, { "epoch": 0.6101354741595585, "grad_norm": 6.404222856314687, "learning_rate": 9.678259793641178e-07, "logits/chosen": -0.06205382198095322, "logits/rejected": -0.045845527201890945, "logps/chosen": -1.2445296049118042, "logps/rejected": -1.260196328163147, "loss": 1.2445, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2445296049118042, "rewards/margins": 0.015666665509343147, "rewards/rejected": -1.260196328163147, "step": 1140 }, { "epoch": 0.61281150694096, "grad_norm": 6.2128191550452145, "learning_rate": 9.672740659127183e-07, "logits/chosen": -0.18350091576576233, "logits/rejected": -0.10154464095830917, "logps/chosen": -1.2840421199798584, "logps/rejected": -1.3740594387054443, "loss": 1.284, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2840421199798584, "rewards/margins": 0.09001748263835907, "rewards/rejected": -1.3740594387054443, "step": 1145 }, { "epoch": 0.6154875397223616, "grad_norm": 6.320844942924974, "learning_rate": 9.667176189662818e-07, "logits/chosen": -0.1735883355140686, "logits/rejected": -0.06137179210782051, "logps/chosen": -1.172741413116455, "logps/rejected": -1.299964189529419, "loss": 1.1727, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.172741413116455, "rewards/margins": 0.12722285091876984, "rewards/rejected": -1.299964189529419, "step": 1150 }, { "epoch": 0.6181635725037632, "grad_norm": 6.367773718499538, "learning_rate": 9.661566439234592e-07, "logits/chosen": -0.06603240221738815, "logits/rejected": -0.002590920077636838, "logps/chosen": -1.272516131401062, "logps/rejected": -1.3514564037322998, "loss": 1.2725, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.272516131401062, "rewards/margins": 0.07894023507833481, "rewards/rejected": -1.3514564037322998, "step": 1155 }, { "epoch": 0.6208396052851648, "grad_norm": 6.439998563210114, "learning_rate": 9.655911462268327e-07, "logits/chosen": 0.010955584235489368, "logits/rejected": 0.06357049196958542, "logps/chosen": -1.2248948812484741, "logps/rejected": -1.3173155784606934, "loss": 1.2249, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2248948812484741, "rewards/margins": 0.09242062270641327, "rewards/rejected": -1.3173155784606934, "step": 1160 }, { "epoch": 0.6235156380665663, "grad_norm": 5.238427147479888, "learning_rate": 9.650211313628636e-07, "logits/chosen": -0.07727337628602982, "logits/rejected": -0.017749736085534096, "logps/chosen": -1.1664068698883057, "logps/rejected": -1.3247500658035278, "loss": 1.1664, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1664068698883057, "rewards/margins": 0.1583430916070938, "rewards/rejected": -1.3247500658035278, "step": 1165 }, { "epoch": 0.6261916708479679, "grad_norm": 5.438907988009311, "learning_rate": 9.644466048618386e-07, "logits/chosen": -0.10764800012111664, "logits/rejected": 0.02199997939169407, "logps/chosen": -1.3702927827835083, "logps/rejected": -1.3725587129592896, "loss": 1.3703, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.3702927827835083, "rewards/margins": 0.0022659488022327423, "rewards/rejected": -1.3725587129592896, "step": 1170 }, { "epoch": 0.6288677036293695, "grad_norm": 4.760267677493578, "learning_rate": 9.63867572297816e-07, "logits/chosen": -0.0787278562784195, "logits/rejected": 0.059595175087451935, "logps/chosen": -1.2141425609588623, "logps/rejected": -1.2859586477279663, "loss": 1.2141, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2141425609588623, "rewards/margins": 0.0718161016702652, "rewards/rejected": -1.2859586477279663, "step": 1175 }, { "epoch": 0.631543736410771, "grad_norm": 5.44533289073085, "learning_rate": 9.632840392885727e-07, "logits/chosen": -0.0734080895781517, "logits/rejected": 0.023104136809706688, "logps/chosen": -1.3176859617233276, "logps/rejected": -1.4567492008209229, "loss": 1.3177, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3176859617233276, "rewards/margins": 0.1390632688999176, "rewards/rejected": -1.4567492008209229, "step": 1180 }, { "epoch": 0.6342197691921726, "grad_norm": 6.619128220448476, "learning_rate": 9.626960114955483e-07, "logits/chosen": -0.009047026745975018, "logits/rejected": 0.09341583400964737, "logps/chosen": -1.3517374992370605, "logps/rejected": -1.3832120895385742, "loss": 1.3517, "rewards/accuracies": 0.46875, "rewards/chosen": -1.3517374992370605, "rewards/margins": 0.031474631279706955, "rewards/rejected": -1.3832120895385742, "step": 1185 }, { "epoch": 0.6368958019735742, "grad_norm": 6.557503797727131, "learning_rate": 9.621034946237909e-07, "logits/chosen": -0.10001516342163086, "logits/rejected": 0.0017581104766577482, "logps/chosen": -1.2998219728469849, "logps/rejected": -1.432750940322876, "loss": 1.2998, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2998219728469849, "rewards/margins": 0.1329290121793747, "rewards/rejected": -1.432750940322876, "step": 1190 }, { "epoch": 0.6395718347549757, "grad_norm": 5.360155343120525, "learning_rate": 9.615064944219021e-07, "logits/chosen": -0.04289788752794266, "logits/rejected": 0.04630057141184807, "logps/chosen": -1.169995903968811, "logps/rejected": -1.3566200733184814, "loss": 1.17, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.169995903968811, "rewards/margins": 0.18662437796592712, "rewards/rejected": -1.3566200733184814, "step": 1195 }, { "epoch": 0.6422478675363773, "grad_norm": 4.6271500811166275, "learning_rate": 9.609050166819803e-07, "logits/chosen": -0.08801095187664032, "logits/rejected": -0.03584954887628555, "logps/chosen": -1.2174196243286133, "logps/rejected": -1.3479154109954834, "loss": 1.2174, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2174196243286133, "rewards/margins": 0.1304955780506134, "rewards/rejected": -1.3479154109954834, "step": 1200 }, { "epoch": 0.6422478675363773, "eval_logits/chosen": 0.1999603658914566, "eval_logits/rejected": 0.2711539566516876, "eval_logps/chosen": -1.283500075340271, "eval_logps/rejected": -1.404522180557251, "eval_loss": 1.283760905265808, "eval_rewards/accuracies": 0.5489614009857178, "eval_rewards/chosen": -1.283500075340271, "eval_rewards/margins": 0.12102215737104416, "eval_rewards/rejected": -1.404522180557251, "eval_runtime": 40.3387, "eval_samples_per_second": 33.343, "eval_steps_per_second": 8.354, "step": 1200 }, { "epoch": 0.6449239003177789, "grad_norm": 6.057545084603948, "learning_rate": 9.602990672395653e-07, "logits/chosen": -0.15034839510917664, "logits/rejected": -0.013563009910285473, "logps/chosen": -1.2326236963272095, "logps/rejected": -1.3441660404205322, "loss": 1.2326, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2326236963272095, "rewards/margins": 0.11154214292764664, "rewards/rejected": -1.3441660404205322, "step": 1205 }, { "epoch": 0.6475999330991805, "grad_norm": 8.574618514414993, "learning_rate": 9.59688651973581e-07, "logits/chosen": -0.06808432936668396, "logits/rejected": 0.08646570146083832, "logps/chosen": -1.2702481746673584, "logps/rejected": -1.3541138172149658, "loss": 1.2702, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2702481746673584, "rewards/margins": 0.0838657021522522, "rewards/rejected": -1.3541138172149658, "step": 1210 }, { "epoch": 0.650275965880582, "grad_norm": 5.555528139020475, "learning_rate": 9.590737768062792e-07, "logits/chosen": -0.1276719719171524, "logits/rejected": -0.028794754296541214, "logps/chosen": -1.2634365558624268, "logps/rejected": -1.2730616331100464, "loss": 1.2634, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2634365558624268, "rewards/margins": 0.00962517224252224, "rewards/rejected": -1.2730616331100464, "step": 1215 }, { "epoch": 0.6529519986619836, "grad_norm": 6.058243042698275, "learning_rate": 9.584544477031816e-07, "logits/chosen": 0.01842622645199299, "logits/rejected": 0.096223846077919, "logps/chosen": -1.1783818006515503, "logps/rejected": -1.2990577220916748, "loss": 1.1784, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1783818006515503, "rewards/margins": 0.12067589908838272, "rewards/rejected": -1.2990577220916748, "step": 1220 }, { "epoch": 0.6556280314433852, "grad_norm": 4.61663556618601, "learning_rate": 9.578306706730215e-07, "logits/chosen": -0.17845682799816132, "logits/rejected": -0.005718544125556946, "logps/chosen": -1.2811152935028076, "logps/rejected": -1.3629634380340576, "loss": 1.2811, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2811152935028076, "rewards/margins": 0.08184824883937836, "rewards/rejected": -1.3629634380340576, "step": 1225 }, { "epoch": 0.6583040642247867, "grad_norm": 4.2942997862529415, "learning_rate": 9.572024517676865e-07, "logits/chosen": -0.10054387897253036, "logits/rejected": -0.006398229394108057, "logps/chosen": -1.206667423248291, "logps/rejected": -1.3610576391220093, "loss": 1.2067, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.206667423248291, "rewards/margins": 0.15439032018184662, "rewards/rejected": -1.3610576391220093, "step": 1230 }, { "epoch": 0.6609800970061883, "grad_norm": 5.263881360983775, "learning_rate": 9.565697970821593e-07, "logits/chosen": -0.03319742530584335, "logits/rejected": 0.07144156098365784, "logps/chosen": -1.266913890838623, "logps/rejected": -1.3376264572143555, "loss": 1.2669, "rewards/accuracies": 0.5, "rewards/chosen": -1.266913890838623, "rewards/margins": 0.07071264833211899, "rewards/rejected": -1.3376264572143555, "step": 1235 }, { "epoch": 0.6636561297875899, "grad_norm": 7.2728976365781755, "learning_rate": 9.559327127544585e-07, "logits/chosen": -0.16251137852668762, "logits/rejected": -0.05334026366472244, "logps/chosen": -1.2210978269577026, "logps/rejected": -1.3335076570510864, "loss": 1.2211, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2210978269577026, "rewards/margins": 0.11240974813699722, "rewards/rejected": -1.3335076570510864, "step": 1240 }, { "epoch": 0.6663321625689914, "grad_norm": 6.050835419796436, "learning_rate": 9.552912049655789e-07, "logits/chosen": -0.06696081161499023, "logits/rejected": 0.07436566054821014, "logps/chosen": -1.3261497020721436, "logps/rejected": -1.3498252630233765, "loss": 1.3261, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.3261497020721436, "rewards/margins": 0.023675458505749702, "rewards/rejected": -1.3498252630233765, "step": 1245 }, { "epoch": 0.669008195350393, "grad_norm": 5.7540065241021745, "learning_rate": 9.546452799394315e-07, "logits/chosen": -0.06499700248241425, "logits/rejected": 0.08842761814594269, "logps/chosen": -1.2959587574005127, "logps/rejected": -1.3292210102081299, "loss": 1.296, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.2959587574005127, "rewards/margins": 0.033262260258197784, "rewards/rejected": -1.3292210102081299, "step": 1250 }, { "epoch": 0.6716842281317946, "grad_norm": 8.110651458019786, "learning_rate": 9.539949439427846e-07, "logits/chosen": -0.07847845554351807, "logits/rejected": 0.022258877754211426, "logps/chosen": -1.255712866783142, "logps/rejected": -1.3400136232376099, "loss": 1.2557, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.255712866783142, "rewards/margins": 0.08430073410272598, "rewards/rejected": -1.3400136232376099, "step": 1255 }, { "epoch": 0.6743602609131962, "grad_norm": 6.1887731899615925, "learning_rate": 9.533402032852002e-07, "logits/chosen": -0.10817959159612656, "logits/rejected": -0.0016319400165230036, "logps/chosen": -1.1983308792114258, "logps/rejected": -1.3529870510101318, "loss": 1.1983, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1983308792114258, "rewards/margins": 0.15465609729290009, "rewards/rejected": -1.3529870510101318, "step": 1260 }, { "epoch": 0.6770362936945977, "grad_norm": 4.971494770449823, "learning_rate": 9.526810643189754e-07, "logits/chosen": -0.012739507481455803, "logits/rejected": 0.09471867978572845, "logps/chosen": -1.2467888593673706, "logps/rejected": -1.3258411884307861, "loss": 1.2468, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2467888593673706, "rewards/margins": 0.07905243337154388, "rewards/rejected": -1.3258411884307861, "step": 1265 }, { "epoch": 0.6797123264759993, "grad_norm": 6.562732098841715, "learning_rate": 9.52017533439079e-07, "logits/chosen": -0.07657656073570251, "logits/rejected": 0.016019191592931747, "logps/chosen": -1.2000935077667236, "logps/rejected": -1.3412951231002808, "loss": 1.2001, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2000935077667236, "rewards/margins": 0.14120161533355713, "rewards/rejected": -1.3412951231002808, "step": 1270 }, { "epoch": 0.6823883592574009, "grad_norm": 4.397831050475903, "learning_rate": 9.513496170830909e-07, "logits/chosen": -0.07882431894540787, "logits/rejected": 0.004360717721283436, "logps/chosen": -1.2628589868545532, "logps/rejected": -1.3195956945419312, "loss": 1.2629, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.2628589868545532, "rewards/margins": 0.05673682689666748, "rewards/rejected": -1.3195956945419312, "step": 1275 }, { "epoch": 0.6850643920388024, "grad_norm": 6.316707739328076, "learning_rate": 9.506773217311382e-07, "logits/chosen": -0.10306744277477264, "logits/rejected": 0.009233839809894562, "logps/chosen": -1.3306678533554077, "logps/rejected": -1.3783326148986816, "loss": 1.3307, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3306678533554077, "rewards/margins": 0.04766470193862915, "rewards/rejected": -1.3783326148986816, "step": 1280 }, { "epoch": 0.687740424820204, "grad_norm": 4.834352901025394, "learning_rate": 9.500006539058334e-07, "logits/chosen": -0.07446451485157013, "logits/rejected": 0.0350925475358963, "logps/chosen": -1.2132048606872559, "logps/rejected": -1.3017178773880005, "loss": 1.2132, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2132048606872559, "rewards/margins": 0.08851295709609985, "rewards/rejected": -1.3017178773880005, "step": 1285 }, { "epoch": 0.6904164576016056, "grad_norm": 7.4764985322726725, "learning_rate": 9.493196201722109e-07, "logits/chosen": -0.17309390008449554, "logits/rejected": -0.04871336743235588, "logps/chosen": -1.2725774049758911, "logps/rejected": -1.3079816102981567, "loss": 1.2726, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2725774049758911, "rewards/margins": 0.03540420159697533, "rewards/rejected": -1.3079816102981567, "step": 1290 }, { "epoch": 0.6930924903830072, "grad_norm": 5.12556544137169, "learning_rate": 9.486342271376628e-07, "logits/chosen": -0.11905679851770401, "logits/rejected": -0.1117417961359024, "logps/chosen": -1.243586778640747, "logps/rejected": -1.3850252628326416, "loss": 1.2436, "rewards/accuracies": 0.5625, "rewards/chosen": -1.243586778640747, "rewards/margins": 0.14143864810466766, "rewards/rejected": -1.3850252628326416, "step": 1295 }, { "epoch": 0.6957685231644087, "grad_norm": 5.326807574527476, "learning_rate": 9.479444814518755e-07, "logits/chosen": -0.10664665699005127, "logits/rejected": 0.0816953033208847, "logps/chosen": -1.24702787399292, "logps/rejected": -1.361676812171936, "loss": 1.247, "rewards/accuracies": 0.53125, "rewards/chosen": -1.24702787399292, "rewards/margins": 0.11464889347553253, "rewards/rejected": -1.361676812171936, "step": 1300 }, { "epoch": 0.6984445559458103, "grad_norm": 4.688853219834959, "learning_rate": 9.472503898067645e-07, "logits/chosen": -0.010024547576904297, "logits/rejected": 0.03444181755185127, "logps/chosen": -1.2491990327835083, "logps/rejected": -1.3740923404693604, "loss": 1.2492, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2491990327835083, "rewards/margins": 0.12489336729049683, "rewards/rejected": -1.3740923404693604, "step": 1305 }, { "epoch": 0.701120588727212, "grad_norm": 5.266323704366981, "learning_rate": 9.465519589364099e-07, "logits/chosen": -0.031176943331956863, "logits/rejected": 0.030650725588202477, "logps/chosen": -1.2704166173934937, "logps/rejected": -1.378047227859497, "loss": 1.2704, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2704166173934937, "rewards/margins": 0.10763072967529297, "rewards/rejected": -1.378047227859497, "step": 1310 }, { "epoch": 0.7037966215086134, "grad_norm": 5.8337925220218425, "learning_rate": 9.458491956169914e-07, "logits/chosen": -0.08530330657958984, "logits/rejected": 0.06745810061693192, "logps/chosen": -1.2082918882369995, "logps/rejected": -1.4009073972702026, "loss": 1.2083, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2082918882369995, "rewards/margins": 0.19261552393436432, "rewards/rejected": -1.4009073972702026, "step": 1315 }, { "epoch": 0.706472654290015, "grad_norm": 5.248813759044998, "learning_rate": 9.451421066667215e-07, "logits/chosen": -0.1838780641555786, "logits/rejected": -0.012331436388194561, "logps/chosen": -1.1938039064407349, "logps/rejected": -1.3321928977966309, "loss": 1.1938, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1938039064407349, "rewards/margins": 0.13838902115821838, "rewards/rejected": -1.3321928977966309, "step": 1320 }, { "epoch": 0.7091486870714167, "grad_norm": 5.235761741803443, "learning_rate": 9.444306989457805e-07, "logits/chosen": -0.05376625061035156, "logits/rejected": 0.02818487212061882, "logps/chosen": -1.2670605182647705, "logps/rejected": -1.3325811624526978, "loss": 1.2671, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2670605182647705, "rewards/margins": 0.06552070379257202, "rewards/rejected": -1.3325811624526978, "step": 1325 }, { "epoch": 0.7118247198528181, "grad_norm": 5.116281962762719, "learning_rate": 9.437149793562489e-07, "logits/chosen": -0.0930633395910263, "logits/rejected": 0.0021197155583649874, "logps/chosen": -1.2601063251495361, "logps/rejected": -1.327714204788208, "loss": 1.2601, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2601063251495361, "rewards/margins": 0.06760775297880173, "rewards/rejected": -1.327714204788208, "step": 1330 }, { "epoch": 0.7145007526342197, "grad_norm": 6.8523185806998, "learning_rate": 9.429949548420417e-07, "logits/chosen": -0.06064723804593086, "logits/rejected": -0.0012881166767328978, "logps/chosen": -1.3280212879180908, "logps/rejected": -1.4465820789337158, "loss": 1.328, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3280212879180908, "rewards/margins": 0.11856064945459366, "rewards/rejected": -1.4465820789337158, "step": 1335 }, { "epoch": 0.7171767854156214, "grad_norm": 7.657410658388213, "learning_rate": 9.422706323888396e-07, "logits/chosen": -0.06134505197405815, "logits/rejected": -0.04951154440641403, "logps/chosen": -1.2997723817825317, "logps/rejected": -1.3947632312774658, "loss": 1.2998, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2997723817825317, "rewards/margins": 0.0949908122420311, "rewards/rejected": -1.3947632312774658, "step": 1340 }, { "epoch": 0.719852818197023, "grad_norm": 4.276855127327954, "learning_rate": 9.415420190240225e-07, "logits/chosen": -0.011663141660392284, "logits/rejected": 0.12842154502868652, "logps/chosen": -1.288332223892212, "logps/rejected": -1.3459032773971558, "loss": 1.2883, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.288332223892212, "rewards/margins": 0.05757122486829758, "rewards/rejected": -1.3459032773971558, "step": 1345 }, { "epoch": 0.7225288509784245, "grad_norm": 6.97960651644172, "learning_rate": 9.408091218166002e-07, "logits/chosen": -0.021268505603075027, "logits/rejected": 0.016198139637708664, "logps/chosen": -1.2672230005264282, "logps/rejected": -1.2842594385147095, "loss": 1.2672, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2672230005264282, "rewards/margins": 0.01703643426299095, "rewards/rejected": -1.2842594385147095, "step": 1350 }, { "epoch": 0.7252048837598261, "grad_norm": 5.894689230704362, "learning_rate": 9.400719478771449e-07, "logits/chosen": -0.05862246826291084, "logits/rejected": 0.1806357353925705, "logps/chosen": -1.311891794204712, "logps/rejected": -1.3561058044433594, "loss": 1.3119, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.311891794204712, "rewards/margins": 0.044214002788066864, "rewards/rejected": -1.3561058044433594, "step": 1355 }, { "epoch": 0.7278809165412277, "grad_norm": 7.637544495739462, "learning_rate": 9.393305043577209e-07, "logits/chosen": -0.14658425748348236, "logits/rejected": -0.025414858013391495, "logps/chosen": -1.331628680229187, "logps/rejected": -1.4379479885101318, "loss": 1.3316, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.331628680229187, "rewards/margins": 0.1063193827867508, "rewards/rejected": -1.4379479885101318, "step": 1360 }, { "epoch": 0.7305569493226292, "grad_norm": 4.852267847210221, "learning_rate": 9.38584798451817e-07, "logits/chosen": -0.03822377324104309, "logits/rejected": 0.07163937389850616, "logps/chosen": -1.2571651935577393, "logps/rejected": -1.3726922273635864, "loss": 1.2572, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2571651935577393, "rewards/margins": 0.11552713066339493, "rewards/rejected": -1.3726922273635864, "step": 1365 }, { "epoch": 0.7332329821040308, "grad_norm": 7.135588345378852, "learning_rate": 9.37834837394275e-07, "logits/chosen": -0.06661777198314667, "logits/rejected": 0.02481800876557827, "logps/chosen": -1.3160431385040283, "logps/rejected": -1.4689452648162842, "loss": 1.316, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3160431385040283, "rewards/margins": 0.15290218591690063, "rewards/rejected": -1.4689452648162842, "step": 1370 }, { "epoch": 0.7359090148854324, "grad_norm": 5.673073606361706, "learning_rate": 9.370806284612203e-07, "logits/chosen": -0.08907458931207657, "logits/rejected": 0.026546388864517212, "logps/chosen": -1.2339574098587036, "logps/rejected": -1.4297950267791748, "loss": 1.234, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2339574098587036, "rewards/margins": 0.19583778083324432, "rewards/rejected": -1.4297950267791748, "step": 1375 }, { "epoch": 0.738585047666834, "grad_norm": 5.57053607547597, "learning_rate": 9.363221789699912e-07, "logits/chosen": -0.13963700830936432, "logits/rejected": -0.05614779144525528, "logps/chosen": -1.2622143030166626, "logps/rejected": -1.2954144477844238, "loss": 1.2622, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2622143030166626, "rewards/margins": 0.03320017829537392, "rewards/rejected": -1.2954144477844238, "step": 1380 }, { "epoch": 0.7412610804482355, "grad_norm": 5.9926734599588745, "learning_rate": 9.355594962790682e-07, "logits/chosen": -0.10698195546865463, "logits/rejected": -0.022134121507406235, "logps/chosen": -1.2225534915924072, "logps/rejected": -1.2984954118728638, "loss": 1.2226, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2225534915924072, "rewards/margins": 0.0759420171380043, "rewards/rejected": -1.2984954118728638, "step": 1385 }, { "epoch": 0.7439371132296371, "grad_norm": 7.553313738201702, "learning_rate": 9.34792587788002e-07, "logits/chosen": -0.023461446166038513, "logits/rejected": 0.05690978094935417, "logps/chosen": -1.2644122838974, "logps/rejected": -1.37018620967865, "loss": 1.2644, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2644122838974, "rewards/margins": 0.10577374696731567, "rewards/rejected": -1.37018620967865, "step": 1390 }, { "epoch": 0.7466131460110387, "grad_norm": 5.090371790008468, "learning_rate": 9.34021460937342e-07, "logits/chosen": -0.03182605654001236, "logits/rejected": 0.04644512012600899, "logps/chosen": -1.21986985206604, "logps/rejected": -1.2920414209365845, "loss": 1.2199, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.21986985206604, "rewards/margins": 0.07217174023389816, "rewards/rejected": -1.2920414209365845, "step": 1395 }, { "epoch": 0.7492891787924402, "grad_norm": 4.689885895110415, "learning_rate": 9.332461232085646e-07, "logits/chosen": -0.20271389186382294, "logits/rejected": -0.1088462844491005, "logps/chosen": -1.2996957302093506, "logps/rejected": -1.3332618474960327, "loss": 1.2997, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2996957302093506, "rewards/margins": 0.03356624394655228, "rewards/rejected": -1.3332618474960327, "step": 1400 }, { "epoch": 0.7519652115738418, "grad_norm": 4.847379462279588, "learning_rate": 9.324665821239998e-07, "logits/chosen": -0.0846988782286644, "logits/rejected": 0.06465022265911102, "logps/chosen": -1.1509791612625122, "logps/rejected": -1.3411425352096558, "loss": 1.151, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1509791612625122, "rewards/margins": 0.19016344845294952, "rewards/rejected": -1.3411425352096558, "step": 1405 }, { "epoch": 0.7546412443552434, "grad_norm": 4.90069205296615, "learning_rate": 9.316828452467583e-07, "logits/chosen": -0.1391901671886444, "logits/rejected": -3.529991954565048e-05, "logps/chosen": -1.2720468044281006, "logps/rejected": -1.3966305255889893, "loss": 1.272, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2720468044281006, "rewards/margins": 0.12458349764347076, "rewards/rejected": -1.3966305255889893, "step": 1410 }, { "epoch": 0.7573172771366449, "grad_norm": 5.581021051525744, "learning_rate": 9.30894920180659e-07, "logits/chosen": -0.05805348604917526, "logits/rejected": 0.05376707389950752, "logps/chosen": -1.3038359880447388, "logps/rejected": -1.258689522743225, "loss": 1.3038, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3038359880447388, "rewards/margins": -0.04514641314744949, "rewards/rejected": -1.258689522743225, "step": 1415 }, { "epoch": 0.7599933099180465, "grad_norm": 5.7185528821243725, "learning_rate": 9.301028145701543e-07, "logits/chosen": -0.056157756596803665, "logits/rejected": 0.030967507511377335, "logps/chosen": -1.1993039846420288, "logps/rejected": -1.4040653705596924, "loss": 1.1993, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.1993039846420288, "rewards/margins": 0.20476147532463074, "rewards/rejected": -1.4040653705596924, "step": 1420 }, { "epoch": 0.7626693426994481, "grad_norm": 5.724344021736016, "learning_rate": 9.293065361002563e-07, "logits/chosen": -0.019916271790862083, "logits/rejected": 0.02431649900972843, "logps/chosen": -1.2143032550811768, "logps/rejected": -1.4485315084457397, "loss": 1.2143, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2143032550811768, "rewards/margins": 0.23422828316688538, "rewards/rejected": -1.4485315084457397, "step": 1425 }, { "epoch": 0.7653453754808497, "grad_norm": 5.9035426915279885, "learning_rate": 9.285060924964622e-07, "logits/chosen": -0.10706496238708496, "logits/rejected": -0.006744361016899347, "logps/chosen": -1.2709360122680664, "logps/rejected": -1.3172825574874878, "loss": 1.2709, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2709360122680664, "rewards/margins": 0.046346552670001984, "rewards/rejected": -1.3172825574874878, "step": 1430 }, { "epoch": 0.7680214082622512, "grad_norm": 5.373978551263363, "learning_rate": 9.277014915246792e-07, "logits/chosen": -0.019690584391355515, "logits/rejected": 0.008826453238725662, "logps/chosen": -1.2254173755645752, "logps/rejected": -1.3795092105865479, "loss": 1.2254, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2254173755645752, "rewards/margins": 0.15409204363822937, "rewards/rejected": -1.3795092105865479, "step": 1435 }, { "epoch": 0.7706974410436528, "grad_norm": 5.065766032942053, "learning_rate": 9.268927409911498e-07, "logits/chosen": -0.09725191444158554, "logits/rejected": -0.028021126985549927, "logps/chosen": -1.281625509262085, "logps/rejected": -1.3169580698013306, "loss": 1.2816, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.281625509262085, "rewards/margins": 0.03533259779214859, "rewards/rejected": -1.3169580698013306, "step": 1440 }, { "epoch": 0.7733734738250544, "grad_norm": 5.933931114102071, "learning_rate": 9.260798487423749e-07, "logits/chosen": -0.14130058884620667, "logits/rejected": 0.028697943314909935, "logps/chosen": -1.3140071630477905, "logps/rejected": -1.3957020044326782, "loss": 1.314, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3140071630477905, "rewards/margins": 0.08169500529766083, "rewards/rejected": -1.3957020044326782, "step": 1445 }, { "epoch": 0.7760495066064559, "grad_norm": 6.819447466417209, "learning_rate": 9.252628226650389e-07, "logits/chosen": -0.05106180161237717, "logits/rejected": 0.02267330512404442, "logps/chosen": -1.196434497833252, "logps/rejected": -1.3058080673217773, "loss": 1.1964, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.196434497833252, "rewards/margins": 0.10937360674142838, "rewards/rejected": -1.3058080673217773, "step": 1450 }, { "epoch": 0.7787255393878575, "grad_norm": 7.078701679210401, "learning_rate": 9.244416706859321e-07, "logits/chosen": -0.04511256515979767, "logits/rejected": 0.08148322999477386, "logps/chosen": -1.2288661003112793, "logps/rejected": -1.404048204421997, "loss": 1.2289, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2288661003112793, "rewards/margins": 0.17518216371536255, "rewards/rejected": -1.404048204421997, "step": 1455 }, { "epoch": 0.7814015721692591, "grad_norm": 3.9101258132199144, "learning_rate": 9.23616400771875e-07, "logits/chosen": -0.04062981158494949, "logits/rejected": 0.07478814572095871, "logps/chosen": -1.1893526315689087, "logps/rejected": -1.3548520803451538, "loss": 1.1894, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1893526315689087, "rewards/margins": 0.16549938917160034, "rewards/rejected": -1.3548520803451538, "step": 1460 }, { "epoch": 0.7840776049506607, "grad_norm": 5.615704456398196, "learning_rate": 9.227870209296395e-07, "logits/chosen": -0.05048031732439995, "logits/rejected": 0.022416139021515846, "logps/chosen": -1.3129501342773438, "logps/rejected": -1.412232518196106, "loss": 1.313, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3129501342773438, "rewards/margins": 0.09928235411643982, "rewards/rejected": -1.412232518196106, "step": 1465 }, { "epoch": 0.7867536377320622, "grad_norm": 5.554634703533356, "learning_rate": 9.219535392058728e-07, "logits/chosen": -0.12648837268352509, "logits/rejected": -0.10139493644237518, "logps/chosen": -1.2724651098251343, "logps/rejected": -1.3184854984283447, "loss": 1.2725, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2724651098251343, "rewards/margins": 0.04602038115262985, "rewards/rejected": -1.3184854984283447, "step": 1470 }, { "epoch": 0.7894296705134638, "grad_norm": 4.558957014867374, "learning_rate": 9.211159636870181e-07, "logits/chosen": -0.10161463916301727, "logits/rejected": 0.031503576785326004, "logps/chosen": -1.2414968013763428, "logps/rejected": -1.3800835609436035, "loss": 1.2415, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2414968013763428, "rewards/margins": 0.13858669996261597, "rewards/rejected": -1.3800835609436035, "step": 1475 }, { "epoch": 0.7921057032948654, "grad_norm": 5.665580762760367, "learning_rate": 9.202743024992367e-07, "logits/chosen": -0.054920125752687454, "logits/rejected": 0.025017287582159042, "logps/chosen": -1.2238645553588867, "logps/rejected": -1.395124077796936, "loss": 1.2239, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2238645553588867, "rewards/margins": 0.17125944793224335, "rewards/rejected": -1.395124077796936, "step": 1480 }, { "epoch": 0.7947817360762669, "grad_norm": 6.441143128114766, "learning_rate": 9.194285638083293e-07, "logits/chosen": -0.03445065766572952, "logits/rejected": 0.08963262289762497, "logps/chosen": -1.2945940494537354, "logps/rejected": -1.4022890329360962, "loss": 1.2946, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2945940494537354, "rewards/margins": 0.1076948270201683, "rewards/rejected": -1.4022890329360962, "step": 1485 }, { "epoch": 0.7974577688576685, "grad_norm": 6.824278314942149, "learning_rate": 9.185787558196562e-07, "logits/chosen": -0.10010816901922226, "logits/rejected": -0.024402331560850143, "logps/chosen": -1.265430212020874, "logps/rejected": -1.2750635147094727, "loss": 1.2654, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -1.265430212020874, "rewards/margins": 0.009633231908082962, "rewards/rejected": -1.2750635147094727, "step": 1490 }, { "epoch": 0.8001338016390701, "grad_norm": 6.903970175921318, "learning_rate": 9.177248867780583e-07, "logits/chosen": -0.1157464012503624, "logits/rejected": -0.023532018065452576, "logps/chosen": -1.3466459512710571, "logps/rejected": -1.354551076889038, "loss": 1.3466, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3466459512710571, "rewards/margins": 0.00790502317249775, "rewards/rejected": -1.354551076889038, "step": 1495 }, { "epoch": 0.8028098344204716, "grad_norm": 7.738847183414445, "learning_rate": 9.168669649677769e-07, "logits/chosen": -0.11500082165002823, "logits/rejected": -0.03286829590797424, "logps/chosen": -1.2316514253616333, "logps/rejected": -1.3613814115524292, "loss": 1.2317, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2316514253616333, "rewards/margins": 0.12973028421401978, "rewards/rejected": -1.3613814115524292, "step": 1500 }, { "epoch": 0.8054858672018732, "grad_norm": 6.606627518527863, "learning_rate": 9.16004998712373e-07, "logits/chosen": -0.03680127114057541, "logits/rejected": -0.001992885721847415, "logps/chosen": -1.1888556480407715, "logps/rejected": -1.3889747858047485, "loss": 1.1889, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1888556480407715, "rewards/margins": 0.20011906325817108, "rewards/rejected": -1.3889747858047485, "step": 1505 }, { "epoch": 0.8081618999832748, "grad_norm": 5.645769985332795, "learning_rate": 9.151389963746472e-07, "logits/chosen": -0.12125668674707413, "logits/rejected": 0.08870793879032135, "logps/chosen": -1.29283607006073, "logps/rejected": -1.383994221687317, "loss": 1.2928, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.29283607006073, "rewards/margins": 0.09115798026323318, "rewards/rejected": -1.383994221687317, "step": 1510 }, { "epoch": 0.8108379327646764, "grad_norm": 5.632460250522089, "learning_rate": 9.142689663565577e-07, "logits/chosen": -0.06051849201321602, "logits/rejected": -0.011040596291422844, "logps/chosen": -1.2357362508773804, "logps/rejected": -1.3441052436828613, "loss": 1.2357, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2357362508773804, "rewards/margins": 0.10836899280548096, "rewards/rejected": -1.3441052436828613, "step": 1515 }, { "epoch": 0.8135139655460779, "grad_norm": 6.139778624200637, "learning_rate": 9.133949170991397e-07, "logits/chosen": -0.0676514059305191, "logits/rejected": -0.01244634110480547, "logps/chosen": -1.2722649574279785, "logps/rejected": -1.3779116868972778, "loss": 1.2723, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2722649574279785, "rewards/margins": 0.10564669221639633, "rewards/rejected": -1.3779116868972778, "step": 1520 }, { "epoch": 0.8161899983274795, "grad_norm": 6.367879324071448, "learning_rate": 9.125168570824231e-07, "logits/chosen": -0.06758121401071548, "logits/rejected": 0.06836263835430145, "logps/chosen": -1.254603624343872, "logps/rejected": -1.3010523319244385, "loss": 1.2546, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.254603624343872, "rewards/margins": 0.04644866660237312, "rewards/rejected": -1.3010523319244385, "step": 1525 }, { "epoch": 0.8188660311088811, "grad_norm": 7.300190344483854, "learning_rate": 9.116347948253496e-07, "logits/chosen": -0.11014936864376068, "logits/rejected": -0.03776741772890091, "logps/chosen": -1.2941062450408936, "logps/rejected": -1.375266194343567, "loss": 1.2941, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2941062450408936, "rewards/margins": 0.08115986734628677, "rewards/rejected": -1.375266194343567, "step": 1530 }, { "epoch": 0.8215420638902826, "grad_norm": 8.146212928161232, "learning_rate": 9.107487388856916e-07, "logits/chosen": -0.12379995733499527, "logits/rejected": 0.0010415881406515837, "logps/chosen": -1.2017667293548584, "logps/rejected": -1.3645521402359009, "loss": 1.2018, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2017667293548584, "rewards/margins": 0.1627853512763977, "rewards/rejected": -1.3645521402359009, "step": 1535 }, { "epoch": 0.8242180966716842, "grad_norm": 8.560787217389679, "learning_rate": 9.098586978599673e-07, "logits/chosen": -0.08004571497440338, "logits/rejected": 0.0380246676504612, "logps/chosen": -1.2490148544311523, "logps/rejected": -1.4283430576324463, "loss": 1.249, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2490148544311523, "rewards/margins": 0.17932820320129395, "rewards/rejected": -1.4283430576324463, "step": 1540 }, { "epoch": 0.8268941294530858, "grad_norm": 5.713202088526529, "learning_rate": 9.089646803833588e-07, "logits/chosen": -0.06852756440639496, "logits/rejected": 0.057437021285295486, "logps/chosen": -1.2512567043304443, "logps/rejected": -1.3144997358322144, "loss": 1.2513, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2512567043304443, "rewards/margins": 0.06324306130409241, "rewards/rejected": -1.3144997358322144, "step": 1545 }, { "epoch": 0.8295701622344873, "grad_norm": 6.946863556119406, "learning_rate": 9.080666951296276e-07, "logits/chosen": -0.18903210759162903, "logits/rejected": 0.016527706757187843, "logps/chosen": -1.31070077419281, "logps/rejected": -1.3833822011947632, "loss": 1.3107, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.31070077419281, "rewards/margins": 0.07268135994672775, "rewards/rejected": -1.3833822011947632, "step": 1550 }, { "epoch": 0.8322461950158889, "grad_norm": 4.306648496956122, "learning_rate": 9.071647508110305e-07, "logits/chosen": -0.14064452052116394, "logits/rejected": 0.04543847590684891, "logps/chosen": -1.333457112312317, "logps/rejected": -1.436434030532837, "loss": 1.3335, "rewards/accuracies": 0.5625, "rewards/chosen": -1.333457112312317, "rewards/margins": 0.10297715663909912, "rewards/rejected": -1.436434030532837, "step": 1555 }, { "epoch": 0.8349222277972905, "grad_norm": 6.700319051522781, "learning_rate": 9.062588561782354e-07, "logits/chosen": -0.05263591557741165, "logits/rejected": -0.0019503593211993575, "logps/chosen": -1.28522789478302, "logps/rejected": -1.4105018377304077, "loss": 1.2852, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.28522789478302, "rewards/margins": 0.1252739280462265, "rewards/rejected": -1.4105018377304077, "step": 1560 }, { "epoch": 0.8375982605786921, "grad_norm": 5.825796058327439, "learning_rate": 9.053490200202358e-07, "logits/chosen": -0.06337003409862518, "logits/rejected": 0.015892883762717247, "logps/chosen": -1.287670612335205, "logps/rejected": -1.3795655965805054, "loss": 1.2877, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.287670612335205, "rewards/margins": 0.09189485758543015, "rewards/rejected": -1.3795655965805054, "step": 1565 }, { "epoch": 0.8402742933600936, "grad_norm": 6.578604445113797, "learning_rate": 9.044352511642661e-07, "logits/chosen": -0.021803468465805054, "logits/rejected": -0.018277598544955254, "logps/chosen": -1.175083875656128, "logps/rejected": -1.2915709018707275, "loss": 1.1751, "rewards/accuracies": 0.53125, "rewards/chosen": -1.175083875656128, "rewards/margins": 0.11648701131343842, "rewards/rejected": -1.2915709018707275, "step": 1570 }, { "epoch": 0.8429503261414952, "grad_norm": 6.314769786901321, "learning_rate": 9.03517558475716e-07, "logits/chosen": -0.06093217059969902, "logits/rejected": 0.015973025932908058, "logps/chosen": -1.2683619260787964, "logps/rejected": -1.3031494617462158, "loss": 1.2684, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2683619260787964, "rewards/margins": 0.034787289798259735, "rewards/rejected": -1.3031494617462158, "step": 1575 }, { "epoch": 0.8456263589228968, "grad_norm": 6.039010129872386, "learning_rate": 9.025959508580436e-07, "logits/chosen": -0.029864314943552017, "logits/rejected": 0.15171386301517487, "logps/chosen": -1.2530847787857056, "logps/rejected": -1.3728817701339722, "loss": 1.2531, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2530847787857056, "rewards/margins": 0.11979691684246063, "rewards/rejected": -1.3728817701339722, "step": 1580 }, { "epoch": 0.8483023917042983, "grad_norm": 5.056772840448376, "learning_rate": 9.016704372526905e-07, "logits/chosen": -0.06780621409416199, "logits/rejected": 0.05078127980232239, "logps/chosen": -1.193649172782898, "logps/rejected": -1.4123687744140625, "loss": 1.1936, "rewards/accuracies": 0.59375, "rewards/chosen": -1.193649172782898, "rewards/margins": 0.21871964633464813, "rewards/rejected": -1.4123687744140625, "step": 1585 }, { "epoch": 0.8509784244856999, "grad_norm": 6.31437492331345, "learning_rate": 9.007410266389934e-07, "logits/chosen": -0.10749229043722153, "logits/rejected": -0.038525670766830444, "logps/chosen": -1.2099756002426147, "logps/rejected": -1.2933454513549805, "loss": 1.21, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2099756002426147, "rewards/margins": 0.08336982876062393, "rewards/rejected": -1.2933454513549805, "step": 1590 }, { "epoch": 0.8536544572671015, "grad_norm": 8.119544975129815, "learning_rate": 8.998077280340981e-07, "logits/chosen": -0.04768582433462143, "logits/rejected": 0.011029532179236412, "logps/chosen": -1.325527548789978, "logps/rejected": -1.2926559448242188, "loss": 1.3255, "rewards/accuracies": 0.46875, "rewards/chosen": -1.325527548789978, "rewards/margins": -0.032871540635824203, "rewards/rejected": -1.2926559448242188, "step": 1595 }, { "epoch": 0.8563304900485031, "grad_norm": 6.388617087592912, "learning_rate": 8.988705504928722e-07, "logits/chosen": -0.13687041401863098, "logits/rejected": 0.005404523573815823, "logps/chosen": -1.2991489171981812, "logps/rejected": -1.4347189664840698, "loss": 1.2991, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2991489171981812, "rewards/margins": 0.13557009398937225, "rewards/rejected": -1.4347189664840698, "step": 1600 }, { "epoch": 0.8563304900485031, "eval_logits/chosen": 0.1821470558643341, "eval_logits/rejected": 0.25188466906547546, "eval_logps/chosen": -1.276995301246643, "eval_logps/rejected": -1.3983421325683594, "eval_loss": 1.2772778272628784, "eval_rewards/accuracies": 0.5467358827590942, "eval_rewards/chosen": -1.276995301246643, "eval_rewards/margins": 0.12134693562984467, "eval_rewards/rejected": -1.3983421325683594, "eval_runtime": 40.2632, "eval_samples_per_second": 33.405, "eval_steps_per_second": 8.37, "step": 1600 }, { "epoch": 0.8590065228299046, "grad_norm": 4.735245382568011, "learning_rate": 8.979295031078157e-07, "logits/chosen": -0.14076590538024902, "logits/rejected": 0.03519008308649063, "logps/chosen": -1.2318437099456787, "logps/rejected": -1.4296140670776367, "loss": 1.2318, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2318437099456787, "rewards/margins": 0.19777017831802368, "rewards/rejected": -1.4296140670776367, "step": 1605 }, { "epoch": 0.8616825556113062, "grad_norm": 4.698679347160353, "learning_rate": 8.969845950089751e-07, "logits/chosen": -0.16697251796722412, "logits/rejected": -0.028089489787817, "logps/chosen": -1.2180730104446411, "logps/rejected": -1.3496923446655273, "loss": 1.2181, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2180730104446411, "rewards/margins": 0.1316193789243698, "rewards/rejected": -1.3496923446655273, "step": 1610 }, { "epoch": 0.8643585883927078, "grad_norm": 5.447005803587804, "learning_rate": 8.960358353638526e-07, "logits/chosen": -0.10555298626422882, "logits/rejected": -0.030538788065314293, "logps/chosen": -1.2814944982528687, "logps/rejected": -1.425047516822815, "loss": 1.2815, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2814944982528687, "rewards/margins": 0.14355280995368958, "rewards/rejected": -1.425047516822815, "step": 1615 }, { "epoch": 0.8670346211741093, "grad_norm": 6.1390101809103905, "learning_rate": 8.950832333773184e-07, "logits/chosen": -0.06340653449296951, "logits/rejected": 0.04452090710401535, "logps/chosen": -1.1495368480682373, "logps/rejected": -1.3900883197784424, "loss": 1.1495, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1495368480682373, "rewards/margins": 0.24055132269859314, "rewards/rejected": -1.3900883197784424, "step": 1620 }, { "epoch": 0.869710653955511, "grad_norm": 8.210732146661066, "learning_rate": 8.941267982915213e-07, "logits/chosen": -0.0067512271925807, "logits/rejected": 0.039842430502176285, "logps/chosen": -1.3221185207366943, "logps/rejected": -1.4353959560394287, "loss": 1.3221, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3221185207366943, "rewards/margins": 0.1132773607969284, "rewards/rejected": -1.4353959560394287, "step": 1625 }, { "epoch": 0.8723866867369126, "grad_norm": 5.908345580825539, "learning_rate": 8.931665393857983e-07, "logits/chosen": -0.04581168293952942, "logits/rejected": 0.06733135879039764, "logps/chosen": -1.2873187065124512, "logps/rejected": -1.321148157119751, "loss": 1.2873, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2873187065124512, "rewards/margins": 0.03382936865091324, "rewards/rejected": -1.321148157119751, "step": 1630 }, { "epoch": 0.875062719518314, "grad_norm": 6.266586704467301, "learning_rate": 8.922024659765861e-07, "logits/chosen": -0.12156398594379425, "logits/rejected": -0.03060215152800083, "logps/chosen": -1.19481360912323, "logps/rejected": -1.3301513195037842, "loss": 1.1948, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.19481360912323, "rewards/margins": 0.13533788919448853, "rewards/rejected": -1.3301513195037842, "step": 1635 }, { "epoch": 0.8777387522997157, "grad_norm": 6.23135472207158, "learning_rate": 8.912345874173288e-07, "logits/chosen": -0.10676511377096176, "logits/rejected": -0.030246105045080185, "logps/chosen": -1.1910871267318726, "logps/rejected": -1.3097288608551025, "loss": 1.1911, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1910871267318726, "rewards/margins": 0.11864177882671356, "rewards/rejected": -1.3097288608551025, "step": 1640 }, { "epoch": 0.8804147850811173, "grad_norm": 6.309209662987664, "learning_rate": 8.902629130983885e-07, "logits/chosen": -0.06323610246181488, "logits/rejected": -0.026661913841962814, "logps/chosen": -1.1885191202163696, "logps/rejected": -1.3279482126235962, "loss": 1.1885, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1885191202163696, "rewards/margins": 0.13942895829677582, "rewards/rejected": -1.3279482126235962, "step": 1645 }, { "epoch": 0.8830908178625189, "grad_norm": 7.417202776926713, "learning_rate": 8.892874524469537e-07, "logits/chosen": 0.00013731718354392797, "logits/rejected": 0.04801314324140549, "logps/chosen": -1.2191413640975952, "logps/rejected": -1.3895715475082397, "loss": 1.2191, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2191413640975952, "rewards/margins": 0.17043009400367737, "rewards/rejected": -1.3895715475082397, "step": 1650 }, { "epoch": 0.8857668506439204, "grad_norm": 6.849589371761208, "learning_rate": 8.883082149269478e-07, "logits/chosen": -0.0972549244761467, "logits/rejected": -0.019929762929677963, "logps/chosen": -1.2793024778366089, "logps/rejected": -1.3266713619232178, "loss": 1.2793, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2793024778366089, "rewards/margins": 0.047368861734867096, "rewards/rejected": -1.3266713619232178, "step": 1655 }, { "epoch": 0.888442883425322, "grad_norm": 5.695603178531794, "learning_rate": 8.873252100389377e-07, "logits/chosen": -0.05215173959732056, "logits/rejected": -0.06277383863925934, "logps/chosen": -1.2097136974334717, "logps/rejected": -1.3593628406524658, "loss": 1.2097, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2097136974334717, "rewards/margins": 0.1496490091085434, "rewards/rejected": -1.3593628406524658, "step": 1660 }, { "epoch": 0.8911189162067236, "grad_norm": 4.277298611391971, "learning_rate": 8.863384473200411e-07, "logits/chosen": -0.061174940317869186, "logits/rejected": -0.008703869767487049, "logps/chosen": -1.2792913913726807, "logps/rejected": -1.391127109527588, "loss": 1.2793, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2792913913726807, "rewards/margins": 0.11183595657348633, "rewards/rejected": -1.391127109527588, "step": 1665 }, { "epoch": 0.8937949489881251, "grad_norm": 4.981059874629846, "learning_rate": 8.853479363438342e-07, "logits/chosen": -0.027132153511047363, "logits/rejected": 0.10291673988103867, "logps/chosen": -1.2848248481750488, "logps/rejected": -1.3330994844436646, "loss": 1.2848, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2848248481750488, "rewards/margins": 0.04827471822500229, "rewards/rejected": -1.3330994844436646, "step": 1670 }, { "epoch": 0.8964709817695267, "grad_norm": 4.183654179416239, "learning_rate": 8.843536867202588e-07, "logits/chosen": -0.03513443097472191, "logits/rejected": 0.13269735872745514, "logps/chosen": -1.2873567342758179, "logps/rejected": -1.4724624156951904, "loss": 1.2874, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2873567342758179, "rewards/margins": 0.18510575592517853, "rewards/rejected": -1.4724624156951904, "step": 1675 }, { "epoch": 0.8991470145509283, "grad_norm": 5.542554297818384, "learning_rate": 8.833557080955292e-07, "logits/chosen": -0.12041264772415161, "logits/rejected": -0.03177953138947487, "logps/chosen": -1.3183209896087646, "logps/rejected": -1.4000449180603027, "loss": 1.3183, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3183209896087646, "rewards/margins": 0.0817238986492157, "rewards/rejected": -1.4000449180603027, "step": 1680 }, { "epoch": 0.9018230473323299, "grad_norm": 6.542406757887637, "learning_rate": 8.823540101520381e-07, "logits/chosen": -0.08090032637119293, "logits/rejected": 0.10924837738275528, "logps/chosen": -1.2607743740081787, "logps/rejected": -1.345850944519043, "loss": 1.2608, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2607743740081787, "rewards/margins": 0.08507661521434784, "rewards/rejected": -1.345850944519043, "step": 1685 }, { "epoch": 0.9044990801137314, "grad_norm": 5.552626214355011, "learning_rate": 8.813486026082637e-07, "logits/chosen": -0.08100254833698273, "logits/rejected": 0.07103855907917023, "logps/chosen": -1.19069504737854, "logps/rejected": -1.330545425415039, "loss": 1.1907, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.19069504737854, "rewards/margins": 0.13985039293766022, "rewards/rejected": -1.330545425415039, "step": 1690 }, { "epoch": 0.907175112895133, "grad_norm": 7.944246115042764, "learning_rate": 8.803394952186742e-07, "logits/chosen": -0.19452674686908722, "logits/rejected": -0.08147311210632324, "logps/chosen": -1.2961623668670654, "logps/rejected": -1.3727985620498657, "loss": 1.2962, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2961623668670654, "rewards/margins": 0.07663621008396149, "rewards/rejected": -1.3727985620498657, "step": 1695 }, { "epoch": 0.9098511456765346, "grad_norm": 6.504118771637476, "learning_rate": 8.793266977736342e-07, "logits/chosen": -0.050820160657167435, "logits/rejected": -0.08396679908037186, "logps/chosen": -1.302768349647522, "logps/rejected": -1.3204041719436646, "loss": 1.3028, "rewards/accuracies": 0.46875, "rewards/chosen": -1.302768349647522, "rewards/margins": 0.017636047676205635, "rewards/rejected": -1.3204041719436646, "step": 1700 }, { "epoch": 0.9125271784579361, "grad_norm": 5.985170164847533, "learning_rate": 8.783102200993085e-07, "logits/chosen": -0.03008859232068062, "logits/rejected": 0.08918481320142746, "logps/chosen": -1.2735366821289062, "logps/rejected": -1.3343466520309448, "loss": 1.2735, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2735366821289062, "rewards/margins": 0.06081010028719902, "rewards/rejected": -1.3343466520309448, "step": 1705 }, { "epoch": 0.9152032112393377, "grad_norm": 6.551908588879202, "learning_rate": 8.772900720575683e-07, "logits/chosen": -0.07526618242263794, "logits/rejected": -0.008849019184708595, "logps/chosen": -1.2107923030853271, "logps/rejected": -1.326629877090454, "loss": 1.2108, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2107923030853271, "rewards/margins": 0.11583763360977173, "rewards/rejected": -1.326629877090454, "step": 1710 }, { "epoch": 0.9178792440207393, "grad_norm": 5.1445002463604625, "learning_rate": 8.762662635458944e-07, "logits/chosen": -0.06757532805204391, "logits/rejected": 0.10803943872451782, "logps/chosen": -1.3022701740264893, "logps/rejected": -1.3407765626907349, "loss": 1.3023, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3022701740264893, "rewards/margins": 0.038506366312503815, "rewards/rejected": -1.3407765626907349, "step": 1715 }, { "epoch": 0.9205552768021408, "grad_norm": 4.47839524121856, "learning_rate": 8.752388044972811e-07, "logits/chosen": -0.09737445414066315, "logits/rejected": -0.05453004315495491, "logps/chosen": -1.1396570205688477, "logps/rejected": -1.3529913425445557, "loss": 1.1397, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1396570205688477, "rewards/margins": 0.21333429217338562, "rewards/rejected": -1.3529913425445557, "step": 1720 }, { "epoch": 0.9232313095835424, "grad_norm": 5.606686171299104, "learning_rate": 8.74207704880141e-07, "logits/chosen": -0.12940481305122375, "logits/rejected": -0.05386684089899063, "logps/chosen": -1.2944468259811401, "logps/rejected": -1.4468432664871216, "loss": 1.2944, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2944468259811401, "rewards/margins": 0.15239642560482025, "rewards/rejected": -1.4468432664871216, "step": 1725 }, { "epoch": 0.925907342364944, "grad_norm": 6.940521384958994, "learning_rate": 8.731729746982068e-07, "logits/chosen": 0.011741559021174908, "logits/rejected": 0.045678503811359406, "logps/chosen": -1.2583701610565186, "logps/rejected": -1.3183516263961792, "loss": 1.2584, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2583701610565186, "rewards/margins": 0.05998154729604721, "rewards/rejected": -1.3183516263961792, "step": 1730 }, { "epoch": 0.9285833751463456, "grad_norm": 5.245568734986748, "learning_rate": 8.721346239904355e-07, "logits/chosen": -0.14888884127140045, "logits/rejected": -0.026350850239396095, "logps/chosen": -1.1945531368255615, "logps/rejected": -1.4894192218780518, "loss": 1.1946, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1945531368255615, "rewards/margins": 0.29486608505249023, "rewards/rejected": -1.4894192218780518, "step": 1735 }, { "epoch": 0.9312594079277471, "grad_norm": 5.409093242808978, "learning_rate": 8.710926628309101e-07, "logits/chosen": -0.11007776111364365, "logits/rejected": -0.02097008004784584, "logps/chosen": -1.244447946548462, "logps/rejected": -1.3265798091888428, "loss": 1.2444, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.244447946548462, "rewards/margins": 0.0821317657828331, "rewards/rejected": -1.3265798091888428, "step": 1740 }, { "epoch": 0.9339354407091487, "grad_norm": 5.461789273061017, "learning_rate": 8.700471013287424e-07, "logits/chosen": -0.04357282817363739, "logits/rejected": -0.03339068219065666, "logps/chosen": -1.2384790182113647, "logps/rejected": -1.339045763015747, "loss": 1.2385, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2384790182113647, "rewards/margins": 0.1005668193101883, "rewards/rejected": -1.339045763015747, "step": 1745 }, { "epoch": 0.9366114734905503, "grad_norm": 9.840609941598094, "learning_rate": 8.689979496279746e-07, "logits/chosen": -0.09472332149744034, "logits/rejected": -0.0404960997402668, "logps/chosen": -1.2143981456756592, "logps/rejected": -1.4122915267944336, "loss": 1.2144, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2143981456756592, "rewards/margins": 0.19789329171180725, "rewards/rejected": -1.4122915267944336, "step": 1750 }, { "epoch": 0.9392875062719518, "grad_norm": 4.839149114001726, "learning_rate": 8.679452179074811e-07, "logits/chosen": -0.10445569455623627, "logits/rejected": -0.046605974435806274, "logps/chosen": -1.220332145690918, "logps/rejected": -1.301499605178833, "loss": 1.2203, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.220332145690918, "rewards/margins": 0.08116749674081802, "rewards/rejected": -1.301499605178833, "step": 1755 }, { "epoch": 0.9419635390533534, "grad_norm": 5.195241913409005, "learning_rate": 8.668889163808698e-07, "logits/chosen": -0.03786931931972504, "logits/rejected": 0.045100707560777664, "logps/chosen": -1.2042738199234009, "logps/rejected": -1.341800332069397, "loss": 1.2043, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2042738199234009, "rewards/margins": 0.13752654194831848, "rewards/rejected": -1.341800332069397, "step": 1760 }, { "epoch": 0.944639571834755, "grad_norm": 5.616194257875042, "learning_rate": 8.658290552963827e-07, "logits/chosen": -0.03410481661558151, "logits/rejected": -0.016979102045297623, "logps/chosen": -1.2418986558914185, "logps/rejected": -1.3612347841262817, "loss": 1.2419, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2418986558914185, "rewards/margins": 0.11933620274066925, "rewards/rejected": -1.3612347841262817, "step": 1765 }, { "epoch": 0.9473156046161565, "grad_norm": 5.462217088955761, "learning_rate": 8.647656449367966e-07, "logits/chosen": -0.045741885900497437, "logits/rejected": 0.08472435176372528, "logps/chosen": -1.2653464078903198, "logps/rejected": -1.3059977293014526, "loss": 1.2653, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2653464078903198, "rewards/margins": 0.04065137356519699, "rewards/rejected": -1.3059977293014526, "step": 1770 }, { "epoch": 0.9499916373975581, "grad_norm": 5.683554409135229, "learning_rate": 8.636986956193235e-07, "logits/chosen": -0.11588531732559204, "logits/rejected": -0.06258896738290787, "logps/chosen": -1.2088450193405151, "logps/rejected": -1.312302827835083, "loss": 1.2088, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2088450193405151, "rewards/margins": 0.1034577339887619, "rewards/rejected": -1.312302827835083, "step": 1775 }, { "epoch": 0.9526676701789597, "grad_norm": 5.586562160029373, "learning_rate": 8.626282176955104e-07, "logits/chosen": -0.11374504864215851, "logits/rejected": -0.021721968427300453, "logps/chosen": -1.2366387844085693, "logps/rejected": -1.3727188110351562, "loss": 1.2366, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2366387844085693, "rewards/margins": 0.13607999682426453, "rewards/rejected": -1.3727188110351562, "step": 1780 }, { "epoch": 0.9553437029603613, "grad_norm": 5.638531277685169, "learning_rate": 8.615542215511389e-07, "logits/chosen": -0.025556201115250587, "logits/rejected": 0.02849406562745571, "logps/chosen": -1.1726168394088745, "logps/rejected": -1.2570855617523193, "loss": 1.1726, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1726168394088745, "rewards/margins": 0.08446873724460602, "rewards/rejected": -1.2570855617523193, "step": 1785 }, { "epoch": 0.9580197357417628, "grad_norm": 5.75169059694091, "learning_rate": 8.604767176061241e-07, "logits/chosen": -0.01064176857471466, "logits/rejected": 0.011920974589884281, "logps/chosen": -1.2589858770370483, "logps/rejected": -1.3532261848449707, "loss": 1.259, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2589858770370483, "rewards/margins": 0.09424014389514923, "rewards/rejected": -1.3532261848449707, "step": 1790 }, { "epoch": 0.9606957685231644, "grad_norm": 5.841959941562687, "learning_rate": 8.593957163144141e-07, "logits/chosen": -0.13006284832954407, "logits/rejected": -0.027362415567040443, "logps/chosen": -1.237945556640625, "logps/rejected": -1.3492916822433472, "loss": 1.2379, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.237945556640625, "rewards/margins": 0.11134610325098038, "rewards/rejected": -1.3492916822433472, "step": 1795 }, { "epoch": 0.963371801304566, "grad_norm": 4.704268181021579, "learning_rate": 8.58311228163888e-07, "logits/chosen": -0.0367184579372406, "logits/rejected": 0.0108641954138875, "logps/chosen": -1.216020107269287, "logps/rejected": -1.2973930835723877, "loss": 1.216, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.216020107269287, "rewards/margins": 0.08137305825948715, "rewards/rejected": -1.2973930835723877, "step": 1800 }, { "epoch": 0.9660478340859675, "grad_norm": 5.114922396446377, "learning_rate": 8.57223263676255e-07, "logits/chosen": -0.15045243501663208, "logits/rejected": -0.05076800659298897, "logps/chosen": -1.1804518699645996, "logps/rejected": -1.3439854383468628, "loss": 1.1805, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1804518699645996, "rewards/margins": 0.16353367269039154, "rewards/rejected": -1.3439854383468628, "step": 1805 }, { "epoch": 0.9687238668673691, "grad_norm": 5.2089972186727636, "learning_rate": 8.561318334069511e-07, "logits/chosen": -0.05074445158243179, "logits/rejected": 0.06447550654411316, "logps/chosen": -1.2121670246124268, "logps/rejected": -1.292636752128601, "loss": 1.2122, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2121670246124268, "rewards/margins": 0.08046970516443253, "rewards/rejected": -1.292636752128601, "step": 1810 }, { "epoch": 0.9713998996487707, "grad_norm": 4.815440420782523, "learning_rate": 8.550369479450375e-07, "logits/chosen": -0.08415403217077255, "logits/rejected": 0.0044464608654379845, "logps/chosen": -1.2383768558502197, "logps/rejected": -1.3050470352172852, "loss": 1.2384, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2383768558502197, "rewards/margins": 0.06667017936706543, "rewards/rejected": -1.3050470352172852, "step": 1815 }, { "epoch": 0.9740759324301723, "grad_norm": 6.742398562069638, "learning_rate": 8.539386179130977e-07, "logits/chosen": -0.07077566534280777, "logits/rejected": -0.01992574892938137, "logps/chosen": -1.2198375463485718, "logps/rejected": -1.3103171586990356, "loss": 1.2198, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2198375463485718, "rewards/margins": 0.09047947078943253, "rewards/rejected": -1.3103171586990356, "step": 1820 }, { "epoch": 0.9767519652115738, "grad_norm": 6.524129146934425, "learning_rate": 8.528368539671347e-07, "logits/chosen": -0.1257590651512146, "logits/rejected": -0.016526367515325546, "logps/chosen": -1.2119697332382202, "logps/rejected": -1.3133504390716553, "loss": 1.212, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2119697332382202, "rewards/margins": 0.10138066858053207, "rewards/rejected": -1.3133504390716553, "step": 1825 }, { "epoch": 0.9794279979929754, "grad_norm": 4.232718950814958, "learning_rate": 8.51731666796467e-07, "logits/chosen": 0.010033226571977139, "logits/rejected": 0.02543255314230919, "logps/chosen": -1.2939362525939941, "logps/rejected": -1.3068492412567139, "loss": 1.2939, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2939362525939941, "rewards/margins": 0.012912733480334282, "rewards/rejected": -1.3068492412567139, "step": 1830 }, { "epoch": 0.982104030774377, "grad_norm": 6.718364846914956, "learning_rate": 8.506230671236254e-07, "logits/chosen": -0.08272115886211395, "logits/rejected": -0.04296498745679855, "logps/chosen": -1.2445580959320068, "logps/rejected": -1.2526527643203735, "loss": 1.2446, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2445580959320068, "rewards/margins": 0.008094715885818005, "rewards/rejected": -1.2526527643203735, "step": 1835 }, { "epoch": 0.9847800635557785, "grad_norm": 5.711669665916067, "learning_rate": 8.495110657042488e-07, "logits/chosen": -0.06293636560440063, "logits/rejected": 0.01734377257525921, "logps/chosen": -1.2451443672180176, "logps/rejected": -1.4016882181167603, "loss": 1.2451, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2451443672180176, "rewards/margins": 0.15654362738132477, "rewards/rejected": -1.4016882181167603, "step": 1840 }, { "epoch": 0.9874560963371801, "grad_norm": 7.821760664707275, "learning_rate": 8.483956733269799e-07, "logits/chosen": -0.09410299360752106, "logits/rejected": -0.028073180466890335, "logps/chosen": -1.2679402828216553, "logps/rejected": -1.2864809036254883, "loss": 1.2679, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2679402828216553, "rewards/margins": 0.018540600314736366, "rewards/rejected": -1.2864809036254883, "step": 1845 }, { "epoch": 0.9901321291185817, "grad_norm": 5.872976411084659, "learning_rate": 8.472769008133602e-07, "logits/chosen": -0.2226693332195282, "logits/rejected": -0.1276126652956009, "logps/chosen": -1.2789442539215088, "logps/rejected": -1.2547917366027832, "loss": 1.2789, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2789442539215088, "rewards/margins": -0.024152468889951706, "rewards/rejected": -1.2547917366027832, "step": 1850 }, { "epoch": 0.9928081618999832, "grad_norm": 5.90468243263659, "learning_rate": 8.461547590177259e-07, "logits/chosen": -0.0903405100107193, "logits/rejected": -0.02273627370595932, "logps/chosen": -1.206971526145935, "logps/rejected": -1.3094390630722046, "loss": 1.207, "rewards/accuracies": 0.5, "rewards/chosen": -1.206971526145935, "rewards/margins": 0.10246758162975311, "rewards/rejected": -1.3094390630722046, "step": 1855 }, { "epoch": 0.9954841946813848, "grad_norm": 6.122957880655185, "learning_rate": 8.450292588271014e-07, "logits/chosen": -0.046001214534044266, "logits/rejected": 0.00991774071007967, "logps/chosen": -1.274603247642517, "logps/rejected": -1.347770094871521, "loss": 1.2746, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.274603247642517, "rewards/margins": 0.07316682487726212, "rewards/rejected": -1.347770094871521, "step": 1860 }, { "epoch": 0.9981602274627864, "grad_norm": 5.754804457180054, "learning_rate": 8.439004111610945e-07, "logits/chosen": -0.10217146575450897, "logits/rejected": -0.049587082117795944, "logps/chosen": -1.1358320713043213, "logps/rejected": -1.3509684801101685, "loss": 1.1358, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1358320713043213, "rewards/margins": 0.21513637900352478, "rewards/rejected": -1.3509684801101685, "step": 1865 }, { "epoch": 1.000836260244188, "grad_norm": 5.557157154590046, "learning_rate": 8.427682269717901e-07, "logits/chosen": -0.13997718691825867, "logits/rejected": -0.01986132189631462, "logps/chosen": -1.3003907203674316, "logps/rejected": -1.3127762079238892, "loss": 1.3004, "rewards/accuracies": 0.46875, "rewards/chosen": -1.3003907203674316, "rewards/margins": 0.012385375797748566, "rewards/rejected": -1.3127762079238892, "step": 1870 }, { "epoch": 1.0035122930255895, "grad_norm": 5.526326582994575, "learning_rate": 8.416327172436446e-07, "logits/chosen": -0.1720302700996399, "logits/rejected": -0.06906794011592865, "logps/chosen": -1.2664414644241333, "logps/rejected": -1.3237035274505615, "loss": 1.2664, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2664414644241333, "rewards/margins": 0.057262081652879715, "rewards/rejected": -1.3237035274505615, "step": 1875 }, { "epoch": 1.0061883258069912, "grad_norm": 5.775892094913355, "learning_rate": 8.404938929933778e-07, "logits/chosen": -0.07564069330692291, "logits/rejected": 0.0598999448120594, "logps/chosen": -1.21278977394104, "logps/rejected": -1.4452004432678223, "loss": 1.2128, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.21278977394104, "rewards/margins": 0.2324107587337494, "rewards/rejected": -1.4452004432678223, "step": 1880 }, { "epoch": 1.0088643585883927, "grad_norm": 6.996202004178738, "learning_rate": 8.39351765269868e-07, "logits/chosen": -0.11429326236248016, "logits/rejected": -0.05505236238241196, "logps/chosen": -1.1682350635528564, "logps/rejected": -1.31624436378479, "loss": 1.1682, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1682350635528564, "rewards/margins": 0.1480093002319336, "rewards/rejected": -1.31624436378479, "step": 1885 }, { "epoch": 1.0115403913697942, "grad_norm": 4.749750528031102, "learning_rate": 8.382063451540431e-07, "logits/chosen": -0.12330254167318344, "logits/rejected": 0.037967100739479065, "logps/chosen": -1.240007758140564, "logps/rejected": -1.3562365770339966, "loss": 1.24, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.240007758140564, "rewards/margins": 0.11622867733240128, "rewards/rejected": -1.3562365770339966, "step": 1890 }, { "epoch": 1.014216424151196, "grad_norm": 5.341449909625558, "learning_rate": 8.370576437587742e-07, "logits/chosen": -0.0787450447678566, "logits/rejected": -0.03384889289736748, "logps/chosen": -1.2105351686477661, "logps/rejected": -1.3320449590682983, "loss": 1.2105, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2105351686477661, "rewards/margins": 0.1215098649263382, "rewards/rejected": -1.3320449590682983, "step": 1895 }, { "epoch": 1.0168924569325974, "grad_norm": 5.129133812769112, "learning_rate": 8.359056722287674e-07, "logits/chosen": -0.1566084921360016, "logits/rejected": 0.04477376118302345, "logps/chosen": -1.2508774995803833, "logps/rejected": -1.3304204940795898, "loss": 1.2509, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2508774995803833, "rewards/margins": 0.0795430913567543, "rewards/rejected": -1.3304204940795898, "step": 1900 }, { "epoch": 1.019568489713999, "grad_norm": 5.946648010465682, "learning_rate": 8.347504417404553e-07, "logits/chosen": -0.10479019582271576, "logits/rejected": 0.008218446746468544, "logps/chosen": -1.2664822340011597, "logps/rejected": -1.352635145187378, "loss": 1.2665, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2664822340011597, "rewards/margins": 0.08615298569202423, "rewards/rejected": -1.352635145187378, "step": 1905 }, { "epoch": 1.0222445224954007, "grad_norm": 5.941931622772653, "learning_rate": 8.335919635018893e-07, "logits/chosen": -0.17767277359962463, "logits/rejected": -0.07711903005838394, "logps/chosen": -1.2411893606185913, "logps/rejected": -1.3731926679611206, "loss": 1.2412, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2411893606185913, "rewards/margins": 0.13200335204601288, "rewards/rejected": -1.3731926679611206, "step": 1910 }, { "epoch": 1.0249205552768021, "grad_norm": 4.5020963608437805, "learning_rate": 8.324302487526303e-07, "logits/chosen": -0.11935059726238251, "logits/rejected": -0.06446530669927597, "logps/chosen": -1.2012693881988525, "logps/rejected": -1.277234435081482, "loss": 1.2013, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2012693881988525, "rewards/margins": 0.07596506178379059, "rewards/rejected": -1.277234435081482, "step": 1915 }, { "epoch": 1.0275965880582036, "grad_norm": 4.9870798239878305, "learning_rate": 8.312653087636398e-07, "logits/chosen": -0.12934158742427826, "logits/rejected": -0.06960586458444595, "logps/chosen": -1.1279112100601196, "logps/rejected": -1.2835350036621094, "loss": 1.1279, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1279112100601196, "rewards/margins": 0.1556238979101181, "rewards/rejected": -1.2835350036621094, "step": 1920 }, { "epoch": 1.0302726208396054, "grad_norm": 6.040563207942298, "learning_rate": 8.300971548371711e-07, "logits/chosen": -0.25389719009399414, "logits/rejected": -0.0903138667345047, "logps/chosen": -1.3106086254119873, "logps/rejected": -1.359699010848999, "loss": 1.3106, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3106086254119873, "rewards/margins": 0.04909026622772217, "rewards/rejected": -1.359699010848999, "step": 1925 }, { "epoch": 1.0329486536210069, "grad_norm": 5.600753926473691, "learning_rate": 8.289257983066582e-07, "logits/chosen": -0.16796551644802094, "logits/rejected": -0.058437447994947433, "logps/chosen": -1.1679617166519165, "logps/rejected": -1.3298192024230957, "loss": 1.168, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1679617166519165, "rewards/margins": 0.16185753047466278, "rewards/rejected": -1.3298192024230957, "step": 1930 }, { "epoch": 1.0356246864024083, "grad_norm": 4.911939588104228, "learning_rate": 8.277512505366077e-07, "logits/chosen": -0.1981288641691208, "logits/rejected": -0.05761311203241348, "logps/chosen": -1.2370867729187012, "logps/rejected": -1.3515924215316772, "loss": 1.2371, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2370867729187012, "rewards/margins": 0.11450580507516861, "rewards/rejected": -1.3515924215316772, "step": 1935 }, { "epoch": 1.03830071918381, "grad_norm": 6.005051909782746, "learning_rate": 8.265735229224868e-07, "logits/chosen": -0.10568974912166595, "logits/rejected": -0.026971304789185524, "logps/chosen": -1.2397441864013672, "logps/rejected": -1.3684988021850586, "loss": 1.2397, "rewards/accuracies": 0.5, "rewards/chosen": -1.2397441864013672, "rewards/margins": 0.12875457108020782, "rewards/rejected": -1.3684988021850586, "step": 1940 }, { "epoch": 1.0409767519652116, "grad_norm": 6.806356985302508, "learning_rate": 8.253926268906144e-07, "logits/chosen": -0.19674186408519745, "logits/rejected": -0.06142623350024223, "logps/chosen": -1.2262510061264038, "logps/rejected": -1.3155097961425781, "loss": 1.2263, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2262510061264038, "rewards/margins": 0.08925871551036835, "rewards/rejected": -1.3155097961425781, "step": 1945 }, { "epoch": 1.043652784746613, "grad_norm": 5.541500657040428, "learning_rate": 8.242085738980487e-07, "logits/chosen": -0.11930736154317856, "logits/rejected": 0.0463838130235672, "logps/chosen": -1.271257758140564, "logps/rejected": -1.3485958576202393, "loss": 1.2713, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.271257758140564, "rewards/margins": 0.0773380920290947, "rewards/rejected": -1.3485958576202393, "step": 1950 }, { "epoch": 1.0463288175280148, "grad_norm": 4.815100591799167, "learning_rate": 8.230213754324772e-07, "logits/chosen": -0.08058112859725952, "logits/rejected": -0.027586277574300766, "logps/chosen": -1.166109323501587, "logps/rejected": -1.3443069458007812, "loss": 1.1661, "rewards/accuracies": 0.59375, "rewards/chosen": -1.166109323501587, "rewards/margins": 0.17819759249687195, "rewards/rejected": -1.3443069458007812, "step": 1955 }, { "epoch": 1.0490048503094163, "grad_norm": 5.050481542881479, "learning_rate": 8.218310430121045e-07, "logits/chosen": -0.16489727795124054, "logits/rejected": -0.14600029587745667, "logps/chosen": -1.2210534811019897, "logps/rejected": -1.3212467432022095, "loss": 1.2211, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2210534811019897, "rewards/margins": 0.10019324719905853, "rewards/rejected": -1.3212467432022095, "step": 1960 }, { "epoch": 1.051680883090818, "grad_norm": 5.817573384871135, "learning_rate": 8.20637588185541e-07, "logits/chosen": -0.0938141718506813, "logits/rejected": -0.04554055631160736, "logps/chosen": -1.1696784496307373, "logps/rejected": -1.4061262607574463, "loss": 1.1697, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1696784496307373, "rewards/margins": 0.23644773662090302, "rewards/rejected": -1.4061262607574463, "step": 1965 }, { "epoch": 1.0543569158722195, "grad_norm": 6.444734968572768, "learning_rate": 8.194410225316906e-07, "logits/chosen": -0.1442783623933792, "logits/rejected": -0.04565654322504997, "logps/chosen": -1.2340043783187866, "logps/rejected": -1.3731390237808228, "loss": 1.234, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2340043783187866, "rewards/margins": 0.13913467526435852, "rewards/rejected": -1.3731390237808228, "step": 1970 }, { "epoch": 1.057032948653621, "grad_norm": 6.679133599028149, "learning_rate": 8.182413576596385e-07, "logits/chosen": -0.06903968006372452, "logits/rejected": 0.0033228560350835323, "logps/chosen": -1.167856216430664, "logps/rejected": -1.3179877996444702, "loss": 1.1679, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.167856216430664, "rewards/margins": 0.15013161301612854, "rewards/rejected": -1.3179877996444702, "step": 1975 }, { "epoch": 1.0597089814350227, "grad_norm": 6.316163133775638, "learning_rate": 8.170386052085389e-07, "logits/chosen": -0.017253737896680832, "logits/rejected": 0.08609184622764587, "logps/chosen": -1.2378559112548828, "logps/rejected": -1.3601268529891968, "loss": 1.2379, "rewards/accuracies": 0.5, "rewards/chosen": -1.2378559112548828, "rewards/margins": 0.12227091938257217, "rewards/rejected": -1.3601268529891968, "step": 1980 }, { "epoch": 1.0623850142164242, "grad_norm": 7.121883865323607, "learning_rate": 8.158327768475008e-07, "logits/chosen": -0.11321721971035004, "logits/rejected": 0.011237586848437786, "logps/chosen": -1.2581787109375, "logps/rejected": -1.306822419166565, "loss": 1.2582, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2581787109375, "rewards/margins": 0.048643629997968674, "rewards/rejected": -1.306822419166565, "step": 1985 }, { "epoch": 1.0650610469978257, "grad_norm": 6.362574380780379, "learning_rate": 8.146238842754767e-07, "logits/chosen": -0.12910301983356476, "logits/rejected": -0.05113428831100464, "logps/chosen": -1.2621046304702759, "logps/rejected": -1.3804562091827393, "loss": 1.2621, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2621046304702759, "rewards/margins": 0.11835171282291412, "rewards/rejected": -1.3804562091827393, "step": 1990 }, { "epoch": 1.0677370797792274, "grad_norm": 6.969345214011902, "learning_rate": 8.134119392211476e-07, "logits/chosen": -0.02228573150932789, "logits/rejected": 0.09377633035182953, "logps/chosen": -1.1832574605941772, "logps/rejected": -1.3928004503250122, "loss": 1.1833, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1832574605941772, "rewards/margins": 0.20954307913780212, "rewards/rejected": -1.3928004503250122, "step": 1995 }, { "epoch": 1.0704131125606289, "grad_norm": 6.439054641342165, "learning_rate": 8.121969534428094e-07, "logits/chosen": -0.13532628118991852, "logits/rejected": -0.0066508143208920956, "logps/chosen": -1.261492133140564, "logps/rejected": -1.2661584615707397, "loss": 1.2615, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.261492133140564, "rewards/margins": 0.004666291177272797, "rewards/rejected": -1.2661584615707397, "step": 2000 }, { "epoch": 1.0704131125606289, "eval_logits/chosen": 0.12802423536777496, "eval_logits/rejected": 0.19496504962444305, "eval_logps/chosen": -1.2724194526672363, "eval_logps/rejected": -1.3954881429672241, "eval_loss": 1.2726988792419434, "eval_rewards/accuracies": 0.5489614009857178, "eval_rewards/chosen": -1.2724194526672363, "eval_rewards/margins": 0.12306871265172958, "eval_rewards/rejected": -1.3954881429672241, "eval_runtime": 40.2145, "eval_samples_per_second": 33.446, "eval_steps_per_second": 8.38, "step": 2000 }, { "epoch": 1.0730891453420304, "grad_norm": 6.674951450033595, "learning_rate": 8.109789387282599e-07, "logits/chosen": -0.09415560960769653, "logits/rejected": -0.037315141409635544, "logps/chosen": -1.2722346782684326, "logps/rejected": -1.3050833940505981, "loss": 1.2722, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2722346782684326, "rewards/margins": 0.032848477363586426, "rewards/rejected": -1.3050833940505981, "step": 2005 }, { "epoch": 1.075765178123432, "grad_norm": 6.297359321992773, "learning_rate": 8.097579068946827e-07, "logits/chosen": -0.05582252889871597, "logits/rejected": 0.021954774856567383, "logps/chosen": -1.2019598484039307, "logps/rejected": -1.3077048063278198, "loss": 1.202, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2019598484039307, "rewards/margins": 0.10574493557214737, "rewards/rejected": -1.3077048063278198, "step": 2010 }, { "epoch": 1.0784412109048336, "grad_norm": 6.291055934588807, "learning_rate": 8.085338697885344e-07, "logits/chosen": -0.12539181113243103, "logits/rejected": 0.003695555031299591, "logps/chosen": -1.1687771081924438, "logps/rejected": -1.3020981550216675, "loss": 1.1688, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1687771081924438, "rewards/margins": 0.133321151137352, "rewards/rejected": -1.3020981550216675, "step": 2015 }, { "epoch": 1.081117243686235, "grad_norm": 4.842155821906766, "learning_rate": 8.073068392854282e-07, "logits/chosen": -0.12109933793544769, "logits/rejected": -0.00026371702551841736, "logps/chosen": -1.2823290824890137, "logps/rejected": -1.4024314880371094, "loss": 1.2823, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2823290824890137, "rewards/margins": 0.12010233104228973, "rewards/rejected": -1.4024314880371094, "step": 2020 }, { "epoch": 1.0837932764676368, "grad_norm": 5.594255885171201, "learning_rate": 8.060768272900193e-07, "logits/chosen": -0.010961165651679039, "logits/rejected": 0.09105803072452545, "logps/chosen": -1.2343180179595947, "logps/rejected": -1.353542685508728, "loss": 1.2343, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2343180179595947, "rewards/margins": 0.11922464519739151, "rewards/rejected": -1.353542685508728, "step": 2025 }, { "epoch": 1.0864693092490383, "grad_norm": 4.340652566984723, "learning_rate": 8.0484384573589e-07, "logits/chosen": -0.1356695294380188, "logits/rejected": -0.11846984922885895, "logps/chosen": -1.1785694360733032, "logps/rejected": -1.3212659358978271, "loss": 1.1786, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1785694360733032, "rewards/margins": 0.14269661903381348, "rewards/rejected": -1.3212659358978271, "step": 2030 }, { "epoch": 1.0891453420304398, "grad_norm": 6.27722334109344, "learning_rate": 8.03607906585432e-07, "logits/chosen": -0.1609405130147934, "logits/rejected": -0.02669568732380867, "logps/chosen": -1.1802971363067627, "logps/rejected": -1.293784260749817, "loss": 1.1803, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1802971363067627, "rewards/margins": 0.11348716169595718, "rewards/rejected": -1.293784260749817, "step": 2035 }, { "epoch": 1.0918213748118415, "grad_norm": 7.614514711247098, "learning_rate": 8.023690218297329e-07, "logits/chosen": -0.17724788188934326, "logits/rejected": -0.1332141011953354, "logps/chosen": -1.2045015096664429, "logps/rejected": -1.2850162982940674, "loss": 1.2045, "rewards/accuracies": 0.5, "rewards/chosen": -1.2045015096664429, "rewards/margins": 0.08051472902297974, "rewards/rejected": -1.2850162982940674, "step": 2040 }, { "epoch": 1.094497407593243, "grad_norm": 6.664733125822478, "learning_rate": 8.01127203488458e-07, "logits/chosen": -0.09104237705469131, "logits/rejected": -0.05842696502804756, "logps/chosen": -1.210482120513916, "logps/rejected": -1.3371615409851074, "loss": 1.2105, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.210482120513916, "rewards/margins": 0.12667939066886902, "rewards/rejected": -1.3371615409851074, "step": 2045 }, { "epoch": 1.0971734403746445, "grad_norm": 4.634522926947915, "learning_rate": 7.998824636097339e-07, "logits/chosen": -0.17407886683940887, "logits/rejected": -0.098227359354496, "logps/chosen": -1.2487553358078003, "logps/rejected": -1.279837965965271, "loss": 1.2488, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2487553358078003, "rewards/margins": 0.031082715839147568, "rewards/rejected": -1.279837965965271, "step": 2050 }, { "epoch": 1.0998494731560462, "grad_norm": 5.528664679067533, "learning_rate": 7.986348142700328e-07, "logits/chosen": -0.07081670314073563, "logits/rejected": 0.02289392426609993, "logps/chosen": -1.2226780652999878, "logps/rejected": -1.29796302318573, "loss": 1.2227, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2226780652999878, "rewards/margins": 0.07528501749038696, "rewards/rejected": -1.29796302318573, "step": 2055 }, { "epoch": 1.1025255059374477, "grad_norm": 5.805500134216995, "learning_rate": 7.973842675740539e-07, "logits/chosen": -0.05923137068748474, "logits/rejected": -0.02862623892724514, "logps/chosen": -1.2447999715805054, "logps/rejected": -1.3807965517044067, "loss": 1.2448, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2447999715805054, "rewards/margins": 0.135996475815773, "rewards/rejected": -1.3807965517044067, "step": 2060 }, { "epoch": 1.1052015387188494, "grad_norm": 4.317535844340677, "learning_rate": 7.961308356546066e-07, "logits/chosen": -0.09305359423160553, "logits/rejected": 0.0006329702446237206, "logps/chosen": -1.2297260761260986, "logps/rejected": -1.2981210947036743, "loss": 1.2297, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2297260761260986, "rewards/margins": 0.06839494407176971, "rewards/rejected": -1.2981210947036743, "step": 2065 }, { "epoch": 1.107877571500251, "grad_norm": 5.97554415999826, "learning_rate": 7.948745306724931e-07, "logits/chosen": -0.11474784463644028, "logits/rejected": 0.00619147066026926, "logps/chosen": -1.1792539358139038, "logps/rejected": -1.3856431245803833, "loss": 1.1793, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1792539358139038, "rewards/margins": 0.2063891440629959, "rewards/rejected": -1.3856431245803833, "step": 2070 }, { "epoch": 1.1105536042816524, "grad_norm": 5.81587686585966, "learning_rate": 7.936153648163897e-07, "logits/chosen": -0.11485800892114639, "logits/rejected": -0.029548903927206993, "logps/chosen": -1.2080246210098267, "logps/rejected": -1.384343147277832, "loss": 1.208, "rewards/accuracies": 0.625, "rewards/chosen": -1.2080246210098267, "rewards/margins": 0.17631857097148895, "rewards/rejected": -1.384343147277832, "step": 2075 }, { "epoch": 1.1132296370630541, "grad_norm": 5.433888175566809, "learning_rate": 7.92353350302729e-07, "logits/chosen": -0.1805831640958786, "logits/rejected": -0.05431520938873291, "logps/chosen": -1.146982192993164, "logps/rejected": -1.378523349761963, "loss": 1.147, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.146982192993164, "rewards/margins": 0.23154135048389435, "rewards/rejected": -1.378523349761963, "step": 2080 }, { "epoch": 1.1159056698444556, "grad_norm": 6.516538072816455, "learning_rate": 7.910884993755816e-07, "logits/chosen": -0.14322607219219208, "logits/rejected": -0.053766775876283646, "logps/chosen": -1.2037006616592407, "logps/rejected": -1.2987638711929321, "loss": 1.2037, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2037006616592407, "rewards/margins": 0.09506304562091827, "rewards/rejected": -1.2987638711929321, "step": 2085 }, { "epoch": 1.118581702625857, "grad_norm": 6.48855924879323, "learning_rate": 7.898208243065367e-07, "logits/chosen": -0.17625293135643005, "logits/rejected": -0.17342235147953033, "logps/chosen": -1.1755708456039429, "logps/rejected": -1.284446358680725, "loss": 1.1756, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1755708456039429, "rewards/margins": 0.10887549817562103, "rewards/rejected": -1.284446358680725, "step": 2090 }, { "epoch": 1.1212577354072588, "grad_norm": 4.106930582894316, "learning_rate": 7.88550337394583e-07, "logits/chosen": -0.12305369228124619, "logits/rejected": -0.017340976744890213, "logps/chosen": -1.339227318763733, "logps/rejected": -1.4131399393081665, "loss": 1.3392, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.339227318763733, "rewards/margins": 0.0739125981926918, "rewards/rejected": -1.4131399393081665, "step": 2095 }, { "epoch": 1.1239337681886603, "grad_norm": 7.119907896585677, "learning_rate": 7.872770509659905e-07, "logits/chosen": -0.03964807838201523, "logits/rejected": -0.017157461494207382, "logps/chosen": -1.3442144393920898, "logps/rejected": -1.3717283010482788, "loss": 1.3442, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3442144393920898, "rewards/margins": 0.02751380205154419, "rewards/rejected": -1.3717283010482788, "step": 2100 }, { "epoch": 1.1266098009700618, "grad_norm": 5.1467064844642, "learning_rate": 7.860009773741896e-07, "logits/chosen": -0.04471995308995247, "logits/rejected": 0.04466588422656059, "logps/chosen": -1.2672961950302124, "logps/rejected": -1.330928921699524, "loss": 1.2673, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2672961950302124, "rewards/margins": 0.06363272666931152, "rewards/rejected": -1.330928921699524, "step": 2105 }, { "epoch": 1.1292858337514635, "grad_norm": 6.579588082120925, "learning_rate": 7.84722128999652e-07, "logits/chosen": -0.1521325260400772, "logits/rejected": -0.02042418345808983, "logps/chosen": -1.203913927078247, "logps/rejected": -1.4811887741088867, "loss": 1.2039, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.203913927078247, "rewards/margins": 0.2772747874259949, "rewards/rejected": -1.4811887741088867, "step": 2110 }, { "epoch": 1.131961866532865, "grad_norm": 6.0435578761910635, "learning_rate": 7.834405182497699e-07, "logits/chosen": -0.03949969261884689, "logits/rejected": 0.002713510300964117, "logps/chosen": -1.2058850526809692, "logps/rejected": -1.304445505142212, "loss": 1.2059, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2058850526809692, "rewards/margins": 0.09856045246124268, "rewards/rejected": -1.304445505142212, "step": 2115 }, { "epoch": 1.1346378993142665, "grad_norm": 6.009233565534818, "learning_rate": 7.821561575587368e-07, "logits/chosen": -0.1449139267206192, "logits/rejected": -0.13073290884494781, "logps/chosen": -1.2563257217407227, "logps/rejected": -1.3507394790649414, "loss": 1.2563, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2563257217407227, "rewards/margins": 0.09441383928060532, "rewards/rejected": -1.3507394790649414, "step": 2120 }, { "epoch": 1.1373139320956682, "grad_norm": 5.593163223804738, "learning_rate": 7.808690593874254e-07, "logits/chosen": -0.1172168031334877, "logits/rejected": -0.07812513411045074, "logps/chosen": -1.1732755899429321, "logps/rejected": -1.3502775430679321, "loss": 1.1733, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1732755899429321, "rewards/margins": 0.177001953125, "rewards/rejected": -1.3502775430679321, "step": 2125 }, { "epoch": 1.1399899648770697, "grad_norm": 5.2619065652309445, "learning_rate": 7.79579236223268e-07, "logits/chosen": -0.06675167381763458, "logits/rejected": 0.11614437401294708, "logps/chosen": -1.2513561248779297, "logps/rejected": -1.3679568767547607, "loss": 1.2514, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2513561248779297, "rewards/margins": 0.11660071462392807, "rewards/rejected": -1.3679568767547607, "step": 2130 }, { "epoch": 1.1426659976584714, "grad_norm": 4.417002249067421, "learning_rate": 7.782867005801346e-07, "logits/chosen": -0.05869990587234497, "logits/rejected": 0.06342552602291107, "logps/chosen": -1.2603189945220947, "logps/rejected": -1.357358694076538, "loss": 1.2603, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2603189945220947, "rewards/margins": 0.09703956544399261, "rewards/rejected": -1.357358694076538, "step": 2135 }, { "epoch": 1.145342030439873, "grad_norm": 6.311884466435982, "learning_rate": 7.769914649982117e-07, "logits/chosen": -0.12988826632499695, "logits/rejected": -0.02782016061246395, "logps/chosen": -1.2299487590789795, "logps/rejected": -1.3220772743225098, "loss": 1.2299, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2299487590789795, "rewards/margins": 0.09212873131036758, "rewards/rejected": -1.3220772743225098, "step": 2140 }, { "epoch": 1.1480180632212744, "grad_norm": 5.253877016185165, "learning_rate": 7.756935420438803e-07, "logits/chosen": -0.07754499465227127, "logits/rejected": -0.020267512649297714, "logps/chosen": -1.11879563331604, "logps/rejected": -1.31880784034729, "loss": 1.1188, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.11879563331604, "rewards/margins": 0.20001228153705597, "rewards/rejected": -1.31880784034729, "step": 2145 }, { "epoch": 1.1506940960026761, "grad_norm": 4.545000424505452, "learning_rate": 7.743929443095951e-07, "logits/chosen": -0.14527609944343567, "logits/rejected": -0.11187932640314102, "logps/chosen": -1.2875800132751465, "logps/rejected": -1.3412866592407227, "loss": 1.2876, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2875800132751465, "rewards/margins": 0.05370677635073662, "rewards/rejected": -1.3412866592407227, "step": 2150 }, { "epoch": 1.1533701287840776, "grad_norm": 5.953657664174296, "learning_rate": 7.730896844137609e-07, "logits/chosen": -0.0766095221042633, "logits/rejected": -0.02697588875889778, "logps/chosen": -1.2630789279937744, "logps/rejected": -1.3832489252090454, "loss": 1.2631, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2630789279937744, "rewards/margins": 0.12016993761062622, "rewards/rejected": -1.3832489252090454, "step": 2155 }, { "epoch": 1.1560461615654791, "grad_norm": 6.773307260602965, "learning_rate": 7.717837750006106e-07, "logits/chosen": -0.15059784054756165, "logits/rejected": -0.06820385903120041, "logps/chosen": -1.1797010898590088, "logps/rejected": -1.3452390432357788, "loss": 1.1797, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1797010898590088, "rewards/margins": 0.1655380129814148, "rewards/rejected": -1.3452390432357788, "step": 2160 }, { "epoch": 1.1587221943468808, "grad_norm": 6.212017811099183, "learning_rate": 7.704752287400832e-07, "logits/chosen": -0.13080987334251404, "logits/rejected": 0.010145789943635464, "logps/chosen": -1.2385661602020264, "logps/rejected": -1.401116967201233, "loss": 1.2386, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2385661602020264, "rewards/margins": 0.16255083680152893, "rewards/rejected": -1.401116967201233, "step": 2165 }, { "epoch": 1.1613982271282823, "grad_norm": 4.886902154373173, "learning_rate": 7.691640583277004e-07, "logits/chosen": -0.09878955781459808, "logits/rejected": 0.03123495541512966, "logps/chosen": -1.198112964630127, "logps/rejected": -1.370409369468689, "loss": 1.1981, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.198112964630127, "rewards/margins": 0.17229653894901276, "rewards/rejected": -1.370409369468689, "step": 2170 }, { "epoch": 1.1640742599096838, "grad_norm": 5.397600136263168, "learning_rate": 7.678502764844433e-07, "logits/chosen": -0.13300319015979767, "logits/rejected": -0.005760944448411465, "logps/chosen": -1.2670361995697021, "logps/rejected": -1.3041824102401733, "loss": 1.267, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2670361995697021, "rewards/margins": 0.03714611381292343, "rewards/rejected": -1.3041824102401733, "step": 2175 }, { "epoch": 1.1667502926910855, "grad_norm": 6.277069744695297, "learning_rate": 7.665338959566288e-07, "logits/chosen": -0.13631285727024078, "logits/rejected": -0.09045252948999405, "logps/chosen": -1.2043020725250244, "logps/rejected": -1.33321213722229, "loss": 1.2043, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2043020725250244, "rewards/margins": 0.12890997529029846, "rewards/rejected": -1.33321213722229, "step": 2180 }, { "epoch": 1.169426325472487, "grad_norm": 6.685665167114626, "learning_rate": 7.652149295157868e-07, "logits/chosen": -0.047766998410224915, "logits/rejected": 0.055939752608537674, "logps/chosen": -1.2350966930389404, "logps/rejected": -1.310704231262207, "loss": 1.2351, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2350966930389404, "rewards/margins": 0.0756075531244278, "rewards/rejected": -1.310704231262207, "step": 2185 }, { "epoch": 1.1721023582538885, "grad_norm": 5.602930601882656, "learning_rate": 7.638933899585354e-07, "logits/chosen": 0.0063167898915708065, "logits/rejected": 0.038793645799160004, "logps/chosen": -1.213621973991394, "logps/rejected": -1.3035609722137451, "loss": 1.2136, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.213621973991394, "rewards/margins": 0.0899391621351242, "rewards/rejected": -1.3035609722137451, "step": 2190 }, { "epoch": 1.1747783910352902, "grad_norm": 7.314680250873359, "learning_rate": 7.625692901064573e-07, "logits/chosen": -0.04452704265713692, "logits/rejected": 0.0316966250538826, "logps/chosen": -1.1935454607009888, "logps/rejected": -1.4144408702850342, "loss": 1.1935, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1935454607009888, "rewards/margins": 0.2208956778049469, "rewards/rejected": -1.4144408702850342, "step": 2195 }, { "epoch": 1.1774544238166917, "grad_norm": 6.677675175436041, "learning_rate": 7.61242642805975e-07, "logits/chosen": -0.17070476710796356, "logits/rejected": -0.18248251080513, "logps/chosen": -1.2231392860412598, "logps/rejected": -1.33831787109375, "loss": 1.2231, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2231392860412598, "rewards/margins": 0.11517846584320068, "rewards/rejected": -1.33831787109375, "step": 2200 }, { "epoch": 1.1801304565980932, "grad_norm": 4.835225055605517, "learning_rate": 7.599134609282266e-07, "logits/chosen": -0.14613401889801025, "logits/rejected": -0.0030880779959261417, "logps/chosen": -1.1627432107925415, "logps/rejected": -1.3021693229675293, "loss": 1.1627, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1627432107925415, "rewards/margins": 0.13942614197731018, "rewards/rejected": -1.3021693229675293, "step": 2205 }, { "epoch": 1.182806489379495, "grad_norm": 5.5177891092729485, "learning_rate": 7.585817573689402e-07, "logits/chosen": -0.19509480893611908, "logits/rejected": -0.10757992416620255, "logps/chosen": -1.11354660987854, "logps/rejected": -1.308950424194336, "loss": 1.1135, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.11354660987854, "rewards/margins": 0.1954038441181183, "rewards/rejected": -1.308950424194336, "step": 2210 }, { "epoch": 1.1854825221608964, "grad_norm": 5.867656279143526, "learning_rate": 7.572475450483098e-07, "logits/chosen": -0.1660626381635666, "logits/rejected": -0.11172778904438019, "logps/chosen": -1.3324201107025146, "logps/rejected": -1.45529043674469, "loss": 1.3324, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3324201107025146, "rewards/margins": 0.12287025153636932, "rewards/rejected": -1.45529043674469, "step": 2215 }, { "epoch": 1.188158554942298, "grad_norm": 5.155028071330208, "learning_rate": 7.559108369108689e-07, "logits/chosen": -0.2109367847442627, "logits/rejected": -0.11555452644824982, "logps/chosen": -1.170729398727417, "logps/rejected": -1.2614595890045166, "loss": 1.1707, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.170729398727417, "rewards/margins": 0.0907302275300026, "rewards/rejected": -1.2614595890045166, "step": 2220 }, { "epoch": 1.1908345877236997, "grad_norm": 6.064379838233022, "learning_rate": 7.54571645925366e-07, "logits/chosen": -0.15969400107860565, "logits/rejected": -0.005269153043627739, "logps/chosen": -1.1860032081604004, "logps/rejected": -1.3653500080108643, "loss": 1.186, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1860032081604004, "rewards/margins": 0.17934691905975342, "rewards/rejected": -1.3653500080108643, "step": 2225 }, { "epoch": 1.1935106205051011, "grad_norm": 6.80854543834812, "learning_rate": 7.532299850846378e-07, "logits/chosen": -0.19866251945495605, "logits/rejected": -0.09718474745750427, "logps/chosen": -1.241593599319458, "logps/rejected": -1.4532580375671387, "loss": 1.2416, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.241593599319458, "rewards/margins": 0.21166452765464783, "rewards/rejected": -1.4532580375671387, "step": 2230 }, { "epoch": 1.1961866532865026, "grad_norm": 7.200349743910489, "learning_rate": 7.518858674054838e-07, "logits/chosen": -0.17651179432868958, "logits/rejected": -0.042947474867105484, "logps/chosen": -1.1893316507339478, "logps/rejected": -1.3908665180206299, "loss": 1.1893, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1893316507339478, "rewards/margins": 0.20153474807739258, "rewards/rejected": -1.3908665180206299, "step": 2235 }, { "epoch": 1.1988626860679044, "grad_norm": 7.0358003566517775, "learning_rate": 7.505393059285394e-07, "logits/chosen": -0.1604907512664795, "logits/rejected": -0.05404793471097946, "logps/chosen": -1.2074410915374756, "logps/rejected": -1.3631184101104736, "loss": 1.2074, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2074410915374756, "rewards/margins": 0.15567749738693237, "rewards/rejected": -1.3631184101104736, "step": 2240 }, { "epoch": 1.2015387188493059, "grad_norm": 6.687848348569351, "learning_rate": 7.491903137181501e-07, "logits/chosen": -0.13540717959403992, "logits/rejected": -0.10184917598962784, "logps/chosen": -1.1922575235366821, "logps/rejected": -1.3508437871932983, "loss": 1.1923, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1922575235366821, "rewards/margins": 0.1585862934589386, "rewards/rejected": -1.3508437871932983, "step": 2245 }, { "epoch": 1.2042147516307076, "grad_norm": 5.504790683277279, "learning_rate": 7.478389038622441e-07, "logits/chosen": -0.08567659556865692, "logits/rejected": -0.0779736265540123, "logps/chosen": -1.1476901769638062, "logps/rejected": -1.3733478784561157, "loss": 1.1477, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1476901769638062, "rewards/margins": 0.22565753757953644, "rewards/rejected": -1.3733478784561157, "step": 2250 }, { "epoch": 1.206890784412109, "grad_norm": 5.154268965886282, "learning_rate": 7.46485089472206e-07, "logits/chosen": -0.15398064255714417, "logits/rejected": -0.07263804227113724, "logps/chosen": -1.2842222452163696, "logps/rejected": -1.290809154510498, "loss": 1.2842, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2842222452163696, "rewards/margins": 0.00658683106303215, "rewards/rejected": -1.290809154510498, "step": 2255 }, { "epoch": 1.2095668171935106, "grad_norm": 6.170155308207173, "learning_rate": 7.451288836827487e-07, "logits/chosen": -0.10253047943115234, "logits/rejected": -0.12191557884216309, "logps/chosen": -1.2337965965270996, "logps/rejected": -1.3166124820709229, "loss": 1.2338, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2337965965270996, "rewards/margins": 0.0828157365322113, "rewards/rejected": -1.3166124820709229, "step": 2260 }, { "epoch": 1.2122428499749123, "grad_norm": 8.150749071922796, "learning_rate": 7.437702996517869e-07, "logits/chosen": -0.1896965205669403, "logits/rejected": -0.11598467826843262, "logps/chosen": -1.2652944326400757, "logps/rejected": -1.350515604019165, "loss": 1.2653, "rewards/accuracies": 0.5, "rewards/chosen": -1.2652944326400757, "rewards/margins": 0.08522116392850876, "rewards/rejected": -1.350515604019165, "step": 2265 }, { "epoch": 1.2149188827563138, "grad_norm": 5.981939375765169, "learning_rate": 7.424093505603087e-07, "logits/chosen": -0.2594521939754486, "logits/rejected": -0.14317116141319275, "logps/chosen": -1.204012393951416, "logps/rejected": -1.357337236404419, "loss": 1.204, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.204012393951416, "rewards/margins": 0.15332503616809845, "rewards/rejected": -1.357337236404419, "step": 2270 }, { "epoch": 1.2175949155377153, "grad_norm": 6.17219098942283, "learning_rate": 7.410460496122482e-07, "logits/chosen": -0.13455840945243835, "logits/rejected": -0.059302426874637604, "logps/chosen": -1.1862884759902954, "logps/rejected": -1.3785773515701294, "loss": 1.1863, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1862884759902954, "rewards/margins": 0.19228887557983398, "rewards/rejected": -1.3785773515701294, "step": 2275 }, { "epoch": 1.220270948319117, "grad_norm": 6.681919273328882, "learning_rate": 7.396804100343572e-07, "logits/chosen": -0.18540659546852112, "logits/rejected": -0.07016501575708389, "logps/chosen": -1.125486135482788, "logps/rejected": -1.2568479776382446, "loss": 1.1255, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.125486135482788, "rewards/margins": 0.13136185705661774, "rewards/rejected": -1.2568479776382446, "step": 2280 }, { "epoch": 1.2229469811005185, "grad_norm": 5.522400964901961, "learning_rate": 7.383124450760768e-07, "logits/chosen": -0.1333099752664566, "logits/rejected": 0.01716659404337406, "logps/chosen": -1.2598899602890015, "logps/rejected": -1.3448975086212158, "loss": 1.2599, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2598899602890015, "rewards/margins": 0.08500748872756958, "rewards/rejected": -1.3448975086212158, "step": 2285 }, { "epoch": 1.22562301388192, "grad_norm": 4.082840907668515, "learning_rate": 7.369421680094091e-07, "logits/chosen": -0.18829672038555145, "logits/rejected": -0.06630013138055801, "logps/chosen": -1.1207759380340576, "logps/rejected": -1.2290703058242798, "loss": 1.1208, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1207759380340576, "rewards/margins": 0.10829408466815948, "rewards/rejected": -1.2290703058242798, "step": 2290 }, { "epoch": 1.2282990466633217, "grad_norm": 5.85634307286142, "learning_rate": 7.355695921287881e-07, "logits/chosen": -0.16212227940559387, "logits/rejected": -0.09145961701869965, "logps/chosen": -1.2000895738601685, "logps/rejected": -1.2850706577301025, "loss": 1.2001, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2000895738601685, "rewards/margins": 0.08498111367225647, "rewards/rejected": -1.2850706577301025, "step": 2295 }, { "epoch": 1.2309750794447232, "grad_norm": 5.532776235753113, "learning_rate": 7.341947307509513e-07, "logits/chosen": -0.11148612201213837, "logits/rejected": -0.013058084063231945, "logps/chosen": -1.2060601711273193, "logps/rejected": -1.2797354459762573, "loss": 1.2061, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2060601711273193, "rewards/margins": 0.07367529720067978, "rewards/rejected": -1.2797354459762573, "step": 2300 }, { "epoch": 1.233651112226125, "grad_norm": 5.901065038670789, "learning_rate": 7.328175972148094e-07, "logits/chosen": -0.1732938289642334, "logits/rejected": -0.06728793680667877, "logps/chosen": -1.3319823741912842, "logps/rejected": -1.3856518268585205, "loss": 1.332, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3319823741912842, "rewards/margins": 0.053669340908527374, "rewards/rejected": -1.3856518268585205, "step": 2305 }, { "epoch": 1.2363271450075264, "grad_norm": 5.574294543985595, "learning_rate": 7.314382048813185e-07, "logits/chosen": -0.13616712391376495, "logits/rejected": 0.06450370699167252, "logps/chosen": -1.261338472366333, "logps/rejected": -1.387338399887085, "loss": 1.2613, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.261338472366333, "rewards/margins": 0.12599992752075195, "rewards/rejected": -1.387338399887085, "step": 2310 }, { "epoch": 1.2390031777889279, "grad_norm": 6.152074819157409, "learning_rate": 7.300565671333486e-07, "logits/chosen": -0.15258511900901794, "logits/rejected": -0.035422783344984055, "logps/chosen": -1.227022409439087, "logps/rejected": -1.3391637802124023, "loss": 1.227, "rewards/accuracies": 0.59375, "rewards/chosen": -1.227022409439087, "rewards/margins": 0.1121414303779602, "rewards/rejected": -1.3391637802124023, "step": 2315 }, { "epoch": 1.2416792105703296, "grad_norm": 6.088922394417611, "learning_rate": 7.286726973755554e-07, "logits/chosen": -0.060721635818481445, "logits/rejected": -0.05806620046496391, "logps/chosen": -1.249113917350769, "logps/rejected": -1.4058544635772705, "loss": 1.2491, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.249113917350769, "rewards/margins": 0.1567404717206955, "rewards/rejected": -1.4058544635772705, "step": 2320 }, { "epoch": 1.244355243351731, "grad_norm": 7.996173380893877, "learning_rate": 7.272866090342493e-07, "logits/chosen": -0.024810949340462685, "logits/rejected": 0.02293274737894535, "logps/chosen": -1.2609071731567383, "logps/rejected": -1.4536784887313843, "loss": 1.2609, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2609071731567383, "rewards/margins": 0.1927713006734848, "rewards/rejected": -1.4536784887313843, "step": 2325 }, { "epoch": 1.2470312761331326, "grad_norm": 5.615830292552207, "learning_rate": 7.258983155572656e-07, "logits/chosen": -0.19686219096183777, "logits/rejected": -0.12374582141637802, "logps/chosen": -1.2122571468353271, "logps/rejected": -1.3578161001205444, "loss": 1.2123, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2122571468353271, "rewards/margins": 0.14555911719799042, "rewards/rejected": -1.3578161001205444, "step": 2330 }, { "epoch": 1.2497073089145343, "grad_norm": 4.411183759499573, "learning_rate": 7.245078304138335e-07, "logits/chosen": -0.06172472983598709, "logits/rejected": -0.027445685118436813, "logps/chosen": -1.2004698514938354, "logps/rejected": -1.3933093547821045, "loss": 1.2005, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2004698514938354, "rewards/margins": 0.1928395926952362, "rewards/rejected": -1.3933093547821045, "step": 2335 }, { "epoch": 1.2523833416959358, "grad_norm": 4.751825710832782, "learning_rate": 7.231151670944462e-07, "logits/chosen": -0.22182390093803406, "logits/rejected": -0.08409421890974045, "logps/chosen": -1.233047366142273, "logps/rejected": -1.3394038677215576, "loss": 1.233, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.233047366142273, "rewards/margins": 0.10635660588741302, "rewards/rejected": -1.3394038677215576, "step": 2340 }, { "epoch": 1.2550593744773373, "grad_norm": 7.106133559280826, "learning_rate": 7.217203391107291e-07, "logits/chosen": -0.15036682784557343, "logits/rejected": -0.030733788385987282, "logps/chosen": -1.223501205444336, "logps/rejected": -1.3265577554702759, "loss": 1.2235, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.223501205444336, "rewards/margins": 0.10305650532245636, "rewards/rejected": -1.3265577554702759, "step": 2345 }, { "epoch": 1.257735407258739, "grad_norm": 5.850742240025502, "learning_rate": 7.203233599953096e-07, "logits/chosen": -0.1587187796831131, "logits/rejected": -0.0649583488702774, "logps/chosen": -1.2704429626464844, "logps/rejected": -1.331121802330017, "loss": 1.2704, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2704429626464844, "rewards/margins": 0.06067882105708122, "rewards/rejected": -1.331121802330017, "step": 2350 }, { "epoch": 1.2604114400401405, "grad_norm": 6.055138110739776, "learning_rate": 7.189242433016852e-07, "logits/chosen": -0.08767621219158173, "logits/rejected": 0.014603535644710064, "logps/chosen": -1.1468112468719482, "logps/rejected": -1.3127918243408203, "loss": 1.1468, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1468112468719482, "rewards/margins": 0.16598042845726013, "rewards/rejected": -1.3127918243408203, "step": 2355 }, { "epoch": 1.263087472821542, "grad_norm": 5.195589354990195, "learning_rate": 7.17523002604092e-07, "logits/chosen": -0.1112845167517662, "logits/rejected": -0.029254049062728882, "logps/chosen": -1.1927610635757446, "logps/rejected": -1.4774773120880127, "loss": 1.1928, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1927610635757446, "rewards/margins": 0.28471630811691284, "rewards/rejected": -1.4774773120880127, "step": 2360 }, { "epoch": 1.2657635056029437, "grad_norm": 4.362792539989624, "learning_rate": 7.161196514973734e-07, "logits/chosen": -0.07305806130170822, "logits/rejected": 0.012317663058638573, "logps/chosen": -1.2224600315093994, "logps/rejected": -1.3428205251693726, "loss": 1.2225, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2224600315093994, "rewards/margins": 0.12036050856113434, "rewards/rejected": -1.3428205251693726, "step": 2365 }, { "epoch": 1.2684395383843452, "grad_norm": 5.9859524231176495, "learning_rate": 7.147142035968483e-07, "logits/chosen": -0.08257532119750977, "logits/rejected": -0.005396233405917883, "logps/chosen": -1.1721513271331787, "logps/rejected": -1.327762246131897, "loss": 1.1722, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1721513271331787, "rewards/margins": 0.1556108444929123, "rewards/rejected": -1.327762246131897, "step": 2370 }, { "epoch": 1.2711155711657467, "grad_norm": 7.021089176486053, "learning_rate": 7.133066725381781e-07, "logits/chosen": -0.22731201350688934, "logits/rejected": -0.1104135736823082, "logps/chosen": -1.1380810737609863, "logps/rejected": -1.1913002729415894, "loss": 1.1381, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.1380810737609863, "rewards/margins": 0.0532192662358284, "rewards/rejected": -1.1913002729415894, "step": 2375 }, { "epoch": 1.2737916039471484, "grad_norm": 5.750123027336932, "learning_rate": 7.118970719772354e-07, "logits/chosen": -0.21579572558403015, "logits/rejected": -0.0440676286816597, "logps/chosen": -1.2532093524932861, "logps/rejected": -1.3621549606323242, "loss": 1.2532, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2532093524932861, "rewards/margins": 0.10894570499658585, "rewards/rejected": -1.3621549606323242, "step": 2380 }, { "epoch": 1.27646763672855, "grad_norm": 5.002179730367955, "learning_rate": 7.104854155899711e-07, "logits/chosen": -0.08988645672798157, "logits/rejected": -0.014451200142502785, "logps/chosen": -1.2238969802856445, "logps/rejected": -1.3177368640899658, "loss": 1.2239, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2238969802856445, "rewards/margins": 0.09383980184793472, "rewards/rejected": -1.3177368640899658, "step": 2385 }, { "epoch": 1.2791436695099514, "grad_norm": 5.957301668126643, "learning_rate": 7.090717170722817e-07, "logits/chosen": -0.09886602312326431, "logits/rejected": -0.055596303194761276, "logps/chosen": -1.201803207397461, "logps/rejected": -1.345169186592102, "loss": 1.2018, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.201803207397461, "rewards/margins": 0.14336594939231873, "rewards/rejected": -1.345169186592102, "step": 2390 }, { "epoch": 1.2818197022913531, "grad_norm": 4.669211766520229, "learning_rate": 7.076559901398762e-07, "logits/chosen": -0.2581543028354645, "logits/rejected": -0.17242996394634247, "logps/chosen": -1.131554126739502, "logps/rejected": -1.3023405075073242, "loss": 1.1316, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.131554126739502, "rewards/margins": 0.17078641057014465, "rewards/rejected": -1.3023405075073242, "step": 2395 }, { "epoch": 1.2844957350727546, "grad_norm": 7.271791971974523, "learning_rate": 7.062382485281436e-07, "logits/chosen": -0.1353069543838501, "logits/rejected": -0.05633528158068657, "logps/chosen": -1.1888632774353027, "logps/rejected": -1.3364334106445312, "loss": 1.1889, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1888632774353027, "rewards/margins": 0.1475701630115509, "rewards/rejected": -1.3364334106445312, "step": 2400 }, { "epoch": 1.2844957350727546, "eval_logits/chosen": 0.09900359809398651, "eval_logits/rejected": 0.16488054394721985, "eval_logps/chosen": -1.2686352729797363, "eval_logps/rejected": -1.3925800323486328, "eval_loss": 1.2689100503921509, "eval_rewards/accuracies": 0.5474777221679688, "eval_rewards/chosen": -1.2686352729797363, "eval_rewards/margins": 0.12394492328166962, "eval_rewards/rejected": -1.3925800323486328, "eval_runtime": 40.244, "eval_samples_per_second": 33.421, "eval_steps_per_second": 8.374, "step": 2400 }, { "epoch": 1.287171767854156, "grad_norm": 5.768718353469991, "learning_rate": 7.048185059920193e-07, "logits/chosen": -0.14966537058353424, "logits/rejected": -0.031821150332689285, "logps/chosen": -1.22076416015625, "logps/rejected": -1.3633697032928467, "loss": 1.2208, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.22076416015625, "rewards/margins": 0.1426054835319519, "rewards/rejected": -1.3633697032928467, "step": 2405 }, { "epoch": 1.2898478006355578, "grad_norm": 5.021279342888253, "learning_rate": 7.033967763058516e-07, "logits/chosen": -0.2610272765159607, "logits/rejected": -0.10482768714427948, "logps/chosen": -1.1928308010101318, "logps/rejected": -1.2575037479400635, "loss": 1.1928, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1928308010101318, "rewards/margins": 0.06467296183109283, "rewards/rejected": -1.2575037479400635, "step": 2410 }, { "epoch": 1.2925238334169593, "grad_norm": 4.8741512053852665, "learning_rate": 7.019730732632681e-07, "logits/chosen": -0.09253223985433578, "logits/rejected": -0.04813455045223236, "logps/chosen": -1.1326509714126587, "logps/rejected": -1.3582873344421387, "loss": 1.1327, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1326509714126587, "rewards/margins": 0.22563643753528595, "rewards/rejected": -1.3582873344421387, "step": 2415 }, { "epoch": 1.2951998661983608, "grad_norm": 5.747895830765516, "learning_rate": 7.005474106770418e-07, "logits/chosen": -0.21806630492210388, "logits/rejected": -0.11929328739643097, "logps/chosen": -1.2453442811965942, "logps/rejected": -1.4542537927627563, "loss": 1.2453, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2453442811965942, "rewards/margins": 0.2089095115661621, "rewards/rejected": -1.4542537927627563, "step": 2420 }, { "epoch": 1.2978758989797625, "grad_norm": 4.952159812105557, "learning_rate": 6.991198023789577e-07, "logits/chosen": -0.10807511955499649, "logits/rejected": -0.04972574859857559, "logps/chosen": -1.1823889017105103, "logps/rejected": -1.3400100469589233, "loss": 1.1824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1823889017105103, "rewards/margins": 0.15762123465538025, "rewards/rejected": -1.3400100469589233, "step": 2425 }, { "epoch": 1.300551931761164, "grad_norm": 6.9218997973530065, "learning_rate": 6.976902622196776e-07, "logits/chosen": -0.09087160974740982, "logits/rejected": -0.072505421936512, "logps/chosen": -1.3188713788986206, "logps/rejected": -1.4364707469940186, "loss": 1.3189, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3188713788986206, "rewards/margins": 0.11759926378726959, "rewards/rejected": -1.4364707469940186, "step": 2430 }, { "epoch": 1.3032279645425655, "grad_norm": 4.112819016280579, "learning_rate": 6.962588040686064e-07, "logits/chosen": -0.08239294588565826, "logits/rejected": 0.008072967641055584, "logps/chosen": -1.1892492771148682, "logps/rejected": -1.3099805116653442, "loss": 1.1892, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1892492771148682, "rewards/margins": 0.12073124945163727, "rewards/rejected": -1.3099805116653442, "step": 2435 }, { "epoch": 1.3059039973239672, "grad_norm": 7.076004921670809, "learning_rate": 6.948254418137573e-07, "logits/chosen": -0.20756976306438446, "logits/rejected": -0.12199071794748306, "logps/chosen": -1.1711616516113281, "logps/rejected": -1.3046002388000488, "loss": 1.1712, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1711616516113281, "rewards/margins": 0.1334385871887207, "rewards/rejected": -1.3046002388000488, "step": 2440 }, { "epoch": 1.3085800301053687, "grad_norm": 7.2312700661625025, "learning_rate": 6.933901893616174e-07, "logits/chosen": -0.14465650916099548, "logits/rejected": -0.030448663979768753, "logps/chosen": -1.2133185863494873, "logps/rejected": -1.3145294189453125, "loss": 1.2133, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2133185863494873, "rewards/margins": 0.10121090710163116, "rewards/rejected": -1.3145294189453125, "step": 2445 }, { "epoch": 1.3112560628867704, "grad_norm": 5.027835696718314, "learning_rate": 6.919530606370121e-07, "logits/chosen": -0.15373918414115906, "logits/rejected": -0.03414085507392883, "logps/chosen": -1.1754047870635986, "logps/rejected": -1.3791592121124268, "loss": 1.1754, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1754047870635986, "rewards/margins": 0.2037545144557953, "rewards/rejected": -1.3791592121124268, "step": 2450 }, { "epoch": 1.313932095668172, "grad_norm": 4.338294766850221, "learning_rate": 6.905140695829706e-07, "logits/chosen": -0.1716085970401764, "logits/rejected": -0.011342559941112995, "logps/chosen": -1.2674248218536377, "logps/rejected": -1.3144280910491943, "loss": 1.2674, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2674248218536377, "rewards/margins": 0.04700345918536186, "rewards/rejected": -1.3144280910491943, "step": 2455 }, { "epoch": 1.3166081284495736, "grad_norm": 7.885136719613696, "learning_rate": 6.890732301605904e-07, "logits/chosen": -0.1251334398984909, "logits/rejected": -0.05631180852651596, "logps/chosen": -1.248449683189392, "logps/rejected": -1.2985186576843262, "loss": 1.2484, "rewards/accuracies": 0.53125, "rewards/chosen": -1.248449683189392, "rewards/margins": 0.050069063901901245, "rewards/rejected": -1.2985186576843262, "step": 2460 }, { "epoch": 1.3192841612309751, "grad_norm": 5.314936058476169, "learning_rate": 6.876305563489021e-07, "logits/chosen": -0.12953408062458038, "logits/rejected": -0.08908665180206299, "logps/chosen": -1.2211748361587524, "logps/rejected": -1.3553272485733032, "loss": 1.2212, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2211748361587524, "rewards/margins": 0.1341523975133896, "rewards/rejected": -1.3553272485733032, "step": 2465 }, { "epoch": 1.3219601940123766, "grad_norm": 6.874519465512668, "learning_rate": 6.861860621447331e-07, "logits/chosen": -0.2489032745361328, "logits/rejected": -0.15364430844783783, "logps/chosen": -1.2015185356140137, "logps/rejected": -1.234568476676941, "loss": 1.2015, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2015185356140137, "rewards/margins": 0.03304990753531456, "rewards/rejected": -1.234568476676941, "step": 2470 }, { "epoch": 1.3246362267937783, "grad_norm": 4.62852965852627, "learning_rate": 6.847397615625725e-07, "logits/chosen": -0.17560788989067078, "logits/rejected": -0.12947866320610046, "logps/chosen": -1.2207379341125488, "logps/rejected": -1.3903605937957764, "loss": 1.2207, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2207379341125488, "rewards/margins": 0.1696225106716156, "rewards/rejected": -1.3903605937957764, "step": 2475 }, { "epoch": 1.3273122595751798, "grad_norm": 4.729437405914685, "learning_rate": 6.83291668634435e-07, "logits/chosen": -0.2645382285118103, "logits/rejected": -0.14026287198066711, "logps/chosen": -1.2333698272705078, "logps/rejected": -1.3895245790481567, "loss": 1.2334, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2333698272705078, "rewards/margins": 0.15615467727184296, "rewards/rejected": -1.3895245790481567, "step": 2480 }, { "epoch": 1.3299882923565813, "grad_norm": 6.737086167382301, "learning_rate": 6.818417974097246e-07, "logits/chosen": -0.13063566386699677, "logits/rejected": 0.016875583678483963, "logps/chosen": -1.25649094581604, "logps/rejected": -1.3749549388885498, "loss": 1.2565, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.25649094581604, "rewards/margins": 0.11846399307250977, "rewards/rejected": -1.3749549388885498, "step": 2485 }, { "epoch": 1.332664325137983, "grad_norm": 5.420121353286132, "learning_rate": 6.803901619550981e-07, "logits/chosen": -0.22883205115795135, "logits/rejected": -0.19659344851970673, "logps/chosen": -1.2385671138763428, "logps/rejected": -1.4412801265716553, "loss": 1.2386, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2385671138763428, "rewards/margins": 0.20271281898021698, "rewards/rejected": -1.4412801265716553, "step": 2490 }, { "epoch": 1.3353403579193845, "grad_norm": 5.212107749921817, "learning_rate": 6.789367763543292e-07, "logits/chosen": -0.1087251752614975, "logits/rejected": -0.11686065047979355, "logps/chosen": -1.2219470739364624, "logps/rejected": -1.346097707748413, "loss": 1.2219, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2219470739364624, "rewards/margins": 0.12415061146020889, "rewards/rejected": -1.346097707748413, "step": 2495 }, { "epoch": 1.338016390700786, "grad_norm": 6.412880693083864, "learning_rate": 6.774816547081714e-07, "logits/chosen": -0.11619206517934799, "logits/rejected": 0.0009071379899978638, "logps/chosen": -1.16709303855896, "logps/rejected": -1.3086304664611816, "loss": 1.1671, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.16709303855896, "rewards/margins": 0.14153733849525452, "rewards/rejected": -1.3086304664611816, "step": 2500 }, { "epoch": 1.3406924234821878, "grad_norm": 6.255183870577791, "learning_rate": 6.760248111342211e-07, "logits/chosen": -0.143222376704216, "logits/rejected": -0.01875067502260208, "logps/chosen": -1.1716115474700928, "logps/rejected": -1.2913930416107178, "loss": 1.1716, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1716115474700928, "rewards/margins": 0.119781494140625, "rewards/rejected": -1.2913930416107178, "step": 2505 }, { "epoch": 1.3433684562635893, "grad_norm": 6.341193458064185, "learning_rate": 6.745662597667813e-07, "logits/chosen": -0.2072634994983673, "logits/rejected": -0.1190493255853653, "logps/chosen": -1.1748361587524414, "logps/rejected": -1.3178291320800781, "loss": 1.1748, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1748361587524414, "rewards/margins": 0.14299294352531433, "rewards/rejected": -1.3178291320800781, "step": 2510 }, { "epoch": 1.3460444890449907, "grad_norm": 5.301375241242932, "learning_rate": 6.731060147567236e-07, "logits/chosen": -0.12110098451375961, "logits/rejected": -0.04728817939758301, "logps/chosen": -1.2493525743484497, "logps/rejected": -1.2969032526016235, "loss": 1.2494, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2493525743484497, "rewards/margins": 0.047550659626722336, "rewards/rejected": -1.2969032526016235, "step": 2515 }, { "epoch": 1.3487205218263925, "grad_norm": 5.074444002926372, "learning_rate": 6.716440902713515e-07, "logits/chosen": -0.20418138802051544, "logits/rejected": -0.14839966595172882, "logps/chosen": -1.2208960056304932, "logps/rejected": -1.3313381671905518, "loss": 1.2209, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2208960056304932, "rewards/margins": 0.11044222116470337, "rewards/rejected": -1.3313381671905518, "step": 2520 }, { "epoch": 1.351396554607794, "grad_norm": 7.144711356560336, "learning_rate": 6.701805004942627e-07, "logits/chosen": -0.1673976480960846, "logits/rejected": -0.12105701863765717, "logps/chosen": -1.2498142719268799, "logps/rejected": -1.331472635269165, "loss": 1.2498, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2498142719268799, "rewards/margins": 0.08165836334228516, "rewards/rejected": -1.331472635269165, "step": 2525 }, { "epoch": 1.3540725873891954, "grad_norm": 5.306441551758573, "learning_rate": 6.687152596252119e-07, "logits/chosen": -0.22089031338691711, "logits/rejected": -0.19486112892627716, "logps/chosen": -1.1926316022872925, "logps/rejected": -1.3266105651855469, "loss": 1.1926, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1926316022872925, "rewards/margins": 0.13397888839244843, "rewards/rejected": -1.3266105651855469, "step": 2530 }, { "epoch": 1.3567486201705972, "grad_norm": 4.758204404235803, "learning_rate": 6.672483818799722e-07, "logits/chosen": -0.23566047847270966, "logits/rejected": -0.14112043380737305, "logps/chosen": -1.2173415422439575, "logps/rejected": -1.3650144338607788, "loss": 1.2173, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2173415422439575, "rewards/margins": 0.1476728767156601, "rewards/rejected": -1.3650144338607788, "step": 2535 }, { "epoch": 1.3594246529519987, "grad_norm": 6.127044984705447, "learning_rate": 6.657798814901978e-07, "logits/chosen": -0.1602519154548645, "logits/rejected": -0.049241237342357635, "logps/chosen": -1.3008054494857788, "logps/rejected": -1.3552905321121216, "loss": 1.3008, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3008054494857788, "rewards/margins": 0.05448496341705322, "rewards/rejected": -1.3552905321121216, "step": 2540 }, { "epoch": 1.3621006857334002, "grad_norm": 5.163722910873142, "learning_rate": 6.643097727032863e-07, "logits/chosen": -0.14265184104442596, "logits/rejected": -0.037248071283102036, "logps/chosen": -1.2169709205627441, "logps/rejected": -1.3656368255615234, "loss": 1.217, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2169709205627441, "rewards/margins": 0.14866593480110168, "rewards/rejected": -1.3656368255615234, "step": 2545 }, { "epoch": 1.3647767185148019, "grad_norm": 5.239041672068492, "learning_rate": 6.628380697822392e-07, "logits/chosen": -0.17224593460559845, "logits/rejected": -0.06451030820608139, "logps/chosen": -1.196614384651184, "logps/rejected": -1.2473446130752563, "loss": 1.1966, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.196614384651184, "rewards/margins": 0.050730086863040924, "rewards/rejected": -1.2473446130752563, "step": 2550 }, { "epoch": 1.3674527512962034, "grad_norm": 5.63409522192951, "learning_rate": 6.61364787005525e-07, "logits/chosen": -0.13030946254730225, "logits/rejected": -0.08356630057096481, "logps/chosen": -1.1452291011810303, "logps/rejected": -1.3875467777252197, "loss": 1.1452, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1452291011810303, "rewards/margins": 0.24231751263141632, "rewards/rejected": -1.3875467777252197, "step": 2555 }, { "epoch": 1.3701287840776049, "grad_norm": 6.2225012887632625, "learning_rate": 6.598899386669395e-07, "logits/chosen": -0.12318539619445801, "logits/rejected": -0.03525859862565994, "logps/chosen": -1.1920503377914429, "logps/rejected": -1.3309270143508911, "loss": 1.1921, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1920503377914429, "rewards/margins": 0.13887645304203033, "rewards/rejected": -1.3309270143508911, "step": 2560 }, { "epoch": 1.3728048168590066, "grad_norm": 5.599299386255071, "learning_rate": 6.584135390754679e-07, "logits/chosen": -0.13023993372917175, "logits/rejected": -0.04579556733369827, "logps/chosen": -1.1785519123077393, "logps/rejected": -1.3277703523635864, "loss": 1.1786, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1785519123077393, "rewards/margins": 0.14921851456165314, "rewards/rejected": -1.3277703523635864, "step": 2565 }, { "epoch": 1.375480849640408, "grad_norm": 4.693549760449425, "learning_rate": 6.569356025551454e-07, "logits/chosen": -0.0744607225060463, "logits/rejected": -0.03623982518911362, "logps/chosen": -1.1738218069076538, "logps/rejected": -1.3510470390319824, "loss": 1.1738, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1738218069076538, "rewards/margins": 0.17722511291503906, "rewards/rejected": -1.3510470390319824, "step": 2570 }, { "epoch": 1.3781568824218096, "grad_norm": 5.605405013919606, "learning_rate": 6.554561434449186e-07, "logits/chosen": -0.23079650104045868, "logits/rejected": -0.12691353261470795, "logps/chosen": -1.1724379062652588, "logps/rejected": -1.356667399406433, "loss": 1.1724, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1724379062652588, "rewards/margins": 0.18422944843769073, "rewards/rejected": -1.356667399406433, "step": 2575 }, { "epoch": 1.3808329152032113, "grad_norm": 5.81629038717184, "learning_rate": 6.539751760985063e-07, "logits/chosen": -0.15413764119148254, "logits/rejected": -0.09653065353631973, "logps/chosen": -1.2821613550186157, "logps/rejected": -1.3803894519805908, "loss": 1.2822, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2821613550186157, "rewards/margins": 0.09822817146778107, "rewards/rejected": -1.3803894519805908, "step": 2580 }, { "epoch": 1.3835089479846128, "grad_norm": 5.519241014352455, "learning_rate": 6.524927148842602e-07, "logits/chosen": -0.0750851184129715, "logits/rejected": 0.04987717047333717, "logps/chosen": -1.1104981899261475, "logps/rejected": -1.328404426574707, "loss": 1.1105, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1104981899261475, "rewards/margins": 0.21790611743927002, "rewards/rejected": -1.328404426574707, "step": 2585 }, { "epoch": 1.3861849807660143, "grad_norm": 6.842345325980757, "learning_rate": 6.510087741850254e-07, "logits/chosen": -0.15379485487937927, "logits/rejected": -0.04311462119221687, "logps/chosen": -1.1645325422286987, "logps/rejected": -1.3284845352172852, "loss": 1.1645, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1645325422286987, "rewards/margins": 0.1639520227909088, "rewards/rejected": -1.3284845352172852, "step": 2590 }, { "epoch": 1.388861013547416, "grad_norm": 3.985966907783397, "learning_rate": 6.495233683980012e-07, "logits/chosen": -0.10692670196294785, "logits/rejected": -0.08942770957946777, "logps/chosen": -1.1832859516143799, "logps/rejected": -1.3430266380310059, "loss": 1.1833, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1832859516143799, "rewards/margins": 0.15974052250385284, "rewards/rejected": -1.3430266380310059, "step": 2595 }, { "epoch": 1.3915370463288175, "grad_norm": 6.860393947306785, "learning_rate": 6.480365119346011e-07, "logits/chosen": -0.061653971672058105, "logits/rejected": 0.029905397444963455, "logps/chosen": -1.211073637008667, "logps/rejected": -1.3106181621551514, "loss": 1.2111, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.211073637008667, "rewards/margins": 0.09954468160867691, "rewards/rejected": -1.3106181621551514, "step": 2600 }, { "epoch": 1.394213079110219, "grad_norm": 6.810679112287132, "learning_rate": 6.465482192203129e-07, "logits/chosen": -0.04961532726883888, "logits/rejected": -0.03229839727282524, "logps/chosen": -1.2062442302703857, "logps/rejected": -1.3166301250457764, "loss": 1.2062, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2062442302703857, "rewards/margins": 0.11038605868816376, "rewards/rejected": -1.3166301250457764, "step": 2605 }, { "epoch": 1.3968891118916207, "grad_norm": 5.922823488119317, "learning_rate": 6.45058504694559e-07, "logits/chosen": -0.0290206428617239, "logits/rejected": 0.031753428280353546, "logps/chosen": -1.2494542598724365, "logps/rejected": -1.3166723251342773, "loss": 1.2495, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2494542598724365, "rewards/margins": 0.06721795350313187, "rewards/rejected": -1.3166723251342773, "step": 2610 }, { "epoch": 1.3995651446730222, "grad_norm": 7.799407218595654, "learning_rate": 6.435673828105564e-07, "logits/chosen": -0.13528159260749817, "logits/rejected": -0.0290235485881567, "logps/chosen": -1.1631211042404175, "logps/rejected": -1.3808135986328125, "loss": 1.1631, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1631211042404175, "rewards/margins": 0.21769246459007263, "rewards/rejected": -1.3808135986328125, "step": 2615 }, { "epoch": 1.402241177454424, "grad_norm": 6.103282495597116, "learning_rate": 6.420748680351763e-07, "logits/chosen": -0.1414068043231964, "logits/rejected": -0.15803930163383484, "logps/chosen": -1.2937171459197998, "logps/rejected": -1.3192265033721924, "loss": 1.2937, "rewards/accuracies": 0.4375, "rewards/chosen": -1.2937171459197998, "rewards/margins": 0.025509502738714218, "rewards/rejected": -1.3192265033721924, "step": 2620 }, { "epoch": 1.4049172102358254, "grad_norm": 6.960438785663138, "learning_rate": 6.405809748488032e-07, "logits/chosen": -0.08956550061702728, "logits/rejected": 0.022877173498272896, "logps/chosen": -1.211693286895752, "logps/rejected": -1.3430335521697998, "loss": 1.2117, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.211693286895752, "rewards/margins": 0.13134035468101501, "rewards/rejected": -1.3430335521697998, "step": 2625 }, { "epoch": 1.4075932430172269, "grad_norm": 6.433849968541622, "learning_rate": 6.390857177451956e-07, "logits/chosen": -0.2391899824142456, "logits/rejected": -0.07536061108112335, "logps/chosen": -1.2790664434432983, "logps/rejected": -1.2976868152618408, "loss": 1.2791, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.2790664434432983, "rewards/margins": 0.018620328977704048, "rewards/rejected": -1.2976868152618408, "step": 2630 }, { "epoch": 1.4102692757986286, "grad_norm": 6.9759814922611705, "learning_rate": 6.375891112313445e-07, "logits/chosen": -0.1329323947429657, "logits/rejected": -0.08133529126644135, "logps/chosen": -1.1996643543243408, "logps/rejected": -1.3174219131469727, "loss": 1.1997, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1996643543243408, "rewards/margins": 0.11775755882263184, "rewards/rejected": -1.3174219131469727, "step": 2635 }, { "epoch": 1.41294530858003, "grad_norm": 4.596941733573693, "learning_rate": 6.360911698273326e-07, "logits/chosen": -0.08236449211835861, "logits/rejected": -0.042063284665346146, "logps/chosen": -1.254122257232666, "logps/rejected": -1.378869891166687, "loss": 1.2541, "rewards/accuracies": 0.59375, "rewards/chosen": -1.254122257232666, "rewards/margins": 0.1247476115822792, "rewards/rejected": -1.378869891166687, "step": 2640 }, { "epoch": 1.4156213413614318, "grad_norm": 5.523317435657738, "learning_rate": 6.345919080661944e-07, "logits/chosen": -0.13272365927696228, "logits/rejected": -0.0780007541179657, "logps/chosen": -1.2086185216903687, "logps/rejected": -1.4099886417388916, "loss": 1.2086, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2086185216903687, "rewards/margins": 0.20137019455432892, "rewards/rejected": -1.4099886417388916, "step": 2645 }, { "epoch": 1.4182973741428333, "grad_norm": 4.77875708506341, "learning_rate": 6.330913404937737e-07, "logits/chosen": -0.20409151911735535, "logits/rejected": -0.09298364073038101, "logps/chosen": -1.2024623155593872, "logps/rejected": -1.4788533449172974, "loss": 1.2025, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2024623155593872, "rewards/margins": 0.27639108896255493, "rewards/rejected": -1.4788533449172974, "step": 2650 }, { "epoch": 1.4209734069242348, "grad_norm": 6.489205374567575, "learning_rate": 6.315894816685838e-07, "logits/chosen": -0.11942043155431747, "logits/rejected": 0.00019886568770743906, "logps/chosen": -1.1004186868667603, "logps/rejected": -1.2476332187652588, "loss": 1.1004, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1004186868667603, "rewards/margins": 0.14721444249153137, "rewards/rejected": -1.2476332187652588, "step": 2655 }, { "epoch": 1.4236494397056365, "grad_norm": 6.218772554732238, "learning_rate": 6.300863461616657e-07, "logits/chosen": -0.049031928181648254, "logits/rejected": -0.03157534450292587, "logps/chosen": -1.0783230066299438, "logps/rejected": -1.26331627368927, "loss": 1.0783, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0783230066299438, "rewards/margins": 0.18499338626861572, "rewards/rejected": -1.26331627368927, "step": 2660 }, { "epoch": 1.426325472487038, "grad_norm": 5.544635549329785, "learning_rate": 6.285819485564465e-07, "logits/chosen": -0.16598865389823914, "logits/rejected": -0.08447521179914474, "logps/chosen": -1.2363511323928833, "logps/rejected": -1.3694822788238525, "loss": 1.2364, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2363511323928833, "rewards/margins": 0.13313107192516327, "rewards/rejected": -1.3694822788238525, "step": 2665 }, { "epoch": 1.4290015052684395, "grad_norm": 6.643078903790699, "learning_rate": 6.270763034485986e-07, "logits/chosen": -0.10532794892787933, "logits/rejected": -0.05180637165904045, "logps/chosen": -1.3375548124313354, "logps/rejected": -1.40316641330719, "loss": 1.3376, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3375548124313354, "rewards/margins": 0.06561161577701569, "rewards/rejected": -1.40316641330719, "step": 2670 }, { "epoch": 1.4316775380498412, "grad_norm": 6.641625562528009, "learning_rate": 6.255694254458972e-07, "logits/chosen": -0.14849057793617249, "logits/rejected": -0.033550143241882324, "logps/chosen": -1.2429174184799194, "logps/rejected": -1.3311035633087158, "loss": 1.2429, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2429174184799194, "rewards/margins": 0.08818618953227997, "rewards/rejected": -1.3311035633087158, "step": 2675 }, { "epoch": 1.4343535708312427, "grad_norm": 7.02889503859994, "learning_rate": 6.240613291680795e-07, "logits/chosen": -0.17734350264072418, "logits/rejected": -0.0497741773724556, "logps/chosen": -1.2284928560256958, "logps/rejected": -1.3068510293960571, "loss": 1.2285, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2284928560256958, "rewards/margins": 0.07835830003023148, "rewards/rejected": -1.3068510293960571, "step": 2680 }, { "epoch": 1.4370296036126442, "grad_norm": 6.458123189753758, "learning_rate": 6.225520292467021e-07, "logits/chosen": -0.17282505333423615, "logits/rejected": -0.019898682832717896, "logps/chosen": -1.2373493909835815, "logps/rejected": -1.3010917901992798, "loss": 1.2373, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2373493909835815, "rewards/margins": 0.06374243646860123, "rewards/rejected": -1.3010917901992798, "step": 2685 }, { "epoch": 1.439705636394046, "grad_norm": 8.668192916798636, "learning_rate": 6.210415403249993e-07, "logits/chosen": -0.24959559738636017, "logits/rejected": -0.06614496558904648, "logps/chosen": -1.2140246629714966, "logps/rejected": -1.4108800888061523, "loss": 1.214, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2140246629714966, "rewards/margins": 0.1968553364276886, "rewards/rejected": -1.4108800888061523, "step": 2690 }, { "epoch": 1.4423816691754474, "grad_norm": 5.259935673273579, "learning_rate": 6.195298770577415e-07, "logits/chosen": -0.1370142102241516, "logits/rejected": -0.11998993158340454, "logps/chosen": -1.2053474187850952, "logps/rejected": -1.3633159399032593, "loss": 1.2053, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2053474187850952, "rewards/margins": 0.15796838700771332, "rewards/rejected": -1.3633159399032593, "step": 2695 }, { "epoch": 1.445057701956849, "grad_norm": 5.8472187269338525, "learning_rate": 6.180170541110923e-07, "logits/chosen": -0.18070097267627716, "logits/rejected": -0.03505333140492439, "logps/chosen": -1.2485049962997437, "logps/rejected": -1.3782227039337158, "loss": 1.2485, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2485049962997437, "rewards/margins": 0.12971781194210052, "rewards/rejected": -1.3782227039337158, "step": 2700 }, { "epoch": 1.4477337347382506, "grad_norm": 4.848045843439798, "learning_rate": 6.165030861624663e-07, "logits/chosen": -0.19033488631248474, "logits/rejected": -0.034912239760160446, "logps/chosen": -1.1386581659317017, "logps/rejected": -1.3487049341201782, "loss": 1.1387, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1386581659317017, "rewards/margins": 0.21004676818847656, "rewards/rejected": -1.3487049341201782, "step": 2705 }, { "epoch": 1.4504097675196521, "grad_norm": 6.891489733581904, "learning_rate": 6.149879879003876e-07, "logits/chosen": -0.08175579458475113, "logits/rejected": -0.06333563476800919, "logps/chosen": -1.2216109037399292, "logps/rejected": -1.3812209367752075, "loss": 1.2216, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2216109037399292, "rewards/margins": 0.1596100777387619, "rewards/rejected": -1.3812209367752075, "step": 2710 }, { "epoch": 1.4530858003010536, "grad_norm": 5.527053995757542, "learning_rate": 6.13471774024346e-07, "logits/chosen": -0.21438345313072205, "logits/rejected": -0.14410871267318726, "logps/chosen": -1.1533279418945312, "logps/rejected": -1.2988663911819458, "loss": 1.1533, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1533279418945312, "rewards/margins": 0.14553824067115784, "rewards/rejected": -1.2988663911819458, "step": 2715 }, { "epoch": 1.4557618330824553, "grad_norm": 5.966597745384227, "learning_rate": 6.119544592446551e-07, "logits/chosen": -0.20107242465019226, "logits/rejected": -0.11686976253986359, "logps/chosen": -1.1863126754760742, "logps/rejected": -1.2549879550933838, "loss": 1.1863, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1863126754760742, "rewards/margins": 0.06867529451847076, "rewards/rejected": -1.2549879550933838, "step": 2720 }, { "epoch": 1.4584378658638568, "grad_norm": 7.216188001455052, "learning_rate": 6.104360582823096e-07, "logits/chosen": -0.15372993052005768, "logits/rejected": -0.07025248557329178, "logps/chosen": -1.1943224668502808, "logps/rejected": -1.352706789970398, "loss": 1.1943, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1943224668502808, "rewards/margins": 0.15838412940502167, "rewards/rejected": -1.352706789970398, "step": 2725 }, { "epoch": 1.4611138986452583, "grad_norm": 5.9785143597397346, "learning_rate": 6.089165858688423e-07, "logits/chosen": -0.17867709696292877, "logits/rejected": -0.05721612647175789, "logps/chosen": -1.2034406661987305, "logps/rejected": -1.3655818700790405, "loss": 1.2034, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2034406661987305, "rewards/margins": 0.16214114427566528, "rewards/rejected": -1.3655818700790405, "step": 2730 }, { "epoch": 1.46378993142666, "grad_norm": 5.419404086588341, "learning_rate": 6.073960567461811e-07, "logits/chosen": -0.14463157951831818, "logits/rejected": 0.01187673769891262, "logps/chosen": -1.1176784038543701, "logps/rejected": -1.391451120376587, "loss": 1.1177, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1176784038543701, "rewards/margins": 0.27377280592918396, "rewards/rejected": -1.391451120376587, "step": 2735 }, { "epoch": 1.4664659642080615, "grad_norm": 7.103612835569111, "learning_rate": 6.058744856665065e-07, "logits/chosen": -0.1979888379573822, "logits/rejected": -0.11235777288675308, "logps/chosen": -1.134758710861206, "logps/rejected": -1.3274892568588257, "loss": 1.1348, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.134758710861206, "rewards/margins": 0.19273041188716888, "rewards/rejected": -1.3274892568588257, "step": 2740 }, { "epoch": 1.469141996989463, "grad_norm": 5.909687794135183, "learning_rate": 6.043518873921074e-07, "logits/chosen": -0.1663951575756073, "logits/rejected": -0.09601058065891266, "logps/chosen": -1.1709434986114502, "logps/rejected": -1.2519854307174683, "loss": 1.1709, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1709434986114502, "rewards/margins": 0.0810420960187912, "rewards/rejected": -1.2519854307174683, "step": 2745 }, { "epoch": 1.4718180297708647, "grad_norm": 6.385067818122794, "learning_rate": 6.028282766952393e-07, "logits/chosen": -0.1465391367673874, "logits/rejected": -0.07444795966148376, "logps/chosen": -1.2447941303253174, "logps/rejected": -1.4348982572555542, "loss": 1.2448, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2447941303253174, "rewards/margins": 0.19010430574417114, "rewards/rejected": -1.4348982572555542, "step": 2750 }, { "epoch": 1.4744940625522662, "grad_norm": 6.480293226827362, "learning_rate": 6.013036683579798e-07, "logits/chosen": -0.10530716180801392, "logits/rejected": 0.009901325218379498, "logps/chosen": -1.1871936321258545, "logps/rejected": -1.2807506322860718, "loss": 1.1872, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1871936321258545, "rewards/margins": 0.09355700761079788, "rewards/rejected": -1.2807506322860718, "step": 2755 }, { "epoch": 1.4771700953336677, "grad_norm": 6.67378593817868, "learning_rate": 5.997780771720854e-07, "logits/chosen": -0.2241039276123047, "logits/rejected": -0.08775316178798676, "logps/chosen": -1.2366756200790405, "logps/rejected": -1.4248069524765015, "loss": 1.2367, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2366756200790405, "rewards/margins": 0.18813134729862213, "rewards/rejected": -1.4248069524765015, "step": 2760 }, { "epoch": 1.4798461281150694, "grad_norm": 6.363083751576106, "learning_rate": 5.982515179388486e-07, "logits/chosen": -0.1173228770494461, "logits/rejected": -0.012707576155662537, "logps/chosen": -1.1432393789291382, "logps/rejected": -1.3016777038574219, "loss": 1.1432, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1432393789291382, "rewards/margins": 0.15843839943408966, "rewards/rejected": -1.3016777038574219, "step": 2765 }, { "epoch": 1.482522160896471, "grad_norm": 5.084768110905697, "learning_rate": 5.967240054689541e-07, "logits/chosen": -0.17055687308311462, "logits/rejected": -0.11352336406707764, "logps/chosen": -1.1878856420516968, "logps/rejected": -1.2137116193771362, "loss": 1.1879, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1878856420516968, "rewards/margins": 0.025826070457696915, "rewards/rejected": -1.2137116193771362, "step": 2770 }, { "epoch": 1.4851981936778724, "grad_norm": 4.487129980744408, "learning_rate": 5.951955545823342e-07, "logits/chosen": -0.13741546869277954, "logits/rejected": -0.12732776999473572, "logps/chosen": -1.1379644870758057, "logps/rejected": -1.2945141792297363, "loss": 1.138, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1379644870758057, "rewards/margins": 0.15654967725276947, "rewards/rejected": -1.2945141792297363, "step": 2775 }, { "epoch": 1.4878742264592741, "grad_norm": 4.873720718100467, "learning_rate": 5.936661801080263e-07, "logits/chosen": -0.16011826694011688, "logits/rejected": -0.08337688446044922, "logps/chosen": -1.315423607826233, "logps/rejected": -1.4149740934371948, "loss": 1.3154, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.315423607826233, "rewards/margins": 0.09955054521560669, "rewards/rejected": -1.4149740934371948, "step": 2780 }, { "epoch": 1.4905502592406756, "grad_norm": 6.674398948279101, "learning_rate": 5.92135896884028e-07, "logits/chosen": -0.1762860119342804, "logits/rejected": -0.0826021209359169, "logps/chosen": -1.2702248096466064, "logps/rejected": -1.32528817653656, "loss": 1.2702, "rewards/accuracies": 0.5, "rewards/chosen": -1.2702248096466064, "rewards/margins": 0.0550631508231163, "rewards/rejected": -1.32528817653656, "step": 2785 }, { "epoch": 1.4932262920220774, "grad_norm": 8.668540598934465, "learning_rate": 5.906047197571541e-07, "logits/chosen": -0.13938485085964203, "logits/rejected": -0.15405963361263275, "logps/chosen": -1.1503784656524658, "logps/rejected": -1.3194903135299683, "loss": 1.1504, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1503784656524658, "rewards/margins": 0.16911199688911438, "rewards/rejected": -1.3194903135299683, "step": 2790 }, { "epoch": 1.4959023248034788, "grad_norm": 5.802500033818053, "learning_rate": 5.890726635828919e-07, "logits/chosen": -0.034162361174821854, "logits/rejected": -0.02159692905843258, "logps/chosen": -1.1271542310714722, "logps/rejected": -1.213360071182251, "loss": 1.1272, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1271542310714722, "rewards/margins": 0.08620588481426239, "rewards/rejected": -1.213360071182251, "step": 2795 }, { "epoch": 1.4985783575848803, "grad_norm": 6.210402897520438, "learning_rate": 5.875397432252569e-07, "logits/chosen": -0.19711938500404358, "logits/rejected": -0.12864002585411072, "logps/chosen": -1.2781708240509033, "logps/rejected": -1.4665791988372803, "loss": 1.2782, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2781708240509033, "rewards/margins": 0.18840864300727844, "rewards/rejected": -1.4665791988372803, "step": 2800 }, { "epoch": 1.4985783575848803, "eval_logits/chosen": 0.08250605314970016, "eval_logits/rejected": 0.14715583622455597, "eval_logps/chosen": -1.266008734703064, "eval_logps/rejected": -1.388183355331421, "eval_loss": 1.2662686109542847, "eval_rewards/accuracies": 0.5482195615768433, "eval_rewards/chosen": -1.266008734703064, "eval_rewards/margins": 0.12217480689287186, "eval_rewards/rejected": -1.388183355331421, "eval_runtime": 40.0273, "eval_samples_per_second": 33.602, "eval_steps_per_second": 8.419, "step": 2800 }, { "epoch": 1.5012543903662818, "grad_norm": 4.532479446864917, "learning_rate": 5.860059735566491e-07, "logits/chosen": -0.2623627781867981, "logits/rejected": -0.14939776062965393, "logps/chosen": -1.1219213008880615, "logps/rejected": -1.3187074661254883, "loss": 1.1219, "rewards/accuracies": 0.625, "rewards/chosen": -1.1219213008880615, "rewards/margins": 0.19678612053394318, "rewards/rejected": -1.3187074661254883, "step": 2805 }, { "epoch": 1.5039304231476835, "grad_norm": 7.12014689528371, "learning_rate": 5.844713694577087e-07, "logits/chosen": -0.14758452773094177, "logits/rejected": -0.14022314548492432, "logps/chosen": -1.2497012615203857, "logps/rejected": -1.4004625082015991, "loss": 1.2497, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2497012615203857, "rewards/margins": 0.15076127648353577, "rewards/rejected": -1.4004625082015991, "step": 2810 }, { "epoch": 1.5066064559290853, "grad_norm": 6.500765300277263, "learning_rate": 5.829359458171714e-07, "logits/chosen": -0.10217016935348511, "logits/rejected": -0.0018426947062835097, "logps/chosen": -1.2408527135849, "logps/rejected": -1.3546500205993652, "loss": 1.2409, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2408527135849, "rewards/margins": 0.11379728466272354, "rewards/rejected": -1.3546500205993652, "step": 2815 }, { "epoch": 1.5092824887104868, "grad_norm": 4.722337376850997, "learning_rate": 5.81399717531724e-07, "logits/chosen": -0.10681865364313126, "logits/rejected": 0.012222632765769958, "logps/chosen": -1.2193005084991455, "logps/rejected": -1.2015225887298584, "loss": 1.2193, "rewards/accuracies": 0.5, "rewards/chosen": -1.2193005084991455, "rewards/margins": -0.017777858301997185, "rewards/rejected": -1.2015225887298584, "step": 2820 }, { "epoch": 1.5119585214918883, "grad_norm": 5.986577491027151, "learning_rate": 5.798626995058602e-07, "logits/chosen": -0.16803312301635742, "logits/rejected": -0.04343169182538986, "logps/chosen": -1.2751442193984985, "logps/rejected": -1.4545589685440063, "loss": 1.2751, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2751442193984985, "rewards/margins": 0.17941470444202423, "rewards/rejected": -1.4545589685440063, "step": 2825 }, { "epoch": 1.51463455427329, "grad_norm": 5.085519366174262, "learning_rate": 5.783249066517354e-07, "logits/chosen": -0.15298210084438324, "logits/rejected": -0.04136541113257408, "logps/chosen": -1.2668578624725342, "logps/rejected": -1.2494256496429443, "loss": 1.2669, "rewards/accuracies": 0.4375, "rewards/chosen": -1.2668578624725342, "rewards/margins": -0.017432358115911484, "rewards/rejected": -1.2494256496429443, "step": 2830 }, { "epoch": 1.5173105870546915, "grad_norm": 7.771615521989697, "learning_rate": 5.767863538890228e-07, "logits/chosen": -0.1547311544418335, "logits/rejected": -0.019332554191350937, "logps/chosen": -1.188704490661621, "logps/rejected": -1.2712218761444092, "loss": 1.1887, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.188704490661621, "rewards/margins": 0.08251740038394928, "rewards/rejected": -1.2712218761444092, "step": 2835 }, { "epoch": 1.519986619836093, "grad_norm": 6.682995096357411, "learning_rate": 5.75247056144768e-07, "logits/chosen": -0.1413569450378418, "logits/rejected": -0.05635148286819458, "logps/chosen": -1.21980881690979, "logps/rejected": -1.3257548809051514, "loss": 1.2198, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.21980881690979, "rewards/margins": 0.10594598948955536, "rewards/rejected": -1.3257548809051514, "step": 2840 }, { "epoch": 1.5226626526174947, "grad_norm": 4.578930869380602, "learning_rate": 5.737070283532444e-07, "logits/chosen": -0.13955798745155334, "logits/rejected": -0.08480822294950485, "logps/chosen": -1.1595594882965088, "logps/rejected": -1.3342664241790771, "loss": 1.1596, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1595594882965088, "rewards/margins": 0.1747068464756012, "rewards/rejected": -1.3342664241790771, "step": 2845 }, { "epoch": 1.5253386853988962, "grad_norm": 6.361124268188639, "learning_rate": 5.721662854558084e-07, "logits/chosen": -0.1926908791065216, "logits/rejected": -0.14163550734519958, "logps/chosen": -1.2513967752456665, "logps/rejected": -1.3260657787322998, "loss": 1.2514, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.2513967752456665, "rewards/margins": 0.07466918975114822, "rewards/rejected": -1.3260657787322998, "step": 2850 }, { "epoch": 1.5280147181802977, "grad_norm": 5.589980250566613, "learning_rate": 5.706248424007545e-07, "logits/chosen": -0.18902216851711273, "logits/rejected": -0.06937951594591141, "logps/chosen": -1.2956907749176025, "logps/rejected": -1.4399604797363281, "loss": 1.2957, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2956907749176025, "rewards/margins": 0.14426973462104797, "rewards/rejected": -1.4399604797363281, "step": 2855 }, { "epoch": 1.5306907509616994, "grad_norm": 7.2515811756046515, "learning_rate": 5.690827141431699e-07, "logits/chosen": -0.2173794060945511, "logits/rejected": -0.09116693586111069, "logps/chosen": -1.1712462902069092, "logps/rejected": -1.2533628940582275, "loss": 1.1712, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1712462902069092, "rewards/margins": 0.0821165069937706, "rewards/rejected": -1.2533628940582275, "step": 2860 }, { "epoch": 1.5333667837431009, "grad_norm": 4.9962136595083875, "learning_rate": 5.675399156447897e-07, "logits/chosen": -0.28589072823524475, "logits/rejected": -0.1882825642824173, "logps/chosen": -1.1760176420211792, "logps/rejected": -1.3186304569244385, "loss": 1.176, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1760176420211792, "rewards/margins": 0.1426127403974533, "rewards/rejected": -1.3186304569244385, "step": 2865 }, { "epoch": 1.5360428165245024, "grad_norm": 5.052755275943915, "learning_rate": 5.659964618738515e-07, "logits/chosen": -0.18664869666099548, "logits/rejected": -0.105466328561306, "logps/chosen": -1.2411689758300781, "logps/rejected": -1.236175775527954, "loss": 1.2412, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -1.2411689758300781, "rewards/margins": -0.004993182606995106, "rewards/rejected": -1.236175775527954, "step": 2870 }, { "epoch": 1.538718849305904, "grad_norm": 5.947634563651251, "learning_rate": 5.644523678049509e-07, "logits/chosen": -0.16287419199943542, "logits/rejected": -0.1098705530166626, "logps/chosen": -1.2365381717681885, "logps/rejected": -1.3108277320861816, "loss": 1.2365, "rewards/accuracies": 0.5, "rewards/chosen": -1.2365381717681885, "rewards/margins": 0.07428963482379913, "rewards/rejected": -1.3108277320861816, "step": 2875 }, { "epoch": 1.5413948820873056, "grad_norm": 6.161734298433249, "learning_rate": 5.629076484188952e-07, "logits/chosen": -0.06857867538928986, "logits/rejected": 0.018205931410193443, "logps/chosen": -1.185482382774353, "logps/rejected": -1.3450819253921509, "loss": 1.1855, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.185482382774353, "rewards/margins": 0.15959949791431427, "rewards/rejected": -1.3450819253921509, "step": 2880 }, { "epoch": 1.544070914868707, "grad_norm": 5.599749208193078, "learning_rate": 5.613623187025587e-07, "logits/chosen": -0.15522900223731995, "logits/rejected": -0.062411628663539886, "logps/chosen": -1.200128436088562, "logps/rejected": -1.3145294189453125, "loss": 1.2001, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.200128436088562, "rewards/margins": 0.1144009605050087, "rewards/rejected": -1.3145294189453125, "step": 2885 }, { "epoch": 1.5467469476501088, "grad_norm": 4.80515593794597, "learning_rate": 5.598163936487369e-07, "logits/chosen": -0.25130391120910645, "logits/rejected": -0.11056108772754669, "logps/chosen": -1.207871675491333, "logps/rejected": -1.3067653179168701, "loss": 1.2079, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.207871675491333, "rewards/margins": 0.09889379888772964, "rewards/rejected": -1.3067653179168701, "step": 2890 }, { "epoch": 1.5494229804315103, "grad_norm": 4.911364967943851, "learning_rate": 5.582698882560017e-07, "logits/chosen": -0.2161664515733719, "logits/rejected": -0.12254748493432999, "logps/chosen": -1.129655361175537, "logps/rejected": -1.2141048908233643, "loss": 1.1297, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.129655361175537, "rewards/margins": 0.08444943279027939, "rewards/rejected": -1.2141048908233643, "step": 2895 }, { "epoch": 1.5520990132129118, "grad_norm": 4.814682109950279, "learning_rate": 5.567228175285549e-07, "logits/chosen": -0.13949784636497498, "logits/rejected": -0.07020962983369827, "logps/chosen": -1.2276554107666016, "logps/rejected": -1.3136409521102905, "loss": 1.2277, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2276554107666016, "rewards/margins": 0.08598551899194717, "rewards/rejected": -1.3136409521102905, "step": 2900 }, { "epoch": 1.5547750459943135, "grad_norm": 5.0878395658679025, "learning_rate": 5.551751964760838e-07, "logits/chosen": -0.0637345090508461, "logits/rejected": -0.043388910591602325, "logps/chosen": -1.1789171695709229, "logps/rejected": -1.3274840116500854, "loss": 1.1789, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1789171695709229, "rewards/margins": 0.14856679737567902, "rewards/rejected": -1.3274840116500854, "step": 2905 }, { "epoch": 1.557451078775715, "grad_norm": 5.997582070221772, "learning_rate": 5.536270401136145e-07, "logits/chosen": -0.12833306193351746, "logits/rejected": -0.08201871812343597, "logps/chosen": -1.1643414497375488, "logps/rejected": -1.2854677438735962, "loss": 1.1643, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1643414497375488, "rewards/margins": 0.12112633883953094, "rewards/rejected": -1.2854677438735962, "step": 2910 }, { "epoch": 1.5601271115571165, "grad_norm": 7.283223622402654, "learning_rate": 5.520783634613667e-07, "logits/chosen": -0.10201741755008698, "logits/rejected": 0.027971964329481125, "logps/chosen": -1.2574814558029175, "logps/rejected": -1.3378180265426636, "loss": 1.2575, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2574814558029175, "rewards/margins": 0.08033658564090729, "rewards/rejected": -1.3378180265426636, "step": 2915 }, { "epoch": 1.5628031443385182, "grad_norm": 4.570402580104556, "learning_rate": 5.505291815446082e-07, "logits/chosen": -0.08022661507129669, "logits/rejected": 0.009562060236930847, "logps/chosen": -1.2268797159194946, "logps/rejected": -1.373541235923767, "loss": 1.2269, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2268797159194946, "rewards/margins": 0.14666154980659485, "rewards/rejected": -1.373541235923767, "step": 2920 }, { "epoch": 1.5654791771199197, "grad_norm": 5.876934061593762, "learning_rate": 5.489795093935089e-07, "logits/chosen": -0.08421742171049118, "logits/rejected": -0.05202580615878105, "logps/chosen": -1.1369997262954712, "logps/rejected": -1.3792202472686768, "loss": 1.137, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1369997262954712, "rewards/margins": 0.24222061038017273, "rewards/rejected": -1.3792202472686768, "step": 2925 }, { "epoch": 1.5681552099013212, "grad_norm": 6.955727967693247, "learning_rate": 5.474293620429946e-07, "logits/chosen": -0.2178916186094284, "logits/rejected": -0.10460181534290314, "logps/chosen": -1.163715124130249, "logps/rejected": -1.3573652505874634, "loss": 1.1637, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.163715124130249, "rewards/margins": 0.193650022149086, "rewards/rejected": -1.3573652505874634, "step": 2930 }, { "epoch": 1.570831242682723, "grad_norm": 6.030948645817767, "learning_rate": 5.458787545326018e-07, "logits/chosen": -0.19033949077129364, "logits/rejected": -0.07696288079023361, "logps/chosen": -1.214900016784668, "logps/rejected": -1.361677885055542, "loss": 1.2149, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.214900016784668, "rewards/margins": 0.14677776396274567, "rewards/rejected": -1.361677885055542, "step": 2935 }, { "epoch": 1.5735072754641244, "grad_norm": 5.474406407344182, "learning_rate": 5.443277019063311e-07, "logits/chosen": -0.17739342153072357, "logits/rejected": -0.046794407069683075, "logps/chosen": -1.2170674800872803, "logps/rejected": -1.3618934154510498, "loss": 1.2171, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2170674800872803, "rewards/margins": 0.14482590556144714, "rewards/rejected": -1.3618934154510498, "step": 2940 }, { "epoch": 1.5761833082455259, "grad_norm": 5.781704309210153, "learning_rate": 5.427762192125023e-07, "logits/chosen": -0.22032809257507324, "logits/rejected": -0.11137136071920395, "logps/chosen": -1.1859705448150635, "logps/rejected": -1.26736581325531, "loss": 1.186, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1859705448150635, "rewards/margins": 0.08139535039663315, "rewards/rejected": -1.26736581325531, "step": 2945 }, { "epoch": 1.5788593410269276, "grad_norm": 5.980155617081231, "learning_rate": 5.41224321503607e-07, "logits/chosen": -0.0958017110824585, "logits/rejected": 0.09359926730394363, "logps/chosen": -1.156237244606018, "logps/rejected": -1.3081496953964233, "loss": 1.1562, "rewards/accuracies": 0.59375, "rewards/chosen": -1.156237244606018, "rewards/margins": 0.15191258490085602, "rewards/rejected": -1.3081496953964233, "step": 2950 }, { "epoch": 1.5815353738083293, "grad_norm": 6.15496285887636, "learning_rate": 5.396720238361637e-07, "logits/chosen": -0.07956452667713165, "logits/rejected": -0.0019595653284341097, "logps/chosen": -1.159544587135315, "logps/rejected": -1.344609022140503, "loss": 1.1595, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.159544587135315, "rewards/margins": 0.18506447970867157, "rewards/rejected": -1.344609022140503, "step": 2955 }, { "epoch": 1.5842114065897306, "grad_norm": 6.519010816657736, "learning_rate": 5.381193412705711e-07, "logits/chosen": -0.15968526899814606, "logits/rejected": -0.06919540464878082, "logps/chosen": -1.1926510334014893, "logps/rejected": -1.3143584728240967, "loss": 1.1927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1926510334014893, "rewards/margins": 0.12170721590518951, "rewards/rejected": -1.3143584728240967, "step": 2960 }, { "epoch": 1.5868874393711323, "grad_norm": 7.7084223377743495, "learning_rate": 5.365662888709622e-07, "logits/chosen": -0.13733483850955963, "logits/rejected": -0.07336181402206421, "logps/chosen": -1.1589477062225342, "logps/rejected": -1.2619670629501343, "loss": 1.1589, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1589477062225342, "rewards/margins": 0.10301937907934189, "rewards/rejected": -1.2619670629501343, "step": 2965 }, { "epoch": 1.589563472152534, "grad_norm": 6.101483947699433, "learning_rate": 5.350128817050585e-07, "logits/chosen": -0.15902869403362274, "logits/rejected": -0.03487970679998398, "logps/chosen": -1.2657229900360107, "logps/rejected": -1.3030707836151123, "loss": 1.2657, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2657229900360107, "rewards/margins": 0.03734783083200455, "rewards/rejected": -1.3030707836151123, "step": 2970 }, { "epoch": 1.5922395049339353, "grad_norm": 7.4715717049711365, "learning_rate": 5.334591348440229e-07, "logits/chosen": -0.0955582857131958, "logits/rejected": 0.005968635901808739, "logps/chosen": -1.2224586009979248, "logps/rejected": -1.4242290258407593, "loss": 1.2225, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2224586009979248, "rewards/margins": 0.20177051424980164, "rewards/rejected": -1.4242290258407593, "step": 2975 }, { "epoch": 1.594915537715337, "grad_norm": 6.172419493268233, "learning_rate": 5.319050633623141e-07, "logits/chosen": -0.22095856070518494, "logits/rejected": -0.11121181398630142, "logps/chosen": -1.258859395980835, "logps/rejected": -1.3840798139572144, "loss": 1.2589, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.258859395980835, "rewards/margins": 0.12522053718566895, "rewards/rejected": -1.3840798139572144, "step": 2980 }, { "epoch": 1.5975915704967387, "grad_norm": 4.526432188055738, "learning_rate": 5.303506823375409e-07, "logits/chosen": -0.18040831387043, "logits/rejected": -0.050440408289432526, "logps/chosen": -1.2830060720443726, "logps/rejected": -1.3131835460662842, "loss": 1.283, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2830060720443726, "rewards/margins": 0.030177345499396324, "rewards/rejected": -1.3131835460662842, "step": 2985 }, { "epoch": 1.60026760327814, "grad_norm": 6.856400458520931, "learning_rate": 5.287960068503143e-07, "logits/chosen": -0.14287550747394562, "logits/rejected": -0.01893492415547371, "logps/chosen": -1.1644659042358398, "logps/rejected": -1.3480331897735596, "loss": 1.1645, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1644659042358398, "rewards/margins": 0.18356743454933167, "rewards/rejected": -1.3480331897735596, "step": 2990 }, { "epoch": 1.6029436360595417, "grad_norm": 6.838121832747213, "learning_rate": 5.272410519841032e-07, "logits/chosen": -0.15429839491844177, "logits/rejected": -0.09727036952972412, "logps/chosen": -1.262001633644104, "logps/rejected": -1.4208881855010986, "loss": 1.262, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.262001633644104, "rewards/margins": 0.1588863581418991, "rewards/rejected": -1.4208881855010986, "step": 2995 }, { "epoch": 1.6056196688409434, "grad_norm": 5.554967647338135, "learning_rate": 5.256858328250861e-07, "logits/chosen": -0.19037316739559174, "logits/rejected": -0.10018163919448853, "logps/chosen": -1.2529418468475342, "logps/rejected": -1.3540884256362915, "loss": 1.2529, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2529418468475342, "rewards/margins": 0.10114644467830658, "rewards/rejected": -1.3540884256362915, "step": 3000 }, { "epoch": 1.608295701622345, "grad_norm": 6.035262383088494, "learning_rate": 5.241303644620063e-07, "logits/chosen": -0.2202160656452179, "logits/rejected": -0.10643961280584335, "logps/chosen": -1.1431570053100586, "logps/rejected": -1.2954752445220947, "loss": 1.1432, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1431570053100586, "rewards/margins": 0.15231840312480927, "rewards/rejected": -1.2954752445220947, "step": 3005 }, { "epoch": 1.6109717344037464, "grad_norm": 6.5270126797710395, "learning_rate": 5.225746619860248e-07, "logits/chosen": -0.1912938356399536, "logits/rejected": -0.09891058504581451, "logps/chosen": -1.2103872299194336, "logps/rejected": -1.3077054023742676, "loss": 1.2104, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2103872299194336, "rewards/margins": 0.09731811285018921, "rewards/rejected": -1.3077054023742676, "step": 3010 }, { "epoch": 1.6136477671851481, "grad_norm": 8.11383107663099, "learning_rate": 5.210187404905735e-07, "logits/chosen": -0.06253752112388611, "logits/rejected": 0.000773303210735321, "logps/chosen": -1.185871958732605, "logps/rejected": -1.3746248483657837, "loss": 1.1859, "rewards/accuracies": 0.59375, "rewards/chosen": -1.185871958732605, "rewards/margins": 0.18875280022621155, "rewards/rejected": -1.3746248483657837, "step": 3015 }, { "epoch": 1.6163237999665496, "grad_norm": 4.432990967842032, "learning_rate": 5.194626150712098e-07, "logits/chosen": -0.20629668235778809, "logits/rejected": -0.08360473811626434, "logps/chosen": -1.1964248418807983, "logps/rejected": -1.2561715841293335, "loss": 1.1964, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1964248418807983, "rewards/margins": 0.059746671468019485, "rewards/rejected": -1.2561715841293335, "step": 3020 }, { "epoch": 1.6189998327479511, "grad_norm": 5.407931479416349, "learning_rate": 5.179063008254695e-07, "logits/chosen": -0.1433001607656479, "logits/rejected": -0.03913353383541107, "logps/chosen": -1.1690419912338257, "logps/rejected": -1.2795664072036743, "loss": 1.169, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1690419912338257, "rewards/margins": 0.11052443087100983, "rewards/rejected": -1.2795664072036743, "step": 3025 }, { "epoch": 1.6216758655293528, "grad_norm": 3.885167998526482, "learning_rate": 5.163498128527199e-07, "logits/chosen": -0.14312958717346191, "logits/rejected": -0.03685606271028519, "logps/chosen": -1.2552669048309326, "logps/rejected": -1.2934163808822632, "loss": 1.2553, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2552669048309326, "rewards/margins": 0.038149505853652954, "rewards/rejected": -1.2934163808822632, "step": 3030 }, { "epoch": 1.6243518983107543, "grad_norm": 6.553108456934906, "learning_rate": 5.147931662540144e-07, "logits/chosen": -0.014121739193797112, "logits/rejected": 0.06224860996007919, "logps/chosen": -1.2361886501312256, "logps/rejected": -1.2648775577545166, "loss": 1.2362, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2361886501312256, "rewards/margins": 0.028688913211226463, "rewards/rejected": -1.2648775577545166, "step": 3035 }, { "epoch": 1.6270279310921558, "grad_norm": 8.434783168217171, "learning_rate": 5.132363761319449e-07, "logits/chosen": -0.14681598544120789, "logits/rejected": -0.10475464165210724, "logps/chosen": -1.1476919651031494, "logps/rejected": -1.329223394393921, "loss": 1.1477, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1476919651031494, "rewards/margins": 0.18153154850006104, "rewards/rejected": -1.329223394393921, "step": 3040 }, { "epoch": 1.6297039638735575, "grad_norm": 5.481195251672301, "learning_rate": 5.116794575904962e-07, "logits/chosen": -0.11752434074878693, "logits/rejected": -0.03900138661265373, "logps/chosen": -1.1684796810150146, "logps/rejected": -1.2430273294448853, "loss": 1.1685, "rewards/accuracies": 0.5, "rewards/chosen": -1.1684796810150146, "rewards/margins": 0.07454764097929001, "rewards/rejected": -1.2430273294448853, "step": 3045 }, { "epoch": 1.632379996654959, "grad_norm": 6.923409549213314, "learning_rate": 5.101224257348987e-07, "logits/chosen": -0.1833103895187378, "logits/rejected": -0.054696209728717804, "logps/chosen": -1.2325997352600098, "logps/rejected": -1.3919403553009033, "loss": 1.2326, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2325997352600098, "rewards/margins": 0.15934067964553833, "rewards/rejected": -1.3919403553009033, "step": 3050 }, { "epoch": 1.6350560294363605, "grad_norm": 4.562375102780086, "learning_rate": 5.085652956714823e-07, "logits/chosen": -0.15085473656654358, "logits/rejected": -0.05616817623376846, "logps/chosen": -1.1719554662704468, "logps/rejected": -1.3407684564590454, "loss": 1.172, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1719554662704468, "rewards/margins": 0.1688128560781479, "rewards/rejected": -1.3407684564590454, "step": 3055 }, { "epoch": 1.6377320622177622, "grad_norm": 5.223903873278388, "learning_rate": 5.070080825075298e-07, "logits/chosen": -0.19301117956638336, "logits/rejected": -0.06311734020709991, "logps/chosen": -1.2162679433822632, "logps/rejected": -1.367849588394165, "loss": 1.2163, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2162679433822632, "rewards/margins": 0.15158165991306305, "rewards/rejected": -1.367849588394165, "step": 3060 }, { "epoch": 1.6404080949991637, "grad_norm": 5.2437478027626385, "learning_rate": 5.0545080135113e-07, "logits/chosen": -0.07739882171154022, "logits/rejected": -0.06213326007127762, "logps/chosen": -1.212134599685669, "logps/rejected": -1.430133581161499, "loss": 1.2121, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.212134599685669, "rewards/margins": 0.21799901127815247, "rewards/rejected": -1.430133581161499, "step": 3065 }, { "epoch": 1.6430841277805652, "grad_norm": 5.27110228346977, "learning_rate": 5.038934673110316e-07, "logits/chosen": -0.21068044006824493, "logits/rejected": -0.1214398518204689, "logps/chosen": -1.2106714248657227, "logps/rejected": -1.3410584926605225, "loss": 1.2107, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2106714248657227, "rewards/margins": 0.13038699328899384, "rewards/rejected": -1.3410584926605225, "step": 3070 }, { "epoch": 1.645760160561967, "grad_norm": 4.574627016566, "learning_rate": 5.023360954964963e-07, "logits/chosen": -0.2170284539461136, "logits/rejected": -0.17568986117839813, "logps/chosen": -1.1424022912979126, "logps/rejected": -1.2875709533691406, "loss": 1.1424, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1424022912979126, "rewards/margins": 0.14516869187355042, "rewards/rejected": -1.2875709533691406, "step": 3075 }, { "epoch": 1.6484361933433684, "grad_norm": 4.788451314226406, "learning_rate": 5.007787010171524e-07, "logits/chosen": -0.2599736750125885, "logits/rejected": -0.10823975503444672, "logps/chosen": -1.1382328271865845, "logps/rejected": -1.2780641317367554, "loss": 1.1382, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1382328271865845, "rewards/margins": 0.1398313194513321, "rewards/rejected": -1.2780641317367554, "step": 3080 }, { "epoch": 1.65111222612477, "grad_norm": 3.9602393741930646, "learning_rate": 4.992212989828477e-07, "logits/chosen": -0.0947759747505188, "logits/rejected": -0.09989680349826813, "logps/chosen": -1.1413180828094482, "logps/rejected": -1.2750688791275024, "loss": 1.1413, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1413180828094482, "rewards/margins": 0.13375088572502136, "rewards/rejected": -1.2750688791275024, "step": 3085 }, { "epoch": 1.6537882589061716, "grad_norm": 4.8768332318601075, "learning_rate": 4.976639045035036e-07, "logits/chosen": -0.07192603498697281, "logits/rejected": -0.01887873373925686, "logps/chosen": -1.184253454208374, "logps/rejected": -1.2809436321258545, "loss": 1.1843, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.184253454208374, "rewards/margins": 0.09669016301631927, "rewards/rejected": -1.2809436321258545, "step": 3090 }, { "epoch": 1.6564642916875731, "grad_norm": 8.285113385663806, "learning_rate": 4.961065326889683e-07, "logits/chosen": -0.13662883639335632, "logits/rejected": -0.03375939279794693, "logps/chosen": -1.1924139261245728, "logps/rejected": -1.314000129699707, "loss": 1.1924, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1924139261245728, "rewards/margins": 0.12158627808094025, "rewards/rejected": -1.314000129699707, "step": 3095 }, { "epoch": 1.6591403244689746, "grad_norm": 6.773619449546771, "learning_rate": 4.9454919864887e-07, "logits/chosen": -0.2870023846626282, "logits/rejected": -0.17406241595745087, "logps/chosen": -1.2400453090667725, "logps/rejected": -1.3264992237091064, "loss": 1.24, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2400453090667725, "rewards/margins": 0.08645391464233398, "rewards/rejected": -1.3264992237091064, "step": 3100 }, { "epoch": 1.6618163572503764, "grad_norm": 6.2714825292507586, "learning_rate": 4.929919174924701e-07, "logits/chosen": -0.19973672926425934, "logits/rejected": -0.06744730472564697, "logps/chosen": -1.2441459894180298, "logps/rejected": -1.3366062641143799, "loss": 1.2441, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2441459894180298, "rewards/margins": 0.09246013313531876, "rewards/rejected": -1.3366062641143799, "step": 3105 }, { "epoch": 1.6644923900317778, "grad_norm": 6.182041725119687, "learning_rate": 4.914347043285177e-07, "logits/chosen": -0.150596484541893, "logits/rejected": -0.07052230834960938, "logps/chosen": -1.2131426334381104, "logps/rejected": -1.3781474828720093, "loss": 1.2131, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2131426334381104, "rewards/margins": 0.1650049090385437, "rewards/rejected": -1.3781474828720093, "step": 3110 }, { "epoch": 1.6671684228131793, "grad_norm": 4.689111054193859, "learning_rate": 4.898775742651013e-07, "logits/chosen": -0.08023428916931152, "logits/rejected": -0.03191228210926056, "logps/chosen": -1.2221479415893555, "logps/rejected": -1.4154804944992065, "loss": 1.2221, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2221479415893555, "rewards/margins": 0.19333258271217346, "rewards/rejected": -1.4154804944992065, "step": 3115 }, { "epoch": 1.669844455594581, "grad_norm": 4.37873620393665, "learning_rate": 4.883205424095037e-07, "logits/chosen": -0.1957603245973587, "logits/rejected": -0.09004609286785126, "logps/chosen": -1.2818992137908936, "logps/rejected": -1.3804773092269897, "loss": 1.2819, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2818992137908936, "rewards/margins": 0.09857799857854843, "rewards/rejected": -1.3804773092269897, "step": 3120 }, { "epoch": 1.6725204883759828, "grad_norm": 6.966538115340276, "learning_rate": 4.86763623868055e-07, "logits/chosen": -0.10500431060791016, "logits/rejected": -0.033387068659067154, "logps/chosen": -1.2528984546661377, "logps/rejected": -1.4050484895706177, "loss": 1.2529, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2528984546661377, "rewards/margins": 0.1521500051021576, "rewards/rejected": -1.4050484895706177, "step": 3125 }, { "epoch": 1.675196521157384, "grad_norm": 6.052532850839008, "learning_rate": 4.852068337459856e-07, "logits/chosen": -0.089266836643219, "logits/rejected": 0.007674031890928745, "logps/chosen": -1.2808443307876587, "logps/rejected": -1.3945075273513794, "loss": 1.2808, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2808443307876587, "rewards/margins": 0.11366318166255951, "rewards/rejected": -1.3945075273513794, "step": 3130 }, { "epoch": 1.6778725539387858, "grad_norm": 5.548099504471589, "learning_rate": 4.8365018714728e-07, "logits/chosen": -0.07107739895582199, "logits/rejected": -0.035806186497211456, "logps/chosen": -1.2749074697494507, "logps/rejected": -1.3686169385910034, "loss": 1.2749, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2749074697494507, "rewards/margins": 0.09370945394039154, "rewards/rejected": -1.3686169385910034, "step": 3135 }, { "epoch": 1.6805485867201875, "grad_norm": 4.051242859517567, "learning_rate": 4.820936991745304e-07, "logits/chosen": -0.30449378490448, "logits/rejected": -0.17652608454227448, "logps/chosen": -1.1597936153411865, "logps/rejected": -1.2193009853363037, "loss": 1.1598, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1597936153411865, "rewards/margins": 0.05950726941227913, "rewards/rejected": -1.2193009853363037, "step": 3140 }, { "epoch": 1.6832246195015887, "grad_norm": 5.823708038407726, "learning_rate": 4.8053738492879e-07, "logits/chosen": -0.10863365978002548, "logits/rejected": -0.006353181786835194, "logps/chosen": -1.2121670246124268, "logps/rejected": -1.2294548749923706, "loss": 1.2122, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2121670246124268, "rewards/margins": 0.01728782057762146, "rewards/rejected": -1.2294548749923706, "step": 3145 }, { "epoch": 1.6859006522829905, "grad_norm": 4.584498303797099, "learning_rate": 4.789812595094265e-07, "logits/chosen": -0.2431916892528534, "logits/rejected": -0.14749568700790405, "logps/chosen": -1.2805802822113037, "logps/rejected": -1.4023070335388184, "loss": 1.2806, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2805802822113037, "rewards/margins": 0.12172672897577286, "rewards/rejected": -1.4023070335388184, "step": 3150 }, { "epoch": 1.6885766850643922, "grad_norm": 5.833686281150346, "learning_rate": 4.774253380139752e-07, "logits/chosen": -0.23503148555755615, "logits/rejected": -0.14121034741401672, "logps/chosen": -1.1503289937973022, "logps/rejected": -1.3190218210220337, "loss": 1.1503, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1503289937973022, "rewards/margins": 0.16869261860847473, "rewards/rejected": -1.3190218210220337, "step": 3155 }, { "epoch": 1.6912527178457935, "grad_norm": 6.575267958105358, "learning_rate": 4.758696355379936e-07, "logits/chosen": -0.1779743731021881, "logits/rejected": -0.15948285162448883, "logps/chosen": -1.1834557056427002, "logps/rejected": -1.3771584033966064, "loss": 1.1835, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1834557056427002, "rewards/margins": 0.19370253384113312, "rewards/rejected": -1.3771584033966064, "step": 3160 }, { "epoch": 1.6939287506271952, "grad_norm": 5.259691998710948, "learning_rate": 4.743141671749138e-07, "logits/chosen": -0.2061273604631424, "logits/rejected": -0.1377643197774887, "logps/chosen": -1.2452961206436157, "logps/rejected": -1.3423787355422974, "loss": 1.2453, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2452961206436157, "rewards/margins": 0.09708261489868164, "rewards/rejected": -1.3423787355422974, "step": 3165 }, { "epoch": 1.6966047834085969, "grad_norm": 5.072741847951233, "learning_rate": 4.727589480158968e-07, "logits/chosen": -0.18694308400154114, "logits/rejected": -0.11302758753299713, "logps/chosen": -1.2233017683029175, "logps/rejected": -1.317065954208374, "loss": 1.2233, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2233017683029175, "rewards/margins": 0.09376402199268341, "rewards/rejected": -1.317065954208374, "step": 3170 }, { "epoch": 1.6992808161899984, "grad_norm": 5.7759686298122155, "learning_rate": 4.712039931496855e-07, "logits/chosen": -0.22526881098747253, "logits/rejected": -0.16255110502243042, "logps/chosen": -1.167076587677002, "logps/rejected": -1.3099515438079834, "loss": 1.1671, "rewards/accuracies": 0.5625, "rewards/chosen": -1.167076587677002, "rewards/margins": 0.14287501573562622, "rewards/rejected": -1.3099515438079834, "step": 3175 }, { "epoch": 1.7019568489713999, "grad_norm": 4.481205463629761, "learning_rate": 4.6964931766245905e-07, "logits/chosen": -0.09668828547000885, "logits/rejected": -0.05123674124479294, "logps/chosen": -1.2380788326263428, "logps/rejected": -1.3862733840942383, "loss": 1.2381, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2380788326263428, "rewards/margins": 0.14819470047950745, "rewards/rejected": -1.3862733840942383, "step": 3180 }, { "epoch": 1.7046328817528016, "grad_norm": 5.709797231928383, "learning_rate": 4.6809493663768575e-07, "logits/chosen": -0.1661548912525177, "logits/rejected": -0.15955792367458344, "logps/chosen": -1.1290091276168823, "logps/rejected": -1.2672656774520874, "loss": 1.129, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1290091276168823, "rewards/margins": 0.13825656473636627, "rewards/rejected": -1.2672656774520874, "step": 3185 }, { "epoch": 1.707308914534203, "grad_norm": 6.672216322801481, "learning_rate": 4.6654086515597716e-07, "logits/chosen": -0.21533150970935822, "logits/rejected": -0.11372718960046768, "logps/chosen": -1.1867080926895142, "logps/rejected": -1.3704780340194702, "loss": 1.1867, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1867080926895142, "rewards/margins": 0.18376995623111725, "rewards/rejected": -1.3704780340194702, "step": 3190 }, { "epoch": 1.7099849473156046, "grad_norm": 4.070388639988112, "learning_rate": 4.6498711829494154e-07, "logits/chosen": -0.2501601576805115, "logits/rejected": -0.16371062397956848, "logps/chosen": -1.18706476688385, "logps/rejected": -1.390226125717163, "loss": 1.1871, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.18706476688385, "rewards/margins": 0.2031613290309906, "rewards/rejected": -1.390226125717163, "step": 3195 }, { "epoch": 1.7126609800970063, "grad_norm": 5.812914487635193, "learning_rate": 4.6343371112903777e-07, "logits/chosen": -0.14225588738918304, "logits/rejected": -0.011802751570940018, "logps/chosen": -1.2249597311019897, "logps/rejected": -1.4622248411178589, "loss": 1.225, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2249597311019897, "rewards/margins": 0.2372651845216751, "rewards/rejected": -1.4622248411178589, "step": 3200 }, { "epoch": 1.7126609800970063, "eval_logits/chosen": 0.09014469385147095, "eval_logits/rejected": 0.15613968670368195, "eval_logps/chosen": -1.26457941532135, "eval_logps/rejected": -1.3871713876724243, "eval_loss": 1.264853596687317, "eval_rewards/accuracies": 0.5459940433502197, "eval_rewards/chosen": -1.26457941532135, "eval_rewards/margins": 0.12259194999933243, "eval_rewards/rejected": -1.3871713876724243, "eval_runtime": 40.0217, "eval_samples_per_second": 33.607, "eval_steps_per_second": 8.42, "step": 3200 }, { "epoch": 1.7153370128784078, "grad_norm": 4.5048152321698485, "learning_rate": 4.618806587294291e-07, "logits/chosen": -0.26347455382347107, "logits/rejected": -0.17424623668193817, "logps/chosen": -1.2465096712112427, "logps/rejected": -1.383754014968872, "loss": 1.2465, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2465096712112427, "rewards/margins": 0.13724440336227417, "rewards/rejected": -1.383754014968872, "step": 3205 }, { "epoch": 1.7180130456598093, "grad_norm": 6.611659775975374, "learning_rate": 4.603279761638365e-07, "logits/chosen": -0.22858722507953644, "logits/rejected": -0.15155892074108124, "logps/chosen": -1.215763807296753, "logps/rejected": -1.3446292877197266, "loss": 1.2158, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.215763807296753, "rewards/margins": 0.12886551022529602, "rewards/rejected": -1.3446292877197266, "step": 3210 }, { "epoch": 1.720689078441211, "grad_norm": 6.5918914719815564, "learning_rate": 4.5877567849639315e-07, "logits/chosen": -0.20625846087932587, "logits/rejected": -0.1099659651517868, "logps/chosen": -1.1855093240737915, "logps/rejected": -1.2820783853530884, "loss": 1.1855, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1855093240737915, "rewards/margins": 0.09656906127929688, "rewards/rejected": -1.2820783853530884, "step": 3215 }, { "epoch": 1.7233651112226125, "grad_norm": 4.977474819994388, "learning_rate": 4.572237807874979e-07, "logits/chosen": -0.19995896518230438, "logits/rejected": -0.04748598113656044, "logps/chosen": -1.2684147357940674, "logps/rejected": -1.3642070293426514, "loss": 1.2684, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2684147357940674, "rewards/margins": 0.0957925096154213, "rewards/rejected": -1.3642070293426514, "step": 3220 }, { "epoch": 1.726041144004014, "grad_norm": 6.324969238143487, "learning_rate": 4.5567229809366895e-07, "logits/chosen": -0.2073971927165985, "logits/rejected": -0.09606052935123444, "logps/chosen": -1.1382046937942505, "logps/rejected": -1.3042744398117065, "loss": 1.1382, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1382046937942505, "rewards/margins": 0.16606983542442322, "rewards/rejected": -1.3042744398117065, "step": 3225 }, { "epoch": 1.7287171767854157, "grad_norm": 6.6638192393436215, "learning_rate": 4.541212454673984e-07, "logits/chosen": -0.21064849197864532, "logits/rejected": -0.10079564154148102, "logps/chosen": -1.183974027633667, "logps/rejected": -1.3407843112945557, "loss": 1.184, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.183974027633667, "rewards/margins": 0.15681013464927673, "rewards/rejected": -1.3407843112945557, "step": 3230 }, { "epoch": 1.7313932095668172, "grad_norm": 5.958423481181359, "learning_rate": 4.525706379570055e-07, "logits/chosen": -0.17071852087974548, "logits/rejected": -0.13819026947021484, "logps/chosen": -1.2116693258285522, "logps/rejected": -1.3516271114349365, "loss": 1.2117, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2116693258285522, "rewards/margins": 0.13995787501335144, "rewards/rejected": -1.3516271114349365, "step": 3235 }, { "epoch": 1.7340692423482187, "grad_norm": 5.414530406875436, "learning_rate": 4.510204906064911e-07, "logits/chosen": -0.10903360694646835, "logits/rejected": -0.04427602142095566, "logps/chosen": -1.1561000347137451, "logps/rejected": -1.3564187288284302, "loss": 1.1561, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1561000347137451, "rewards/margins": 0.2003188133239746, "rewards/rejected": -1.3564187288284302, "step": 3240 }, { "epoch": 1.7367452751296204, "grad_norm": 5.897269677176872, "learning_rate": 4.4947081845539177e-07, "logits/chosen": -0.279106080532074, "logits/rejected": -0.1739000380039215, "logps/chosen": -1.1577059030532837, "logps/rejected": -1.294065237045288, "loss": 1.1577, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1577059030532837, "rewards/margins": 0.13635943830013275, "rewards/rejected": -1.294065237045288, "step": 3245 }, { "epoch": 1.739421307911022, "grad_norm": 4.3934377593580525, "learning_rate": 4.479216365386333e-07, "logits/chosen": -0.07781601697206497, "logits/rejected": 0.005089476704597473, "logps/chosen": -1.2156165838241577, "logps/rejected": -1.3949494361877441, "loss": 1.2156, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2156165838241577, "rewards/margins": 0.1793329417705536, "rewards/rejected": -1.3949494361877441, "step": 3250 }, { "epoch": 1.7420973406924234, "grad_norm": 6.666240727156985, "learning_rate": 4.4637295988638555e-07, "logits/chosen": -0.1072242483496666, "logits/rejected": -0.05045003443956375, "logps/chosen": -1.3007221221923828, "logps/rejected": -1.3096402883529663, "loss": 1.3007, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3007221221923828, "rewards/margins": 0.008918101899325848, "rewards/rejected": -1.3096402883529663, "step": 3255 }, { "epoch": 1.744773373473825, "grad_norm": 6.428058737077053, "learning_rate": 4.4482480352391623e-07, "logits/chosen": -0.25356870889663696, "logits/rejected": -0.15076008439064026, "logps/chosen": -1.2620861530303955, "logps/rejected": -1.3085941076278687, "loss": 1.2621, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2620861530303955, "rewards/margins": 0.046508003026247025, "rewards/rejected": -1.3085941076278687, "step": 3260 }, { "epoch": 1.7474494062552266, "grad_norm": 7.431339871200085, "learning_rate": 4.4327718247144507e-07, "logits/chosen": -0.1562182456254959, "logits/rejected": -0.07953982055187225, "logps/chosen": -1.1501110792160034, "logps/rejected": -1.3153927326202393, "loss": 1.1501, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1501110792160034, "rewards/margins": 0.16528162360191345, "rewards/rejected": -1.3153927326202393, "step": 3265 }, { "epoch": 1.750125439036628, "grad_norm": 5.6363508066593315, "learning_rate": 4.417301117439984e-07, "logits/chosen": -0.12033051252365112, "logits/rejected": 0.017730269581079483, "logps/chosen": -1.1142678260803223, "logps/rejected": -1.3280953168869019, "loss": 1.1143, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1142678260803223, "rewards/margins": 0.2138274908065796, "rewards/rejected": -1.3280953168869019, "step": 3270 }, { "epoch": 1.7528014718180298, "grad_norm": 4.1634650635553365, "learning_rate": 4.401836063512631e-07, "logits/chosen": -0.15279312431812286, "logits/rejected": 0.07945416867733002, "logps/chosen": -1.2397702932357788, "logps/rejected": -1.2728713750839233, "loss": 1.2398, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2397702932357788, "rewards/margins": 0.03310111165046692, "rewards/rejected": -1.2728713750839233, "step": 3275 }, { "epoch": 1.7554775045994313, "grad_norm": 6.16485147664817, "learning_rate": 4.386376812974413e-07, "logits/chosen": -0.1574951559305191, "logits/rejected": -0.09371761232614517, "logps/chosen": -1.1746957302093506, "logps/rejected": -1.2900984287261963, "loss": 1.1747, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1746957302093506, "rewards/margins": 0.11540268361568451, "rewards/rejected": -1.2900984287261963, "step": 3280 }, { "epoch": 1.7581535373808328, "grad_norm": 5.569966822993427, "learning_rate": 4.370923515811048e-07, "logits/chosen": -0.17144839465618134, "logits/rejected": -0.0011734783183783293, "logps/chosen": -1.1742132902145386, "logps/rejected": -1.3557209968566895, "loss": 1.1742, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1742132902145386, "rewards/margins": 0.1815076321363449, "rewards/rejected": -1.3557209968566895, "step": 3285 }, { "epoch": 1.7608295701622345, "grad_norm": 6.745141635669032, "learning_rate": 4.35547632195049e-07, "logits/chosen": -0.12174420058727264, "logits/rejected": -0.03572068363428116, "logps/chosen": -1.1870481967926025, "logps/rejected": -1.224724531173706, "loss": 1.187, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.1870481967926025, "rewards/margins": 0.0376763641834259, "rewards/rejected": -1.224724531173706, "step": 3290 }, { "epoch": 1.763505602943636, "grad_norm": 5.902520503803092, "learning_rate": 4.340035381261484e-07, "logits/chosen": -0.17321956157684326, "logits/rejected": -0.13926365971565247, "logps/chosen": -1.2737218141555786, "logps/rejected": -1.3080699443817139, "loss": 1.2737, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2737218141555786, "rewards/margins": 0.03434809669852257, "rewards/rejected": -1.3080699443817139, "step": 3295 }, { "epoch": 1.7661816357250375, "grad_norm": 5.877514642150992, "learning_rate": 4.324600843552104e-07, "logits/chosen": -0.24112734198570251, "logits/rejected": -0.14247636497020721, "logps/chosen": -1.2650901079177856, "logps/rejected": -1.3421342372894287, "loss": 1.2651, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2650901079177856, "rewards/margins": 0.07704424858093262, "rewards/rejected": -1.3421342372894287, "step": 3300 }, { "epoch": 1.7688576685064392, "grad_norm": 6.496648499093931, "learning_rate": 4.309172858568302e-07, "logits/chosen": -0.2495114803314209, "logits/rejected": -0.1547159105539322, "logps/chosen": -1.2349172830581665, "logps/rejected": -1.307373285293579, "loss": 1.2349, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2349172830581665, "rewards/margins": 0.0724562406539917, "rewards/rejected": -1.307373285293579, "step": 3305 }, { "epoch": 1.771533701287841, "grad_norm": 5.545967730111126, "learning_rate": 4.293751575992455e-07, "logits/chosen": -0.07288947701454163, "logits/rejected": -0.043522901833057404, "logps/chosen": -1.2012536525726318, "logps/rejected": -1.3243046998977661, "loss": 1.2013, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2012536525726318, "rewards/margins": 0.12305095046758652, "rewards/rejected": -1.3243046998977661, "step": 3310 }, { "epoch": 1.7742097340692422, "grad_norm": 5.71623415855496, "learning_rate": 4.278337145441916e-07, "logits/chosen": -0.26046472787857056, "logits/rejected": -0.1597483605146408, "logps/chosen": -1.1704938411712646, "logps/rejected": -1.3039953708648682, "loss": 1.1705, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1704938411712646, "rewards/margins": 0.13350148499011993, "rewards/rejected": -1.3039953708648682, "step": 3315 }, { "epoch": 1.776885766850644, "grad_norm": 4.871319317982382, "learning_rate": 4.262929716467556e-07, "logits/chosen": -0.16962336003780365, "logits/rejected": -0.03625626489520073, "logps/chosen": -1.197567343711853, "logps/rejected": -1.3290635347366333, "loss": 1.1976, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.197567343711853, "rewards/margins": 0.1314961016178131, "rewards/rejected": -1.3290635347366333, "step": 3320 }, { "epoch": 1.7795617996320456, "grad_norm": 5.673221699141394, "learning_rate": 4.247529438552321e-07, "logits/chosen": -0.2660422921180725, "logits/rejected": -0.12390347570180893, "logps/chosen": -1.216326117515564, "logps/rejected": -1.3546077013015747, "loss": 1.2163, "rewards/accuracies": 0.5625, "rewards/chosen": -1.216326117515564, "rewards/margins": 0.13828153908252716, "rewards/rejected": -1.3546077013015747, "step": 3325 }, { "epoch": 1.782237832413447, "grad_norm": 5.689605010185864, "learning_rate": 4.232136461109773e-07, "logits/chosen": -0.13146919012069702, "logits/rejected": -0.04918663576245308, "logps/chosen": -1.1210932731628418, "logps/rejected": -1.3059163093566895, "loss": 1.1211, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1210932731628418, "rewards/margins": 0.18482284247875214, "rewards/rejected": -1.3059163093566895, "step": 3330 }, { "epoch": 1.7849138651948486, "grad_norm": 7.830052702431625, "learning_rate": 4.216750933482646e-07, "logits/chosen": -0.1832199990749359, "logits/rejected": -0.05450131744146347, "logps/chosen": -1.2377300262451172, "logps/rejected": -1.351043462753296, "loss": 1.2377, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2377300262451172, "rewards/margins": 0.11331333220005035, "rewards/rejected": -1.351043462753296, "step": 3335 }, { "epoch": 1.7875898979762503, "grad_norm": 4.8910149804150285, "learning_rate": 4.2013730049413986e-07, "logits/chosen": -0.14366964995861053, "logits/rejected": -0.04172324761748314, "logps/chosen": -1.1853249073028564, "logps/rejected": -1.3239637613296509, "loss": 1.1853, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1853249073028564, "rewards/margins": 0.13863886892795563, "rewards/rejected": -1.3239637613296509, "step": 3340 }, { "epoch": 1.7902659307576518, "grad_norm": 5.50958738499462, "learning_rate": 4.1860028246827594e-07, "logits/chosen": -0.1439540684223175, "logits/rejected": -0.020426208153367043, "logps/chosen": -1.1161819696426392, "logps/rejected": -1.2565706968307495, "loss": 1.1162, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1161819696426392, "rewards/margins": 0.14038889110088348, "rewards/rejected": -1.2565706968307495, "step": 3345 }, { "epoch": 1.7929419635390533, "grad_norm": 6.59565319271333, "learning_rate": 4.170640541828285e-07, "logits/chosen": -0.2834468483924866, "logits/rejected": -0.17983433604240417, "logps/chosen": -1.286466360092163, "logps/rejected": -1.3472929000854492, "loss": 1.2865, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.286466360092163, "rewards/margins": 0.06082627922296524, "rewards/rejected": -1.3472929000854492, "step": 3350 }, { "epoch": 1.795617996320455, "grad_norm": 5.021471173468769, "learning_rate": 4.1552863054229116e-07, "logits/chosen": -0.04708681255578995, "logits/rejected": -0.02389518916606903, "logps/chosen": -1.258103847503662, "logps/rejected": -1.296245813369751, "loss": 1.2581, "rewards/accuracies": 0.46875, "rewards/chosen": -1.258103847503662, "rewards/margins": 0.03814171254634857, "rewards/rejected": -1.296245813369751, "step": 3355 }, { "epoch": 1.7982940291018565, "grad_norm": 6.640488432897256, "learning_rate": 4.139940264433508e-07, "logits/chosen": -0.18072344362735748, "logits/rejected": -0.028495365753769875, "logps/chosen": -1.1640628576278687, "logps/rejected": -1.264851689338684, "loss": 1.1641, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1640628576278687, "rewards/margins": 0.10078861564397812, "rewards/rejected": -1.264851689338684, "step": 3360 }, { "epoch": 1.800970061883258, "grad_norm": 4.3225964678088085, "learning_rate": 4.1246025677474303e-07, "logits/chosen": -0.19880613684654236, "logits/rejected": -0.08087635040283203, "logps/chosen": -1.2011104822158813, "logps/rejected": -1.3289794921875, "loss": 1.2011, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2011104822158813, "rewards/margins": 0.127869114279747, "rewards/rejected": -1.3289794921875, "step": 3365 }, { "epoch": 1.8036460946646597, "grad_norm": 4.2158690726471715, "learning_rate": 4.10927336417108e-07, "logits/chosen": -0.15716472268104553, "logits/rejected": -0.03811248019337654, "logps/chosen": -1.1903307437896729, "logps/rejected": -1.2926485538482666, "loss": 1.1903, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1903307437896729, "rewards/margins": 0.102317675948143, "rewards/rejected": -1.2926485538482666, "step": 3370 }, { "epoch": 1.8063221274460612, "grad_norm": 6.260306649323853, "learning_rate": 4.093952802428457e-07, "logits/chosen": -0.0494052954018116, "logits/rejected": -0.022048506885766983, "logps/chosen": -1.2242833375930786, "logps/rejected": -1.246326208114624, "loss": 1.2243, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.2242833375930786, "rewards/margins": 0.022042976692318916, "rewards/rejected": -1.246326208114624, "step": 3375 }, { "epoch": 1.8089981602274627, "grad_norm": 6.326938364306672, "learning_rate": 4.0786410311597184e-07, "logits/chosen": -0.23895129561424255, "logits/rejected": -0.13263073563575745, "logps/chosen": -1.1985210180282593, "logps/rejected": -1.3636467456817627, "loss": 1.1985, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1985210180282593, "rewards/margins": 0.1651257574558258, "rewards/rejected": -1.3636467456817627, "step": 3380 }, { "epoch": 1.8116741930088645, "grad_norm": 4.676543523136898, "learning_rate": 4.063338198919737e-07, "logits/chosen": -0.1903793066740036, "logits/rejected": -0.16690480709075928, "logps/chosen": -1.258945107460022, "logps/rejected": -1.3676373958587646, "loss": 1.2589, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.258945107460022, "rewards/margins": 0.10869216918945312, "rewards/rejected": -1.3676373958587646, "step": 3385 }, { "epoch": 1.814350225790266, "grad_norm": 7.130128286691459, "learning_rate": 4.0480444541766575e-07, "logits/chosen": -0.1252816915512085, "logits/rejected": -0.04375578463077545, "logps/chosen": -1.2637269496917725, "logps/rejected": -1.2898824214935303, "loss": 1.2637, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2637269496917725, "rewards/margins": 0.026155462488532066, "rewards/rejected": -1.2898824214935303, "step": 3390 }, { "epoch": 1.8170262585716674, "grad_norm": 6.334496943888987, "learning_rate": 4.0327599453104606e-07, "logits/chosen": -0.18286773562431335, "logits/rejected": -0.12198203802108765, "logps/chosen": -1.1523030996322632, "logps/rejected": -1.3285894393920898, "loss": 1.1523, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1523030996322632, "rewards/margins": 0.17628619074821472, "rewards/rejected": -1.3285894393920898, "step": 3395 }, { "epoch": 1.8197022913530692, "grad_norm": 5.454292564384518, "learning_rate": 4.017484820611514e-07, "logits/chosen": -0.15741322934627533, "logits/rejected": -0.06817597895860672, "logps/chosen": -1.2166006565093994, "logps/rejected": -1.2801234722137451, "loss": 1.2166, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2166006565093994, "rewards/margins": 0.06352273374795914, "rewards/rejected": -1.2801234722137451, "step": 3400 }, { "epoch": 1.8223783241344707, "grad_norm": 7.074120636314785, "learning_rate": 4.002219228279148e-07, "logits/chosen": -0.1669076681137085, "logits/rejected": -0.04959757998585701, "logps/chosen": -1.2075153589248657, "logps/rejected": -1.3537585735321045, "loss": 1.2075, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2075153589248657, "rewards/margins": 0.14624308049678802, "rewards/rejected": -1.3537585735321045, "step": 3405 }, { "epoch": 1.8250543569158721, "grad_norm": 6.072551317792774, "learning_rate": 3.9869633164202045e-07, "logits/chosen": -0.17116674780845642, "logits/rejected": -0.012791763059794903, "logps/chosen": -1.345576286315918, "logps/rejected": -1.4077210426330566, "loss": 1.3456, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.345576286315918, "rewards/margins": 0.06214488670229912, "rewards/rejected": -1.4077210426330566, "step": 3410 }, { "epoch": 1.8277303896972739, "grad_norm": 5.422286435509459, "learning_rate": 3.9717172330476077e-07, "logits/chosen": -0.15124377608299255, "logits/rejected": -0.07929189503192902, "logps/chosen": -1.1909183263778687, "logps/rejected": -1.3422876596450806, "loss": 1.1909, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1909183263778687, "rewards/margins": 0.15136930346488953, "rewards/rejected": -1.3422876596450806, "step": 3415 }, { "epoch": 1.8304064224786754, "grad_norm": 7.44515033233004, "learning_rate": 3.956481126078927e-07, "logits/chosen": -0.11331383138895035, "logits/rejected": -0.022290926426649094, "logps/chosen": -1.2129724025726318, "logps/rejected": -1.4151670932769775, "loss": 1.213, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2129724025726318, "rewards/margins": 0.20219452679157257, "rewards/rejected": -1.4151670932769775, "step": 3420 }, { "epoch": 1.8330824552600768, "grad_norm": 4.621455154797828, "learning_rate": 3.941255143334937e-07, "logits/chosen": -0.18934378027915955, "logits/rejected": -0.16193212568759918, "logps/chosen": -1.1949506998062134, "logps/rejected": -1.3123972415924072, "loss": 1.195, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1949506998062134, "rewards/margins": 0.11744655668735504, "rewards/rejected": -1.3123972415924072, "step": 3425 }, { "epoch": 1.8357584880414786, "grad_norm": 6.386667009150437, "learning_rate": 3.9260394325381895e-07, "logits/chosen": -0.1717674881219864, "logits/rejected": -0.07800033688545227, "logps/chosen": -1.2645177841186523, "logps/rejected": -1.356939673423767, "loss": 1.2645, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2645177841186523, "rewards/margins": 0.0924217700958252, "rewards/rejected": -1.356939673423767, "step": 3430 }, { "epoch": 1.83843452082288, "grad_norm": 6.720247704610905, "learning_rate": 3.9108341413115784e-07, "logits/chosen": -0.1811373084783554, "logits/rejected": -0.12745681405067444, "logps/chosen": -1.2219923734664917, "logps/rejected": -1.3938795328140259, "loss": 1.222, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2219923734664917, "rewards/margins": 0.17188714444637299, "rewards/rejected": -1.3938795328140259, "step": 3435 }, { "epoch": 1.8411105536042816, "grad_norm": 7.493627489736403, "learning_rate": 3.895639417176905e-07, "logits/chosen": -0.2173558473587036, "logits/rejected": -0.17665314674377441, "logps/chosen": -1.1367450952529907, "logps/rejected": -1.281967043876648, "loss": 1.1367, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1367450952529907, "rewards/margins": 0.1452219933271408, "rewards/rejected": -1.281967043876648, "step": 3440 }, { "epoch": 1.8437865863856833, "grad_norm": 5.161905607072832, "learning_rate": 3.8804554075534497e-07, "logits/chosen": -0.23074154555797577, "logits/rejected": -0.04991511255502701, "logps/chosen": -1.181444764137268, "logps/rejected": -1.299424409866333, "loss": 1.1814, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.181444764137268, "rewards/margins": 0.11797963082790375, "rewards/rejected": -1.299424409866333, "step": 3445 }, { "epoch": 1.8464626191670848, "grad_norm": 6.821201063294855, "learning_rate": 3.8652822597565403e-07, "logits/chosen": -0.28890055418014526, "logits/rejected": -0.14351220428943634, "logps/chosen": -1.208592176437378, "logps/rejected": -1.3897403478622437, "loss": 1.2086, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.208592176437378, "rewards/margins": 0.18114803731441498, "rewards/rejected": -1.3897403478622437, "step": 3450 }, { "epoch": 1.8491386519484863, "grad_norm": 5.586293421230272, "learning_rate": 3.850120120996123e-07, "logits/chosen": -0.15431705117225647, "logits/rejected": -0.03291536122560501, "logps/chosen": -1.3755861520767212, "logps/rejected": -1.498429775238037, "loss": 1.3756, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3755861520767212, "rewards/margins": 0.12284356355667114, "rewards/rejected": -1.498429775238037, "step": 3455 }, { "epoch": 1.851814684729888, "grad_norm": 5.71284591343616, "learning_rate": 3.8349691383753356e-07, "logits/chosen": -0.07343684136867523, "logits/rejected": 0.023308023810386658, "logps/chosen": -1.182633638381958, "logps/rejected": -1.3288253545761108, "loss": 1.1826, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.182633638381958, "rewards/margins": 0.14619174599647522, "rewards/rejected": -1.3288253545761108, "step": 3460 }, { "epoch": 1.8544907175112895, "grad_norm": 5.322630549931583, "learning_rate": 3.819829458889078e-07, "logits/chosen": -0.24935957789421082, "logits/rejected": -0.1474248468875885, "logps/chosen": -1.1668226718902588, "logps/rejected": -1.2998926639556885, "loss": 1.1668, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1668226718902588, "rewards/margins": 0.13306990265846252, "rewards/rejected": -1.2998926639556885, "step": 3465 }, { "epoch": 1.857166750292691, "grad_norm": 6.328581027661611, "learning_rate": 3.804701229422585e-07, "logits/chosen": -0.23472170531749725, "logits/rejected": -0.16069400310516357, "logps/chosen": -1.28728449344635, "logps/rejected": -1.3685401678085327, "loss": 1.2873, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.28728449344635, "rewards/margins": 0.08125577867031097, "rewards/rejected": -1.3685401678085327, "step": 3470 }, { "epoch": 1.8598427830740927, "grad_norm": 5.137410547183887, "learning_rate": 3.789584596750007e-07, "logits/chosen": -0.2557324469089508, "logits/rejected": -0.20525316894054413, "logps/chosen": -1.2207074165344238, "logps/rejected": -1.3514409065246582, "loss": 1.2207, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2207074165344238, "rewards/margins": 0.130733460187912, "rewards/rejected": -1.3514409065246582, "step": 3475 }, { "epoch": 1.8625188158554944, "grad_norm": 5.866972791362018, "learning_rate": 3.77447970753298e-07, "logits/chosen": -0.13803324103355408, "logits/rejected": -0.11530227959156036, "logps/chosen": -1.2407671213150024, "logps/rejected": -1.4261610507965088, "loss": 1.2408, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2407671213150024, "rewards/margins": 0.18539409339427948, "rewards/rejected": -1.4261610507965088, "step": 3480 }, { "epoch": 1.8651948486368957, "grad_norm": 4.488805817397294, "learning_rate": 3.7593867083192057e-07, "logits/chosen": -0.18039223551750183, "logits/rejected": -0.10027128458023071, "logps/chosen": -1.1976202726364136, "logps/rejected": -1.3192423582077026, "loss": 1.1976, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1976202726364136, "rewards/margins": 0.12162204831838608, "rewards/rejected": -1.3192423582077026, "step": 3485 }, { "epoch": 1.8678708814182974, "grad_norm": 5.534562463537058, "learning_rate": 3.7443057455410276e-07, "logits/chosen": -0.13956980407238007, "logits/rejected": -0.0436553955078125, "logps/chosen": -1.279639482498169, "logps/rejected": -1.3101478815078735, "loss": 1.2796, "rewards/accuracies": 0.5, "rewards/chosen": -1.279639482498169, "rewards/margins": 0.03050844743847847, "rewards/rejected": -1.3101478815078735, "step": 3490 }, { "epoch": 1.870546914199699, "grad_norm": 6.339580752410625, "learning_rate": 3.7292369655140145e-07, "logits/chosen": -0.23667626082897186, "logits/rejected": -0.10052253305912018, "logps/chosen": -1.2101891040802002, "logps/rejected": -1.344784140586853, "loss": 1.2102, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2101891040802002, "rewards/margins": 0.13459502160549164, "rewards/rejected": -1.344784140586853, "step": 3495 }, { "epoch": 1.8732229469811004, "grad_norm": 6.3571733708927205, "learning_rate": 3.714180514435534e-07, "logits/chosen": -0.19174787402153015, "logits/rejected": -0.07025592029094696, "logps/chosen": -1.2486528158187866, "logps/rejected": -1.419137716293335, "loss": 1.2487, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2486528158187866, "rewards/margins": 0.17048490047454834, "rewards/rejected": -1.419137716293335, "step": 3500 }, { "epoch": 1.875898979762502, "grad_norm": 6.363648378675486, "learning_rate": 3.6991365383833426e-07, "logits/chosen": -0.16181819140911102, "logits/rejected": -0.07511533796787262, "logps/chosen": -1.2606223821640015, "logps/rejected": -1.3784898519515991, "loss": 1.2606, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2606223821640015, "rewards/margins": 0.11786738783121109, "rewards/rejected": -1.3784898519515991, "step": 3505 }, { "epoch": 1.8785750125439038, "grad_norm": 6.98914498721647, "learning_rate": 3.684105183314162e-07, "logits/chosen": -0.18472889065742493, "logits/rejected": -0.13786232471466064, "logps/chosen": -1.1859204769134521, "logps/rejected": -1.311355710029602, "loss": 1.1859, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1859204769134521, "rewards/margins": 0.12543535232543945, "rewards/rejected": -1.311355710029602, "step": 3510 }, { "epoch": 1.881251045325305, "grad_norm": 5.390377304762371, "learning_rate": 3.669086595062263e-07, "logits/chosen": -0.18602025508880615, "logits/rejected": -0.03348446637392044, "logps/chosen": -1.246976613998413, "logps/rejected": -1.4020326137542725, "loss": 1.247, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.246976613998413, "rewards/margins": 0.15505585074424744, "rewards/rejected": -1.4020326137542725, "step": 3515 }, { "epoch": 1.8839270781067068, "grad_norm": 5.222826014518282, "learning_rate": 3.654080919338056e-07, "logits/chosen": -0.21450456976890564, "logits/rejected": -0.10987764596939087, "logps/chosen": -1.2461720705032349, "logps/rejected": -1.3745940923690796, "loss": 1.2462, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2461720705032349, "rewards/margins": 0.12842203676700592, "rewards/rejected": -1.3745940923690796, "step": 3520 }, { "epoch": 1.8866031108881085, "grad_norm": 3.414507128878083, "learning_rate": 3.639088301726673e-07, "logits/chosen": -0.1301032304763794, "logits/rejected": 0.028691908344626427, "logps/chosen": -1.2041635513305664, "logps/rejected": -1.3411433696746826, "loss": 1.2042, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2041635513305664, "rewards/margins": 0.13697989284992218, "rewards/rejected": -1.3411433696746826, "step": 3525 }, { "epoch": 1.88927914366951, "grad_norm": 6.027295328065414, "learning_rate": 3.624108887686556e-07, "logits/chosen": -0.1428847759962082, "logits/rejected": -0.08195429295301437, "logps/chosen": -1.1936802864074707, "logps/rejected": -1.3749678134918213, "loss": 1.1937, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1936802864074707, "rewards/margins": 0.18128763139247894, "rewards/rejected": -1.3749678134918213, "step": 3530 }, { "epoch": 1.8919551764509115, "grad_norm": 4.62634574806712, "learning_rate": 3.6091428225480433e-07, "logits/chosen": -0.2537812292575836, "logits/rejected": -0.15029551088809967, "logps/chosen": -1.1513285636901855, "logps/rejected": -1.2229506969451904, "loss": 1.1513, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.1513285636901855, "rewards/margins": 0.07162212580442429, "rewards/rejected": -1.2229506969451904, "step": 3535 }, { "epoch": 1.8946312092323132, "grad_norm": 6.852919592423108, "learning_rate": 3.5941902515119674e-07, "logits/chosen": -0.1692124605178833, "logits/rejected": 0.025646653026342392, "logps/chosen": -1.1911171674728394, "logps/rejected": -1.3597890138626099, "loss": 1.1911, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1911171674728394, "rewards/margins": 0.1686718761920929, "rewards/rejected": -1.3597890138626099, "step": 3540 }, { "epoch": 1.8973072420137147, "grad_norm": 6.263325666424908, "learning_rate": 3.5792513196482373e-07, "logits/chosen": -0.2838030457496643, "logits/rejected": -0.06761365383863449, "logps/chosen": -1.203840970993042, "logps/rejected": -1.254758596420288, "loss": 1.2038, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.203840970993042, "rewards/margins": 0.05091753602027893, "rewards/rejected": -1.254758596420288, "step": 3545 }, { "epoch": 1.8999832747951162, "grad_norm": 5.349335272810496, "learning_rate": 3.5643261718944346e-07, "logits/chosen": -0.14909477531909943, "logits/rejected": -0.0800810307264328, "logps/chosen": -1.190441608428955, "logps/rejected": -1.2916029691696167, "loss": 1.1904, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.190441608428955, "rewards/margins": 0.10116131603717804, "rewards/rejected": -1.2916029691696167, "step": 3550 }, { "epoch": 1.902659307576518, "grad_norm": 5.041674686485234, "learning_rate": 3.5494149530544087e-07, "logits/chosen": -0.23277735710144043, "logits/rejected": -0.1172136440873146, "logps/chosen": -1.1403019428253174, "logps/rejected": -1.2967809438705444, "loss": 1.1403, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1403019428253174, "rewards/margins": 0.15647898614406586, "rewards/rejected": -1.2967809438705444, "step": 3555 }, { "epoch": 1.9053353403579194, "grad_norm": 8.76653033216658, "learning_rate": 3.534517807796871e-07, "logits/chosen": -0.13583725690841675, "logits/rejected": -0.07847366482019424, "logps/chosen": -1.2405458688735962, "logps/rejected": -1.3724961280822754, "loss": 1.2405, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2405458688735962, "rewards/margins": 0.13195018470287323, "rewards/rejected": -1.3724961280822754, "step": 3560 }, { "epoch": 1.908011373139321, "grad_norm": 4.9202426791810625, "learning_rate": 3.519634880653988e-07, "logits/chosen": -0.12641319632530212, "logits/rejected": -0.08319839835166931, "logps/chosen": -1.1580694913864136, "logps/rejected": -1.3105993270874023, "loss": 1.1581, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1580694913864136, "rewards/margins": 0.1525297909975052, "rewards/rejected": -1.3105993270874023, "step": 3565 }, { "epoch": 1.9106874059207226, "grad_norm": 6.271037073775694, "learning_rate": 3.504766316019987e-07, "logits/chosen": -0.21856220066547394, "logits/rejected": -0.1186792254447937, "logps/chosen": -1.2088890075683594, "logps/rejected": -1.3537876605987549, "loss": 1.2089, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2088890075683594, "rewards/margins": 0.14489854872226715, "rewards/rejected": -1.3537876605987549, "step": 3570 }, { "epoch": 1.913363438702124, "grad_norm": 5.943401556707548, "learning_rate": 3.489912258149745e-07, "logits/chosen": -0.10028059780597687, "logits/rejected": -0.004589706659317017, "logps/chosen": -1.1551620960235596, "logps/rejected": -1.3079516887664795, "loss": 1.1552, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1551620960235596, "rewards/margins": 0.15278959274291992, "rewards/rejected": -1.3079516887664795, "step": 3575 }, { "epoch": 1.9160394714835256, "grad_norm": 4.88465033695237, "learning_rate": 3.475072851157397e-07, "logits/chosen": -0.1441275179386139, "logits/rejected": -0.10924132168292999, "logps/chosen": -1.178481936454773, "logps/rejected": -1.3580634593963623, "loss": 1.1785, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.178481936454773, "rewards/margins": 0.17958132922649384, "rewards/rejected": -1.3580634593963623, "step": 3580 }, { "epoch": 1.9187155042649273, "grad_norm": 7.003675067271657, "learning_rate": 3.460248239014936e-07, "logits/chosen": -0.1096426248550415, "logits/rejected": -0.07276495546102524, "logps/chosen": -1.27609121799469, "logps/rejected": -1.3795959949493408, "loss": 1.2761, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.27609121799469, "rewards/margins": 0.10350479930639267, "rewards/rejected": -1.3795959949493408, "step": 3585 }, { "epoch": 1.9213915370463288, "grad_norm": 6.9540331215359785, "learning_rate": 3.4454385655508134e-07, "logits/chosen": -0.11501473188400269, "logits/rejected": -0.045343369245529175, "logps/chosen": -1.236370325088501, "logps/rejected": -1.3338383436203003, "loss": 1.2364, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.236370325088501, "rewards/margins": 0.09746801853179932, "rewards/rejected": -1.3338383436203003, "step": 3590 }, { "epoch": 1.9240675698277303, "grad_norm": 5.387913956008847, "learning_rate": 3.4306439744485447e-07, "logits/chosen": -0.27982237935066223, "logits/rejected": -0.11400812864303589, "logps/chosen": -1.180597186088562, "logps/rejected": -1.3334075212478638, "loss": 1.1806, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.180597186088562, "rewards/margins": 0.15281042456626892, "rewards/rejected": -1.3334075212478638, "step": 3595 }, { "epoch": 1.926743602609132, "grad_norm": 8.29830840921333, "learning_rate": 3.415864609245322e-07, "logits/chosen": -0.12995333969593048, "logits/rejected": 0.02168891206383705, "logps/chosen": -1.162113904953003, "logps/rejected": -1.3191578388214111, "loss": 1.1621, "rewards/accuracies": 0.5625, "rewards/chosen": -1.162113904953003, "rewards/margins": 0.15704385936260223, "rewards/rejected": -1.3191578388214111, "step": 3600 }, { "epoch": 1.926743602609132, "eval_logits/chosen": 0.09907597303390503, "eval_logits/rejected": 0.16699618101119995, "eval_logps/chosen": -1.2633222341537476, "eval_logps/rejected": -1.3851231336593628, "eval_loss": 1.2635749578475952, "eval_rewards/accuracies": 0.5474777221679688, "eval_rewards/chosen": -1.2633222341537476, "eval_rewards/margins": 0.12180092930793762, "eval_rewards/rejected": -1.3851231336593628, "eval_runtime": 40.297, "eval_samples_per_second": 33.377, "eval_steps_per_second": 8.363, "step": 3600 }, { "epoch": 1.9294196353905335, "grad_norm": 5.766562905591561, "learning_rate": 3.401100613330605e-07, "logits/chosen": -0.2236834019422531, "logits/rejected": -0.19986633956432343, "logps/chosen": -1.2248042821884155, "logps/rejected": -1.286078929901123, "loss": 1.2248, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2248042821884155, "rewards/margins": 0.06127455085515976, "rewards/rejected": -1.286078929901123, "step": 3605 }, { "epoch": 1.932095668171935, "grad_norm": 4.820563924299156, "learning_rate": 3.3863521299447514e-07, "logits/chosen": -0.1620894968509674, "logits/rejected": -0.07407096773386002, "logps/chosen": -1.1893503665924072, "logps/rejected": -1.3336570262908936, "loss": 1.1894, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1893503665924072, "rewards/margins": 0.14430661499500275, "rewards/rejected": -1.3336570262908936, "step": 3610 }, { "epoch": 1.9347717009533367, "grad_norm": 4.800481581286196, "learning_rate": 3.371619302177609e-07, "logits/chosen": -0.11576857417821884, "logits/rejected": -0.03384217619895935, "logps/chosen": -1.2930406332015991, "logps/rejected": -1.35350501537323, "loss": 1.293, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2930406332015991, "rewards/margins": 0.060464538633823395, "rewards/rejected": -1.35350501537323, "step": 3615 }, { "epoch": 1.9374477337347382, "grad_norm": 8.34532945502318, "learning_rate": 3.3569022729671393e-07, "logits/chosen": -0.1764056235551834, "logits/rejected": -0.11928689479827881, "logps/chosen": -1.2240722179412842, "logps/rejected": -1.2835490703582764, "loss": 1.2241, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2240722179412842, "rewards/margins": 0.05947699397802353, "rewards/rejected": -1.2835490703582764, "step": 3620 }, { "epoch": 1.9401237665161397, "grad_norm": 7.409376038185012, "learning_rate": 3.342201185098024e-07, "logits/chosen": -0.13641294836997986, "logits/rejected": -0.1193992868065834, "logps/chosen": -1.2075504064559937, "logps/rejected": -1.4044629335403442, "loss": 1.2076, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2075504064559937, "rewards/margins": 0.19691245257854462, "rewards/rejected": -1.4044629335403442, "step": 3625 }, { "epoch": 1.9427997992975414, "grad_norm": 5.58489231573706, "learning_rate": 3.3275161812002807e-07, "logits/chosen": -0.20766091346740723, "logits/rejected": -0.16700121760368347, "logps/chosen": -1.2253793478012085, "logps/rejected": -1.3350856304168701, "loss": 1.2254, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2253793478012085, "rewards/margins": 0.10970617830753326, "rewards/rejected": -1.3350856304168701, "step": 3630 }, { "epoch": 1.945475832078943, "grad_norm": 6.8870596272700215, "learning_rate": 3.312847403747883e-07, "logits/chosen": -0.21872606873512268, "logits/rejected": -0.15501157939434052, "logps/chosen": -1.2477519512176514, "logps/rejected": -1.361515998840332, "loss": 1.2478, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2477519512176514, "rewards/margins": 0.11376380920410156, "rewards/rejected": -1.361515998840332, "step": 3635 }, { "epoch": 1.9481518648603444, "grad_norm": 5.482448775483183, "learning_rate": 3.2981949950573733e-07, "logits/chosen": -0.1705399751663208, "logits/rejected": -0.07286882400512695, "logps/chosen": -1.3321467638015747, "logps/rejected": -1.3737891912460327, "loss": 1.3321, "rewards/accuracies": 0.5, "rewards/chosen": -1.3321467638015747, "rewards/margins": 0.041642289608716965, "rewards/rejected": -1.3737891912460327, "step": 3640 }, { "epoch": 1.9508278976417461, "grad_norm": 4.415890750989783, "learning_rate": 3.283559097286486e-07, "logits/chosen": -0.20676758885383606, "logits/rejected": -0.10797051340341568, "logps/chosen": -1.3211634159088135, "logps/rejected": -1.4131262302398682, "loss": 1.3212, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.3211634159088135, "rewards/margins": 0.09196287393569946, "rewards/rejected": -1.4131262302398682, "step": 3645 }, { "epoch": 1.9535039304231478, "grad_norm": 4.441754809572781, "learning_rate": 3.268939852432765e-07, "logits/chosen": -0.21519120037555695, "logits/rejected": -0.1342916190624237, "logps/chosen": -1.1988322734832764, "logps/rejected": -1.3026509284973145, "loss": 1.1988, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1988322734832764, "rewards/margins": 0.10381858050823212, "rewards/rejected": -1.3026509284973145, "step": 3650 }, { "epoch": 1.9561799632045491, "grad_norm": 6.494603659404193, "learning_rate": 3.254337402332187e-07, "logits/chosen": -0.19395865499973297, "logits/rejected": -0.07957155257463455, "logps/chosen": -1.25752592086792, "logps/rejected": -1.3373297452926636, "loss": 1.2575, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.25752592086792, "rewards/margins": 0.07980392128229141, "rewards/rejected": -1.3373297452926636, "step": 3655 }, { "epoch": 1.9588559959859508, "grad_norm": 7.938683472087652, "learning_rate": 3.239751888657788e-07, "logits/chosen": -0.2311045378446579, "logits/rejected": -0.13569125533103943, "logps/chosen": -1.1545521020889282, "logps/rejected": -1.2417422533035278, "loss": 1.1546, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1545521020889282, "rewards/margins": 0.08719011396169662, "rewards/rejected": -1.2417422533035278, "step": 3660 }, { "epoch": 1.9615320287673526, "grad_norm": 6.213204715237311, "learning_rate": 3.2251834529182856e-07, "logits/chosen": -0.1631738245487213, "logits/rejected": -0.06852308660745621, "logps/chosen": -1.181171178817749, "logps/rejected": -1.2857331037521362, "loss": 1.1812, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.181171178817749, "rewards/margins": 0.10456206649541855, "rewards/rejected": -1.2857331037521362, "step": 3665 }, { "epoch": 1.9642080615487538, "grad_norm": 5.826595315026236, "learning_rate": 3.2106322364567075e-07, "logits/chosen": -0.20606759190559387, "logits/rejected": -0.09321747720241547, "logps/chosen": -1.2265615463256836, "logps/rejected": -1.3645594120025635, "loss": 1.2266, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2265615463256836, "rewards/margins": 0.13799771666526794, "rewards/rejected": -1.3645594120025635, "step": 3670 }, { "epoch": 1.9668840943301555, "grad_norm": 6.229497140875294, "learning_rate": 3.1960983804490183e-07, "logits/chosen": -0.17587599158287048, "logits/rejected": -0.06923215091228485, "logps/chosen": -1.2444448471069336, "logps/rejected": -1.4578088521957397, "loss": 1.2444, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2444448471069336, "rewards/margins": 0.213363915681839, "rewards/rejected": -1.4578088521957397, "step": 3675 }, { "epoch": 1.9695601271115573, "grad_norm": 5.500921300348348, "learning_rate": 3.1815820259027537e-07, "logits/chosen": -0.17337365448474884, "logits/rejected": -0.08564025908708572, "logps/chosen": -1.0938003063201904, "logps/rejected": -1.2613918781280518, "loss": 1.0938, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.0938003063201904, "rewards/margins": 0.16759145259857178, "rewards/rejected": -1.2613918781280518, "step": 3680 }, { "epoch": 1.9722361598929585, "grad_norm": 5.580447954490444, "learning_rate": 3.16708331365565e-07, "logits/chosen": -0.20431354641914368, "logits/rejected": -0.14449433982372284, "logps/chosen": -1.201021432876587, "logps/rejected": -1.3622198104858398, "loss": 1.201, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.201021432876587, "rewards/margins": 0.16119836270809174, "rewards/rejected": -1.3622198104858398, "step": 3685 }, { "epoch": 1.9749121926743602, "grad_norm": 8.20487209493094, "learning_rate": 3.152602384374275e-07, "logits/chosen": -0.15653710067272186, "logits/rejected": -0.04038035124540329, "logps/chosen": -1.22836172580719, "logps/rejected": -1.3583418130874634, "loss": 1.2284, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.22836172580719, "rewards/margins": 0.1299799680709839, "rewards/rejected": -1.3583418130874634, "step": 3690 }, { "epoch": 1.977588225455762, "grad_norm": 5.1974298526779155, "learning_rate": 3.1381393785526697e-07, "logits/chosen": -0.12381824105978012, "logits/rejected": -0.10890702903270721, "logps/chosen": -1.274674415588379, "logps/rejected": -1.4409854412078857, "loss": 1.2747, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.274674415588379, "rewards/margins": 0.16631099581718445, "rewards/rejected": -1.4409854412078857, "step": 3695 }, { "epoch": 1.9802642582371635, "grad_norm": 4.747660180866828, "learning_rate": 3.123694436510979e-07, "logits/chosen": -0.10659520328044891, "logits/rejected": -0.026370953768491745, "logps/chosen": -1.1973693370819092, "logps/rejected": -1.3459436893463135, "loss": 1.1974, "rewards/accuracies": 0.625, "rewards/chosen": -1.1973693370819092, "rewards/margins": 0.148574098944664, "rewards/rejected": -1.3459436893463135, "step": 3700 }, { "epoch": 1.982940291018565, "grad_norm": 6.057822158534642, "learning_rate": 3.1092676983940946e-07, "logits/chosen": -0.14698882400989532, "logits/rejected": -0.09348581731319427, "logps/chosen": -1.258783221244812, "logps/rejected": -1.3883939981460571, "loss": 1.2588, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.258783221244812, "rewards/margins": 0.12961053848266602, "rewards/rejected": -1.3883939981460571, "step": 3705 }, { "epoch": 1.9856163237999667, "grad_norm": 5.597583355279603, "learning_rate": 3.094859304170293e-07, "logits/chosen": -0.05267949774861336, "logits/rejected": -0.015400650911033154, "logps/chosen": -1.260164499282837, "logps/rejected": -1.3539760112762451, "loss": 1.2602, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.260164499282837, "rewards/margins": 0.09381161630153656, "rewards/rejected": -1.3539760112762451, "step": 3710 }, { "epoch": 1.9882923565813682, "grad_norm": 4.494743982575287, "learning_rate": 3.0804693936298795e-07, "logits/chosen": -0.09866820275783539, "logits/rejected": -0.0343213751912117, "logps/chosen": -1.2419010400772095, "logps/rejected": -1.3681851625442505, "loss": 1.2419, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2419010400772095, "rewards/margins": 0.12628427147865295, "rewards/rejected": -1.3681851625442505, "step": 3715 }, { "epoch": 1.9909683893627697, "grad_norm": 5.259098121587746, "learning_rate": 3.066098106383826e-07, "logits/chosen": -0.15005071461200714, "logits/rejected": -0.09308391064405441, "logps/chosen": -1.1984913349151611, "logps/rejected": -1.336576223373413, "loss": 1.1985, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1984913349151611, "rewards/margins": 0.13808469474315643, "rewards/rejected": -1.336576223373413, "step": 3720 }, { "epoch": 1.9936444221441714, "grad_norm": 5.532297439024465, "learning_rate": 3.0517455818624263e-07, "logits/chosen": -0.2172476053237915, "logits/rejected": -0.14205335080623627, "logps/chosen": -1.2024891376495361, "logps/rejected": -1.295165777206421, "loss": 1.2025, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2024891376495361, "rewards/margins": 0.09267668426036835, "rewards/rejected": -1.295165777206421, "step": 3725 }, { "epoch": 1.9963204549255729, "grad_norm": 5.708230380574543, "learning_rate": 3.037411959313936e-07, "logits/chosen": -0.11695092916488647, "logits/rejected": -0.02068774774670601, "logps/chosen": -1.1691228151321411, "logps/rejected": -1.3457962274551392, "loss": 1.1691, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1691228151321411, "rewards/margins": 0.17667335271835327, "rewards/rejected": -1.3457962274551392, "step": 3730 }, { "epoch": 1.9989964877069744, "grad_norm": 6.261390185301411, "learning_rate": 3.023097377803224e-07, "logits/chosen": -0.08335531502962112, "logits/rejected": -0.030234187841415405, "logps/chosen": -1.299695372581482, "logps/rejected": -1.3476722240447998, "loss": 1.2997, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.299695372581482, "rewards/margins": 0.047976791858673096, "rewards/rejected": -1.3476722240447998, "step": 3735 }, { "epoch": 2.001672520488376, "grad_norm": 5.966319818106221, "learning_rate": 3.008801976210423e-07, "logits/chosen": -0.07353471219539642, "logits/rejected": -0.05148538202047348, "logps/chosen": -1.2785046100616455, "logps/rejected": -1.3418762683868408, "loss": 1.2785, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.2785046100616455, "rewards/margins": 0.0633719339966774, "rewards/rejected": -1.3418762683868408, "step": 3740 }, { "epoch": 2.0043485532697773, "grad_norm": 4.720553790029993, "learning_rate": 2.994525893229581e-07, "logits/chosen": -0.16832037270069122, "logits/rejected": -0.09125945717096329, "logps/chosen": -1.230668067932129, "logps/rejected": -1.344470500946045, "loss": 1.2307, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.230668067932129, "rewards/margins": 0.11380244791507721, "rewards/rejected": -1.344470500946045, "step": 3745 }, { "epoch": 2.007024586051179, "grad_norm": 5.658493022552921, "learning_rate": 2.98026926736732e-07, "logits/chosen": -0.18076691031455994, "logits/rejected": -0.11563412100076675, "logps/chosen": -1.138292908668518, "logps/rejected": -1.3450239896774292, "loss": 1.1383, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.138292908668518, "rewards/margins": 0.20673108100891113, "rewards/rejected": -1.3450239896774292, "step": 3750 }, { "epoch": 2.0097006188325808, "grad_norm": 4.893327420815337, "learning_rate": 2.9660322369414846e-07, "logits/chosen": -0.17297913134098053, "logits/rejected": -0.09917312115430832, "logps/chosen": -1.1538817882537842, "logps/rejected": -1.3932651281356812, "loss": 1.1539, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1538817882537842, "rewards/margins": 0.23938336968421936, "rewards/rejected": -1.3932651281356812, "step": 3755 }, { "epoch": 2.0123766516139825, "grad_norm": 6.169473144899691, "learning_rate": 2.9518149400798063e-07, "logits/chosen": -0.25369197130203247, "logits/rejected": -0.2442415952682495, "logps/chosen": -1.2080129384994507, "logps/rejected": -1.342842698097229, "loss": 1.208, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2080129384994507, "rewards/margins": 0.13482984900474548, "rewards/rejected": -1.342842698097229, "step": 3760 }, { "epoch": 2.0150526843953838, "grad_norm": 5.554147781296768, "learning_rate": 2.9376175147185633e-07, "logits/chosen": -0.10926420986652374, "logits/rejected": 0.025119369849562645, "logps/chosen": -1.177136778831482, "logps/rejected": -1.4038763046264648, "loss": 1.1771, "rewards/accuracies": 0.625, "rewards/chosen": -1.177136778831482, "rewards/margins": 0.22673968970775604, "rewards/rejected": -1.4038763046264648, "step": 3765 }, { "epoch": 2.0177287171767855, "grad_norm": 6.172557494804244, "learning_rate": 2.9234400986012376e-07, "logits/chosen": -0.26100242137908936, "logits/rejected": -0.14545118808746338, "logps/chosen": -1.1305357217788696, "logps/rejected": -1.470702886581421, "loss": 1.1305, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1305357217788696, "rewards/margins": 0.3401672840118408, "rewards/rejected": -1.470702886581421, "step": 3770 }, { "epoch": 2.020404749958187, "grad_norm": 4.438248748667449, "learning_rate": 2.9092828292771817e-07, "logits/chosen": -0.22341112792491913, "logits/rejected": -0.1844293773174286, "logps/chosen": -1.1795382499694824, "logps/rejected": -1.2888224124908447, "loss": 1.1795, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.1795382499694824, "rewards/margins": 0.10928422212600708, "rewards/rejected": -1.2888224124908447, "step": 3775 }, { "epoch": 2.0230807827395885, "grad_norm": 5.866711759910989, "learning_rate": 2.8951458441002875e-07, "logits/chosen": -0.15054607391357422, "logits/rejected": -0.1287146955728531, "logps/chosen": -1.2174220085144043, "logps/rejected": -1.3671671152114868, "loss": 1.2174, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2174220085144043, "rewards/margins": 0.1497451514005661, "rewards/rejected": -1.3671671152114868, "step": 3780 }, { "epoch": 2.02575681552099, "grad_norm": 5.616481160652861, "learning_rate": 2.881029280227643e-07, "logits/chosen": -0.17458465695381165, "logits/rejected": -0.08852483332157135, "logps/chosen": -1.2391754388809204, "logps/rejected": -1.4670593738555908, "loss": 1.2392, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2391754388809204, "rewards/margins": 0.22788405418395996, "rewards/rejected": -1.4670593738555908, "step": 3785 }, { "epoch": 2.028432848302392, "grad_norm": 4.7920894644231105, "learning_rate": 2.8669332746182177e-07, "logits/chosen": -0.21288590133190155, "logits/rejected": -0.07269952446222305, "logps/chosen": -1.1886236667633057, "logps/rejected": -1.328176736831665, "loss": 1.1886, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1886236667633057, "rewards/margins": 0.13955330848693848, "rewards/rejected": -1.328176736831665, "step": 3790 }, { "epoch": 2.031108881083793, "grad_norm": 5.348934906624253, "learning_rate": 2.8528579640315156e-07, "logits/chosen": -0.15119697153568268, "logits/rejected": -0.13006910681724548, "logps/chosen": -1.166041374206543, "logps/rejected": -1.3011915683746338, "loss": 1.166, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.166041374206543, "rewards/margins": 0.13515016436576843, "rewards/rejected": -1.3011915683746338, "step": 3795 }, { "epoch": 2.033784913865195, "grad_norm": 5.935582451233548, "learning_rate": 2.8388034850262646e-07, "logits/chosen": -0.1623251587152481, "logits/rejected": -0.0721767321228981, "logps/chosen": -1.2480765581130981, "logps/rejected": -1.4304217100143433, "loss": 1.2481, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2480765581130981, "rewards/margins": 0.1823452115058899, "rewards/rejected": -1.4304217100143433, "step": 3800 }, { "epoch": 2.0364609466465966, "grad_norm": 5.620627214089343, "learning_rate": 2.824769973959079e-07, "logits/chosen": -0.12374546378850937, "logits/rejected": -0.04067706689238548, "logps/chosen": -1.1548192501068115, "logps/rejected": -1.3473902940750122, "loss": 1.1548, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1548192501068115, "rewards/margins": 0.19257108867168427, "rewards/rejected": -1.3473902940750122, "step": 3805 }, { "epoch": 2.039136979427998, "grad_norm": 5.993743170108611, "learning_rate": 2.81075756698315e-07, "logits/chosen": -0.06396854668855667, "logits/rejected": 0.00715807918459177, "logps/chosen": -1.1829876899719238, "logps/rejected": -1.3414902687072754, "loss": 1.183, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1829876899719238, "rewards/margins": 0.15850253403186798, "rewards/rejected": -1.3414902687072754, "step": 3810 }, { "epoch": 2.0418130122093996, "grad_norm": 5.723168461040252, "learning_rate": 2.7967664000469035e-07, "logits/chosen": -0.27506253123283386, "logits/rejected": -0.17463165521621704, "logps/chosen": -1.2123690843582153, "logps/rejected": -1.3037126064300537, "loss": 1.2124, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2123690843582153, "rewards/margins": 0.09134359657764435, "rewards/rejected": -1.3037126064300537, "step": 3815 }, { "epoch": 2.0444890449908013, "grad_norm": 8.284819008752189, "learning_rate": 2.7827966088927095e-07, "logits/chosen": -0.2434227466583252, "logits/rejected": -0.08673135936260223, "logps/chosen": -1.2683742046356201, "logps/rejected": -1.23711359500885, "loss": 1.2684, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2683742046356201, "rewards/margins": -0.03126070648431778, "rewards/rejected": -1.23711359500885, "step": 3820 }, { "epoch": 2.0471650777722026, "grad_norm": 5.7454144029406065, "learning_rate": 2.768848329055538e-07, "logits/chosen": -0.2001630961894989, "logits/rejected": -0.11856867372989655, "logps/chosen": -1.1831390857696533, "logps/rejected": -1.2786887884140015, "loss": 1.1831, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1831390857696533, "rewards/margins": 0.09554972499608994, "rewards/rejected": -1.2786887884140015, "step": 3825 }, { "epoch": 2.0498411105536043, "grad_norm": 6.055416860582981, "learning_rate": 2.7549216958616657e-07, "logits/chosen": -0.24327535927295685, "logits/rejected": -0.13111427426338196, "logps/chosen": -1.2529833316802979, "logps/rejected": -1.4231163263320923, "loss": 1.253, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2529833316802979, "rewards/margins": 0.17013299465179443, "rewards/rejected": -1.4231163263320923, "step": 3830 }, { "epoch": 2.052517143335006, "grad_norm": 5.427718444223444, "learning_rate": 2.741016844427344e-07, "logits/chosen": -0.16228987276554108, "logits/rejected": -0.06680922210216522, "logps/chosen": -1.2440131902694702, "logps/rejected": -1.378172755241394, "loss": 1.244, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2440131902694702, "rewards/margins": 0.13415959477424622, "rewards/rejected": -1.378172755241394, "step": 3835 }, { "epoch": 2.0551931761164073, "grad_norm": 6.174123737585839, "learning_rate": 2.7271339096575073e-07, "logits/chosen": -0.12297642230987549, "logits/rejected": -0.05247511342167854, "logps/chosen": -1.1234946250915527, "logps/rejected": -1.3920444250106812, "loss": 1.1235, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1234946250915527, "rewards/margins": 0.2685498893260956, "rewards/rejected": -1.3920444250106812, "step": 3840 }, { "epoch": 2.057869208897809, "grad_norm": 5.532679890303668, "learning_rate": 2.713273026244446e-07, "logits/chosen": -0.3069193959236145, "logits/rejected": -0.1292070597410202, "logps/chosen": -1.2603905200958252, "logps/rejected": -1.4123350381851196, "loss": 1.2604, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2603905200958252, "rewards/margins": 0.15194445848464966, "rewards/rejected": -1.4123350381851196, "step": 3845 }, { "epoch": 2.0605452416792107, "grad_norm": 5.300563053100759, "learning_rate": 2.6994343286665156e-07, "logits/chosen": -0.20022615790367126, "logits/rejected": -0.07928653061389923, "logps/chosen": -1.2546939849853516, "logps/rejected": -1.3651126623153687, "loss": 1.2547, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2546939849853516, "rewards/margins": 0.11041867733001709, "rewards/rejected": -1.3651126623153687, "step": 3850 }, { "epoch": 2.063221274460612, "grad_norm": 6.022666513266557, "learning_rate": 2.6856179511868156e-07, "logits/chosen": -0.17755185067653656, "logits/rejected": -0.041834503412246704, "logps/chosen": -1.191779375076294, "logps/rejected": -1.4031203985214233, "loss": 1.1918, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.191779375076294, "rewards/margins": 0.21134105324745178, "rewards/rejected": -1.4031203985214233, "step": 3855 }, { "epoch": 2.0658973072420137, "grad_norm": 4.83268217199813, "learning_rate": 2.6718240278519056e-07, "logits/chosen": -0.17566213011741638, "logits/rejected": -0.046472322195768356, "logps/chosen": -1.2054184675216675, "logps/rejected": -1.2850894927978516, "loss": 1.2054, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2054184675216675, "rewards/margins": 0.07967091351747513, "rewards/rejected": -1.2850894927978516, "step": 3860 }, { "epoch": 2.0685733400234154, "grad_norm": 7.46278786539241, "learning_rate": 2.6580526924904866e-07, "logits/chosen": -0.2881203591823578, "logits/rejected": -0.14568719267845154, "logps/chosen": -1.248539686203003, "logps/rejected": -1.3071911334991455, "loss": 1.2485, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.248539686203003, "rewards/margins": 0.05865158885717392, "rewards/rejected": -1.3071911334991455, "step": 3865 }, { "epoch": 2.0712493728048167, "grad_norm": 6.215234477175454, "learning_rate": 2.6443040787121186e-07, "logits/chosen": -0.1983538717031479, "logits/rejected": -0.18816599249839783, "logps/chosen": -1.1092668771743774, "logps/rejected": -1.2191474437713623, "loss": 1.1093, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1092668771743774, "rewards/margins": 0.1098807081580162, "rewards/rejected": -1.2191474437713623, "step": 3870 }, { "epoch": 2.0739254055862184, "grad_norm": 6.533651441169139, "learning_rate": 2.6305783199059084e-07, "logits/chosen": -0.2168056219816208, "logits/rejected": -0.11648724228143692, "logps/chosen": -1.1807115077972412, "logps/rejected": -1.3503268957138062, "loss": 1.1807, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1807115077972412, "rewards/margins": 0.16961517930030823, "rewards/rejected": -1.3503268957138062, "step": 3875 }, { "epoch": 2.07660143836762, "grad_norm": 5.800372526579284, "learning_rate": 2.6168755492392324e-07, "logits/chosen": -0.19749070703983307, "logits/rejected": -0.08320140838623047, "logps/chosen": -1.1103262901306152, "logps/rejected": -1.262804627418518, "loss": 1.1103, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1103262901306152, "rewards/margins": 0.15247836709022522, "rewards/rejected": -1.262804627418518, "step": 3880 }, { "epoch": 2.0792774711490214, "grad_norm": 5.548578504100435, "learning_rate": 2.6031958996564274e-07, "logits/chosen": -0.2316991090774536, "logits/rejected": -0.12330880016088486, "logps/chosen": -1.1607682704925537, "logps/rejected": -1.3241338729858398, "loss": 1.1608, "rewards/accuracies": 0.5, "rewards/chosen": -1.1607682704925537, "rewards/margins": 0.16336564719676971, "rewards/rejected": -1.3241338729858398, "step": 3885 }, { "epoch": 2.081953503930423, "grad_norm": 5.566893296081242, "learning_rate": 2.589539503877518e-07, "logits/chosen": -0.14514729380607605, "logits/rejected": -0.07553727924823761, "logps/chosen": -1.1952893733978271, "logps/rejected": -1.2873413562774658, "loss": 1.1953, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1952893733978271, "rewards/margins": 0.09205184876918793, "rewards/rejected": -1.2873413562774658, "step": 3890 }, { "epoch": 2.084629536711825, "grad_norm": 4.982125271300471, "learning_rate": 2.5759064943969125e-07, "logits/chosen": -0.20139291882514954, "logits/rejected": -0.01820841059088707, "logps/chosen": -1.1583207845687866, "logps/rejected": -1.2918179035186768, "loss": 1.1583, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1583207845687866, "rewards/margins": 0.1334972083568573, "rewards/rejected": -1.2918179035186768, "step": 3895 }, { "epoch": 2.087305569493226, "grad_norm": 6.1371397680932205, "learning_rate": 2.562297003482131e-07, "logits/chosen": -0.07977107912302017, "logits/rejected": -0.0942116528749466, "logps/chosen": -1.1783009767532349, "logps/rejected": -1.3213469982147217, "loss": 1.1783, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1783009767532349, "rewards/margins": 0.14304593205451965, "rewards/rejected": -1.3213469982147217, "step": 3900 }, { "epoch": 2.089981602274628, "grad_norm": 5.411203587091132, "learning_rate": 2.548711163172512e-07, "logits/chosen": -0.16041326522827148, "logits/rejected": -0.08141361176967621, "logps/chosen": -1.192338228225708, "logps/rejected": -1.2893304824829102, "loss": 1.1923, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.192338228225708, "rewards/margins": 0.09699223190546036, "rewards/rejected": -1.2893304824829102, "step": 3905 }, { "epoch": 2.0926576350560295, "grad_norm": 5.988052061192542, "learning_rate": 2.53514910527794e-07, "logits/chosen": -0.1029415875673294, "logits/rejected": -0.02332778088748455, "logps/chosen": -1.1048588752746582, "logps/rejected": -1.278424620628357, "loss": 1.1049, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1048588752746582, "rewards/margins": 0.17356595396995544, "rewards/rejected": -1.278424620628357, "step": 3910 }, { "epoch": 2.095333667837431, "grad_norm": 4.644949247269648, "learning_rate": 2.5216109613775573e-07, "logits/chosen": -0.194515198469162, "logits/rejected": -0.07326622307300568, "logps/chosen": -1.2042431831359863, "logps/rejected": -1.411727786064148, "loss": 1.2042, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2042431831359863, "rewards/margins": 0.20748472213745117, "rewards/rejected": -1.411727786064148, "step": 3915 }, { "epoch": 2.0980097006188325, "grad_norm": 5.256053732701818, "learning_rate": 2.5080968628184993e-07, "logits/chosen": -0.21517808735370636, "logits/rejected": -0.09298683702945709, "logps/chosen": -1.2183308601379395, "logps/rejected": -1.3978440761566162, "loss": 1.2183, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2183308601379395, "rewards/margins": 0.17951300740242004, "rewards/rejected": -1.3978440761566162, "step": 3920 }, { "epoch": 2.1006857334002342, "grad_norm": 4.196220345953325, "learning_rate": 2.494606940714605e-07, "logits/chosen": -0.18803146481513977, "logits/rejected": -0.09329582750797272, "logps/chosen": -1.149665117263794, "logps/rejected": -1.3139899969100952, "loss": 1.1497, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.149665117263794, "rewards/margins": 0.1643248200416565, "rewards/rejected": -1.3139899969100952, "step": 3925 }, { "epoch": 2.103361766181636, "grad_norm": 4.1183395285563815, "learning_rate": 2.4811413259451625e-07, "logits/chosen": -0.2597247064113617, "logits/rejected": -0.16696295142173767, "logps/chosen": -1.2429553270339966, "logps/rejected": -1.3102123737335205, "loss": 1.243, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2429553270339966, "rewards/margins": 0.06725703179836273, "rewards/rejected": -1.3102123737335205, "step": 3930 }, { "epoch": 2.106037798963037, "grad_norm": 5.985745780678282, "learning_rate": 2.46770014915362e-07, "logits/chosen": -0.12688815593719482, "logits/rejected": -0.09124980866909027, "logps/chosen": -1.180567979812622, "logps/rejected": -1.364963412284851, "loss": 1.1806, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.180567979812622, "rewards/margins": 0.18439556658267975, "rewards/rejected": -1.364963412284851, "step": 3935 }, { "epoch": 2.108713831744439, "grad_norm": 6.210575656091264, "learning_rate": 2.45428354074634e-07, "logits/chosen": -0.15768767893314362, "logits/rejected": -0.13311973214149475, "logps/chosen": -1.0971598625183105, "logps/rejected": -1.4057886600494385, "loss": 1.0972, "rewards/accuracies": 0.625, "rewards/chosen": -1.0971598625183105, "rewards/margins": 0.30862879753112793, "rewards/rejected": -1.4057886600494385, "step": 3940 }, { "epoch": 2.1113898645258407, "grad_norm": 6.300062372916166, "learning_rate": 2.4408916308913105e-07, "logits/chosen": -0.18163347244262695, "logits/rejected": -0.04523243010044098, "logps/chosen": -1.2317986488342285, "logps/rejected": -1.3191403150558472, "loss": 1.2318, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2317986488342285, "rewards/margins": 0.08734166622161865, "rewards/rejected": -1.3191403150558472, "step": 3945 }, { "epoch": 2.114065897307242, "grad_norm": 5.012617382905688, "learning_rate": 2.4275245495169025e-07, "logits/chosen": -0.10654453933238983, "logits/rejected": 0.014140886254608631, "logps/chosen": -1.1808995008468628, "logps/rejected": -1.2554553747177124, "loss": 1.1809, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1808995008468628, "rewards/margins": 0.07455600798130035, "rewards/rejected": -1.2554553747177124, "step": 3950 }, { "epoch": 2.1167419300886436, "grad_norm": 5.4441391488921465, "learning_rate": 2.414182426310597e-07, "logits/chosen": -0.25004976987838745, "logits/rejected": -0.18301230669021606, "logps/chosen": -1.1323000192642212, "logps/rejected": -1.278407335281372, "loss": 1.1323, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1323000192642212, "rewards/margins": 0.14610722661018372, "rewards/rejected": -1.278407335281372, "step": 3955 }, { "epoch": 2.1194179628700454, "grad_norm": 7.574739773492241, "learning_rate": 2.400865390717734e-07, "logits/chosen": -0.1610892415046692, "logits/rejected": -0.07008327543735504, "logps/chosen": -1.218372106552124, "logps/rejected": -1.4183920621871948, "loss": 1.2184, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.218372106552124, "rewards/margins": 0.20002003014087677, "rewards/rejected": -1.4183920621871948, "step": 3960 }, { "epoch": 2.1220939956514466, "grad_norm": 5.731575346404146, "learning_rate": 2.3875735719402475e-07, "logits/chosen": -0.11292193830013275, "logits/rejected": -0.034099627286195755, "logps/chosen": -1.1352802515029907, "logps/rejected": -1.3464281558990479, "loss": 1.1353, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1352802515029907, "rewards/margins": 0.21114778518676758, "rewards/rejected": -1.3464281558990479, "step": 3965 }, { "epoch": 2.1247700284328483, "grad_norm": 4.957542709534137, "learning_rate": 2.3743070989354258e-07, "logits/chosen": -0.16472133994102478, "logits/rejected": -0.09346790611743927, "logps/chosen": -1.1655364036560059, "logps/rejected": -1.3642406463623047, "loss": 1.1655, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1655364036560059, "rewards/margins": 0.19870427250862122, "rewards/rejected": -1.3642406463623047, "step": 3970 }, { "epoch": 2.12744606121425, "grad_norm": 4.378904801512612, "learning_rate": 2.3610661004146454e-07, "logits/chosen": -0.10130923986434937, "logits/rejected": -0.04169637709856033, "logps/chosen": -1.0745960474014282, "logps/rejected": -1.2901570796966553, "loss": 1.0746, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.0745960474014282, "rewards/margins": 0.21556086838245392, "rewards/rejected": -1.2901570796966553, "step": 3975 }, { "epoch": 2.1301220939956513, "grad_norm": 5.414015334483855, "learning_rate": 2.3478507048421314e-07, "logits/chosen": -0.21661147475242615, "logits/rejected": -0.16962361335754395, "logps/chosen": -1.0799754858016968, "logps/rejected": -1.3525947332382202, "loss": 1.08, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0799754858016968, "rewards/margins": 0.27261897921562195, "rewards/rejected": -1.3525947332382202, "step": 3980 }, { "epoch": 2.132798126777053, "grad_norm": 7.900169291942871, "learning_rate": 2.334661040433713e-07, "logits/chosen": -0.23253560066223145, "logits/rejected": -0.15436120331287384, "logps/chosen": -1.1643885374069214, "logps/rejected": -1.3365429639816284, "loss": 1.1644, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1643885374069214, "rewards/margins": 0.17215444147586823, "rewards/rejected": -1.3365429639816284, "step": 3985 }, { "epoch": 2.1354741595584548, "grad_norm": 4.747220427286868, "learning_rate": 2.321497235155568e-07, "logits/chosen": -0.2513046860694885, "logits/rejected": -0.15555958449840546, "logps/chosen": -1.1581534147262573, "logps/rejected": -1.2770174741744995, "loss": 1.1582, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1581534147262573, "rewards/margins": 0.11886407434940338, "rewards/rejected": -1.2770174741744995, "step": 3990 }, { "epoch": 2.138150192339856, "grad_norm": 5.315805765895684, "learning_rate": 2.3083594167229965e-07, "logits/chosen": -0.303212434053421, "logits/rejected": -0.10713453590869904, "logps/chosen": -1.1823982000350952, "logps/rejected": -1.4079653024673462, "loss": 1.1824, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1823982000350952, "rewards/margins": 0.2255672961473465, "rewards/rejected": -1.4079653024673462, "step": 3995 }, { "epoch": 2.1408262251212578, "grad_norm": 5.9800702130188945, "learning_rate": 2.295247712599167e-07, "logits/chosen": -0.1748323142528534, "logits/rejected": -0.09072832763195038, "logps/chosen": -1.1574102640151978, "logps/rejected": -1.319309949874878, "loss": 1.1574, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1574102640151978, "rewards/margins": 0.16189967095851898, "rewards/rejected": -1.319309949874878, "step": 4000 }, { "epoch": 2.1408262251212578, "eval_logits/chosen": 0.05428016558289528, "eval_logits/rejected": 0.11889573931694031, "eval_logps/chosen": -1.2629995346069336, "eval_logps/rejected": -1.3881502151489258, "eval_loss": 1.2632604837417603, "eval_rewards/accuracies": 0.5467358827590942, "eval_rewards/chosen": -1.2629995346069336, "eval_rewards/margins": 0.1251506358385086, "eval_rewards/rejected": -1.3881502151489258, "eval_runtime": 40.344, "eval_samples_per_second": 33.338, "eval_steps_per_second": 8.353, "step": 4000 }, { "epoch": 2.1435022579026595, "grad_norm": 5.906995393182759, "learning_rate": 2.2821622499938948e-07, "logits/chosen": -0.21090462803840637, "logits/rejected": -0.044907040894031525, "logps/chosen": -1.3144917488098145, "logps/rejected": -1.4227732419967651, "loss": 1.3145, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3144917488098145, "rewards/margins": 0.10828139632940292, "rewards/rejected": -1.4227732419967651, "step": 4005 }, { "epoch": 2.1461782906840607, "grad_norm": 4.660152748669971, "learning_rate": 2.269103155862391e-07, "logits/chosen": -0.19960683584213257, "logits/rejected": -0.130840465426445, "logps/chosen": -1.2303111553192139, "logps/rejected": -1.3218008279800415, "loss": 1.2303, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2303111553192139, "rewards/margins": 0.09148967266082764, "rewards/rejected": -1.3218008279800415, "step": 4010 }, { "epoch": 2.1488543234654625, "grad_norm": 5.4948271533286, "learning_rate": 2.2560705569040483e-07, "logits/chosen": -0.21906621754169464, "logits/rejected": -0.02337515726685524, "logps/chosen": -1.2080949544906616, "logps/rejected": -1.3153215646743774, "loss": 1.2081, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2080949544906616, "rewards/margins": 0.10722680389881134, "rewards/rejected": -1.3153215646743774, "step": 4015 }, { "epoch": 2.151530356246864, "grad_norm": 4.432657369465195, "learning_rate": 2.2430645795611963e-07, "logits/chosen": -0.2628961205482483, "logits/rejected": -0.15244093537330627, "logps/chosen": -1.2413898706436157, "logps/rejected": -1.3303942680358887, "loss": 1.2414, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2413898706436157, "rewards/margins": 0.08900441974401474, "rewards/rejected": -1.3303942680358887, "step": 4020 }, { "epoch": 2.1542063890282654, "grad_norm": 5.942339681800726, "learning_rate": 2.230085350017884e-07, "logits/chosen": -0.20108933746814728, "logits/rejected": -0.1273794174194336, "logps/chosen": -1.126192331314087, "logps/rejected": -1.2530637979507446, "loss": 1.1262, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.126192331314087, "rewards/margins": 0.12687142193317413, "rewards/rejected": -1.2530637979507446, "step": 4025 }, { "epoch": 2.156882421809667, "grad_norm": 7.78294000194396, "learning_rate": 2.2171329941986554e-07, "logits/chosen": -0.23928682506084442, "logits/rejected": -0.18458862602710724, "logps/chosen": -1.1380538940429688, "logps/rejected": -1.3498550653457642, "loss": 1.1381, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1380538940429688, "rewards/margins": 0.21180126070976257, "rewards/rejected": -1.3498550653457642, "step": 4030 }, { "epoch": 2.159558454591069, "grad_norm": 6.388277215403918, "learning_rate": 2.2042076377673202e-07, "logits/chosen": -0.20915019512176514, "logits/rejected": -0.1823062151670456, "logps/chosen": -1.1677210330963135, "logps/rejected": -1.2260313034057617, "loss": 1.1677, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1677210330963135, "rewards/margins": 0.058310020714998245, "rewards/rejected": -1.2260313034057617, "step": 4035 }, { "epoch": 2.16223448737247, "grad_norm": 5.007521085258633, "learning_rate": 2.1913094061257476e-07, "logits/chosen": -0.21797342598438263, "logits/rejected": -0.18966297805309296, "logps/chosen": -1.1317708492279053, "logps/rejected": -1.2209327220916748, "loss": 1.1318, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1317708492279053, "rewards/margins": 0.08916175365447998, "rewards/rejected": -1.2209327220916748, "step": 4040 }, { "epoch": 2.164910520153872, "grad_norm": 6.780296843223726, "learning_rate": 2.178438424412633e-07, "logits/chosen": -0.18733876943588257, "logits/rejected": -0.09680025279521942, "logps/chosen": -1.2313435077667236, "logps/rejected": -1.3279738426208496, "loss": 1.2313, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2313435077667236, "rewards/margins": 0.09663021564483643, "rewards/rejected": -1.3279738426208496, "step": 4045 }, { "epoch": 2.1675865529352736, "grad_norm": 4.151607825541677, "learning_rate": 2.165594817502302e-07, "logits/chosen": -0.2484176605939865, "logits/rejected": -0.17595510184764862, "logps/chosen": -1.2114479541778564, "logps/rejected": -1.2888625860214233, "loss": 1.2114, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2114479541778564, "rewards/margins": 0.07741458714008331, "rewards/rejected": -1.2888625860214233, "step": 4050 }, { "epoch": 2.170262585716675, "grad_norm": 5.4342290942689, "learning_rate": 2.1527787100034806e-07, "logits/chosen": -0.161203071475029, "logits/rejected": -0.12170781940221786, "logps/chosen": -1.1920238733291626, "logps/rejected": -1.337754249572754, "loss": 1.192, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1920238733291626, "rewards/margins": 0.14573028683662415, "rewards/rejected": -1.337754249572754, "step": 4055 }, { "epoch": 2.1729386184980766, "grad_norm": 6.379065940759612, "learning_rate": 2.1399902262581037e-07, "logits/chosen": -0.0636528879404068, "logits/rejected": 0.0001773778349161148, "logps/chosen": -1.1191489696502686, "logps/rejected": -1.1820695400238037, "loss": 1.1191, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1191489696502686, "rewards/margins": 0.06292052567005157, "rewards/rejected": -1.1820695400238037, "step": 4060 }, { "epoch": 2.1756146512794783, "grad_norm": 6.413439707919092, "learning_rate": 2.127229490340094e-07, "logits/chosen": -0.27686581015586853, "logits/rejected": -0.2106027901172638, "logps/chosen": -1.1920533180236816, "logps/rejected": -1.371518850326538, "loss": 1.1921, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1920533180236816, "rewards/margins": 0.17946556210517883, "rewards/rejected": -1.371518850326538, "step": 4065 }, { "epoch": 2.1782906840608796, "grad_norm": 6.351346020271495, "learning_rate": 2.1144966260541698e-07, "logits/chosen": -0.18480250239372253, "logits/rejected": -0.0361134372651577, "logps/chosen": -1.1448554992675781, "logps/rejected": -1.3010799884796143, "loss": 1.1449, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1448554992675781, "rewards/margins": 0.15622444450855255, "rewards/rejected": -1.3010799884796143, "step": 4070 }, { "epoch": 2.1809667168422813, "grad_norm": 5.93850036741521, "learning_rate": 2.1017917569346332e-07, "logits/chosen": -0.22624826431274414, "logits/rejected": -0.09671525657176971, "logps/chosen": -1.2367630004882812, "logps/rejected": -1.3488506078720093, "loss": 1.2368, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2367630004882812, "rewards/margins": 0.11208770424127579, "rewards/rejected": -1.3488506078720093, "step": 4075 }, { "epoch": 2.183642749623683, "grad_norm": 4.756468560159965, "learning_rate": 2.0891150062441837e-07, "logits/chosen": -0.1886388510465622, "logits/rejected": -0.08393391221761703, "logps/chosen": -1.2231789827346802, "logps/rejected": -1.3411672115325928, "loss": 1.2232, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2231789827346802, "rewards/margins": 0.11798839271068573, "rewards/rejected": -1.3411672115325928, "step": 4080 }, { "epoch": 2.1863187824050843, "grad_norm": 4.596971488512385, "learning_rate": 2.0764664969727086e-07, "logits/chosen": -0.15123023092746735, "logits/rejected": -0.06831952184438705, "logps/chosen": -1.2007838487625122, "logps/rejected": -1.2784687280654907, "loss": 1.2008, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2007838487625122, "rewards/margins": 0.07768485695123672, "rewards/rejected": -1.2784687280654907, "step": 4085 }, { "epoch": 2.188994815186486, "grad_norm": 4.174393821978833, "learning_rate": 2.0638463518361033e-07, "logits/chosen": -0.21402418613433838, "logits/rejected": -0.07391025871038437, "logps/chosen": -1.1798505783081055, "logps/rejected": -1.2890411615371704, "loss": 1.1799, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1798505783081055, "rewards/margins": 0.10919053852558136, "rewards/rejected": -1.2890411615371704, "step": 4090 }, { "epoch": 2.1916708479678877, "grad_norm": 5.1788446412777995, "learning_rate": 2.0512546932750702e-07, "logits/chosen": -0.19948585331439972, "logits/rejected": -0.13889484107494354, "logps/chosen": -1.2639248371124268, "logps/rejected": -1.3094875812530518, "loss": 1.2639, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2639248371124268, "rewards/margins": 0.0455627515912056, "rewards/rejected": -1.3094875812530518, "step": 4095 }, { "epoch": 2.194346880749289, "grad_norm": 5.766457725606512, "learning_rate": 2.0386916434539343e-07, "logits/chosen": -0.1380622833967209, "logits/rejected": -0.04484080895781517, "logps/chosen": -1.1066079139709473, "logps/rejected": -1.2694671154022217, "loss": 1.1066, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1066079139709473, "rewards/margins": 0.162859246134758, "rewards/rejected": -1.2694671154022217, "step": 4100 }, { "epoch": 2.1970229135306907, "grad_norm": 5.104600196917305, "learning_rate": 2.0261573242594627e-07, "logits/chosen": -0.18767094612121582, "logits/rejected": -0.030369337648153305, "logps/chosen": -1.2362712621688843, "logps/rejected": -1.3139015436172485, "loss": 1.2363, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2362712621688843, "rewards/margins": 0.07763019949197769, "rewards/rejected": -1.3139015436172485, "step": 4105 }, { "epoch": 2.1996989463120924, "grad_norm": 6.291347136852975, "learning_rate": 2.0136518572996724e-07, "logits/chosen": -0.13025102019309998, "logits/rejected": -0.001417426741681993, "logps/chosen": -1.1583443880081177, "logps/rejected": -1.2692008018493652, "loss": 1.1583, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1583443880081177, "rewards/margins": 0.11085645854473114, "rewards/rejected": -1.2692008018493652, "step": 4110 }, { "epoch": 2.202374979093494, "grad_norm": 4.5471879162471165, "learning_rate": 2.0011753639026617e-07, "logits/chosen": -0.13782523572444916, "logits/rejected": -0.12039103358983994, "logps/chosen": -1.1780264377593994, "logps/rejected": -1.3545936346054077, "loss": 1.178, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1780264377593994, "rewards/margins": 0.17656728625297546, "rewards/rejected": -1.3545936346054077, "step": 4115 }, { "epoch": 2.2050510118748954, "grad_norm": 3.8669761736059804, "learning_rate": 1.988727965115421e-07, "logits/chosen": -0.17218826711177826, "logits/rejected": -0.13592317700386047, "logps/chosen": -1.1309181451797485, "logps/rejected": -1.2820180654525757, "loss": 1.1309, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1309181451797485, "rewards/margins": 0.15109990537166595, "rewards/rejected": -1.2820180654525757, "step": 4120 }, { "epoch": 2.207727044656297, "grad_norm": 3.7728019582449672, "learning_rate": 1.9763097817026713e-07, "logits/chosen": -0.21644219756126404, "logits/rejected": -0.06764537841081619, "logps/chosen": -1.1596324443817139, "logps/rejected": -1.251082181930542, "loss": 1.1596, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1596324443817139, "rewards/margins": 0.09144963324069977, "rewards/rejected": -1.251082181930542, "step": 4125 }, { "epoch": 2.210403077437699, "grad_norm": 5.835538930907811, "learning_rate": 1.9639209341456796e-07, "logits/chosen": -0.16939784586429596, "logits/rejected": -0.10945162922143936, "logps/chosen": -1.1656649112701416, "logps/rejected": -1.3416519165039062, "loss": 1.1657, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1656649112701416, "rewards/margins": 0.17598704993724823, "rewards/rejected": -1.3416519165039062, "step": 4130 }, { "epoch": 2.2130791102191, "grad_norm": 9.403924293778829, "learning_rate": 1.951561542641102e-07, "logits/chosen": -0.18682146072387695, "logits/rejected": -0.19657281041145325, "logps/chosen": -1.2012414932250977, "logps/rejected": -1.3361612558364868, "loss": 1.2012, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2012414932250977, "rewards/margins": 0.13491973280906677, "rewards/rejected": -1.3361612558364868, "step": 4135 }, { "epoch": 2.215755143000502, "grad_norm": 5.029076890312318, "learning_rate": 1.939231727099806e-07, "logits/chosen": -0.2820887565612793, "logits/rejected": -0.219572514295578, "logps/chosen": -1.1362923383712769, "logps/rejected": -1.324304223060608, "loss": 1.1363, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1362923383712769, "rewards/margins": 0.18801166117191315, "rewards/rejected": -1.324304223060608, "step": 4140 }, { "epoch": 2.2184311757819035, "grad_norm": 5.443937276145351, "learning_rate": 1.926931607145719e-07, "logits/chosen": -0.14144976437091827, "logits/rejected": -0.03735000640153885, "logps/chosen": -1.2244155406951904, "logps/rejected": -1.406860113143921, "loss": 1.2244, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2244155406951904, "rewards/margins": 0.1824444979429245, "rewards/rejected": -1.406860113143921, "step": 4145 }, { "epoch": 2.221107208563305, "grad_norm": 4.836251332656813, "learning_rate": 1.9146613021146564e-07, "logits/chosen": -0.16402634978294373, "logits/rejected": -0.10602299869060516, "logps/chosen": -1.119221806526184, "logps/rejected": -1.2857345342636108, "loss": 1.1192, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.119221806526184, "rewards/margins": 0.16651271283626556, "rewards/rejected": -1.2857345342636108, "step": 4150 }, { "epoch": 2.2237832413447065, "grad_norm": 6.6328863753663425, "learning_rate": 1.9024209310531736e-07, "logits/chosen": -0.11063434928655624, "logits/rejected": -0.14188024401664734, "logps/chosen": -1.168270468711853, "logps/rejected": -1.3023529052734375, "loss": 1.1683, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.168270468711853, "rewards/margins": 0.13408248126506805, "rewards/rejected": -1.3023529052734375, "step": 4155 }, { "epoch": 2.2264592741261082, "grad_norm": 6.073034775936719, "learning_rate": 1.890210612717401e-07, "logits/chosen": -0.1861906349658966, "logits/rejected": -0.06680184602737427, "logps/chosen": -1.2142117023468018, "logps/rejected": -1.3409240245819092, "loss": 1.2142, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2142117023468018, "rewards/margins": 0.12671232223510742, "rewards/rejected": -1.3409240245819092, "step": 4160 }, { "epoch": 2.2291353069075095, "grad_norm": 5.078301579704596, "learning_rate": 1.8780304655719054e-07, "logits/chosen": -0.18547169864177704, "logits/rejected": -0.1158377081155777, "logps/chosen": -1.214281678199768, "logps/rejected": -1.3592588901519775, "loss": 1.2143, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.214281678199768, "rewards/margins": 0.14497728645801544, "rewards/rejected": -1.3592588901519775, "step": 4165 }, { "epoch": 2.231811339688911, "grad_norm": 6.013986212680199, "learning_rate": 1.865880607788523e-07, "logits/chosen": -0.10416097939014435, "logits/rejected": -0.06418883055448532, "logps/chosen": -1.1955678462982178, "logps/rejected": -1.3223003149032593, "loss": 1.1956, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1955678462982178, "rewards/margins": 0.12673239409923553, "rewards/rejected": -1.3223003149032593, "step": 4170 }, { "epoch": 2.234487372470313, "grad_norm": 4.851088076362137, "learning_rate": 1.8537611572452316e-07, "logits/chosen": -0.21652379631996155, "logits/rejected": -0.13747577369213104, "logps/chosen": -1.1767390966415405, "logps/rejected": -1.2916357517242432, "loss": 1.1767, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1767390966415405, "rewards/margins": 0.1148967295885086, "rewards/rejected": -1.2916357517242432, "step": 4175 }, { "epoch": 2.237163405251714, "grad_norm": 5.010714489537076, "learning_rate": 1.84167223152499e-07, "logits/chosen": -0.20866374671459198, "logits/rejected": -0.034557223320007324, "logps/chosen": -1.1267192363739014, "logps/rejected": -1.338154673576355, "loss": 1.1267, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1267192363739014, "rewards/margins": 0.21143536269664764, "rewards/rejected": -1.338154673576355, "step": 4180 }, { "epoch": 2.239839438033116, "grad_norm": 5.42725365213271, "learning_rate": 1.8296139479146112e-07, "logits/chosen": -0.21722157299518585, "logits/rejected": -0.22120361030101776, "logps/chosen": -1.0948892831802368, "logps/rejected": -1.289182424545288, "loss": 1.0949, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.0948892831802368, "rewards/margins": 0.19429326057434082, "rewards/rejected": -1.289182424545288, "step": 4185 }, { "epoch": 2.2425154708145176, "grad_norm": 6.724749072766587, "learning_rate": 1.8175864234036132e-07, "logits/chosen": -0.12532177567481995, "logits/rejected": -0.04166935011744499, "logps/chosen": -1.1783170700073242, "logps/rejected": -1.260327696800232, "loss": 1.1783, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1783170700073242, "rewards/margins": 0.08201070129871368, "rewards/rejected": -1.260327696800232, "step": 4190 }, { "epoch": 2.245191503595919, "grad_norm": 4.565760732700351, "learning_rate": 1.805589774683094e-07, "logits/chosen": -0.2991235554218292, "logits/rejected": -0.17879454791545868, "logps/chosen": -1.212246298789978, "logps/rejected": -1.280146837234497, "loss": 1.2122, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.212246298789978, "rewards/margins": 0.06790047883987427, "rewards/rejected": -1.280146837234497, "step": 4195 }, { "epoch": 2.2478675363773206, "grad_norm": 6.535230956201727, "learning_rate": 1.79362411814459e-07, "logits/chosen": -0.0603451132774353, "logits/rejected": -0.0837228000164032, "logps/chosen": -1.2196705341339111, "logps/rejected": -1.3689625263214111, "loss": 1.2197, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2196705341339111, "rewards/margins": 0.14929182827472687, "rewards/rejected": -1.3689625263214111, "step": 4200 }, { "epoch": 2.2505435691587223, "grad_norm": 6.185010248224673, "learning_rate": 1.7816895698789552e-07, "logits/chosen": -0.252445250749588, "logits/rejected": -0.194708913564682, "logps/chosen": -1.15798819065094, "logps/rejected": -1.3629449605941772, "loss": 1.158, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.15798819065094, "rewards/margins": 0.20495660603046417, "rewards/rejected": -1.3629449605941772, "step": 4205 }, { "epoch": 2.2532196019401236, "grad_norm": 5.090949086475784, "learning_rate": 1.7697862456752271e-07, "logits/chosen": -0.19289270043373108, "logits/rejected": -0.12604117393493652, "logps/chosen": -1.1647452116012573, "logps/rejected": -1.367032527923584, "loss": 1.1647, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1647452116012573, "rewards/margins": 0.20228734612464905, "rewards/rejected": -1.367032527923584, "step": 4210 }, { "epoch": 2.2558956347215253, "grad_norm": 5.150016508141441, "learning_rate": 1.7579142610195124e-07, "logits/chosen": -0.21593913435935974, "logits/rejected": -0.12875400483608246, "logps/chosen": -1.2000150680541992, "logps/rejected": -1.2847319841384888, "loss": 1.2, "rewards/accuracies": 0.5, "rewards/chosen": -1.2000150680541992, "rewards/margins": 0.08471700549125671, "rewards/rejected": -1.2847319841384888, "step": 4215 }, { "epoch": 2.258571667502927, "grad_norm": 5.58672446850194, "learning_rate": 1.7460737310938568e-07, "logits/chosen": -0.23756785690784454, "logits/rejected": -0.08933673053979874, "logps/chosen": -1.1930367946624756, "logps/rejected": -1.286585807800293, "loss": 1.193, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1930367946624756, "rewards/margins": 0.09354908764362335, "rewards/rejected": -1.286585807800293, "step": 4220 }, { "epoch": 2.2612477002843283, "grad_norm": 4.582531189364189, "learning_rate": 1.734264770775133e-07, "logits/chosen": -0.2260771244764328, "logits/rejected": -0.10067816823720932, "logps/chosen": -1.2181450128555298, "logps/rejected": -1.3994672298431396, "loss": 1.2181, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2181450128555298, "rewards/margins": 0.18132233619689941, "rewards/rejected": -1.3994672298431396, "step": 4225 }, { "epoch": 2.26392373306573, "grad_norm": 5.305092079435974, "learning_rate": 1.7224874946339241e-07, "logits/chosen": -0.23668476939201355, "logits/rejected": -0.18482361733913422, "logps/chosen": -1.2236946821212769, "logps/rejected": -1.4608328342437744, "loss": 1.2237, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2236946821212769, "rewards/margins": 0.23713819682598114, "rewards/rejected": -1.4608328342437744, "step": 4230 }, { "epoch": 2.2665997658471317, "grad_norm": 4.412658053102296, "learning_rate": 1.7107420169334186e-07, "logits/chosen": -0.19530200958251953, "logits/rejected": -0.09423814713954926, "logps/chosen": -1.253401517868042, "logps/rejected": -1.259357213973999, "loss": 1.2534, "rewards/accuracies": 0.4375, "rewards/chosen": -1.253401517868042, "rewards/margins": 0.005955693311989307, "rewards/rejected": -1.259357213973999, "step": 4235 }, { "epoch": 2.269275798628533, "grad_norm": 5.444242327890368, "learning_rate": 1.6990284516282893e-07, "logits/chosen": -0.19495432078838348, "logits/rejected": -0.11021178960800171, "logps/chosen": -1.204071044921875, "logps/rejected": -1.2475641965866089, "loss": 1.2041, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.204071044921875, "rewards/margins": 0.0434931255877018, "rewards/rejected": -1.2475641965866089, "step": 4240 }, { "epoch": 2.2719518314099347, "grad_norm": 5.651793040599297, "learning_rate": 1.687346912363602e-07, "logits/chosen": -0.2540439963340759, "logits/rejected": -0.16842758655548096, "logps/chosen": -1.2042417526245117, "logps/rejected": -1.2306530475616455, "loss": 1.2042, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2042417526245117, "rewards/margins": 0.02641119435429573, "rewards/rejected": -1.2306530475616455, "step": 4245 }, { "epoch": 2.2746278641913364, "grad_norm": 4.255152982044798, "learning_rate": 1.675697512473697e-07, "logits/chosen": -0.16772834956645966, "logits/rejected": -0.0462503619492054, "logps/chosen": -1.1865733861923218, "logps/rejected": -1.270499348640442, "loss": 1.1866, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1865733861923218, "rewards/margins": 0.08392590284347534, "rewards/rejected": -1.270499348640442, "step": 4250 }, { "epoch": 2.2773038969727377, "grad_norm": 6.8288231242560755, "learning_rate": 1.6640803649811087e-07, "logits/chosen": -0.23114566504955292, "logits/rejected": -0.05208281800150871, "logps/chosen": -1.1997205018997192, "logps/rejected": -1.333919644355774, "loss": 1.1997, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1997205018997192, "rewards/margins": 0.13419921696186066, "rewards/rejected": -1.333919644355774, "step": 4255 }, { "epoch": 2.2799799297541394, "grad_norm": 6.0667234707097775, "learning_rate": 1.6524955825954472e-07, "logits/chosen": -0.17926296591758728, "logits/rejected": -0.07549484074115753, "logps/chosen": -1.1265671253204346, "logps/rejected": -1.348105549812317, "loss": 1.1266, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1265671253204346, "rewards/margins": 0.22153839468955994, "rewards/rejected": -1.348105549812317, "step": 4260 }, { "epoch": 2.282655962535541, "grad_norm": 5.203471081665015, "learning_rate": 1.6409432777123277e-07, "logits/chosen": -0.280588299036026, "logits/rejected": -0.16086241602897644, "logps/chosen": -1.1885846853256226, "logps/rejected": -1.3065459728240967, "loss": 1.1886, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1885846853256226, "rewards/margins": 0.1179611086845398, "rewards/rejected": -1.3065459728240967, "step": 4265 }, { "epoch": 2.285331995316943, "grad_norm": 4.606303572849593, "learning_rate": 1.6294235624122577e-07, "logits/chosen": -0.1245080977678299, "logits/rejected": 0.06318476051092148, "logps/chosen": -1.1951229572296143, "logps/rejected": -1.4686368703842163, "loss": 1.1951, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1951229572296143, "rewards/margins": 0.27351415157318115, "rewards/rejected": -1.4686368703842163, "step": 4270 }, { "epoch": 2.288008028098344, "grad_norm": 7.234621586216011, "learning_rate": 1.6179365484595697e-07, "logits/chosen": -0.16479277610778809, "logits/rejected": -0.11197604984045029, "logps/chosen": -1.2437224388122559, "logps/rejected": -1.3827816247940063, "loss": 1.2437, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2437224388122559, "rewards/margins": 0.13905911147594452, "rewards/rejected": -1.3827816247940063, "step": 4275 }, { "epoch": 2.290684060879746, "grad_norm": 5.194976503553251, "learning_rate": 1.60648234730132e-07, "logits/chosen": -0.1847917139530182, "logits/rejected": -0.1338696926832199, "logps/chosen": -1.1496856212615967, "logps/rejected": -1.2471023797988892, "loss": 1.1497, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1496856212615967, "rewards/margins": 0.09741681814193726, "rewards/rejected": -1.2471023797988892, "step": 4280 }, { "epoch": 2.293360093661147, "grad_norm": 8.757913107543331, "learning_rate": 1.595061070066222e-07, "logits/chosen": -0.12038049846887589, "logits/rejected": -0.13174544274806976, "logps/chosen": -1.120507001876831, "logps/rejected": -1.2848656177520752, "loss": 1.1205, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.120507001876831, "rewards/margins": 0.16435864567756653, "rewards/rejected": -1.2848656177520752, "step": 4285 }, { "epoch": 2.296036126442549, "grad_norm": 5.684583507216784, "learning_rate": 1.5836728275635542e-07, "logits/chosen": -0.2385338991880417, "logits/rejected": -0.09940926730632782, "logps/chosen": -1.2218226194381714, "logps/rejected": -1.2780125141143799, "loss": 1.2218, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2218226194381714, "rewards/margins": 0.056189704686403275, "rewards/rejected": -1.2780125141143799, "step": 4290 }, { "epoch": 2.2987121592239506, "grad_norm": 8.194332233754405, "learning_rate": 1.5723177302820984e-07, "logits/chosen": -0.20604734122753143, "logits/rejected": -0.18329647183418274, "logps/chosen": -1.1734648942947388, "logps/rejected": -1.3342117071151733, "loss": 1.1735, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1734648942947388, "rewards/margins": 0.16074667870998383, "rewards/rejected": -1.3342117071151733, "step": 4295 }, { "epoch": 2.3013881920053523, "grad_norm": 6.503720397135868, "learning_rate": 1.5609958883890544e-07, "logits/chosen": -0.19482329487800598, "logits/rejected": -0.113958939909935, "logps/chosen": -1.1837066411972046, "logps/rejected": -1.2476557493209839, "loss": 1.1837, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1837066411972046, "rewards/margins": 0.06394918262958527, "rewards/rejected": -1.2476557493209839, "step": 4300 }, { "epoch": 2.3040642247867535, "grad_norm": 5.6631326090493275, "learning_rate": 1.5497074117289865e-07, "logits/chosen": -0.23928654193878174, "logits/rejected": -0.1413566768169403, "logps/chosen": -1.1509580612182617, "logps/rejected": -1.3253514766693115, "loss": 1.151, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1509580612182617, "rewards/margins": 0.1743934601545334, "rewards/rejected": -1.3253514766693115, "step": 4305 }, { "epoch": 2.3067402575681553, "grad_norm": 5.567407671434526, "learning_rate": 1.5384524098227402e-07, "logits/chosen": -0.19828811287879944, "logits/rejected": -0.04995100945234299, "logps/chosen": -1.1821668148040771, "logps/rejected": -1.322472333908081, "loss": 1.1822, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1821668148040771, "rewards/margins": 0.14030557870864868, "rewards/rejected": -1.322472333908081, "step": 4310 }, { "epoch": 2.3094162903495565, "grad_norm": 4.106275260833352, "learning_rate": 1.5272309918663974e-07, "logits/chosen": -0.17998647689819336, "logits/rejected": -0.07189061492681503, "logps/chosen": -1.1628793478012085, "logps/rejected": -1.2749170064926147, "loss": 1.1629, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1628793478012085, "rewards/margins": 0.11203751713037491, "rewards/rejected": -1.2749170064926147, "step": 4315 }, { "epoch": 2.3120923231309582, "grad_norm": 6.0285351189570076, "learning_rate": 1.516043266730201e-07, "logits/chosen": -0.20296883583068848, "logits/rejected": -0.11972220242023468, "logps/chosen": -1.2435534000396729, "logps/rejected": -1.2556636333465576, "loss": 1.2436, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2435534000396729, "rewards/margins": 0.01211016345769167, "rewards/rejected": -1.2556636333465576, "step": 4320 }, { "epoch": 2.31476835591236, "grad_norm": 7.441737646892377, "learning_rate": 1.504889342957512e-07, "logits/chosen": -0.22127382457256317, "logits/rejected": -0.10536559671163559, "logps/chosen": -1.1615079641342163, "logps/rejected": -1.3303349018096924, "loss": 1.1615, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1615079641342163, "rewards/margins": 0.1688269078731537, "rewards/rejected": -1.3303349018096924, "step": 4325 }, { "epoch": 2.3174443886937617, "grad_norm": 4.531883635656041, "learning_rate": 1.4937693287637453e-07, "logits/chosen": -0.1733209788799286, "logits/rejected": -0.0958932489156723, "logps/chosen": -1.2588638067245483, "logps/rejected": -1.2944846153259277, "loss": 1.2589, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -1.2588638067245483, "rewards/margins": 0.03562081977725029, "rewards/rejected": -1.2944846153259277, "step": 4330 }, { "epoch": 2.320120421475163, "grad_norm": 4.44274463685262, "learning_rate": 1.4826833320353305e-07, "logits/chosen": -0.14835233986377716, "logits/rejected": -0.10592067241668701, "logps/chosen": -1.21968674659729, "logps/rejected": -1.3323265314102173, "loss": 1.2197, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.21968674659729, "rewards/margins": 0.11263986676931381, "rewards/rejected": -1.3323265314102173, "step": 4335 }, { "epoch": 2.3227964542565647, "grad_norm": 5.621687516578505, "learning_rate": 1.4716314603286528e-07, "logits/chosen": -0.18628022074699402, "logits/rejected": -0.05752452462911606, "logps/chosen": -1.0943838357925415, "logps/rejected": -1.3676068782806396, "loss": 1.0944, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0943838357925415, "rewards/margins": 0.2732229232788086, "rewards/rejected": -1.3676068782806396, "step": 4340 }, { "epoch": 2.3254724870379664, "grad_norm": 6.856339279786717, "learning_rate": 1.4606138208690233e-07, "logits/chosen": -0.2285754680633545, "logits/rejected": -0.16179263591766357, "logps/chosen": -1.2859995365142822, "logps/rejected": -1.257214903831482, "loss": 1.286, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2859995365142822, "rewards/margins": -0.028784509748220444, "rewards/rejected": -1.257214903831482, "step": 4345 }, { "epoch": 2.3281485198193677, "grad_norm": 5.243709661696402, "learning_rate": 1.4496305205496251e-07, "logits/chosen": -0.15565867722034454, "logits/rejected": -0.10928313434123993, "logps/chosen": -1.1767737865447998, "logps/rejected": -1.3419196605682373, "loss": 1.1768, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1767737865447998, "rewards/margins": 0.16514596343040466, "rewards/rejected": -1.3419196605682373, "step": 4350 }, { "epoch": 2.3308245526007694, "grad_norm": 4.891847890095231, "learning_rate": 1.4386816659304895e-07, "logits/chosen": -0.20314589142799377, "logits/rejected": -0.13963311910629272, "logps/chosen": -1.2260957956314087, "logps/rejected": -1.3578708171844482, "loss": 1.2261, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2260957956314087, "rewards/margins": 0.13177499175071716, "rewards/rejected": -1.3578708171844482, "step": 4355 }, { "epoch": 2.333500585382171, "grad_norm": 6.1570029390676115, "learning_rate": 1.4277673632374492e-07, "logits/chosen": -0.27182695269584656, "logits/rejected": -0.10808505862951279, "logps/chosen": -1.2792165279388428, "logps/rejected": -1.3240224123001099, "loss": 1.2792, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2792165279388428, "rewards/margins": 0.04480605199933052, "rewards/rejected": -1.3240224123001099, "step": 4360 }, { "epoch": 2.3361766181635724, "grad_norm": 5.717619374918025, "learning_rate": 1.416887718361119e-07, "logits/chosen": -0.10773682594299316, "logits/rejected": -0.11748391389846802, "logps/chosen": -1.140716314315796, "logps/rejected": -1.3248605728149414, "loss": 1.1407, "rewards/accuracies": 0.53125, "rewards/chosen": -1.140716314315796, "rewards/margins": 0.1841442883014679, "rewards/rejected": -1.3248605728149414, "step": 4365 }, { "epoch": 2.338852650944974, "grad_norm": 7.4029551146693375, "learning_rate": 1.406042836855859e-07, "logits/chosen": -0.14697062969207764, "logits/rejected": -0.04036722332239151, "logps/chosen": -1.09419846534729, "logps/rejected": -1.3960702419281006, "loss": 1.0942, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.09419846534729, "rewards/margins": 0.3018718361854553, "rewards/rejected": -1.3960702419281006, "step": 4370 }, { "epoch": 2.341528683726376, "grad_norm": 4.1983934326236065, "learning_rate": 1.3952328239387595e-07, "logits/chosen": -0.29027217626571655, "logits/rejected": -0.15462729334831238, "logps/chosen": -1.2136160135269165, "logps/rejected": -1.3466086387634277, "loss": 1.2136, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2136160135269165, "rewards/margins": 0.13299258053302765, "rewards/rejected": -1.3466086387634277, "step": 4375 }, { "epoch": 2.344204716507777, "grad_norm": 5.088121388456647, "learning_rate": 1.3844577844886109e-07, "logits/chosen": -0.19045235216617584, "logits/rejected": -0.06157989054918289, "logps/chosen": -1.263784646987915, "logps/rejected": -1.3311150074005127, "loss": 1.2638, "rewards/accuracies": 0.5, "rewards/chosen": -1.263784646987915, "rewards/margins": 0.067330501973629, "rewards/rejected": -1.3311150074005127, "step": 4380 }, { "epoch": 2.346880749289179, "grad_norm": 6.96469855338725, "learning_rate": 1.3737178230448955e-07, "logits/chosen": -0.2547512650489807, "logits/rejected": -0.13945113122463226, "logps/chosen": -1.1682751178741455, "logps/rejected": -1.3938511610031128, "loss": 1.1683, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1682751178741455, "rewards/margins": 0.22557613253593445, "rewards/rejected": -1.3938511610031128, "step": 4385 }, { "epoch": 2.3495567820705805, "grad_norm": 5.5618655963842025, "learning_rate": 1.363013043806764e-07, "logits/chosen": -0.18951451778411865, "logits/rejected": -0.08988400548696518, "logps/chosen": -1.171539545059204, "logps/rejected": -1.2835333347320557, "loss": 1.1715, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.171539545059204, "rewards/margins": 0.11199358850717545, "rewards/rejected": -1.2835333347320557, "step": 4390 }, { "epoch": 2.3522328148519818, "grad_norm": 4.523159489750791, "learning_rate": 1.352343550632034e-07, "logits/chosen": -0.15745487809181213, "logits/rejected": -0.04096124321222305, "logps/chosen": -1.192718505859375, "logps/rejected": -1.3966691493988037, "loss": 1.1927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.192718505859375, "rewards/margins": 0.20395059883594513, "rewards/rejected": -1.3966691493988037, "step": 4395 }, { "epoch": 2.3549088476333835, "grad_norm": 4.9948257857547205, "learning_rate": 1.3417094470361722e-07, "logits/chosen": -0.2500094771385193, "logits/rejected": -0.1257254183292389, "logps/chosen": -1.1513335704803467, "logps/rejected": -1.357204794883728, "loss": 1.1513, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1513335704803467, "rewards/margins": 0.20587129890918732, "rewards/rejected": -1.357204794883728, "step": 4400 }, { "epoch": 2.3549088476333835, "eval_logits/chosen": 0.05666728690266609, "eval_logits/rejected": 0.12224721163511276, "eval_logps/chosen": -1.2627294063568115, "eval_logps/rejected": -1.386842131614685, "eval_loss": 1.262986660003662, "eval_rewards/accuracies": 0.5452522039413452, "eval_rewards/chosen": -1.2627294063568115, "eval_rewards/margins": 0.12411272525787354, "eval_rewards/rejected": -1.386842131614685, "eval_runtime": 40.2702, "eval_samples_per_second": 33.399, "eval_steps_per_second": 8.368, "step": 4400 }, { "epoch": 2.357584880414785, "grad_norm": 5.523813868598994, "learning_rate": 1.3311108361913015e-07, "logits/chosen": -0.24156026542186737, "logits/rejected": -0.20555052161216736, "logps/chosen": -1.1676639318466187, "logps/rejected": -1.258882761001587, "loss": 1.1677, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1676639318466187, "rewards/margins": 0.09121882915496826, "rewards/rejected": -1.258882761001587, "step": 4405 }, { "epoch": 2.3602609131961865, "grad_norm": 5.56630335125377, "learning_rate": 1.3205478209251874e-07, "logits/chosen": -0.1927478015422821, "logits/rejected": -0.1678342968225479, "logps/chosen": -1.2579342126846313, "logps/rejected": -1.447281002998352, "loss": 1.2579, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2579342126846313, "rewards/margins": 0.18934671580791473, "rewards/rejected": -1.447281002998352, "step": 4410 }, { "epoch": 2.362936945977588, "grad_norm": 5.714586536588741, "learning_rate": 1.310020503720254e-07, "logits/chosen": -0.17149321734905243, "logits/rejected": -0.05465017631649971, "logps/chosen": -1.1974270343780518, "logps/rejected": -1.3208844661712646, "loss": 1.1974, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1974270343780518, "rewards/margins": 0.12345733493566513, "rewards/rejected": -1.3208844661712646, "step": 4415 }, { "epoch": 2.36561297875899, "grad_norm": 5.00103716945003, "learning_rate": 1.2995289867125752e-07, "logits/chosen": -0.1917458474636078, "logits/rejected": -0.13558319211006165, "logps/chosen": -1.231506586074829, "logps/rejected": -1.2670447826385498, "loss": 1.2315, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.231506586074829, "rewards/margins": 0.035538215190172195, "rewards/rejected": -1.2670447826385498, "step": 4420 }, { "epoch": 2.368289011540391, "grad_norm": 4.973549375351627, "learning_rate": 1.2890733716908986e-07, "logits/chosen": -0.20136579871177673, "logits/rejected": -0.10562875121831894, "logps/chosen": -1.158573865890503, "logps/rejected": -1.351975679397583, "loss": 1.1586, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.158573865890503, "rewards/margins": 0.19340190291404724, "rewards/rejected": -1.351975679397583, "step": 4425 }, { "epoch": 2.370965044321793, "grad_norm": 4.510354380537708, "learning_rate": 1.2786537600956454e-07, "logits/chosen": -0.194970041513443, "logits/rejected": -0.0729452520608902, "logps/chosen": -1.2427709102630615, "logps/rejected": -1.354774832725525, "loss": 1.2428, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2427709102630615, "rewards/margins": 0.11200396716594696, "rewards/rejected": -1.354774832725525, "step": 4430 }, { "epoch": 2.3736410771031946, "grad_norm": 5.0380893653519365, "learning_rate": 1.268270253017933e-07, "logits/chosen": -0.2969645857810974, "logits/rejected": -0.1270444542169571, "logps/chosen": -1.1290185451507568, "logps/rejected": -1.3071434497833252, "loss": 1.129, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1290185451507568, "rewards/margins": 0.17812471091747284, "rewards/rejected": -1.3071434497833252, "step": 4435 }, { "epoch": 2.376317109884596, "grad_norm": 5.288911560868772, "learning_rate": 1.257922951198591e-07, "logits/chosen": -0.27020302414894104, "logits/rejected": -0.11802991479635239, "logps/chosen": -1.2351467609405518, "logps/rejected": -1.2933213710784912, "loss": 1.2351, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2351467609405518, "rewards/margins": 0.05817471817135811, "rewards/rejected": -1.2933213710784912, "step": 4440 }, { "epoch": 2.3789931426659976, "grad_norm": 5.434960057184529, "learning_rate": 1.24761195502719e-07, "logits/chosen": -0.22193975746631622, "logits/rejected": -0.08344869315624237, "logps/chosen": -1.1814290285110474, "logps/rejected": -1.298051118850708, "loss": 1.1814, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1814290285110474, "rewards/margins": 0.11662214994430542, "rewards/rejected": -1.298051118850708, "step": 4445 }, { "epoch": 2.3816691754473993, "grad_norm": 6.968396622882444, "learning_rate": 1.2373373645410573e-07, "logits/chosen": -0.17296342551708221, "logits/rejected": -0.08542780578136444, "logps/chosen": -1.225337028503418, "logps/rejected": -1.3246697187423706, "loss": 1.2253, "rewards/accuracies": 0.5625, "rewards/chosen": -1.225337028503418, "rewards/margins": 0.0993327870965004, "rewards/rejected": -1.3246697187423706, "step": 4450 }, { "epoch": 2.384345208228801, "grad_norm": 5.888447750893898, "learning_rate": 1.2270992794243175e-07, "logits/chosen": -0.24413204193115234, "logits/rejected": -0.1756843477487564, "logps/chosen": -1.2198927402496338, "logps/rejected": -1.3160490989685059, "loss": 1.2199, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2198927402496338, "rewards/margins": 0.0961565151810646, "rewards/rejected": -1.3160490989685059, "step": 4455 }, { "epoch": 2.3870212410102023, "grad_norm": 5.751472658119907, "learning_rate": 1.2168977990069147e-07, "logits/chosen": -0.2480367124080658, "logits/rejected": -0.06240849569439888, "logps/chosen": -1.1448185443878174, "logps/rejected": -1.3475663661956787, "loss": 1.1448, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1448185443878174, "rewards/margins": 0.20274803042411804, "rewards/rejected": -1.3475663661956787, "step": 4460 }, { "epoch": 2.389697273791604, "grad_norm": 5.3625057191664, "learning_rate": 1.206733022263659e-07, "logits/chosen": -0.24401506781578064, "logits/rejected": -0.10291938483715057, "logps/chosen": -1.2392865419387817, "logps/rejected": -1.3362852334976196, "loss": 1.2393, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2392865419387817, "rewards/margins": 0.09699881821870804, "rewards/rejected": -1.3362852334976196, "step": 4465 }, { "epoch": 2.3923733065730053, "grad_norm": 6.722048231435796, "learning_rate": 1.1966050478132572e-07, "logits/chosen": -0.11147363483905792, "logits/rejected": -0.05899057537317276, "logps/chosen": -1.0932403802871704, "logps/rejected": -1.223857045173645, "loss": 1.0932, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0932403802871704, "rewards/margins": 0.13061664998531342, "rewards/rejected": -1.223857045173645, "step": 4470 }, { "epoch": 2.395049339354407, "grad_norm": 4.441869541970744, "learning_rate": 1.1865139739173635e-07, "logits/chosen": -0.2188471108675003, "logits/rejected": -0.045181117951869965, "logps/chosen": -1.1408249139785767, "logps/rejected": -1.298242211341858, "loss": 1.1408, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1408249139785767, "rewards/margins": 0.15741726756095886, "rewards/rejected": -1.298242211341858, "step": 4475 }, { "epoch": 2.3977253721358087, "grad_norm": 5.312001229949146, "learning_rate": 1.1764598984796187e-07, "logits/chosen": -0.16660510003566742, "logits/rejected": -0.12142334133386612, "logps/chosen": -1.0977728366851807, "logps/rejected": -1.2042092084884644, "loss": 1.0978, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.0977728366851807, "rewards/margins": 0.1064363569021225, "rewards/rejected": -1.2042092084884644, "step": 4480 }, { "epoch": 2.4004014049172104, "grad_norm": 5.159925166267903, "learning_rate": 1.1664429190447095e-07, "logits/chosen": -0.21362897753715515, "logits/rejected": -0.12581035494804382, "logps/chosen": -1.2133947610855103, "logps/rejected": -1.3622853755950928, "loss": 1.2134, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2133947610855103, "rewards/margins": 0.1488906443119049, "rewards/rejected": -1.3622853755950928, "step": 4485 }, { "epoch": 2.4030774376986117, "grad_norm": 5.131275291658471, "learning_rate": 1.1564631327974122e-07, "logits/chosen": -0.23948411643505096, "logits/rejected": -0.06456609815359116, "logps/chosen": -1.1743736267089844, "logps/rejected": -1.3703372478485107, "loss": 1.1744, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1743736267089844, "rewards/margins": 0.195963516831398, "rewards/rejected": -1.3703372478485107, "step": 4490 }, { "epoch": 2.4057534704800134, "grad_norm": 5.505989284377567, "learning_rate": 1.1465206365616587e-07, "logits/chosen": -0.2696453928947449, "logits/rejected": -0.10105737298727036, "logps/chosen": -1.159428358078003, "logps/rejected": -1.3010650873184204, "loss": 1.1594, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.159428358078003, "rewards/margins": 0.14163659512996674, "rewards/rejected": -1.3010650873184204, "step": 4495 }, { "epoch": 2.408429503261415, "grad_norm": 5.9194036130805925, "learning_rate": 1.1366155267995887e-07, "logits/chosen": -0.08802883327007294, "logits/rejected": -0.10079698264598846, "logps/chosen": -1.1923701763153076, "logps/rejected": -1.3078563213348389, "loss": 1.1924, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1923701763153076, "rewards/margins": 0.1154862493276596, "rewards/rejected": -1.3078563213348389, "step": 4500 }, { "epoch": 2.4111055360428164, "grad_norm": 5.652342183095649, "learning_rate": 1.1267478996106228e-07, "logits/chosen": -0.24272421002388, "logits/rejected": -0.131375253200531, "logps/chosen": -1.1781219244003296, "logps/rejected": -1.4049549102783203, "loss": 1.1781, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1781219244003296, "rewards/margins": 0.22683291137218475, "rewards/rejected": -1.4049549102783203, "step": 4505 }, { "epoch": 2.413781568824218, "grad_norm": 6.878253690907194, "learning_rate": 1.116917850730521e-07, "logits/chosen": -0.2678452134132385, "logits/rejected": -0.16677314043045044, "logps/chosen": -1.117531418800354, "logps/rejected": -1.27407705783844, "loss": 1.1175, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.117531418800354, "rewards/margins": 0.15654563903808594, "rewards/rejected": -1.27407705783844, "step": 4510 }, { "epoch": 2.41645760160562, "grad_norm": 4.873054560165348, "learning_rate": 1.1071254755304637e-07, "logits/chosen": -0.2543852627277374, "logits/rejected": -0.20947547256946564, "logps/chosen": -1.1993083953857422, "logps/rejected": -1.3618755340576172, "loss": 1.1993, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1993083953857422, "rewards/margins": 0.16256703436374664, "rewards/rejected": -1.3618755340576172, "step": 4515 }, { "epoch": 2.419133634387021, "grad_norm": 5.973354068208982, "learning_rate": 1.0973708690161143e-07, "logits/chosen": -0.1900860071182251, "logits/rejected": -0.12994596362113953, "logps/chosen": -1.160346269607544, "logps/rejected": -1.3866424560546875, "loss": 1.1603, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.160346269607544, "rewards/margins": 0.22629621624946594, "rewards/rejected": -1.3866424560546875, "step": 4520 }, { "epoch": 2.421809667168423, "grad_norm": 5.93686634100255, "learning_rate": 1.0876541258267119e-07, "logits/chosen": -0.28135067224502563, "logits/rejected": -0.13348601758480072, "logps/chosen": -1.2872869968414307, "logps/rejected": -1.4346427917480469, "loss": 1.2873, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2872869968414307, "rewards/margins": 0.14735588431358337, "rewards/rejected": -1.4346427917480469, "step": 4525 }, { "epoch": 2.4244856999498245, "grad_norm": 4.855938558420744, "learning_rate": 1.0779753402341379e-07, "logits/chosen": -0.24857434630393982, "logits/rejected": -0.18430069088935852, "logps/chosen": -1.2187031507492065, "logps/rejected": -1.3256276845932007, "loss": 1.2187, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2187031507492065, "rewards/margins": 0.10692459344863892, "rewards/rejected": -1.3256276845932007, "step": 4530 }, { "epoch": 2.427161732731226, "grad_norm": 4.641183807403526, "learning_rate": 1.0683346061420157e-07, "logits/chosen": -0.08672784268856049, "logits/rejected": -0.008265185169875622, "logps/chosen": -1.143980860710144, "logps/rejected": -1.279421091079712, "loss": 1.144, "rewards/accuracies": 0.5625, "rewards/chosen": -1.143980860710144, "rewards/margins": 0.13544026017189026, "rewards/rejected": -1.279421091079712, "step": 4535 }, { "epoch": 2.4298377655126275, "grad_norm": 5.739283494463384, "learning_rate": 1.0587320170847874e-07, "logits/chosen": -0.09493064880371094, "logits/rejected": -0.054532237350940704, "logps/chosen": -1.1251952648162842, "logps/rejected": -1.2601686716079712, "loss": 1.1252, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1251952648162842, "rewards/margins": 0.13497324287891388, "rewards/rejected": -1.2601686716079712, "step": 4540 }, { "epoch": 2.4325137982940293, "grad_norm": 5.730168155271142, "learning_rate": 1.0491676662268156e-07, "logits/chosen": -0.1829853057861328, "logits/rejected": -0.0723324716091156, "logps/chosen": -1.1314141750335693, "logps/rejected": -1.268724799156189, "loss": 1.1314, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1314141750335693, "rewards/margins": 0.13731065392494202, "rewards/rejected": -1.268724799156189, "step": 4545 }, { "epoch": 2.4351898310754305, "grad_norm": 6.159181417905654, "learning_rate": 1.0396416463614732e-07, "logits/chosen": -0.22580654919147491, "logits/rejected": -0.1415761411190033, "logps/chosen": -1.1196224689483643, "logps/rejected": -1.3204675912857056, "loss": 1.1196, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1196224689483643, "rewards/margins": 0.20084521174430847, "rewards/rejected": -1.3204675912857056, "step": 4550 }, { "epoch": 2.4378658638568322, "grad_norm": 6.047812965475491, "learning_rate": 1.0301540499102479e-07, "logits/chosen": -0.19016066193580627, "logits/rejected": -0.13770441710948944, "logps/chosen": -1.2472150325775146, "logps/rejected": -1.3130838871002197, "loss": 1.2472, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2472150325775146, "rewards/margins": 0.06586878001689911, "rewards/rejected": -1.3130838871002197, "step": 4555 }, { "epoch": 2.440541896638234, "grad_norm": 4.993452560479063, "learning_rate": 1.0207049689218405e-07, "logits/chosen": -0.222621887922287, "logits/rejected": -0.08934080600738525, "logps/chosen": -1.1443763971328735, "logps/rejected": -1.259687066078186, "loss": 1.1444, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1443763971328735, "rewards/margins": 0.11531062424182892, "rewards/rejected": -1.259687066078186, "step": 4560 }, { "epoch": 2.4432179294196352, "grad_norm": 5.118583894435553, "learning_rate": 1.0112944950712782e-07, "logits/chosen": -0.17633287608623505, "logits/rejected": -0.13063417375087738, "logps/chosen": -1.2355321645736694, "logps/rejected": -1.3300834894180298, "loss": 1.2355, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2355321645736694, "rewards/margins": 0.0945511981844902, "rewards/rejected": -1.3300834894180298, "step": 4565 }, { "epoch": 2.445893962201037, "grad_norm": 4.829945695842303, "learning_rate": 1.0019227196590174e-07, "logits/chosen": -0.14353114366531372, "logits/rejected": -0.023749398067593575, "logps/chosen": -1.188342809677124, "logps/rejected": -1.3751198053359985, "loss": 1.1883, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.188342809677124, "rewards/margins": 0.18677690625190735, "rewards/rejected": -1.3751198053359985, "step": 4570 }, { "epoch": 2.4485699949824387, "grad_norm": 4.447009298417039, "learning_rate": 9.925897336100664e-08, "logits/chosen": -0.13171133399009705, "logits/rejected": -0.10042989253997803, "logps/chosen": -1.1794962882995605, "logps/rejected": -1.2994484901428223, "loss": 1.1795, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.1794962882995605, "rewards/margins": 0.1199522465467453, "rewards/rejected": -1.2994484901428223, "step": 4575 }, { "epoch": 2.45124602776384, "grad_norm": 6.241954639396088, "learning_rate": 9.832956274730946e-08, "logits/chosen": -0.2116454541683197, "logits/rejected": -0.17488452792167664, "logps/chosen": -1.1050554513931274, "logps/rejected": -1.3357754945755005, "loss": 1.1051, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1050554513931274, "rewards/margins": 0.23072025179862976, "rewards/rejected": -1.3357754945755005, "step": 4580 }, { "epoch": 2.4539220605452416, "grad_norm": 4.728306222902174, "learning_rate": 9.740404914195633e-08, "logits/chosen": -0.15715214610099792, "logits/rejected": -0.05165759474039078, "logps/chosen": -1.2528505325317383, "logps/rejected": -1.3100569248199463, "loss": 1.2529, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2528505325317383, "rewards/margins": 0.05720631033182144, "rewards/rejected": -1.3100569248199463, "step": 4585 }, { "epoch": 2.4565980933266434, "grad_norm": 5.292029059812616, "learning_rate": 9.648244152428392e-08, "logits/chosen": -0.27580323815345764, "logits/rejected": -0.12879705429077148, "logps/chosen": -1.133514642715454, "logps/rejected": -1.3243728876113892, "loss": 1.1335, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.133514642715454, "rewards/margins": 0.1908581405878067, "rewards/rejected": -1.3243728876113892, "step": 4590 }, { "epoch": 2.4592741261080446, "grad_norm": 5.597982928520781, "learning_rate": 9.556474883573379e-08, "logits/chosen": -0.23237867653369904, "logits/rejected": -0.1350330114364624, "logps/chosen": -1.1951416730880737, "logps/rejected": -1.3520140647888184, "loss": 1.1951, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1951416730880737, "rewards/margins": 0.15687242150306702, "rewards/rejected": -1.3520140647888184, "step": 4595 }, { "epoch": 2.4619501588894463, "grad_norm": 5.11692476746155, "learning_rate": 9.465097997976412e-08, "logits/chosen": -0.21551290154457092, "logits/rejected": 0.0001922404335346073, "logps/chosen": -1.240607500076294, "logps/rejected": -1.4366354942321777, "loss": 1.2406, "rewards/accuracies": 0.5625, "rewards/chosen": -1.240607500076294, "rewards/margins": 0.19602814316749573, "rewards/rejected": -1.4366354942321777, "step": 4600 }, { "epoch": 2.464626191670848, "grad_norm": 6.090119116839553, "learning_rate": 9.374114382176457e-08, "logits/chosen": -0.1400359719991684, "logits/rejected": -0.04062289744615555, "logps/chosen": -1.2085881233215332, "logps/rejected": -1.3708651065826416, "loss": 1.2086, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2085881233215332, "rewards/margins": 0.16227713227272034, "rewards/rejected": -1.3708651065826416, "step": 4605 }, { "epoch": 2.46730222445225, "grad_norm": 5.658284313301721, "learning_rate": 9.283524918896945e-08, "logits/chosen": -0.22492024302482605, "logits/rejected": -0.10142304748296738, "logps/chosen": -1.1958942413330078, "logps/rejected": -1.370773196220398, "loss": 1.1959, "rewards/accuracies": 0.625, "rewards/chosen": -1.1958942413330078, "rewards/margins": 0.17487908899784088, "rewards/rejected": -1.370773196220398, "step": 4610 }, { "epoch": 2.469978257233651, "grad_norm": 4.901023166887599, "learning_rate": 9.193330487037232e-08, "logits/chosen": -0.14853301644325256, "logits/rejected": -0.037921637296676636, "logps/chosen": -1.2176637649536133, "logps/rejected": -1.3970154523849487, "loss": 1.2177, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2176637649536133, "rewards/margins": 0.17935167253017426, "rewards/rejected": -1.3970154523849487, "step": 4615 }, { "epoch": 2.4726542900150528, "grad_norm": 6.19431422728608, "learning_rate": 9.103531961664118e-08, "logits/chosen": -0.17562012374401093, "logits/rejected": -0.024827472865581512, "logps/chosen": -1.1130481958389282, "logps/rejected": -1.316993236541748, "loss": 1.113, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1130481958389282, "rewards/margins": 0.20394523441791534, "rewards/rejected": -1.316993236541748, "step": 4620 }, { "epoch": 2.475330322796454, "grad_norm": 5.062521159412405, "learning_rate": 9.014130214003269e-08, "logits/chosen": -0.27393895387649536, "logits/rejected": -0.23451296985149384, "logps/chosen": -1.2380180358886719, "logps/rejected": -1.3642761707305908, "loss": 1.238, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2380180358886719, "rewards/margins": 0.12625792622566223, "rewards/rejected": -1.3642761707305908, "step": 4625 }, { "epoch": 2.4780063555778558, "grad_norm": 5.885070561703599, "learning_rate": 8.925126111430848e-08, "logits/chosen": -0.16663148999214172, "logits/rejected": -0.1011597141623497, "logps/chosen": -1.1397186517715454, "logps/rejected": -1.346660852432251, "loss": 1.1397, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1397186517715454, "rewards/margins": 0.20694205164909363, "rewards/rejected": -1.346660852432251, "step": 4630 }, { "epoch": 2.4806823883592575, "grad_norm": 5.944143040502373, "learning_rate": 8.83652051746504e-08, "logits/chosen": -0.10842017829418182, "logits/rejected": -0.0036957175470888615, "logps/chosen": -1.1433407068252563, "logps/rejected": -1.317526936531067, "loss": 1.1433, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1433407068252563, "rewards/margins": 0.17418628931045532, "rewards/rejected": -1.317526936531067, "step": 4635 }, { "epoch": 2.483358421140659, "grad_norm": 6.0717720945161435, "learning_rate": 8.748314291757696e-08, "logits/chosen": -0.14757370948791504, "logits/rejected": -0.06467844545841217, "logps/chosen": -1.186374306678772, "logps/rejected": -1.3525961637496948, "loss": 1.1864, "rewards/accuracies": 0.5625, "rewards/chosen": -1.186374306678772, "rewards/margins": 0.1662217080593109, "rewards/rejected": -1.3525961637496948, "step": 4640 }, { "epoch": 2.4860344539220605, "grad_norm": 5.533376908613183, "learning_rate": 8.660508290086032e-08, "logits/chosen": -0.16511614620685577, "logits/rejected": -0.051627375185489655, "logps/chosen": -1.1647348403930664, "logps/rejected": -1.352691888809204, "loss": 1.1647, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1647348403930664, "rewards/margins": 0.1879568248987198, "rewards/rejected": -1.352691888809204, "step": 4645 }, { "epoch": 2.488710486703462, "grad_norm": 5.660193126871121, "learning_rate": 8.573103364344231e-08, "logits/chosen": -0.20231175422668457, "logits/rejected": -0.006126874592155218, "logps/chosen": -1.1979787349700928, "logps/rejected": -1.4212746620178223, "loss": 1.198, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1979787349700928, "rewards/margins": 0.22329601645469666, "rewards/rejected": -1.4212746620178223, "step": 4650 }, { "epoch": 2.4913865194848634, "grad_norm": 6.397949134754644, "learning_rate": 8.486100362535292e-08, "logits/chosen": -0.22454223036766052, "logits/rejected": -0.10834813117980957, "logps/chosen": -1.170555830001831, "logps/rejected": -1.2736228704452515, "loss": 1.1706, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.170555830001831, "rewards/margins": 0.1030670553445816, "rewards/rejected": -1.2736228704452515, "step": 4655 }, { "epoch": 2.494062552266265, "grad_norm": 5.163428647654393, "learning_rate": 8.399500128762693e-08, "logits/chosen": -0.20536179840564728, "logits/rejected": -0.11849091947078705, "logps/chosen": -1.2344039678573608, "logps/rejected": -1.3251535892486572, "loss": 1.2344, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2344039678573608, "rewards/margins": 0.09074946492910385, "rewards/rejected": -1.3251535892486572, "step": 4660 }, { "epoch": 2.496738585047667, "grad_norm": 6.231489019142623, "learning_rate": 8.313303503222313e-08, "logits/chosen": -0.1729615032672882, "logits/rejected": -0.11849389225244522, "logps/chosen": -1.2576578855514526, "logps/rejected": -1.4136765003204346, "loss": 1.2577, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2576578855514526, "rewards/margins": 0.1560186743736267, "rewards/rejected": -1.4136765003204346, "step": 4665 }, { "epoch": 2.4994146178290686, "grad_norm": 5.861567135479616, "learning_rate": 8.227511322194164e-08, "logits/chosen": -0.18548189103603363, "logits/rejected": -0.08917588740587234, "logps/chosen": -1.23190176486969, "logps/rejected": -1.3362205028533936, "loss": 1.2319, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.23190176486969, "rewards/margins": 0.10431866347789764, "rewards/rejected": -1.3362205028533936, "step": 4670 }, { "epoch": 2.50209065061047, "grad_norm": 5.182587265542845, "learning_rate": 8.142124418034385e-08, "logits/chosen": -0.16466785967350006, "logits/rejected": -0.060517121106386185, "logps/chosen": -1.1398767232894897, "logps/rejected": -1.2975809574127197, "loss": 1.1399, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1398767232894897, "rewards/margins": 0.15770414471626282, "rewards/rejected": -1.2975809574127197, "step": 4675 }, { "epoch": 2.5047666833918716, "grad_norm": 6.918831642219247, "learning_rate": 8.057143619167073e-08, "logits/chosen": -0.16954359412193298, "logits/rejected": -0.08503154665231705, "logps/chosen": -1.1868553161621094, "logps/rejected": -1.2876912355422974, "loss": 1.1869, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1868553161621094, "rewards/margins": 0.1008359044790268, "rewards/rejected": -1.2876912355422974, "step": 4680 }, { "epoch": 2.507442716173273, "grad_norm": 4.4065475384264605, "learning_rate": 7.97256975007633e-08, "logits/chosen": -0.26007571816444397, "logits/rejected": -0.12184002250432968, "logps/chosen": -1.205588936805725, "logps/rejected": -1.3191555738449097, "loss": 1.2056, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.205588936805725, "rewards/margins": 0.11356665939092636, "rewards/rejected": -1.3191555738449097, "step": 4685 }, { "epoch": 2.5101187489546746, "grad_norm": 6.6765351159604105, "learning_rate": 7.888403631298186e-08, "logits/chosen": -0.18126823008060455, "logits/rejected": -0.15048277378082275, "logps/chosen": -1.2253854274749756, "logps/rejected": -1.3279612064361572, "loss": 1.2254, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2253854274749756, "rewards/margins": 0.1025756374001503, "rewards/rejected": -1.3279612064361572, "step": 4690 }, { "epoch": 2.5127947817360763, "grad_norm": 5.666001009467963, "learning_rate": 7.804646079412719e-08, "logits/chosen": -0.17539486289024353, "logits/rejected": -0.025555212050676346, "logps/chosen": -1.2076208591461182, "logps/rejected": -1.320468544960022, "loss": 1.2076, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2076208591461182, "rewards/margins": 0.11284773051738739, "rewards/rejected": -1.320468544960022, "step": 4695 }, { "epoch": 2.515470814517478, "grad_norm": 5.325048713176919, "learning_rate": 7.72129790703604e-08, "logits/chosen": -0.2408408671617508, "logits/rejected": -0.1614464819431305, "logps/chosen": -1.1684648990631104, "logps/rejected": -1.2785074710845947, "loss": 1.1685, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1684648990631104, "rewards/margins": 0.11004259437322617, "rewards/rejected": -1.2785074710845947, "step": 4700 }, { "epoch": 2.5181468472988793, "grad_norm": 7.672554993204083, "learning_rate": 7.638359922812504e-08, "logits/chosen": -0.13694968819618225, "logits/rejected": -0.12687084078788757, "logps/chosen": -1.2422469854354858, "logps/rejected": -1.3408772945404053, "loss": 1.2422, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2422469854354858, "rewards/margins": 0.09863032400608063, "rewards/rejected": -1.3408772945404053, "step": 4705 }, { "epoch": 2.520822880080281, "grad_norm": 7.498197704173337, "learning_rate": 7.555832931406774e-08, "logits/chosen": -0.2084512710571289, "logits/rejected": -0.10262282192707062, "logps/chosen": -1.2103984355926514, "logps/rejected": -1.3477360010147095, "loss": 1.2104, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2103984355926514, "rewards/margins": 0.1373373568058014, "rewards/rejected": -1.3477360010147095, "step": 4710 }, { "epoch": 2.5234989128616827, "grad_norm": 6.074585917115218, "learning_rate": 7.47371773349611e-08, "logits/chosen": -0.20565000176429749, "logits/rejected": -0.1958608329296112, "logps/chosen": -1.2800343036651611, "logps/rejected": -1.4032142162322998, "loss": 1.28, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2800343036651611, "rewards/margins": 0.12317997217178345, "rewards/rejected": -1.4032142162322998, "step": 4715 }, { "epoch": 2.526174945643084, "grad_norm": 6.133864907773721, "learning_rate": 7.392015125762496e-08, "logits/chosen": -0.16822464764118195, "logits/rejected": -0.10831649601459503, "logps/chosen": -1.1343886852264404, "logps/rejected": -1.3218200206756592, "loss": 1.1344, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1343886852264404, "rewards/margins": 0.1874314844608307, "rewards/rejected": -1.3218200206756592, "step": 4720 }, { "epoch": 2.5288509784244857, "grad_norm": 5.405635759647701, "learning_rate": 7.310725900885018e-08, "logits/chosen": -0.2158416509628296, "logits/rejected": -0.17118822038173676, "logps/chosen": -1.2045447826385498, "logps/rejected": -1.3072080612182617, "loss": 1.2045, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2045447826385498, "rewards/margins": 0.10266326367855072, "rewards/rejected": -1.3072080612182617, "step": 4725 }, { "epoch": 2.5315270112058874, "grad_norm": 5.393079110881741, "learning_rate": 7.229850847532076e-08, "logits/chosen": -0.16964614391326904, "logits/rejected": -0.07894574105739594, "logps/chosen": -1.1124845743179321, "logps/rejected": -1.257402777671814, "loss": 1.1125, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1124845743179321, "rewards/margins": 0.14491820335388184, "rewards/rejected": -1.257402777671814, "step": 4730 }, { "epoch": 2.5342030439872887, "grad_norm": 4.587399921829868, "learning_rate": 7.149390750353779e-08, "logits/chosen": -0.16120073199272156, "logits/rejected": -0.22209982573986053, "logps/chosen": -1.243112325668335, "logps/rejected": -1.3660151958465576, "loss": 1.2431, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.243112325668335, "rewards/margins": 0.12290263175964355, "rewards/rejected": -1.3660151958465576, "step": 4735 }, { "epoch": 2.5368790767686904, "grad_norm": 5.031742704718312, "learning_rate": 7.069346389974374e-08, "logits/chosen": -0.2238035649061203, "logits/rejected": -0.11091597378253937, "logps/chosen": -1.228814721107483, "logps/rejected": -1.4159437417984009, "loss": 1.2288, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.228814721107483, "rewards/margins": 0.1871289163827896, "rewards/rejected": -1.4159437417984009, "step": 4740 }, { "epoch": 2.539555109550092, "grad_norm": 6.567319010681789, "learning_rate": 6.989718542984563e-08, "logits/chosen": -0.15541593730449677, "logits/rejected": -0.13594503700733185, "logps/chosen": -1.2299058437347412, "logps/rejected": -1.3922489881515503, "loss": 1.2299, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2299058437347412, "rewards/margins": 0.1623431146144867, "rewards/rejected": -1.3922489881515503, "step": 4745 }, { "epoch": 2.5422311423314934, "grad_norm": 4.406660008618449, "learning_rate": 6.9105079819341e-08, "logits/chosen": -0.1264290064573288, "logits/rejected": 0.012927663512527943, "logps/chosen": -1.1730016469955444, "logps/rejected": -1.4700548648834229, "loss": 1.173, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1730016469955444, "rewards/margins": 0.29705309867858887, "rewards/rejected": -1.4700548648834229, "step": 4750 }, { "epoch": 2.544907175112895, "grad_norm": 5.319421256398169, "learning_rate": 6.831715475324163e-08, "logits/chosen": -0.18707406520843506, "logits/rejected": -0.06002429872751236, "logps/chosen": -1.1145838499069214, "logps/rejected": -1.362133264541626, "loss": 1.1146, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1145838499069214, "rewards/margins": 0.24754926562309265, "rewards/rejected": -1.362133264541626, "step": 4755 }, { "epoch": 2.547583207894297, "grad_norm": 4.759779000585132, "learning_rate": 6.753341787600026e-08, "logits/chosen": -0.22456875443458557, "logits/rejected": -0.11728477478027344, "logps/chosen": -1.118850827217102, "logps/rejected": -1.2661397457122803, "loss": 1.1189, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.118850827217102, "rewards/margins": 0.14728888869285583, "rewards/rejected": -1.2661397457122803, "step": 4760 }, { "epoch": 2.5502592406756985, "grad_norm": 6.283755377338128, "learning_rate": 6.67538767914353e-08, "logits/chosen": -0.21545438468456268, "logits/rejected": -0.08083502948284149, "logps/chosen": -1.1989905834197998, "logps/rejected": -1.2623674869537354, "loss": 1.199, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1989905834197998, "rewards/margins": 0.06337696313858032, "rewards/rejected": -1.2623674869537354, "step": 4765 }, { "epoch": 2.5529352734571, "grad_norm": 5.8519084111559625, "learning_rate": 6.597853906265793e-08, "logits/chosen": -0.17429722845554352, "logits/rejected": -0.10008683055639267, "logps/chosen": -1.213916540145874, "logps/rejected": -1.3702911138534546, "loss": 1.2139, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.213916540145874, "rewards/margins": 0.15637469291687012, "rewards/rejected": -1.3702911138534546, "step": 4770 }, { "epoch": 2.5556113062385015, "grad_norm": 4.2554833873852, "learning_rate": 6.5207412211998e-08, "logits/chosen": -0.09401429444551468, "logits/rejected": -0.004216939210891724, "logps/chosen": -1.1406121253967285, "logps/rejected": -1.285555124282837, "loss": 1.1406, "rewards/accuracies": 0.625, "rewards/chosen": -1.1406121253967285, "rewards/margins": 0.1449429988861084, "rewards/rejected": -1.285555124282837, "step": 4775 }, { "epoch": 2.558287339019903, "grad_norm": 4.3193203122828825, "learning_rate": 6.444050372093186e-08, "logits/chosen": -0.15367071330547333, "logits/rejected": -0.0981803685426712, "logps/chosen": -1.1941250562667847, "logps/rejected": -1.3311028480529785, "loss": 1.1941, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1941250562667847, "rewards/margins": 0.13697774708271027, "rewards/rejected": -1.3311028480529785, "step": 4780 }, { "epoch": 2.5609633718013045, "grad_norm": 6.292242046194075, "learning_rate": 6.367782103000873e-08, "logits/chosen": -0.14404451847076416, "logits/rejected": -0.09851828962564468, "logps/chosen": -1.2087633609771729, "logps/rejected": -1.3050864934921265, "loss": 1.2088, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2087633609771729, "rewards/margins": 0.09632303565740585, "rewards/rejected": -1.3050864934921265, "step": 4785 }, { "epoch": 2.5636394045827062, "grad_norm": 5.252354638104454, "learning_rate": 6.29193715387798e-08, "logits/chosen": -0.22890302538871765, "logits/rejected": -0.1200626865029335, "logps/chosen": -1.2216485738754272, "logps/rejected": -1.3447625637054443, "loss": 1.2216, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2216485738754272, "rewards/margins": 0.12311387062072754, "rewards/rejected": -1.3447625637054443, "step": 4790 }, { "epoch": 2.566315437364108, "grad_norm": 7.371512208812444, "learning_rate": 6.216516260572502e-08, "logits/chosen": -0.12141036987304688, "logits/rejected": -0.07743864506483078, "logps/chosen": -1.2227764129638672, "logps/rejected": -1.3519577980041504, "loss": 1.2228, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2227764129638672, "rewards/margins": 0.1291813850402832, "rewards/rejected": -1.3519577980041504, "step": 4795 }, { "epoch": 2.568991470145509, "grad_norm": 6.162446600460915, "learning_rate": 6.141520154818297e-08, "logits/chosen": -0.17543458938598633, "logits/rejected": -0.0968356728553772, "logps/chosen": -1.1365505456924438, "logps/rejected": -1.3125585317611694, "loss": 1.1366, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1365505456924438, "rewards/margins": 0.17600807547569275, "rewards/rejected": -1.3125585317611694, "step": 4800 }, { "epoch": 2.568991470145509, "eval_logits/chosen": 0.0752953514456749, "eval_logits/rejected": 0.14240272343158722, "eval_logps/chosen": -1.2621642351150513, "eval_logps/rejected": -1.3865567445755005, "eval_loss": 1.2624272108078003, "eval_rewards/accuracies": 0.5474777221679688, "eval_rewards/chosen": -1.2621642351150513, "eval_rewards/margins": 0.12439243495464325, "eval_rewards/rejected": -1.3865567445755005, "eval_runtime": 40.2424, "eval_samples_per_second": 33.422, "eval_steps_per_second": 8.374, "step": 4800 }, { "epoch": 2.571667502926911, "grad_norm": 7.4960708169434405, "learning_rate": 6.066949564227897e-08, "logits/chosen": -0.24003823101520538, "logits/rejected": -0.1478370875120163, "logps/chosen": -1.2503254413604736, "logps/rejected": -1.4473637342453003, "loss": 1.2503, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2503254413604736, "rewards/margins": 0.19703839719295502, "rewards/rejected": -1.4473637342453003, "step": 4805 }, { "epoch": 2.574343535708312, "grad_norm": 4.812672987028012, "learning_rate": 5.992805212285523e-08, "logits/chosen": -0.1379368007183075, "logits/rejected": -0.028243806213140488, "logps/chosen": -1.258652687072754, "logps/rejected": -1.3914520740509033, "loss": 1.2587, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.258652687072754, "rewards/margins": 0.1327994167804718, "rewards/rejected": -1.3914520740509033, "step": 4810 }, { "epoch": 2.577019568489714, "grad_norm": 5.446149926950313, "learning_rate": 5.9190878183399684e-08, "logits/chosen": -0.1551944613456726, "logits/rejected": -0.0711025595664978, "logps/chosen": -1.0876340866088867, "logps/rejected": -1.4005587100982666, "loss": 1.0876, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0876340866088867, "rewards/margins": 0.31292468309402466, "rewards/rejected": -1.4005587100982666, "step": 4815 }, { "epoch": 2.5796956012711156, "grad_norm": 6.677432131986382, "learning_rate": 5.845798097597748e-08, "logits/chosen": -0.15092647075653076, "logits/rejected": -0.07807110249996185, "logps/chosen": -1.2569522857666016, "logps/rejected": -1.3605177402496338, "loss": 1.257, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2569522857666016, "rewards/margins": 0.103565514087677, "rewards/rejected": -1.3605177402496338, "step": 4820 }, { "epoch": 2.5823716340525174, "grad_norm": 5.197052399158813, "learning_rate": 5.772936761116026e-08, "logits/chosen": -0.13844040036201477, "logits/rejected": -0.060238033533096313, "logps/chosen": -1.1579468250274658, "logps/rejected": -1.2746968269348145, "loss": 1.1579, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1579468250274658, "rewards/margins": 0.11675004661083221, "rewards/rejected": -1.2746968269348145, "step": 4825 }, { "epoch": 2.5850476668339186, "grad_norm": 4.129962450853852, "learning_rate": 5.700504515795829e-08, "logits/chosen": -0.21032865345478058, "logits/rejected": -0.09868721663951874, "logps/chosen": -1.2482476234436035, "logps/rejected": -1.3198285102844238, "loss": 1.2482, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2482476234436035, "rewards/margins": 0.07158108055591583, "rewards/rejected": -1.3198285102844238, "step": 4830 }, { "epoch": 2.5877236996153203, "grad_norm": 5.522796185793764, "learning_rate": 5.628502064375101e-08, "logits/chosen": -0.2913459539413452, "logits/rejected": -0.15157397091388702, "logps/chosen": -1.1762312650680542, "logps/rejected": -1.2772314548492432, "loss": 1.1762, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1762312650680542, "rewards/margins": 0.10100017488002777, "rewards/rejected": -1.2772314548492432, "step": 4835 }, { "epoch": 2.5903997323967216, "grad_norm": 6.797577756672359, "learning_rate": 5.55693010542197e-08, "logits/chosen": -0.2330973893404007, "logits/rejected": -0.09843787550926208, "logps/chosen": -1.1963143348693848, "logps/rejected": -1.303551435470581, "loss": 1.1963, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1963143348693848, "rewards/margins": 0.10723702609539032, "rewards/rejected": -1.303551435470581, "step": 4840 }, { "epoch": 2.5930757651781233, "grad_norm": 5.537708343567315, "learning_rate": 5.485789333327856e-08, "logits/chosen": -0.22261783480644226, "logits/rejected": -0.11749349534511566, "logps/chosen": -1.154234766960144, "logps/rejected": -1.3453904390335083, "loss": 1.1542, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.154234766960144, "rewards/margins": 0.19115546345710754, "rewards/rejected": -1.3453904390335083, "step": 4845 }, { "epoch": 2.595751797959525, "grad_norm": 4.947702455342151, "learning_rate": 5.4150804383008675e-08, "logits/chosen": -0.3160969316959381, "logits/rejected": -0.188938170671463, "logps/chosen": -1.1438184976577759, "logps/rejected": -1.2822356224060059, "loss": 1.1438, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1438184976577759, "rewards/margins": 0.13841703534126282, "rewards/rejected": -1.2822356224060059, "step": 4850 }, { "epoch": 2.5984278307409268, "grad_norm": 5.009193290407017, "learning_rate": 5.344804106359002e-08, "logits/chosen": -0.15136580169200897, "logits/rejected": -0.02489718422293663, "logps/chosen": -1.1397643089294434, "logps/rejected": -1.2501283884048462, "loss": 1.1398, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1397643089294434, "rewards/margins": 0.1103641614317894, "rewards/rejected": -1.2501283884048462, "step": 4855 }, { "epoch": 2.601103863522328, "grad_norm": 7.339441458758584, "learning_rate": 5.274961019323559e-08, "logits/chosen": -0.19895166158676147, "logits/rejected": -0.1605958491563797, "logps/chosen": -1.1060354709625244, "logps/rejected": -1.389443278312683, "loss": 1.106, "rewards/accuracies": 0.625, "rewards/chosen": -1.1060354709625244, "rewards/margins": 0.2834080159664154, "rewards/rejected": -1.389443278312683, "step": 4860 }, { "epoch": 2.6037798963037297, "grad_norm": 5.320839070474526, "learning_rate": 5.205551854812451e-08, "logits/chosen": -0.28505444526672363, "logits/rejected": -0.22394447028636932, "logps/chosen": -1.1787769794464111, "logps/rejected": -1.3853830099105835, "loss": 1.1788, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1787769794464111, "rewards/margins": 0.20660610496997833, "rewards/rejected": -1.3853830099105835, "step": 4865 }, { "epoch": 2.606455929085131, "grad_norm": 6.484380224057769, "learning_rate": 5.1365772862337177e-08, "logits/chosen": -0.15008458495140076, "logits/rejected": -0.051386576145887375, "logps/chosen": -1.1805475950241089, "logps/rejected": -1.2719175815582275, "loss": 1.1805, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1805475950241089, "rewards/margins": 0.09136978536844254, "rewards/rejected": -1.2719175815582275, "step": 4870 }, { "epoch": 2.6091319618665327, "grad_norm": 6.296979666412114, "learning_rate": 5.068037982778905e-08, "logits/chosen": -0.0641704648733139, "logits/rejected": -0.008793656714260578, "logps/chosen": -1.150014042854309, "logps/rejected": -1.4204457998275757, "loss": 1.15, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.150014042854309, "rewards/margins": 0.2704318165779114, "rewards/rejected": -1.4204457998275757, "step": 4875 }, { "epoch": 2.6118079946479344, "grad_norm": 4.670697663655496, "learning_rate": 4.999934609416656e-08, "logits/chosen": -0.09966184198856354, "logits/rejected": -0.018138553947210312, "logps/chosen": -1.1557397842407227, "logps/rejected": -1.3259261846542358, "loss": 1.1557, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1557397842407227, "rewards/margins": 0.170186385512352, "rewards/rejected": -1.3259261846542358, "step": 4880 }, { "epoch": 2.614484027429336, "grad_norm": 5.513139267842803, "learning_rate": 4.932267826886183e-08, "logits/chosen": -0.12034069001674652, "logits/rejected": -0.05818212777376175, "logps/chosen": -1.2567121982574463, "logps/rejected": -1.4427788257598877, "loss": 1.2567, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2567121982574463, "rewards/margins": 0.1860666573047638, "rewards/rejected": -1.4427788257598877, "step": 4885 }, { "epoch": 2.6171600602107374, "grad_norm": 4.3887667522166955, "learning_rate": 4.8650382916909206e-08, "logits/chosen": -0.24120764434337616, "logits/rejected": -0.11841032654047012, "logps/chosen": -1.1846390962600708, "logps/rejected": -1.3520411252975464, "loss": 1.1846, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1846390962600708, "rewards/margins": 0.16740210354328156, "rewards/rejected": -1.3520411252975464, "step": 4890 }, { "epoch": 2.619836092992139, "grad_norm": 5.777710337374481, "learning_rate": 4.7982466560920976e-08, "logits/chosen": -0.2067561149597168, "logits/rejected": -0.13148149847984314, "logps/chosen": -1.2291290760040283, "logps/rejected": -1.3055697679519653, "loss": 1.2291, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2291290760040283, "rewards/margins": 0.07644061744213104, "rewards/rejected": -1.3055697679519653, "step": 4895 }, { "epoch": 2.622512125773541, "grad_norm": 4.86780116045908, "learning_rate": 4.7318935681024685e-08, "logits/chosen": -0.14461107552051544, "logits/rejected": -0.042825616896152496, "logps/chosen": -1.174375057220459, "logps/rejected": -1.3412834405899048, "loss": 1.1744, "rewards/accuracies": 0.59375, "rewards/chosen": -1.174375057220459, "rewards/margins": 0.1669083833694458, "rewards/rejected": -1.3412834405899048, "step": 4900 }, { "epoch": 2.625188158554942, "grad_norm": 4.714393388089444, "learning_rate": 4.6659796714799745e-08, "logits/chosen": -0.20261415839195251, "logits/rejected": -0.08214298635721207, "logps/chosen": -1.167417287826538, "logps/rejected": -1.3362938165664673, "loss": 1.1674, "rewards/accuracies": 0.5625, "rewards/chosen": -1.167417287826538, "rewards/margins": 0.1688765585422516, "rewards/rejected": -1.3362938165664673, "step": 4905 }, { "epoch": 2.627864191336344, "grad_norm": 5.17041608417091, "learning_rate": 4.60050560572155e-08, "logits/chosen": -0.2040441781282425, "logits/rejected": -0.2344442903995514, "logps/chosen": -1.1840440034866333, "logps/rejected": -1.3913416862487793, "loss": 1.184, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1840440034866333, "rewards/margins": 0.20729772746562958, "rewards/rejected": -1.3913416862487793, "step": 4910 }, { "epoch": 2.6305402241177456, "grad_norm": 5.997372220482468, "learning_rate": 4.535472006056834e-08, "logits/chosen": -0.14078983664512634, "logits/rejected": -0.07120084017515182, "logps/chosen": -1.1058403253555298, "logps/rejected": -1.3315671682357788, "loss": 1.1058, "rewards/accuracies": 0.625, "rewards/chosen": -1.1058403253555298, "rewards/margins": 0.22572699189186096, "rewards/rejected": -1.3315671682357788, "step": 4915 }, { "epoch": 2.6332162568991473, "grad_norm": 7.804683088984589, "learning_rate": 4.470879503442132e-08, "logits/chosen": -0.1071610301733017, "logits/rejected": -0.052795834839344025, "logps/chosen": -1.192765235900879, "logps/rejected": -1.2656968832015991, "loss": 1.1928, "rewards/accuracies": 0.53125, "rewards/chosen": -1.192765235900879, "rewards/margins": 0.0729316845536232, "rewards/rejected": -1.2656968832015991, "step": 4920 }, { "epoch": 2.6358922896805486, "grad_norm": 5.738891963292777, "learning_rate": 4.406728724554154e-08, "logits/chosen": -0.3229966163635254, "logits/rejected": -0.11962000280618668, "logps/chosen": -1.1598461866378784, "logps/rejected": -1.3309322595596313, "loss": 1.1598, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1598461866378784, "rewards/margins": 0.17108608782291412, "rewards/rejected": -1.3309322595596313, "step": 4925 }, { "epoch": 2.6385683224619503, "grad_norm": 5.403408037894388, "learning_rate": 4.3430202917840664e-08, "logits/chosen": -0.10431374609470367, "logits/rejected": 0.0071934908628463745, "logps/chosen": -1.2204995155334473, "logps/rejected": -1.4359748363494873, "loss": 1.2205, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2204995155334473, "rewards/margins": 0.21547511219978333, "rewards/rejected": -1.4359748363494873, "step": 4930 }, { "epoch": 2.6412443552433515, "grad_norm": 5.117836695480666, "learning_rate": 4.279754823231346e-08, "logits/chosen": -0.23167547583580017, "logits/rejected": -0.11429695785045624, "logps/chosen": -1.1821587085723877, "logps/rejected": -1.2938364744186401, "loss": 1.1822, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1821587085723877, "rewards/margins": 0.11167782545089722, "rewards/rejected": -1.2938364744186401, "step": 4935 }, { "epoch": 2.6439203880247533, "grad_norm": 4.794404068962244, "learning_rate": 4.216932932697859e-08, "logits/chosen": -0.18662020564079285, "logits/rejected": -0.13535261154174805, "logps/chosen": -1.1891790628433228, "logps/rejected": -1.2651548385620117, "loss": 1.1892, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1891790628433228, "rewards/margins": 0.07597588002681732, "rewards/rejected": -1.2651548385620117, "step": 4940 }, { "epoch": 2.646596420806155, "grad_norm": 5.354862229272335, "learning_rate": 4.154555229681844e-08, "logits/chosen": -0.2027326077222824, "logits/rejected": -0.050923902541399, "logps/chosen": -1.1797066926956177, "logps/rejected": -1.387333869934082, "loss": 1.1797, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1797066926956177, "rewards/margins": 0.20762722194194794, "rewards/rejected": -1.387333869934082, "step": 4945 }, { "epoch": 2.6492724535875567, "grad_norm": 5.217182521520097, "learning_rate": 4.092622319372069e-08, "logits/chosen": -0.21795284748077393, "logits/rejected": -0.13162443041801453, "logps/chosen": -1.1420456171035767, "logps/rejected": -1.3397343158721924, "loss": 1.142, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1420456171035767, "rewards/margins": 0.19768880307674408, "rewards/rejected": -1.3397343158721924, "step": 4950 }, { "epoch": 2.651948486368958, "grad_norm": 5.990667337314142, "learning_rate": 4.031134802641889e-08, "logits/chosen": -0.17237022519111633, "logits/rejected": -0.18822908401489258, "logps/chosen": -1.1675573587417603, "logps/rejected": -1.3790652751922607, "loss": 1.1676, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1675573587417603, "rewards/margins": 0.21150800585746765, "rewards/rejected": -1.3790652751922607, "step": 4955 }, { "epoch": 2.6546245191503597, "grad_norm": 6.163116239154256, "learning_rate": 3.970093276043468e-08, "logits/chosen": -0.0863252580165863, "logits/rejected": -0.009489977732300758, "logps/chosen": -1.2129433155059814, "logps/rejected": -1.321547508239746, "loss": 1.2129, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2129433155059814, "rewards/margins": 0.10860419273376465, "rewards/rejected": -1.321547508239746, "step": 4960 }, { "epoch": 2.657300551931761, "grad_norm": 5.651322549666769, "learning_rate": 3.9094983318019584e-08, "logits/chosen": -0.20125167071819305, "logits/rejected": -0.09831400960683823, "logps/chosen": -1.099871039390564, "logps/rejected": -1.2618236541748047, "loss": 1.0999, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.099871039390564, "rewards/margins": 0.16195257008075714, "rewards/rejected": -1.2618236541748047, "step": 4965 }, { "epoch": 2.6599765847131627, "grad_norm": 7.326244840344578, "learning_rate": 3.849350557809789e-08, "logits/chosen": -0.12110571563243866, "logits/rejected": -0.07631794363260269, "logps/chosen": -1.1424297094345093, "logps/rejected": -1.379982590675354, "loss": 1.1424, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1424297094345093, "rewards/margins": 0.2375527322292328, "rewards/rejected": -1.379982590675354, "step": 4970 }, { "epoch": 2.6626526174945644, "grad_norm": 6.031805625967296, "learning_rate": 3.789650537620903e-08, "logits/chosen": -0.15165838599205017, "logits/rejected": -0.12247618287801743, "logps/chosen": -1.2030824422836304, "logps/rejected": -1.3262325525283813, "loss": 1.2031, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2030824422836304, "rewards/margins": 0.1231498271226883, "rewards/rejected": -1.3262325525283813, "step": 4975 }, { "epoch": 2.665328650275966, "grad_norm": 5.609373229705442, "learning_rate": 3.730398850445182e-08, "logits/chosen": -0.07479704171419144, "logits/rejected": -0.02665177546441555, "logps/chosen": -1.3090709447860718, "logps/rejected": -1.384779691696167, "loss": 1.3091, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3090709447860718, "rewards/margins": 0.07570876181125641, "rewards/rejected": -1.384779691696167, "step": 4980 }, { "epoch": 2.6680046830573674, "grad_norm": 6.326004564681149, "learning_rate": 3.671596071142735e-08, "logits/chosen": -0.12609989941120148, "logits/rejected": 0.0023765333462506533, "logps/chosen": -1.1358258724212646, "logps/rejected": -1.3041812181472778, "loss": 1.1358, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1358258724212646, "rewards/margins": 0.16835537552833557, "rewards/rejected": -1.3041812181472778, "step": 4985 }, { "epoch": 2.670680715838769, "grad_norm": 4.747061786799284, "learning_rate": 3.6132427702183996e-08, "logits/chosen": -0.23498916625976562, "logits/rejected": -0.05921167880296707, "logps/chosen": -1.1376913785934448, "logps/rejected": -1.3292109966278076, "loss": 1.1377, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1376913785934448, "rewards/margins": 0.19151976704597473, "rewards/rejected": -1.3292109966278076, "step": 4990 }, { "epoch": 2.6733567486201704, "grad_norm": 6.097580295563486, "learning_rate": 3.555339513816147e-08, "logits/chosen": -0.20533323287963867, "logits/rejected": -0.20229384303092957, "logps/chosen": -1.1930975914001465, "logps/rejected": -1.3230268955230713, "loss": 1.1931, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1930975914001465, "rewards/margins": 0.1299293339252472, "rewards/rejected": -1.3230268955230713, "step": 4995 }, { "epoch": 2.676032781401572, "grad_norm": 6.215917025695918, "learning_rate": 3.497886863713639e-08, "logits/chosen": -0.19908447563648224, "logits/rejected": -0.1981719583272934, "logps/chosen": -1.1754047870635986, "logps/rejected": -1.4053175449371338, "loss": 1.1754, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1754047870635986, "rewards/margins": 0.22991275787353516, "rewards/rejected": -1.4053175449371338, "step": 5000 }, { "epoch": 2.678708814182974, "grad_norm": 6.534425518157526, "learning_rate": 3.440885377316721e-08, "logits/chosen": -0.13613763451576233, "logits/rejected": -0.11029189825057983, "logps/chosen": -1.1713402271270752, "logps/rejected": -1.3105189800262451, "loss": 1.1713, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1713402271270752, "rewards/margins": 0.13917875289916992, "rewards/rejected": -1.3105189800262451, "step": 5005 }, { "epoch": 2.6813848469643755, "grad_norm": 5.8684669501863205, "learning_rate": 3.384335607654082e-08, "logits/chosen": -0.1185506209731102, "logits/rejected": -0.050756312906742096, "logps/chosen": -1.3052974939346313, "logps/rejected": -1.4339029788970947, "loss": 1.3053, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3052974939346313, "rewards/margins": 0.12860548496246338, "rewards/rejected": -1.4339029788970947, "step": 5010 }, { "epoch": 2.684060879745777, "grad_norm": 7.181824740197804, "learning_rate": 3.328238103371811e-08, "logits/chosen": -0.2272065430879593, "logits/rejected": -0.18868757784366608, "logps/chosen": -1.173488974571228, "logps/rejected": -1.365796685218811, "loss": 1.1735, "rewards/accuracies": 0.59375, "rewards/chosen": -1.173488974571228, "rewards/margins": 0.1923074871301651, "rewards/rejected": -1.365796685218811, "step": 5015 }, { "epoch": 2.6867369125271785, "grad_norm": 6.438360201490381, "learning_rate": 3.272593408728169e-08, "logits/chosen": -0.23044832050800323, "logits/rejected": -0.08877022564411163, "logps/chosen": -1.1520580053329468, "logps/rejected": -1.2617380619049072, "loss": 1.1521, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1520580053329468, "rewards/margins": 0.10968013107776642, "rewards/rejected": -1.2617380619049072, "step": 5020 }, { "epoch": 2.6894129453085798, "grad_norm": 6.118612213504916, "learning_rate": 3.217402063588204e-08, "logits/chosen": -0.22410042583942413, "logits/rejected": -0.115653857588768, "logps/chosen": -1.2182083129882812, "logps/rejected": -1.3085105419158936, "loss": 1.2182, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2182083129882812, "rewards/margins": 0.09030220657587051, "rewards/rejected": -1.3085105419158936, "step": 5025 }, { "epoch": 2.6920889780899815, "grad_norm": 4.707440573234997, "learning_rate": 3.162664603418608e-08, "logits/chosen": -0.17822813987731934, "logits/rejected": -0.14111852645874023, "logps/chosen": -1.1891123056411743, "logps/rejected": -1.262563705444336, "loss": 1.1891, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.1891123056411743, "rewards/margins": 0.07345139235258102, "rewards/rejected": -1.262563705444336, "step": 5030 }, { "epoch": 2.694765010871383, "grad_norm": 7.368397229785851, "learning_rate": 3.1083815592824416e-08, "logits/chosen": -0.2379087656736374, "logits/rejected": -0.14099231362342834, "logps/chosen": -1.2164126634597778, "logps/rejected": -1.354068636894226, "loss": 1.2164, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2164126634597778, "rewards/margins": 0.13765597343444824, "rewards/rejected": -1.354068636894226, "step": 5035 }, { "epoch": 2.697441043652785, "grad_norm": 7.548351521746323, "learning_rate": 3.054553457834053e-08, "logits/chosen": 0.0034999041818082333, "logits/rejected": -0.05113924667239189, "logps/chosen": -1.121537685394287, "logps/rejected": -1.3558710813522339, "loss": 1.1215, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.121537685394287, "rewards/margins": 0.23433348536491394, "rewards/rejected": -1.3558710813522339, "step": 5040 }, { "epoch": 2.700117076434186, "grad_norm": 5.484986866056879, "learning_rate": 3.0011808213139036e-08, "logits/chosen": -0.1240164190530777, "logits/rejected": -0.0998527780175209, "logps/chosen": -1.136046290397644, "logps/rejected": -1.2683727741241455, "loss": 1.136, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.136046290397644, "rewards/margins": 0.1323263794183731, "rewards/rejected": -1.2683727741241455, "step": 5045 }, { "epoch": 2.702793109215588, "grad_norm": 6.766260479417767, "learning_rate": 2.948264167543568e-08, "logits/chosen": -0.1699165552854538, "logits/rejected": -0.1331300586462021, "logps/chosen": -1.0606178045272827, "logps/rejected": -1.3137447834014893, "loss": 1.0606, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.0606178045272827, "rewards/margins": 0.2531268000602722, "rewards/rejected": -1.3137447834014893, "step": 5050 }, { "epoch": 2.7054691419969896, "grad_norm": 5.271051702801839, "learning_rate": 2.8958040099206216e-08, "logits/chosen": -0.23504877090454102, "logits/rejected": -0.17150886356830597, "logps/chosen": -1.1107003688812256, "logps/rejected": -1.280916690826416, "loss": 1.1107, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1107003688812256, "rewards/margins": 0.1702163964509964, "rewards/rejected": -1.280916690826416, "step": 5055 }, { "epoch": 2.708145174778391, "grad_norm": 5.531020569145611, "learning_rate": 2.843800857413775e-08, "logits/chosen": -0.15638892352581024, "logits/rejected": -0.12254093587398529, "logps/chosen": -1.170561671257019, "logps/rejected": -1.3564308881759644, "loss": 1.1706, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.170561671257019, "rewards/margins": 0.18586906790733337, "rewards/rejected": -1.3564308881759644, "step": 5060 }, { "epoch": 2.7108212075597926, "grad_norm": 6.117195418465577, "learning_rate": 2.7922552145578203e-08, "logits/chosen": -0.18556250631809235, "logits/rejected": -0.004349616356194019, "logps/chosen": -1.245448350906372, "logps/rejected": -1.4479674100875854, "loss": 1.2454, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.245448350906372, "rewards/margins": 0.20251917839050293, "rewards/rejected": -1.4479674100875854, "step": 5065 }, { "epoch": 2.7134972403411943, "grad_norm": 5.8617681541551185, "learning_rate": 2.7411675814488277e-08, "logits/chosen": -0.08442506939172745, "logits/rejected": 0.039942871779203415, "logps/chosen": -1.119390606880188, "logps/rejected": -1.2646725177764893, "loss": 1.1194, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.119390606880188, "rewards/margins": 0.14528211951255798, "rewards/rejected": -1.2646725177764893, "step": 5070 }, { "epoch": 2.7161732731225956, "grad_norm": 5.210351743976197, "learning_rate": 2.690538453739216e-08, "logits/chosen": -0.14774274826049805, "logits/rejected": -0.09399475157260895, "logps/chosen": -1.1585891246795654, "logps/rejected": -1.2267574071884155, "loss": 1.1586, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1585891246795654, "rewards/margins": 0.0681682601571083, "rewards/rejected": -1.2267574071884155, "step": 5075 }, { "epoch": 2.7188493059039973, "grad_norm": 5.57792243546036, "learning_rate": 2.6403683226330298e-08, "logits/chosen": -0.2116890400648117, "logits/rejected": -0.11149144172668457, "logps/chosen": -1.2379134893417358, "logps/rejected": -1.3565165996551514, "loss": 1.2379, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2379134893417358, "rewards/margins": 0.11860306560993195, "rewards/rejected": -1.3565165996551514, "step": 5080 }, { "epoch": 2.721525338685399, "grad_norm": 5.072895813964197, "learning_rate": 2.5906576748810804e-08, "logits/chosen": -0.2556164860725403, "logits/rejected": -0.16885975003242493, "logps/chosen": -1.0888261795043945, "logps/rejected": -1.402574896812439, "loss": 1.0888, "rewards/accuracies": 0.625, "rewards/chosen": -1.0888261795043945, "rewards/margins": 0.3137487769126892, "rewards/rejected": -1.402574896812439, "step": 5085 }, { "epoch": 2.7242013714668003, "grad_norm": 6.106043110250434, "learning_rate": 2.5414069927763016e-08, "logits/chosen": -0.25574764609336853, "logits/rejected": -0.13573111593723297, "logps/chosen": -1.2441829442977905, "logps/rejected": -1.397621512413025, "loss": 1.2442, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2441829442977905, "rewards/margins": 0.15343870222568512, "rewards/rejected": -1.397621512413025, "step": 5090 }, { "epoch": 2.726877404248202, "grad_norm": 4.482655799170747, "learning_rate": 2.4926167541490185e-08, "logits/chosen": -0.28304266929626465, "logits/rejected": -0.1262107789516449, "logps/chosen": -1.1783808469772339, "logps/rejected": -1.3832371234893799, "loss": 1.1784, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1783808469772339, "rewards/margins": 0.20485642552375793, "rewards/rejected": -1.3832371234893799, "step": 5095 }, { "epoch": 2.7295534370296037, "grad_norm": 6.142320044751149, "learning_rate": 2.4442874323623574e-08, "logits/chosen": -0.09005960077047348, "logits/rejected": 0.007062084041535854, "logps/chosen": -1.1235836744308472, "logps/rejected": -1.3408554792404175, "loss": 1.1236, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1235836744308472, "rewards/margins": 0.21727165579795837, "rewards/rejected": -1.3408554792404175, "step": 5100 }, { "epoch": 2.7322294698110055, "grad_norm": 6.3933082043443985, "learning_rate": 2.396419496307589e-08, "logits/chosen": -0.1892465353012085, "logits/rejected": -0.06142977997660637, "logps/chosen": -1.190131664276123, "logps/rejected": -1.2875920534133911, "loss": 1.1901, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.190131664276123, "rewards/margins": 0.0974603071808815, "rewards/rejected": -1.2875920534133911, "step": 5105 }, { "epoch": 2.7349055025924067, "grad_norm": 5.1901593636586565, "learning_rate": 2.349013410399653e-08, "logits/chosen": -0.23706801235675812, "logits/rejected": -0.11218998581171036, "logps/chosen": -1.1507718563079834, "logps/rejected": -1.3023192882537842, "loss": 1.1508, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1507718563079834, "rewards/margins": 0.15154756605625153, "rewards/rejected": -1.3023192882537842, "step": 5110 }, { "epoch": 2.7375815353738084, "grad_norm": 6.477628111333991, "learning_rate": 2.3020696345725954e-08, "logits/chosen": -0.24744561314582825, "logits/rejected": -0.08080488443374634, "logps/chosen": -1.1652326583862305, "logps/rejected": -1.3709019422531128, "loss": 1.1652, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1652326583862305, "rewards/margins": 0.2056693136692047, "rewards/rejected": -1.3709019422531128, "step": 5115 }, { "epoch": 2.7402575681552097, "grad_norm": 5.720093405267048, "learning_rate": 2.2555886242751398e-08, "logits/chosen": -0.20288994908332825, "logits/rejected": -0.17528463900089264, "logps/chosen": -1.2815059423446655, "logps/rejected": -1.4438999891281128, "loss": 1.2815, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2815059423446655, "rewards/margins": 0.16239413619041443, "rewards/rejected": -1.4438999891281128, "step": 5120 }, { "epoch": 2.7429336009366114, "grad_norm": 6.589835651791422, "learning_rate": 2.2095708304662453e-08, "logits/chosen": -0.2912960648536682, "logits/rejected": -0.11323080211877823, "logps/chosen": -1.1320594549179077, "logps/rejected": -1.34895658493042, "loss": 1.1321, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1320594549179077, "rewards/margins": 0.2168971747159958, "rewards/rejected": -1.34895658493042, "step": 5125 }, { "epoch": 2.745609633718013, "grad_norm": 4.635554596883503, "learning_rate": 2.16401669961076e-08, "logits/chosen": -0.3295147716999054, "logits/rejected": -0.167080357670784, "logps/chosen": -1.1646815538406372, "logps/rejected": -1.3534224033355713, "loss": 1.1647, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1646815538406372, "rewards/margins": 0.1887408196926117, "rewards/rejected": -1.3534224033355713, "step": 5130 }, { "epoch": 2.748285666499415, "grad_norm": 7.019780255940042, "learning_rate": 2.1189266736750532e-08, "logits/chosen": -0.11575952917337418, "logits/rejected": -0.06812619417905807, "logps/chosen": -1.1605476140975952, "logps/rejected": -1.331203818321228, "loss": 1.1605, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1605476140975952, "rewards/margins": 0.17065612971782684, "rewards/rejected": -1.331203818321228, "step": 5135 }, { "epoch": 2.750961699280816, "grad_norm": 4.766709297872366, "learning_rate": 2.0743011901227623e-08, "logits/chosen": -0.14184683561325073, "logits/rejected": -0.049594536423683167, "logps/chosen": -1.2534557580947876, "logps/rejected": -1.385242223739624, "loss": 1.2535, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2534557580947876, "rewards/margins": 0.1317864954471588, "rewards/rejected": -1.385242223739624, "step": 5140 }, { "epoch": 2.753637732062218, "grad_norm": 6.842979367327819, "learning_rate": 2.030140681910508e-08, "logits/chosen": -0.22800295054912567, "logits/rejected": -0.08031152188777924, "logps/chosen": -1.1534291505813599, "logps/rejected": -1.2762508392333984, "loss": 1.1534, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1534291505813599, "rewards/margins": 0.12282167375087738, "rewards/rejected": -1.2762508392333984, "step": 5145 }, { "epoch": 2.756313764843619, "grad_norm": 4.486448435992153, "learning_rate": 1.986445577483753e-08, "logits/chosen": -0.24245715141296387, "logits/rejected": -0.15438078343868256, "logps/chosen": -1.1604692935943604, "logps/rejected": -1.339611291885376, "loss": 1.1605, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1604692935943604, "rewards/margins": 0.17914213240146637, "rewards/rejected": -1.339611291885376, "step": 5150 }, { "epoch": 2.758989797625021, "grad_norm": 5.623927989148431, "learning_rate": 1.9432163007725765e-08, "logits/chosen": -0.2511288523674011, "logits/rejected": -0.16645702719688416, "logps/chosen": -1.2224483489990234, "logps/rejected": -1.400957703590393, "loss": 1.2224, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2224483489990234, "rewards/margins": 0.17850922048091888, "rewards/rejected": -1.400957703590393, "step": 5155 }, { "epoch": 2.7616658304064226, "grad_norm": 4.587443898341372, "learning_rate": 1.9004532711876297e-08, "logits/chosen": -0.22570574283599854, "logits/rejected": -0.17558540403842926, "logps/chosen": -1.1495441198349, "logps/rejected": -1.446333646774292, "loss": 1.1495, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1495441198349, "rewards/margins": 0.2967895567417145, "rewards/rejected": -1.446333646774292, "step": 5160 }, { "epoch": 2.7643418631878243, "grad_norm": 5.479412529555262, "learning_rate": 1.8581569036159928e-08, "logits/chosen": -0.22182950377464294, "logits/rejected": -0.07615089416503906, "logps/chosen": -1.1469885110855103, "logps/rejected": -1.3230836391448975, "loss": 1.147, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1469885110855103, "rewards/margins": 0.17609518766403198, "rewards/rejected": -1.3230836391448975, "step": 5165 }, { "epoch": 2.7670178959692255, "grad_norm": 6.12793618127254, "learning_rate": 1.8163276084172285e-08, "logits/chosen": -0.18292966485023499, "logits/rejected": -0.08844266831874847, "logps/chosen": -1.2326257228851318, "logps/rejected": -1.3448771238327026, "loss": 1.2326, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2326257228851318, "rewards/margins": 0.11225137859582901, "rewards/rejected": -1.3448771238327026, "step": 5170 }, { "epoch": 2.7696939287506273, "grad_norm": 5.240244335308906, "learning_rate": 1.7749657914193194e-08, "logits/chosen": -0.2384016513824463, "logits/rejected": -0.15568968653678894, "logps/chosen": -1.2387675046920776, "logps/rejected": -1.2979096174240112, "loss": 1.2388, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2387675046920776, "rewards/margins": 0.059142231941223145, "rewards/rejected": -1.2979096174240112, "step": 5175 }, { "epoch": 2.7723699615320285, "grad_norm": 5.567375557982347, "learning_rate": 1.7340718539148203e-08, "logits/chosen": -0.16214582324028015, "logits/rejected": -0.13355332612991333, "logps/chosen": -1.243973970413208, "logps/rejected": -1.4654653072357178, "loss": 1.244, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.243973970413208, "rewards/margins": 0.2214914858341217, "rewards/rejected": -1.4654653072357178, "step": 5180 }, { "epoch": 2.7750459943134302, "grad_norm": 4.89132199446453, "learning_rate": 1.6936461926568724e-08, "logits/chosen": -0.19983211159706116, "logits/rejected": -0.11957468837499619, "logps/chosen": -1.089228868484497, "logps/rejected": -1.2631906270980835, "loss": 1.0892, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.089228868484497, "rewards/margins": 0.1739617884159088, "rewards/rejected": -1.2631906270980835, "step": 5185 }, { "epoch": 2.777722027094832, "grad_norm": 6.2544420402699625, "learning_rate": 1.6536891998554346e-08, "logits/chosen": -0.28958266973495483, "logits/rejected": -0.17118024826049805, "logps/chosen": -1.1828479766845703, "logps/rejected": -1.2858158349990845, "loss": 1.1828, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1828479766845703, "rewards/margins": 0.1029679998755455, "rewards/rejected": -1.2858158349990845, "step": 5190 }, { "epoch": 2.7803980598762337, "grad_norm": 5.46338613210667, "learning_rate": 1.6142012631734093e-08, "logits/chosen": -0.2279481440782547, "logits/rejected": -0.12894509732723236, "logps/chosen": -1.1752735376358032, "logps/rejected": -1.317029595375061, "loss": 1.1753, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1752735376358032, "rewards/margins": 0.14175590872764587, "rewards/rejected": -1.317029595375061, "step": 5195 }, { "epoch": 2.783074092657635, "grad_norm": 7.113532411559869, "learning_rate": 1.575182765722949e-08, "logits/chosen": -0.25294214487075806, "logits/rejected": -0.14866214990615845, "logps/chosen": -1.1253039836883545, "logps/rejected": -1.2848401069641113, "loss": 1.1253, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1253039836883545, "rewards/margins": 0.15953612327575684, "rewards/rejected": -1.2848401069641113, "step": 5200 }, { "epoch": 2.783074092657635, "eval_logits/chosen": 0.05278417095541954, "eval_logits/rejected": 0.11776336282491684, "eval_logps/chosen": -1.2624114751815796, "eval_logps/rejected": -1.3865481615066528, "eval_loss": 1.262673020362854, "eval_rewards/accuracies": 0.5474777221679688, "eval_rewards/chosen": -1.2624114751815796, "eval_rewards/margins": 0.12413667887449265, "eval_rewards/rejected": -1.3865481615066528, "eval_runtime": 40.1653, "eval_samples_per_second": 33.487, "eval_steps_per_second": 8.39, "step": 5200 }, { "epoch": 2.7857501254390367, "grad_norm": 4.790631770261145, "learning_rate": 1.536634086061672e-08, "logits/chosen": -0.13223351538181305, "logits/rejected": -0.13094766438007355, "logps/chosen": -1.1831859350204468, "logps/rejected": -1.2915376424789429, "loss": 1.1832, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -1.1831859350204468, "rewards/margins": 0.1083517074584961, "rewards/rejected": -1.2915376424789429, "step": 5205 }, { "epoch": 2.788426158220438, "grad_norm": 6.33213733676369, "learning_rate": 1.4985555981890495e-08, "logits/chosen": -0.18376502394676208, "logits/rejected": -0.13123205304145813, "logps/chosen": -1.1661059856414795, "logps/rejected": -1.2773351669311523, "loss": 1.1661, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1661059856414795, "rewards/margins": 0.11122922599315643, "rewards/rejected": -1.2773351669311523, "step": 5210 }, { "epoch": 2.7911021910018396, "grad_norm": 5.011058897022669, "learning_rate": 1.4609476715427226e-08, "logits/chosen": -0.18909461796283722, "logits/rejected": -0.1277770698070526, "logps/chosen": -1.1711606979370117, "logps/rejected": -1.3674226999282837, "loss": 1.1712, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1711606979370117, "rewards/margins": 0.19626189768314362, "rewards/rejected": -1.3674226999282837, "step": 5215 }, { "epoch": 2.7937782237832414, "grad_norm": 5.038783879507043, "learning_rate": 1.4238106709949792e-08, "logits/chosen": -0.21323256194591522, "logits/rejected": -0.17240896821022034, "logps/chosen": -1.1479761600494385, "logps/rejected": -1.3175342082977295, "loss": 1.148, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1479761600494385, "rewards/margins": 0.16955788433551788, "rewards/rejected": -1.3175342082977295, "step": 5220 }, { "epoch": 2.796454256564643, "grad_norm": 5.474353807254319, "learning_rate": 1.3871449568491511e-08, "logits/chosen": -0.14836075901985168, "logits/rejected": -0.051611561328172684, "logps/chosen": -1.2330199480056763, "logps/rejected": -1.4008675813674927, "loss": 1.233, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2330199480056763, "rewards/margins": 0.1678476333618164, "rewards/rejected": -1.4008675813674927, "step": 5225 }, { "epoch": 2.7991302893460444, "grad_norm": 8.638921236751381, "learning_rate": 1.3509508848361606e-08, "logits/chosen": -0.2605668902397156, "logits/rejected": -0.15718919038772583, "logps/chosen": -1.1870616674423218, "logps/rejected": -1.314060926437378, "loss": 1.1871, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1870616674423218, "rewards/margins": 0.12699927389621735, "rewards/rejected": -1.314060926437378, "step": 5230 }, { "epoch": 2.801806322127446, "grad_norm": 5.459036730061305, "learning_rate": 1.3152288061110517e-08, "logits/chosen": -0.2362012416124344, "logits/rejected": -0.18095453083515167, "logps/chosen": -1.2097476720809937, "logps/rejected": -1.310587763786316, "loss": 1.2097, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2097476720809937, "rewards/margins": 0.10084006935358047, "rewards/rejected": -1.310587763786316, "step": 5235 }, { "epoch": 2.804482354908848, "grad_norm": 5.160586783016273, "learning_rate": 1.2799790672495814e-08, "logits/chosen": -0.2072005569934845, "logits/rejected": -0.047491393983364105, "logps/chosen": -1.1913962364196777, "logps/rejected": -1.311750054359436, "loss": 1.1914, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1913962364196777, "rewards/margins": 0.12035369873046875, "rewards/rejected": -1.311750054359436, "step": 5240 }, { "epoch": 2.807158387690249, "grad_norm": 5.355173264333818, "learning_rate": 1.2452020102448835e-08, "logits/chosen": -0.1324390172958374, "logits/rejected": -0.09507165849208832, "logps/chosen": -1.1465109586715698, "logps/rejected": -1.2734403610229492, "loss": 1.1465, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1465109586715698, "rewards/margins": 0.1269294172525406, "rewards/rejected": -1.2734403610229492, "step": 5245 }, { "epoch": 2.8098344204716508, "grad_norm": 5.595419436265161, "learning_rate": 1.2108979725041103e-08, "logits/chosen": -0.22274549305438995, "logits/rejected": -0.12368407100439072, "logps/chosen": -1.217674732208252, "logps/rejected": -1.329187035560608, "loss": 1.2177, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.217674732208252, "rewards/margins": 0.11151238530874252, "rewards/rejected": -1.329187035560608, "step": 5250 }, { "epoch": 2.8125104532530525, "grad_norm": 5.292101938578717, "learning_rate": 1.1770672868451958e-08, "logits/chosen": -0.18631181120872498, "logits/rejected": -0.02948579750955105, "logps/chosen": -1.2062535285949707, "logps/rejected": -1.2603120803833008, "loss": 1.2063, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2062535285949707, "rewards/margins": 0.0540585033595562, "rewards/rejected": -1.2603120803833008, "step": 5255 }, { "epoch": 2.8151864860344538, "grad_norm": 6.878027461080006, "learning_rate": 1.1437102814935872e-08, "logits/chosen": -0.19709287583827972, "logits/rejected": -0.14394691586494446, "logps/chosen": -1.1880168914794922, "logps/rejected": -1.3368690013885498, "loss": 1.188, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1880168914794922, "rewards/margins": 0.14885208010673523, "rewards/rejected": -1.3368690013885498, "step": 5260 }, { "epoch": 2.8178625188158555, "grad_norm": 5.357729177175309, "learning_rate": 1.1108272800791018e-08, "logits/chosen": -0.2982117831707001, "logits/rejected": -0.14496132731437683, "logps/chosen": -1.3641748428344727, "logps/rejected": -1.3889119625091553, "loss": 1.3642, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3641748428344727, "rewards/margins": 0.02473713830113411, "rewards/rejected": -1.3889119625091553, "step": 5265 }, { "epoch": 2.820538551597257, "grad_norm": 5.122278939180805, "learning_rate": 1.078418601632769e-08, "logits/chosen": -0.1779308319091797, "logits/rejected": -0.0698479637503624, "logps/chosen": -1.1304622888565063, "logps/rejected": -1.37287437915802, "loss": 1.1305, "rewards/accuracies": 0.625, "rewards/chosen": -1.1304622888565063, "rewards/margins": 0.24241212010383606, "rewards/rejected": -1.37287437915802, "step": 5270 }, { "epoch": 2.8232145843786585, "grad_norm": 6.266688976719186, "learning_rate": 1.0464845605837159e-08, "logits/chosen": -0.17148005962371826, "logits/rejected": -0.051482170820236206, "logps/chosen": -1.2139627933502197, "logps/rejected": -1.3756651878356934, "loss": 1.214, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2139627933502197, "rewards/margins": 0.1617024540901184, "rewards/rejected": -1.3756651878356934, "step": 5275 }, { "epoch": 2.82589061716006, "grad_norm": 5.478713291506253, "learning_rate": 1.0150254667561642e-08, "logits/chosen": -0.17016106843948364, "logits/rejected": -0.04973593354225159, "logps/chosen": -1.2339063882827759, "logps/rejected": -1.3303006887435913, "loss": 1.2339, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2339063882827759, "rewards/margins": 0.09639439731836319, "rewards/rejected": -1.3303006887435913, "step": 5280 }, { "epoch": 2.828566649941462, "grad_norm": 6.33731463330771, "learning_rate": 9.840416253663719e-09, "logits/chosen": -0.21851710975170135, "logits/rejected": -0.1465352177619934, "logps/chosen": -1.1618664264678955, "logps/rejected": -1.279360055923462, "loss": 1.1619, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1618664264678955, "rewards/margins": 0.11749361455440521, "rewards/rejected": -1.279360055923462, "step": 5285 }, { "epoch": 2.8312426827228636, "grad_norm": 4.457632667855535, "learning_rate": 9.535333370197074e-09, "logits/chosen": -0.20362329483032227, "logits/rejected": -0.10357587039470673, "logps/chosen": -1.2354543209075928, "logps/rejected": -1.3443900346755981, "loss": 1.2355, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2354543209075928, "rewards/margins": 0.10893561691045761, "rewards/rejected": -1.3443900346755981, "step": 5290 }, { "epoch": 2.833918715504265, "grad_norm": 4.021216219141969, "learning_rate": 9.23500897707713e-09, "logits/chosen": -0.24143080413341522, "logits/rejected": -0.09366939961910248, "logps/chosen": -1.2757933139801025, "logps/rejected": -1.4686439037322998, "loss": 1.2758, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2757933139801025, "rewards/margins": 0.1928505301475525, "rewards/rejected": -1.4686439037322998, "step": 5295 }, { "epoch": 2.8365947482856666, "grad_norm": 6.000427120958321, "learning_rate": 8.939445988052574e-09, "logits/chosen": -0.2008521556854248, "logits/rejected": -0.1785990297794342, "logps/chosen": -1.1802165508270264, "logps/rejected": -1.4273624420166016, "loss": 1.1802, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1802165508270264, "rewards/margins": 0.2471458911895752, "rewards/rejected": -1.4273624420166016, "step": 5300 }, { "epoch": 2.839270781067068, "grad_norm": 5.129546492016116, "learning_rate": 8.648647270676656e-09, "logits/chosen": -0.2029685229063034, "logits/rejected": -0.09293171763420105, "logps/chosen": -1.243990182876587, "logps/rejected": -1.4096708297729492, "loss": 1.244, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.243990182876587, "rewards/margins": 0.1656806766986847, "rewards/rejected": -1.4096708297729492, "step": 5305 }, { "epoch": 2.8419468138484696, "grad_norm": 3.6635619978756035, "learning_rate": 8.362615646279991e-09, "logits/chosen": -0.3039104640483856, "logits/rejected": -0.10996575653553009, "logps/chosen": -1.127610445022583, "logps/rejected": -1.3549935817718506, "loss": 1.1276, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.127610445022583, "rewards/margins": 0.22738322615623474, "rewards/rejected": -1.3549935817718506, "step": 5310 }, { "epoch": 2.8446228466298713, "grad_norm": 5.751878403387536, "learning_rate": 8.081353889942466e-09, "logits/chosen": -0.10042442381381989, "logits/rejected": -0.03391529247164726, "logps/chosen": -1.166150450706482, "logps/rejected": -1.2460756301879883, "loss": 1.1662, "rewards/accuracies": 0.5625, "rewards/chosen": -1.166150450706482, "rewards/margins": 0.07992511242628098, "rewards/rejected": -1.2460756301879883, "step": 5315 }, { "epoch": 2.847298879411273, "grad_norm": 4.751078612692468, "learning_rate": 7.804864730467042e-09, "logits/chosen": -0.11679749190807343, "logits/rejected": -0.06671228259801865, "logps/chosen": -1.1706610918045044, "logps/rejected": -1.2474629878997803, "loss": 1.1707, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.1706610918045044, "rewards/margins": 0.07680189609527588, "rewards/rejected": -1.2474629878997803, "step": 5320 }, { "epoch": 2.8499749121926743, "grad_norm": 6.075921845590768, "learning_rate": 7.533150850352665e-09, "logits/chosen": -0.16341452300548553, "logits/rejected": -0.07650180906057358, "logps/chosen": -1.2123228311538696, "logps/rejected": -1.4129843711853027, "loss": 1.2123, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2123228311538696, "rewards/margins": 0.2006615400314331, "rewards/rejected": -1.4129843711853027, "step": 5325 }, { "epoch": 2.852650944974076, "grad_norm": 6.4459959299488325, "learning_rate": 7.2662148857686175e-09, "logits/chosen": -0.12143876403570175, "logits/rejected": -0.0847756415605545, "logps/chosen": -1.1499342918395996, "logps/rejected": -1.3209540843963623, "loss": 1.1499, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1499342918395996, "rewards/margins": 0.17101994156837463, "rewards/rejected": -1.3209540843963623, "step": 5330 }, { "epoch": 2.8553269777554773, "grad_norm": 6.289630818578041, "learning_rate": 7.0040594265287635e-09, "logits/chosen": -0.10154581069946289, "logits/rejected": -0.12143011391162872, "logps/chosen": -1.1877286434173584, "logps/rejected": -1.289803147315979, "loss": 1.1877, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1877286434173584, "rewards/margins": 0.10207446664571762, "rewards/rejected": -1.289803147315979, "step": 5335 }, { "epoch": 2.858003010536879, "grad_norm": 5.751409855963283, "learning_rate": 6.746687016066566e-09, "logits/chosen": -0.13757766783237457, "logits/rejected": -0.1346289962530136, "logps/chosen": -1.2165266275405884, "logps/rejected": -1.3018563985824585, "loss": 1.2165, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2165266275405884, "rewards/margins": 0.08532998710870743, "rewards/rejected": -1.3018563985824585, "step": 5340 }, { "epoch": 2.8606790433182807, "grad_norm": 4.669318219057626, "learning_rate": 6.494100151410276e-09, "logits/chosen": -0.286662220954895, "logits/rejected": -0.16184048354625702, "logps/chosen": -1.1508328914642334, "logps/rejected": -1.3281511068344116, "loss": 1.1508, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1508328914642334, "rewards/margins": 0.1773180514574051, "rewards/rejected": -1.3281511068344116, "step": 5345 }, { "epoch": 2.8633550760996824, "grad_norm": 6.051145955192834, "learning_rate": 6.246301283158728e-09, "logits/chosen": -0.1367761641740799, "logits/rejected": -0.17481480538845062, "logps/chosen": -1.2669693231582642, "logps/rejected": -1.389802098274231, "loss": 1.267, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2669693231582642, "rewards/margins": 0.12283267080783844, "rewards/rejected": -1.389802098274231, "step": 5350 }, { "epoch": 2.8660311088810837, "grad_norm": 6.6752041497249115, "learning_rate": 6.0032928154576944e-09, "logits/chosen": -0.23038676381111145, "logits/rejected": -0.16715243458747864, "logps/chosen": -1.1792786121368408, "logps/rejected": -1.311183214187622, "loss": 1.1793, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.1792786121368408, "rewards/margins": 0.1319047510623932, "rewards/rejected": -1.311183214187622, "step": 5355 }, { "epoch": 2.8687071416624854, "grad_norm": 5.4000304627953275, "learning_rate": 5.76507710597629e-09, "logits/chosen": -0.18941862881183624, "logits/rejected": -0.04981934651732445, "logps/chosen": -1.200024127960205, "logps/rejected": -1.3578031063079834, "loss": 1.2, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.200024127960205, "rewards/margins": 0.15777911245822906, "rewards/rejected": -1.3578031063079834, "step": 5360 }, { "epoch": 2.8713831744438867, "grad_norm": 5.785924692967314, "learning_rate": 5.531656465884438e-09, "logits/chosen": -0.24784822762012482, "logits/rejected": -0.133918896317482, "logps/chosen": -1.195948839187622, "logps/rejected": -1.3674269914627075, "loss": 1.1959, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.195948839187622, "rewards/margins": 0.17147839069366455, "rewards/rejected": -1.3674269914627075, "step": 5365 }, { "epoch": 2.8740592072252884, "grad_norm": 7.132438571319397, "learning_rate": 5.303033159830217e-09, "logits/chosen": -0.12826357781887054, "logits/rejected": -0.11071465164422989, "logps/chosen": -1.1684768199920654, "logps/rejected": -1.2549574375152588, "loss": 1.1685, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.1684768199920654, "rewards/margins": 0.08648072928190231, "rewards/rejected": -1.2549574375152588, "step": 5370 }, { "epoch": 2.87673524000669, "grad_norm": 5.747037083974847, "learning_rate": 5.079209405917939e-09, "logits/chosen": -0.18757368624210358, "logits/rejected": -0.1114475354552269, "logps/chosen": -1.1432888507843018, "logps/rejected": -1.3564655780792236, "loss": 1.1433, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1432888507843018, "rewards/margins": 0.2131766974925995, "rewards/rejected": -1.3564655780792236, "step": 5375 }, { "epoch": 2.879411272788092, "grad_norm": 4.538699027324028, "learning_rate": 4.860187375686664e-09, "logits/chosen": -0.20114584267139435, "logits/rejected": -0.041608959436416626, "logps/chosen": -1.3206082582473755, "logps/rejected": -1.4236054420471191, "loss": 1.3206, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3206082582473755, "rewards/margins": 0.10299708694219589, "rewards/rejected": -1.4236054420471191, "step": 5380 }, { "epoch": 2.882087305569493, "grad_norm": 5.206768348965183, "learning_rate": 4.64596919408905e-09, "logits/chosen": -0.14151960611343384, "logits/rejected": -0.0887361615896225, "logps/chosen": -1.1718854904174805, "logps/rejected": -1.2888778448104858, "loss": 1.1719, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1718854904174805, "rewards/margins": 0.11699223518371582, "rewards/rejected": -1.2888778448104858, "step": 5385 }, { "epoch": 2.884763338350895, "grad_norm": 4.675197614617292, "learning_rate": 4.436556939470814e-09, "logits/chosen": -0.16116514801979065, "logits/rejected": -0.09017006307840347, "logps/chosen": -1.219369888305664, "logps/rejected": -1.3267099857330322, "loss": 1.2194, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.219369888305664, "rewards/margins": 0.1073400154709816, "rewards/rejected": -1.3267099857330322, "step": 5390 }, { "epoch": 2.887439371132296, "grad_norm": 4.8146770679967394, "learning_rate": 4.23195264355064e-09, "logits/chosen": -0.3272706866264343, "logits/rejected": -0.18220779299736023, "logps/chosen": -1.1447089910507202, "logps/rejected": -1.3249528408050537, "loss": 1.1447, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1447089910507202, "rewards/margins": 0.1802438497543335, "rewards/rejected": -1.3249528408050537, "step": 5395 }, { "epoch": 2.890115403913698, "grad_norm": 5.603229197800884, "learning_rate": 4.032158291400245e-09, "logits/chosen": -0.2212717980146408, "logits/rejected": -0.025863762944936752, "logps/chosen": -1.1981313228607178, "logps/rejected": -1.516899824142456, "loss": 1.1981, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1981313228607178, "rewards/margins": 0.31876862049102783, "rewards/rejected": -1.516899824142456, "step": 5400 }, { "epoch": 2.8927914366950995, "grad_norm": 6.977022888203803, "learning_rate": 3.837175821425398e-09, "logits/chosen": -0.12772777676582336, "logits/rejected": -0.11354750394821167, "logps/chosen": -1.2380201816558838, "logps/rejected": -1.4254734516143799, "loss": 1.238, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2380201816558838, "rewards/margins": 0.18745338916778564, "rewards/rejected": -1.4254734516143799, "step": 5405 }, { "epoch": 2.8954674694765012, "grad_norm": 4.806300003259373, "learning_rate": 3.6470071253467683e-09, "logits/chosen": -0.13121911883354187, "logits/rejected": -0.04247012734413147, "logps/chosen": -1.1847097873687744, "logps/rejected": -1.4286596775054932, "loss": 1.1847, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1847097873687744, "rewards/margins": 0.24394996464252472, "rewards/rejected": -1.4286596775054932, "step": 5410 }, { "epoch": 2.8981435022579025, "grad_norm": 4.032178946059019, "learning_rate": 3.461654048181939e-09, "logits/chosen": -0.20015370845794678, "logits/rejected": -0.07000967860221863, "logps/chosen": -1.2113254070281982, "logps/rejected": -1.357100248336792, "loss": 1.2113, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2113254070281982, "rewards/margins": 0.14577504992485046, "rewards/rejected": -1.357100248336792, "step": 5415 }, { "epoch": 2.9008195350393042, "grad_norm": 4.58896521601528, "learning_rate": 3.281118388227255e-09, "logits/chosen": -0.17084988951683044, "logits/rejected": -0.12472991645336151, "logps/chosen": -1.1186854839324951, "logps/rejected": -1.2727420330047607, "loss": 1.1187, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1186854839324951, "rewards/margins": 0.154056578874588, "rewards/rejected": -1.2727420330047607, "step": 5420 }, { "epoch": 2.903495567820706, "grad_norm": 7.310688318726059, "learning_rate": 3.1054018970405048e-09, "logits/chosen": -0.17354199290275574, "logits/rejected": -0.08744578063488007, "logps/chosen": -1.2193574905395508, "logps/rejected": -1.3915101289749146, "loss": 1.2194, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2193574905395508, "rewards/margins": 0.172152578830719, "rewards/rejected": -1.3915101289749146, "step": 5425 }, { "epoch": 2.906171600602107, "grad_norm": 5.088112489912727, "learning_rate": 2.9345062794238207e-09, "logits/chosen": -0.17608974874019623, "logits/rejected": -0.04766732454299927, "logps/chosen": -1.1696498394012451, "logps/rejected": -1.421309232711792, "loss": 1.1696, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1696498394012451, "rewards/margins": 0.25165921449661255, "rewards/rejected": -1.421309232711792, "step": 5430 }, { "epoch": 2.908847633383509, "grad_norm": 7.05494082604633, "learning_rate": 2.7684331934072492e-09, "logits/chosen": -0.2800789773464203, "logits/rejected": -0.19829480350017548, "logps/chosen": -1.1665462255477905, "logps/rejected": -1.4256784915924072, "loss": 1.1665, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1665462255477905, "rewards/margins": 0.25913217663764954, "rewards/rejected": -1.4256784915924072, "step": 5435 }, { "epoch": 2.9115236661649107, "grad_norm": 6.717624274327547, "learning_rate": 2.6071842502326526e-09, "logits/chosen": -0.21893565356731415, "logits/rejected": -0.13619257509708405, "logps/chosen": -1.1777057647705078, "logps/rejected": -1.4285337924957275, "loss": 1.1777, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1777057647705078, "rewards/margins": 0.25082796812057495, "rewards/rejected": -1.4285337924957275, "step": 5440 }, { "epoch": 2.9141996989463124, "grad_norm": 5.2061100054353195, "learning_rate": 2.450761014337888e-09, "logits/chosen": -0.034659214317798615, "logits/rejected": -0.038635872304439545, "logps/chosen": -1.147814393043518, "logps/rejected": -1.4134447574615479, "loss": 1.1478, "rewards/accuracies": 0.59375, "rewards/chosen": -1.147814393043518, "rewards/margins": 0.265630304813385, "rewards/rejected": -1.4134447574615479, "step": 5445 }, { "epoch": 2.9168757317277136, "grad_norm": 5.1587973879945395, "learning_rate": 2.299165003341985e-09, "logits/chosen": -0.1330794245004654, "logits/rejected": -0.051003240048885345, "logps/chosen": -1.2063974142074585, "logps/rejected": -1.4323190450668335, "loss": 1.2064, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2063974142074585, "rewards/margins": 0.22592172026634216, "rewards/rejected": -1.4323190450668335, "step": 5450 }, { "epoch": 2.9195517645091154, "grad_norm": 4.9857350250920405, "learning_rate": 2.1523976880299945e-09, "logits/chosen": -0.19212231040000916, "logits/rejected": -0.053259432315826416, "logps/chosen": -1.1774660348892212, "logps/rejected": -1.2599360942840576, "loss": 1.1775, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1774660348892212, "rewards/margins": 0.0824701189994812, "rewards/rejected": -1.2599360942840576, "step": 5455 }, { "epoch": 2.9222277972905166, "grad_norm": 6.027293913178992, "learning_rate": 2.010460492339161e-09, "logits/chosen": -0.19847995042800903, "logits/rejected": -0.13442733883857727, "logps/chosen": -1.199053406715393, "logps/rejected": -1.371095895767212, "loss": 1.1991, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.199053406715393, "rewards/margins": 0.17204253375530243, "rewards/rejected": -1.371095895767212, "step": 5460 }, { "epoch": 2.9249038300719183, "grad_norm": 5.936850367931928, "learning_rate": 1.8733547933446614e-09, "logits/chosen": -0.2706778049468994, "logits/rejected": -0.13571687042713165, "logps/chosen": -1.2424981594085693, "logps/rejected": -1.3454482555389404, "loss": 1.2425, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2424981594085693, "rewards/margins": 0.10295015573501587, "rewards/rejected": -1.3454482555389404, "step": 5465 }, { "epoch": 2.92757986285332, "grad_norm": 6.014046123105168, "learning_rate": 1.7410819212467231e-09, "logits/chosen": -0.16208641231060028, "logits/rejected": -0.09876864403486252, "logps/chosen": -1.1232110261917114, "logps/rejected": -1.292966604232788, "loss": 1.1232, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1232110261917114, "rewards/margins": 0.1697554886341095, "rewards/rejected": -1.292966604232788, "step": 5470 }, { "epoch": 2.9302558956347218, "grad_norm": 5.712057175577949, "learning_rate": 1.613643159357192e-09, "logits/chosen": -0.11771228164434433, "logits/rejected": -0.16313078999519348, "logps/chosen": -1.0954174995422363, "logps/rejected": -1.2577972412109375, "loss": 1.0954, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0954174995422363, "rewards/margins": 0.162379652261734, "rewards/rejected": -1.2577972412109375, "step": 5475 }, { "epoch": 2.932931928416123, "grad_norm": 5.033523502278854, "learning_rate": 1.4910397440875967e-09, "logits/chosen": -0.18696366250514984, "logits/rejected": -0.11766360700130463, "logps/chosen": -1.1988705396652222, "logps/rejected": -1.3083391189575195, "loss": 1.1989, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1988705396652222, "rewards/margins": 0.1094684824347496, "rewards/rejected": -1.3083391189575195, "step": 5480 }, { "epoch": 2.9356079611975248, "grad_norm": 5.086717232087232, "learning_rate": 1.3732728649368253e-09, "logits/chosen": -0.1280520260334015, "logits/rejected": -0.00840255431830883, "logps/chosen": -1.1732430458068848, "logps/rejected": -1.3137986660003662, "loss": 1.1732, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1732430458068848, "rewards/margins": 0.1405554711818695, "rewards/rejected": -1.3137986660003662, "step": 5485 }, { "epoch": 2.938283993978926, "grad_norm": 5.5801936764696976, "learning_rate": 1.260343664479524e-09, "logits/chosen": -0.1856532096862793, "logits/rejected": -0.16944333910942078, "logps/chosen": -1.1771879196166992, "logps/rejected": -1.2218797206878662, "loss": 1.1772, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.1771879196166992, "rewards/margins": 0.04469165951013565, "rewards/rejected": -1.2218797206878662, "step": 5490 }, { "epoch": 2.9409600267603278, "grad_norm": 5.297794359028156, "learning_rate": 1.1522532383554384e-09, "logits/chosen": -0.2559570074081421, "logits/rejected": -0.09719814360141754, "logps/chosen": -1.1482151746749878, "logps/rejected": -1.3596141338348389, "loss": 1.1482, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1482151746749878, "rewards/margins": 0.2113989293575287, "rewards/rejected": -1.3596141338348389, "step": 5495 }, { "epoch": 2.9436360595417295, "grad_norm": 5.413204713061657, "learning_rate": 1.049002635258256e-09, "logits/chosen": -0.19594617187976837, "logits/rejected": -0.11011216789484024, "logps/chosen": -1.2074006795883179, "logps/rejected": -1.247920274734497, "loss": 1.2074, "rewards/accuracies": 0.5, "rewards/chosen": -1.2074006795883179, "rewards/margins": 0.04051954299211502, "rewards/rejected": -1.247920274734497, "step": 5500 }, { "epoch": 2.946312092323131, "grad_norm": 4.923586327581736, "learning_rate": 9.505928569258358e-10, "logits/chosen": -0.10183026641607285, "logits/rejected": -0.11270435154438019, "logps/chosen": -1.1918495893478394, "logps/rejected": -1.3893471956253052, "loss": 1.1918, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1918495893478394, "rewards/margins": 0.19749745726585388, "rewards/rejected": -1.3893471956253052, "step": 5505 }, { "epoch": 2.9489881251045325, "grad_norm": 7.031682671225751, "learning_rate": 8.57024858130273e-10, "logits/chosen": -0.1908537745475769, "logits/rejected": -0.11992889642715454, "logps/chosen": -1.189032793045044, "logps/rejected": -1.3823628425598145, "loss": 1.189, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.189032793045044, "rewards/margins": 0.19332988560199738, "rewards/rejected": -1.3823628425598145, "step": 5510 }, { "epoch": 2.951664157885934, "grad_norm": 6.296364262321986, "learning_rate": 7.682995466686826e-10, "logits/chosen": -0.26160120964050293, "logits/rejected": -0.16374163329601288, "logps/chosen": -1.1421865224838257, "logps/rejected": -1.402442455291748, "loss": 1.1422, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1421865224838257, "rewards/margins": 0.2602558434009552, "rewards/rejected": -1.402442455291748, "step": 5515 }, { "epoch": 2.9543401906673354, "grad_norm": 5.249376553428297, "learning_rate": 6.844177833543741e-10, "logits/chosen": -0.16666463017463684, "logits/rejected": -0.13466012477874756, "logps/chosen": -1.1322290897369385, "logps/rejected": -1.2921251058578491, "loss": 1.1322, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1322290897369385, "rewards/margins": 0.15989604592323303, "rewards/rejected": -1.2921251058578491, "step": 5520 }, { "epoch": 2.957016223448737, "grad_norm": 4.882416823476414, "learning_rate": 6.053803820087467e-10, "logits/chosen": -0.18863682448863983, "logits/rejected": -0.10109467804431915, "logps/chosen": -1.1694958209991455, "logps/rejected": -1.3831243515014648, "loss": 1.1695, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1694958209991455, "rewards/margins": 0.21362848579883575, "rewards/rejected": -1.3831243515014648, "step": 5525 }, { "epoch": 2.959692256230139, "grad_norm": 5.899590823387211, "learning_rate": 5.311881094528514e-10, "logits/chosen": -0.2196766436100006, "logits/rejected": -0.04926829785108566, "logps/chosen": -1.2730305194854736, "logps/rejected": -1.3407198190689087, "loss": 1.273, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2730305194854736, "rewards/margins": 0.06768918037414551, "rewards/rejected": -1.3407198190689087, "step": 5530 }, { "epoch": 2.9623682890115406, "grad_norm": 6.233770036189493, "learning_rate": 4.6184168550050806e-10, "logits/chosen": -0.21866372227668762, "logits/rejected": -0.19956882297992706, "logps/chosen": -1.1595227718353271, "logps/rejected": -1.217761516571045, "loss": 1.1595, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1595227718353271, "rewards/margins": 0.05823869630694389, "rewards/rejected": -1.217761516571045, "step": 5535 }, { "epoch": 2.965044321792942, "grad_norm": 5.634651868750121, "learning_rate": 3.973417829510328e-10, "logits/chosen": -0.2976319193840027, "logits/rejected": -0.1756175458431244, "logps/chosen": -1.2344791889190674, "logps/rejected": -1.3104290962219238, "loss": 1.2345, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2344791889190674, "rewards/margins": 0.07595004141330719, "rewards/rejected": -1.3104290962219238, "step": 5540 }, { "epoch": 2.9677203545743436, "grad_norm": 5.254253154849797, "learning_rate": 3.3768902758274377e-10, "logits/chosen": -0.19602517783641815, "logits/rejected": -0.12964285910129547, "logps/chosen": -1.1196893453598022, "logps/rejected": -1.2611335515975952, "loss": 1.1197, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1196893453598022, "rewards/margins": 0.14144405722618103, "rewards/rejected": -1.2611335515975952, "step": 5545 }, { "epoch": 2.970396387355745, "grad_norm": 5.870056857404476, "learning_rate": 2.8288399814691e-10, "logits/chosen": -0.12271688878536224, "logits/rejected": -0.03449391573667526, "logps/chosen": -1.2522612810134888, "logps/rejected": -1.3625961542129517, "loss": 1.2523, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2522612810134888, "rewards/margins": 0.11033464968204498, "rewards/rejected": -1.3625961542129517, "step": 5550 }, { "epoch": 2.9730724201371466, "grad_norm": 7.087604020772203, "learning_rate": 2.3292722636220066e-10, "logits/chosen": -0.22375154495239258, "logits/rejected": -0.06741191446781158, "logps/chosen": -1.2814875841140747, "logps/rejected": -1.3807151317596436, "loss": 1.2815, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2814875841140747, "rewards/margins": 0.09922753274440765, "rewards/rejected": -1.3807151317596436, "step": 5555 }, { "epoch": 2.9757484529185483, "grad_norm": 5.423783373678054, "learning_rate": 1.8781919690946668e-10, "logits/chosen": -0.1711069792509079, "logits/rejected": -0.18257930874824524, "logps/chosen": -1.203863501548767, "logps/rejected": -1.241209626197815, "loss": 1.2039, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.203863501548767, "rewards/margins": 0.037346214056015015, "rewards/rejected": -1.241209626197815, "step": 5560 }, { "epoch": 2.97842448569995, "grad_norm": 6.262557695918883, "learning_rate": 1.4756034742696711e-10, "logits/chosen": -0.24587690830230713, "logits/rejected": -0.2339576780796051, "logps/chosen": -1.1813150644302368, "logps/rejected": -1.2543747425079346, "loss": 1.1813, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.1813150644302368, "rewards/margins": 0.07305969297885895, "rewards/rejected": -1.2543747425079346, "step": 5565 }, { "epoch": 2.9811005184813513, "grad_norm": 6.305670335810419, "learning_rate": 1.12151068506261e-10, "logits/chosen": -0.15528343617916107, "logits/rejected": -0.07148648053407669, "logps/chosen": -1.1413120031356812, "logps/rejected": -1.3781181573867798, "loss": 1.1413, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1413120031356812, "rewards/margins": 0.23680610954761505, "rewards/rejected": -1.3781181573867798, "step": 5570 }, { "epoch": 2.983776551262753, "grad_norm": 5.216245650965476, "learning_rate": 8.159170368826629e-11, "logits/chosen": -0.21261830627918243, "logits/rejected": -0.111419178545475, "logps/chosen": -1.167712926864624, "logps/rejected": -1.3613389730453491, "loss": 1.1677, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.167712926864624, "rewards/margins": 0.19362585246562958, "rewards/rejected": -1.3613389730453491, "step": 5575 }, { "epoch": 2.9864525840441547, "grad_norm": 6.671368858535387, "learning_rate": 5.588254946015114e-11, "logits/chosen": -0.2762644588947296, "logits/rejected": -0.0992206260561943, "logps/chosen": -1.0955836772918701, "logps/rejected": -1.358459711074829, "loss": 1.0956, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0955836772918701, "rewards/margins": 0.2628757953643799, "rewards/rejected": -1.358459711074829, "step": 5580 }, { "epoch": 2.989128616825556, "grad_norm": 5.171880039132064, "learning_rate": 3.502385525216978e-11, "logits/chosen": -0.2609833776950836, "logits/rejected": -0.140590637922287, "logps/chosen": -1.2149375677108765, "logps/rejected": -1.3585855960845947, "loss": 1.2149, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2149375677108765, "rewards/margins": 0.14364801347255707, "rewards/rejected": -1.3585855960845947, "step": 5585 }, { "epoch": 2.9918046496069577, "grad_norm": 5.577265460550481, "learning_rate": 1.901582343555308e-11, "logits/chosen": -0.2048136293888092, "logits/rejected": -0.17846134305000305, "logps/chosen": -1.2463903427124023, "logps/rejected": -1.3096749782562256, "loss": 1.2464, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2463903427124023, "rewards/margins": 0.06328479945659637, "rewards/rejected": -1.3096749782562256, "step": 5590 }, { "epoch": 2.9944806823883594, "grad_norm": 4.962044794687051, "learning_rate": 7.858609320232634e-12, "logits/chosen": -0.19899597764015198, "logits/rejected": -0.1218920573592186, "logps/chosen": -1.1157242059707642, "logps/rejected": -1.30050528049469, "loss": 1.1157, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1157242059707642, "rewards/margins": 0.18478113412857056, "rewards/rejected": -1.30050528049469, "step": 5595 }, { "epoch": 2.9971567151697607, "grad_norm": 5.798511098782109, "learning_rate": 1.5523211535639624e-12, "logits/chosen": -0.20105350017547607, "logits/rejected": -0.14885543286800385, "logps/chosen": -1.1657222509384155, "logps/rejected": -1.3627907037734985, "loss": 1.1657, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1657222509384155, "rewards/margins": 0.19706842303276062, "rewards/rejected": -1.3627907037734985, "step": 5600 }, { "epoch": 2.9971567151697607, "eval_logits/chosen": 0.023058688268065453, "eval_logits/rejected": 0.08547348529100418, "eval_logps/chosen": -1.262244462966919, "eval_logps/rejected": -1.3865187168121338, "eval_loss": 1.262499451637268, "eval_rewards/accuracies": 0.5467358827590942, "eval_rewards/chosen": -1.262244462966919, "eval_rewards/margins": 0.12427420914173126, "eval_rewards/rejected": -1.3865187168121338, "eval_runtime": 40.0447, "eval_samples_per_second": 33.587, "eval_steps_per_second": 8.416, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, "train_loss": 1.23147624156715, "train_runtime": 30062.0122, "train_samples_per_second": 5.967, "train_steps_per_second": 0.186 } ], "logging_steps": 5, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }