{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 1891, "global_step": 7564, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0026441036488630354, "grad_norm": 38.75, "learning_rate": 9.986779481755685e-05, "logits/chosen": -86.0, "logits/rejected": -82.0, "logps/chosen": -624.0, "logps/rejected": -494.0, "loss": 1.0105, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 1.4140625, "rewards/margins": 0.2353515625, "rewards/rejected": 1.1796875, "step": 10 }, { "epoch": 0.005288207297726071, "grad_norm": 80.5, "learning_rate": 9.97355896351137e-05, "logits/chosen": -82.5, "logits/rejected": -80.0, "logps/chosen": -572.0, "logps/rejected": -506.0, "loss": 0.9611, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 2.09375, "rewards/margins": 0.39453125, "rewards/rejected": 1.6875, "step": 20 }, { "epoch": 0.007932310946589107, "grad_norm": 81.0, "learning_rate": 9.960338445267055e-05, "logits/chosen": -82.5, "logits/rejected": -79.5, "logps/chosen": -580.0, "logps/rejected": -470.0, "loss": 1.0371, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 1.25, "rewards/margins": 0.40234375, "rewards/rejected": 0.8515625, "step": 30 }, { "epoch": 0.010576414595452142, "grad_norm": 35.25, "learning_rate": 9.94711792702274e-05, "logits/chosen": -79.0, "logits/rejected": -78.5, "logps/chosen": -592.0, "logps/rejected": -478.0, "loss": 0.9102, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 2.0625, "rewards/margins": 0.9609375, "rewards/rejected": 1.09375, "step": 40 }, { "epoch": 0.013220518244315178, "grad_norm": 32.5, "learning_rate": 9.933897408778425e-05, "logits/chosen": -78.5, "logits/rejected": -77.0, "logps/chosen": -600.0, "logps/rejected": -480.0, "loss": 0.8436, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 3.375, "rewards/margins": 1.3046875, "rewards/rejected": 2.078125, "step": 50 }, { "epoch": 0.015864621893178214, "grad_norm": 36.25, "learning_rate": 9.92067689053411e-05, "logits/chosen": -78.0, "logits/rejected": -77.5, "logps/chosen": -548.0, "logps/rejected": -474.0, "loss": 0.8244, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.390625, "rewards/margins": 1.015625, "rewards/rejected": 1.375, "step": 60 }, { "epoch": 0.01850872554204125, "grad_norm": 29.375, "learning_rate": 9.907456372289795e-05, "logits/chosen": -83.0, "logits/rejected": -80.5, "logps/chosen": -540.0, "logps/rejected": -464.0, "loss": 0.6811, "rewards/accuracies": 0.625, "rewards/chosen": 2.03125, "rewards/margins": 1.125, "rewards/rejected": 0.90234375, "step": 70 }, { "epoch": 0.021152829190904283, "grad_norm": 54.75, "learning_rate": 9.894235854045479e-05, "logits/chosen": -83.5, "logits/rejected": -82.0, "logps/chosen": -576.0, "logps/rejected": -460.0, "loss": 0.7408, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.8203125, "rewards/margins": 0.87890625, "rewards/rejected": 0.9453125, "step": 80 }, { "epoch": 0.023796932839767318, "grad_norm": 54.25, "learning_rate": 9.881015335801163e-05, "logits/chosen": -82.0, "logits/rejected": -81.0, "logps/chosen": -552.0, "logps/rejected": -458.0, "loss": 0.7426, "rewards/accuracies": 0.6875, "rewards/chosen": 2.5, "rewards/margins": 0.9375, "rewards/rejected": 1.5703125, "step": 90 }, { "epoch": 0.026441036488630356, "grad_norm": 44.5, "learning_rate": 9.867794817556849e-05, "logits/chosen": -83.5, "logits/rejected": -81.5, "logps/chosen": -604.0, "logps/rejected": -512.0, "loss": 0.8777, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.71875, "rewards/margins": 0.65625, "rewards/rejected": 1.0625, "step": 100 }, { "epoch": 0.02908514013749339, "grad_norm": 59.75, "learning_rate": 9.854574299312533e-05, "logits/chosen": -83.5, "logits/rejected": -81.5, "logps/chosen": -608.0, "logps/rejected": -528.0, "loss": 0.8438, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.84375, "rewards/margins": 0.91015625, "rewards/rejected": 0.93359375, "step": 110 }, { "epoch": 0.03172924378635643, "grad_norm": 50.75, "learning_rate": 9.841353781068219e-05, "logits/chosen": -83.0, "logits/rejected": -80.5, "logps/chosen": -600.0, "logps/rejected": -492.0, "loss": 0.6564, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.0625, "rewards/margins": 1.203125, "rewards/rejected": 0.8515625, "step": 120 }, { "epoch": 0.03437334743521946, "grad_norm": 47.0, "learning_rate": 9.828133262823903e-05, "logits/chosen": -83.0, "logits/rejected": -83.0, "logps/chosen": -588.0, "logps/rejected": -506.0, "loss": 0.8234, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 1.421875, "rewards/margins": 0.86328125, "rewards/rejected": 0.55859375, "step": 130 }, { "epoch": 0.0370174510840825, "grad_norm": 54.75, "learning_rate": 9.814912744579589e-05, "logits/chosen": -85.5, "logits/rejected": -84.0, "logps/chosen": -584.0, "logps/rejected": -524.0, "loss": 0.7816, "rewards/accuracies": 0.625, "rewards/chosen": 1.75, "rewards/margins": 0.89453125, "rewards/rejected": 0.859375, "step": 140 }, { "epoch": 0.03966155473294553, "grad_norm": 57.0, "learning_rate": 9.801692226335273e-05, "logits/chosen": -85.5, "logits/rejected": -85.0, "logps/chosen": -576.0, "logps/rejected": -504.0, "loss": 0.7117, "rewards/accuracies": 0.65625, "rewards/chosen": 1.015625, "rewards/margins": 0.984375, "rewards/rejected": 0.03369140625, "step": 150 }, { "epoch": 0.04230565838180857, "grad_norm": 52.75, "learning_rate": 9.788471708090957e-05, "logits/chosen": -82.0, "logits/rejected": -81.0, "logps/chosen": -588.0, "logps/rejected": -500.0, "loss": 0.7574, "rewards/accuracies": 0.625, "rewards/chosen": 1.8671875, "rewards/margins": 0.9453125, "rewards/rejected": 0.921875, "step": 160 }, { "epoch": 0.0449497620306716, "grad_norm": 39.5, "learning_rate": 9.775251189846643e-05, "logits/chosen": -81.5, "logits/rejected": -80.0, "logps/chosen": -604.0, "logps/rejected": -500.0, "loss": 0.6389, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.25, "rewards/margins": 1.328125, "rewards/rejected": 0.91796875, "step": 170 }, { "epoch": 0.047593865679534636, "grad_norm": 52.75, "learning_rate": 9.762030671602327e-05, "logits/chosen": -81.0, "logits/rejected": -78.0, "logps/chosen": -592.0, "logps/rejected": -484.0, "loss": 0.6801, "rewards/accuracies": 0.6875, "rewards/chosen": 2.84375, "rewards/margins": 1.1875, "rewards/rejected": 1.6640625, "step": 180 }, { "epoch": 0.05023796932839767, "grad_norm": 66.0, "learning_rate": 9.748810153358013e-05, "logits/chosen": -80.5, "logits/rejected": -77.5, "logps/chosen": -604.0, "logps/rejected": -516.0, "loss": 1.1301, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 3.703125, "rewards/margins": 1.046875, "rewards/rejected": 2.65625, "step": 190 }, { "epoch": 0.05288207297726071, "grad_norm": 30.125, "learning_rate": 9.735589635113697e-05, "logits/chosen": -80.5, "logits/rejected": -78.0, "logps/chosen": -576.0, "logps/rejected": -468.0, "loss": 0.7217, "rewards/accuracies": 0.65625, "rewards/chosen": 2.359375, "rewards/margins": 1.296875, "rewards/rejected": 1.0625, "step": 200 }, { "epoch": 0.05552617662612375, "grad_norm": 25.375, "learning_rate": 9.722369116869383e-05, "logits/chosen": -83.5, "logits/rejected": -79.0, "logps/chosen": -600.0, "logps/rejected": -486.0, "loss": 0.757, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 2.3125, "rewards/margins": 1.0859375, "rewards/rejected": 1.234375, "step": 210 }, { "epoch": 0.05817028027498678, "grad_norm": 57.75, "learning_rate": 9.709148598625067e-05, "logits/chosen": -82.5, "logits/rejected": -81.5, "logps/chosen": -568.0, "logps/rejected": -500.0, "loss": 0.7469, "rewards/accuracies": 0.65625, "rewards/chosen": 2.359375, "rewards/margins": 1.1015625, "rewards/rejected": 1.265625, "step": 220 }, { "epoch": 0.060814383923849816, "grad_norm": 41.25, "learning_rate": 9.695928080380751e-05, "logits/chosen": -83.0, "logits/rejected": -80.5, "logps/chosen": -604.0, "logps/rejected": -488.0, "loss": 0.752, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 2.21875, "rewards/margins": 0.97265625, "rewards/rejected": 1.2421875, "step": 230 }, { "epoch": 0.06345848757271286, "grad_norm": 57.25, "learning_rate": 9.682707562136435e-05, "logits/chosen": -84.5, "logits/rejected": -82.5, "logps/chosen": -612.0, "logps/rejected": -516.0, "loss": 0.8262, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 2.390625, "rewards/margins": 1.203125, "rewards/rejected": 1.1953125, "step": 240 }, { "epoch": 0.06610259122157588, "grad_norm": 38.25, "learning_rate": 9.669487043892121e-05, "logits/chosen": -84.0, "logits/rejected": -82.5, "logps/chosen": -612.0, "logps/rejected": -504.0, "loss": 0.7063, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.7890625, "rewards/margins": 1.015625, "rewards/rejected": 0.77734375, "step": 250 }, { "epoch": 0.06874669487043893, "grad_norm": 63.25, "learning_rate": 9.656266525647805e-05, "logits/chosen": -86.0, "logits/rejected": -83.5, "logps/chosen": -588.0, "logps/rejected": -512.0, "loss": 0.8863, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 2.09375, "rewards/margins": 0.625, "rewards/rejected": 1.46875, "step": 260 }, { "epoch": 0.07139079851930195, "grad_norm": 66.0, "learning_rate": 9.643046007403491e-05, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -616.0, "logps/rejected": -536.0, "loss": 0.7957, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 2.734375, "rewards/margins": 1.0625, "rewards/rejected": 1.671875, "step": 270 }, { "epoch": 0.074034902168165, "grad_norm": 54.75, "learning_rate": 9.629825489159175e-05, "logits/chosen": -85.0, "logits/rejected": -83.5, "logps/chosen": -560.0, "logps/rejected": -454.0, "loss": 0.66, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 2.390625, "rewards/margins": 1.1015625, "rewards/rejected": 1.296875, "step": 280 }, { "epoch": 0.07667900581702802, "grad_norm": 41.0, "learning_rate": 9.61660497091486e-05, "logits/chosen": -81.0, "logits/rejected": -81.0, "logps/chosen": -592.0, "logps/rejected": -528.0, "loss": 0.6418, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.859375, "rewards/margins": 1.234375, "rewards/rejected": 0.625, "step": 290 }, { "epoch": 0.07932310946589106, "grad_norm": 57.25, "learning_rate": 9.603384452670545e-05, "logits/chosen": -82.5, "logits/rejected": -80.0, "logps/chosen": -564.0, "logps/rejected": -474.0, "loss": 0.6871, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.1875, "rewards/margins": 1.1953125, "rewards/rejected": 0.9921875, "step": 300 }, { "epoch": 0.08196721311475409, "grad_norm": 59.5, "learning_rate": 9.59016393442623e-05, "logits/chosen": -84.5, "logits/rejected": -81.0, "logps/chosen": -596.0, "logps/rejected": -496.0, "loss": 0.7434, "rewards/accuracies": 0.625, "rewards/chosen": 3.015625, "rewards/margins": 1.1015625, "rewards/rejected": 1.9140625, "step": 310 }, { "epoch": 0.08461131676361713, "grad_norm": 81.0, "learning_rate": 9.576943416181915e-05, "logits/chosen": -85.0, "logits/rejected": -83.0, "logps/chosen": -596.0, "logps/rejected": -490.0, "loss": 0.8238, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 2.53125, "rewards/margins": 1.0234375, "rewards/rejected": 1.515625, "step": 320 }, { "epoch": 0.08725542041248018, "grad_norm": 57.75, "learning_rate": 9.5637228979376e-05, "logits/chosen": -83.5, "logits/rejected": -83.5, "logps/chosen": -596.0, "logps/rejected": -520.0, "loss": 0.8035, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 2.28125, "rewards/margins": 0.94921875, "rewards/rejected": 1.3359375, "step": 330 }, { "epoch": 0.0898995240613432, "grad_norm": 37.0, "learning_rate": 9.550502379693285e-05, "logits/chosen": -83.0, "logits/rejected": -81.0, "logps/chosen": -580.0, "logps/rejected": -486.0, "loss": 0.5799, "rewards/accuracies": 0.65625, "rewards/chosen": 2.15625, "rewards/margins": 1.375, "rewards/rejected": 0.7890625, "step": 340 }, { "epoch": 0.09254362771020624, "grad_norm": 41.5, "learning_rate": 9.537281861448969e-05, "logits/chosen": -85.5, "logits/rejected": -83.0, "logps/chosen": -588.0, "logps/rejected": -488.0, "loss": 0.7193, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 2.15625, "rewards/margins": 1.40625, "rewards/rejected": 0.75390625, "step": 350 }, { "epoch": 0.09518773135906927, "grad_norm": 57.5, "learning_rate": 9.524061343204655e-05, "logits/chosen": -86.5, "logits/rejected": -83.0, "logps/chosen": -580.0, "logps/rejected": -512.0, "loss": 0.7123, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.1484375, "rewards/margins": 0.953125, "rewards/rejected": 0.19921875, "step": 360 }, { "epoch": 0.09783183500793231, "grad_norm": 43.25, "learning_rate": 9.510840824960339e-05, "logits/chosen": -87.5, "logits/rejected": -86.0, "logps/chosen": -580.0, "logps/rejected": -496.0, "loss": 0.5658, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.5078125, "rewards/margins": 1.3203125, "rewards/rejected": 0.1884765625, "step": 370 }, { "epoch": 0.10047593865679534, "grad_norm": 46.0, "learning_rate": 9.497620306716023e-05, "logits/chosen": -88.5, "logits/rejected": -88.0, "logps/chosen": -528.0, "logps/rejected": -470.0, "loss": 0.7781, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.484375, "rewards/margins": 0.640625, "rewards/rejected": 0.83984375, "step": 380 }, { "epoch": 0.10312004230565838, "grad_norm": 58.25, "learning_rate": 9.484399788471708e-05, "logits/chosen": -89.0, "logits/rejected": -86.0, "logps/chosen": -596.0, "logps/rejected": -488.0, "loss": 0.6854, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.8125, "rewards/margins": 1.2109375, "rewards/rejected": 1.609375, "step": 390 }, { "epoch": 0.10576414595452142, "grad_norm": 47.75, "learning_rate": 9.471179270227393e-05, "logits/chosen": -85.0, "logits/rejected": -84.5, "logps/chosen": -524.0, "logps/rejected": -472.0, "loss": 0.7047, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 2.890625, "rewards/margins": 1.171875, "rewards/rejected": 1.71875, "step": 400 }, { "epoch": 0.10840824960338445, "grad_norm": 50.5, "learning_rate": 9.457958751983078e-05, "logits/chosen": -85.5, "logits/rejected": -83.0, "logps/chosen": -520.0, "logps/rejected": -438.0, "loss": 0.8797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7109375, "rewards/margins": 0.9765625, "rewards/rejected": 0.734375, "step": 410 }, { "epoch": 0.1110523532522475, "grad_norm": 57.0, "learning_rate": 9.444738233738763e-05, "logits/chosen": -88.0, "logits/rejected": -87.5, "logps/chosen": -588.0, "logps/rejected": -520.0, "loss": 0.7574, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 2.1875, "rewards/margins": 1.125, "rewards/rejected": 1.0546875, "step": 420 }, { "epoch": 0.11369645690111052, "grad_norm": 49.0, "learning_rate": 9.431517715494448e-05, "logits/chosen": -86.0, "logits/rejected": -83.0, "logps/chosen": -516.0, "logps/rejected": -432.0, "loss": 0.7842, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.53125, "rewards/margins": 1.0234375, "rewards/rejected": 1.515625, "step": 430 }, { "epoch": 0.11634056054997356, "grad_norm": 48.75, "learning_rate": 9.418297197250132e-05, "logits/chosen": -87.0, "logits/rejected": -84.5, "logps/chosen": -604.0, "logps/rejected": -516.0, "loss": 0.8293, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 2.90625, "rewards/margins": 0.98046875, "rewards/rejected": 1.9296875, "step": 440 }, { "epoch": 0.11898466419883659, "grad_norm": 31.5, "learning_rate": 9.405076679005818e-05, "logits/chosen": -85.0, "logits/rejected": -85.0, "logps/chosen": -548.0, "logps/rejected": -498.0, "loss": 0.7736, "rewards/accuracies": 0.59375, "rewards/chosen": 1.7890625, "rewards/margins": 0.8046875, "rewards/rejected": 0.984375, "step": 450 }, { "epoch": 0.12162876784769963, "grad_norm": 44.5, "learning_rate": 9.391856160761502e-05, "logits/chosen": -86.0, "logits/rejected": -83.5, "logps/chosen": -620.0, "logps/rejected": -496.0, "loss": 0.6715, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 2.15625, "rewards/margins": 1.1484375, "rewards/rejected": 1.0078125, "step": 460 }, { "epoch": 0.12427287149656266, "grad_norm": 48.75, "learning_rate": 9.378635642517187e-05, "logits/chosen": -82.5, "logits/rejected": -83.0, "logps/chosen": -568.0, "logps/rejected": -486.0, "loss": 0.5992, "rewards/accuracies": 0.75, "rewards/chosen": 2.375, "rewards/margins": 1.3984375, "rewards/rejected": 0.98046875, "step": 470 }, { "epoch": 0.12691697514542571, "grad_norm": 73.5, "learning_rate": 9.365415124272872e-05, "logits/chosen": -81.5, "logits/rejected": -80.0, "logps/chosen": -544.0, "logps/rejected": -480.0, "loss": 0.8057, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.046875, "rewards/margins": 0.67578125, "rewards/rejected": 1.375, "step": 480 }, { "epoch": 0.12956107879428874, "grad_norm": 56.25, "learning_rate": 9.352194606028557e-05, "logits/chosen": -83.0, "logits/rejected": -82.5, "logps/chosen": -592.0, "logps/rejected": -502.0, "loss": 0.775, "rewards/accuracies": 0.65625, "rewards/chosen": 2.96875, "rewards/margins": 1.1796875, "rewards/rejected": 1.78125, "step": 490 }, { "epoch": 0.13220518244315177, "grad_norm": 46.0, "learning_rate": 9.338974087784242e-05, "logits/chosen": -83.0, "logits/rejected": -82.5, "logps/chosen": -588.0, "logps/rejected": -500.0, "loss": 0.6191, "rewards/accuracies": 0.6875, "rewards/chosen": 3.078125, "rewards/margins": 1.546875, "rewards/rejected": 1.53125, "step": 500 }, { "epoch": 0.1348492860920148, "grad_norm": 56.0, "learning_rate": 9.325753569539927e-05, "logits/chosen": -86.0, "logits/rejected": -84.5, "logps/chosen": -612.0, "logps/rejected": -480.0, "loss": 0.7651, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 2.078125, "rewards/margins": 1.1484375, "rewards/rejected": 0.9375, "step": 510 }, { "epoch": 0.13749338974087785, "grad_norm": 51.5, "learning_rate": 9.312533051295612e-05, "logits/chosen": -82.5, "logits/rejected": -82.0, "logps/chosen": -510.0, "logps/rejected": -468.0, "loss": 0.7504, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.84375, "rewards/margins": 0.9375, "rewards/rejected": 1.8984375, "step": 520 }, { "epoch": 0.14013749338974088, "grad_norm": 70.0, "learning_rate": 9.299312533051296e-05, "logits/chosen": -83.5, "logits/rejected": -81.0, "logps/chosen": -596.0, "logps/rejected": -504.0, "loss": 0.6199, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 2.8125, "rewards/margins": 1.671875, "rewards/rejected": 1.140625, "step": 530 }, { "epoch": 0.1427815970386039, "grad_norm": 62.0, "learning_rate": 9.28609201480698e-05, "logits/chosen": -80.5, "logits/rejected": -81.5, "logps/chosen": -528.0, "logps/rejected": -492.0, "loss": 0.7928, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 1.6640625, "rewards/margins": 1.2265625, "rewards/rejected": 0.43359375, "step": 540 }, { "epoch": 0.14542570068746694, "grad_norm": 64.0, "learning_rate": 9.272871496562666e-05, "logits/chosen": -84.0, "logits/rejected": -83.0, "logps/chosen": -552.0, "logps/rejected": -484.0, "loss": 1.034, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 1.765625, "rewards/margins": 0.51953125, "rewards/rejected": 1.25, "step": 550 }, { "epoch": 0.14806980433633, "grad_norm": 54.0, "learning_rate": 9.25965097831835e-05, "logits/chosen": -84.0, "logits/rejected": -82.5, "logps/chosen": -560.0, "logps/rejected": -442.0, "loss": 0.6648, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.484375, "rewards/margins": 1.1640625, "rewards/rejected": 1.3125, "step": 560 }, { "epoch": 0.15071390798519302, "grad_norm": 62.75, "learning_rate": 9.246430460074034e-05, "logits/chosen": -86.5, "logits/rejected": -84.0, "logps/chosen": -604.0, "logps/rejected": -536.0, "loss": 0.6891, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.890625, "rewards/margins": 1.296875, "rewards/rejected": 1.6015625, "step": 570 }, { "epoch": 0.15335801163405605, "grad_norm": 48.25, "learning_rate": 9.23320994182972e-05, "logits/chosen": -89.0, "logits/rejected": -86.0, "logps/chosen": -636.0, "logps/rejected": -528.0, "loss": 0.8029, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.9609375, "rewards/margins": 1.0078125, "rewards/rejected": 0.953125, "step": 580 }, { "epoch": 0.1560021152829191, "grad_norm": 25.25, "learning_rate": 9.219989423585404e-05, "logits/chosen": -85.0, "logits/rejected": -83.0, "logps/chosen": -552.0, "logps/rejected": -448.0, "loss": 0.6338, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.09375, "rewards/margins": 1.2421875, "rewards/rejected": -0.1474609375, "step": 590 }, { "epoch": 0.15864621893178213, "grad_norm": 43.25, "learning_rate": 9.20676890534109e-05, "logits/chosen": -88.0, "logits/rejected": -84.0, "logps/chosen": -576.0, "logps/rejected": -464.0, "loss": 0.6787, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 2.296875, "rewards/margins": 1.2734375, "rewards/rejected": 1.03125, "step": 600 }, { "epoch": 0.16129032258064516, "grad_norm": 67.5, "learning_rate": 9.193548387096774e-05, "logits/chosen": -86.5, "logits/rejected": -86.0, "logps/chosen": -592.0, "logps/rejected": -524.0, "loss": 0.9029, "rewards/accuracies": 0.65625, "rewards/chosen": 2.796875, "rewards/margins": 0.94140625, "rewards/rejected": 1.859375, "step": 610 }, { "epoch": 0.16393442622950818, "grad_norm": 55.0, "learning_rate": 9.18032786885246e-05, "logits/chosen": -82.5, "logits/rejected": -79.5, "logps/chosen": -620.0, "logps/rejected": -490.0, "loss": 0.6928, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0625, "rewards/margins": 1.21875, "rewards/rejected": 0.84765625, "step": 620 }, { "epoch": 0.16657852987837124, "grad_norm": 45.75, "learning_rate": 9.167107350608144e-05, "logits/chosen": -79.5, "logits/rejected": -78.0, "logps/chosen": -588.0, "logps/rejected": -490.0, "loss": 0.7674, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.3125, "rewards/margins": 1.203125, "rewards/rejected": 1.1015625, "step": 630 }, { "epoch": 0.16922263352723427, "grad_norm": 59.5, "learning_rate": 9.15388683236383e-05, "logits/chosen": -77.0, "logits/rejected": -75.5, "logps/chosen": -544.0, "logps/rejected": -478.0, "loss": 1.0, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 2.828125, "rewards/margins": 1.109375, "rewards/rejected": 1.7109375, "step": 640 }, { "epoch": 0.1718667371760973, "grad_norm": 53.75, "learning_rate": 9.140666314119514e-05, "logits/chosen": -82.5, "logits/rejected": -79.0, "logps/chosen": -560.0, "logps/rejected": -478.0, "loss": 0.7697, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 2.046875, "rewards/margins": 0.9140625, "rewards/rejected": 1.140625, "step": 650 }, { "epoch": 0.17451084082496035, "grad_norm": 38.0, "learning_rate": 9.1274457958752e-05, "logits/chosen": -81.0, "logits/rejected": -77.5, "logps/chosen": -536.0, "logps/rejected": -438.0, "loss": 0.6539, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.5, "rewards/margins": 1.0859375, "rewards/rejected": 0.412109375, "step": 660 }, { "epoch": 0.17715494447382338, "grad_norm": 25.875, "learning_rate": 9.114225277630884e-05, "logits/chosen": -81.5, "logits/rejected": -80.0, "logps/chosen": -572.0, "logps/rejected": -486.0, "loss": 0.5848, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.84375, "rewards/margins": 1.3359375, "rewards/rejected": 1.515625, "step": 670 }, { "epoch": 0.1797990481226864, "grad_norm": 50.25, "learning_rate": 9.101004759386568e-05, "logits/chosen": -80.0, "logits/rejected": -79.0, "logps/chosen": -544.0, "logps/rejected": -430.0, "loss": 1.0248, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.984375, "rewards/margins": 1.21875, "rewards/rejected": 1.765625, "step": 680 }, { "epoch": 0.18244315177154943, "grad_norm": 47.75, "learning_rate": 9.087784241142254e-05, "logits/chosen": -80.5, "logits/rejected": -79.5, "logps/chosen": -568.0, "logps/rejected": -500.0, "loss": 0.7455, "rewards/accuracies": 0.625, "rewards/chosen": 1.6796875, "rewards/margins": 1.3046875, "rewards/rejected": 0.3828125, "step": 690 }, { "epoch": 0.1850872554204125, "grad_norm": 55.5, "learning_rate": 9.074563722897938e-05, "logits/chosen": -82.0, "logits/rejected": -79.5, "logps/chosen": -540.0, "logps/rejected": -454.0, "loss": 0.6893, "rewards/accuracies": 0.625, "rewards/chosen": 1.8671875, "rewards/margins": 0.99609375, "rewards/rejected": 0.87109375, "step": 700 }, { "epoch": 0.18773135906927552, "grad_norm": 73.0, "learning_rate": 9.061343204653622e-05, "logits/chosen": -82.5, "logits/rejected": -80.5, "logps/chosen": -564.0, "logps/rejected": -474.0, "loss": 0.8586, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 2.875, "rewards/margins": 0.80859375, "rewards/rejected": 2.0625, "step": 710 }, { "epoch": 0.19037546271813854, "grad_norm": 68.5, "learning_rate": 9.048122686409307e-05, "logits/chosen": -78.0, "logits/rejected": -77.0, "logps/chosen": -536.0, "logps/rejected": -442.0, "loss": 0.7121, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 2.65625, "rewards/margins": 1.09375, "rewards/rejected": 1.5546875, "step": 720 }, { "epoch": 0.1930195663670016, "grad_norm": 37.25, "learning_rate": 9.034902168164992e-05, "logits/chosen": -82.0, "logits/rejected": -80.0, "logps/chosen": -596.0, "logps/rejected": -496.0, "loss": 0.7865, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.7265625, "rewards/margins": 1.21875, "rewards/rejected": 0.5078125, "step": 730 }, { "epoch": 0.19566367001586463, "grad_norm": 32.5, "learning_rate": 9.021681649920677e-05, "logits/chosen": -82.0, "logits/rejected": -81.0, "logps/chosen": -616.0, "logps/rejected": -504.0, "loss": 0.7119, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.765625, "rewards/margins": 1.140625, "rewards/rejected": 1.625, "step": 740 }, { "epoch": 0.19830777366472765, "grad_norm": 56.5, "learning_rate": 9.008461131676362e-05, "logits/chosen": -83.0, "logits/rejected": -81.5, "logps/chosen": -584.0, "logps/rejected": -512.0, "loss": 0.882, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 3.171875, "rewards/margins": 0.98046875, "rewards/rejected": 2.1875, "step": 750 }, { "epoch": 0.20095187731359068, "grad_norm": 38.75, "learning_rate": 8.995240613432046e-05, "logits/chosen": -80.5, "logits/rejected": -80.0, "logps/chosen": -540.0, "logps/rejected": -476.0, "loss": 0.8799, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6875, "rewards/margins": 0.8515625, "rewards/rejected": 1.828125, "step": 760 }, { "epoch": 0.20359598096245374, "grad_norm": 48.0, "learning_rate": 8.982020095187732e-05, "logits/chosen": -79.0, "logits/rejected": -78.0, "logps/chosen": -544.0, "logps/rejected": -432.0, "loss": 0.6654, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.625, "rewards/margins": 1.265625, "rewards/rejected": 1.3671875, "step": 770 }, { "epoch": 0.20624008461131677, "grad_norm": 57.75, "learning_rate": 8.968799576943416e-05, "logits/chosen": -80.5, "logits/rejected": -78.0, "logps/chosen": -600.0, "logps/rejected": -482.0, "loss": 0.8164, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.578125, "rewards/margins": 0.9765625, "rewards/rejected": 1.59375, "step": 780 }, { "epoch": 0.2088841882601798, "grad_norm": 32.0, "learning_rate": 8.955579058699102e-05, "logits/chosen": -83.5, "logits/rejected": -80.5, "logps/chosen": -572.0, "logps/rejected": -498.0, "loss": 0.7537, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 2.609375, "rewards/margins": 1.2265625, "rewards/rejected": 1.390625, "step": 790 }, { "epoch": 0.21152829190904285, "grad_norm": 49.0, "learning_rate": 8.942358540454786e-05, "logits/chosen": -85.0, "logits/rejected": -84.5, "logps/chosen": -580.0, "logps/rejected": -532.0, "loss": 0.841, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.7421875, "rewards/margins": 0.8359375, "rewards/rejected": 0.91015625, "step": 800 }, { "epoch": 0.21417239555790588, "grad_norm": 52.75, "learning_rate": 8.929138022210472e-05, "logits/chosen": -85.0, "logits/rejected": -82.0, "logps/chosen": -596.0, "logps/rejected": -480.0, "loss": 0.7207, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.109375, "rewards/margins": 1.3828125, "rewards/rejected": 0.7265625, "step": 810 }, { "epoch": 0.2168164992067689, "grad_norm": 39.25, "learning_rate": 8.915917503966156e-05, "logits/chosen": -81.5, "logits/rejected": -79.5, "logps/chosen": -596.0, "logps/rejected": -494.0, "loss": 0.6824, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.75, "rewards/margins": 1.296875, "rewards/rejected": 1.453125, "step": 820 }, { "epoch": 0.21946060285563193, "grad_norm": 65.0, "learning_rate": 8.902696985721842e-05, "logits/chosen": -80.5, "logits/rejected": -80.0, "logps/chosen": -576.0, "logps/rejected": -502.0, "loss": 0.6834, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 2.71875, "rewards/margins": 1.34375, "rewards/rejected": 1.3828125, "step": 830 }, { "epoch": 0.222104706504495, "grad_norm": 33.5, "learning_rate": 8.889476467477526e-05, "logits/chosen": -82.0, "logits/rejected": -80.5, "logps/chosen": -592.0, "logps/rejected": -498.0, "loss": 0.7041, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 2.4375, "rewards/margins": 1.7109375, "rewards/rejected": 0.71875, "step": 840 }, { "epoch": 0.22474881015335801, "grad_norm": 56.25, "learning_rate": 8.87625594923321e-05, "logits/chosen": -81.5, "logits/rejected": -80.0, "logps/chosen": -588.0, "logps/rejected": -484.0, "loss": 0.8426, "rewards/accuracies": 0.6875, "rewards/chosen": 2.0625, "rewards/margins": 0.80078125, "rewards/rejected": 1.2578125, "step": 850 }, { "epoch": 0.22739291380222104, "grad_norm": 24.625, "learning_rate": 8.863035430988895e-05, "logits/chosen": -83.0, "logits/rejected": -80.0, "logps/chosen": -592.0, "logps/rejected": -466.0, "loss": 0.7009, "rewards/accuracies": 0.625, "rewards/chosen": 1.8984375, "rewards/margins": 1.0390625, "rewards/rejected": 0.8515625, "step": 860 }, { "epoch": 0.23003701745108407, "grad_norm": 43.25, "learning_rate": 8.849814912744579e-05, "logits/chosen": -82.0, "logits/rejected": -79.5, "logps/chosen": -616.0, "logps/rejected": -496.0, "loss": 0.6285, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 2.75, "rewards/margins": 1.1640625, "rewards/rejected": 1.578125, "step": 870 }, { "epoch": 0.23268112109994712, "grad_norm": 43.25, "learning_rate": 8.836594394500265e-05, "logits/chosen": -79.5, "logits/rejected": -81.0, "logps/chosen": -528.0, "logps/rejected": -484.0, "loss": 0.8926, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.375, "rewards/margins": 0.7265625, "rewards/rejected": 1.65625, "step": 880 }, { "epoch": 0.23532522474881015, "grad_norm": 54.5, "learning_rate": 8.823373876255949e-05, "logits/chosen": -84.0, "logits/rejected": -80.0, "logps/chosen": -608.0, "logps/rejected": -482.0, "loss": 0.6979, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.75, "rewards/margins": 1.21875, "rewards/rejected": 1.53125, "step": 890 }, { "epoch": 0.23796932839767318, "grad_norm": 53.25, "learning_rate": 8.810153358011635e-05, "logits/chosen": -83.0, "logits/rejected": -82.0, "logps/chosen": -596.0, "logps/rejected": -506.0, "loss": 0.815, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 2.671875, "rewards/margins": 1.1328125, "rewards/rejected": 1.53125, "step": 900 }, { "epoch": 0.24061343204653624, "grad_norm": 58.25, "learning_rate": 8.796932839767319e-05, "logits/chosen": -82.0, "logits/rejected": -79.0, "logps/chosen": -572.0, "logps/rejected": -476.0, "loss": 0.6863, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.453125, "rewards/margins": 0.9921875, "rewards/rejected": 1.4609375, "step": 910 }, { "epoch": 0.24325753569539926, "grad_norm": 35.5, "learning_rate": 8.783712321523004e-05, "logits/chosen": -85.0, "logits/rejected": -82.0, "logps/chosen": -624.0, "logps/rejected": -540.0, "loss": 0.7527, "rewards/accuracies": 0.6875, "rewards/chosen": 2.421875, "rewards/margins": 1.046875, "rewards/rejected": 1.375, "step": 920 }, { "epoch": 0.2459016393442623, "grad_norm": 46.25, "learning_rate": 8.770491803278689e-05, "logits/chosen": -84.5, "logits/rejected": -82.5, "logps/chosen": -572.0, "logps/rejected": -504.0, "loss": 0.7494, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 2.046875, "rewards/margins": 0.81640625, "rewards/rejected": 1.2265625, "step": 930 }, { "epoch": 0.24854574299312532, "grad_norm": 49.0, "learning_rate": 8.757271285034374e-05, "logits/chosen": -84.0, "logits/rejected": -83.0, "logps/chosen": -596.0, "logps/rejected": -486.0, "loss": 0.5156, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 2.484375, "rewards/margins": 1.828125, "rewards/rejected": 0.65625, "step": 940 }, { "epoch": 0.25118984664198835, "grad_norm": 57.5, "learning_rate": 8.744050766790059e-05, "logits/chosen": -82.5, "logits/rejected": -82.5, "logps/chosen": -536.0, "logps/rejected": -524.0, "loss": 0.8426, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.3671875, "rewards/margins": 0.78515625, "rewards/rejected": 0.58203125, "step": 950 }, { "epoch": 0.25383395029085143, "grad_norm": 48.0, "learning_rate": 8.730830248545744e-05, "logits/chosen": -87.0, "logits/rejected": -83.5, "logps/chosen": -584.0, "logps/rejected": -496.0, "loss": 0.9223, "rewards/accuracies": 0.625, "rewards/chosen": 1.3671875, "rewards/margins": 0.62109375, "rewards/rejected": 0.75, "step": 960 }, { "epoch": 0.25647805393971446, "grad_norm": 72.0, "learning_rate": 8.717609730301429e-05, "logits/chosen": -86.0, "logits/rejected": -85.0, "logps/chosen": -636.0, "logps/rejected": -536.0, "loss": 0.8688, "rewards/accuracies": 0.59375, "rewards/chosen": 2.796875, "rewards/margins": 0.828125, "rewards/rejected": 1.9609375, "step": 970 }, { "epoch": 0.2591221575885775, "grad_norm": 56.75, "learning_rate": 8.704389212057114e-05, "logits/chosen": -84.0, "logits/rejected": -82.5, "logps/chosen": -580.0, "logps/rejected": -498.0, "loss": 0.6738, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 2.5, "rewards/margins": 1.0703125, "rewards/rejected": 1.4296875, "step": 980 }, { "epoch": 0.2617662612374405, "grad_norm": 35.5, "learning_rate": 8.691168693812798e-05, "logits/chosen": -81.0, "logits/rejected": -79.0, "logps/chosen": -624.0, "logps/rejected": -506.0, "loss": 0.6199, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 2.546875, "rewards/margins": 1.5390625, "rewards/rejected": 1.0078125, "step": 990 }, { "epoch": 0.26441036488630354, "grad_norm": 56.25, "learning_rate": 8.677948175568483e-05, "logits/chosen": -82.0, "logits/rejected": -83.0, "logps/chosen": -580.0, "logps/rejected": -492.0, "loss": 0.673, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 2.859375, "rewards/margins": 1.109375, "rewards/rejected": 1.7421875, "step": 1000 }, { "epoch": 0.26705446853516657, "grad_norm": 65.5, "learning_rate": 8.664727657324167e-05, "logits/chosen": -82.5, "logits/rejected": -81.0, "logps/chosen": -592.0, "logps/rejected": -524.0, "loss": 0.7941, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 2.578125, "rewards/margins": 0.86328125, "rewards/rejected": 1.7109375, "step": 1010 }, { "epoch": 0.2696985721840296, "grad_norm": 53.25, "learning_rate": 8.651507139079851e-05, "logits/chosen": -84.0, "logits/rejected": -84.0, "logps/chosen": -636.0, "logps/rejected": -528.0, "loss": 0.7539, "rewards/accuracies": 0.6875, "rewards/chosen": 2.1875, "rewards/margins": 1.3984375, "rewards/rejected": 0.78515625, "step": 1020 }, { "epoch": 0.2723426758328926, "grad_norm": 55.0, "learning_rate": 8.638286620835537e-05, "logits/chosen": -85.5, "logits/rejected": -84.0, "logps/chosen": -592.0, "logps/rejected": -512.0, "loss": 0.6053, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.9765625, "rewards/margins": 1.28125, "rewards/rejected": 0.6953125, "step": 1030 }, { "epoch": 0.2749867794817557, "grad_norm": 31.125, "learning_rate": 8.625066102591221e-05, "logits/chosen": -83.5, "logits/rejected": -82.5, "logps/chosen": -560.0, "logps/rejected": -484.0, "loss": 0.6654, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 2.953125, "rewards/margins": 1.4140625, "rewards/rejected": 1.53125, "step": 1040 }, { "epoch": 0.27763088313061873, "grad_norm": 55.25, "learning_rate": 8.611845584346907e-05, "logits/chosen": -83.5, "logits/rejected": -83.0, "logps/chosen": -592.0, "logps/rejected": -504.0, "loss": 0.7736, "rewards/accuracies": 0.625, "rewards/chosen": 3.265625, "rewards/margins": 1.1484375, "rewards/rejected": 2.125, "step": 1050 }, { "epoch": 0.28027498677948176, "grad_norm": 46.75, "learning_rate": 8.598625066102591e-05, "logits/chosen": -85.0, "logits/rejected": -86.0, "logps/chosen": -548.0, "logps/rejected": -506.0, "loss": 0.8475, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 2.15625, "rewards/margins": 0.9921875, "rewards/rejected": 1.171875, "step": 1060 }, { "epoch": 0.2829190904283448, "grad_norm": 47.75, "learning_rate": 8.585404547858277e-05, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -556.0, "logps/rejected": -454.0, "loss": 0.7715, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.734375, "rewards/margins": 0.98828125, "rewards/rejected": 0.75, "step": 1070 }, { "epoch": 0.2855631940772078, "grad_norm": 46.5, "learning_rate": 8.572184029613961e-05, "logits/chosen": -86.0, "logits/rejected": -86.0, "logps/chosen": -576.0, "logps/rejected": -500.0, "loss": 0.6963, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 2.6875, "rewards/margins": 1.1640625, "rewards/rejected": 1.5234375, "step": 1080 }, { "epoch": 0.28820729772607084, "grad_norm": 52.75, "learning_rate": 8.558963511369647e-05, "logits/chosen": -86.5, "logits/rejected": -85.5, "logps/chosen": -544.0, "logps/rejected": -412.0, "loss": 0.6672, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 2.34375, "rewards/margins": 1.3046875, "rewards/rejected": 1.03125, "step": 1090 }, { "epoch": 0.29085140137493387, "grad_norm": 57.25, "learning_rate": 8.545742993125331e-05, "logits/chosen": -86.5, "logits/rejected": -85.5, "logps/chosen": -600.0, "logps/rejected": -504.0, "loss": 0.7562, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 3.265625, "rewards/margins": 1.3984375, "rewards/rejected": 1.859375, "step": 1100 }, { "epoch": 0.29349550502379695, "grad_norm": 30.25, "learning_rate": 8.532522474881017e-05, "logits/chosen": -86.5, "logits/rejected": -85.0, "logps/chosen": -636.0, "logps/rejected": -516.0, "loss": 0.6037, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 3.234375, "rewards/margins": 1.5625, "rewards/rejected": 1.671875, "step": 1110 }, { "epoch": 0.29613960867266, "grad_norm": 60.25, "learning_rate": 8.519301956636701e-05, "logits/chosen": -86.0, "logits/rejected": -85.5, "logps/chosen": -632.0, "logps/rejected": -516.0, "loss": 0.6447, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 2.625, "rewards/margins": 1.2890625, "rewards/rejected": 1.328125, "step": 1120 }, { "epoch": 0.298783712321523, "grad_norm": 41.5, "learning_rate": 8.506081438392387e-05, "logits/chosen": -85.5, "logits/rejected": -84.5, "logps/chosen": -576.0, "logps/rejected": -516.0, "loss": 0.7277, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 2.90625, "rewards/margins": 1.09375, "rewards/rejected": 1.8046875, "step": 1130 }, { "epoch": 0.30142781597038604, "grad_norm": 58.5, "learning_rate": 8.492860920148071e-05, "logits/chosen": -87.0, "logits/rejected": -85.5, "logps/chosen": -568.0, "logps/rejected": -476.0, "loss": 0.7744, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 2.078125, "rewards/margins": 1.125, "rewards/rejected": 0.95703125, "step": 1140 }, { "epoch": 0.30407191961924906, "grad_norm": 34.0, "learning_rate": 8.479640401903755e-05, "logits/chosen": -86.5, "logits/rejected": -85.5, "logps/chosen": -616.0, "logps/rejected": -528.0, "loss": 0.8396, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.9375, "rewards/margins": 0.98828125, "rewards/rejected": 0.94921875, "step": 1150 }, { "epoch": 0.3067160232681121, "grad_norm": 42.5, "learning_rate": 8.46641988365944e-05, "logits/chosen": -86.5, "logits/rejected": -84.5, "logps/chosen": -568.0, "logps/rejected": -500.0, "loss": 0.7633, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.984375, "rewards/margins": 0.97265625, "rewards/rejected": 1.0078125, "step": 1160 }, { "epoch": 0.3093601269169751, "grad_norm": 28.375, "learning_rate": 8.453199365415124e-05, "logits/chosen": -90.0, "logits/rejected": -86.0, "logps/chosen": -600.0, "logps/rejected": -506.0, "loss": 0.6629, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.7890625, "rewards/margins": 1.3984375, "rewards/rejected": 0.38671875, "step": 1170 }, { "epoch": 0.3120042305658382, "grad_norm": 57.5, "learning_rate": 8.439978847170809e-05, "logits/chosen": -87.5, "logits/rejected": -86.0, "logps/chosen": -612.0, "logps/rejected": -520.0, "loss": 0.8105, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.703125, "rewards/margins": 1.0625, "rewards/rejected": 0.63671875, "step": 1180 }, { "epoch": 0.31464833421470123, "grad_norm": 55.0, "learning_rate": 8.426758328926494e-05, "logits/chosen": -84.5, "logits/rejected": -85.0, "logps/chosen": -552.0, "logps/rejected": -524.0, "loss": 0.7594, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.078125, "rewards/margins": 1.046875, "rewards/rejected": 1.03125, "step": 1190 }, { "epoch": 0.31729243786356426, "grad_norm": 55.25, "learning_rate": 8.413537810682179e-05, "logits/chosen": -85.5, "logits/rejected": -84.0, "logps/chosen": -576.0, "logps/rejected": -502.0, "loss": 0.7937, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 1.8984375, "rewards/margins": 0.76953125, "rewards/rejected": 1.125, "step": 1200 }, { "epoch": 0.3199365415124273, "grad_norm": 27.25, "learning_rate": 8.400317292437864e-05, "logits/chosen": -85.5, "logits/rejected": -84.0, "logps/chosen": -540.0, "logps/rejected": -450.0, "loss": 0.623, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.21875, "rewards/margins": 1.25, "rewards/rejected": 0.96484375, "step": 1210 }, { "epoch": 0.3225806451612903, "grad_norm": 56.25, "learning_rate": 8.387096774193549e-05, "logits/chosen": -84.5, "logits/rejected": -84.0, "logps/chosen": -588.0, "logps/rejected": -516.0, "loss": 0.7131, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 2.609375, "rewards/margins": 1.2578125, "rewards/rejected": 1.3515625, "step": 1220 }, { "epoch": 0.32522474881015334, "grad_norm": 52.75, "learning_rate": 8.373876255949233e-05, "logits/chosen": -81.5, "logits/rejected": -82.0, "logps/chosen": -572.0, "logps/rejected": -508.0, "loss": 0.7361, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.859375, "rewards/margins": 1.0546875, "rewards/rejected": 0.80859375, "step": 1230 }, { "epoch": 0.32786885245901637, "grad_norm": 51.0, "learning_rate": 8.360655737704919e-05, "logits/chosen": -83.0, "logits/rejected": -82.0, "logps/chosen": -580.0, "logps/rejected": -474.0, "loss": 0.7717, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 2.484375, "rewards/margins": 1.21875, "rewards/rejected": 1.265625, "step": 1240 }, { "epoch": 0.33051295610787945, "grad_norm": 67.5, "learning_rate": 8.347435219460603e-05, "logits/chosen": -82.5, "logits/rejected": -80.0, "logps/chosen": -588.0, "logps/rejected": -476.0, "loss": 0.7037, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 2.40625, "rewards/margins": 1.1796875, "rewards/rejected": 1.2265625, "step": 1250 }, { "epoch": 0.3331570597567425, "grad_norm": 31.375, "learning_rate": 8.334214701216289e-05, "logits/chosen": -83.5, "logits/rejected": -82.5, "logps/chosen": -596.0, "logps/rejected": -498.0, "loss": 0.6512, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 2.25, "rewards/margins": 1.2890625, "rewards/rejected": 0.95703125, "step": 1260 }, { "epoch": 0.3358011634056055, "grad_norm": 58.75, "learning_rate": 8.320994182971973e-05, "logits/chosen": -84.5, "logits/rejected": -84.5, "logps/chosen": -576.0, "logps/rejected": -500.0, "loss": 0.6072, "rewards/accuracies": 0.6875, "rewards/chosen": 1.9609375, "rewards/margins": 1.3671875, "rewards/rejected": 0.59375, "step": 1270 }, { "epoch": 0.33844526705446853, "grad_norm": 29.875, "learning_rate": 8.307773664727658e-05, "logits/chosen": -84.0, "logits/rejected": -81.5, "logps/chosen": -560.0, "logps/rejected": -506.0, "loss": 0.7457, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.015625, "rewards/margins": 1.0078125, "rewards/rejected": 1.0, "step": 1280 }, { "epoch": 0.34108937070333156, "grad_norm": 51.0, "learning_rate": 8.294553146483343e-05, "logits/chosen": -85.0, "logits/rejected": -83.0, "logps/chosen": -608.0, "logps/rejected": -480.0, "loss": 1.0531, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.40625, "rewards/margins": 0.69140625, "rewards/rejected": 1.7109375, "step": 1290 }, { "epoch": 0.3437334743521946, "grad_norm": 48.5, "learning_rate": 8.281332628239027e-05, "logits/chosen": -84.0, "logits/rejected": -84.5, "logps/chosen": -588.0, "logps/rejected": -524.0, "loss": 0.7023, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 2.890625, "rewards/margins": 1.2421875, "rewards/rejected": 1.640625, "step": 1300 }, { "epoch": 0.3463775780010576, "grad_norm": 47.0, "learning_rate": 8.268112109994712e-05, "logits/chosen": -88.0, "logits/rejected": -86.5, "logps/chosen": -564.0, "logps/rejected": -476.0, "loss": 0.6199, "rewards/accuracies": 0.6875, "rewards/chosen": 2.390625, "rewards/margins": 1.1953125, "rewards/rejected": 1.1875, "step": 1310 }, { "epoch": 0.3490216816499207, "grad_norm": 42.0, "learning_rate": 8.254891591750396e-05, "logits/chosen": -83.0, "logits/rejected": -82.0, "logps/chosen": -548.0, "logps/rejected": -458.0, "loss": 0.7125, "rewards/accuracies": 0.65625, "rewards/chosen": 2.34375, "rewards/margins": 0.8828125, "rewards/rejected": 1.4609375, "step": 1320 }, { "epoch": 0.35166578529878373, "grad_norm": 73.0, "learning_rate": 8.241671073506082e-05, "logits/chosen": -83.5, "logits/rejected": -82.0, "logps/chosen": -584.0, "logps/rejected": -498.0, "loss": 0.7529, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 2.875, "rewards/margins": 0.94921875, "rewards/rejected": 1.9375, "step": 1330 }, { "epoch": 0.35430988894764676, "grad_norm": 28.375, "learning_rate": 8.228450555261766e-05, "logits/chosen": -83.5, "logits/rejected": -80.5, "logps/chosen": -632.0, "logps/rejected": -484.0, "loss": 0.6377, "rewards/accuracies": 0.71875, "rewards/chosen": 2.71875, "rewards/margins": 1.4296875, "rewards/rejected": 1.296875, "step": 1340 }, { "epoch": 0.3569539925965098, "grad_norm": 55.0, "learning_rate": 8.215230037017452e-05, "logits/chosen": -80.0, "logits/rejected": -79.5, "logps/chosen": -540.0, "logps/rejected": -448.0, "loss": 0.6762, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 2.09375, "rewards/margins": 0.95703125, "rewards/rejected": 1.125, "step": 1350 }, { "epoch": 0.3595980962453728, "grad_norm": 44.0, "learning_rate": 8.202009518773136e-05, "logits/chosen": -82.0, "logits/rejected": -81.0, "logps/chosen": -540.0, "logps/rejected": -470.0, "loss": 0.7797, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.7734375, "rewards/margins": 0.80078125, "rewards/rejected": 0.96875, "step": 1360 }, { "epoch": 0.36224219989423584, "grad_norm": 48.75, "learning_rate": 8.188789000528821e-05, "logits/chosen": -82.5, "logits/rejected": -81.0, "logps/chosen": -564.0, "logps/rejected": -470.0, "loss": 0.8023, "rewards/accuracies": 0.625, "rewards/chosen": 2.09375, "rewards/margins": 0.7578125, "rewards/rejected": 1.3359375, "step": 1370 }, { "epoch": 0.36488630354309887, "grad_norm": 50.0, "learning_rate": 8.175568482284506e-05, "logits/chosen": -84.0, "logits/rejected": -82.5, "logps/chosen": -628.0, "logps/rejected": -532.0, "loss": 0.6973, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 2.640625, "rewards/margins": 1.171875, "rewards/rejected": 1.4765625, "step": 1380 }, { "epoch": 0.36753040719196195, "grad_norm": 42.5, "learning_rate": 8.162347964040191e-05, "logits/chosen": -81.0, "logits/rejected": -81.5, "logps/chosen": -532.0, "logps/rejected": -468.0, "loss": 0.6494, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 2.21875, "rewards/margins": 1.2578125, "rewards/rejected": 0.96484375, "step": 1390 }, { "epoch": 0.370174510840825, "grad_norm": 36.25, "learning_rate": 8.149127445795876e-05, "logits/chosen": -84.0, "logits/rejected": -82.0, "logps/chosen": -620.0, "logps/rejected": -552.0, "loss": 0.5062, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.9453125, "rewards/margins": 1.5078125, "rewards/rejected": 0.439453125, "step": 1400 }, { "epoch": 0.372818614489688, "grad_norm": 48.75, "learning_rate": 8.135906927551561e-05, "logits/chosen": -83.0, "logits/rejected": -81.5, "logps/chosen": -584.0, "logps/rejected": -490.0, "loss": 0.6889, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.7109375, "rewards/margins": 1.3046875, "rewards/rejected": 0.41015625, "step": 1410 }, { "epoch": 0.37546271813855103, "grad_norm": 39.75, "learning_rate": 8.122686409307246e-05, "logits/chosen": -83.0, "logits/rejected": -82.0, "logps/chosen": -552.0, "logps/rejected": -484.0, "loss": 0.6857, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.71875, "rewards/margins": 1.1171875, "rewards/rejected": 0.609375, "step": 1420 }, { "epoch": 0.37810682178741406, "grad_norm": 37.5, "learning_rate": 8.10946589106293e-05, "logits/chosen": -83.0, "logits/rejected": -82.5, "logps/chosen": -560.0, "logps/rejected": -466.0, "loss": 0.817, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.421875, "rewards/margins": 1.3125, "rewards/rejected": 1.1015625, "step": 1430 }, { "epoch": 0.3807509254362771, "grad_norm": 65.0, "learning_rate": 8.096245372818616e-05, "logits/chosen": -85.0, "logits/rejected": -82.0, "logps/chosen": -584.0, "logps/rejected": -482.0, "loss": 0.759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.71875, "rewards/margins": 0.88671875, "rewards/rejected": 0.8359375, "step": 1440 }, { "epoch": 0.3833950290851401, "grad_norm": 46.5, "learning_rate": 8.0830248545743e-05, "logits/chosen": -82.0, "logits/rejected": -81.0, "logps/chosen": -544.0, "logps/rejected": -476.0, "loss": 0.5766, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 2.640625, "rewards/margins": 1.375, "rewards/rejected": 1.265625, "step": 1450 }, { "epoch": 0.3860391327340032, "grad_norm": 56.5, "learning_rate": 8.069804336329984e-05, "logits/chosen": -84.0, "logits/rejected": -82.5, "logps/chosen": -592.0, "logps/rejected": -486.0, "loss": 0.8445, "rewards/accuracies": 0.65625, "rewards/chosen": 2.59375, "rewards/margins": 0.9921875, "rewards/rejected": 1.6015625, "step": 1460 }, { "epoch": 0.3886832363828662, "grad_norm": 31.0, "learning_rate": 8.056583818085668e-05, "logits/chosen": -86.0, "logits/rejected": -83.5, "logps/chosen": -560.0, "logps/rejected": -462.0, "loss": 0.6328, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.265625, "rewards/margins": 1.21875, "rewards/rejected": 1.0546875, "step": 1470 }, { "epoch": 0.39132734003172925, "grad_norm": 43.25, "learning_rate": 8.043363299841354e-05, "logits/chosen": -86.0, "logits/rejected": -83.5, "logps/chosen": -612.0, "logps/rejected": -488.0, "loss": 0.6648, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 3.203125, "rewards/margins": 1.5078125, "rewards/rejected": 1.6953125, "step": 1480 }, { "epoch": 0.3939714436805923, "grad_norm": 44.5, "learning_rate": 8.030142781597038e-05, "logits/chosen": -85.0, "logits/rejected": -84.5, "logps/chosen": -576.0, "logps/rejected": -516.0, "loss": 0.6379, "rewards/accuracies": 0.6875, "rewards/chosen": 2.671875, "rewards/margins": 1.03125, "rewards/rejected": 1.6484375, "step": 1490 }, { "epoch": 0.3966155473294553, "grad_norm": 49.75, "learning_rate": 8.016922263352724e-05, "logits/chosen": -85.5, "logits/rejected": -84.0, "logps/chosen": -572.0, "logps/rejected": -480.0, "loss": 0.6273, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.953125, "rewards/margins": 1.2890625, "rewards/rejected": 1.65625, "step": 1500 }, { "epoch": 0.39925965097831834, "grad_norm": 55.25, "learning_rate": 8.003701745108408e-05, "logits/chosen": -84.0, "logits/rejected": -83.0, "logps/chosen": -600.0, "logps/rejected": -496.0, "loss": 0.7141, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.90625, "rewards/margins": 1.125, "rewards/rejected": 1.78125, "step": 1510 }, { "epoch": 0.40190375462718136, "grad_norm": 56.25, "learning_rate": 7.990481226864094e-05, "logits/chosen": -85.5, "logits/rejected": -84.0, "logps/chosen": -584.0, "logps/rejected": -492.0, "loss": 0.6875, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.96875, "rewards/margins": 1.5703125, "rewards/rejected": 1.40625, "step": 1520 }, { "epoch": 0.40454785827604445, "grad_norm": 35.0, "learning_rate": 7.977260708619778e-05, "logits/chosen": -85.0, "logits/rejected": -85.0, "logps/chosen": -536.0, "logps/rejected": -474.0, "loss": 0.7426, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 2.359375, "rewards/margins": 0.96484375, "rewards/rejected": 1.390625, "step": 1530 }, { "epoch": 0.4071919619249075, "grad_norm": 37.25, "learning_rate": 7.964040190375464e-05, "logits/chosen": -84.0, "logits/rejected": -82.0, "logps/chosen": -496.0, "logps/rejected": -448.0, "loss": 0.7234, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.609375, "rewards/margins": 0.96484375, "rewards/rejected": 0.6484375, "step": 1540 }, { "epoch": 0.4098360655737705, "grad_norm": 47.0, "learning_rate": 7.950819672131148e-05, "logits/chosen": -85.5, "logits/rejected": -85.0, "logps/chosen": -508.0, "logps/rejected": -440.0, "loss": 0.7121, "rewards/accuracies": 0.625, "rewards/chosen": 1.859375, "rewards/margins": 0.8828125, "rewards/rejected": 0.97265625, "step": 1550 }, { "epoch": 0.41248016922263353, "grad_norm": 40.25, "learning_rate": 7.937599153886834e-05, "logits/chosen": -88.5, "logits/rejected": -87.5, "logps/chosen": -636.0, "logps/rejected": -556.0, "loss": 0.7119, "rewards/accuracies": 0.6875, "rewards/chosen": 2.59375, "rewards/margins": 1.1640625, "rewards/rejected": 1.4296875, "step": 1560 }, { "epoch": 0.41512427287149656, "grad_norm": 56.75, "learning_rate": 7.924378635642518e-05, "logits/chosen": -84.5, "logits/rejected": -82.5, "logps/chosen": -580.0, "logps/rejected": -484.0, "loss": 0.6455, "rewards/accuracies": 0.6875, "rewards/chosen": 2.15625, "rewards/margins": 1.0, "rewards/rejected": 1.1484375, "step": 1570 }, { "epoch": 0.4177683765203596, "grad_norm": 28.125, "learning_rate": 7.911158117398202e-05, "logits/chosen": -85.5, "logits/rejected": -83.5, "logps/chosen": -556.0, "logps/rejected": -498.0, "loss": 0.6822, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.25, "rewards/margins": 1.25, "rewards/rejected": 0.9921875, "step": 1580 }, { "epoch": 0.4204124801692226, "grad_norm": 47.0, "learning_rate": 7.897937599153888e-05, "logits/chosen": -82.0, "logits/rejected": -81.0, "logps/chosen": -524.0, "logps/rejected": -454.0, "loss": 0.5879, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 2.015625, "rewards/margins": 1.25, "rewards/rejected": 0.765625, "step": 1590 }, { "epoch": 0.4230565838180857, "grad_norm": 44.0, "learning_rate": 7.884717080909572e-05, "logits/chosen": -84.0, "logits/rejected": -80.5, "logps/chosen": -544.0, "logps/rejected": -432.0, "loss": 0.5338, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.453125, "rewards/margins": 1.328125, "rewards/rejected": 1.1328125, "step": 1600 }, { "epoch": 0.4257006874669487, "grad_norm": 35.0, "learning_rate": 7.871496562665256e-05, "logits/chosen": -83.0, "logits/rejected": -80.0, "logps/chosen": -552.0, "logps/rejected": -446.0, "loss": 0.7592, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 3.078125, "rewards/margins": 1.1640625, "rewards/rejected": 1.90625, "step": 1610 }, { "epoch": 0.42834479111581175, "grad_norm": 47.5, "learning_rate": 7.858276044420941e-05, "logits/chosen": -81.5, "logits/rejected": -80.0, "logps/chosen": -596.0, "logps/rejected": -480.0, "loss": 0.4873, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 3.34375, "rewards/margins": 1.953125, "rewards/rejected": 1.390625, "step": 1620 }, { "epoch": 0.4309888947646748, "grad_norm": 37.0, "learning_rate": 7.845055526176626e-05, "logits/chosen": -83.5, "logits/rejected": -82.5, "logps/chosen": -548.0, "logps/rejected": -472.0, "loss": 0.633, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 2.515625, "rewards/margins": 1.2109375, "rewards/rejected": 1.3046875, "step": 1630 }, { "epoch": 0.4336329984135378, "grad_norm": 55.5, "learning_rate": 7.83183500793231e-05, "logits/chosen": -80.0, "logits/rejected": -81.0, "logps/chosen": -532.0, "logps/rejected": -462.0, "loss": 0.5357, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.6328125, "rewards/margins": 1.6015625, "rewards/rejected": 0.033447265625, "step": 1640 }, { "epoch": 0.43627710206240083, "grad_norm": 46.75, "learning_rate": 7.818614489687996e-05, "logits/chosen": -85.5, "logits/rejected": -83.0, "logps/chosen": -636.0, "logps/rejected": -536.0, "loss": 0.924, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 2.546875, "rewards/margins": 0.80859375, "rewards/rejected": 1.734375, "step": 1650 }, { "epoch": 0.43892120571126386, "grad_norm": 57.25, "learning_rate": 7.80539397144368e-05, "logits/chosen": -84.0, "logits/rejected": -82.0, "logps/chosen": -624.0, "logps/rejected": -508.0, "loss": 0.6736, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 3.078125, "rewards/margins": 1.3359375, "rewards/rejected": 1.7421875, "step": 1660 }, { "epoch": 0.4415653093601269, "grad_norm": 66.0, "learning_rate": 7.792173453199366e-05, "logits/chosen": -83.0, "logits/rejected": -81.0, "logps/chosen": -568.0, "logps/rejected": -470.0, "loss": 0.6392, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 2.953125, "rewards/margins": 1.2734375, "rewards/rejected": 1.6796875, "step": 1670 }, { "epoch": 0.44420941300899, "grad_norm": 23.25, "learning_rate": 7.77895293495505e-05, "logits/chosen": -86.5, "logits/rejected": -84.0, "logps/chosen": -568.0, "logps/rejected": -472.0, "loss": 0.6781, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.875, "rewards/margins": 1.25, "rewards/rejected": 1.6171875, "step": 1680 }, { "epoch": 0.446853516657853, "grad_norm": 16.625, "learning_rate": 7.765732416710736e-05, "logits/chosen": -84.0, "logits/rejected": -82.5, "logps/chosen": -572.0, "logps/rejected": -492.0, "loss": 0.5377, "rewards/accuracies": 0.71875, "rewards/chosen": 2.78125, "rewards/margins": 1.359375, "rewards/rejected": 1.421875, "step": 1690 }, { "epoch": 0.44949762030671603, "grad_norm": 107.0, "learning_rate": 7.75251189846642e-05, "logits/chosen": -85.0, "logits/rejected": -83.5, "logps/chosen": -584.0, "logps/rejected": -476.0, "loss": 0.7008, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.8125, "rewards/margins": 1.1015625, "rewards/rejected": 1.7109375, "step": 1700 }, { "epoch": 0.45214172395557906, "grad_norm": 52.0, "learning_rate": 7.739291380222105e-05, "logits/chosen": -83.5, "logits/rejected": -81.0, "logps/chosen": -560.0, "logps/rejected": -458.0, "loss": 0.6969, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 2.578125, "rewards/margins": 1.203125, "rewards/rejected": 1.375, "step": 1710 }, { "epoch": 0.4547858276044421, "grad_norm": 58.0, "learning_rate": 7.72607086197779e-05, "logits/chosen": -82.5, "logits/rejected": -82.0, "logps/chosen": -524.0, "logps/rejected": -454.0, "loss": 0.6297, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.8203125, "rewards/margins": 1.15625, "rewards/rejected": 0.66796875, "step": 1720 }, { "epoch": 0.4574299312533051, "grad_norm": 44.0, "learning_rate": 7.712850343733475e-05, "logits/chosen": -87.0, "logits/rejected": -82.5, "logps/chosen": -648.0, "logps/rejected": -478.0, "loss": 0.6467, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 3.09375, "rewards/margins": 1.5546875, "rewards/rejected": 1.546875, "step": 1730 }, { "epoch": 0.46007403490216814, "grad_norm": 49.75, "learning_rate": 7.69962982548916e-05, "logits/chosen": -85.5, "logits/rejected": -84.0, "logps/chosen": -548.0, "logps/rejected": -464.0, "loss": 0.9139, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 3.015625, "rewards/margins": 1.0078125, "rewards/rejected": 2.0, "step": 1740 }, { "epoch": 0.4627181385510312, "grad_norm": 26.25, "learning_rate": 7.686409307244844e-05, "logits/chosen": -88.5, "logits/rejected": -86.0, "logps/chosen": -620.0, "logps/rejected": -544.0, "loss": 0.6174, "rewards/accuracies": 0.71875, "rewards/chosen": 2.953125, "rewards/margins": 1.5234375, "rewards/rejected": 1.4296875, "step": 1750 }, { "epoch": 0.46536224219989425, "grad_norm": 24.75, "learning_rate": 7.673188789000529e-05, "logits/chosen": -85.5, "logits/rejected": -84.0, "logps/chosen": -544.0, "logps/rejected": -486.0, "loss": 0.605, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 2.4375, "rewards/margins": 1.578125, "rewards/rejected": 0.86328125, "step": 1760 }, { "epoch": 0.4680063458487573, "grad_norm": 94.0, "learning_rate": 7.659968270756213e-05, "logits/chosen": -86.0, "logits/rejected": -86.0, "logps/chosen": -576.0, "logps/rejected": -510.0, "loss": 0.6971, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 2.1875, "rewards/margins": 1.25, "rewards/rejected": 0.9375, "step": 1770 }, { "epoch": 0.4706504494976203, "grad_norm": 60.0, "learning_rate": 7.646747752511899e-05, "logits/chosen": -88.5, "logits/rejected": -85.5, "logps/chosen": -600.0, "logps/rejected": -520.0, "loss": 0.7213, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.609375, "rewards/margins": 1.2890625, "rewards/rejected": 0.318359375, "step": 1780 }, { "epoch": 0.47329455314648333, "grad_norm": 55.25, "learning_rate": 7.633527234267583e-05, "logits/chosen": -87.5, "logits/rejected": -85.0, "logps/chosen": -608.0, "logps/rejected": -490.0, "loss": 0.6291, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.03125, "rewards/margins": 1.3828125, "rewards/rejected": 0.6484375, "step": 1790 }, { "epoch": 0.47593865679534636, "grad_norm": 38.75, "learning_rate": 7.620306716023269e-05, "logits/chosen": -88.0, "logits/rejected": -86.5, "logps/chosen": -544.0, "logps/rejected": -470.0, "loss": 0.8174, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.5234375, "rewards/margins": 0.75, "rewards/rejected": 0.7734375, "step": 1800 }, { "epoch": 0.4785827604442094, "grad_norm": 43.5, "learning_rate": 7.607086197778953e-05, "logits/chosen": -88.0, "logits/rejected": -86.0, "logps/chosen": -584.0, "logps/rejected": -516.0, "loss": 0.6514, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.890625, "rewards/margins": 1.1484375, "rewards/rejected": 0.74609375, "step": 1810 }, { "epoch": 0.48122686409307247, "grad_norm": 43.75, "learning_rate": 7.593865679534639e-05, "logits/chosen": -88.5, "logits/rejected": -87.5, "logps/chosen": -576.0, "logps/rejected": -512.0, "loss": 0.6043, "rewards/accuracies": 0.6875, "rewards/chosen": 2.890625, "rewards/margins": 1.390625, "rewards/rejected": 1.5, "step": 1820 }, { "epoch": 0.4838709677419355, "grad_norm": 69.5, "learning_rate": 7.580645161290323e-05, "logits/chosen": -86.0, "logits/rejected": -84.5, "logps/chosen": -548.0, "logps/rejected": -512.0, "loss": 0.6881, "rewards/accuracies": 0.65625, "rewards/chosen": 2.234375, "rewards/margins": 1.0625, "rewards/rejected": 1.171875, "step": 1830 }, { "epoch": 0.4865150713907985, "grad_norm": 43.5, "learning_rate": 7.567424643046008e-05, "logits/chosen": -85.5, "logits/rejected": -83.5, "logps/chosen": -604.0, "logps/rejected": -476.0, "loss": 0.7262, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.015625, "rewards/margins": 0.84765625, "rewards/rejected": 1.1640625, "step": 1840 }, { "epoch": 0.48915917503966155, "grad_norm": 58.25, "learning_rate": 7.554204124801693e-05, "logits/chosen": -86.0, "logits/rejected": -85.0, "logps/chosen": -588.0, "logps/rejected": -520.0, "loss": 0.6188, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.75, "rewards/margins": 1.234375, "rewards/rejected": 1.515625, "step": 1850 }, { "epoch": 0.4918032786885246, "grad_norm": 37.5, "learning_rate": 7.540983606557377e-05, "logits/chosen": -85.0, "logits/rejected": -84.0, "logps/chosen": -572.0, "logps/rejected": -482.0, "loss": 0.7207, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.515625, "rewards/margins": 1.1953125, "rewards/rejected": 1.3203125, "step": 1860 }, { "epoch": 0.4944473823373876, "grad_norm": 56.25, "learning_rate": 7.527763088313063e-05, "logits/chosen": -86.5, "logits/rejected": -83.0, "logps/chosen": -600.0, "logps/rejected": -480.0, "loss": 0.7768, "rewards/accuracies": 0.65625, "rewards/chosen": 2.6875, "rewards/margins": 1.0859375, "rewards/rejected": 1.59375, "step": 1870 }, { "epoch": 0.49709148598625064, "grad_norm": 66.0, "learning_rate": 7.514542570068747e-05, "logits/chosen": -85.0, "logits/rejected": -82.5, "logps/chosen": -568.0, "logps/rejected": -466.0, "loss": 0.6797, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 2.65625, "rewards/margins": 1.1484375, "rewards/rejected": 1.5078125, "step": 1880 }, { "epoch": 0.4997355896351137, "grad_norm": 40.0, "learning_rate": 7.501322051824433e-05, "logits/chosen": -86.0, "logits/rejected": -83.0, "logps/chosen": -608.0, "logps/rejected": -456.0, "loss": 0.6832, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 2.59375, "rewards/margins": 1.1484375, "rewards/rejected": 1.4375, "step": 1890 }, { "epoch": 0.5, "eval_logits/chosen": -85.5, "eval_logits/rejected": -84.0, "eval_logps/chosen": -576.0, "eval_logps/rejected": -486.0, "eval_loss": 0.7038646340370178, "eval_rewards/accuracies": 0.6686707139015198, "eval_rewards/chosen": 2.765625, "eval_rewards/margins": 1.1171875, "eval_rewards/rejected": 1.6484375, "eval_runtime": 998.9738, "eval_samples_per_second": 15.142, "eval_steps_per_second": 0.947, "step": 1891 }, { "epoch": 0.5023796932839767, "grad_norm": 58.0, "learning_rate": 7.488101533580117e-05, "logits/chosen": -85.0, "logits/rejected": -83.0, "logps/chosen": -572.0, "logps/rejected": -464.0, "loss": 0.5906, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.703125, "rewards/margins": 1.2421875, "rewards/rejected": 1.4609375, "step": 1900 }, { "epoch": 0.5050237969328397, "grad_norm": 52.75, "learning_rate": 7.474881015335801e-05, "logits/chosen": -84.5, "logits/rejected": -85.0, "logps/chosen": -504.0, "logps/rejected": -474.0, "loss": 0.7234, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 2.609375, "rewards/margins": 1.0234375, "rewards/rejected": 1.578125, "step": 1910 }, { "epoch": 0.5076679005817029, "grad_norm": 45.5, "learning_rate": 7.461660497091487e-05, "logits/chosen": -87.0, "logits/rejected": -84.5, "logps/chosen": -592.0, "logps/rejected": -516.0, "loss": 0.648, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 2.4375, "rewards/margins": 1.1484375, "rewards/rejected": 1.28125, "step": 1920 }, { "epoch": 0.5103120042305659, "grad_norm": 32.5, "learning_rate": 7.448439978847171e-05, "logits/chosen": -86.5, "logits/rejected": -84.0, "logps/chosen": -528.0, "logps/rejected": -464.0, "loss": 0.583, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 2.375, "rewards/margins": 1.2890625, "rewards/rejected": 1.0859375, "step": 1930 }, { "epoch": 0.5129561078794289, "grad_norm": 50.75, "learning_rate": 7.435219460602855e-05, "logits/chosen": -86.0, "logits/rejected": -84.5, "logps/chosen": -592.0, "logps/rejected": -524.0, "loss": 0.78, "rewards/accuracies": 0.625, "rewards/chosen": 2.328125, "rewards/margins": 0.890625, "rewards/rejected": 1.4375, "step": 1940 }, { "epoch": 0.5156002115282919, "grad_norm": 45.75, "learning_rate": 7.421998942358541e-05, "logits/chosen": -87.5, "logits/rejected": -84.0, "logps/chosen": -556.0, "logps/rejected": -488.0, "loss": 0.7254, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 2.453125, "rewards/margins": 1.046875, "rewards/rejected": 1.3984375, "step": 1950 }, { "epoch": 0.518244315177155, "grad_norm": 53.75, "learning_rate": 7.408778424114225e-05, "logits/chosen": -87.5, "logits/rejected": -87.0, "logps/chosen": -592.0, "logps/rejected": -508.0, "loss": 0.5874, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.4375, "rewards/margins": 1.390625, "rewards/rejected": 1.046875, "step": 1960 }, { "epoch": 0.520888418826018, "grad_norm": 48.0, "learning_rate": 7.395557905869911e-05, "logits/chosen": -86.5, "logits/rejected": -83.0, "logps/chosen": -584.0, "logps/rejected": -448.0, "loss": 0.5182, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 2.578125, "rewards/margins": 1.453125, "rewards/rejected": 1.125, "step": 1970 }, { "epoch": 0.523532522474881, "grad_norm": 62.75, "learning_rate": 7.382337387625595e-05, "logits/chosen": -85.0, "logits/rejected": -84.0, "logps/chosen": -536.0, "logps/rejected": -448.0, "loss": 0.6439, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 2.484375, "rewards/margins": 1.2109375, "rewards/rejected": 1.2734375, "step": 1980 }, { "epoch": 0.526176626123744, "grad_norm": 38.75, "learning_rate": 7.369116869381281e-05, "logits/chosen": -87.0, "logits/rejected": -83.0, "logps/chosen": -568.0, "logps/rejected": -452.0, "loss": 0.6145, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 2.84375, "rewards/margins": 1.5625, "rewards/rejected": 1.28125, "step": 1990 }, { "epoch": 0.5288207297726071, "grad_norm": 23.375, "learning_rate": 7.355896351136965e-05, "logits/chosen": -83.5, "logits/rejected": -81.5, "logps/chosen": -536.0, "logps/rejected": -462.0, "loss": 0.6359, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.75, "rewards/margins": 1.328125, "rewards/rejected": 1.421875, "step": 2000 }, { "epoch": 0.5314648334214701, "grad_norm": 32.75, "learning_rate": 7.34267583289265e-05, "logits/chosen": -86.0, "logits/rejected": -83.0, "logps/chosen": -564.0, "logps/rejected": -446.0, "loss": 0.6238, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 2.484375, "rewards/margins": 1.3671875, "rewards/rejected": 1.125, "step": 2010 }, { "epoch": 0.5341089370703331, "grad_norm": 46.75, "learning_rate": 7.329455314648335e-05, "logits/chosen": -87.0, "logits/rejected": -85.0, "logps/chosen": -588.0, "logps/rejected": -484.0, "loss": 0.6934, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.5, "rewards/margins": 1.109375, "rewards/rejected": 1.390625, "step": 2020 }, { "epoch": 0.5367530407191962, "grad_norm": 54.25, "learning_rate": 7.316234796404019e-05, "logits/chosen": -84.5, "logits/rejected": -84.5, "logps/chosen": -548.0, "logps/rejected": -488.0, "loss": 0.717, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 2.34375, "rewards/margins": 0.96484375, "rewards/rejected": 1.375, "step": 2030 }, { "epoch": 0.5393971443680592, "grad_norm": 48.0, "learning_rate": 7.303014278159705e-05, "logits/chosen": -86.0, "logits/rejected": -84.5, "logps/chosen": -568.0, "logps/rejected": -494.0, "loss": 0.6324, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 3.03125, "rewards/margins": 1.390625, "rewards/rejected": 1.640625, "step": 2040 }, { "epoch": 0.5420412480169222, "grad_norm": 83.5, "learning_rate": 7.289793759915389e-05, "logits/chosen": -88.0, "logits/rejected": -84.5, "logps/chosen": -640.0, "logps/rejected": -536.0, "loss": 0.7365, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 2.5, "rewards/margins": 1.1640625, "rewards/rejected": 1.328125, "step": 2050 }, { "epoch": 0.5446853516657852, "grad_norm": 52.75, "learning_rate": 7.276573241671075e-05, "logits/chosen": -85.0, "logits/rejected": -85.5, "logps/chosen": -580.0, "logps/rejected": -548.0, "loss": 0.8979, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 2.3125, "rewards/margins": 1.0703125, "rewards/rejected": 1.2421875, "step": 2060 }, { "epoch": 0.5473294553146484, "grad_norm": 53.0, "learning_rate": 7.263352723426759e-05, "logits/chosen": -88.5, "logits/rejected": -84.0, "logps/chosen": -580.0, "logps/rejected": -502.0, "loss": 0.7537, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.109375, "rewards/margins": 0.76953125, "rewards/rejected": 1.34375, "step": 2070 }, { "epoch": 0.5499735589635114, "grad_norm": 32.25, "learning_rate": 7.250132205182443e-05, "logits/chosen": -87.5, "logits/rejected": -84.5, "logps/chosen": -612.0, "logps/rejected": -528.0, "loss": 0.6129, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.609375, "rewards/margins": 1.3046875, "rewards/rejected": 1.3046875, "step": 2080 }, { "epoch": 0.5526176626123744, "grad_norm": 33.5, "learning_rate": 7.236911686938128e-05, "logits/chosen": -85.5, "logits/rejected": -84.5, "logps/chosen": -588.0, "logps/rejected": -472.0, "loss": 0.5652, "rewards/accuracies": 0.6875, "rewards/chosen": 3.078125, "rewards/margins": 1.65625, "rewards/rejected": 1.4140625, "step": 2090 }, { "epoch": 0.5552617662612375, "grad_norm": 50.75, "learning_rate": 7.223691168693813e-05, "logits/chosen": -86.0, "logits/rejected": -85.0, "logps/chosen": -564.0, "logps/rejected": -466.0, "loss": 0.5957, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 3.125, "rewards/margins": 1.46875, "rewards/rejected": 1.6484375, "step": 2100 }, { "epoch": 0.5579058699101005, "grad_norm": 42.25, "learning_rate": 7.210470650449498e-05, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -580.0, "logps/rejected": -492.0, "loss": 0.7441, "rewards/accuracies": 0.625, "rewards/chosen": 2.28125, "rewards/margins": 1.0703125, "rewards/rejected": 1.2109375, "step": 2110 }, { "epoch": 0.5605499735589635, "grad_norm": 40.5, "learning_rate": 7.197250132205183e-05, "logits/chosen": -87.0, "logits/rejected": -83.0, "logps/chosen": -592.0, "logps/rejected": -464.0, "loss": 0.6855, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 2.359375, "rewards/margins": 0.87890625, "rewards/rejected": 1.4765625, "step": 2120 }, { "epoch": 0.5631940772078265, "grad_norm": 71.5, "learning_rate": 7.184029613960867e-05, "logits/chosen": -84.0, "logits/rejected": -82.5, "logps/chosen": -532.0, "logps/rejected": -460.0, "loss": 0.8223, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 2.21875, "rewards/margins": 0.98828125, "rewards/rejected": 1.2265625, "step": 2130 }, { "epoch": 0.5658381808566896, "grad_norm": 54.0, "learning_rate": 7.170809095716552e-05, "logits/chosen": -86.5, "logits/rejected": -82.0, "logps/chosen": -596.0, "logps/rejected": -480.0, "loss": 0.7664, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 2.640625, "rewards/margins": 1.140625, "rewards/rejected": 1.4921875, "step": 2140 }, { "epoch": 0.5684822845055526, "grad_norm": 34.5, "learning_rate": 7.157588577472237e-05, "logits/chosen": -84.5, "logits/rejected": -80.5, "logps/chosen": -504.0, "logps/rejected": -422.0, "loss": 0.6908, "rewards/accuracies": 0.6875, "rewards/chosen": 2.8125, "rewards/margins": 1.28125, "rewards/rejected": 1.53125, "step": 2150 }, { "epoch": 0.5711263881544156, "grad_norm": 48.25, "learning_rate": 7.144368059227922e-05, "logits/chosen": -83.5, "logits/rejected": -81.0, "logps/chosen": -600.0, "logps/rejected": -510.0, "loss": 0.9252, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 3.734375, "rewards/margins": 1.1640625, "rewards/rejected": 2.5625, "step": 2160 }, { "epoch": 0.5737704918032787, "grad_norm": 84.0, "learning_rate": 7.131147540983607e-05, "logits/chosen": -85.5, "logits/rejected": -83.0, "logps/chosen": -612.0, "logps/rejected": -494.0, "loss": 0.7403, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 3.09375, "rewards/margins": 1.203125, "rewards/rejected": 1.8984375, "step": 2170 }, { "epoch": 0.5764145954521417, "grad_norm": 36.75, "learning_rate": 7.117927022739292e-05, "logits/chosen": -85.0, "logits/rejected": -83.0, "logps/chosen": -560.0, "logps/rejected": -504.0, "loss": 0.7318, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 2.359375, "rewards/margins": 0.73828125, "rewards/rejected": 1.6171875, "step": 2180 }, { "epoch": 0.5790586991010047, "grad_norm": 46.0, "learning_rate": 7.104706504494977e-05, "logits/chosen": -88.0, "logits/rejected": -84.0, "logps/chosen": -608.0, "logps/rejected": -506.0, "loss": 0.5941, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 2.046875, "rewards/margins": 1.203125, "rewards/rejected": 0.84765625, "step": 2190 }, { "epoch": 0.5817028027498677, "grad_norm": 46.0, "learning_rate": 7.091485986250662e-05, "logits/chosen": -85.5, "logits/rejected": -82.5, "logps/chosen": -600.0, "logps/rejected": -504.0, "loss": 0.666, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 2.359375, "rewards/margins": 1.2734375, "rewards/rejected": 1.09375, "step": 2200 }, { "epoch": 0.5843469063987309, "grad_norm": 39.5, "learning_rate": 7.078265468006347e-05, "logits/chosen": -87.0, "logits/rejected": -83.5, "logps/chosen": -608.0, "logps/rejected": -496.0, "loss": 0.7355, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 2.28125, "rewards/margins": 1.2109375, "rewards/rejected": 1.0703125, "step": 2210 }, { "epoch": 0.5869910100475939, "grad_norm": 40.5, "learning_rate": 7.065044949762031e-05, "logits/chosen": -86.0, "logits/rejected": -82.5, "logps/chosen": -580.0, "logps/rejected": -484.0, "loss": 0.741, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 2.1875, "rewards/margins": 1.0546875, "rewards/rejected": 1.125, "step": 2220 }, { "epoch": 0.5896351136964569, "grad_norm": 40.0, "learning_rate": 7.051824431517716e-05, "logits/chosen": -84.0, "logits/rejected": -84.5, "logps/chosen": -588.0, "logps/rejected": -516.0, "loss": 0.6184, "rewards/accuracies": 0.6875, "rewards/chosen": 2.46875, "rewards/margins": 1.2890625, "rewards/rejected": 1.1796875, "step": 2230 }, { "epoch": 0.59227921734532, "grad_norm": 34.5, "learning_rate": 7.0386039132734e-05, "logits/chosen": -84.0, "logits/rejected": -82.5, "logps/chosen": -588.0, "logps/rejected": -506.0, "loss": 0.5531, "rewards/accuracies": 0.71875, "rewards/chosen": 2.703125, "rewards/margins": 1.3828125, "rewards/rejected": 1.328125, "step": 2240 }, { "epoch": 0.594923320994183, "grad_norm": 41.75, "learning_rate": 7.025383395029086e-05, "logits/chosen": -85.0, "logits/rejected": -83.5, "logps/chosen": -564.0, "logps/rejected": -468.0, "loss": 0.56, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 2.296875, "rewards/margins": 1.484375, "rewards/rejected": 0.8125, "step": 2250 }, { "epoch": 0.597567424643046, "grad_norm": 53.75, "learning_rate": 7.01216287678477e-05, "logits/chosen": -85.0, "logits/rejected": -83.5, "logps/chosen": -580.0, "logps/rejected": -540.0, "loss": 0.6855, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.09375, "rewards/margins": 1.234375, "rewards/rejected": 0.86328125, "step": 2260 }, { "epoch": 0.600211528291909, "grad_norm": 45.0, "learning_rate": 6.998942358540456e-05, "logits/chosen": -85.5, "logits/rejected": -83.0, "logps/chosen": -596.0, "logps/rejected": -512.0, "loss": 0.793, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 2.078125, "rewards/margins": 0.78125, "rewards/rejected": 1.2890625, "step": 2270 }, { "epoch": 0.6028556319407721, "grad_norm": 93.5, "learning_rate": 6.98572184029614e-05, "logits/chosen": -86.5, "logits/rejected": -86.5, "logps/chosen": -568.0, "logps/rejected": -510.0, "loss": 0.7701, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.953125, "rewards/margins": 0.90625, "rewards/rejected": 1.046875, "step": 2280 }, { "epoch": 0.6054997355896351, "grad_norm": 55.75, "learning_rate": 6.972501322051824e-05, "logits/chosen": -87.5, "logits/rejected": -86.5, "logps/chosen": -548.0, "logps/rejected": -490.0, "loss": 0.7234, "rewards/accuracies": 0.65625, "rewards/chosen": 1.9609375, "rewards/margins": 0.9765625, "rewards/rejected": 0.984375, "step": 2290 }, { "epoch": 0.6081438392384981, "grad_norm": 42.25, "learning_rate": 6.95928080380751e-05, "logits/chosen": -86.5, "logits/rejected": -85.0, "logps/chosen": -568.0, "logps/rejected": -524.0, "loss": 0.4948, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 2.34375, "rewards/margins": 1.4375, "rewards/rejected": 0.90234375, "step": 2300 }, { "epoch": 0.6107879428873612, "grad_norm": 59.0, "learning_rate": 6.946060285563194e-05, "logits/chosen": -87.5, "logits/rejected": -84.0, "logps/chosen": -580.0, "logps/rejected": -472.0, "loss": 0.6641, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.7890625, "rewards/margins": 1.34375, "rewards/rejected": 0.447265625, "step": 2310 }, { "epoch": 0.6134320465362242, "grad_norm": 55.75, "learning_rate": 6.93283976731888e-05, "logits/chosen": -88.5, "logits/rejected": -86.0, "logps/chosen": -584.0, "logps/rejected": -502.0, "loss": 0.642, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.890625, "rewards/margins": 1.265625, "rewards/rejected": 0.62109375, "step": 2320 }, { "epoch": 0.6160761501850872, "grad_norm": 52.75, "learning_rate": 6.919619249074564e-05, "logits/chosen": -90.0, "logits/rejected": -85.5, "logps/chosen": -616.0, "logps/rejected": -510.0, "loss": 0.6572, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 1.7734375, "rewards/margins": 0.94921875, "rewards/rejected": 0.828125, "step": 2330 }, { "epoch": 0.6187202538339502, "grad_norm": 52.0, "learning_rate": 6.90639873083025e-05, "logits/chosen": -86.5, "logits/rejected": -85.5, "logps/chosen": -608.0, "logps/rejected": -498.0, "loss": 0.6424, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 2.109375, "rewards/margins": 1.03125, "rewards/rejected": 1.0703125, "step": 2340 }, { "epoch": 0.6213643574828134, "grad_norm": 50.0, "learning_rate": 6.893178212585934e-05, "logits/chosen": -86.0, "logits/rejected": -83.0, "logps/chosen": -608.0, "logps/rejected": -528.0, "loss": 0.6031, "rewards/accuracies": 0.65625, "rewards/chosen": 2.234375, "rewards/margins": 1.328125, "rewards/rejected": 0.90234375, "step": 2350 }, { "epoch": 0.6240084611316764, "grad_norm": 53.75, "learning_rate": 6.87995769434162e-05, "logits/chosen": -87.0, "logits/rejected": -83.0, "logps/chosen": -572.0, "logps/rejected": -504.0, "loss": 0.6789, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.015625, "rewards/margins": 1.0078125, "rewards/rejected": 1.0, "step": 2360 }, { "epoch": 0.6266525647805394, "grad_norm": 33.75, "learning_rate": 6.866737176097304e-05, "logits/chosen": -86.5, "logits/rejected": -84.5, "logps/chosen": -576.0, "logps/rejected": -504.0, "loss": 0.5207, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 2.21875, "rewards/margins": 1.375, "rewards/rejected": 0.8359375, "step": 2370 }, { "epoch": 0.6292966684294025, "grad_norm": 62.0, "learning_rate": 6.853516657852988e-05, "logits/chosen": -86.0, "logits/rejected": -83.0, "logps/chosen": -600.0, "logps/rejected": -502.0, "loss": 0.8082, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.6953125, "rewards/margins": 0.84375, "rewards/rejected": 0.8515625, "step": 2380 }, { "epoch": 0.6319407720782655, "grad_norm": 72.0, "learning_rate": 6.840296139608672e-05, "logits/chosen": -87.0, "logits/rejected": -87.0, "logps/chosen": -592.0, "logps/rejected": -510.0, "loss": 0.759, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.75, "rewards/margins": 0.93359375, "rewards/rejected": 0.81640625, "step": 2390 }, { "epoch": 0.6345848757271285, "grad_norm": 53.0, "learning_rate": 6.827075621364358e-05, "logits/chosen": -87.5, "logits/rejected": -85.0, "logps/chosen": -576.0, "logps/rejected": -474.0, "loss": 0.6809, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.8671875, "rewards/margins": 1.140625, "rewards/rejected": 0.72265625, "step": 2400 }, { "epoch": 0.6372289793759915, "grad_norm": 61.0, "learning_rate": 6.813855103120042e-05, "logits/chosen": -85.5, "logits/rejected": -85.0, "logps/chosen": -600.0, "logps/rejected": -552.0, "loss": 0.9367, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 2.828125, "rewards/margins": 0.87109375, "rewards/rejected": 1.9609375, "step": 2410 }, { "epoch": 0.6398730830248546, "grad_norm": 46.25, "learning_rate": 6.800634584875727e-05, "logits/chosen": -84.5, "logits/rejected": -84.0, "logps/chosen": -536.0, "logps/rejected": -466.0, "loss": 0.6432, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.421875, "rewards/margins": 1.3359375, "rewards/rejected": 1.0859375, "step": 2420 }, { "epoch": 0.6425171866737176, "grad_norm": 53.75, "learning_rate": 6.787414066631412e-05, "logits/chosen": -83.0, "logits/rejected": -82.0, "logps/chosen": -540.0, "logps/rejected": -478.0, "loss": 0.6479, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.09375, "rewards/margins": 1.15625, "rewards/rejected": 0.9453125, "step": 2430 }, { "epoch": 0.6451612903225806, "grad_norm": 46.25, "learning_rate": 6.774193548387096e-05, "logits/chosen": -86.0, "logits/rejected": -85.0, "logps/chosen": -588.0, "logps/rejected": -494.0, "loss": 0.607, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 2.890625, "rewards/margins": 1.2265625, "rewards/rejected": 1.6640625, "step": 2440 }, { "epoch": 0.6478053939714437, "grad_norm": 83.0, "learning_rate": 6.760973030142782e-05, "logits/chosen": -85.0, "logits/rejected": -81.0, "logps/chosen": -568.0, "logps/rejected": -464.0, "loss": 0.5593, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 2.53125, "rewards/margins": 1.4375, "rewards/rejected": 1.09375, "step": 2450 }, { "epoch": 0.6504494976203067, "grad_norm": 75.0, "learning_rate": 6.747752511898466e-05, "logits/chosen": -83.5, "logits/rejected": -81.5, "logps/chosen": -552.0, "logps/rejected": -438.0, "loss": 0.6492, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.734375, "rewards/margins": 1.3046875, "rewards/rejected": 1.421875, "step": 2460 }, { "epoch": 0.6530936012691697, "grad_norm": 36.25, "learning_rate": 6.734531993654152e-05, "logits/chosen": -84.0, "logits/rejected": -84.0, "logps/chosen": -548.0, "logps/rejected": -470.0, "loss": 0.6166, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.59375, "rewards/margins": 1.1328125, "rewards/rejected": 1.4609375, "step": 2470 }, { "epoch": 0.6557377049180327, "grad_norm": 50.25, "learning_rate": 6.721311475409836e-05, "logits/chosen": -85.0, "logits/rejected": -84.0, "logps/chosen": -560.0, "logps/rejected": -478.0, "loss": 0.7475, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.171875, "rewards/margins": 0.87890625, "rewards/rejected": 1.2890625, "step": 2480 }, { "epoch": 0.6583818085668959, "grad_norm": 51.5, "learning_rate": 6.708090957165522e-05, "logits/chosen": -87.5, "logits/rejected": -85.0, "logps/chosen": -572.0, "logps/rejected": -516.0, "loss": 0.7842, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 2.078125, "rewards/margins": 0.9609375, "rewards/rejected": 1.1171875, "step": 2490 }, { "epoch": 0.6610259122157589, "grad_norm": 32.0, "learning_rate": 6.694870438921206e-05, "logits/chosen": -86.0, "logits/rejected": -82.5, "logps/chosen": -576.0, "logps/rejected": -506.0, "loss": 0.5996, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.9453125, "rewards/margins": 1.390625, "rewards/rejected": 0.5546875, "step": 2500 }, { "epoch": 0.6636700158646219, "grad_norm": 77.0, "learning_rate": 6.681649920676892e-05, "logits/chosen": -85.5, "logits/rejected": -83.5, "logps/chosen": -544.0, "logps/rejected": -486.0, "loss": 0.8594, "rewards/accuracies": 0.59375, "rewards/chosen": 1.5, "rewards/margins": 0.671875, "rewards/rejected": 0.828125, "step": 2510 }, { "epoch": 0.666314119513485, "grad_norm": 46.0, "learning_rate": 6.668429402432576e-05, "logits/chosen": -85.5, "logits/rejected": -84.0, "logps/chosen": -564.0, "logps/rejected": -464.0, "loss": 0.6217, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 2.25, "rewards/margins": 1.1484375, "rewards/rejected": 1.1015625, "step": 2520 }, { "epoch": 0.668958223162348, "grad_norm": 30.5, "learning_rate": 6.65520888418826e-05, "logits/chosen": -84.5, "logits/rejected": -81.0, "logps/chosen": -588.0, "logps/rejected": -472.0, "loss": 0.6396, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 2.765625, "rewards/margins": 1.359375, "rewards/rejected": 1.3984375, "step": 2530 }, { "epoch": 0.671602326811211, "grad_norm": 39.5, "learning_rate": 6.641988365943945e-05, "logits/chosen": -82.5, "logits/rejected": -81.0, "logps/chosen": -576.0, "logps/rejected": -478.0, "loss": 0.7891, "rewards/accuracies": 0.65625, "rewards/chosen": 2.796875, "rewards/margins": 0.87109375, "rewards/rejected": 1.9296875, "step": 2540 }, { "epoch": 0.674246430460074, "grad_norm": 41.25, "learning_rate": 6.62876784769963e-05, "logits/chosen": -86.5, "logits/rejected": -82.5, "logps/chosen": -560.0, "logps/rejected": -482.0, "loss": 0.7059, "rewards/accuracies": 0.6875, "rewards/chosen": 2.359375, "rewards/margins": 0.9296875, "rewards/rejected": 1.421875, "step": 2550 }, { "epoch": 0.6768905341089371, "grad_norm": 101.5, "learning_rate": 6.615547329455315e-05, "logits/chosen": -86.0, "logits/rejected": -82.5, "logps/chosen": -584.0, "logps/rejected": -442.0, "loss": 0.625, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 2.125, "rewards/margins": 1.2890625, "rewards/rejected": 0.83203125, "step": 2560 }, { "epoch": 0.6795346377578001, "grad_norm": 65.0, "learning_rate": 6.602326811210999e-05, "logits/chosen": -86.0, "logits/rejected": -84.0, "logps/chosen": -560.0, "logps/rejected": -482.0, "loss": 0.7088, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.203125, "rewards/margins": 0.93359375, "rewards/rejected": 1.265625, "step": 2570 }, { "epoch": 0.6821787414066631, "grad_norm": 30.5, "learning_rate": 6.589106292966685e-05, "logits/chosen": -88.0, "logits/rejected": -86.5, "logps/chosen": -660.0, "logps/rejected": -564.0, "loss": 0.6074, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 3.03125, "rewards/margins": 1.28125, "rewards/rejected": 1.75, "step": 2580 }, { "epoch": 0.6848228450555262, "grad_norm": 57.75, "learning_rate": 6.575885774722369e-05, "logits/chosen": -87.0, "logits/rejected": -85.0, "logps/chosen": -596.0, "logps/rejected": -512.0, "loss": 0.7473, "rewards/accuracies": 0.65625, "rewards/chosen": 2.421875, "rewards/margins": 1.1484375, "rewards/rejected": 1.265625, "step": 2590 }, { "epoch": 0.6874669487043892, "grad_norm": 38.75, "learning_rate": 6.562665256478054e-05, "logits/chosen": -86.5, "logits/rejected": -85.5, "logps/chosen": -580.0, "logps/rejected": -520.0, "loss": 0.6422, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 1.84375, "rewards/margins": 1.0078125, "rewards/rejected": 0.8359375, "step": 2600 }, { "epoch": 0.6901110523532522, "grad_norm": 49.75, "learning_rate": 6.549444738233739e-05, "logits/chosen": -86.0, "logits/rejected": -83.0, "logps/chosen": -600.0, "logps/rejected": -474.0, "loss": 0.7418, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 1.6796875, "rewards/margins": 0.87890625, "rewards/rejected": 0.80078125, "step": 2610 }, { "epoch": 0.6927551560021152, "grad_norm": 58.0, "learning_rate": 6.536224219989424e-05, "logits/chosen": -84.5, "logits/rejected": -84.0, "logps/chosen": -576.0, "logps/rejected": -508.0, "loss": 0.5819, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.90625, "rewards/margins": 1.1640625, "rewards/rejected": 0.73828125, "step": 2620 }, { "epoch": 0.6953992596509783, "grad_norm": 45.75, "learning_rate": 6.523003701745109e-05, "logits/chosen": -87.0, "logits/rejected": -85.0, "logps/chosen": -604.0, "logps/rejected": -502.0, "loss": 0.6707, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 2.46875, "rewards/margins": 1.234375, "rewards/rejected": 1.2265625, "step": 2630 }, { "epoch": 0.6980433632998414, "grad_norm": 34.0, "learning_rate": 6.509783183500794e-05, "logits/chosen": -85.5, "logits/rejected": -84.5, "logps/chosen": -540.0, "logps/rejected": -464.0, "loss": 0.7521, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 2.34375, "rewards/margins": 0.78515625, "rewards/rejected": 1.5625, "step": 2640 }, { "epoch": 0.7006874669487044, "grad_norm": 32.75, "learning_rate": 6.496562665256479e-05, "logits/chosen": -85.0, "logits/rejected": -83.5, "logps/chosen": -572.0, "logps/rejected": -462.0, "loss": 0.5445, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 2.515625, "rewards/margins": 1.4921875, "rewards/rejected": 1.0234375, "step": 2650 }, { "epoch": 0.7033315705975675, "grad_norm": 60.0, "learning_rate": 6.483342147012164e-05, "logits/chosen": -84.0, "logits/rejected": -86.5, "logps/chosen": -588.0, "logps/rejected": -512.0, "loss": 0.59, "rewards/accuracies": 0.6875, "rewards/chosen": 2.546875, "rewards/margins": 1.3046875, "rewards/rejected": 1.2421875, "step": 2660 }, { "epoch": 0.7059756742464305, "grad_norm": 46.25, "learning_rate": 6.470121628767848e-05, "logits/chosen": -85.5, "logits/rejected": -85.0, "logps/chosen": -552.0, "logps/rejected": -494.0, "loss": 0.6025, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.390625, "rewards/margins": 1.078125, "rewards/rejected": 1.3046875, "step": 2670 }, { "epoch": 0.7086197778952935, "grad_norm": 56.25, "learning_rate": 6.456901110523533e-05, "logits/chosen": -83.0, "logits/rejected": -81.0, "logps/chosen": -508.0, "logps/rejected": -424.0, "loss": 0.5965, "rewards/accuracies": 0.71875, "rewards/chosen": 2.109375, "rewards/margins": 1.375, "rewards/rejected": 0.73046875, "step": 2680 }, { "epoch": 0.7112638815441565, "grad_norm": 41.25, "learning_rate": 6.443680592279217e-05, "logits/chosen": -84.0, "logits/rejected": -86.5, "logps/chosen": -568.0, "logps/rejected": -548.0, "loss": 0.7338, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 2.265625, "rewards/margins": 1.1171875, "rewards/rejected": 1.15625, "step": 2690 }, { "epoch": 0.7139079851930196, "grad_norm": 86.5, "learning_rate": 6.430460074034903e-05, "logits/chosen": -87.5, "logits/rejected": -85.0, "logps/chosen": -596.0, "logps/rejected": -532.0, "loss": 0.692, "rewards/accuracies": 0.6875, "rewards/chosen": 2.15625, "rewards/margins": 1.234375, "rewards/rejected": 0.91796875, "step": 2700 }, { "epoch": 0.7165520888418826, "grad_norm": 31.5, "learning_rate": 6.417239555790587e-05, "logits/chosen": -90.0, "logits/rejected": -87.0, "logps/chosen": -572.0, "logps/rejected": -492.0, "loss": 0.5828, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 2.25, "rewards/margins": 1.375, "rewards/rejected": 0.87109375, "step": 2710 }, { "epoch": 0.7191961924907456, "grad_norm": 51.5, "learning_rate": 6.404019037546271e-05, "logits/chosen": -91.5, "logits/rejected": -88.0, "logps/chosen": -652.0, "logps/rejected": -560.0, "loss": 0.6299, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.5625, "rewards/margins": 1.296875, "rewards/rejected": 1.2578125, "step": 2720 }, { "epoch": 0.7218402961396087, "grad_norm": 58.5, "learning_rate": 6.390798519301957e-05, "logits/chosen": -86.5, "logits/rejected": -85.0, "logps/chosen": -568.0, "logps/rejected": -474.0, "loss": 0.5832, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 1.7734375, "rewards/margins": 1.078125, "rewards/rejected": 0.69921875, "step": 2730 }, { "epoch": 0.7244843997884717, "grad_norm": 47.25, "learning_rate": 6.377578001057641e-05, "logits/chosen": -86.0, "logits/rejected": -84.0, "logps/chosen": -564.0, "logps/rejected": -490.0, "loss": 0.7086, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.421875, "rewards/margins": 1.0546875, "rewards/rejected": 1.359375, "step": 2740 }, { "epoch": 0.7271285034373347, "grad_norm": 35.25, "learning_rate": 6.364357482813327e-05, "logits/chosen": -87.5, "logits/rejected": -85.0, "logps/chosen": -576.0, "logps/rejected": -454.0, "loss": 0.551, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 2.40625, "rewards/margins": 1.234375, "rewards/rejected": 1.1875, "step": 2750 }, { "epoch": 0.7297726070861977, "grad_norm": 40.5, "learning_rate": 6.351136964569011e-05, "logits/chosen": -85.0, "logits/rejected": -84.5, "logps/chosen": -564.0, "logps/rejected": -492.0, "loss": 0.8031, "rewards/accuracies": 0.625, "rewards/chosen": 2.25, "rewards/margins": 0.76171875, "rewards/rejected": 1.484375, "step": 2760 }, { "epoch": 0.7324167107350608, "grad_norm": 82.5, "learning_rate": 6.337916446324697e-05, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -608.0, "logps/rejected": -532.0, "loss": 0.76, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 2.859375, "rewards/margins": 1.2578125, "rewards/rejected": 1.6015625, "step": 2770 }, { "epoch": 0.7350608143839239, "grad_norm": 49.0, "learning_rate": 6.324695928080381e-05, "logits/chosen": -85.0, "logits/rejected": -83.5, "logps/chosen": -576.0, "logps/rejected": -548.0, "loss": 0.7682, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 2.375, "rewards/margins": 0.80078125, "rewards/rejected": 1.578125, "step": 2780 }, { "epoch": 0.7377049180327869, "grad_norm": 41.25, "learning_rate": 6.311475409836067e-05, "logits/chosen": -85.0, "logits/rejected": -82.5, "logps/chosen": -524.0, "logps/rejected": -426.0, "loss": 0.5453, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.7734375, "rewards/margins": 1.1171875, "rewards/rejected": 0.66015625, "step": 2790 }, { "epoch": 0.74034902168165, "grad_norm": 46.5, "learning_rate": 6.298254891591751e-05, "logits/chosen": -88.0, "logits/rejected": -85.5, "logps/chosen": -600.0, "logps/rejected": -506.0, "loss": 0.6568, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.109375, "rewards/margins": 1.2421875, "rewards/rejected": 0.86328125, "step": 2800 }, { "epoch": 0.742993125330513, "grad_norm": 44.25, "learning_rate": 6.285034373347437e-05, "logits/chosen": -85.0, "logits/rejected": -83.5, "logps/chosen": -568.0, "logps/rejected": -464.0, "loss": 0.6092, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.0625, "rewards/margins": 1.375, "rewards/rejected": 0.6953125, "step": 2810 }, { "epoch": 0.745637228979376, "grad_norm": 49.0, "learning_rate": 6.271813855103121e-05, "logits/chosen": -86.0, "logits/rejected": -83.5, "logps/chosen": -596.0, "logps/rejected": -540.0, "loss": 0.5965, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 2.328125, "rewards/margins": 1.359375, "rewards/rejected": 0.97265625, "step": 2820 }, { "epoch": 0.748281332628239, "grad_norm": 40.25, "learning_rate": 6.258593336858805e-05, "logits/chosen": -86.0, "logits/rejected": -83.5, "logps/chosen": -616.0, "logps/rejected": -472.0, "loss": 0.5998, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 2.640625, "rewards/margins": 1.296875, "rewards/rejected": 1.34375, "step": 2830 }, { "epoch": 0.7509254362771021, "grad_norm": 31.875, "learning_rate": 6.24537281861449e-05, "logits/chosen": -84.0, "logits/rejected": -82.5, "logps/chosen": -528.0, "logps/rejected": -478.0, "loss": 0.6848, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 2.296875, "rewards/margins": 1.21875, "rewards/rejected": 1.078125, "step": 2840 }, { "epoch": 0.7535695399259651, "grad_norm": 41.75, "learning_rate": 6.232152300370174e-05, "logits/chosen": -84.5, "logits/rejected": -83.5, "logps/chosen": -600.0, "logps/rejected": -498.0, "loss": 0.6027, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 2.59375, "rewards/margins": 1.1640625, "rewards/rejected": 1.421875, "step": 2850 }, { "epoch": 0.7562136435748281, "grad_norm": 23.25, "learning_rate": 6.218931782125859e-05, "logits/chosen": -86.5, "logits/rejected": -83.0, "logps/chosen": -620.0, "logps/rejected": -504.0, "loss": 0.5375, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 2.84375, "rewards/margins": 1.3984375, "rewards/rejected": 1.4375, "step": 2860 }, { "epoch": 0.7588577472236911, "grad_norm": 78.0, "learning_rate": 6.205711263881544e-05, "logits/chosen": -87.0, "logits/rejected": -84.5, "logps/chosen": -600.0, "logps/rejected": -516.0, "loss": 0.6531, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 2.375, "rewards/margins": 1.28125, "rewards/rejected": 1.1015625, "step": 2870 }, { "epoch": 0.7615018508725542, "grad_norm": 52.5, "learning_rate": 6.192490745637229e-05, "logits/chosen": -82.5, "logits/rejected": -81.5, "logps/chosen": -524.0, "logps/rejected": -452.0, "loss": 0.6633, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.140625, "rewards/margins": 1.109375, "rewards/rejected": 1.03125, "step": 2880 }, { "epoch": 0.7641459545214172, "grad_norm": 38.0, "learning_rate": 6.179270227392913e-05, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -612.0, "logps/rejected": -512.0, "loss": 0.5768, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 2.328125, "rewards/margins": 1.4921875, "rewards/rejected": 0.8359375, "step": 2890 }, { "epoch": 0.7667900581702802, "grad_norm": 33.5, "learning_rate": 6.166049709148599e-05, "logits/chosen": -89.0, "logits/rejected": -85.5, "logps/chosen": -600.0, "logps/rejected": -516.0, "loss": 0.6641, "rewards/accuracies": 0.6875, "rewards/chosen": 2.546875, "rewards/margins": 1.046875, "rewards/rejected": 1.5, "step": 2900 }, { "epoch": 0.7694341618191433, "grad_norm": 51.75, "learning_rate": 6.152829190904283e-05, "logits/chosen": -87.5, "logits/rejected": -83.5, "logps/chosen": -568.0, "logps/rejected": -472.0, "loss": 0.6891, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 2.59375, "rewards/margins": 1.0703125, "rewards/rejected": 1.5234375, "step": 2910 }, { "epoch": 0.7720782654680064, "grad_norm": 50.5, "learning_rate": 6.139608672659969e-05, "logits/chosen": -88.5, "logits/rejected": -83.5, "logps/chosen": -556.0, "logps/rejected": -460.0, "loss": 0.6221, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 2.328125, "rewards/margins": 1.1484375, "rewards/rejected": 1.1796875, "step": 2920 }, { "epoch": 0.7747223691168694, "grad_norm": 42.75, "learning_rate": 6.126388154415653e-05, "logits/chosen": -86.0, "logits/rejected": -84.0, "logps/chosen": -580.0, "logps/rejected": -464.0, "loss": 0.6721, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.515625, "rewards/margins": 1.3359375, "rewards/rejected": 1.1875, "step": 2930 }, { "epoch": 0.7773664727657325, "grad_norm": 29.125, "learning_rate": 6.113167636171339e-05, "logits/chosen": -87.0, "logits/rejected": -83.0, "logps/chosen": -596.0, "logps/rejected": -472.0, "loss": 0.6119, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 2.609375, "rewards/margins": 1.125, "rewards/rejected": 1.484375, "step": 2940 }, { "epoch": 0.7800105764145955, "grad_norm": 58.5, "learning_rate": 6.099947117927023e-05, "logits/chosen": -88.0, "logits/rejected": -84.5, "logps/chosen": -580.0, "logps/rejected": -516.0, "loss": 0.6229, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 2.8125, "rewards/margins": 1.1640625, "rewards/rejected": 1.65625, "step": 2950 }, { "epoch": 0.7826546800634585, "grad_norm": 40.75, "learning_rate": 6.086726599682708e-05, "logits/chosen": -86.0, "logits/rejected": -84.0, "logps/chosen": -556.0, "logps/rejected": -474.0, "loss": 0.5266, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 2.84375, "rewards/margins": 1.46875, "rewards/rejected": 1.375, "step": 2960 }, { "epoch": 0.7852987837123215, "grad_norm": 50.0, "learning_rate": 6.0735060814383925e-05, "logits/chosen": -86.0, "logits/rejected": -85.5, "logps/chosen": -596.0, "logps/rejected": -532.0, "loss": 0.8629, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 2.59375, "rewards/margins": 0.828125, "rewards/rejected": 1.765625, "step": 2970 }, { "epoch": 0.7879428873611846, "grad_norm": 45.5, "learning_rate": 6.060285563194078e-05, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -576.0, "logps/rejected": -494.0, "loss": 0.643, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.171875, "rewards/margins": 0.99609375, "rewards/rejected": 1.171875, "step": 2980 }, { "epoch": 0.7905869910100476, "grad_norm": 60.75, "learning_rate": 6.0470650449497624e-05, "logits/chosen": -85.5, "logits/rejected": -84.0, "logps/chosen": -600.0, "logps/rejected": -492.0, "loss": 0.752, "rewards/accuracies": 0.59375, "rewards/chosen": 2.609375, "rewards/margins": 0.875, "rewards/rejected": 1.7421875, "step": 2990 }, { "epoch": 0.7932310946589106, "grad_norm": 41.75, "learning_rate": 6.0338445267054467e-05, "logits/chosen": -86.5, "logits/rejected": -84.5, "logps/chosen": -596.0, "logps/rejected": -476.0, "loss": 0.615, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 3.03125, "rewards/margins": 1.2578125, "rewards/rejected": 1.765625, "step": 3000 }, { "epoch": 0.7958751983077736, "grad_norm": 55.75, "learning_rate": 6.020624008461132e-05, "logits/chosen": -84.5, "logits/rejected": -86.0, "logps/chosen": -604.0, "logps/rejected": -516.0, "loss": 0.5502, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 2.984375, "rewards/margins": 1.46875, "rewards/rejected": 1.5234375, "step": 3010 }, { "epoch": 0.7985193019566367, "grad_norm": 42.25, "learning_rate": 6.0074034902168166e-05, "logits/chosen": -84.5, "logits/rejected": -84.0, "logps/chosen": -572.0, "logps/rejected": -488.0, "loss": 0.632, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1875, "rewards/margins": 0.9375, "rewards/rejected": 1.25, "step": 3020 }, { "epoch": 0.8011634056054997, "grad_norm": 27.625, "learning_rate": 5.9941829719725015e-05, "logits/chosen": -86.5, "logits/rejected": -85.0, "logps/chosen": -628.0, "logps/rejected": -564.0, "loss": 0.5881, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 3.109375, "rewards/margins": 1.3359375, "rewards/rejected": 1.7734375, "step": 3030 }, { "epoch": 0.8038075092543627, "grad_norm": 48.0, "learning_rate": 5.980962453728186e-05, "logits/chosen": -83.5, "logits/rejected": -82.5, "logps/chosen": -580.0, "logps/rejected": -532.0, "loss": 0.7133, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 1.84375, "rewards/margins": 0.9140625, "rewards/rejected": 0.9296875, "step": 3040 }, { "epoch": 0.8064516129032258, "grad_norm": 40.0, "learning_rate": 5.9677419354838715e-05, "logits/chosen": -86.0, "logits/rejected": -83.5, "logps/chosen": -576.0, "logps/rejected": -458.0, "loss": 0.6015, "rewards/accuracies": 0.6875, "rewards/chosen": 1.7734375, "rewards/margins": 1.421875, "rewards/rejected": 0.35546875, "step": 3050 }, { "epoch": 0.8090957165520889, "grad_norm": 46.25, "learning_rate": 5.954521417239556e-05, "logits/chosen": -84.5, "logits/rejected": -82.0, "logps/chosen": -540.0, "logps/rejected": -464.0, "loss": 0.6297, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.328125, "rewards/margins": 1.0703125, "rewards/rejected": 1.25, "step": 3060 }, { "epoch": 0.8117398202009519, "grad_norm": 36.5, "learning_rate": 5.9413008989952414e-05, "logits/chosen": -84.0, "logits/rejected": -83.0, "logps/chosen": -580.0, "logps/rejected": -484.0, "loss": 0.6791, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.421875, "rewards/margins": 1.078125, "rewards/rejected": 1.34375, "step": 3070 }, { "epoch": 0.814383923849815, "grad_norm": 26.125, "learning_rate": 5.9280803807509256e-05, "logits/chosen": -88.0, "logits/rejected": -84.0, "logps/chosen": -588.0, "logps/rejected": -482.0, "loss": 0.6832, "rewards/accuracies": 0.65625, "rewards/chosen": 2.4375, "rewards/margins": 0.94140625, "rewards/rejected": 1.4921875, "step": 3080 }, { "epoch": 0.817028027498678, "grad_norm": 45.0, "learning_rate": 5.914859862506611e-05, "logits/chosen": -84.5, "logits/rejected": -81.5, "logps/chosen": -580.0, "logps/rejected": -480.0, "loss": 0.5865, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 2.5, "rewards/margins": 1.3046875, "rewards/rejected": 1.1875, "step": 3090 }, { "epoch": 0.819672131147541, "grad_norm": 49.25, "learning_rate": 5.9016393442622956e-05, "logits/chosen": -86.0, "logits/rejected": -82.5, "logps/chosen": -592.0, "logps/rejected": -466.0, "loss": 0.5697, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 2.65625, "rewards/margins": 1.4453125, "rewards/rejected": 1.2109375, "step": 3100 }, { "epoch": 0.822316234796404, "grad_norm": 56.0, "learning_rate": 5.8884188260179805e-05, "logits/chosen": -86.0, "logits/rejected": -83.0, "logps/chosen": -576.0, "logps/rejected": -482.0, "loss": 0.5572, "rewards/accuracies": 0.71875, "rewards/chosen": 2.703125, "rewards/margins": 1.296875, "rewards/rejected": 1.40625, "step": 3110 }, { "epoch": 0.8249603384452671, "grad_norm": 47.5, "learning_rate": 5.875198307773665e-05, "logits/chosen": -85.0, "logits/rejected": -83.0, "logps/chosen": -616.0, "logps/rejected": -506.0, "loss": 0.7951, "rewards/accuracies": 0.65625, "rewards/chosen": 2.78125, "rewards/margins": 1.078125, "rewards/rejected": 1.7109375, "step": 3120 }, { "epoch": 0.8276044420941301, "grad_norm": 55.5, "learning_rate": 5.861977789529349e-05, "logits/chosen": -83.0, "logits/rejected": -81.0, "logps/chosen": -568.0, "logps/rejected": -508.0, "loss": 0.5814, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.234375, "rewards/margins": 1.2890625, "rewards/rejected": 0.94140625, "step": 3130 }, { "epoch": 0.8302485457429931, "grad_norm": 31.0, "learning_rate": 5.848757271285035e-05, "logits/chosen": -86.0, "logits/rejected": -84.0, "logps/chosen": -584.0, "logps/rejected": -520.0, "loss": 0.6264, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.96875, "rewards/margins": 1.2265625, "rewards/rejected": 0.7421875, "step": 3140 }, { "epoch": 0.8328926493918561, "grad_norm": 44.5, "learning_rate": 5.835536753040719e-05, "logits/chosen": -89.0, "logits/rejected": -83.5, "logps/chosen": -572.0, "logps/rejected": -456.0, "loss": 0.6969, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.9296875, "rewards/margins": 0.953125, "rewards/rejected": 0.98046875, "step": 3150 }, { "epoch": 0.8355367530407192, "grad_norm": 37.25, "learning_rate": 5.8223162347964046e-05, "logits/chosen": -85.5, "logits/rejected": -85.5, "logps/chosen": -588.0, "logps/rejected": -528.0, "loss": 0.6186, "rewards/accuracies": 0.6875, "rewards/chosen": 2.484375, "rewards/margins": 1.2109375, "rewards/rejected": 1.28125, "step": 3160 }, { "epoch": 0.8381808566895822, "grad_norm": 34.25, "learning_rate": 5.809095716552089e-05, "logits/chosen": -86.0, "logits/rejected": -82.5, "logps/chosen": -560.0, "logps/rejected": -474.0, "loss": 0.5936, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 2.421875, "rewards/margins": 1.234375, "rewards/rejected": 1.1875, "step": 3170 }, { "epoch": 0.8408249603384452, "grad_norm": 67.5, "learning_rate": 5.795875198307774e-05, "logits/chosen": -86.0, "logits/rejected": -85.5, "logps/chosen": -532.0, "logps/rejected": -492.0, "loss": 0.6299, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 2.09375, "rewards/margins": 1.1484375, "rewards/rejected": 0.9453125, "step": 3180 }, { "epoch": 0.8434690639873083, "grad_norm": 37.5, "learning_rate": 5.782654680063458e-05, "logits/chosen": -88.0, "logits/rejected": -85.0, "logps/chosen": -510.0, "logps/rejected": -446.0, "loss": 0.5729, "rewards/accuracies": 0.6875, "rewards/chosen": 1.46875, "rewards/margins": 1.1640625, "rewards/rejected": 0.30859375, "step": 3190 }, { "epoch": 0.8461131676361714, "grad_norm": 49.75, "learning_rate": 5.769434161819144e-05, "logits/chosen": -88.5, "logits/rejected": -84.5, "logps/chosen": -584.0, "logps/rejected": -470.0, "loss": 0.585, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 1.890625, "rewards/margins": 1.171875, "rewards/rejected": 0.71875, "step": 3200 }, { "epoch": 0.8487572712850344, "grad_norm": 51.75, "learning_rate": 5.756213643574828e-05, "logits/chosen": -84.5, "logits/rejected": -83.5, "logps/chosen": -556.0, "logps/rejected": -494.0, "loss": 0.6111, "rewards/accuracies": 0.71875, "rewards/chosen": 1.890625, "rewards/margins": 1.15625, "rewards/rejected": 0.73046875, "step": 3210 }, { "epoch": 0.8514013749338974, "grad_norm": 43.25, "learning_rate": 5.742993125330514e-05, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -532.0, "logps/rejected": -482.0, "loss": 0.741, "rewards/accuracies": 0.625, "rewards/chosen": 1.8984375, "rewards/margins": 0.87890625, "rewards/rejected": 1.015625, "step": 3220 }, { "epoch": 0.8540454785827605, "grad_norm": 48.75, "learning_rate": 5.729772607086198e-05, "logits/chosen": -85.0, "logits/rejected": -83.5, "logps/chosen": -512.0, "logps/rejected": -440.0, "loss": 0.5918, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.6015625, "rewards/margins": 0.9921875, "rewards/rejected": 0.60546875, "step": 3230 }, { "epoch": 0.8566895822316235, "grad_norm": 113.5, "learning_rate": 5.7165520888418836e-05, "logits/chosen": -84.5, "logits/rejected": -84.0, "logps/chosen": -608.0, "logps/rejected": -502.0, "loss": 0.5135, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 2.15625, "rewards/margins": 1.34375, "rewards/rejected": 0.82421875, "step": 3240 }, { "epoch": 0.8593336858804865, "grad_norm": 35.5, "learning_rate": 5.703331570597568e-05, "logits/chosen": -84.5, "logits/rejected": -83.5, "logps/chosen": -572.0, "logps/rejected": -516.0, "loss": 0.5725, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 2.53125, "rewards/margins": 1.1875, "rewards/rejected": 1.34375, "step": 3250 }, { "epoch": 0.8619777895293496, "grad_norm": 25.75, "learning_rate": 5.690111052353253e-05, "logits/chosen": -82.5, "logits/rejected": -80.5, "logps/chosen": -576.0, "logps/rejected": -500.0, "loss": 0.6393, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 2.296875, "rewards/margins": 1.0703125, "rewards/rejected": 1.234375, "step": 3260 }, { "epoch": 0.8646218931782126, "grad_norm": 38.75, "learning_rate": 5.676890534108937e-05, "logits/chosen": -86.0, "logits/rejected": -83.5, "logps/chosen": -624.0, "logps/rejected": -508.0, "loss": 0.7127, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 2.125, "rewards/margins": 1.2265625, "rewards/rejected": 0.89453125, "step": 3270 }, { "epoch": 0.8672659968270756, "grad_norm": 49.25, "learning_rate": 5.6636700158646214e-05, "logits/chosen": -85.5, "logits/rejected": -85.0, "logps/chosen": -584.0, "logps/rejected": -560.0, "loss": 0.6763, "rewards/accuracies": 0.65625, "rewards/chosen": 1.859375, "rewards/margins": 1.140625, "rewards/rejected": 0.71875, "step": 3280 }, { "epoch": 0.8699101004759386, "grad_norm": 37.75, "learning_rate": 5.650449497620307e-05, "logits/chosen": -85.0, "logits/rejected": -84.0, "logps/chosen": -584.0, "logps/rejected": -510.0, "loss": 0.6254, "rewards/accuracies": 0.6875, "rewards/chosen": 1.6875, "rewards/margins": 1.1328125, "rewards/rejected": 0.55078125, "step": 3290 }, { "epoch": 0.8725542041248017, "grad_norm": 42.25, "learning_rate": 5.6372289793759913e-05, "logits/chosen": -88.0, "logits/rejected": -85.0, "logps/chosen": -600.0, "logps/rejected": -536.0, "loss": 0.7219, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.171875, "rewards/margins": 1.2265625, "rewards/rejected": 0.9453125, "step": 3300 }, { "epoch": 0.8751983077736647, "grad_norm": 63.0, "learning_rate": 5.624008461131677e-05, "logits/chosen": -89.0, "logits/rejected": -85.0, "logps/chosen": -564.0, "logps/rejected": -458.0, "loss": 0.5762, "rewards/accuracies": 0.71875, "rewards/chosen": 2.40625, "rewards/margins": 1.25, "rewards/rejected": 1.15625, "step": 3310 }, { "epoch": 0.8778424114225277, "grad_norm": 34.0, "learning_rate": 5.610787942887361e-05, "logits/chosen": -86.0, "logits/rejected": -84.0, "logps/chosen": -564.0, "logps/rejected": -516.0, "loss": 0.6721, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 2.5, "rewards/margins": 0.99609375, "rewards/rejected": 1.5078125, "step": 3320 }, { "epoch": 0.8804865150713908, "grad_norm": 72.0, "learning_rate": 5.597567424643046e-05, "logits/chosen": -87.5, "logits/rejected": -83.5, "logps/chosen": -576.0, "logps/rejected": -496.0, "loss": 0.627, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 2.421875, "rewards/margins": 1.0703125, "rewards/rejected": 1.3515625, "step": 3330 }, { "epoch": 0.8831306187202538, "grad_norm": 70.0, "learning_rate": 5.5843469063987305e-05, "logits/chosen": -86.5, "logits/rejected": -83.5, "logps/chosen": -564.0, "logps/rejected": -482.0, "loss": 0.6094, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 2.4375, "rewards/margins": 1.2890625, "rewards/rejected": 1.140625, "step": 3340 }, { "epoch": 0.8857747223691169, "grad_norm": 27.625, "learning_rate": 5.571126388154416e-05, "logits/chosen": -89.5, "logits/rejected": -84.0, "logps/chosen": -596.0, "logps/rejected": -460.0, "loss": 0.4938, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 2.46875, "rewards/margins": 1.71875, "rewards/rejected": 0.7421875, "step": 3350 }, { "epoch": 0.88841882601798, "grad_norm": 49.5, "learning_rate": 5.5579058699101004e-05, "logits/chosen": -85.0, "logits/rejected": -81.5, "logps/chosen": -560.0, "logps/rejected": -468.0, "loss": 0.6072, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.21875, "rewards/margins": 1.1328125, "rewards/rejected": 1.0859375, "step": 3360 }, { "epoch": 0.891062929666843, "grad_norm": 42.75, "learning_rate": 5.544685351665786e-05, "logits/chosen": -85.5, "logits/rejected": -83.5, "logps/chosen": -588.0, "logps/rejected": -492.0, "loss": 0.6506, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 2.703125, "rewards/margins": 1.1328125, "rewards/rejected": 1.578125, "step": 3370 }, { "epoch": 0.893707033315706, "grad_norm": 51.75, "learning_rate": 5.53146483342147e-05, "logits/chosen": -88.5, "logits/rejected": -87.0, "logps/chosen": -604.0, "logps/rejected": -560.0, "loss": 0.623, "rewards/accuracies": 0.65625, "rewards/chosen": 2.25, "rewards/margins": 1.140625, "rewards/rejected": 1.1171875, "step": 3380 }, { "epoch": 0.896351136964569, "grad_norm": 57.25, "learning_rate": 5.518244315177156e-05, "logits/chosen": -85.0, "logits/rejected": -84.0, "logps/chosen": -552.0, "logps/rejected": -484.0, "loss": 0.5504, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.90625, "rewards/margins": 1.1328125, "rewards/rejected": 0.77734375, "step": 3390 }, { "epoch": 0.8989952406134321, "grad_norm": 50.25, "learning_rate": 5.50502379693284e-05, "logits/chosen": -86.0, "logits/rejected": -85.0, "logps/chosen": -588.0, "logps/rejected": -512.0, "loss": 0.7295, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 2.109375, "rewards/margins": 1.1171875, "rewards/rejected": 0.9921875, "step": 3400 }, { "epoch": 0.9016393442622951, "grad_norm": 63.0, "learning_rate": 5.491803278688525e-05, "logits/chosen": -85.0, "logits/rejected": -81.5, "logps/chosen": -596.0, "logps/rejected": -474.0, "loss": 0.6602, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 2.328125, "rewards/margins": 1.09375, "rewards/rejected": 1.2421875, "step": 3410 }, { "epoch": 0.9042834479111581, "grad_norm": 46.0, "learning_rate": 5.4785827604442095e-05, "logits/chosen": -84.0, "logits/rejected": -81.0, "logps/chosen": -540.0, "logps/rejected": -468.0, "loss": 0.6525, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 2.4375, "rewards/margins": 1.0390625, "rewards/rejected": 1.390625, "step": 3420 }, { "epoch": 0.9069275515600211, "grad_norm": 41.5, "learning_rate": 5.465362242199894e-05, "logits/chosen": -83.5, "logits/rejected": -83.0, "logps/chosen": -568.0, "logps/rejected": -512.0, "loss": 0.6406, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 2.46875, "rewards/margins": 0.92578125, "rewards/rejected": 1.5390625, "step": 3430 }, { "epoch": 0.9095716552088842, "grad_norm": 45.75, "learning_rate": 5.4521417239555794e-05, "logits/chosen": -82.0, "logits/rejected": -82.0, "logps/chosen": -528.0, "logps/rejected": -474.0, "loss": 0.7375, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 2.03125, "rewards/margins": 0.89453125, "rewards/rejected": 1.1328125, "step": 3440 }, { "epoch": 0.9122157588577472, "grad_norm": 49.0, "learning_rate": 5.438921205711264e-05, "logits/chosen": -83.5, "logits/rejected": -83.0, "logps/chosen": -560.0, "logps/rejected": -510.0, "loss": 0.6795, "rewards/accuracies": 0.6875, "rewards/chosen": 2.546875, "rewards/margins": 1.0859375, "rewards/rejected": 1.4609375, "step": 3450 }, { "epoch": 0.9148598625066102, "grad_norm": 43.75, "learning_rate": 5.425700687466949e-05, "logits/chosen": -86.5, "logits/rejected": -83.0, "logps/chosen": -596.0, "logps/rejected": -516.0, "loss": 0.7072, "rewards/accuracies": 0.65625, "rewards/chosen": 2.28125, "rewards/margins": 0.96875, "rewards/rejected": 1.3125, "step": 3460 }, { "epoch": 0.9175039661554732, "grad_norm": 39.0, "learning_rate": 5.4124801692226336e-05, "logits/chosen": -83.5, "logits/rejected": -81.5, "logps/chosen": -548.0, "logps/rejected": -494.0, "loss": 0.6295, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 2.40625, "rewards/margins": 1.0234375, "rewards/rejected": 1.375, "step": 3470 }, { "epoch": 0.9201480698043363, "grad_norm": 49.75, "learning_rate": 5.3992596509783186e-05, "logits/chosen": -85.0, "logits/rejected": -83.5, "logps/chosen": -584.0, "logps/rejected": -510.0, "loss": 0.7078, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 2.671875, "rewards/margins": 1.1640625, "rewards/rejected": 1.5078125, "step": 3480 }, { "epoch": 0.9227921734531994, "grad_norm": 53.25, "learning_rate": 5.386039132734003e-05, "logits/chosen": -84.0, "logits/rejected": -82.0, "logps/chosen": -568.0, "logps/rejected": -470.0, "loss": 0.5664, "rewards/accuracies": 0.6875, "rewards/chosen": 2.34375, "rewards/margins": 1.09375, "rewards/rejected": 1.2578125, "step": 3490 }, { "epoch": 0.9254362771020624, "grad_norm": 67.0, "learning_rate": 5.3728186144896885e-05, "logits/chosen": -86.0, "logits/rejected": -85.5, "logps/chosen": -588.0, "logps/rejected": -494.0, "loss": 0.6209, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 2.375, "rewards/margins": 1.2265625, "rewards/rejected": 1.1484375, "step": 3500 }, { "epoch": 0.9280803807509255, "grad_norm": 36.5, "learning_rate": 5.359598096245373e-05, "logits/chosen": -84.5, "logits/rejected": -82.5, "logps/chosen": -576.0, "logps/rejected": -490.0, "loss": 0.5863, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 2.046875, "rewards/margins": 1.140625, "rewards/rejected": 0.90625, "step": 3510 }, { "epoch": 0.9307244843997885, "grad_norm": 33.75, "learning_rate": 5.3463775780010584e-05, "logits/chosen": -84.0, "logits/rejected": -82.5, "logps/chosen": -624.0, "logps/rejected": -496.0, "loss": 0.6331, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 2.53125, "rewards/margins": 1.234375, "rewards/rejected": 1.3046875, "step": 3520 }, { "epoch": 0.9333685880486515, "grad_norm": 54.0, "learning_rate": 5.333157059756743e-05, "logits/chosen": -84.0, "logits/rejected": -80.5, "logps/chosen": -592.0, "logps/rejected": -488.0, "loss": 0.5857, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.5, "rewards/margins": 1.1953125, "rewards/rejected": 1.3125, "step": 3530 }, { "epoch": 0.9360126916975146, "grad_norm": 59.0, "learning_rate": 5.319936541512428e-05, "logits/chosen": -84.5, "logits/rejected": -83.0, "logps/chosen": -600.0, "logps/rejected": -498.0, "loss": 0.5916, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 2.828125, "rewards/margins": 1.3984375, "rewards/rejected": 1.4296875, "step": 3540 }, { "epoch": 0.9386567953463776, "grad_norm": 45.25, "learning_rate": 5.3067160232681126e-05, "logits/chosen": -87.5, "logits/rejected": -83.5, "logps/chosen": -648.0, "logps/rejected": -498.0, "loss": 0.5161, "rewards/accuracies": 0.8125, "rewards/chosen": 3.109375, "rewards/margins": 1.65625, "rewards/rejected": 1.4453125, "step": 3550 }, { "epoch": 0.9413008989952406, "grad_norm": 37.5, "learning_rate": 5.293495505023797e-05, "logits/chosen": -84.5, "logits/rejected": -83.5, "logps/chosen": -584.0, "logps/rejected": -466.0, "loss": 0.5334, "rewards/accuracies": 0.75, "rewards/chosen": 2.515625, "rewards/margins": 1.3984375, "rewards/rejected": 1.1171875, "step": 3560 }, { "epoch": 0.9439450026441036, "grad_norm": 51.5, "learning_rate": 5.280274986779482e-05, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -592.0, "logps/rejected": -486.0, "loss": 0.7445, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 2.484375, "rewards/margins": 0.8828125, "rewards/rejected": 1.59375, "step": 3570 }, { "epoch": 0.9465891062929667, "grad_norm": 35.5, "learning_rate": 5.267054468535166e-05, "logits/chosen": -86.0, "logits/rejected": -84.5, "logps/chosen": -636.0, "logps/rejected": -532.0, "loss": 0.6068, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.625, "rewards/margins": 1.421875, "rewards/rejected": 1.1953125, "step": 3580 }, { "epoch": 0.9492332099418297, "grad_norm": 43.0, "learning_rate": 5.253833950290852e-05, "logits/chosen": -87.5, "logits/rejected": -85.0, "logps/chosen": -588.0, "logps/rejected": -488.0, "loss": 0.5402, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.546875, "rewards/margins": 1.34375, "rewards/rejected": 1.2109375, "step": 3590 }, { "epoch": 0.9518773135906927, "grad_norm": 55.5, "learning_rate": 5.240613432046536e-05, "logits/chosen": -86.0, "logits/rejected": -84.5, "logps/chosen": -584.0, "logps/rejected": -466.0, "loss": 0.6572, "rewards/accuracies": 0.6875, "rewards/chosen": 2.65625, "rewards/margins": 1.3515625, "rewards/rejected": 1.3125, "step": 3600 }, { "epoch": 0.9545214172395557, "grad_norm": 57.0, "learning_rate": 5.227392913802222e-05, "logits/chosen": -86.5, "logits/rejected": -85.0, "logps/chosen": -612.0, "logps/rejected": -548.0, "loss": 0.5459, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 2.71875, "rewards/margins": 1.4921875, "rewards/rejected": 1.21875, "step": 3610 }, { "epoch": 0.9571655208884188, "grad_norm": 36.5, "learning_rate": 5.214172395557906e-05, "logits/chosen": -86.5, "logits/rejected": -84.0, "logps/chosen": -624.0, "logps/rejected": -548.0, "loss": 0.6277, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.53125, "rewards/margins": 1.2109375, "rewards/rejected": 1.3125, "step": 3620 }, { "epoch": 0.9598096245372819, "grad_norm": 33.0, "learning_rate": 5.200951877313591e-05, "logits/chosen": -86.0, "logits/rejected": -84.0, "logps/chosen": -540.0, "logps/rejected": -486.0, "loss": 0.5771, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 2.265625, "rewards/margins": 1.171875, "rewards/rejected": 1.09375, "step": 3630 }, { "epoch": 0.9624537281861449, "grad_norm": 58.0, "learning_rate": 5.187731359069276e-05, "logits/chosen": -84.5, "logits/rejected": -84.0, "logps/chosen": -580.0, "logps/rejected": -508.0, "loss": 0.676, "rewards/accuracies": 0.6875, "rewards/chosen": 2.28125, "rewards/margins": 0.9921875, "rewards/rejected": 1.2890625, "step": 3640 }, { "epoch": 0.965097831835008, "grad_norm": 63.0, "learning_rate": 5.174510840824961e-05, "logits/chosen": -87.5, "logits/rejected": -84.5, "logps/chosen": -580.0, "logps/rejected": -488.0, "loss": 0.6699, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 2.296875, "rewards/margins": 0.97265625, "rewards/rejected": 1.328125, "step": 3650 }, { "epoch": 0.967741935483871, "grad_norm": 81.0, "learning_rate": 5.161290322580645e-05, "logits/chosen": -86.0, "logits/rejected": -84.5, "logps/chosen": -564.0, "logps/rejected": -478.0, "loss": 0.7312, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 2.59375, "rewards/margins": 1.1171875, "rewards/rejected": 1.46875, "step": 3660 }, { "epoch": 0.970386039132734, "grad_norm": 58.5, "learning_rate": 5.148069804336331e-05, "logits/chosen": -84.5, "logits/rejected": -84.5, "logps/chosen": -552.0, "logps/rejected": -520.0, "loss": 0.6258, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 2.34375, "rewards/margins": 1.09375, "rewards/rejected": 1.2578125, "step": 3670 }, { "epoch": 0.973030142781597, "grad_norm": 29.25, "learning_rate": 5.134849286092015e-05, "logits/chosen": -85.0, "logits/rejected": -84.0, "logps/chosen": -556.0, "logps/rejected": -488.0, "loss": 0.7105, "rewards/accuracies": 0.6875, "rewards/chosen": 2.140625, "rewards/margins": 1.171875, "rewards/rejected": 0.98046875, "step": 3680 }, { "epoch": 0.9756742464304601, "grad_norm": 31.25, "learning_rate": 5.1216287678477007e-05, "logits/chosen": -88.5, "logits/rejected": -85.0, "logps/chosen": -644.0, "logps/rejected": -508.0, "loss": 0.551, "rewards/accuracies": 0.6875, "rewards/chosen": 2.59375, "rewards/margins": 1.3515625, "rewards/rejected": 1.2421875, "step": 3690 }, { "epoch": 0.9783183500793231, "grad_norm": 29.875, "learning_rate": 5.108408249603385e-05, "logits/chosen": -84.0, "logits/rejected": -85.5, "logps/chosen": -536.0, "logps/rejected": -504.0, "loss": 0.7854, "rewards/accuracies": 0.59375, "rewards/chosen": 1.9609375, "rewards/margins": 0.69140625, "rewards/rejected": 1.265625, "step": 3700 }, { "epoch": 0.9809624537281861, "grad_norm": 57.5, "learning_rate": 5.095187731359069e-05, "logits/chosen": -87.0, "logits/rejected": -85.0, "logps/chosen": -616.0, "logps/rejected": -506.0, "loss": 0.6902, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 2.3125, "rewards/margins": 1.0859375, "rewards/rejected": 1.2265625, "step": 3710 }, { "epoch": 0.9836065573770492, "grad_norm": 47.75, "learning_rate": 5.081967213114754e-05, "logits/chosen": -88.0, "logits/rejected": -83.5, "logps/chosen": -652.0, "logps/rejected": -588.0, "loss": 0.6801, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.15625, "rewards/margins": 1.1015625, "rewards/rejected": 1.046875, "step": 3720 }, { "epoch": 0.9862506610259122, "grad_norm": 61.0, "learning_rate": 5.0687466948704385e-05, "logits/chosen": -88.5, "logits/rejected": -82.5, "logps/chosen": -628.0, "logps/rejected": -512.0, "loss": 0.6908, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.25, "rewards/margins": 1.140625, "rewards/rejected": 1.109375, "step": 3730 }, { "epoch": 0.9888947646747752, "grad_norm": 25.5, "learning_rate": 5.055526176626124e-05, "logits/chosen": -86.0, "logits/rejected": -83.0, "logps/chosen": -616.0, "logps/rejected": -500.0, "loss": 0.6221, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 2.796875, "rewards/margins": 1.1484375, "rewards/rejected": 1.640625, "step": 3740 }, { "epoch": 0.9915388683236382, "grad_norm": 45.5, "learning_rate": 5.0423056583818084e-05, "logits/chosen": -86.0, "logits/rejected": -83.5, "logps/chosen": -584.0, "logps/rejected": -480.0, "loss": 0.602, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 3.015625, "rewards/margins": 1.25, "rewards/rejected": 1.7734375, "step": 3750 }, { "epoch": 0.9941829719725013, "grad_norm": 37.75, "learning_rate": 5.029085140137494e-05, "logits/chosen": -85.0, "logits/rejected": -83.0, "logps/chosen": -572.0, "logps/rejected": -490.0, "loss": 0.6225, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 2.609375, "rewards/margins": 1.15625, "rewards/rejected": 1.4453125, "step": 3760 }, { "epoch": 0.9968270756213644, "grad_norm": 60.5, "learning_rate": 5.015864621893178e-05, "logits/chosen": -86.0, "logits/rejected": -86.0, "logps/chosen": -536.0, "logps/rejected": -472.0, "loss": 0.6607, "rewards/accuracies": 0.6875, "rewards/chosen": 2.296875, "rewards/margins": 1.0703125, "rewards/rejected": 1.2421875, "step": 3770 }, { "epoch": 0.9994711792702274, "grad_norm": 74.5, "learning_rate": 5.002644103648864e-05, "logits/chosen": -87.5, "logits/rejected": -84.5, "logps/chosen": -600.0, "logps/rejected": -512.0, "loss": 0.7043, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 2.171875, "rewards/margins": 0.97265625, "rewards/rejected": 1.1953125, "step": 3780 }, { "epoch": 1.0, "eval_logits/chosen": -86.5, "eval_logits/rejected": -84.0, "eval_logps/chosen": -580.0, "eval_logps/rejected": -492.0, "eval_loss": 0.612174928188324, "eval_rewards/accuracies": 0.684020459651947, "eval_rewards/chosen": 2.21875, "eval_rewards/margins": 1.1953125, "eval_rewards/rejected": 1.015625, "eval_runtime": 998.8184, "eval_samples_per_second": 15.144, "eval_steps_per_second": 0.947, "step": 3782 }, { "epoch": 1.0021152829190905, "grad_norm": 33.5, "learning_rate": 4.989423585404548e-05, "logits/chosen": -87.0, "logits/rejected": -83.5, "logps/chosen": -556.0, "logps/rejected": -490.0, "loss": 0.5213, "rewards/accuracies": 0.7987500429153442, "rewards/chosen": 2.609375, "rewards/margins": 1.6640625, "rewards/rejected": 0.94921875, "step": 3790 }, { "epoch": 1.0047593865679534, "grad_norm": 15.75, "learning_rate": 4.9762030671602325e-05, "logits/chosen": -86.0, "logits/rejected": -85.0, "logps/chosen": -616.0, "logps/rejected": -506.0, "loss": 0.2681, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 3.125, "rewards/margins": 2.359375, "rewards/rejected": 0.77734375, "step": 3800 }, { "epoch": 1.0074034902168165, "grad_norm": 29.5, "learning_rate": 4.9629825489159174e-05, "logits/chosen": -86.5, "logits/rejected": -84.0, "logps/chosen": -584.0, "logps/rejected": -516.0, "loss": 0.3112, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 3.109375, "rewards/margins": 1.953125, "rewards/rejected": 1.15625, "step": 3810 }, { "epoch": 1.0100475938656794, "grad_norm": 29.875, "learning_rate": 4.9497620306716024e-05, "logits/chosen": -87.0, "logits/rejected": -82.5, "logps/chosen": -620.0, "logps/rejected": -476.0, "loss": 0.3146, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 3.09375, "rewards/margins": 2.015625, "rewards/rejected": 1.078125, "step": 3820 }, { "epoch": 1.0126916975145426, "grad_norm": 26.25, "learning_rate": 4.9365415124272874e-05, "logits/chosen": -86.0, "logits/rejected": -85.5, "logps/chosen": -556.0, "logps/rejected": -532.0, "loss": 0.3546, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.921875, "rewards/margins": 1.6875, "rewards/rejected": 1.234375, "step": 3830 }, { "epoch": 1.0153358011634057, "grad_norm": 15.25, "learning_rate": 4.923320994182972e-05, "logits/chosen": -86.5, "logits/rejected": -84.0, "logps/chosen": -612.0, "logps/rejected": -490.0, "loss": 0.3583, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.8125, "rewards/margins": 1.921875, "rewards/rejected": 0.890625, "step": 3840 }, { "epoch": 1.0179799048122686, "grad_norm": 16.625, "learning_rate": 4.910100475938657e-05, "logits/chosen": -87.5, "logits/rejected": -85.0, "logps/chosen": -592.0, "logps/rejected": -492.0, "loss": 0.2906, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.015625, "rewards/margins": 1.9921875, "rewards/rejected": 1.0234375, "step": 3850 }, { "epoch": 1.0206240084611318, "grad_norm": 19.625, "learning_rate": 4.896879957694342e-05, "logits/chosen": -86.5, "logits/rejected": -85.5, "logps/chosen": -604.0, "logps/rejected": -536.0, "loss": 0.3443, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6875, "rewards/margins": 1.90625, "rewards/rejected": 0.77734375, "step": 3860 }, { "epoch": 1.0232681121099947, "grad_norm": 33.25, "learning_rate": 4.8836594394500265e-05, "logits/chosen": -89.5, "logits/rejected": -88.0, "logps/chosen": -548.0, "logps/rejected": -490.0, "loss": 0.3368, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.65625, "rewards/margins": 1.96875, "rewards/rejected": 0.68359375, "step": 3870 }, { "epoch": 1.0259122157588578, "grad_norm": 24.0, "learning_rate": 4.8704389212057115e-05, "logits/chosen": -86.5, "logits/rejected": -86.5, "logps/chosen": -556.0, "logps/rejected": -502.0, "loss": 0.3222, "rewards/accuracies": 0.84375, "rewards/chosen": 2.21875, "rewards/margins": 1.921875, "rewards/rejected": 0.3046875, "step": 3880 }, { "epoch": 1.0285563194077207, "grad_norm": 18.25, "learning_rate": 4.8572184029613964e-05, "logits/chosen": -89.0, "logits/rejected": -86.0, "logps/chosen": -604.0, "logps/rejected": -516.0, "loss": 0.3258, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.796875, "rewards/margins": 2.1875, "rewards/rejected": 0.609375, "step": 3890 }, { "epoch": 1.0312004230565839, "grad_norm": 37.75, "learning_rate": 4.8439978847170814e-05, "logits/chosen": -85.0, "logits/rejected": -84.0, "logps/chosen": -544.0, "logps/rejected": -462.0, "loss": 0.3305, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.640625, "rewards/margins": 1.8828125, "rewards/rejected": 0.75390625, "step": 3900 }, { "epoch": 1.0338445267054468, "grad_norm": 25.625, "learning_rate": 4.830777366472766e-05, "logits/chosen": -88.0, "logits/rejected": -86.5, "logps/chosen": -604.0, "logps/rejected": -500.0, "loss": 0.2902, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.984375, "rewards/margins": 2.21875, "rewards/rejected": 0.76953125, "step": 3910 }, { "epoch": 1.03648863035431, "grad_norm": 28.25, "learning_rate": 4.8175568482284506e-05, "logits/chosen": -88.0, "logits/rejected": -85.0, "logps/chosen": -628.0, "logps/rejected": -492.0, "loss": 0.3656, "rewards/accuracies": 0.84375, "rewards/chosen": 2.921875, "rewards/margins": 1.9140625, "rewards/rejected": 1.015625, "step": 3920 }, { "epoch": 1.0391327340031729, "grad_norm": 20.5, "learning_rate": 4.8043363299841356e-05, "logits/chosen": -85.5, "logits/rejected": -84.0, "logps/chosen": -552.0, "logps/rejected": -524.0, "loss": 0.3505, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 3.109375, "rewards/margins": 1.96875, "rewards/rejected": 1.1328125, "step": 3930 }, { "epoch": 1.041776837652036, "grad_norm": 26.75, "learning_rate": 4.7911158117398205e-05, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -560.0, "logps/rejected": -504.0, "loss": 0.3114, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.65625, "rewards/margins": 1.9375, "rewards/rejected": 0.72265625, "step": 3940 }, { "epoch": 1.044420941300899, "grad_norm": 28.25, "learning_rate": 4.777895293495505e-05, "logits/chosen": -89.5, "logits/rejected": -84.0, "logps/chosen": -564.0, "logps/rejected": -456.0, "loss": 0.3361, "rewards/accuracies": 0.84375, "rewards/chosen": 2.78125, "rewards/margins": 2.15625, "rewards/rejected": 0.6171875, "step": 3950 }, { "epoch": 1.047065044949762, "grad_norm": 63.0, "learning_rate": 4.76467477525119e-05, "logits/chosen": -88.0, "logits/rejected": -85.0, "logps/chosen": -572.0, "logps/rejected": -488.0, "loss": 0.3551, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.78125, "rewards/margins": 2.015625, "rewards/rejected": 0.765625, "step": 3960 }, { "epoch": 1.049709148598625, "grad_norm": 19.875, "learning_rate": 4.751454257006875e-05, "logits/chosen": -87.5, "logits/rejected": -85.5, "logps/chosen": -568.0, "logps/rejected": -482.0, "loss": 0.3555, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.796875, "rewards/margins": 1.8203125, "rewards/rejected": 0.98046875, "step": 3970 }, { "epoch": 1.052353252247488, "grad_norm": 24.25, "learning_rate": 4.73823373876256e-05, "logits/chosen": -87.5, "logits/rejected": -84.5, "logps/chosen": -560.0, "logps/rejected": -470.0, "loss": 0.3706, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 3.0, "rewards/margins": 2.0, "rewards/rejected": 0.99609375, "step": 3980 }, { "epoch": 1.0549973558963512, "grad_norm": 29.25, "learning_rate": 4.725013220518245e-05, "logits/chosen": -85.0, "logits/rejected": -83.5, "logps/chosen": -532.0, "logps/rejected": -496.0, "loss": 0.342, "rewards/accuracies": 0.84375, "rewards/chosen": 2.671875, "rewards/margins": 1.9921875, "rewards/rejected": 0.6796875, "step": 3990 }, { "epoch": 1.0576414595452142, "grad_norm": 30.25, "learning_rate": 4.7117927022739296e-05, "logits/chosen": -85.0, "logits/rejected": -83.0, "logps/chosen": -568.0, "logps/rejected": -488.0, "loss": 0.3474, "rewards/accuracies": 0.84375, "rewards/chosen": 2.375, "rewards/margins": 2.015625, "rewards/rejected": 0.369140625, "step": 4000 }, { "epoch": 1.0602855631940773, "grad_norm": 24.125, "learning_rate": 4.6985721840296146e-05, "logits/chosen": -90.0, "logits/rejected": -85.5, "logps/chosen": -616.0, "logps/rejected": -520.0, "loss": 0.3441, "rewards/accuracies": 0.84375, "rewards/chosen": 2.734375, "rewards/margins": 2.0, "rewards/rejected": 0.73046875, "step": 4010 }, { "epoch": 1.0629296668429402, "grad_norm": 22.25, "learning_rate": 4.685351665785299e-05, "logits/chosen": -89.5, "logits/rejected": -85.0, "logps/chosen": -672.0, "logps/rejected": -544.0, "loss": 0.3326, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 3.1875, "rewards/margins": 2.203125, "rewards/rejected": 0.98046875, "step": 4020 }, { "epoch": 1.0655737704918034, "grad_norm": 20.875, "learning_rate": 4.672131147540984e-05, "logits/chosen": -88.5, "logits/rejected": -84.5, "logps/chosen": -584.0, "logps/rejected": -468.0, "loss": 0.3149, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.75, "rewards/margins": 1.9453125, "rewards/rejected": 0.80078125, "step": 4030 }, { "epoch": 1.0682178741406663, "grad_norm": 24.875, "learning_rate": 4.658910629296669e-05, "logits/chosen": -87.0, "logits/rejected": -83.5, "logps/chosen": -592.0, "logps/rejected": -540.0, "loss": 0.2991, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 3.09375, "rewards/margins": 2.140625, "rewards/rejected": 0.953125, "step": 4040 }, { "epoch": 1.0708619777895294, "grad_norm": 28.125, "learning_rate": 4.645690111052354e-05, "logits/chosen": -86.0, "logits/rejected": -83.5, "logps/chosen": -524.0, "logps/rejected": -496.0, "loss": 0.4026, "rewards/accuracies": 0.78125, "rewards/chosen": 2.609375, "rewards/margins": 1.875, "rewards/rejected": 0.73828125, "step": 4050 }, { "epoch": 1.0735060814383923, "grad_norm": 23.0, "learning_rate": 4.632469592808038e-05, "logits/chosen": -86.5, "logits/rejected": -86.5, "logps/chosen": -548.0, "logps/rejected": -512.0, "loss": 0.3318, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.796875, "rewards/margins": 2.0625, "rewards/rejected": 0.73046875, "step": 4060 }, { "epoch": 1.0761501850872555, "grad_norm": 17.375, "learning_rate": 4.619249074563723e-05, "logits/chosen": -85.5, "logits/rejected": -84.0, "logps/chosen": -568.0, "logps/rejected": -512.0, "loss": 0.3855, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.40625, "rewards/margins": 2.03125, "rewards/rejected": 0.37890625, "step": 4070 }, { "epoch": 1.0787942887361184, "grad_norm": 18.625, "learning_rate": 4.606028556319408e-05, "logits/chosen": -89.0, "logits/rejected": -86.0, "logps/chosen": -604.0, "logps/rejected": -536.0, "loss": 0.2951, "rewards/accuracies": 0.875, "rewards/chosen": 2.875, "rewards/margins": 2.265625, "rewards/rejected": 0.60546875, "step": 4080 }, { "epoch": 1.0814383923849815, "grad_norm": 31.75, "learning_rate": 4.592808038075093e-05, "logits/chosen": -85.5, "logits/rejected": -84.0, "logps/chosen": -548.0, "logps/rejected": -448.0, "loss": 0.3388, "rewards/accuracies": 0.84375, "rewards/chosen": 2.265625, "rewards/margins": 2.0625, "rewards/rejected": 0.1982421875, "step": 4090 }, { "epoch": 1.0840824960338444, "grad_norm": 19.0, "learning_rate": 4.579587519830777e-05, "logits/chosen": -87.5, "logits/rejected": -87.0, "logps/chosen": -608.0, "logps/rejected": -506.0, "loss": 0.3443, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.8125, "rewards/margins": 1.90625, "rewards/rejected": 0.90625, "step": 4100 }, { "epoch": 1.0867265996827076, "grad_norm": 29.625, "learning_rate": 4.566367001586462e-05, "logits/chosen": -85.5, "logits/rejected": -83.0, "logps/chosen": -532.0, "logps/rejected": -428.0, "loss": 0.3308, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.578125, "rewards/margins": 1.9375, "rewards/rejected": 0.64453125, "step": 4110 }, { "epoch": 1.0893707033315705, "grad_norm": 15.8125, "learning_rate": 4.553146483342147e-05, "logits/chosen": -87.0, "logits/rejected": -84.0, "logps/chosen": -524.0, "logps/rejected": -464.0, "loss": 0.2884, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.484375, "rewards/margins": 2.078125, "rewards/rejected": 0.4140625, "step": 4120 }, { "epoch": 1.0920148069804336, "grad_norm": 40.25, "learning_rate": 4.539925965097832e-05, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -588.0, "logps/rejected": -500.0, "loss": 0.3153, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.515625, "rewards/margins": 2.3125, "rewards/rejected": 1.203125, "step": 4130 }, { "epoch": 1.0946589106292968, "grad_norm": 42.75, "learning_rate": 4.526705446853517e-05, "logits/chosen": -88.0, "logits/rejected": -85.5, "logps/chosen": -624.0, "logps/rejected": -524.0, "loss": 0.3392, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 3.359375, "rewards/margins": 2.09375, "rewards/rejected": 1.265625, "step": 4140 }, { "epoch": 1.0973030142781597, "grad_norm": 34.5, "learning_rate": 4.513484928609202e-05, "logits/chosen": -85.5, "logits/rejected": -85.5, "logps/chosen": -506.0, "logps/rejected": -488.0, "loss": 0.38, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.515625, "rewards/margins": 1.7890625, "rewards/rejected": 0.72265625, "step": 4150 }, { "epoch": 1.0999471179270228, "grad_norm": 18.75, "learning_rate": 4.500264410364887e-05, "logits/chosen": -89.5, "logits/rejected": -86.5, "logps/chosen": -584.0, "logps/rejected": -508.0, "loss": 0.281, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.78125, "rewards/margins": 2.078125, "rewards/rejected": 0.7109375, "step": 4160 }, { "epoch": 1.1025912215758857, "grad_norm": 25.625, "learning_rate": 4.487043892120571e-05, "logits/chosen": -88.0, "logits/rejected": -86.5, "logps/chosen": -556.0, "logps/rejected": -528.0, "loss": 0.3119, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.84375, "rewards/margins": 2.21875, "rewards/rejected": 0.62890625, "step": 4170 }, { "epoch": 1.1052353252247489, "grad_norm": 20.25, "learning_rate": 4.473823373876256e-05, "logits/chosen": -87.0, "logits/rejected": -87.5, "logps/chosen": -544.0, "logps/rejected": -508.0, "loss": 0.3232, "rewards/accuracies": 0.84375, "rewards/chosen": 2.375, "rewards/margins": 2.0625, "rewards/rejected": 0.322265625, "step": 4180 }, { "epoch": 1.1078794288736118, "grad_norm": 31.0, "learning_rate": 4.460602855631941e-05, "logits/chosen": -90.5, "logits/rejected": -86.0, "logps/chosen": -624.0, "logps/rejected": -512.0, "loss": 0.3279, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.46875, "rewards/margins": 2.234375, "rewards/rejected": 0.2255859375, "step": 4190 }, { "epoch": 1.110523532522475, "grad_norm": 48.25, "learning_rate": 4.4473823373876254e-05, "logits/chosen": -88.0, "logits/rejected": -85.5, "logps/chosen": -576.0, "logps/rejected": -484.0, "loss": 0.3224, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.515625, "rewards/margins": 2.03125, "rewards/rejected": 0.484375, "step": 4200 }, { "epoch": 1.1131676361713378, "grad_norm": 21.875, "learning_rate": 4.4341618191433104e-05, "logits/chosen": -85.5, "logits/rejected": -84.5, "logps/chosen": -552.0, "logps/rejected": -492.0, "loss": 0.3182, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.46875, "rewards/margins": 2.0625, "rewards/rejected": 0.40625, "step": 4210 }, { "epoch": 1.115811739820201, "grad_norm": 10.625, "learning_rate": 4.420941300898995e-05, "logits/chosen": -88.0, "logits/rejected": -86.0, "logps/chosen": -608.0, "logps/rejected": -532.0, "loss": 0.3187, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.828125, "rewards/margins": 2.3125, "rewards/rejected": 0.51953125, "step": 4220 }, { "epoch": 1.118455843469064, "grad_norm": 28.375, "learning_rate": 4.40772078265468e-05, "logits/chosen": -88.5, "logits/rejected": -85.0, "logps/chosen": -584.0, "logps/rejected": -486.0, "loss": 0.2781, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.921875, "rewards/margins": 2.3125, "rewards/rejected": 0.609375, "step": 4230 }, { "epoch": 1.121099947117927, "grad_norm": 31.75, "learning_rate": 4.394500264410365e-05, "logits/chosen": -87.5, "logits/rejected": -84.0, "logps/chosen": -568.0, "logps/rejected": -468.0, "loss": 0.3075, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.6875, "rewards/margins": 1.984375, "rewards/rejected": 0.6953125, "step": 4240 }, { "epoch": 1.12374405076679, "grad_norm": 38.5, "learning_rate": 4.38127974616605e-05, "logits/chosen": -86.0, "logits/rejected": -84.0, "logps/chosen": -568.0, "logps/rejected": -468.0, "loss": 0.3052, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.078125, "rewards/margins": 2.078125, "rewards/rejected": 1.0, "step": 4250 }, { "epoch": 1.126388154415653, "grad_norm": 28.5, "learning_rate": 4.3680592279217345e-05, "logits/chosen": -86.0, "logits/rejected": -83.0, "logps/chosen": -532.0, "logps/rejected": -460.0, "loss": 0.3481, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.734375, "rewards/margins": 1.8203125, "rewards/rejected": 0.9140625, "step": 4260 }, { "epoch": 1.129032258064516, "grad_norm": 34.25, "learning_rate": 4.3548387096774194e-05, "logits/chosen": -86.0, "logits/rejected": -84.5, "logps/chosen": -544.0, "logps/rejected": -454.0, "loss": 0.3751, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.859375, "rewards/margins": 2.15625, "rewards/rejected": 0.703125, "step": 4270 }, { "epoch": 1.1316763617133792, "grad_norm": 27.125, "learning_rate": 4.3416181914331044e-05, "logits/chosen": -86.0, "logits/rejected": -84.0, "logps/chosen": -548.0, "logps/rejected": -474.0, "loss": 0.333, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.796875, "rewards/margins": 1.921875, "rewards/rejected": 0.87109375, "step": 4280 }, { "epoch": 1.1343204653622423, "grad_norm": 15.625, "learning_rate": 4.3283976731887893e-05, "logits/chosen": -86.5, "logits/rejected": -84.0, "logps/chosen": -588.0, "logps/rejected": -524.0, "loss": 0.3548, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.375, "rewards/margins": 1.765625, "rewards/rejected": 0.609375, "step": 4290 }, { "epoch": 1.1369645690111052, "grad_norm": 36.25, "learning_rate": 4.315177154944474e-05, "logits/chosen": -86.0, "logits/rejected": -86.0, "logps/chosen": -588.0, "logps/rejected": -520.0, "loss": 0.2956, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.390625, "rewards/margins": 2.078125, "rewards/rejected": 0.30859375, "step": 4300 }, { "epoch": 1.1396086726599683, "grad_norm": 29.125, "learning_rate": 4.301956636700159e-05, "logits/chosen": -89.0, "logits/rejected": -84.5, "logps/chosen": -608.0, "logps/rejected": -506.0, "loss": 0.2826, "rewards/accuracies": 0.875, "rewards/chosen": 2.796875, "rewards/margins": 2.171875, "rewards/rejected": 0.625, "step": 4310 }, { "epoch": 1.1422527763088313, "grad_norm": 13.0, "learning_rate": 4.288736118455844e-05, "logits/chosen": -89.0, "logits/rejected": -85.0, "logps/chosen": -588.0, "logps/rejected": -520.0, "loss": 0.3394, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.75, "rewards/margins": 1.9921875, "rewards/rejected": 0.765625, "step": 4320 }, { "epoch": 1.1448968799576944, "grad_norm": 25.375, "learning_rate": 4.2755156002115285e-05, "logits/chosen": -88.0, "logits/rejected": -86.0, "logps/chosen": -564.0, "logps/rejected": -524.0, "loss": 0.2943, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.734375, "rewards/margins": 2.296875, "rewards/rejected": 0.4296875, "step": 4330 }, { "epoch": 1.1475409836065573, "grad_norm": 25.0, "learning_rate": 4.262295081967213e-05, "logits/chosen": -89.0, "logits/rejected": -85.5, "logps/chosen": -612.0, "logps/rejected": -536.0, "loss": 0.3267, "rewards/accuracies": 0.84375, "rewards/chosen": 2.671875, "rewards/margins": 2.265625, "rewards/rejected": 0.419921875, "step": 4340 }, { "epoch": 1.1501850872554205, "grad_norm": 15.3125, "learning_rate": 4.249074563722898e-05, "logits/chosen": -89.0, "logits/rejected": -86.0, "logps/chosen": -572.0, "logps/rejected": -488.0, "loss": 0.2862, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.6875, "rewards/margins": 2.015625, "rewards/rejected": 0.671875, "step": 4350 }, { "epoch": 1.1528291909042834, "grad_norm": 25.625, "learning_rate": 4.235854045478583e-05, "logits/chosen": -86.5, "logits/rejected": -83.5, "logps/chosen": -588.0, "logps/rejected": -484.0, "loss": 0.3504, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.6875, "rewards/margins": 1.96875, "rewards/rejected": 0.7265625, "step": 4360 }, { "epoch": 1.1554732945531465, "grad_norm": 13.875, "learning_rate": 4.2226335272342677e-05, "logits/chosen": -88.0, "logits/rejected": -87.0, "logps/chosen": -612.0, "logps/rejected": -560.0, "loss": 0.2594, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.90625, "rewards/margins": 2.359375, "rewards/rejected": 0.55078125, "step": 4370 }, { "epoch": 1.1581173982020094, "grad_norm": 19.5, "learning_rate": 4.2094130089899526e-05, "logits/chosen": -86.0, "logits/rejected": -85.0, "logps/chosen": -524.0, "logps/rejected": -476.0, "loss": 0.3126, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.4375, "rewards/margins": 2.1875, "rewards/rejected": 0.23828125, "step": 4380 }, { "epoch": 1.1607615018508726, "grad_norm": 51.25, "learning_rate": 4.1961924907456376e-05, "logits/chosen": -86.0, "logits/rejected": -83.5, "logps/chosen": -580.0, "logps/rejected": -516.0, "loss": 0.4182, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 2.109375, "rewards/margins": 2.03125, "rewards/rejected": 0.0732421875, "step": 4390 }, { "epoch": 1.1634056054997357, "grad_norm": 26.375, "learning_rate": 4.1829719725013225e-05, "logits/chosen": -87.0, "logits/rejected": -84.5, "logps/chosen": -632.0, "logps/rejected": -498.0, "loss": 0.276, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.84375, "rewards/margins": 2.328125, "rewards/rejected": 0.5, "step": 4400 }, { "epoch": 1.1660497091485986, "grad_norm": 24.5, "learning_rate": 4.169751454257007e-05, "logits/chosen": -89.0, "logits/rejected": -85.5, "logps/chosen": -548.0, "logps/rejected": -468.0, "loss": 0.3302, "rewards/accuracies": 0.84375, "rewards/chosen": 2.671875, "rewards/margins": 2.03125, "rewards/rejected": 0.6328125, "step": 4410 }, { "epoch": 1.1686938127974615, "grad_norm": 20.875, "learning_rate": 4.156530936012692e-05, "logits/chosen": -88.0, "logits/rejected": -86.5, "logps/chosen": -540.0, "logps/rejected": -492.0, "loss": 0.3609, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.828125, "rewards/margins": 1.78125, "rewards/rejected": 1.046875, "step": 4420 }, { "epoch": 1.1713379164463247, "grad_norm": 36.25, "learning_rate": 4.143310417768377e-05, "logits/chosen": -89.5, "logits/rejected": -88.5, "logps/chosen": -608.0, "logps/rejected": -544.0, "loss": 0.3174, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.0, "rewards/margins": 2.125, "rewards/rejected": 0.87109375, "step": 4430 }, { "epoch": 1.1739820200951878, "grad_norm": 44.75, "learning_rate": 4.130089899524062e-05, "logits/chosen": -88.0, "logits/rejected": -85.5, "logps/chosen": -580.0, "logps/rejected": -512.0, "loss": 0.3354, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 2.53125, "rewards/margins": 1.953125, "rewards/rejected": 0.578125, "step": 4440 }, { "epoch": 1.1766261237440507, "grad_norm": 28.25, "learning_rate": 4.1168693812797467e-05, "logits/chosen": -89.5, "logits/rejected": -85.5, "logps/chosen": -560.0, "logps/rejected": -484.0, "loss": 0.2894, "rewards/accuracies": 0.875, "rewards/chosen": 2.5, "rewards/margins": 2.0, "rewards/rejected": 0.5, "step": 4450 }, { "epoch": 1.1792702273929139, "grad_norm": 20.75, "learning_rate": 4.1036488630354316e-05, "logits/chosen": -87.5, "logits/rejected": -85.0, "logps/chosen": -552.0, "logps/rejected": -474.0, "loss": 0.2568, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.453125, "rewards/margins": 2.265625, "rewards/rejected": 0.185546875, "step": 4460 }, { "epoch": 1.1819143310417768, "grad_norm": 25.625, "learning_rate": 4.0904283447911166e-05, "logits/chosen": -87.5, "logits/rejected": -86.0, "logps/chosen": -616.0, "logps/rejected": -532.0, "loss": 0.3013, "rewards/accuracies": 0.84375, "rewards/chosen": 2.609375, "rewards/margins": 2.265625, "rewards/rejected": 0.33984375, "step": 4470 }, { "epoch": 1.18455843469064, "grad_norm": 26.375, "learning_rate": 4.077207826546801e-05, "logits/chosen": -89.0, "logits/rejected": -86.0, "logps/chosen": -560.0, "logps/rejected": -494.0, "loss": 0.2976, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.34375, "rewards/margins": 1.9140625, "rewards/rejected": 0.43359375, "step": 4480 }, { "epoch": 1.1872025383395028, "grad_norm": 33.75, "learning_rate": 4.063987308302485e-05, "logits/chosen": -85.0, "logits/rejected": -83.0, "logps/chosen": -564.0, "logps/rejected": -492.0, "loss": 0.3923, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.21875, "rewards/margins": 1.6875, "rewards/rejected": 0.5390625, "step": 4490 }, { "epoch": 1.189846641988366, "grad_norm": 22.875, "learning_rate": 4.05076679005817e-05, "logits/chosen": -89.0, "logits/rejected": -87.0, "logps/chosen": -600.0, "logps/rejected": -556.0, "loss": 0.2481, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.75, "rewards/margins": 2.21875, "rewards/rejected": 0.515625, "step": 4500 }, { "epoch": 1.192490745637229, "grad_norm": 26.0, "learning_rate": 4.037546271813855e-05, "logits/chosen": -88.0, "logits/rejected": -87.0, "logps/chosen": -616.0, "logps/rejected": -520.0, "loss": 0.2901, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.671875, "rewards/margins": 2.171875, "rewards/rejected": 0.5078125, "step": 4510 }, { "epoch": 1.195134849286092, "grad_norm": 55.5, "learning_rate": 4.02432575356954e-05, "logits/chosen": -87.0, "logits/rejected": -84.5, "logps/chosen": -548.0, "logps/rejected": -490.0, "loss": 0.3753, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 2.28125, "rewards/margins": 1.7734375, "rewards/rejected": 0.51171875, "step": 4520 }, { "epoch": 1.197778952934955, "grad_norm": 23.875, "learning_rate": 4.011105235325225e-05, "logits/chosen": -88.0, "logits/rejected": -85.0, "logps/chosen": -612.0, "logps/rejected": -520.0, "loss": 0.3033, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.578125, "rewards/margins": 2.125, "rewards/rejected": 0.4453125, "step": 4530 }, { "epoch": 1.200423056583818, "grad_norm": 26.375, "learning_rate": 3.99788471708091e-05, "logits/chosen": -89.5, "logits/rejected": -85.5, "logps/chosen": -568.0, "logps/rejected": -470.0, "loss": 0.3395, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.8125, "rewards/margins": 1.90625, "rewards/rejected": 0.91015625, "step": 4540 }, { "epoch": 1.2030671602326812, "grad_norm": 31.25, "learning_rate": 3.984664198836595e-05, "logits/chosen": -87.5, "logits/rejected": -86.0, "logps/chosen": -588.0, "logps/rejected": -504.0, "loss": 0.326, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 2.859375, "rewards/margins": 1.9609375, "rewards/rejected": 0.90625, "step": 4550 }, { "epoch": 1.2057112638815441, "grad_norm": 24.375, "learning_rate": 3.971443680592279e-05, "logits/chosen": -87.0, "logits/rejected": -85.0, "logps/chosen": -560.0, "logps/rejected": -516.0, "loss": 0.3114, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.734375, "rewards/margins": 2.4375, "rewards/rejected": 0.298828125, "step": 4560 }, { "epoch": 1.2083553675304073, "grad_norm": 36.0, "learning_rate": 3.958223162347964e-05, "logits/chosen": -89.5, "logits/rejected": -88.0, "logps/chosen": -624.0, "logps/rejected": -540.0, "loss": 0.3071, "rewards/accuracies": 0.84375, "rewards/chosen": 2.84375, "rewards/margins": 2.21875, "rewards/rejected": 0.625, "step": 4570 }, { "epoch": 1.2109994711792702, "grad_norm": 35.5, "learning_rate": 3.945002644103649e-05, "logits/chosen": -87.5, "logits/rejected": -84.5, "logps/chosen": -572.0, "logps/rejected": -496.0, "loss": 0.3713, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.34375, "rewards/margins": 1.890625, "rewards/rejected": 0.455078125, "step": 4580 }, { "epoch": 1.2136435748281333, "grad_norm": 40.5, "learning_rate": 3.931782125859334e-05, "logits/chosen": -86.5, "logits/rejected": -85.0, "logps/chosen": -520.0, "logps/rejected": -460.0, "loss": 0.3659, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.3125, "rewards/margins": 1.8984375, "rewards/rejected": 0.419921875, "step": 4590 }, { "epoch": 1.2162876784769963, "grad_norm": 46.75, "learning_rate": 3.918561607615019e-05, "logits/chosen": -87.0, "logits/rejected": -85.5, "logps/chosen": -588.0, "logps/rejected": -516.0, "loss": 0.3559, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.609375, "rewards/margins": 1.8671875, "rewards/rejected": 0.74609375, "step": 4600 }, { "epoch": 1.2189317821258594, "grad_norm": 33.25, "learning_rate": 3.905341089370704e-05, "logits/chosen": -85.0, "logits/rejected": -83.0, "logps/chosen": -592.0, "logps/rejected": -506.0, "loss": 0.3441, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.765625, "rewards/margins": 2.1875, "rewards/rejected": 0.578125, "step": 4610 }, { "epoch": 1.2215758857747223, "grad_norm": 27.75, "learning_rate": 3.892120571126389e-05, "logits/chosen": -87.0, "logits/rejected": -84.0, "logps/chosen": -544.0, "logps/rejected": -496.0, "loss": 0.339, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.59375, "rewards/margins": 1.9921875, "rewards/rejected": 0.609375, "step": 4620 }, { "epoch": 1.2242199894235855, "grad_norm": 35.0, "learning_rate": 3.878900052882073e-05, "logits/chosen": -86.0, "logits/rejected": -84.0, "logps/chosen": -540.0, "logps/rejected": -512.0, "loss": 0.3467, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.515625, "rewards/margins": 2.0, "rewards/rejected": 0.5234375, "step": 4630 }, { "epoch": 1.2268640930724484, "grad_norm": 18.75, "learning_rate": 3.8656795346377575e-05, "logits/chosen": -85.0, "logits/rejected": -85.0, "logps/chosen": -564.0, "logps/rejected": -520.0, "loss": 0.3415, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.515625, "rewards/margins": 1.8671875, "rewards/rejected": 0.65234375, "step": 4640 }, { "epoch": 1.2295081967213115, "grad_norm": 10.375, "learning_rate": 3.8524590163934424e-05, "logits/chosen": -90.0, "logits/rejected": -88.0, "logps/chosen": -644.0, "logps/rejected": -532.0, "loss": 0.2974, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.859375, "rewards/margins": 2.28125, "rewards/rejected": 0.57421875, "step": 4650 }, { "epoch": 1.2321523003701744, "grad_norm": 23.625, "learning_rate": 3.8392384981491274e-05, "logits/chosen": -86.0, "logits/rejected": -85.5, "logps/chosen": -568.0, "logps/rejected": -524.0, "loss": 0.262, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.5, "rewards/margins": 2.171875, "rewards/rejected": 0.32421875, "step": 4660 }, { "epoch": 1.2347964040190376, "grad_norm": 35.5, "learning_rate": 3.8260179799048123e-05, "logits/chosen": -85.5, "logits/rejected": -84.5, "logps/chosen": -576.0, "logps/rejected": -516.0, "loss": 0.3228, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.59375, "rewards/margins": 2.09375, "rewards/rejected": 0.490234375, "step": 4670 }, { "epoch": 1.2374405076679005, "grad_norm": 38.75, "learning_rate": 3.812797461660497e-05, "logits/chosen": -86.5, "logits/rejected": -85.0, "logps/chosen": -564.0, "logps/rejected": -510.0, "loss": 0.3164, "rewards/accuracies": 0.84375, "rewards/chosen": 2.625, "rewards/margins": 2.125, "rewards/rejected": 0.486328125, "step": 4680 }, { "epoch": 1.2400846113167636, "grad_norm": 35.25, "learning_rate": 3.799576943416182e-05, "logits/chosen": -85.5, "logits/rejected": -85.0, "logps/chosen": -536.0, "logps/rejected": -460.0, "loss": 0.3227, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.265625, "rewards/margins": 1.828125, "rewards/rejected": 0.439453125, "step": 4690 }, { "epoch": 1.2427287149656268, "grad_norm": 23.875, "learning_rate": 3.786356425171867e-05, "logits/chosen": -87.5, "logits/rejected": -83.5, "logps/chosen": -576.0, "logps/rejected": -464.0, "loss": 0.3337, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.109375, "rewards/margins": 1.9921875, "rewards/rejected": 0.126953125, "step": 4700 }, { "epoch": 1.2453728186144897, "grad_norm": 16.75, "learning_rate": 3.7731359069275515e-05, "logits/chosen": -87.5, "logits/rejected": -85.0, "logps/chosen": -512.0, "logps/rejected": -462.0, "loss": 0.3971, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.359375, "rewards/margins": 1.7265625, "rewards/rejected": 0.63671875, "step": 4710 }, { "epoch": 1.2480169222633528, "grad_norm": 31.125, "learning_rate": 3.7599153886832365e-05, "logits/chosen": -87.0, "logits/rejected": -84.5, "logps/chosen": -568.0, "logps/rejected": -482.0, "loss": 0.2628, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.75, "rewards/margins": 2.4375, "rewards/rejected": 0.314453125, "step": 4720 }, { "epoch": 1.2506610259122157, "grad_norm": 29.125, "learning_rate": 3.7466948704389214e-05, "logits/chosen": -89.5, "logits/rejected": -90.0, "logps/chosen": -612.0, "logps/rejected": -520.0, "loss": 0.2852, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.984375, "rewards/margins": 2.15625, "rewards/rejected": 0.8359375, "step": 4730 }, { "epoch": 1.2533051295610789, "grad_norm": 27.125, "learning_rate": 3.7334743521946064e-05, "logits/chosen": -88.0, "logits/rejected": -87.5, "logps/chosen": -568.0, "logps/rejected": -480.0, "loss": 0.3278, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.8125, "rewards/margins": 2.15625, "rewards/rejected": 0.6484375, "step": 4740 }, { "epoch": 1.2559492332099418, "grad_norm": 29.25, "learning_rate": 3.720253833950291e-05, "logits/chosen": -86.5, "logits/rejected": -86.5, "logps/chosen": -508.0, "logps/rejected": -464.0, "loss": 0.3478, "rewards/accuracies": 0.84375, "rewards/chosen": 2.453125, "rewards/margins": 1.8046875, "rewards/rejected": 0.6484375, "step": 4750 }, { "epoch": 1.258593336858805, "grad_norm": 53.25, "learning_rate": 3.707033315705976e-05, "logits/chosen": -86.5, "logits/rejected": -83.5, "logps/chosen": -600.0, "logps/rejected": -472.0, "loss": 0.3271, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.765625, "rewards/margins": 2.109375, "rewards/rejected": 0.64453125, "step": 4760 }, { "epoch": 1.2612374405076678, "grad_norm": 31.75, "learning_rate": 3.6938127974616606e-05, "logits/chosen": -87.5, "logits/rejected": -86.0, "logps/chosen": -564.0, "logps/rejected": -500.0, "loss": 0.3464, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.828125, "rewards/margins": 2.09375, "rewards/rejected": 0.73046875, "step": 4770 }, { "epoch": 1.263881544156531, "grad_norm": 12.875, "learning_rate": 3.6805922792173455e-05, "logits/chosen": -86.5, "logits/rejected": -86.0, "logps/chosen": -540.0, "logps/rejected": -502.0, "loss": 0.3155, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.921875, "rewards/margins": 2.1875, "rewards/rejected": 0.7421875, "step": 4780 }, { "epoch": 1.266525647805394, "grad_norm": 24.875, "learning_rate": 3.66737176097303e-05, "logits/chosen": -89.0, "logits/rejected": -87.0, "logps/chosen": -572.0, "logps/rejected": -528.0, "loss": 0.3372, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.703125, "rewards/margins": 2.09375, "rewards/rejected": 0.609375, "step": 4790 }, { "epoch": 1.269169751454257, "grad_norm": 16.375, "learning_rate": 3.654151242728715e-05, "logits/chosen": -86.5, "logits/rejected": -85.5, "logps/chosen": -576.0, "logps/rejected": -510.0, "loss": 0.2899, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.859375, "rewards/margins": 2.140625, "rewards/rejected": 0.71484375, "step": 4800 }, { "epoch": 1.2718138551031202, "grad_norm": 20.375, "learning_rate": 3.6409307244844e-05, "logits/chosen": -88.0, "logits/rejected": -84.0, "logps/chosen": -592.0, "logps/rejected": -478.0, "loss": 0.3, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.921875, "rewards/margins": 2.234375, "rewards/rejected": 0.68359375, "step": 4810 }, { "epoch": 1.274457958751983, "grad_norm": 31.625, "learning_rate": 3.627710206240085e-05, "logits/chosen": -87.0, "logits/rejected": -83.0, "logps/chosen": -572.0, "logps/rejected": -486.0, "loss": 0.3598, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.84375, "rewards/margins": 2.078125, "rewards/rejected": 0.76171875, "step": 4820 }, { "epoch": 1.277102062400846, "grad_norm": 23.0, "learning_rate": 3.6144896879957696e-05, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -584.0, "logps/rejected": -498.0, "loss": 0.2961, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.75, "rewards/margins": 2.265625, "rewards/rejected": 0.498046875, "step": 4830 }, { "epoch": 1.2797461660497091, "grad_norm": 39.25, "learning_rate": 3.6012691697514546e-05, "logits/chosen": -88.5, "logits/rejected": -86.0, "logps/chosen": -576.0, "logps/rejected": -490.0, "loss": 0.2797, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.6875, "rewards/margins": 2.15625, "rewards/rejected": 0.53515625, "step": 4840 }, { "epoch": 1.2823902696985723, "grad_norm": 38.0, "learning_rate": 3.5880486515071396e-05, "logits/chosen": -88.0, "logits/rejected": -83.0, "logps/chosen": -588.0, "logps/rejected": -486.0, "loss": 0.3599, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 3.0, "rewards/margins": 2.015625, "rewards/rejected": 0.9921875, "step": 4850 }, { "epoch": 1.2850343733474352, "grad_norm": 35.5, "learning_rate": 3.574828133262824e-05, "logits/chosen": -86.0, "logits/rejected": -83.5, "logps/chosen": -576.0, "logps/rejected": -498.0, "loss": 0.2934, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.125, "rewards/margins": 2.328125, "rewards/rejected": 0.78515625, "step": 4860 }, { "epoch": 1.2876784769962983, "grad_norm": 29.5, "learning_rate": 3.561607615018509e-05, "logits/chosen": -86.0, "logits/rejected": -84.0, "logps/chosen": -564.0, "logps/rejected": -482.0, "loss": 0.3782, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.515625, "rewards/margins": 1.71875, "rewards/rejected": 0.796875, "step": 4870 }, { "epoch": 1.2903225806451613, "grad_norm": 28.125, "learning_rate": 3.548387096774194e-05, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -636.0, "logps/rejected": -532.0, "loss": 0.3044, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.921875, "rewards/margins": 2.171875, "rewards/rejected": 0.7421875, "step": 4880 }, { "epoch": 1.2929666842940244, "grad_norm": 20.375, "learning_rate": 3.535166578529879e-05, "logits/chosen": -85.5, "logits/rejected": -84.5, "logps/chosen": -536.0, "logps/rejected": -458.0, "loss": 0.3514, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.59375, "rewards/margins": 2.078125, "rewards/rejected": 0.53125, "step": 4890 }, { "epoch": 1.2956107879428873, "grad_norm": 28.25, "learning_rate": 3.521946060285564e-05, "logits/chosen": -88.5, "logits/rejected": -87.0, "logps/chosen": -616.0, "logps/rejected": -532.0, "loss": 0.4041, "rewards/accuracies": 0.84375, "rewards/chosen": 2.96875, "rewards/margins": 1.9140625, "rewards/rejected": 1.0546875, "step": 4900 }, { "epoch": 1.2982548915917504, "grad_norm": 43.5, "learning_rate": 3.508725542041248e-05, "logits/chosen": -87.0, "logits/rejected": -86.5, "logps/chosen": -584.0, "logps/rejected": -516.0, "loss": 0.3493, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.703125, "rewards/margins": 1.984375, "rewards/rejected": 0.72265625, "step": 4910 }, { "epoch": 1.3008989952406134, "grad_norm": 38.0, "learning_rate": 3.495505023796933e-05, "logits/chosen": -86.0, "logits/rejected": -85.5, "logps/chosen": -580.0, "logps/rejected": -508.0, "loss": 0.3695, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.609375, "rewards/margins": 1.8671875, "rewards/rejected": 0.7421875, "step": 4920 }, { "epoch": 1.3035430988894765, "grad_norm": 18.0, "learning_rate": 3.482284505552618e-05, "logits/chosen": -86.5, "logits/rejected": -87.0, "logps/chosen": -576.0, "logps/rejected": -528.0, "loss": 0.2671, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.890625, "rewards/margins": 2.296875, "rewards/rejected": 0.59375, "step": 4930 }, { "epoch": 1.3061872025383394, "grad_norm": 27.125, "learning_rate": 3.469063987308303e-05, "logits/chosen": -85.5, "logits/rejected": -86.0, "logps/chosen": -600.0, "logps/rejected": -520.0, "loss": 0.3018, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.53125, "rewards/margins": 2.265625, "rewards/rejected": 0.263671875, "step": 4940 }, { "epoch": 1.3088313061872026, "grad_norm": 33.75, "learning_rate": 3.455843469063987e-05, "logits/chosen": -87.5, "logits/rejected": -84.5, "logps/chosen": -576.0, "logps/rejected": -468.0, "loss": 0.325, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.421875, "rewards/margins": 1.953125, "rewards/rejected": 0.462890625, "step": 4950 }, { "epoch": 1.3114754098360657, "grad_norm": 31.25, "learning_rate": 3.442622950819672e-05, "logits/chosen": -88.0, "logits/rejected": -83.5, "logps/chosen": -588.0, "logps/rejected": -476.0, "loss": 0.2924, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.640625, "rewards/margins": 2.171875, "rewards/rejected": 0.470703125, "step": 4960 }, { "epoch": 1.3141195134849286, "grad_norm": 27.0, "learning_rate": 3.429402432575357e-05, "logits/chosen": -88.0, "logits/rejected": -85.0, "logps/chosen": -576.0, "logps/rejected": -502.0, "loss": 0.3646, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.75, "rewards/margins": 1.96875, "rewards/rejected": 0.76953125, "step": 4970 }, { "epoch": 1.3167636171337915, "grad_norm": 23.375, "learning_rate": 3.416181914331042e-05, "logits/chosen": -87.0, "logits/rejected": -85.5, "logps/chosen": -544.0, "logps/rejected": -488.0, "loss": 0.311, "rewards/accuracies": 0.875, "rewards/chosen": 2.578125, "rewards/margins": 2.03125, "rewards/rejected": 0.55078125, "step": 4980 }, { "epoch": 1.3194077207826547, "grad_norm": 39.25, "learning_rate": 3.402961396086727e-05, "logits/chosen": -89.0, "logits/rejected": -85.0, "logps/chosen": -600.0, "logps/rejected": -504.0, "loss": 0.3481, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 2.890625, "rewards/margins": 2.125, "rewards/rejected": 0.75, "step": 4990 }, { "epoch": 1.3220518244315178, "grad_norm": 26.5, "learning_rate": 3.389740877842412e-05, "logits/chosen": -90.0, "logits/rejected": -86.0, "logps/chosen": -624.0, "logps/rejected": -536.0, "loss": 0.2722, "rewards/accuracies": 0.875, "rewards/chosen": 2.828125, "rewards/margins": 2.265625, "rewards/rejected": 0.5546875, "step": 5000 }, { "epoch": 1.3246959280803807, "grad_norm": 36.5, "learning_rate": 3.376520359598097e-05, "logits/chosen": -87.0, "logits/rejected": -84.5, "logps/chosen": -536.0, "logps/rejected": -492.0, "loss": 0.2944, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.40625, "rewards/margins": 2.0625, "rewards/rejected": 0.345703125, "step": 5010 }, { "epoch": 1.3273400317292439, "grad_norm": 26.125, "learning_rate": 3.363299841353781e-05, "logits/chosen": -89.0, "logits/rejected": -86.0, "logps/chosen": -592.0, "logps/rejected": -500.0, "loss": 0.2627, "rewards/accuracies": 0.90625, "rewards/chosen": 2.734375, "rewards/margins": 2.203125, "rewards/rejected": 0.53125, "step": 5020 }, { "epoch": 1.3299841353781068, "grad_norm": 33.5, "learning_rate": 3.350079323109466e-05, "logits/chosen": -89.5, "logits/rejected": -84.5, "logps/chosen": -616.0, "logps/rejected": -502.0, "loss": 0.3265, "rewards/accuracies": 0.875, "rewards/chosen": 2.84375, "rewards/margins": 2.109375, "rewards/rejected": 0.73046875, "step": 5030 }, { "epoch": 1.33262823902697, "grad_norm": 13.9375, "learning_rate": 3.336858804865151e-05, "logits/chosen": -86.0, "logits/rejected": -85.5, "logps/chosen": -532.0, "logps/rejected": -488.0, "loss": 0.2976, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.4375, "rewards/margins": 2.125, "rewards/rejected": 0.306640625, "step": 5040 }, { "epoch": 1.3352723426758328, "grad_norm": 36.25, "learning_rate": 3.3236382866208353e-05, "logits/chosen": -87.5, "logits/rejected": -84.0, "logps/chosen": -592.0, "logps/rejected": -532.0, "loss": 0.3068, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.390625, "rewards/margins": 2.078125, "rewards/rejected": 0.3125, "step": 5050 }, { "epoch": 1.337916446324696, "grad_norm": 19.0, "learning_rate": 3.31041776837652e-05, "logits/chosen": -88.5, "logits/rejected": -86.5, "logps/chosen": -568.0, "logps/rejected": -516.0, "loss": 0.3574, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 2.375, "rewards/margins": 1.8203125, "rewards/rejected": 0.54296875, "step": 5060 }, { "epoch": 1.340560549973559, "grad_norm": 32.75, "learning_rate": 3.297197250132205e-05, "logits/chosen": -86.0, "logits/rejected": -85.0, "logps/chosen": -604.0, "logps/rejected": -520.0, "loss": 0.3257, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.734375, "rewards/margins": 2.25, "rewards/rejected": 0.49609375, "step": 5070 }, { "epoch": 1.343204653622422, "grad_norm": 17.875, "learning_rate": 3.28397673188789e-05, "logits/chosen": -87.0, "logits/rejected": -85.5, "logps/chosen": -584.0, "logps/rejected": -544.0, "loss": 0.2988, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.65625, "rewards/margins": 2.265625, "rewards/rejected": 0.400390625, "step": 5080 }, { "epoch": 1.345848757271285, "grad_norm": 24.125, "learning_rate": 3.270756213643575e-05, "logits/chosen": -87.5, "logits/rejected": -85.0, "logps/chosen": -576.0, "logps/rejected": -460.0, "loss": 0.2759, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.4375, "rewards/margins": 2.21875, "rewards/rejected": 0.2099609375, "step": 5090 }, { "epoch": 1.348492860920148, "grad_norm": 33.5, "learning_rate": 3.2575356953992595e-05, "logits/chosen": -88.5, "logits/rejected": -86.0, "logps/chosen": -576.0, "logps/rejected": -472.0, "loss": 0.3835, "rewards/accuracies": 0.8125, "rewards/chosen": 2.71875, "rewards/margins": 2.09375, "rewards/rejected": 0.62109375, "step": 5100 }, { "epoch": 1.3511369645690112, "grad_norm": 22.875, "learning_rate": 3.2443151771549444e-05, "logits/chosen": -87.5, "logits/rejected": -85.0, "logps/chosen": -620.0, "logps/rejected": -496.0, "loss": 0.2837, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 3.25, "rewards/margins": 2.3125, "rewards/rejected": 0.94140625, "step": 5110 }, { "epoch": 1.3537810682178741, "grad_norm": 22.75, "learning_rate": 3.2310946589106294e-05, "logits/chosen": -85.5, "logits/rejected": -83.5, "logps/chosen": -544.0, "logps/rejected": -440.0, "loss": 0.3499, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 2.578125, "rewards/margins": 1.8671875, "rewards/rejected": 0.71484375, "step": 5120 }, { "epoch": 1.356425171866737, "grad_norm": 22.5, "learning_rate": 3.217874140666314e-05, "logits/chosen": -86.0, "logits/rejected": -84.5, "logps/chosen": -510.0, "logps/rejected": -456.0, "loss": 0.3386, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.796875, "rewards/margins": 2.25, "rewards/rejected": 0.546875, "step": 5130 }, { "epoch": 1.3590692755156002, "grad_norm": 19.75, "learning_rate": 3.204653622421999e-05, "logits/chosen": -86.5, "logits/rejected": -84.5, "logps/chosen": -540.0, "logps/rejected": -458.0, "loss": 0.2911, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.65625, "rewards/margins": 2.109375, "rewards/rejected": 0.5546875, "step": 5140 }, { "epoch": 1.3617133791644633, "grad_norm": 35.25, "learning_rate": 3.191433104177684e-05, "logits/chosen": -86.0, "logits/rejected": -87.0, "logps/chosen": -576.0, "logps/rejected": -520.0, "loss": 0.3155, "rewards/accuracies": 0.84375, "rewards/chosen": 2.8125, "rewards/margins": 2.21875, "rewards/rejected": 0.59375, "step": 5150 }, { "epoch": 1.3643574828133262, "grad_norm": 31.0, "learning_rate": 3.178212585933369e-05, "logits/chosen": -86.5, "logits/rejected": -85.0, "logps/chosen": -584.0, "logps/rejected": -504.0, "loss": 0.3411, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.453125, "rewards/margins": 1.9453125, "rewards/rejected": 0.5078125, "step": 5160 }, { "epoch": 1.3670015864621894, "grad_norm": 26.125, "learning_rate": 3.1649920676890535e-05, "logits/chosen": -88.0, "logits/rejected": -85.5, "logps/chosen": -616.0, "logps/rejected": -500.0, "loss": 0.3104, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.5625, "rewards/margins": 2.390625, "rewards/rejected": 0.1689453125, "step": 5170 }, { "epoch": 1.3696456901110523, "grad_norm": 25.75, "learning_rate": 3.1517715494447384e-05, "logits/chosen": -85.0, "logits/rejected": -84.0, "logps/chosen": -560.0, "logps/rejected": -478.0, "loss": 0.3124, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.640625, "rewards/margins": 2.03125, "rewards/rejected": 0.61328125, "step": 5180 }, { "epoch": 1.3722897937599154, "grad_norm": 23.375, "learning_rate": 3.138551031200423e-05, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -596.0, "logps/rejected": -516.0, "loss": 0.3148, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.625, "rewards/margins": 2.09375, "rewards/rejected": 0.52734375, "step": 5190 }, { "epoch": 1.3749338974087784, "grad_norm": 27.375, "learning_rate": 3.125330512956108e-05, "logits/chosen": -86.5, "logits/rejected": -85.5, "logps/chosen": -584.0, "logps/rejected": -536.0, "loss": 0.3289, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.703125, "rewards/margins": 1.9140625, "rewards/rejected": 0.7890625, "step": 5200 }, { "epoch": 1.3775780010576415, "grad_norm": 30.875, "learning_rate": 3.1121099947117926e-05, "logits/chosen": -89.0, "logits/rejected": -88.0, "logps/chosen": -568.0, "logps/rejected": -492.0, "loss": 0.3334, "rewards/accuracies": 0.84375, "rewards/chosen": 2.640625, "rewards/margins": 1.9453125, "rewards/rejected": 0.6875, "step": 5210 }, { "epoch": 1.3802221047065044, "grad_norm": 38.25, "learning_rate": 3.0988894764674776e-05, "logits/chosen": -86.5, "logits/rejected": -84.5, "logps/chosen": -506.0, "logps/rejected": -460.0, "loss": 0.4078, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 2.0, "rewards/margins": 1.578125, "rewards/rejected": 0.423828125, "step": 5220 }, { "epoch": 1.3828662083553676, "grad_norm": 28.5, "learning_rate": 3.0856689582231626e-05, "logits/chosen": -89.0, "logits/rejected": -88.0, "logps/chosen": -560.0, "logps/rejected": -516.0, "loss": 0.3208, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.296875, "rewards/margins": 2.125, "rewards/rejected": 0.1796875, "step": 5230 }, { "epoch": 1.3855103120042305, "grad_norm": 47.5, "learning_rate": 3.0724484399788475e-05, "logits/chosen": -88.0, "logits/rejected": -86.0, "logps/chosen": -568.0, "logps/rejected": -468.0, "loss": 0.3, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.328125, "rewards/margins": 2.21875, "rewards/rejected": 0.1123046875, "step": 5240 }, { "epoch": 1.3881544156530936, "grad_norm": 31.375, "learning_rate": 3.059227921734532e-05, "logits/chosen": -88.5, "logits/rejected": -85.5, "logps/chosen": -640.0, "logps/rejected": -516.0, "loss": 0.3559, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.734375, "rewards/margins": 1.890625, "rewards/rejected": 0.8515625, "step": 5250 }, { "epoch": 1.3907985193019567, "grad_norm": 37.75, "learning_rate": 3.046007403490217e-05, "logits/chosen": -87.0, "logits/rejected": -85.0, "logps/chosen": -568.0, "logps/rejected": -520.0, "loss": 0.341, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.609375, "rewards/margins": 1.9375, "rewards/rejected": 0.6796875, "step": 5260 }, { "epoch": 1.3934426229508197, "grad_norm": 25.625, "learning_rate": 3.0327868852459017e-05, "logits/chosen": -87.5, "logits/rejected": -84.5, "logps/chosen": -568.0, "logps/rejected": -448.0, "loss": 0.3521, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.59375, "rewards/margins": 1.78125, "rewards/rejected": 0.8125, "step": 5270 }, { "epoch": 1.3960867265996826, "grad_norm": 17.25, "learning_rate": 3.0195663670015867e-05, "logits/chosen": -87.0, "logits/rejected": -84.0, "logps/chosen": -564.0, "logps/rejected": -466.0, "loss": 0.3541, "rewards/accuracies": 0.8125, "rewards/chosen": 2.390625, "rewards/margins": 1.8671875, "rewards/rejected": 0.53125, "step": 5280 }, { "epoch": 1.3987308302485457, "grad_norm": 33.75, "learning_rate": 3.0063458487572716e-05, "logits/chosen": -88.0, "logits/rejected": -87.0, "logps/chosen": -568.0, "logps/rejected": -490.0, "loss": 0.3309, "rewards/accuracies": 0.8125, "rewards/chosen": 3.015625, "rewards/margins": 2.03125, "rewards/rejected": 0.984375, "step": 5290 }, { "epoch": 1.4013749338974089, "grad_norm": 37.0, "learning_rate": 2.9931253305129563e-05, "logits/chosen": -86.0, "logits/rejected": -84.5, "logps/chosen": -584.0, "logps/rejected": -478.0, "loss": 0.3638, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 2.515625, "rewards/margins": 1.828125, "rewards/rejected": 0.6796875, "step": 5300 }, { "epoch": 1.4040190375462718, "grad_norm": 38.75, "learning_rate": 2.9799048122686412e-05, "logits/chosen": -86.5, "logits/rejected": -85.0, "logps/chosen": -564.0, "logps/rejected": -480.0, "loss": 0.366, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.9375, "rewards/margins": 1.90625, "rewards/rejected": 1.0390625, "step": 5310 }, { "epoch": 1.406663141195135, "grad_norm": 30.75, "learning_rate": 2.9666842940243262e-05, "logits/chosen": -85.5, "logits/rejected": -84.0, "logps/chosen": -556.0, "logps/rejected": -486.0, "loss": 0.3389, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.5, "rewards/margins": 1.9609375, "rewards/rejected": 0.53515625, "step": 5320 }, { "epoch": 1.4093072448439978, "grad_norm": 18.375, "learning_rate": 2.953463775780011e-05, "logits/chosen": -89.0, "logits/rejected": -86.5, "logps/chosen": -548.0, "logps/rejected": -474.0, "loss": 0.3761, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.75, "rewards/margins": 1.8359375, "rewards/rejected": 0.91015625, "step": 5330 }, { "epoch": 1.411951348492861, "grad_norm": 55.5, "learning_rate": 2.9402432575356954e-05, "logits/chosen": -91.0, "logits/rejected": -85.0, "logps/chosen": -592.0, "logps/rejected": -466.0, "loss": 0.4255, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.90625, "rewards/margins": 1.953125, "rewards/rejected": 0.9609375, "step": 5340 }, { "epoch": 1.4145954521417239, "grad_norm": 34.75, "learning_rate": 2.92702273929138e-05, "logits/chosen": -87.0, "logits/rejected": -85.0, "logps/chosen": -584.0, "logps/rejected": -520.0, "loss": 0.3296, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.953125, "rewards/margins": 1.9609375, "rewards/rejected": 1.0, "step": 5350 }, { "epoch": 1.417239555790587, "grad_norm": 31.625, "learning_rate": 2.913802221047065e-05, "logits/chosen": -84.0, "logits/rejected": -83.5, "logps/chosen": -512.0, "logps/rejected": -464.0, "loss": 0.3574, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.40625, "rewards/margins": 1.84375, "rewards/rejected": 0.5625, "step": 5360 }, { "epoch": 1.4198836594394502, "grad_norm": 26.125, "learning_rate": 2.90058170280275e-05, "logits/chosen": -87.0, "logits/rejected": -84.5, "logps/chosen": -556.0, "logps/rejected": -474.0, "loss": 0.3247, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.609375, "rewards/margins": 2.078125, "rewards/rejected": 0.5234375, "step": 5370 }, { "epoch": 1.422527763088313, "grad_norm": 26.875, "learning_rate": 2.887361184558435e-05, "logits/chosen": -88.5, "logits/rejected": -88.5, "logps/chosen": -584.0, "logps/rejected": -588.0, "loss": 0.3514, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.578125, "rewards/margins": 2.03125, "rewards/rejected": 0.546875, "step": 5380 }, { "epoch": 1.425171866737176, "grad_norm": 177.0, "learning_rate": 2.8741406663141195e-05, "logits/chosen": -88.5, "logits/rejected": -85.0, "logps/chosen": -560.0, "logps/rejected": -448.0, "loss": 0.3196, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.515625, "rewards/margins": 2.03125, "rewards/rejected": 0.478515625, "step": 5390 }, { "epoch": 1.4278159703860391, "grad_norm": 40.5, "learning_rate": 2.8609201480698045e-05, "logits/chosen": -88.0, "logits/rejected": -85.5, "logps/chosen": -552.0, "logps/rejected": -528.0, "loss": 0.3297, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.4375, "rewards/margins": 1.9296875, "rewards/rejected": 0.5078125, "step": 5400 }, { "epoch": 1.4304600740349023, "grad_norm": 38.0, "learning_rate": 2.8476996298254894e-05, "logits/chosen": -89.0, "logits/rejected": -86.0, "logps/chosen": -576.0, "logps/rejected": -520.0, "loss": 0.3279, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.4375, "rewards/margins": 2.078125, "rewards/rejected": 0.365234375, "step": 5410 }, { "epoch": 1.4331041776837652, "grad_norm": 21.75, "learning_rate": 2.834479111581174e-05, "logits/chosen": -87.5, "logits/rejected": -84.0, "logps/chosen": -604.0, "logps/rejected": -480.0, "loss": 0.3051, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.53125, "rewards/margins": 2.125, "rewards/rejected": 0.3984375, "step": 5420 }, { "epoch": 1.435748281332628, "grad_norm": 21.25, "learning_rate": 2.821258593336859e-05, "logits/chosen": -85.5, "logits/rejected": -83.5, "logps/chosen": -592.0, "logps/rejected": -492.0, "loss": 0.3315, "rewards/accuracies": 0.84375, "rewards/chosen": 2.4375, "rewards/margins": 1.953125, "rewards/rejected": 0.490234375, "step": 5430 }, { "epoch": 1.4383923849814912, "grad_norm": 19.375, "learning_rate": 2.808038075092544e-05, "logits/chosen": -88.5, "logits/rejected": -86.0, "logps/chosen": -576.0, "logps/rejected": -520.0, "loss": 0.3067, "rewards/accuracies": 0.875, "rewards/chosen": 2.6875, "rewards/margins": 2.125, "rewards/rejected": 0.5703125, "step": 5440 }, { "epoch": 1.4410364886303544, "grad_norm": 27.625, "learning_rate": 2.794817556848229e-05, "logits/chosen": -89.0, "logits/rejected": -86.0, "logps/chosen": -600.0, "logps/rejected": -544.0, "loss": 0.3004, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.90625, "rewards/margins": 2.15625, "rewards/rejected": 0.75, "step": 5450 }, { "epoch": 1.4436805922792173, "grad_norm": 28.875, "learning_rate": 2.7815970386039136e-05, "logits/chosen": -89.5, "logits/rejected": -88.5, "logps/chosen": -584.0, "logps/rejected": -512.0, "loss": 0.3449, "rewards/accuracies": 0.8125, "rewards/chosen": 2.640625, "rewards/margins": 1.796875, "rewards/rejected": 0.84375, "step": 5460 }, { "epoch": 1.4463246959280804, "grad_norm": 38.75, "learning_rate": 2.7683765203595985e-05, "logits/chosen": -89.5, "logits/rejected": -87.0, "logps/chosen": -556.0, "logps/rejected": -498.0, "loss": 0.3147, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 2.46875, "rewards/margins": 2.09375, "rewards/rejected": 0.373046875, "step": 5470 }, { "epoch": 1.4489687995769434, "grad_norm": 15.9375, "learning_rate": 2.7551560021152828e-05, "logits/chosen": -88.5, "logits/rejected": -86.0, "logps/chosen": -616.0, "logps/rejected": -520.0, "loss": 0.3331, "rewards/accuracies": 0.84375, "rewards/chosen": 2.671875, "rewards/margins": 2.125, "rewards/rejected": 0.5390625, "step": 5480 }, { "epoch": 1.4516129032258065, "grad_norm": 48.25, "learning_rate": 2.7419354838709678e-05, "logits/chosen": -89.5, "logits/rejected": -85.5, "logps/chosen": -600.0, "logps/rejected": -466.0, "loss": 0.3116, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.4375, "rewards/margins": 1.828125, "rewards/rejected": 0.61328125, "step": 5490 }, { "epoch": 1.4542570068746694, "grad_norm": 15.5, "learning_rate": 2.7287149656266524e-05, "logits/chosen": -90.0, "logits/rejected": -86.5, "logps/chosen": -540.0, "logps/rejected": -456.0, "loss": 0.2896, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.390625, "rewards/margins": 1.9609375, "rewards/rejected": 0.431640625, "step": 5500 }, { "epoch": 1.4569011105235325, "grad_norm": 27.5, "learning_rate": 2.7154944473823373e-05, "logits/chosen": -89.5, "logits/rejected": -84.5, "logps/chosen": -632.0, "logps/rejected": -516.0, "loss": 0.3, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.859375, "rewards/margins": 2.0625, "rewards/rejected": 0.7890625, "step": 5510 }, { "epoch": 1.4595452141723957, "grad_norm": 19.5, "learning_rate": 2.7022739291380223e-05, "logits/chosen": -89.5, "logits/rejected": -86.0, "logps/chosen": -624.0, "logps/rejected": -510.0, "loss": 0.3172, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.921875, "rewards/margins": 2.375, "rewards/rejected": 0.546875, "step": 5520 }, { "epoch": 1.4621893178212586, "grad_norm": 30.375, "learning_rate": 2.6890534108937072e-05, "logits/chosen": -88.0, "logits/rejected": -85.5, "logps/chosen": -552.0, "logps/rejected": -470.0, "loss": 0.3256, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.734375, "rewards/margins": 2.125, "rewards/rejected": 0.60546875, "step": 5530 }, { "epoch": 1.4648334214701215, "grad_norm": 35.5, "learning_rate": 2.675832892649392e-05, "logits/chosen": -88.0, "logits/rejected": -87.5, "logps/chosen": -600.0, "logps/rejected": -548.0, "loss": 0.3046, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.078125, "rewards/margins": 2.0625, "rewards/rejected": 1.015625, "step": 5540 }, { "epoch": 1.4674775251189847, "grad_norm": 24.25, "learning_rate": 2.6626123744050768e-05, "logits/chosen": -87.5, "logits/rejected": -86.5, "logps/chosen": -588.0, "logps/rejected": -508.0, "loss": 0.3045, "rewards/accuracies": 0.875, "rewards/chosen": 2.90625, "rewards/margins": 1.8984375, "rewards/rejected": 1.0078125, "step": 5550 }, { "epoch": 1.4701216287678478, "grad_norm": 44.0, "learning_rate": 2.6493918561607618e-05, "logits/chosen": -88.0, "logits/rejected": -87.0, "logps/chosen": -616.0, "logps/rejected": -548.0, "loss": 0.4212, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.953125, "rewards/margins": 1.90625, "rewards/rejected": 1.046875, "step": 5560 }, { "epoch": 1.4727657324167107, "grad_norm": 36.75, "learning_rate": 2.6361713379164464e-05, "logits/chosen": -86.5, "logits/rejected": -84.5, "logps/chosen": -548.0, "logps/rejected": -504.0, "loss": 0.324, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.640625, "rewards/margins": 2.15625, "rewards/rejected": 0.474609375, "step": 5570 }, { "epoch": 1.4754098360655736, "grad_norm": 24.125, "learning_rate": 2.6229508196721314e-05, "logits/chosen": -86.0, "logits/rejected": -84.0, "logps/chosen": -552.0, "logps/rejected": -476.0, "loss": 0.3021, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.90625, "rewards/margins": 2.140625, "rewards/rejected": 0.765625, "step": 5580 }, { "epoch": 1.4780539397144368, "grad_norm": 14.75, "learning_rate": 2.6097303014278163e-05, "logits/chosen": -88.5, "logits/rejected": -85.0, "logps/chosen": -596.0, "logps/rejected": -510.0, "loss": 0.2771, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.0625, "rewards/margins": 2.25, "rewards/rejected": 0.81640625, "step": 5590 }, { "epoch": 1.4806980433633, "grad_norm": 19.5, "learning_rate": 2.5965097831835013e-05, "logits/chosen": -88.0, "logits/rejected": -85.0, "logps/chosen": -596.0, "logps/rejected": -520.0, "loss": 0.3145, "rewards/accuracies": 0.875, "rewards/chosen": 2.9375, "rewards/margins": 2.03125, "rewards/rejected": 0.8984375, "step": 5600 }, { "epoch": 1.4833421470121628, "grad_norm": 45.0, "learning_rate": 2.583289264939186e-05, "logits/chosen": -90.0, "logits/rejected": -87.0, "logps/chosen": -624.0, "logps/rejected": -540.0, "loss": 0.3374, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 2.875, "rewards/margins": 2.046875, "rewards/rejected": 0.828125, "step": 5610 }, { "epoch": 1.485986250661026, "grad_norm": 18.875, "learning_rate": 2.5700687466948702e-05, "logits/chosen": -89.5, "logits/rejected": -86.5, "logps/chosen": -616.0, "logps/rejected": -506.0, "loss": 0.2798, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 3.015625, "rewards/margins": 2.34375, "rewards/rejected": 0.67578125, "step": 5620 }, { "epoch": 1.4886303543098889, "grad_norm": 19.875, "learning_rate": 2.556848228450555e-05, "logits/chosen": -87.0, "logits/rejected": -84.5, "logps/chosen": -588.0, "logps/rejected": -486.0, "loss": 0.2718, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.78125, "rewards/margins": 2.3125, "rewards/rejected": 0.47265625, "step": 5630 }, { "epoch": 1.491274457958752, "grad_norm": 22.25, "learning_rate": 2.54362771020624e-05, "logits/chosen": -87.5, "logits/rejected": -82.5, "logps/chosen": -568.0, "logps/rejected": -468.0, "loss": 0.3641, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.65625, "rewards/margins": 1.8515625, "rewards/rejected": 0.8046875, "step": 5640 }, { "epoch": 1.493918561607615, "grad_norm": 33.0, "learning_rate": 2.5304071919619247e-05, "logits/chosen": -87.0, "logits/rejected": -86.5, "logps/chosen": -588.0, "logps/rejected": -480.0, "loss": 0.2944, "rewards/accuracies": 0.90625, "rewards/chosen": 2.53125, "rewards/margins": 2.140625, "rewards/rejected": 0.392578125, "step": 5650 }, { "epoch": 1.496562665256478, "grad_norm": 31.875, "learning_rate": 2.5171866737176097e-05, "logits/chosen": -89.0, "logits/rejected": -83.0, "logps/chosen": -576.0, "logps/rejected": -456.0, "loss": 0.2659, "rewards/accuracies": 0.90625, "rewards/chosen": 2.890625, "rewards/margins": 2.359375, "rewards/rejected": 0.5234375, "step": 5660 }, { "epoch": 1.4992067689053412, "grad_norm": 29.0, "learning_rate": 2.5039661554732946e-05, "logits/chosen": -90.0, "logits/rejected": -86.0, "logps/chosen": -600.0, "logps/rejected": -520.0, "loss": 0.313, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.75, "rewards/margins": 2.171875, "rewards/rejected": 0.578125, "step": 5670 }, { "epoch": 1.5, "eval_logits/chosen": -88.0, "eval_logits/rejected": -86.0, "eval_logps/chosen": -580.0, "eval_logps/rejected": -492.0, "eval_loss": 0.5979180932044983, "eval_rewards/accuracies": 0.6942168474197388, "eval_rewards/chosen": 2.203125, "eval_rewards/margins": 1.2734375, "eval_rewards/rejected": 0.9375, "eval_runtime": 998.7316, "eval_samples_per_second": 15.145, "eval_steps_per_second": 0.947, "step": 5673 }, { "epoch": 1.5018508725542041, "grad_norm": 17.625, "learning_rate": 2.4907456372289796e-05, "logits/chosen": -89.0, "logits/rejected": -86.0, "logps/chosen": -588.0, "logps/rejected": -490.0, "loss": 0.3804, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.546875, "rewards/margins": 2.15625, "rewards/rejected": 0.392578125, "step": 5680 }, { "epoch": 1.504494976203067, "grad_norm": 16.625, "learning_rate": 2.4775251189846642e-05, "logits/chosen": -90.5, "logits/rejected": -86.0, "logps/chosen": -592.0, "logps/rejected": -492.0, "loss": 0.3243, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.78125, "rewards/margins": 2.140625, "rewards/rejected": 0.64453125, "step": 5690 }, { "epoch": 1.5071390798519302, "grad_norm": 16.625, "learning_rate": 2.464304600740349e-05, "logits/chosen": -87.5, "logits/rejected": -85.5, "logps/chosen": -556.0, "logps/rejected": -504.0, "loss": 0.2974, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.53125, "rewards/margins": 1.984375, "rewards/rejected": 0.55859375, "step": 5700 }, { "epoch": 1.5097831835007933, "grad_norm": 19.25, "learning_rate": 2.451084082496034e-05, "logits/chosen": -89.0, "logits/rejected": -83.5, "logps/chosen": -592.0, "logps/rejected": -452.0, "loss": 0.3007, "rewards/accuracies": 0.875, "rewards/chosen": 2.71875, "rewards/margins": 2.015625, "rewards/rejected": 0.69921875, "step": 5710 }, { "epoch": 1.5124272871496562, "grad_norm": 24.75, "learning_rate": 2.4378635642517187e-05, "logits/chosen": -87.0, "logits/rejected": -84.5, "logps/chosen": -604.0, "logps/rejected": -528.0, "loss": 0.3082, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.578125, "rewards/margins": 1.9296875, "rewards/rejected": 0.64453125, "step": 5720 }, { "epoch": 1.5150713907985192, "grad_norm": 37.0, "learning_rate": 2.4246430460074034e-05, "logits/chosen": -86.5, "logits/rejected": -83.5, "logps/chosen": -584.0, "logps/rejected": -488.0, "loss": 0.323, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.515625, "rewards/margins": 1.9609375, "rewards/rejected": 0.54296875, "step": 5730 }, { "epoch": 1.5177154944473823, "grad_norm": 15.8125, "learning_rate": 2.4114225277630883e-05, "logits/chosen": -84.5, "logits/rejected": -84.5, "logps/chosen": -592.0, "logps/rejected": -502.0, "loss": 0.337, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.640625, "rewards/margins": 1.9765625, "rewards/rejected": 0.66796875, "step": 5740 }, { "epoch": 1.5203595980962454, "grad_norm": 33.0, "learning_rate": 2.3982020095187733e-05, "logits/chosen": -86.0, "logits/rejected": -84.0, "logps/chosen": -528.0, "logps/rejected": -478.0, "loss": 0.3395, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.3125, "rewards/margins": 2.0625, "rewards/rejected": 0.263671875, "step": 5750 }, { "epoch": 1.5230037017451084, "grad_norm": 45.0, "learning_rate": 2.3849814912744582e-05, "logits/chosen": -88.5, "logits/rejected": -84.5, "logps/chosen": -524.0, "logps/rejected": -450.0, "loss": 0.3704, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.453125, "rewards/margins": 1.8359375, "rewards/rejected": 0.6171875, "step": 5760 }, { "epoch": 1.5256478053939715, "grad_norm": 21.25, "learning_rate": 2.371760973030143e-05, "logits/chosen": -88.0, "logits/rejected": -86.0, "logps/chosen": -580.0, "logps/rejected": -536.0, "loss": 0.308, "rewards/accuracies": 0.84375, "rewards/chosen": 2.625, "rewards/margins": 2.296875, "rewards/rejected": 0.3203125, "step": 5770 }, { "epoch": 1.5282919090428346, "grad_norm": 17.125, "learning_rate": 2.3585404547858278e-05, "logits/chosen": -87.5, "logits/rejected": -87.5, "logps/chosen": -568.0, "logps/rejected": -486.0, "loss": 0.2803, "rewards/accuracies": 0.90625, "rewards/chosen": 2.71875, "rewards/margins": 2.109375, "rewards/rejected": 0.61328125, "step": 5780 }, { "epoch": 1.5309360126916975, "grad_norm": 19.375, "learning_rate": 2.3453199365415128e-05, "logits/chosen": -88.5, "logits/rejected": -85.5, "logps/chosen": -544.0, "logps/rejected": -482.0, "loss": 0.3092, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.640625, "rewards/margins": 2.171875, "rewards/rejected": 0.46875, "step": 5790 }, { "epoch": 1.5335801163405605, "grad_norm": 30.875, "learning_rate": 2.3320994182971974e-05, "logits/chosen": -88.5, "logits/rejected": -84.0, "logps/chosen": -552.0, "logps/rejected": -450.0, "loss": 0.3478, "rewards/accuracies": 0.875, "rewards/chosen": 2.640625, "rewards/margins": 1.953125, "rewards/rejected": 0.6875, "step": 5800 }, { "epoch": 1.5362242199894236, "grad_norm": 44.75, "learning_rate": 2.318878900052882e-05, "logits/chosen": -87.5, "logits/rejected": -85.5, "logps/chosen": -520.0, "logps/rejected": -466.0, "loss": 0.3344, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.4375, "rewards/margins": 1.859375, "rewards/rejected": 0.57421875, "step": 5810 }, { "epoch": 1.5388683236382867, "grad_norm": 33.25, "learning_rate": 2.305658381808567e-05, "logits/chosen": -88.0, "logits/rejected": -84.0, "logps/chosen": -584.0, "logps/rejected": -510.0, "loss": 0.3671, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.78125, "rewards/margins": 1.953125, "rewards/rejected": 0.8203125, "step": 5820 }, { "epoch": 1.5415124272871497, "grad_norm": 29.25, "learning_rate": 2.292437863564252e-05, "logits/chosen": -86.5, "logits/rejected": -86.5, "logps/chosen": -556.0, "logps/rejected": -516.0, "loss": 0.3735, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 2.25, "rewards/margins": 1.8359375, "rewards/rejected": 0.419921875, "step": 5830 }, { "epoch": 1.5441565309360126, "grad_norm": 17.125, "learning_rate": 2.2792173453199366e-05, "logits/chosen": -86.5, "logits/rejected": -86.5, "logps/chosen": -544.0, "logps/rejected": -498.0, "loss": 0.2946, "rewards/accuracies": 0.84375, "rewards/chosen": 2.453125, "rewards/margins": 2.140625, "rewards/rejected": 0.314453125, "step": 5840 }, { "epoch": 1.5468006345848757, "grad_norm": 41.25, "learning_rate": 2.2659968270756215e-05, "logits/chosen": -89.5, "logits/rejected": -87.5, "logps/chosen": -588.0, "logps/rejected": -564.0, "loss": 0.3192, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.875, "rewards/margins": 2.28125, "rewards/rejected": 0.59765625, "step": 5850 }, { "epoch": 1.5494447382337388, "grad_norm": 23.375, "learning_rate": 2.2527763088313065e-05, "logits/chosen": -89.0, "logits/rejected": -86.0, "logps/chosen": -568.0, "logps/rejected": -484.0, "loss": 0.3553, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.15625, "rewards/margins": 1.734375, "rewards/rejected": 0.42578125, "step": 5860 }, { "epoch": 1.5520888418826018, "grad_norm": 32.25, "learning_rate": 2.239555790586991e-05, "logits/chosen": -88.0, "logits/rejected": -87.0, "logps/chosen": -568.0, "logps/rejected": -516.0, "loss": 0.3757, "rewards/accuracies": 0.8125, "rewards/chosen": 2.375, "rewards/margins": 1.75, "rewards/rejected": 0.625, "step": 5870 }, { "epoch": 1.5547329455314647, "grad_norm": 32.5, "learning_rate": 2.2263352723426757e-05, "logits/chosen": -88.5, "logits/rejected": -86.0, "logps/chosen": -572.0, "logps/rejected": -500.0, "loss": 0.3561, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 2.296875, "rewards/margins": 1.8125, "rewards/rejected": 0.494140625, "step": 5880 }, { "epoch": 1.5573770491803278, "grad_norm": 30.375, "learning_rate": 2.2131147540983607e-05, "logits/chosen": -89.0, "logits/rejected": -86.5, "logps/chosen": -600.0, "logps/rejected": -548.0, "loss": 0.3528, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.75, "rewards/margins": 2.34375, "rewards/rejected": 0.41015625, "step": 5890 }, { "epoch": 1.560021152829191, "grad_norm": 18.25, "learning_rate": 2.1998942358540456e-05, "logits/chosen": -86.5, "logits/rejected": -87.0, "logps/chosen": -568.0, "logps/rejected": -504.0, "loss": 0.3239, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.359375, "rewards/margins": 2.15625, "rewards/rejected": 0.19921875, "step": 5900 }, { "epoch": 1.5626652564780539, "grad_norm": 30.375, "learning_rate": 2.1866737176097306e-05, "logits/chosen": -89.5, "logits/rejected": -88.0, "logps/chosen": -580.0, "logps/rejected": -532.0, "loss": 0.3224, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.625, "rewards/margins": 2.171875, "rewards/rejected": 0.455078125, "step": 5910 }, { "epoch": 1.565309360126917, "grad_norm": 37.0, "learning_rate": 2.1734531993654152e-05, "logits/chosen": -88.0, "logits/rejected": -83.5, "logps/chosen": -608.0, "logps/rejected": -464.0, "loss": 0.2644, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.65625, "rewards/margins": 2.140625, "rewards/rejected": 0.5078125, "step": 5920 }, { "epoch": 1.5679534637757802, "grad_norm": 21.75, "learning_rate": 2.1602326811211e-05, "logits/chosen": -88.0, "logits/rejected": -86.5, "logps/chosen": -576.0, "logps/rejected": -548.0, "loss": 0.2901, "rewards/accuracies": 0.875, "rewards/chosen": 2.640625, "rewards/margins": 2.046875, "rewards/rejected": 0.59765625, "step": 5930 }, { "epoch": 1.570597567424643, "grad_norm": 22.75, "learning_rate": 2.1470121628767848e-05, "logits/chosen": -87.0, "logits/rejected": -82.5, "logps/chosen": -576.0, "logps/rejected": -472.0, "loss": 0.3521, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.515625, "rewards/margins": 1.8984375, "rewards/rejected": 0.609375, "step": 5940 }, { "epoch": 1.573241671073506, "grad_norm": 23.75, "learning_rate": 2.1337916446324697e-05, "logits/chosen": -88.5, "logits/rejected": -85.5, "logps/chosen": -652.0, "logps/rejected": -524.0, "loss": 0.2767, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.0625, "rewards/margins": 2.359375, "rewards/rejected": 0.703125, "step": 5950 }, { "epoch": 1.5758857747223691, "grad_norm": 40.25, "learning_rate": 2.1205711263881544e-05, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -616.0, "logps/rejected": -552.0, "loss": 0.3234, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.71875, "rewards/margins": 2.09375, "rewards/rejected": 0.62109375, "step": 5960 }, { "epoch": 1.5785298783712323, "grad_norm": 18.625, "learning_rate": 2.1073506081438393e-05, "logits/chosen": -89.0, "logits/rejected": -84.5, "logps/chosen": -596.0, "logps/rejected": -452.0, "loss": 0.303, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.90625, "rewards/margins": 2.15625, "rewards/rejected": 0.75390625, "step": 5970 }, { "epoch": 1.5811739820200952, "grad_norm": 29.125, "learning_rate": 2.0941300898995243e-05, "logits/chosen": -88.0, "logits/rejected": -86.0, "logps/chosen": -532.0, "logps/rejected": -510.0, "loss": 0.3171, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.28125, "rewards/margins": 1.8671875, "rewards/rejected": 0.408203125, "step": 5980 }, { "epoch": 1.583818085668958, "grad_norm": 41.0, "learning_rate": 2.080909571655209e-05, "logits/chosen": -87.5, "logits/rejected": -83.5, "logps/chosen": -596.0, "logps/rejected": -502.0, "loss": 0.3174, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.515625, "rewards/margins": 2.046875, "rewards/rejected": 0.462890625, "step": 5990 }, { "epoch": 1.5864621893178212, "grad_norm": 43.75, "learning_rate": 2.067689053410894e-05, "logits/chosen": -89.5, "logits/rejected": -86.0, "logps/chosen": -620.0, "logps/rejected": -506.0, "loss": 0.3836, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 2.671875, "rewards/margins": 1.9140625, "rewards/rejected": 0.76171875, "step": 6000 }, { "epoch": 1.5891062929666844, "grad_norm": 24.375, "learning_rate": 2.0544685351665785e-05, "logits/chosen": -85.0, "logits/rejected": -84.0, "logps/chosen": -536.0, "logps/rejected": -460.0, "loss": 0.3421, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.15625, "rewards/margins": 1.921875, "rewards/rejected": 0.244140625, "step": 6010 }, { "epoch": 1.5917503966155473, "grad_norm": 26.375, "learning_rate": 2.0412480169222634e-05, "logits/chosen": -89.0, "logits/rejected": -83.5, "logps/chosen": -568.0, "logps/rejected": -448.0, "loss": 0.322, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.578125, "rewards/margins": 2.109375, "rewards/rejected": 0.46875, "step": 6020 }, { "epoch": 1.5943945002644102, "grad_norm": 32.0, "learning_rate": 2.0280274986779484e-05, "logits/chosen": -89.5, "logits/rejected": -87.0, "logps/chosen": -600.0, "logps/rejected": -512.0, "loss": 0.3613, "rewards/accuracies": 0.84375, "rewards/chosen": 2.53125, "rewards/margins": 1.9453125, "rewards/rejected": 0.59375, "step": 6030 }, { "epoch": 1.5970386039132733, "grad_norm": 187.0, "learning_rate": 2.014806980433633e-05, "logits/chosen": -86.5, "logits/rejected": -85.0, "logps/chosen": -520.0, "logps/rejected": -468.0, "loss": 0.3957, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.5, "rewards/margins": 1.7109375, "rewards/rejected": 0.79296875, "step": 6040 }, { "epoch": 1.5996827075621365, "grad_norm": 30.375, "learning_rate": 2.001586462189318e-05, "logits/chosen": -89.0, "logits/rejected": -88.0, "logps/chosen": -544.0, "logps/rejected": -476.0, "loss": 0.3457, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.78125, "rewards/margins": 1.9296875, "rewards/rejected": 0.859375, "step": 6050 }, { "epoch": 1.6023268112109994, "grad_norm": 21.25, "learning_rate": 1.988365943945003e-05, "logits/chosen": -87.0, "logits/rejected": -85.5, "logps/chosen": -568.0, "logps/rejected": -478.0, "loss": 0.2854, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.5, "rewards/margins": 2.203125, "rewards/rejected": 0.294921875, "step": 6060 }, { "epoch": 1.6049709148598625, "grad_norm": 37.75, "learning_rate": 1.9751454257006875e-05, "logits/chosen": -86.5, "logits/rejected": -86.0, "logps/chosen": -564.0, "logps/rejected": -516.0, "loss": 0.4138, "rewards/accuracies": 0.84375, "rewards/chosen": 2.546875, "rewards/margins": 1.8515625, "rewards/rejected": 0.6953125, "step": 6070 }, { "epoch": 1.6076150185087257, "grad_norm": 21.0, "learning_rate": 1.961924907456372e-05, "logits/chosen": -89.0, "logits/rejected": -86.0, "logps/chosen": -616.0, "logps/rejected": -536.0, "loss": 0.2776, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.953125, "rewards/margins": 2.3125, "rewards/rejected": 0.640625, "step": 6080 }, { "epoch": 1.6102591221575886, "grad_norm": 18.25, "learning_rate": 1.948704389212057e-05, "logits/chosen": -88.5, "logits/rejected": -88.5, "logps/chosen": -580.0, "logps/rejected": -492.0, "loss": 0.33, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.84375, "rewards/margins": 1.9296875, "rewards/rejected": 0.91015625, "step": 6090 }, { "epoch": 1.6129032258064515, "grad_norm": 37.25, "learning_rate": 1.935483870967742e-05, "logits/chosen": -89.5, "logits/rejected": -86.0, "logps/chosen": -572.0, "logps/rejected": -484.0, "loss": 0.3283, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.59375, "rewards/margins": 2.0625, "rewards/rejected": 0.52734375, "step": 6100 }, { "epoch": 1.6155473294553147, "grad_norm": 16.75, "learning_rate": 1.9222633527234267e-05, "logits/chosen": -87.0, "logits/rejected": -87.0, "logps/chosen": -536.0, "logps/rejected": -500.0, "loss": 0.3043, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5, "rewards/margins": 2.0, "rewards/rejected": 0.498046875, "step": 6110 }, { "epoch": 1.6181914331041778, "grad_norm": 31.625, "learning_rate": 1.9090428344791117e-05, "logits/chosen": -86.0, "logits/rejected": -84.5, "logps/chosen": -540.0, "logps/rejected": -504.0, "loss": 0.3629, "rewards/accuracies": 0.8125, "rewards/chosen": 2.59375, "rewards/margins": 1.8359375, "rewards/rejected": 0.75, "step": 6120 }, { "epoch": 1.6208355367530407, "grad_norm": 18.5, "learning_rate": 1.8958223162347966e-05, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -548.0, "logps/rejected": -476.0, "loss": 0.313, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.5625, "rewards/margins": 1.9609375, "rewards/rejected": 0.6015625, "step": 6130 }, { "epoch": 1.6234796404019036, "grad_norm": 13.9375, "learning_rate": 1.8826017979904816e-05, "logits/chosen": -88.0, "logits/rejected": -87.5, "logps/chosen": -588.0, "logps/rejected": -520.0, "loss": 0.2446, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.6875, "rewards/margins": 2.40625, "rewards/rejected": 0.291015625, "step": 6140 }, { "epoch": 1.6261237440507668, "grad_norm": 52.25, "learning_rate": 1.869381279746166e-05, "logits/chosen": -88.0, "logits/rejected": -86.0, "logps/chosen": -576.0, "logps/rejected": -502.0, "loss": 0.3357, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.875, "rewards/margins": 2.296875, "rewards/rejected": 0.57421875, "step": 6150 }, { "epoch": 1.62876784769963, "grad_norm": 16.0, "learning_rate": 1.8561607615018508e-05, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -608.0, "logps/rejected": -548.0, "loss": 0.2862, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.859375, "rewards/margins": 2.1875, "rewards/rejected": 0.671875, "step": 6160 }, { "epoch": 1.6314119513484928, "grad_norm": 32.0, "learning_rate": 1.8429402432575358e-05, "logits/chosen": -86.0, "logits/rejected": -86.0, "logps/chosen": -592.0, "logps/rejected": -500.0, "loss": 0.3537, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.734375, "rewards/margins": 2.109375, "rewards/rejected": 0.625, "step": 6170 }, { "epoch": 1.6340560549973557, "grad_norm": 47.5, "learning_rate": 1.8297197250132207e-05, "logits/chosen": -85.0, "logits/rejected": -83.5, "logps/chosen": -552.0, "logps/rejected": -476.0, "loss": 0.339, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 2.453125, "rewards/margins": 1.9375, "rewards/rejected": 0.5234375, "step": 6180 }, { "epoch": 1.636700158646219, "grad_norm": 24.25, "learning_rate": 1.8164992067689054e-05, "logits/chosen": -88.0, "logits/rejected": -87.5, "logps/chosen": -572.0, "logps/rejected": -524.0, "loss": 0.3662, "rewards/accuracies": 0.875, "rewards/chosen": 2.515625, "rewards/margins": 2.0, "rewards/rejected": 0.51953125, "step": 6190 }, { "epoch": 1.639344262295082, "grad_norm": 16.5, "learning_rate": 1.8032786885245903e-05, "logits/chosen": -89.0, "logits/rejected": -85.0, "logps/chosen": -592.0, "logps/rejected": -486.0, "loss": 0.2581, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.109375, "rewards/margins": 2.328125, "rewards/rejected": 0.76953125, "step": 6200 }, { "epoch": 1.641988365943945, "grad_norm": 19.0, "learning_rate": 1.7900581702802753e-05, "logits/chosen": -86.0, "logits/rejected": -87.5, "logps/chosen": -532.0, "logps/rejected": -512.0, "loss": 0.3002, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.578125, "rewards/margins": 1.921875, "rewards/rejected": 0.65234375, "step": 6210 }, { "epoch": 1.644632469592808, "grad_norm": 27.25, "learning_rate": 1.77683765203596e-05, "logits/chosen": -87.0, "logits/rejected": -85.5, "logps/chosen": -512.0, "logps/rejected": -460.0, "loss": 0.3944, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.28125, "rewards/margins": 1.59375, "rewards/rejected": 0.6875, "step": 6220 }, { "epoch": 1.6472765732416712, "grad_norm": 30.5, "learning_rate": 1.7636171337916445e-05, "logits/chosen": -88.0, "logits/rejected": -86.5, "logps/chosen": -596.0, "logps/rejected": -532.0, "loss": 0.2762, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.96875, "rewards/margins": 2.328125, "rewards/rejected": 0.63671875, "step": 6230 }, { "epoch": 1.6499206768905341, "grad_norm": 21.5, "learning_rate": 1.7503966155473295e-05, "logits/chosen": -87.5, "logits/rejected": -85.0, "logps/chosen": -560.0, "logps/rejected": -484.0, "loss": 0.2902, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.578125, "rewards/margins": 2.046875, "rewards/rejected": 0.53515625, "step": 6240 }, { "epoch": 1.652564780539397, "grad_norm": 30.0, "learning_rate": 1.7371760973030144e-05, "logits/chosen": -87.0, "logits/rejected": -84.0, "logps/chosen": -564.0, "logps/rejected": -470.0, "loss": 0.329, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.5, "rewards/margins": 1.953125, "rewards/rejected": 0.546875, "step": 6250 }, { "epoch": 1.6552088841882602, "grad_norm": 11.125, "learning_rate": 1.723955579058699e-05, "logits/chosen": -87.5, "logits/rejected": -88.0, "logps/chosen": -576.0, "logps/rejected": -552.0, "loss": 0.3291, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.859375, "rewards/margins": 2.3125, "rewards/rejected": 0.55078125, "step": 6260 }, { "epoch": 1.6578529878371233, "grad_norm": 30.875, "learning_rate": 1.710735060814384e-05, "logits/chosen": -88.0, "logits/rejected": -85.5, "logps/chosen": -560.0, "logps/rejected": -462.0, "loss": 0.3063, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.4375, "rewards/margins": 1.8984375, "rewards/rejected": 0.54296875, "step": 6270 }, { "epoch": 1.6604970914859862, "grad_norm": 14.25, "learning_rate": 1.697514542570069e-05, "logits/chosen": -87.0, "logits/rejected": -84.5, "logps/chosen": -568.0, "logps/rejected": -474.0, "loss": 0.3575, "rewards/accuracies": 0.84375, "rewards/chosen": 2.546875, "rewards/margins": 1.8984375, "rewards/rejected": 0.64453125, "step": 6280 }, { "epoch": 1.6631411951348491, "grad_norm": 26.125, "learning_rate": 1.684294024325754e-05, "logits/chosen": -88.5, "logits/rejected": -86.5, "logps/chosen": -532.0, "logps/rejected": -490.0, "loss": 0.3532, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.609375, "rewards/margins": 1.7578125, "rewards/rejected": 0.8515625, "step": 6290 }, { "epoch": 1.6657852987837123, "grad_norm": 24.125, "learning_rate": 1.6710735060814382e-05, "logits/chosen": -88.5, "logits/rejected": -85.0, "logps/chosen": -592.0, "logps/rejected": -490.0, "loss": 0.2921, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.8125, "rewards/margins": 1.984375, "rewards/rejected": 0.8203125, "step": 6300 }, { "epoch": 1.6684294024325754, "grad_norm": 22.0, "learning_rate": 1.657852987837123e-05, "logits/chosen": -88.5, "logits/rejected": -85.5, "logps/chosen": -572.0, "logps/rejected": -500.0, "loss": 0.3233, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.421875, "rewards/margins": 1.984375, "rewards/rejected": 0.4453125, "step": 6310 }, { "epoch": 1.6710735060814383, "grad_norm": 82.0, "learning_rate": 1.644632469592808e-05, "logits/chosen": -87.5, "logits/rejected": -86.0, "logps/chosen": -572.0, "logps/rejected": -494.0, "loss": 0.3807, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.515625, "rewards/margins": 1.890625, "rewards/rejected": 0.62890625, "step": 6320 }, { "epoch": 1.6737176097303015, "grad_norm": 40.25, "learning_rate": 1.631411951348493e-05, "logits/chosen": -87.5, "logits/rejected": -86.5, "logps/chosen": -612.0, "logps/rejected": -540.0, "loss": 0.3169, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 3.0, "rewards/margins": 2.203125, "rewards/rejected": 0.78515625, "step": 6330 }, { "epoch": 1.6763617133791646, "grad_norm": 32.0, "learning_rate": 1.6181914331041777e-05, "logits/chosen": -87.5, "logits/rejected": -86.0, "logps/chosen": -604.0, "logps/rejected": -520.0, "loss": 0.3, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.859375, "rewards/margins": 2.28125, "rewards/rejected": 0.58984375, "step": 6340 }, { "epoch": 1.6790058170280275, "grad_norm": 21.25, "learning_rate": 1.6049709148598627e-05, "logits/chosen": -87.0, "logits/rejected": -86.5, "logps/chosen": -580.0, "logps/rejected": -532.0, "loss": 0.3039, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.6875, "rewards/margins": 1.984375, "rewards/rejected": 0.703125, "step": 6350 }, { "epoch": 1.6816499206768905, "grad_norm": 18.5, "learning_rate": 1.5917503966155476e-05, "logits/chosen": -86.0, "logits/rejected": -85.0, "logps/chosen": -588.0, "logps/rejected": -524.0, "loss": 0.2758, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.828125, "rewards/margins": 2.15625, "rewards/rejected": 0.6796875, "step": 6360 }, { "epoch": 1.6842940243257536, "grad_norm": 26.625, "learning_rate": 1.5785298783712322e-05, "logits/chosen": -86.5, "logits/rejected": -85.5, "logps/chosen": -592.0, "logps/rejected": -502.0, "loss": 0.3243, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.5625, "rewards/margins": 2.109375, "rewards/rejected": 0.45703125, "step": 6370 }, { "epoch": 1.6869381279746167, "grad_norm": 36.75, "learning_rate": 1.565309360126917e-05, "logits/chosen": -88.0, "logits/rejected": -89.0, "logps/chosen": -600.0, "logps/rejected": -528.0, "loss": 0.3132, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.515625, "rewards/margins": 2.078125, "rewards/rejected": 0.439453125, "step": 6380 }, { "epoch": 1.6895822316234796, "grad_norm": 21.125, "learning_rate": 1.5520888418826018e-05, "logits/chosen": -87.5, "logits/rejected": -87.0, "logps/chosen": -580.0, "logps/rejected": -516.0, "loss": 0.3389, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.53125, "rewards/margins": 2.015625, "rewards/rejected": 0.515625, "step": 6390 }, { "epoch": 1.6922263352723426, "grad_norm": 26.125, "learning_rate": 1.5388683236382868e-05, "logits/chosen": -86.5, "logits/rejected": -85.0, "logps/chosen": -532.0, "logps/rejected": -438.0, "loss": 0.3629, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 2.28125, "rewards/margins": 1.9609375, "rewards/rejected": 0.3203125, "step": 6400 }, { "epoch": 1.6948704389212057, "grad_norm": 25.25, "learning_rate": 1.5256478053939716e-05, "logits/chosen": -86.0, "logits/rejected": -84.0, "logps/chosen": -520.0, "logps/rejected": -464.0, "loss": 0.3544, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.3125, "rewards/margins": 1.8359375, "rewards/rejected": 0.474609375, "step": 6410 }, { "epoch": 1.6975145425700688, "grad_norm": 18.25, "learning_rate": 1.5124272871496563e-05, "logits/chosen": -88.0, "logits/rejected": -85.0, "logps/chosen": -608.0, "logps/rejected": -512.0, "loss": 0.3189, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.765625, "rewards/margins": 2.40625, "rewards/rejected": 0.359375, "step": 6420 }, { "epoch": 1.7001586462189318, "grad_norm": 16.75, "learning_rate": 1.4992067689053413e-05, "logits/chosen": -88.0, "logits/rejected": -86.5, "logps/chosen": -540.0, "logps/rejected": -480.0, "loss": 0.2784, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.640625, "rewards/margins": 2.0, "rewards/rejected": 0.640625, "step": 6430 }, { "epoch": 1.7028027498677947, "grad_norm": 26.25, "learning_rate": 1.485986250661026e-05, "logits/chosen": -87.5, "logits/rejected": -85.5, "logps/chosen": -596.0, "logps/rejected": -520.0, "loss": 0.284, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.59375, "rewards/margins": 2.1875, "rewards/rejected": 0.41015625, "step": 6440 }, { "epoch": 1.7054468535166578, "grad_norm": 16.125, "learning_rate": 1.4727657324167107e-05, "logits/chosen": -88.0, "logits/rejected": -88.0, "logps/chosen": -592.0, "logps/rejected": -540.0, "loss": 0.2942, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.625, "rewards/margins": 2.1875, "rewards/rejected": 0.443359375, "step": 6450 }, { "epoch": 1.708090957165521, "grad_norm": 32.5, "learning_rate": 1.4595452141723957e-05, "logits/chosen": -86.5, "logits/rejected": -85.5, "logps/chosen": -560.0, "logps/rejected": -484.0, "loss": 0.2639, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.71875, "rewards/margins": 2.234375, "rewards/rejected": 0.486328125, "step": 6460 }, { "epoch": 1.7107350608143839, "grad_norm": 14.375, "learning_rate": 1.4463246959280805e-05, "logits/chosen": -88.5, "logits/rejected": -85.0, "logps/chosen": -588.0, "logps/rejected": -496.0, "loss": 0.321, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.453125, "rewards/margins": 1.8125, "rewards/rejected": 0.6328125, "step": 6470 }, { "epoch": 1.713379164463247, "grad_norm": 24.125, "learning_rate": 1.4331041776837652e-05, "logits/chosen": -88.0, "logits/rejected": -85.5, "logps/chosen": -536.0, "logps/rejected": -484.0, "loss": 0.3753, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.4375, "rewards/margins": 1.984375, "rewards/rejected": 0.447265625, "step": 6480 }, { "epoch": 1.7160232681121101, "grad_norm": 19.125, "learning_rate": 1.4198836594394502e-05, "logits/chosen": -88.0, "logits/rejected": -85.0, "logps/chosen": -592.0, "logps/rejected": -496.0, "loss": 0.2932, "rewards/accuracies": 0.875, "rewards/chosen": 2.78125, "rewards/margins": 2.203125, "rewards/rejected": 0.57421875, "step": 6490 }, { "epoch": 1.718667371760973, "grad_norm": 29.875, "learning_rate": 1.406663141195135e-05, "logits/chosen": -89.5, "logits/rejected": -87.0, "logps/chosen": -612.0, "logps/rejected": -572.0, "loss": 0.2737, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.734375, "rewards/margins": 2.34375, "rewards/rejected": 0.39453125, "step": 6500 }, { "epoch": 1.721311475409836, "grad_norm": 26.375, "learning_rate": 1.3934426229508196e-05, "logits/chosen": -87.0, "logits/rejected": -85.5, "logps/chosen": -496.0, "logps/rejected": -448.0, "loss": 0.305, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.203125, "rewards/margins": 1.953125, "rewards/rejected": 0.2431640625, "step": 6510 }, { "epoch": 1.7239555790586991, "grad_norm": 23.125, "learning_rate": 1.3802221047065044e-05, "logits/chosen": -85.5, "logits/rejected": -83.5, "logps/chosen": -596.0, "logps/rejected": -474.0, "loss": 0.2926, "rewards/accuracies": 0.875, "rewards/chosen": 2.640625, "rewards/margins": 2.296875, "rewards/rejected": 0.33984375, "step": 6520 }, { "epoch": 1.7265996827075623, "grad_norm": 12.1875, "learning_rate": 1.3670015864621894e-05, "logits/chosen": -88.0, "logits/rejected": -85.0, "logps/chosen": -568.0, "logps/rejected": -472.0, "loss": 0.3122, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.4375, "rewards/margins": 2.046875, "rewards/rejected": 0.3828125, "step": 6530 }, { "epoch": 1.7292437863564252, "grad_norm": 29.125, "learning_rate": 1.3537810682178742e-05, "logits/chosen": -88.0, "logits/rejected": -85.0, "logps/chosen": -592.0, "logps/rejected": -528.0, "loss": 0.3141, "rewards/accuracies": 0.875, "rewards/chosen": 2.546875, "rewards/margins": 2.15625, "rewards/rejected": 0.3984375, "step": 6540 }, { "epoch": 1.731887890005288, "grad_norm": 40.75, "learning_rate": 1.3405605499735591e-05, "logits/chosen": -86.5, "logits/rejected": -85.0, "logps/chosen": -552.0, "logps/rejected": -532.0, "loss": 0.4056, "rewards/accuracies": 0.8125, "rewards/chosen": 2.65625, "rewards/margins": 1.703125, "rewards/rejected": 0.94921875, "step": 6550 }, { "epoch": 1.7345319936541512, "grad_norm": 17.5, "learning_rate": 1.3273400317292439e-05, "logits/chosen": -89.0, "logits/rejected": -86.0, "logps/chosen": -608.0, "logps/rejected": -462.0, "loss": 0.3708, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9375, "rewards/margins": 2.21875, "rewards/rejected": 0.73046875, "step": 6560 }, { "epoch": 1.7371760973030144, "grad_norm": 13.5, "learning_rate": 1.3141195134849289e-05, "logits/chosen": -87.0, "logits/rejected": -85.5, "logps/chosen": -560.0, "logps/rejected": -500.0, "loss": 0.3152, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.765625, "rewards/margins": 2.140625, "rewards/rejected": 0.61328125, "step": 6570 }, { "epoch": 1.7398202009518773, "grad_norm": 18.5, "learning_rate": 1.3008989952406133e-05, "logits/chosen": -90.0, "logits/rejected": -88.0, "logps/chosen": -604.0, "logps/rejected": -536.0, "loss": 0.281, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.78125, "rewards/margins": 2.453125, "rewards/rejected": 0.33203125, "step": 6580 }, { "epoch": 1.7424643046007402, "grad_norm": 22.625, "learning_rate": 1.2876784769962983e-05, "logits/chosen": -87.0, "logits/rejected": -85.5, "logps/chosen": -572.0, "logps/rejected": -470.0, "loss": 0.3227, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.421875, "rewards/margins": 1.9140625, "rewards/rejected": 0.50390625, "step": 6590 }, { "epoch": 1.7451084082496033, "grad_norm": 20.125, "learning_rate": 1.274457958751983e-05, "logits/chosen": -91.0, "logits/rejected": -85.0, "logps/chosen": -568.0, "logps/rejected": -462.0, "loss": 0.3348, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.671875, "rewards/margins": 2.078125, "rewards/rejected": 0.60546875, "step": 6600 }, { "epoch": 1.7477525118984665, "grad_norm": 47.75, "learning_rate": 1.261237440507668e-05, "logits/chosen": -89.5, "logits/rejected": -83.5, "logps/chosen": -636.0, "logps/rejected": -492.0, "loss": 0.3076, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.953125, "rewards/margins": 2.15625, "rewards/rejected": 0.79296875, "step": 6610 }, { "epoch": 1.7503966155473294, "grad_norm": 34.25, "learning_rate": 1.2480169222633528e-05, "logits/chosen": -88.0, "logits/rejected": -84.5, "logps/chosen": -612.0, "logps/rejected": -482.0, "loss": 0.3701, "rewards/accuracies": 0.8125, "rewards/chosen": 2.734375, "rewards/margins": 1.9296875, "rewards/rejected": 0.8046875, "step": 6620 }, { "epoch": 1.7530407191961925, "grad_norm": 35.0, "learning_rate": 1.2347964040190376e-05, "logits/chosen": -87.0, "logits/rejected": -84.5, "logps/chosen": -560.0, "logps/rejected": -428.0, "loss": 0.3407, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.28125, "rewards/margins": 1.8515625, "rewards/rejected": 0.43359375, "step": 6630 }, { "epoch": 1.7556848228450557, "grad_norm": 23.75, "learning_rate": 1.2215758857747224e-05, "logits/chosen": -86.5, "logits/rejected": -82.5, "logps/chosen": -540.0, "logps/rejected": -436.0, "loss": 0.3354, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.28125, "rewards/margins": 1.9375, "rewards/rejected": 0.337890625, "step": 6640 }, { "epoch": 1.7583289264939186, "grad_norm": 28.875, "learning_rate": 1.2083553675304073e-05, "logits/chosen": -85.5, "logits/rejected": -85.0, "logps/chosen": -552.0, "logps/rejected": -506.0, "loss": 0.3979, "rewards/accuracies": 0.84375, "rewards/chosen": 2.515625, "rewards/margins": 1.7421875, "rewards/rejected": 0.76953125, "step": 6650 }, { "epoch": 1.7609730301427815, "grad_norm": 52.25, "learning_rate": 1.1951348492860921e-05, "logits/chosen": -88.5, "logits/rejected": -86.0, "logps/chosen": -592.0, "logps/rejected": -500.0, "loss": 0.3543, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.71875, "rewards/margins": 1.828125, "rewards/rejected": 0.89453125, "step": 6660 }, { "epoch": 1.7636171337916446, "grad_norm": 43.75, "learning_rate": 1.181914331041777e-05, "logits/chosen": -88.5, "logits/rejected": -85.0, "logps/chosen": -576.0, "logps/rejected": -478.0, "loss": 0.2966, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.828125, "rewards/margins": 2.140625, "rewards/rejected": 0.69140625, "step": 6670 }, { "epoch": 1.7662612374405078, "grad_norm": 17.875, "learning_rate": 1.1686938127974617e-05, "logits/chosen": -85.5, "logits/rejected": -84.0, "logps/chosen": -596.0, "logps/rejected": -548.0, "loss": 0.295, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.890625, "rewards/margins": 2.375, "rewards/rejected": 0.51171875, "step": 6680 }, { "epoch": 1.7689053410893707, "grad_norm": 21.25, "learning_rate": 1.1554732945531465e-05, "logits/chosen": -88.5, "logits/rejected": -87.0, "logps/chosen": -544.0, "logps/rejected": -488.0, "loss": 0.3289, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.71875, "rewards/margins": 1.953125, "rewards/rejected": 0.76953125, "step": 6690 }, { "epoch": 1.7715494447382336, "grad_norm": 33.5, "learning_rate": 1.1422527763088315e-05, "logits/chosen": -85.5, "logits/rejected": -85.0, "logps/chosen": -540.0, "logps/rejected": -506.0, "loss": 0.3173, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.640625, "rewards/margins": 2.25, "rewards/rejected": 0.376953125, "step": 6700 }, { "epoch": 1.7741935483870968, "grad_norm": 37.75, "learning_rate": 1.129032258064516e-05, "logits/chosen": -87.0, "logits/rejected": -83.5, "logps/chosen": -572.0, "logps/rejected": -462.0, "loss": 0.3541, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.421875, "rewards/margins": 2.109375, "rewards/rejected": 0.314453125, "step": 6710 }, { "epoch": 1.77683765203596, "grad_norm": 37.0, "learning_rate": 1.115811739820201e-05, "logits/chosen": -88.0, "logits/rejected": -83.5, "logps/chosen": -548.0, "logps/rejected": -450.0, "loss": 0.3657, "rewards/accuracies": 0.8125, "rewards/chosen": 2.296875, "rewards/margins": 1.6328125, "rewards/rejected": 0.6640625, "step": 6720 }, { "epoch": 1.7794817556848228, "grad_norm": 35.5, "learning_rate": 1.1025912215758858e-05, "logits/chosen": -86.5, "logits/rejected": -84.0, "logps/chosen": -568.0, "logps/rejected": -468.0, "loss": 0.3714, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 2.484375, "rewards/margins": 1.7890625, "rewards/rejected": 0.6875, "step": 6730 }, { "epoch": 1.7821258593336857, "grad_norm": 22.625, "learning_rate": 1.0893707033315706e-05, "logits/chosen": -88.5, "logits/rejected": -86.0, "logps/chosen": -644.0, "logps/rejected": -528.0, "loss": 0.2646, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 3.09375, "rewards/margins": 2.5, "rewards/rejected": 0.5859375, "step": 6740 }, { "epoch": 1.7847699629825489, "grad_norm": 23.25, "learning_rate": 1.0761501850872554e-05, "logits/chosen": -88.5, "logits/rejected": -84.5, "logps/chosen": -532.0, "logps/rejected": -460.0, "loss": 0.3121, "rewards/accuracies": 0.84375, "rewards/chosen": 2.5, "rewards/margins": 2.03125, "rewards/rejected": 0.47265625, "step": 6750 }, { "epoch": 1.787414066631412, "grad_norm": 20.75, "learning_rate": 1.0629296668429404e-05, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -600.0, "logps/rejected": -532.0, "loss": 0.3008, "rewards/accuracies": 0.875, "rewards/chosen": 2.8125, "rewards/margins": 2.265625, "rewards/rejected": 0.546875, "step": 6760 }, { "epoch": 1.790058170280275, "grad_norm": 24.0, "learning_rate": 1.0497091485986251e-05, "logits/chosen": -86.5, "logits/rejected": -84.5, "logps/chosen": -552.0, "logps/rejected": -478.0, "loss": 0.3126, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.296875, "rewards/margins": 2.140625, "rewards/rejected": 0.150390625, "step": 6770 }, { "epoch": 1.792702273929138, "grad_norm": 28.125, "learning_rate": 1.03648863035431e-05, "logits/chosen": -89.0, "logits/rejected": -87.5, "logps/chosen": -592.0, "logps/rejected": -504.0, "loss": 0.2748, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.84375, "rewards/margins": 2.234375, "rewards/rejected": 0.6015625, "step": 6780 }, { "epoch": 1.7953463775780012, "grad_norm": 41.5, "learning_rate": 1.0232681121099947e-05, "logits/chosen": -91.5, "logits/rejected": -86.0, "logps/chosen": -628.0, "logps/rejected": -516.0, "loss": 0.3011, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.703125, "rewards/margins": 2.265625, "rewards/rejected": 0.439453125, "step": 6790 }, { "epoch": 1.7979904812268641, "grad_norm": 11.8125, "learning_rate": 1.0100475938656797e-05, "logits/chosen": -88.0, "logits/rejected": -86.0, "logps/chosen": -576.0, "logps/rejected": -498.0, "loss": 0.2619, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.734375, "rewards/margins": 2.125, "rewards/rejected": 0.60546875, "step": 6800 }, { "epoch": 1.800634584875727, "grad_norm": 28.625, "learning_rate": 9.968270756213643e-06, "logits/chosen": -85.5, "logits/rejected": -84.5, "logps/chosen": -478.0, "logps/rejected": -478.0, "loss": 0.3742, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.140625, "rewards/margins": 1.6171875, "rewards/rejected": 0.5234375, "step": 6810 }, { "epoch": 1.8032786885245902, "grad_norm": 38.75, "learning_rate": 9.836065573770493e-06, "logits/chosen": -89.0, "logits/rejected": -87.0, "logps/chosen": -612.0, "logps/rejected": -502.0, "loss": 0.3232, "rewards/accuracies": 0.875, "rewards/chosen": 2.8125, "rewards/margins": 2.09375, "rewards/rejected": 0.7109375, "step": 6820 }, { "epoch": 1.8059227921734533, "grad_norm": 28.25, "learning_rate": 9.70386039132734e-06, "logits/chosen": -87.5, "logits/rejected": -85.0, "logps/chosen": -560.0, "logps/rejected": -488.0, "loss": 0.3331, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.640625, "rewards/margins": 2.109375, "rewards/rejected": 0.52734375, "step": 6830 }, { "epoch": 1.8085668958223162, "grad_norm": 22.875, "learning_rate": 9.57165520888419e-06, "logits/chosen": -89.0, "logits/rejected": -86.0, "logps/chosen": -616.0, "logps/rejected": -516.0, "loss": 0.2746, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.9375, "rewards/margins": 2.171875, "rewards/rejected": 0.77734375, "step": 6840 }, { "epoch": 1.8112109994711791, "grad_norm": 28.875, "learning_rate": 9.439450026441036e-06, "logits/chosen": -89.5, "logits/rejected": -85.0, "logps/chosen": -576.0, "logps/rejected": -472.0, "loss": 0.3078, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.625, "rewards/margins": 1.9375, "rewards/rejected": 0.68359375, "step": 6850 }, { "epoch": 1.8138551031200423, "grad_norm": 13.375, "learning_rate": 9.307244843997886e-06, "logits/chosen": -87.0, "logits/rejected": -85.5, "logps/chosen": -548.0, "logps/rejected": -532.0, "loss": 0.4267, "rewards/accuracies": 0.84375, "rewards/chosen": 2.453125, "rewards/margins": 1.890625, "rewards/rejected": 0.56640625, "step": 6860 }, { "epoch": 1.8164992067689054, "grad_norm": 24.375, "learning_rate": 9.175039661554734e-06, "logits/chosen": -86.0, "logits/rejected": -84.0, "logps/chosen": -556.0, "logps/rejected": -470.0, "loss": 0.2826, "rewards/accuracies": 0.90625, "rewards/chosen": 2.65625, "rewards/margins": 2.296875, "rewards/rejected": 0.357421875, "step": 6870 }, { "epoch": 1.8191433104177683, "grad_norm": 24.375, "learning_rate": 9.042834479111582e-06, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -604.0, "logps/rejected": -488.0, "loss": 0.2955, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.953125, "rewards/margins": 2.21875, "rewards/rejected": 0.73046875, "step": 6880 }, { "epoch": 1.8217874140666312, "grad_norm": 23.5, "learning_rate": 8.91062929666843e-06, "logits/chosen": -87.5, "logits/rejected": -85.0, "logps/chosen": -524.0, "logps/rejected": -432.0, "loss": 0.3273, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.4375, "rewards/margins": 1.9765625, "rewards/rejected": 0.46875, "step": 6890 }, { "epoch": 1.8244315177154946, "grad_norm": 21.0, "learning_rate": 8.778424114225277e-06, "logits/chosen": -89.5, "logits/rejected": -84.5, "logps/chosen": -588.0, "logps/rejected": -484.0, "loss": 0.3162, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 2.546875, "rewards/margins": 2.09375, "rewards/rejected": 0.455078125, "step": 6900 }, { "epoch": 1.8270756213643575, "grad_norm": 43.75, "learning_rate": 8.646218931782127e-06, "logits/chosen": -88.5, "logits/rejected": -85.0, "logps/chosen": -568.0, "logps/rejected": -502.0, "loss": 0.3277, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 2.65625, "rewards/margins": 2.140625, "rewards/rejected": 0.5234375, "step": 6910 }, { "epoch": 1.8297197250132204, "grad_norm": 20.875, "learning_rate": 8.514013749338975e-06, "logits/chosen": -87.5, "logits/rejected": -83.0, "logps/chosen": -556.0, "logps/rejected": -452.0, "loss": 0.339, "rewards/accuracies": 0.84375, "rewards/chosen": 2.640625, "rewards/margins": 1.921875, "rewards/rejected": 0.7109375, "step": 6920 }, { "epoch": 1.8323638286620836, "grad_norm": 22.25, "learning_rate": 8.381808566895823e-06, "logits/chosen": -88.0, "logits/rejected": -84.0, "logps/chosen": -624.0, "logps/rejected": -486.0, "loss": 0.2808, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.78125, "rewards/margins": 2.1875, "rewards/rejected": 0.6015625, "step": 6930 }, { "epoch": 1.8350079323109467, "grad_norm": 21.75, "learning_rate": 8.24960338445267e-06, "logits/chosen": -87.5, "logits/rejected": -84.0, "logps/chosen": -568.0, "logps/rejected": -464.0, "loss": 0.3214, "rewards/accuracies": 0.875, "rewards/chosen": 2.84375, "rewards/margins": 2.015625, "rewards/rejected": 0.83203125, "step": 6940 }, { "epoch": 1.8376520359598096, "grad_norm": 20.75, "learning_rate": 8.117398202009519e-06, "logits/chosen": -87.0, "logits/rejected": -84.5, "logps/chosen": -604.0, "logps/rejected": -490.0, "loss": 0.2908, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.671875, "rewards/margins": 2.1875, "rewards/rejected": 0.484375, "step": 6950 }, { "epoch": 1.8402961396086726, "grad_norm": 20.125, "learning_rate": 7.985193019566366e-06, "logits/chosen": -86.0, "logits/rejected": -83.0, "logps/chosen": -512.0, "logps/rejected": -446.0, "loss": 0.3367, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.46875, "rewards/margins": 1.984375, "rewards/rejected": 0.486328125, "step": 6960 }, { "epoch": 1.8429402432575357, "grad_norm": 30.75, "learning_rate": 7.852987837123216e-06, "logits/chosen": -85.0, "logits/rejected": -84.0, "logps/chosen": -572.0, "logps/rejected": -494.0, "loss": 0.2954, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.75, "rewards/margins": 2.359375, "rewards/rejected": 0.3984375, "step": 6970 }, { "epoch": 1.8455843469063988, "grad_norm": 28.125, "learning_rate": 7.720782654680064e-06, "logits/chosen": -87.0, "logits/rejected": -85.0, "logps/chosen": -556.0, "logps/rejected": -468.0, "loss": 0.37, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.671875, "rewards/margins": 2.0625, "rewards/rejected": 0.59765625, "step": 6980 }, { "epoch": 1.8482284505552617, "grad_norm": 35.25, "learning_rate": 7.588577472236912e-06, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -524.0, "logps/rejected": -502.0, "loss": 0.2889, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.625, "rewards/margins": 2.078125, "rewards/rejected": 0.54296875, "step": 6990 }, { "epoch": 1.8508725542041247, "grad_norm": 47.5, "learning_rate": 7.4563722897937606e-06, "logits/chosen": -88.0, "logits/rejected": -85.0, "logps/chosen": -560.0, "logps/rejected": -516.0, "loss": 0.386, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.75, "rewards/margins": 1.96875, "rewards/rejected": 0.77734375, "step": 7000 }, { "epoch": 1.8535166578529878, "grad_norm": 25.375, "learning_rate": 7.3241671073506084e-06, "logits/chosen": -88.0, "logits/rejected": -86.5, "logps/chosen": -560.0, "logps/rejected": -512.0, "loss": 0.3104, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.765625, "rewards/margins": 2.3125, "rewards/rejected": 0.453125, "step": 7010 }, { "epoch": 1.856160761501851, "grad_norm": 11.375, "learning_rate": 7.191961924907457e-06, "logits/chosen": -86.5, "logits/rejected": -86.0, "logps/chosen": -560.0, "logps/rejected": -476.0, "loss": 0.3396, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.65625, "rewards/margins": 1.96875, "rewards/rejected": 0.69140625, "step": 7020 }, { "epoch": 1.8588048651507139, "grad_norm": 19.625, "learning_rate": 7.059756742464304e-06, "logits/chosen": -85.5, "logits/rejected": -86.0, "logps/chosen": -540.0, "logps/rejected": -490.0, "loss": 0.3003, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.5, "rewards/margins": 2.15625, "rewards/rejected": 0.353515625, "step": 7030 }, { "epoch": 1.861448968799577, "grad_norm": 21.75, "learning_rate": 6.927551560021153e-06, "logits/chosen": -85.5, "logits/rejected": -83.0, "logps/chosen": -552.0, "logps/rejected": -464.0, "loss": 0.3126, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.671875, "rewards/margins": 2.125, "rewards/rejected": 0.546875, "step": 7040 }, { "epoch": 1.8640930724484401, "grad_norm": 32.0, "learning_rate": 6.795346377578002e-06, "logits/chosen": -87.0, "logits/rejected": -84.5, "logps/chosen": -584.0, "logps/rejected": -512.0, "loss": 0.3634, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.625, "rewards/margins": 1.9140625, "rewards/rejected": 0.7109375, "step": 7050 }, { "epoch": 1.866737176097303, "grad_norm": 28.75, "learning_rate": 6.663141195134849e-06, "logits/chosen": -87.5, "logits/rejected": -84.0, "logps/chosen": -544.0, "logps/rejected": -482.0, "loss": 0.2754, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.59375, "rewards/margins": 1.984375, "rewards/rejected": 0.609375, "step": 7060 }, { "epoch": 1.869381279746166, "grad_norm": 39.0, "learning_rate": 6.5309360126916975e-06, "logits/chosen": -88.0, "logits/rejected": -86.0, "logps/chosen": -612.0, "logps/rejected": -510.0, "loss": 0.2821, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.953125, "rewards/margins": 2.28125, "rewards/rejected": 0.6640625, "step": 7070 }, { "epoch": 1.872025383395029, "grad_norm": 46.5, "learning_rate": 6.398730830248546e-06, "logits/chosen": -88.0, "logits/rejected": -87.0, "logps/chosen": -548.0, "logps/rejected": -500.0, "loss": 0.3396, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.703125, "rewards/margins": 2.046875, "rewards/rejected": 0.65625, "step": 7080 }, { "epoch": 1.8746694870438922, "grad_norm": 19.0, "learning_rate": 6.266525647805395e-06, "logits/chosen": -88.0, "logits/rejected": -85.0, "logps/chosen": -612.0, "logps/rejected": -496.0, "loss": 0.3338, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.921875, "rewards/margins": 2.203125, "rewards/rejected": 0.71484375, "step": 7090 }, { "epoch": 1.8773135906927552, "grad_norm": 34.75, "learning_rate": 6.134320465362243e-06, "logits/chosen": -85.0, "logits/rejected": -85.0, "logps/chosen": -532.0, "logps/rejected": -486.0, "loss": 0.3416, "rewards/accuracies": 0.875, "rewards/chosen": 2.234375, "rewards/margins": 1.84375, "rewards/rejected": 0.39453125, "step": 7100 }, { "epoch": 1.879957694341618, "grad_norm": 23.75, "learning_rate": 6.002115282919091e-06, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -544.0, "logps/rejected": -524.0, "loss": 0.2902, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.6875, "rewards/margins": 2.046875, "rewards/rejected": 0.6484375, "step": 7110 }, { "epoch": 1.8826017979904812, "grad_norm": 47.0, "learning_rate": 5.869910100475939e-06, "logits/chosen": -84.0, "logits/rejected": -84.5, "logps/chosen": -512.0, "logps/rejected": -468.0, "loss": 0.3688, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 2.46875, "rewards/margins": 1.96875, "rewards/rejected": 0.50390625, "step": 7120 }, { "epoch": 1.8852459016393444, "grad_norm": 18.375, "learning_rate": 5.737704918032787e-06, "logits/chosen": -88.5, "logits/rejected": -85.0, "logps/chosen": -600.0, "logps/rejected": -536.0, "loss": 0.3479, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.8125, "rewards/margins": 2.171875, "rewards/rejected": 0.640625, "step": 7130 }, { "epoch": 1.8878900052882073, "grad_norm": 26.25, "learning_rate": 5.605499735589635e-06, "logits/chosen": -90.0, "logits/rejected": -87.0, "logps/chosen": -588.0, "logps/rejected": -516.0, "loss": 0.3868, "rewards/accuracies": 0.84375, "rewards/chosen": 2.59375, "rewards/margins": 1.9921875, "rewards/rejected": 0.609375, "step": 7140 }, { "epoch": 1.8905341089370702, "grad_norm": 14.9375, "learning_rate": 5.473294553146484e-06, "logits/chosen": -89.5, "logits/rejected": -86.5, "logps/chosen": -600.0, "logps/rejected": -496.0, "loss": 0.2662, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.03125, "rewards/margins": 2.3125, "rewards/rejected": 0.73046875, "step": 7150 }, { "epoch": 1.8931782125859333, "grad_norm": 32.25, "learning_rate": 5.341089370703332e-06, "logits/chosen": -86.0, "logits/rejected": -85.5, "logps/chosen": -548.0, "logps/rejected": -478.0, "loss": 0.3125, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.5, "rewards/margins": 1.953125, "rewards/rejected": 0.5546875, "step": 7160 }, { "epoch": 1.8958223162347965, "grad_norm": 22.125, "learning_rate": 5.208884188260181e-06, "logits/chosen": -86.0, "logits/rejected": -85.0, "logps/chosen": -524.0, "logps/rejected": -466.0, "loss": 0.2922, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.546875, "rewards/margins": 2.125, "rewards/rejected": 0.423828125, "step": 7170 }, { "epoch": 1.8984664198836594, "grad_norm": 43.25, "learning_rate": 5.0766790058170285e-06, "logits/chosen": -87.0, "logits/rejected": -85.0, "logps/chosen": -540.0, "logps/rejected": -482.0, "loss": 0.327, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.515625, "rewards/margins": 1.9609375, "rewards/rejected": 0.55859375, "step": 7180 }, { "epoch": 1.9011105235325225, "grad_norm": 38.25, "learning_rate": 4.944473823373876e-06, "logits/chosen": -89.5, "logits/rejected": -86.0, "logps/chosen": -644.0, "logps/rejected": -532.0, "loss": 0.3256, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.921875, "rewards/margins": 2.0625, "rewards/rejected": 0.859375, "step": 7190 }, { "epoch": 1.9037546271813857, "grad_norm": 21.25, "learning_rate": 4.812268640930725e-06, "logits/chosen": -87.0, "logits/rejected": -86.0, "logps/chosen": -568.0, "logps/rejected": -500.0, "loss": 0.3129, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.734375, "rewards/margins": 2.078125, "rewards/rejected": 0.6484375, "step": 7200 }, { "epoch": 1.9063987308302486, "grad_norm": 19.375, "learning_rate": 4.680063458487573e-06, "logits/chosen": -88.0, "logits/rejected": -85.0, "logps/chosen": -612.0, "logps/rejected": -502.0, "loss": 0.2841, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.984375, "rewards/margins": 2.484375, "rewards/rejected": 0.494140625, "step": 7210 }, { "epoch": 1.9090428344791115, "grad_norm": 24.25, "learning_rate": 4.547858276044422e-06, "logits/chosen": -88.5, "logits/rejected": -86.0, "logps/chosen": -580.0, "logps/rejected": -520.0, "loss": 0.3639, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.71875, "rewards/margins": 1.8046875, "rewards/rejected": 0.9140625, "step": 7220 }, { "epoch": 1.9116869381279746, "grad_norm": 27.375, "learning_rate": 4.41565309360127e-06, "logits/chosen": -89.5, "logits/rejected": -86.0, "logps/chosen": -584.0, "logps/rejected": -486.0, "loss": 0.3326, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.71875, "rewards/margins": 1.8828125, "rewards/rejected": 0.84375, "step": 7230 }, { "epoch": 1.9143310417768378, "grad_norm": 37.5, "learning_rate": 4.2834479111581175e-06, "logits/chosen": -89.5, "logits/rejected": -86.5, "logps/chosen": -628.0, "logps/rejected": -564.0, "loss": 0.3065, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 3.203125, "rewards/margins": 2.484375, "rewards/rejected": 0.70703125, "step": 7240 }, { "epoch": 1.9169751454257007, "grad_norm": 26.375, "learning_rate": 4.151242728714965e-06, "logits/chosen": -90.5, "logits/rejected": -87.0, "logps/chosen": -580.0, "logps/rejected": -492.0, "loss": 0.3474, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.546875, "rewards/margins": 1.875, "rewards/rejected": 0.67578125, "step": 7250 }, { "epoch": 1.9196192490745636, "grad_norm": 45.25, "learning_rate": 4.019037546271814e-06, "logits/chosen": -88.0, "logits/rejected": -85.0, "logps/chosen": -580.0, "logps/rejected": -506.0, "loss": 0.3514, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.65625, "rewards/margins": 1.9375, "rewards/rejected": 0.72265625, "step": 7260 }, { "epoch": 1.9222633527234267, "grad_norm": 26.625, "learning_rate": 3.886832363828662e-06, "logits/chosen": -88.0, "logits/rejected": -86.0, "logps/chosen": -608.0, "logps/rejected": -504.0, "loss": 0.3192, "rewards/accuracies": 0.84375, "rewards/chosen": 2.859375, "rewards/margins": 2.109375, "rewards/rejected": 0.75390625, "step": 7270 }, { "epoch": 1.9249074563722899, "grad_norm": 14.25, "learning_rate": 3.7546271813855104e-06, "logits/chosen": -87.0, "logits/rejected": -85.5, "logps/chosen": -608.0, "logps/rejected": -490.0, "loss": 0.3399, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.5625, "rewards/margins": 2.265625, "rewards/rejected": 0.3046875, "step": 7280 }, { "epoch": 1.9275515600211528, "grad_norm": 39.25, "learning_rate": 3.6224219989423587e-06, "logits/chosen": -87.5, "logits/rejected": -85.0, "logps/chosen": -620.0, "logps/rejected": -520.0, "loss": 0.3408, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.96875, "rewards/margins": 2.359375, "rewards/rejected": 0.609375, "step": 7290 }, { "epoch": 1.9301956636700157, "grad_norm": 36.75, "learning_rate": 3.4902168164992066e-06, "logits/chosen": -84.0, "logits/rejected": -84.0, "logps/chosen": -544.0, "logps/rejected": -516.0, "loss": 0.3093, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.3125, "rewards/margins": 2.234375, "rewards/rejected": 0.0732421875, "step": 7300 }, { "epoch": 1.9328397673188789, "grad_norm": 13.9375, "learning_rate": 3.3580116340560553e-06, "logits/chosen": -89.5, "logits/rejected": -86.0, "logps/chosen": -608.0, "logps/rejected": -520.0, "loss": 0.295, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.046875, "rewards/margins": 2.40625, "rewards/rejected": 0.64453125, "step": 7310 }, { "epoch": 1.935483870967742, "grad_norm": 17.125, "learning_rate": 3.225806451612903e-06, "logits/chosen": -87.5, "logits/rejected": -85.0, "logps/chosen": -620.0, "logps/rejected": -512.0, "loss": 0.295, "rewards/accuracies": 0.875, "rewards/chosen": 3.078125, "rewards/margins": 2.234375, "rewards/rejected": 0.85546875, "step": 7320 }, { "epoch": 1.938127974616605, "grad_norm": 22.75, "learning_rate": 3.0936012691697515e-06, "logits/chosen": -85.0, "logits/rejected": -84.0, "logps/chosen": -548.0, "logps/rejected": -492.0, "loss": 0.2966, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.828125, "rewards/margins": 2.015625, "rewards/rejected": 0.80859375, "step": 7330 }, { "epoch": 1.940772078265468, "grad_norm": 29.375, "learning_rate": 2.9613960867266e-06, "logits/chosen": -89.0, "logits/rejected": -86.0, "logps/chosen": -584.0, "logps/rejected": -524.0, "loss": 0.3036, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.71875, "rewards/margins": 2.3125, "rewards/rejected": 0.408203125, "step": 7340 }, { "epoch": 1.9434161819143312, "grad_norm": 17.125, "learning_rate": 2.829190904283448e-06, "logits/chosen": -87.0, "logits/rejected": -86.5, "logps/chosen": -560.0, "logps/rejected": -488.0, "loss": 0.32, "rewards/accuracies": 0.84375, "rewards/chosen": 2.546875, "rewards/margins": 2.109375, "rewards/rejected": 0.431640625, "step": 7350 }, { "epoch": 1.946060285563194, "grad_norm": 26.5, "learning_rate": 2.6969857218402964e-06, "logits/chosen": -87.0, "logits/rejected": -85.0, "logps/chosen": -576.0, "logps/rejected": -486.0, "loss": 0.3366, "rewards/accuracies": 0.875, "rewards/chosen": 2.703125, "rewards/margins": 2.109375, "rewards/rejected": 0.58984375, "step": 7360 }, { "epoch": 1.948704389212057, "grad_norm": 13.0, "learning_rate": 2.5647805393971448e-06, "logits/chosen": -87.0, "logits/rejected": -84.0, "logps/chosen": -592.0, "logps/rejected": -494.0, "loss": 0.3032, "rewards/accuracies": 0.90625, "rewards/chosen": 2.875, "rewards/margins": 2.046875, "rewards/rejected": 0.8203125, "step": 7370 }, { "epoch": 1.9513484928609202, "grad_norm": 53.0, "learning_rate": 2.432575356953993e-06, "logits/chosen": -85.0, "logits/rejected": -84.5, "logps/chosen": -556.0, "logps/rejected": -512.0, "loss": 0.3723, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.546875, "rewards/margins": 1.8203125, "rewards/rejected": 0.73046875, "step": 7380 }, { "epoch": 1.9539925965097833, "grad_norm": 21.0, "learning_rate": 2.300370174510841e-06, "logits/chosen": -88.0, "logits/rejected": -85.0, "logps/chosen": -512.0, "logps/rejected": -442.0, "loss": 0.3794, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 2.3125, "rewards/margins": 1.734375, "rewards/rejected": 0.578125, "step": 7390 }, { "epoch": 1.9566367001586462, "grad_norm": 18.875, "learning_rate": 2.1681649920676893e-06, "logits/chosen": -86.5, "logits/rejected": -83.5, "logps/chosen": -616.0, "logps/rejected": -520.0, "loss": 0.3185, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.625, "rewards/margins": 2.046875, "rewards/rejected": 0.578125, "step": 7400 }, { "epoch": 1.9592808038075091, "grad_norm": 24.75, "learning_rate": 2.035959809624537e-06, "logits/chosen": -88.0, "logits/rejected": -86.5, "logps/chosen": -540.0, "logps/rejected": -500.0, "loss": 0.3147, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.5625, "rewards/margins": 2.109375, "rewards/rejected": 0.451171875, "step": 7410 }, { "epoch": 1.9619249074563723, "grad_norm": 22.375, "learning_rate": 1.9037546271813857e-06, "logits/chosen": -87.0, "logits/rejected": -82.0, "logps/chosen": -584.0, "logps/rejected": -466.0, "loss": 0.3264, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.796875, "rewards/margins": 2.140625, "rewards/rejected": 0.65625, "step": 7420 }, { "epoch": 1.9645690111052354, "grad_norm": 16.0, "learning_rate": 1.7715494447382338e-06, "logits/chosen": -87.0, "logits/rejected": -85.5, "logps/chosen": -572.0, "logps/rejected": -490.0, "loss": 0.3628, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.515625, "rewards/margins": 1.875, "rewards/rejected": 0.640625, "step": 7430 }, { "epoch": 1.9672131147540983, "grad_norm": 29.375, "learning_rate": 1.639344262295082e-06, "logits/chosen": -90.0, "logits/rejected": -88.0, "logps/chosen": -652.0, "logps/rejected": -568.0, "loss": 0.294, "rewards/accuracies": 0.84375, "rewards/chosen": 3.0625, "rewards/margins": 2.3125, "rewards/rejected": 0.75390625, "step": 7440 }, { "epoch": 1.9698572184029612, "grad_norm": 51.75, "learning_rate": 1.5071390798519302e-06, "logits/chosen": -90.5, "logits/rejected": -87.0, "logps/chosen": -628.0, "logps/rejected": -532.0, "loss": 0.3475, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.875, "rewards/margins": 2.0, "rewards/rejected": 0.87109375, "step": 7450 }, { "epoch": 1.9725013220518244, "grad_norm": 19.125, "learning_rate": 1.3749338974087785e-06, "logits/chosen": -87.5, "logits/rejected": -85.5, "logps/chosen": -616.0, "logps/rejected": -506.0, "loss": 0.3447, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 2.65625, "rewards/margins": 1.9765625, "rewards/rejected": 0.68359375, "step": 7460 }, { "epoch": 1.9751454257006875, "grad_norm": 32.75, "learning_rate": 1.2427287149656268e-06, "logits/chosen": -87.5, "logits/rejected": -85.5, "logps/chosen": -584.0, "logps/rejected": -488.0, "loss": 0.358, "rewards/accuracies": 0.8125, "rewards/chosen": 2.5625, "rewards/margins": 1.796875, "rewards/rejected": 0.77734375, "step": 7470 }, { "epoch": 1.9777895293495504, "grad_norm": 29.875, "learning_rate": 1.110523532522475e-06, "logits/chosen": -87.0, "logits/rejected": -88.0, "logps/chosen": -564.0, "logps/rejected": -528.0, "loss": 0.3277, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.859375, "rewards/margins": 2.171875, "rewards/rejected": 0.671875, "step": 7480 }, { "epoch": 1.9804336329984136, "grad_norm": 25.25, "learning_rate": 9.78318350079323e-07, "logits/chosen": -88.5, "logits/rejected": -84.0, "logps/chosen": -572.0, "logps/rejected": -460.0, "loss": 0.3043, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.5625, "rewards/margins": 2.0625, "rewards/rejected": 0.49609375, "step": 7490 }, { "epoch": 1.9830777366472767, "grad_norm": 28.25, "learning_rate": 8.461131676361713e-07, "logits/chosen": -89.5, "logits/rejected": -85.5, "logps/chosen": -604.0, "logps/rejected": -474.0, "loss": 0.2993, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.671875, "rewards/margins": 1.9140625, "rewards/rejected": 0.7578125, "step": 7500 }, { "epoch": 1.9857218402961396, "grad_norm": 14.25, "learning_rate": 7.139079851930197e-07, "logits/chosen": -86.5, "logits/rejected": -83.5, "logps/chosen": -584.0, "logps/rejected": -474.0, "loss": 0.2856, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.59375, "rewards/margins": 2.265625, "rewards/rejected": 0.322265625, "step": 7510 }, { "epoch": 1.9883659439450025, "grad_norm": 17.875, "learning_rate": 5.817028027498679e-07, "logits/chosen": -86.5, "logits/rejected": -85.0, "logps/chosen": -556.0, "logps/rejected": -466.0, "loss": 0.2973, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.625, "rewards/margins": 2.125, "rewards/rejected": 0.50390625, "step": 7520 }, { "epoch": 1.9910100475938657, "grad_norm": 25.5, "learning_rate": 4.494976203067161e-07, "logits/chosen": -89.0, "logits/rejected": -86.0, "logps/chosen": -600.0, "logps/rejected": -524.0, "loss": 0.3343, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.640625, "rewards/margins": 2.0, "rewards/rejected": 0.64453125, "step": 7530 }, { "epoch": 1.9936541512427288, "grad_norm": 34.25, "learning_rate": 3.172924378635643e-07, "logits/chosen": -89.0, "logits/rejected": -86.0, "logps/chosen": -632.0, "logps/rejected": -532.0, "loss": 0.2876, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.171875, "rewards/margins": 2.21875, "rewards/rejected": 0.94921875, "step": 7540 }, { "epoch": 1.9962982548915917, "grad_norm": 22.625, "learning_rate": 1.850872554204125e-07, "logits/chosen": -88.0, "logits/rejected": -87.5, "logps/chosen": -544.0, "logps/rejected": -524.0, "loss": 0.3775, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 2.53125, "rewards/margins": 1.7734375, "rewards/rejected": 0.75390625, "step": 7550 }, { "epoch": 1.9989423585404547, "grad_norm": 22.0, "learning_rate": 5.288207297726071e-08, "logits/chosen": -85.0, "logits/rejected": -86.0, "logps/chosen": -584.0, "logps/rejected": -544.0, "loss": 0.2987, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 3.015625, "rewards/margins": 2.3125, "rewards/rejected": 0.69921875, "step": 7560 }, { "epoch": 2.0, "eval_logits/chosen": -87.5, "eval_logits/rejected": -85.5, "eval_logps/chosen": -580.0, "eval_logps/rejected": -492.0, "eval_loss": 0.596281886100769, "eval_rewards/accuracies": 0.6956703066825867, "eval_rewards/chosen": 2.265625, "eval_rewards/margins": 1.2734375, "eval_rewards/rejected": 0.99609375, "eval_runtime": 998.6098, "eval_samples_per_second": 15.147, "eval_steps_per_second": 0.947, "step": 7564 } ], "logging_steps": 10, "max_steps": 7564, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1891, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }