{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999297541394882, "eval_steps": 400, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002676032781401572, "grad_norm": 1.48540452899306, "learning_rate": 8.9126559714795e-09, "logits/chosen": -0.0685516744852066, "logits/rejected": 0.14143499732017517, "logps/chosen": -1.7162926197052002, "logps/rejected": -1.8897325992584229, "loss": 0.6976, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.7162926197052002, "rewards/margins": 0.17344002425670624, "rewards/rejected": -1.8897325992584229, "sft_loss": 1.468671202659607, "step": 5 }, { "epoch": 0.005352065562803144, "grad_norm": 1.413805704580881, "learning_rate": 1.7825311942959e-08, "logits/chosen": -0.006755639798939228, "logits/rejected": 0.1146969422698021, "logps/chosen": -1.8025729656219482, "logps/rejected": -1.8460617065429688, "loss": 0.704, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8025729656219482, "rewards/margins": 0.04348861053586006, "rewards/rejected": -1.8460617065429688, "sft_loss": 1.5083630084991455, "step": 10 }, { "epoch": 0.008028098344204716, "grad_norm": 1.1477130633823904, "learning_rate": 2.67379679144385e-08, "logits/chosen": -0.04371301457285881, "logits/rejected": 0.05566522479057312, "logps/chosen": -1.6346843242645264, "logps/rejected": -1.765125036239624, "loss": 0.7082, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6346843242645264, "rewards/margins": 0.13044048845767975, "rewards/rejected": -1.765125036239624, "sft_loss": 1.50040602684021, "step": 15 }, { "epoch": 0.010704131125606288, "grad_norm": 1.3186253450806018, "learning_rate": 3.5650623885918e-08, "logits/chosen": -0.04668748378753662, "logits/rejected": 0.041610319167375565, "logps/chosen": -1.7238433361053467, "logps/rejected": -1.805229902267456, "loss": 0.7121, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.7238433361053467, "rewards/margins": 0.08138636499643326, "rewards/rejected": -1.805229902267456, "sft_loss": 1.4999595880508423, "step": 20 }, { "epoch": 0.013380163907007862, "grad_norm": 1.201395274341559, "learning_rate": 4.45632798573975e-08, "logits/chosen": -0.04870440810918808, "logits/rejected": 0.03901376202702522, "logps/chosen": -1.8682407140731812, "logps/rejected": -1.7784353494644165, "loss": 0.7417, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -1.8682407140731812, "rewards/margins": -0.08980532735586166, "rewards/rejected": -1.7784353494644165, "sft_loss": 1.5453672409057617, "step": 25 }, { "epoch": 0.016056196688409432, "grad_norm": 1.1663689864355962, "learning_rate": 5.3475935828877e-08, "logits/chosen": -0.08696512877941132, "logits/rejected": 0.008516276255249977, "logps/chosen": -1.9079726934432983, "logps/rejected": -1.8313806056976318, "loss": 0.7073, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.9079726934432983, "rewards/margins": -0.07659195363521576, "rewards/rejected": -1.8313806056976318, "sft_loss": 1.6459842920303345, "step": 30 }, { "epoch": 0.018732229469811006, "grad_norm": 1.2100627166727136, "learning_rate": 6.23885918003565e-08, "logits/chosen": -0.06849979609251022, "logits/rejected": 0.09280852228403091, "logps/chosen": -1.846811294555664, "logps/rejected": -1.9957473278045654, "loss": 0.724, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.846811294555664, "rewards/margins": 0.14893609285354614, "rewards/rejected": -1.9957473278045654, "sft_loss": 1.5619454383850098, "step": 35 }, { "epoch": 0.021408262251212576, "grad_norm": 1.2220620735018575, "learning_rate": 7.1301247771836e-08, "logits/chosen": 0.023500319570302963, "logits/rejected": 0.1998632699251175, "logps/chosen": -1.8833141326904297, "logps/rejected": -1.744641900062561, "loss": 0.7216, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.8833141326904297, "rewards/margins": -0.13867226243019104, "rewards/rejected": -1.744641900062561, "sft_loss": 1.5195614099502563, "step": 40 }, { "epoch": 0.02408429503261415, "grad_norm": 1.1538644802166662, "learning_rate": 8.021390374331551e-08, "logits/chosen": 0.041353605687618256, "logits/rejected": 0.24557694792747498, "logps/chosen": -1.8384199142456055, "logps/rejected": -1.8726599216461182, "loss": 0.7116, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.8384199142456055, "rewards/margins": 0.034239742904901505, "rewards/rejected": -1.8726599216461182, "sft_loss": 1.5369489192962646, "step": 45 }, { "epoch": 0.026760327814015723, "grad_norm": 1.067987242731931, "learning_rate": 8.9126559714795e-08, "logits/chosen": -0.05942578241229057, "logits/rejected": 0.09370598196983337, "logps/chosen": -1.8994518518447876, "logps/rejected": -1.7794386148452759, "loss": 0.7154, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.8994518518447876, "rewards/margins": -0.12001317739486694, "rewards/rejected": -1.7794386148452759, "sft_loss": 1.5832624435424805, "step": 50 }, { "epoch": 0.029436360595417294, "grad_norm": 1.150489594431797, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.11642004549503326, "logits/rejected": 0.10642417520284653, "logps/chosen": -1.8365113735198975, "logps/rejected": -1.8701823949813843, "loss": 0.7066, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8365113735198975, "rewards/margins": 0.0336710661649704, "rewards/rejected": -1.8701823949813843, "sft_loss": 1.5844300985336304, "step": 55 }, { "epoch": 0.032112393376818864, "grad_norm": 1.20034100694076, "learning_rate": 1.06951871657754e-07, "logits/chosen": -0.08687268197536469, "logits/rejected": 0.10528527200222015, "logps/chosen": -1.7937759160995483, "logps/rejected": -1.8984864950180054, "loss": 0.7037, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.7937759160995483, "rewards/margins": 0.1047104150056839, "rewards/rejected": -1.8984864950180054, "sft_loss": 1.545082688331604, "step": 60 }, { "epoch": 0.03478842615822044, "grad_norm": 1.1774202373517098, "learning_rate": 1.158645276292335e-07, "logits/chosen": -0.022284885868430138, "logits/rejected": 0.12686273455619812, "logps/chosen": -1.642361044883728, "logps/rejected": -1.77422297000885, "loss": 0.703, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.642361044883728, "rewards/margins": 0.1318618804216385, "rewards/rejected": -1.77422297000885, "sft_loss": 1.4764430522918701, "step": 65 }, { "epoch": 0.03746445893962201, "grad_norm": 1.514146379790193, "learning_rate": 1.24777183600713e-07, "logits/chosen": -0.07851644605398178, "logits/rejected": 0.07680389285087585, "logps/chosen": -1.7728259563446045, "logps/rejected": -1.8199899196624756, "loss": 0.7192, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -1.7728259563446045, "rewards/margins": 0.04716411232948303, "rewards/rejected": -1.8199899196624756, "sft_loss": 1.6338894367218018, "step": 70 }, { "epoch": 0.04014049172102358, "grad_norm": 1.1841492258645112, "learning_rate": 1.3368983957219251e-07, "logits/chosen": -0.07346881926059723, "logits/rejected": 0.10432298481464386, "logps/chosen": -1.7868471145629883, "logps/rejected": -2.049077033996582, "loss": 0.7099, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.7868471145629883, "rewards/margins": 0.26223024725914, "rewards/rejected": -2.049077033996582, "sft_loss": 1.5695842504501343, "step": 75 }, { "epoch": 0.04281652450242515, "grad_norm": 1.4171546244542632, "learning_rate": 1.42602495543672e-07, "logits/chosen": 0.011562767438590527, "logits/rejected": 0.12024722248315811, "logps/chosen": -1.7296804189682007, "logps/rejected": -1.7619798183441162, "loss": 0.7154, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.7296804189682007, "rewards/margins": 0.03229951113462448, "rewards/rejected": -1.7619798183441162, "sft_loss": 1.5305752754211426, "step": 80 }, { "epoch": 0.04549255728382673, "grad_norm": 1.2388388222456028, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -0.14274922013282776, "logits/rejected": 0.11063234508037567, "logps/chosen": -1.804538369178772, "logps/rejected": -1.9845695495605469, "loss": 0.7123, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.804538369178772, "rewards/margins": 0.180031418800354, "rewards/rejected": -1.9845695495605469, "sft_loss": 1.5006142854690552, "step": 85 }, { "epoch": 0.0481685900652283, "grad_norm": 1.1389220986989186, "learning_rate": 1.6042780748663102e-07, "logits/chosen": 0.07219815999269485, "logits/rejected": 0.036963194608688354, "logps/chosen": -1.7647689580917358, "logps/rejected": -1.7878549098968506, "loss": 0.7149, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.7647689580917358, "rewards/margins": 0.02308591641485691, "rewards/rejected": -1.7878549098968506, "sft_loss": 1.4621435403823853, "step": 90 }, { "epoch": 0.05084462284662987, "grad_norm": 1.4273520301005898, "learning_rate": 1.693404634581105e-07, "logits/chosen": -0.08188000321388245, "logits/rejected": 0.06936556100845337, "logps/chosen": -1.832297921180725, "logps/rejected": -1.9317655563354492, "loss": 0.7116, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.832297921180725, "rewards/margins": 0.09946787357330322, "rewards/rejected": -1.9317655563354492, "sft_loss": 1.5347415208816528, "step": 95 }, { "epoch": 0.05352065562803145, "grad_norm": 1.0902790876389015, "learning_rate": 1.7825311942959e-07, "logits/chosen": -0.025676894932985306, "logits/rejected": 0.04155648872256279, "logps/chosen": -1.7139743566513062, "logps/rejected": -1.8227088451385498, "loss": 0.7064, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.7139743566513062, "rewards/margins": 0.10873470455408096, "rewards/rejected": -1.8227088451385498, "sft_loss": 1.4991495609283447, "step": 100 }, { "epoch": 0.05619668840943302, "grad_norm": 1.3390278945525504, "learning_rate": 1.8716577540106952e-07, "logits/chosen": 0.053162623196840286, "logits/rejected": 0.08006517589092255, "logps/chosen": -1.6687822341918945, "logps/rejected": -1.83384108543396, "loss": 0.6973, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.6687822341918945, "rewards/margins": 0.16505882143974304, "rewards/rejected": -1.83384108543396, "sft_loss": 1.4465464353561401, "step": 105 }, { "epoch": 0.05887272119083459, "grad_norm": 1.1380107250036546, "learning_rate": 1.96078431372549e-07, "logits/chosen": 0.02342567965388298, "logits/rejected": 0.12284733355045319, "logps/chosen": -1.7178386449813843, "logps/rejected": -1.774283766746521, "loss": 0.7154, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.7178386449813843, "rewards/margins": 0.05644518882036209, "rewards/rejected": -1.774283766746521, "sft_loss": 1.4845190048217773, "step": 110 }, { "epoch": 0.06154875397223616, "grad_norm": 1.7449537985357941, "learning_rate": 2.049910873440285e-07, "logits/chosen": 0.05169694870710373, "logits/rejected": 0.26776689291000366, "logps/chosen": -1.7055747509002686, "logps/rejected": -1.9986652135849, "loss": 0.6995, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.7055747509002686, "rewards/margins": 0.2930903434753418, "rewards/rejected": -1.9986652135849, "sft_loss": 1.588734745979309, "step": 115 }, { "epoch": 0.06422478675363773, "grad_norm": 1.1567141674033627, "learning_rate": 2.13903743315508e-07, "logits/chosen": -0.06300070881843567, "logits/rejected": 0.11846703290939331, "logps/chosen": -1.807218313217163, "logps/rejected": -1.9346367120742798, "loss": 0.7002, "rewards/accuracies": 0.5, "rewards/chosen": -1.807218313217163, "rewards/margins": 0.12741819024085999, "rewards/rejected": -1.9346367120742798, "sft_loss": 1.5945073366165161, "step": 120 }, { "epoch": 0.0669008195350393, "grad_norm": 1.2458548746530076, "learning_rate": 2.2281639928698751e-07, "logits/chosen": -0.07810702174901962, "logits/rejected": 0.05697429180145264, "logps/chosen": -1.7272956371307373, "logps/rejected": -1.6646478176116943, "loss": 0.7234, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.7272956371307373, "rewards/margins": -0.06264790147542953, "rewards/rejected": -1.6646478176116943, "sft_loss": 1.5413930416107178, "step": 125 }, { "epoch": 0.06957685231644088, "grad_norm": 1.8740559078645729, "learning_rate": 2.31729055258467e-07, "logits/chosen": 0.05399082973599434, "logits/rejected": 0.19395580887794495, "logps/chosen": -1.7780125141143799, "logps/rejected": -1.8939733505249023, "loss": 0.7074, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.7780125141143799, "rewards/margins": 0.11596081405878067, "rewards/rejected": -1.8939733505249023, "sft_loss": 1.6091417074203491, "step": 130 }, { "epoch": 0.07225288509784245, "grad_norm": 1.110779459632305, "learning_rate": 2.406417112299465e-07, "logits/chosen": -0.030741384252905846, "logits/rejected": 0.09419815987348557, "logps/chosen": -1.8397916555404663, "logps/rejected": -1.8377549648284912, "loss": 0.7164, "rewards/accuracies": 0.5, "rewards/chosen": -1.8397916555404663, "rewards/margins": -0.002036741469055414, "rewards/rejected": -1.8377549648284912, "sft_loss": 1.5586879253387451, "step": 135 }, { "epoch": 0.07492891787924402, "grad_norm": 1.541796808985463, "learning_rate": 2.49554367201426e-07, "logits/chosen": -0.016195252537727356, "logits/rejected": 0.15794073045253754, "logps/chosen": -1.7996280193328857, "logps/rejected": -1.9841985702514648, "loss": 0.7067, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.7996280193328857, "rewards/margins": 0.1845705807209015, "rewards/rejected": -1.9841985702514648, "sft_loss": 1.6003118753433228, "step": 140 }, { "epoch": 0.0776049506606456, "grad_norm": 1.258768101937461, "learning_rate": 2.5846702317290554e-07, "logits/chosen": 0.0032178417313843966, "logits/rejected": 0.1682175099849701, "logps/chosen": -1.7254054546356201, "logps/rejected": -1.8469938039779663, "loss": 0.7035, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.7254054546356201, "rewards/margins": 0.12158823013305664, "rewards/rejected": -1.8469938039779663, "sft_loss": 1.539015293121338, "step": 145 }, { "epoch": 0.08028098344204716, "grad_norm": 1.4744864735547007, "learning_rate": 2.6737967914438503e-07, "logits/chosen": -0.019702356308698654, "logits/rejected": 0.1558084785938263, "logps/chosen": -1.681099534034729, "logps/rejected": -1.6747146844863892, "loss": 0.7081, "rewards/accuracies": 0.53125, "rewards/chosen": -1.681099534034729, "rewards/margins": -0.006384936161339283, "rewards/rejected": -1.6747146844863892, "sft_loss": 1.3968122005462646, "step": 150 }, { "epoch": 0.08295701622344874, "grad_norm": 1.5789944598469368, "learning_rate": 2.762923351158645e-07, "logits/chosen": -0.044391922652721405, "logits/rejected": 0.009145406074821949, "logps/chosen": -1.7172826528549194, "logps/rejected": -1.795111060142517, "loss": 0.7087, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.7172826528549194, "rewards/margins": 0.07782838493585587, "rewards/rejected": -1.795111060142517, "sft_loss": 1.5068585872650146, "step": 155 }, { "epoch": 0.0856330490048503, "grad_norm": 1.3241057971254109, "learning_rate": 2.85204991087344e-07, "logits/chosen": -0.11585366725921631, "logits/rejected": 0.03425910696387291, "logps/chosen": -1.8618148565292358, "logps/rejected": -1.8349769115447998, "loss": 0.726, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.8618148565292358, "rewards/margins": -0.02683776617050171, "rewards/rejected": -1.8349769115447998, "sft_loss": 1.5718270540237427, "step": 160 }, { "epoch": 0.08830908178625188, "grad_norm": 1.2997658998322292, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.053279124200344086, "logits/rejected": 0.1222684383392334, "logps/chosen": -1.6682636737823486, "logps/rejected": -1.8548986911773682, "loss": 0.7049, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.6682636737823486, "rewards/margins": 0.1866351068019867, "rewards/rejected": -1.8548986911773682, "sft_loss": 1.4499410390853882, "step": 165 }, { "epoch": 0.09098511456765346, "grad_norm": 1.3208734338750652, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -0.07479271292686462, "logits/rejected": -0.01702618971467018, "logps/chosen": -1.8411445617675781, "logps/rejected": -1.87027108669281, "loss": 0.714, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.8411445617675781, "rewards/margins": 0.029126638546586037, "rewards/rejected": -1.87027108669281, "sft_loss": 1.558387279510498, "step": 170 }, { "epoch": 0.09366114734905502, "grad_norm": 1.3344718053025082, "learning_rate": 3.1194295900178254e-07, "logits/chosen": 0.08084265887737274, "logits/rejected": 0.08330238610506058, "logps/chosen": -1.707576036453247, "logps/rejected": -1.7905025482177734, "loss": 0.7239, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.707576036453247, "rewards/margins": 0.08292657881975174, "rewards/rejected": -1.7905025482177734, "sft_loss": 1.53104567527771, "step": 175 }, { "epoch": 0.0963371801304566, "grad_norm": 1.2266352439696486, "learning_rate": 3.2085561497326203e-07, "logits/chosen": 0.02214609459042549, "logits/rejected": 0.024739524349570274, "logps/chosen": -1.741689682006836, "logps/rejected": -1.8428306579589844, "loss": 0.7046, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.741689682006836, "rewards/margins": 0.1011408120393753, "rewards/rejected": -1.8428306579589844, "sft_loss": 1.5208356380462646, "step": 180 }, { "epoch": 0.09901321291185818, "grad_norm": 1.627310649600377, "learning_rate": 3.297682709447415e-07, "logits/chosen": -0.11873702704906464, "logits/rejected": -0.025531206279993057, "logps/chosen": -1.6886358261108398, "logps/rejected": -1.7515376806259155, "loss": 0.7184, "rewards/accuracies": 0.5, "rewards/chosen": -1.6886358261108398, "rewards/margins": 0.06290177255868912, "rewards/rejected": -1.7515376806259155, "sft_loss": 1.4868758916854858, "step": 185 }, { "epoch": 0.10168924569325974, "grad_norm": 2.2936334873784188, "learning_rate": 3.38680926916221e-07, "logits/chosen": -0.04581695422530174, "logits/rejected": 0.08045488595962524, "logps/chosen": -1.8337981700897217, "logps/rejected": -1.8637981414794922, "loss": 0.7164, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.8337981700897217, "rewards/margins": 0.030000019818544388, "rewards/rejected": -1.8637981414794922, "sft_loss": 1.5478748083114624, "step": 190 }, { "epoch": 0.10436527847466132, "grad_norm": 1.4095395623852622, "learning_rate": 3.475935828877005e-07, "logits/chosen": 0.040932267904281616, "logits/rejected": 0.2044890820980072, "logps/chosen": -1.554783821105957, "logps/rejected": -1.685030221939087, "loss": 0.7067, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.554783821105957, "rewards/margins": 0.13024640083312988, "rewards/rejected": -1.685030221939087, "sft_loss": 1.3969472646713257, "step": 195 }, { "epoch": 0.1070413112560629, "grad_norm": 1.366095338570351, "learning_rate": 3.5650623885918e-07, "logits/chosen": -0.07507927715778351, "logits/rejected": 0.06702321767807007, "logps/chosen": -1.7471084594726562, "logps/rejected": -1.6983184814453125, "loss": 0.7148, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.7471084594726562, "rewards/margins": -0.048790059983730316, "rewards/rejected": -1.6983184814453125, "sft_loss": 1.539676547050476, "step": 200 }, { "epoch": 0.10971734403746446, "grad_norm": 1.450732603887787, "learning_rate": 3.654188948306595e-07, "logits/chosen": -0.038362838327884674, "logits/rejected": 0.11528350412845612, "logps/chosen": -1.7480757236480713, "logps/rejected": -1.6963634490966797, "loss": 0.7129, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.7480757236480713, "rewards/margins": -0.05171237513422966, "rewards/rejected": -1.6963634490966797, "sft_loss": 1.4396885633468628, "step": 205 }, { "epoch": 0.11239337681886603, "grad_norm": 1.3763776982907177, "learning_rate": 3.7433155080213904e-07, "logits/chosen": -0.11977557092905045, "logits/rejected": 0.08482502400875092, "logps/chosen": -1.7112598419189453, "logps/rejected": -1.9077314138412476, "loss": 0.6965, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.7112598419189453, "rewards/margins": 0.19647178053855896, "rewards/rejected": -1.9077314138412476, "sft_loss": 1.4883301258087158, "step": 210 }, { "epoch": 0.1150694096002676, "grad_norm": 1.2054543849961346, "learning_rate": 3.8324420677361853e-07, "logits/chosen": -0.16571705043315887, "logits/rejected": 0.08995556831359863, "logps/chosen": -1.6306493282318115, "logps/rejected": -1.7074792385101318, "loss": 0.7, "rewards/accuracies": 0.53125, "rewards/chosen": -1.6306493282318115, "rewards/margins": 0.0768299400806427, "rewards/rejected": -1.7074792385101318, "sft_loss": 1.4890508651733398, "step": 215 }, { "epoch": 0.11774544238166917, "grad_norm": 1.3114056890831345, "learning_rate": 3.92156862745098e-07, "logits/chosen": 0.06645621359348297, "logits/rejected": 0.17207393050193787, "logps/chosen": -1.631810188293457, "logps/rejected": -1.8495622873306274, "loss": 0.6927, "rewards/accuracies": 0.5625, "rewards/chosen": -1.631810188293457, "rewards/margins": 0.21775206923484802, "rewards/rejected": -1.8495622873306274, "sft_loss": 1.4851901531219482, "step": 220 }, { "epoch": 0.12042147516307075, "grad_norm": 1.51541200160965, "learning_rate": 4.010695187165775e-07, "logits/chosen": -0.11003299802541733, "logits/rejected": 0.06239504739642143, "logps/chosen": -1.5596537590026855, "logps/rejected": -1.6928398609161377, "loss": 0.697, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5596537590026855, "rewards/margins": 0.13318602740764618, "rewards/rejected": -1.6928398609161377, "sft_loss": 1.406328558921814, "step": 225 }, { "epoch": 0.12309750794447231, "grad_norm": 1.503571872068748, "learning_rate": 4.09982174688057e-07, "logits/chosen": -0.015409344807267189, "logits/rejected": 0.06427817046642303, "logps/chosen": -1.6280763149261475, "logps/rejected": -1.769737958908081, "loss": 0.7017, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.6280763149261475, "rewards/margins": 0.14166171848773956, "rewards/rejected": -1.769737958908081, "sft_loss": 1.3876667022705078, "step": 230 }, { "epoch": 0.1257735407258739, "grad_norm": 1.6288220279547727, "learning_rate": 4.188948306595365e-07, "logits/chosen": 0.011006379500031471, "logits/rejected": 0.15760047733783722, "logps/chosen": -1.5208041667938232, "logps/rejected": -1.704429268836975, "loss": 0.6852, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5208041667938232, "rewards/margins": 0.183625265955925, "rewards/rejected": -1.704429268836975, "sft_loss": 1.3771321773529053, "step": 235 }, { "epoch": 0.12844957350727546, "grad_norm": 1.6522624770561913, "learning_rate": 4.27807486631016e-07, "logits/chosen": -0.05322523042559624, "logits/rejected": 0.0770733654499054, "logps/chosen": -1.512322187423706, "logps/rejected": -1.6946868896484375, "loss": 0.7075, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.512322187423706, "rewards/margins": 0.1823645383119583, "rewards/rejected": -1.6946868896484375, "sft_loss": 1.4470535516738892, "step": 240 }, { "epoch": 0.13112560628867703, "grad_norm": 1.7657529349560406, "learning_rate": 4.3672014260249554e-07, "logits/chosen": 0.01734323427081108, "logits/rejected": 0.14011171460151672, "logps/chosen": -1.56759512424469, "logps/rejected": -1.6468995809555054, "loss": 0.7005, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.56759512424469, "rewards/margins": 0.07930465787649155, "rewards/rejected": -1.6468995809555054, "sft_loss": 1.5069551467895508, "step": 245 }, { "epoch": 0.1338016390700786, "grad_norm": 1.859004151149198, "learning_rate": 4.4563279857397503e-07, "logits/chosen": -0.08213020861148834, "logits/rejected": 0.08221803605556488, "logps/chosen": -1.5308703184127808, "logps/rejected": -1.583105444908142, "loss": 0.7119, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.5308703184127808, "rewards/margins": 0.052234966307878494, "rewards/rejected": -1.583105444908142, "sft_loss": 1.3916592597961426, "step": 250 }, { "epoch": 0.1364776718514802, "grad_norm": 1.9364919295573748, "learning_rate": 4.545454545454545e-07, "logits/chosen": -0.06537928432226181, "logits/rejected": 0.07307229936122894, "logps/chosen": -1.3984811305999756, "logps/rejected": -1.5488195419311523, "loss": 0.6939, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3984811305999756, "rewards/margins": 0.15033839643001556, "rewards/rejected": -1.5488195419311523, "sft_loss": 1.3026387691497803, "step": 255 }, { "epoch": 0.13915370463288176, "grad_norm": 2.1865734300951845, "learning_rate": 4.63458110516934e-07, "logits/chosen": -0.2760574221611023, "logits/rejected": -0.17358729243278503, "logps/chosen": -1.527931571006775, "logps/rejected": -1.6576035022735596, "loss": 0.6914, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.527931571006775, "rewards/margins": 0.12967175245285034, "rewards/rejected": -1.6576035022735596, "sft_loss": 1.4426121711730957, "step": 260 }, { "epoch": 0.1418297374142833, "grad_norm": 2.5225617128658664, "learning_rate": 4.723707664884135e-07, "logits/chosen": -0.11230075359344482, "logits/rejected": -0.028052741661667824, "logps/chosen": -1.5118391513824463, "logps/rejected": -1.6556705236434937, "loss": 0.7078, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.5118391513824463, "rewards/margins": 0.14383149147033691, "rewards/rejected": -1.6556705236434937, "sft_loss": 1.4891895055770874, "step": 265 }, { "epoch": 0.1445057701956849, "grad_norm": 2.923055759654654, "learning_rate": 4.81283422459893e-07, "logits/chosen": -0.13042227923870087, "logits/rejected": -0.0014180898433551192, "logps/chosen": -1.393897294998169, "logps/rejected": -1.5104312896728516, "loss": 0.7049, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.393897294998169, "rewards/margins": 0.11653389781713486, "rewards/rejected": -1.5104312896728516, "sft_loss": 1.3747899532318115, "step": 270 }, { "epoch": 0.14718180297708647, "grad_norm": 3.9939481336962137, "learning_rate": 4.901960784313725e-07, "logits/chosen": -0.08223985135555267, "logits/rejected": 0.01174080092459917, "logps/chosen": -1.3294731378555298, "logps/rejected": -1.5173444747924805, "loss": 0.7069, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3294731378555298, "rewards/margins": 0.1878713071346283, "rewards/rejected": -1.5173444747924805, "sft_loss": 1.2925227880477905, "step": 275 }, { "epoch": 0.14985783575848804, "grad_norm": 6.106718295896271, "learning_rate": 4.99108734402852e-07, "logits/chosen": -0.15224897861480713, "logits/rejected": -0.004387478344142437, "logps/chosen": -1.3839061260223389, "logps/rejected": -1.49300217628479, "loss": 0.707, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3839061260223389, "rewards/margins": 0.1090959757566452, "rewards/rejected": -1.49300217628479, "sft_loss": 1.3571009635925293, "step": 280 }, { "epoch": 0.15253386853988962, "grad_norm": 4.959246349891953, "learning_rate": 5.080213903743315e-07, "logits/chosen": -0.11805672943592072, "logits/rejected": 0.016202565282583237, "logps/chosen": -1.3780156373977661, "logps/rejected": -1.4832361936569214, "loss": 0.6954, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3780156373977661, "rewards/margins": 0.10522061586380005, "rewards/rejected": -1.4832361936569214, "sft_loss": 1.417772889137268, "step": 285 }, { "epoch": 0.1552099013212912, "grad_norm": 2.7747854707798947, "learning_rate": 5.169340463458111e-07, "logits/chosen": -0.1688564121723175, "logits/rejected": 0.11571399122476578, "logps/chosen": -1.400048017501831, "logps/rejected": -1.5392086505889893, "loss": 0.6834, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.400048017501831, "rewards/margins": 0.13916051387786865, "rewards/rejected": -1.5392086505889893, "sft_loss": 1.3857918977737427, "step": 290 }, { "epoch": 0.15788593410269275, "grad_norm": 8.877597356625866, "learning_rate": 5.258467023172905e-07, "logits/chosen": -0.12207935005426407, "logits/rejected": -0.06441991776227951, "logps/chosen": -1.2970856428146362, "logps/rejected": -1.450477957725525, "loss": 0.6917, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2970856428146362, "rewards/margins": 0.15339213609695435, "rewards/rejected": -1.450477957725525, "sft_loss": 1.2975612878799438, "step": 295 }, { "epoch": 0.16056196688409433, "grad_norm": 3.318035682630868, "learning_rate": 5.347593582887701e-07, "logits/chosen": -0.12218412011861801, "logits/rejected": 0.03679182007908821, "logps/chosen": -1.3453871011734009, "logps/rejected": -1.4296029806137085, "loss": 0.7139, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3453871011734009, "rewards/margins": 0.08421595394611359, "rewards/rejected": -1.4296029806137085, "sft_loss": 1.3945664167404175, "step": 300 }, { "epoch": 0.1632379996654959, "grad_norm": 4.339443086417133, "learning_rate": 5.436720142602496e-07, "logits/chosen": -0.086027592420578, "logits/rejected": -0.012940932996571064, "logps/chosen": -1.4613901376724243, "logps/rejected": -1.4588768482208252, "loss": 0.7165, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.4613901376724243, "rewards/margins": -0.0025134205352514982, "rewards/rejected": -1.4588768482208252, "sft_loss": 1.4459879398345947, "step": 305 }, { "epoch": 0.16591403244689748, "grad_norm": 3.6558070591746565, "learning_rate": 5.52584670231729e-07, "logits/chosen": -0.2582774758338928, "logits/rejected": -0.16772443056106567, "logps/chosen": -1.4190332889556885, "logps/rejected": -1.5252110958099365, "loss": 0.711, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.4190332889556885, "rewards/margins": 0.10617784410715103, "rewards/rejected": -1.5252110958099365, "sft_loss": 1.4055038690567017, "step": 310 }, { "epoch": 0.16859006522829906, "grad_norm": 3.9092243838995517, "learning_rate": 5.614973262032086e-07, "logits/chosen": -0.07796318829059601, "logits/rejected": 0.07571976631879807, "logps/chosen": -1.4165136814117432, "logps/rejected": -1.5895618200302124, "loss": 0.6889, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4165136814117432, "rewards/margins": 0.17304803431034088, "rewards/rejected": -1.5895618200302124, "sft_loss": 1.4236104488372803, "step": 315 }, { "epoch": 0.1712660980097006, "grad_norm": 4.434603741805856, "learning_rate": 5.70409982174688e-07, "logits/chosen": -0.12248054891824722, "logits/rejected": 0.0066294134594500065, "logps/chosen": -1.3746910095214844, "logps/rejected": -1.4437358379364014, "loss": 0.7053, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3746910095214844, "rewards/margins": 0.06904484331607819, "rewards/rejected": -1.4437358379364014, "sft_loss": 1.3798660039901733, "step": 320 }, { "epoch": 0.17394213079110218, "grad_norm": 2.1170803811392447, "learning_rate": 5.793226381461676e-07, "logits/chosen": -0.1800215244293213, "logits/rejected": -0.06589541584253311, "logps/chosen": -1.4075556993484497, "logps/rejected": -1.6932398080825806, "loss": 0.6934, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4075556993484497, "rewards/margins": 0.285684198141098, "rewards/rejected": -1.6932398080825806, "sft_loss": 1.4548895359039307, "step": 325 }, { "epoch": 0.17661816357250376, "grad_norm": 4.476326136428031, "learning_rate": 5.88235294117647e-07, "logits/chosen": -0.09052817523479462, "logits/rejected": 0.0518343523144722, "logps/chosen": -1.4117047786712646, "logps/rejected": -1.6468381881713867, "loss": 0.6942, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4117047786712646, "rewards/margins": 0.23513329029083252, "rewards/rejected": -1.6468381881713867, "sft_loss": 1.4054486751556396, "step": 330 }, { "epoch": 0.17929419635390534, "grad_norm": 3.030168461363324, "learning_rate": 5.971479500891266e-07, "logits/chosen": -0.04353749752044678, "logits/rejected": 0.060181625187397, "logps/chosen": -1.4509786367416382, "logps/rejected": -1.5014684200286865, "loss": 0.7139, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.4509786367416382, "rewards/margins": 0.050489895045757294, "rewards/rejected": -1.5014684200286865, "sft_loss": 1.3967310190200806, "step": 335 }, { "epoch": 0.18197022913530692, "grad_norm": 2.6262695039771744, "learning_rate": 6.060606060606061e-07, "logits/chosen": -0.09377865493297577, "logits/rejected": 0.049373142421245575, "logps/chosen": -1.5408929586410522, "logps/rejected": -1.6390235424041748, "loss": 0.7017, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.5408929586410522, "rewards/margins": 0.09813062846660614, "rewards/rejected": -1.6390235424041748, "sft_loss": 1.4603184461593628, "step": 340 }, { "epoch": 0.1846462619167085, "grad_norm": 4.295652610582466, "learning_rate": 6.149732620320855e-07, "logits/chosen": -0.020409051328897476, "logits/rejected": 0.009363172575831413, "logps/chosen": -1.4553922414779663, "logps/rejected": -1.6140689849853516, "loss": 0.6987, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.4553922414779663, "rewards/margins": 0.15867677330970764, "rewards/rejected": -1.6140689849853516, "sft_loss": 1.4302983283996582, "step": 345 }, { "epoch": 0.18732229469811004, "grad_norm": 3.8217455475304716, "learning_rate": 6.238859180035651e-07, "logits/chosen": -0.06983217597007751, "logits/rejected": 0.019797608256340027, "logps/chosen": -1.4062483310699463, "logps/rejected": -1.5235086679458618, "loss": 0.6957, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4062483310699463, "rewards/margins": 0.11726043373346329, "rewards/rejected": -1.5235086679458618, "sft_loss": 1.4087426662445068, "step": 350 }, { "epoch": 0.18999832747951162, "grad_norm": 6.302535253237734, "learning_rate": 6.327985739750445e-07, "logits/chosen": -0.15332934260368347, "logits/rejected": 0.06179341673851013, "logps/chosen": -1.4927293062210083, "logps/rejected": -1.556748867034912, "loss": 0.7039, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.4927293062210083, "rewards/margins": 0.06401960551738739, "rewards/rejected": -1.556748867034912, "sft_loss": 1.4707249402999878, "step": 355 }, { "epoch": 0.1926743602609132, "grad_norm": 4.822321471397462, "learning_rate": 6.417112299465241e-07, "logits/chosen": -0.1483144611120224, "logits/rejected": -0.07352069765329361, "logps/chosen": -1.4491080045700073, "logps/rejected": -1.6051616668701172, "loss": 0.7084, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4491080045700073, "rewards/margins": 0.15605367720127106, "rewards/rejected": -1.6051616668701172, "sft_loss": 1.3767362833023071, "step": 360 }, { "epoch": 0.19535039304231477, "grad_norm": 9.552981170129502, "learning_rate": 6.506238859180035e-07, "logits/chosen": -0.0526859275996685, "logits/rejected": 0.03089299239218235, "logps/chosen": -1.4122530221939087, "logps/rejected": -1.5068573951721191, "loss": 0.7048, "rewards/accuracies": 0.625, "rewards/chosen": -1.4122530221939087, "rewards/margins": 0.09460441023111343, "rewards/rejected": -1.5068573951721191, "sft_loss": 1.345879316329956, "step": 365 }, { "epoch": 0.19802642582371635, "grad_norm": 2.0321803233059885, "learning_rate": 6.59536541889483e-07, "logits/chosen": -0.08251919597387314, "logits/rejected": 0.010809054598212242, "logps/chosen": -1.3975403308868408, "logps/rejected": -1.4715176820755005, "loss": 0.7186, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3975403308868408, "rewards/margins": 0.07397731393575668, "rewards/rejected": -1.4715176820755005, "sft_loss": 1.3381938934326172, "step": 370 }, { "epoch": 0.2007024586051179, "grad_norm": 6.659596031603889, "learning_rate": 6.684491978609626e-07, "logits/chosen": -0.1440359652042389, "logits/rejected": 0.005278570111840963, "logps/chosen": -1.387573003768921, "logps/rejected": -1.5763928890228271, "loss": 0.7005, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.387573003768921, "rewards/margins": 0.18881988525390625, "rewards/rejected": -1.5763928890228271, "sft_loss": 1.3912830352783203, "step": 375 }, { "epoch": 0.20337849138651948, "grad_norm": 2.151759579797557, "learning_rate": 6.77361853832442e-07, "logits/chosen": -0.11488963663578033, "logits/rejected": -0.0325799360871315, "logps/chosen": -1.4236202239990234, "logps/rejected": -1.6178334951400757, "loss": 0.6845, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4236202239990234, "rewards/margins": 0.1942131221294403, "rewards/rejected": -1.6178334951400757, "sft_loss": 1.3828890323638916, "step": 380 }, { "epoch": 0.20605452416792105, "grad_norm": 1.3655878297554467, "learning_rate": 6.862745098039216e-07, "logits/chosen": -0.08240549266338348, "logits/rejected": -0.0049137575551867485, "logps/chosen": -1.4938578605651855, "logps/rejected": -1.5395147800445557, "loss": 0.6938, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4938578605651855, "rewards/margins": 0.04565705358982086, "rewards/rejected": -1.5395147800445557, "sft_loss": 1.471738576889038, "step": 385 }, { "epoch": 0.20873055694932263, "grad_norm": 4.660223267244464, "learning_rate": 6.95187165775401e-07, "logits/chosen": -0.014319619163870811, "logits/rejected": 0.1460811048746109, "logps/chosen": -1.505752444267273, "logps/rejected": -1.6274387836456299, "loss": 0.704, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.505752444267273, "rewards/margins": 0.12168624252080917, "rewards/rejected": -1.6274387836456299, "sft_loss": 1.4696409702301025, "step": 390 }, { "epoch": 0.2114065897307242, "grad_norm": 1.4807670413046374, "learning_rate": 7.040998217468806e-07, "logits/chosen": -0.12310131639242172, "logits/rejected": 0.03627241775393486, "logps/chosen": -1.4809261560440063, "logps/rejected": -1.5585112571716309, "loss": 0.7054, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4809261560440063, "rewards/margins": 0.07758528739213943, "rewards/rejected": -1.5585112571716309, "sft_loss": 1.421349048614502, "step": 395 }, { "epoch": 0.2140826225121258, "grad_norm": 2.132106324899655, "learning_rate": 7.1301247771836e-07, "logits/chosen": -0.0066552236676216125, "logits/rejected": 0.08685633540153503, "logps/chosen": -1.5027748346328735, "logps/rejected": -1.6426845788955688, "loss": 0.6889, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.5027748346328735, "rewards/margins": 0.1399095356464386, "rewards/rejected": -1.6426845788955688, "sft_loss": 1.400646448135376, "step": 400 }, { "epoch": 0.2140826225121258, "eval_logits/chosen": 0.19446183741092682, "eval_logits/rejected": 0.2817261815071106, "eval_logps/chosen": -1.5228804349899292, "eval_logps/rejected": -1.6954823732376099, "eval_loss": 0.7002516984939575, "eval_rewards/accuracies": 0.5578634738922119, "eval_rewards/chosen": -1.5228804349899292, "eval_rewards/margins": 0.17260216176509857, "eval_rewards/rejected": -1.6954823732376099, "eval_runtime": 43.9468, "eval_samples_per_second": 30.605, "eval_sft_loss": 1.438156247138977, "eval_steps_per_second": 7.668, "step": 400 }, { "epoch": 0.21675865529352734, "grad_norm": 2.238195461356105, "learning_rate": 7.219251336898395e-07, "logits/chosen": -0.07115691900253296, "logits/rejected": 0.022948969155550003, "logps/chosen": -1.5295543670654297, "logps/rejected": -1.6470978260040283, "loss": 0.7159, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.5295543670654297, "rewards/margins": 0.11754367500543594, "rewards/rejected": -1.6470978260040283, "sft_loss": 1.4397932291030884, "step": 405 }, { "epoch": 0.2194346880749289, "grad_norm": 5.739972439145583, "learning_rate": 7.30837789661319e-07, "logits/chosen": -0.049605417996644974, "logits/rejected": 0.07737796008586884, "logps/chosen": -1.4525340795516968, "logps/rejected": -1.6043227910995483, "loss": 0.6972, "rewards/accuracies": 0.5, "rewards/chosen": -1.4525340795516968, "rewards/margins": 0.15178880095481873, "rewards/rejected": -1.6043227910995483, "sft_loss": 1.4185234308242798, "step": 410 }, { "epoch": 0.2221107208563305, "grad_norm": 1.9055908146961429, "learning_rate": 7.397504456327985e-07, "logits/chosen": -0.08537553250789642, "logits/rejected": -0.04558895155787468, "logps/chosen": -1.4500477313995361, "logps/rejected": -1.6230523586273193, "loss": 0.6948, "rewards/accuracies": 0.53125, "rewards/chosen": -1.4500477313995361, "rewards/margins": 0.17300477623939514, "rewards/rejected": -1.6230523586273193, "sft_loss": 1.4089908599853516, "step": 415 }, { "epoch": 0.22478675363773207, "grad_norm": 3.550660441004718, "learning_rate": 7.486631016042781e-07, "logits/chosen": -0.09662449359893799, "logits/rejected": 0.09209617227315903, "logps/chosen": -1.3820686340332031, "logps/rejected": -1.524606704711914, "loss": 0.7069, "rewards/accuracies": 0.5, "rewards/chosen": -1.3820686340332031, "rewards/margins": 0.14253807067871094, "rewards/rejected": -1.524606704711914, "sft_loss": 1.3853243589401245, "step": 420 }, { "epoch": 0.22746278641913364, "grad_norm": 3.1473111106901506, "learning_rate": 7.575757575757575e-07, "logits/chosen": -0.155962273478508, "logits/rejected": 0.03296704962849617, "logps/chosen": -1.4189581871032715, "logps/rejected": -1.6313225030899048, "loss": 0.6883, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4189581871032715, "rewards/margins": 0.2123643457889557, "rewards/rejected": -1.6313225030899048, "sft_loss": 1.4540103673934937, "step": 425 }, { "epoch": 0.2301388192005352, "grad_norm": 3.396706695086868, "learning_rate": 7.664884135472371e-07, "logits/chosen": -0.17719171941280365, "logits/rejected": 0.012565260753035545, "logps/chosen": -1.4164271354675293, "logps/rejected": -1.6276462078094482, "loss": 0.6913, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4164271354675293, "rewards/margins": 0.211218923330307, "rewards/rejected": -1.6276462078094482, "sft_loss": 1.4394110441207886, "step": 430 }, { "epoch": 0.23281485198193677, "grad_norm": 3.0261161806756487, "learning_rate": 7.754010695187165e-07, "logits/chosen": -0.12973877787590027, "logits/rejected": -0.046690475195646286, "logps/chosen": -1.2951653003692627, "logps/rejected": -1.447550654411316, "loss": 0.6843, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2951653003692627, "rewards/margins": 0.15238544344902039, "rewards/rejected": -1.447550654411316, "sft_loss": 1.3464162349700928, "step": 435 }, { "epoch": 0.23549088476333835, "grad_norm": 5.1132228847446575, "learning_rate": 7.84313725490196e-07, "logits/chosen": -0.12432174384593964, "logits/rejected": -0.03838655725121498, "logps/chosen": -1.3688849210739136, "logps/rejected": -1.5176225900650024, "loss": 0.6987, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3688849210739136, "rewards/margins": 0.14873747527599335, "rewards/rejected": -1.5176225900650024, "sft_loss": 1.3759212493896484, "step": 440 }, { "epoch": 0.23816691754473993, "grad_norm": 4.131642586210721, "learning_rate": 7.932263814616755e-07, "logits/chosen": -0.16325688362121582, "logits/rejected": -0.05782388523221016, "logps/chosen": -1.4089410305023193, "logps/rejected": -1.6085622310638428, "loss": 0.703, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4089410305023193, "rewards/margins": 0.19962140917778015, "rewards/rejected": -1.6085622310638428, "sft_loss": 1.4247992038726807, "step": 445 }, { "epoch": 0.2408429503261415, "grad_norm": 1.750905776750948, "learning_rate": 8.02139037433155e-07, "logits/chosen": -0.10383790731430054, "logits/rejected": 0.019616033881902695, "logps/chosen": -1.4496909379959106, "logps/rejected": -1.6070255041122437, "loss": 0.7122, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4496909379959106, "rewards/margins": 0.15733470022678375, "rewards/rejected": -1.6070255041122437, "sft_loss": 1.382320523262024, "step": 450 }, { "epoch": 0.24351898310754308, "grad_norm": 3.171172164305393, "learning_rate": 8.110516934046346e-07, "logits/chosen": -0.12625646591186523, "logits/rejected": -0.045686252415180206, "logps/chosen": -1.4090499877929688, "logps/rejected": -1.6708507537841797, "loss": 0.6904, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4090499877929688, "rewards/margins": 0.2618007957935333, "rewards/rejected": -1.6708507537841797, "sft_loss": 1.3707574605941772, "step": 455 }, { "epoch": 0.24619501588894463, "grad_norm": 3.688534309233378, "learning_rate": 8.19964349376114e-07, "logits/chosen": -0.2515523135662079, "logits/rejected": -0.13388481736183167, "logps/chosen": -1.5547903776168823, "logps/rejected": -1.6721168756484985, "loss": 0.7035, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.5547903776168823, "rewards/margins": 0.11732640117406845, "rewards/rejected": -1.6721168756484985, "sft_loss": 1.5243290662765503, "step": 460 }, { "epoch": 0.2488710486703462, "grad_norm": 4.4349826049597745, "learning_rate": 8.288770053475936e-07, "logits/chosen": -0.016191715374588966, "logits/rejected": -0.002241746988147497, "logps/chosen": -1.5643110275268555, "logps/rejected": -1.755155324935913, "loss": 0.6948, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5643110275268555, "rewards/margins": 0.19084429740905762, "rewards/rejected": -1.755155324935913, "sft_loss": 1.4712715148925781, "step": 465 }, { "epoch": 0.2515470814517478, "grad_norm": 3.172937410518411, "learning_rate": 8.37789661319073e-07, "logits/chosen": 0.01810188591480255, "logits/rejected": -0.020999742671847343, "logps/chosen": -1.4785630702972412, "logps/rejected": -1.6897687911987305, "loss": 0.6951, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4785630702972412, "rewards/margins": 0.21120555698871613, "rewards/rejected": -1.6897687911987305, "sft_loss": 1.4240328073501587, "step": 470 }, { "epoch": 0.25422311423314936, "grad_norm": 2.3986245435415654, "learning_rate": 8.467023172905525e-07, "logits/chosen": -0.18378640711307526, "logits/rejected": -0.04902447760105133, "logps/chosen": -1.4604475498199463, "logps/rejected": -1.8186414241790771, "loss": 0.6848, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4604475498199463, "rewards/margins": 0.3581937849521637, "rewards/rejected": -1.8186414241790771, "sft_loss": 1.4552308320999146, "step": 475 }, { "epoch": 0.2568991470145509, "grad_norm": 7.88706357415983, "learning_rate": 8.55614973262032e-07, "logits/chosen": -0.1797124743461609, "logits/rejected": 0.012060348875820637, "logps/chosen": -1.4463261365890503, "logps/rejected": -1.5902550220489502, "loss": 0.7187, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4463261365890503, "rewards/margins": 0.14392876625061035, "rewards/rejected": -1.5902550220489502, "sft_loss": 1.420996904373169, "step": 480 }, { "epoch": 0.2595751797959525, "grad_norm": 6.436198819046655, "learning_rate": 8.645276292335115e-07, "logits/chosen": -0.1368204802274704, "logits/rejected": -0.09737800061702728, "logps/chosen": -1.592976689338684, "logps/rejected": -1.6954838037490845, "loss": 0.7061, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.592976689338684, "rewards/margins": 0.10250727087259293, "rewards/rejected": -1.6954838037490845, "sft_loss": 1.4994713068008423, "step": 485 }, { "epoch": 0.26225121257735406, "grad_norm": 3.4782486146378475, "learning_rate": 8.734402852049911e-07, "logits/chosen": -0.13630501925945282, "logits/rejected": -0.06619258970022202, "logps/chosen": -1.5162378549575806, "logps/rejected": -1.6546752452850342, "loss": 0.71, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.5162378549575806, "rewards/margins": 0.13843724131584167, "rewards/rejected": -1.6546752452850342, "sft_loss": 1.4422112703323364, "step": 490 }, { "epoch": 0.26492724535875567, "grad_norm": 7.153663720382908, "learning_rate": 8.823529411764705e-07, "logits/chosen": -0.19033238291740417, "logits/rejected": -0.1686793565750122, "logps/chosen": -1.5363590717315674, "logps/rejected": -1.6592457294464111, "loss": 0.7071, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.5363590717315674, "rewards/margins": 0.12288665771484375, "rewards/rejected": -1.6592457294464111, "sft_loss": 1.523964762687683, "step": 495 }, { "epoch": 0.2676032781401572, "grad_norm": 3.1275666171203147, "learning_rate": 8.912655971479501e-07, "logits/chosen": -0.1973910927772522, "logits/rejected": -0.09779137372970581, "logps/chosen": -1.4702038764953613, "logps/rejected": -1.6763232946395874, "loss": 0.7079, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.4702038764953613, "rewards/margins": 0.20611953735351562, "rewards/rejected": -1.6763232946395874, "sft_loss": 1.4103636741638184, "step": 500 }, { "epoch": 0.27027931092155877, "grad_norm": 1.545590729725742, "learning_rate": 9.001782531194295e-07, "logits/chosen": -0.2061690390110016, "logits/rejected": -0.06640944629907608, "logps/chosen": -1.579946756362915, "logps/rejected": -1.6387602090835571, "loss": 0.7003, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.579946756362915, "rewards/margins": 0.05881340429186821, "rewards/rejected": -1.6387602090835571, "sft_loss": 1.489118218421936, "step": 505 }, { "epoch": 0.2729553437029604, "grad_norm": 1.478907150560264, "learning_rate": 9.09090909090909e-07, "logits/chosen": -0.04340224340558052, "logits/rejected": 0.014481568709015846, "logps/chosen": -1.561303973197937, "logps/rejected": -1.7578208446502686, "loss": 0.6952, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.561303973197937, "rewards/margins": 0.19651691615581512, "rewards/rejected": -1.7578208446502686, "sft_loss": 1.4192326068878174, "step": 510 }, { "epoch": 0.2756313764843619, "grad_norm": 3.6300079780170655, "learning_rate": 9.180035650623885e-07, "logits/chosen": -0.11791355907917023, "logits/rejected": -0.02622136101126671, "logps/chosen": -1.4685485363006592, "logps/rejected": -1.6369479894638062, "loss": 0.6788, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4685485363006592, "rewards/margins": 0.16839949786663055, "rewards/rejected": -1.6369479894638062, "sft_loss": 1.401977300643921, "step": 515 }, { "epoch": 0.27830740926576353, "grad_norm": 1.9461711581313945, "learning_rate": 9.26916221033868e-07, "logits/chosen": -0.24227562546730042, "logits/rejected": -0.10975190252065659, "logps/chosen": -1.4977983236312866, "logps/rejected": -1.6738275289535522, "loss": 0.6954, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4977983236312866, "rewards/margins": 0.1760290265083313, "rewards/rejected": -1.6738275289535522, "sft_loss": 1.5253281593322754, "step": 520 }, { "epoch": 0.2809834420471651, "grad_norm": 4.39745711780097, "learning_rate": 9.358288770053476e-07, "logits/chosen": -0.06434062868356705, "logits/rejected": 0.00504128634929657, "logps/chosen": -1.5231517553329468, "logps/rejected": -1.7861477136611938, "loss": 0.6937, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.5231517553329468, "rewards/margins": 0.2629958689212799, "rewards/rejected": -1.7861477136611938, "sft_loss": 1.5069881677627563, "step": 525 }, { "epoch": 0.2836594748285666, "grad_norm": 4.74780941859642, "learning_rate": 9.44741532976827e-07, "logits/chosen": -0.08390182256698608, "logits/rejected": -0.0013960630167275667, "logps/chosen": -1.410300612449646, "logps/rejected": -1.5978670120239258, "loss": 0.7043, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.410300612449646, "rewards/margins": 0.18756639957427979, "rewards/rejected": -1.5978670120239258, "sft_loss": 1.3433904647827148, "step": 530 }, { "epoch": 0.28633550760996823, "grad_norm": 3.656009593259164, "learning_rate": 9.536541889483066e-07, "logits/chosen": -0.2296837866306305, "logits/rejected": 0.028430040925741196, "logps/chosen": -1.4031928777694702, "logps/rejected": -1.5763087272644043, "loss": 0.6837, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4031928777694702, "rewards/margins": 0.17311576008796692, "rewards/rejected": -1.5763087272644043, "sft_loss": 1.34806227684021, "step": 535 }, { "epoch": 0.2890115403913698, "grad_norm": 4.988252598362669, "learning_rate": 9.62566844919786e-07, "logits/chosen": -0.1283363401889801, "logits/rejected": -0.05954523757100105, "logps/chosen": -1.5781985521316528, "logps/rejected": -1.723961591720581, "loss": 0.6985, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5781985521316528, "rewards/margins": 0.14576300978660583, "rewards/rejected": -1.723961591720581, "sft_loss": 1.5613014698028564, "step": 540 }, { "epoch": 0.2916875731727714, "grad_norm": 4.077268940652683, "learning_rate": 9.714795008912655e-07, "logits/chosen": -0.21156974136829376, "logits/rejected": -0.014496455900371075, "logps/chosen": -1.4874637126922607, "logps/rejected": -1.6751620769500732, "loss": 0.6849, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4874637126922607, "rewards/margins": 0.1876983940601349, "rewards/rejected": -1.6751620769500732, "sft_loss": 1.4299863576889038, "step": 545 }, { "epoch": 0.29436360595417294, "grad_norm": 3.843684359548097, "learning_rate": 9.80392156862745e-07, "logits/chosen": -0.10458724200725555, "logits/rejected": -0.03413146734237671, "logps/chosen": -1.4824299812316895, "logps/rejected": -1.6548147201538086, "loss": 0.6957, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4824299812316895, "rewards/margins": 0.17238478362560272, "rewards/rejected": -1.6548147201538086, "sft_loss": 1.416551947593689, "step": 550 }, { "epoch": 0.2970396387355745, "grad_norm": 7.776033781083553, "learning_rate": 9.893048128342244e-07, "logits/chosen": -0.20451894402503967, "logits/rejected": -0.08231017738580704, "logps/chosen": -1.5417841672897339, "logps/rejected": -1.6531574726104736, "loss": 0.6956, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.5417841672897339, "rewards/margins": 0.11137330532073975, "rewards/rejected": -1.6531574726104736, "sft_loss": 1.5059678554534912, "step": 555 }, { "epoch": 0.2997156715169761, "grad_norm": 5.8735917625409435, "learning_rate": 9.98217468805704e-07, "logits/chosen": -0.08917711675167084, "logits/rejected": -0.07247889041900635, "logps/chosen": -1.4098436832427979, "logps/rejected": -1.5879590511322021, "loss": 0.6897, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4098436832427979, "rewards/margins": 0.1781153380870819, "rewards/rejected": -1.5879590511322021, "sft_loss": 1.4960367679595947, "step": 560 }, { "epoch": 0.30239170429837764, "grad_norm": 1.9362695189538406, "learning_rate": 9.999984476788462e-07, "logits/chosen": -0.11081753671169281, "logits/rejected": -0.059358786791563034, "logps/chosen": -1.4940464496612549, "logps/rejected": -1.681850790977478, "loss": 0.7057, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4940464496612549, "rewards/margins": 0.1878044307231903, "rewards/rejected": -1.681850790977478, "sft_loss": 1.4923908710479736, "step": 565 }, { "epoch": 0.30506773707977924, "grad_norm": 3.875133594534664, "learning_rate": 9.999921413906797e-07, "logits/chosen": -0.19271458685398102, "logits/rejected": 0.00990169309079647, "logps/chosen": -1.468133807182312, "logps/rejected": -1.6332943439483643, "loss": 0.6908, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.468133807182312, "rewards/margins": 0.16516050696372986, "rewards/rejected": -1.6332943439483643, "sft_loss": 1.4964772462844849, "step": 570 }, { "epoch": 0.3077437698611808, "grad_norm": 2.8053986938228075, "learning_rate": 9.999809841765644e-07, "logits/chosen": -0.1740867793560028, "logits/rejected": -0.11315342038869858, "logps/chosen": -1.4085177183151245, "logps/rejected": -1.6071035861968994, "loss": 0.705, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4085177183151245, "rewards/margins": 0.1985858678817749, "rewards/rejected": -1.6071035861968994, "sft_loss": 1.40883469581604, "step": 575 }, { "epoch": 0.3104198026425824, "grad_norm": 2.262322344262053, "learning_rate": 9.999649761447477e-07, "logits/chosen": -0.17668426036834717, "logits/rejected": -0.02388429269194603, "logps/chosen": -1.4115188121795654, "logps/rejected": -1.6690114736557007, "loss": 0.687, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4115188121795654, "rewards/margins": 0.25749272108078003, "rewards/rejected": -1.6690114736557007, "sft_loss": 1.399688959121704, "step": 580 }, { "epoch": 0.31309583542398395, "grad_norm": 2.0619495664128116, "learning_rate": 9.999441174505398e-07, "logits/chosen": -0.2010982483625412, "logits/rejected": -0.10281310975551605, "logps/chosen": -1.6184282302856445, "logps/rejected": -1.7165712118148804, "loss": 0.709, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.6184282302856445, "rewards/margins": 0.09814301878213882, "rewards/rejected": -1.7165712118148804, "sft_loss": 1.5608628988265991, "step": 585 }, { "epoch": 0.3157718682053855, "grad_norm": 17.893057152362953, "learning_rate": 9.999184082963116e-07, "logits/chosen": -0.15141572058200836, "logits/rejected": -0.02625897526741028, "logps/chosen": -1.518196702003479, "logps/rejected": -1.5954731702804565, "loss": 0.7093, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.518196702003479, "rewards/margins": 0.07727648317813873, "rewards/rejected": -1.5954731702804565, "sft_loss": 1.5144684314727783, "step": 590 }, { "epoch": 0.3184479009867871, "grad_norm": 5.458181178019198, "learning_rate": 9.998878489314937e-07, "logits/chosen": -0.10900517553091049, "logits/rejected": 0.010265020653605461, "logps/chosen": -1.4177100658416748, "logps/rejected": -1.604640007019043, "loss": 0.7065, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4177100658416748, "rewards/margins": 0.18692997097969055, "rewards/rejected": -1.604640007019043, "sft_loss": 1.4123425483703613, "step": 595 }, { "epoch": 0.32112393376818865, "grad_norm": 3.2116080209981144, "learning_rate": 9.99852439652573e-07, "logits/chosen": -0.18455770611763, "logits/rejected": -0.041767168790102005, "logps/chosen": -1.434877872467041, "logps/rejected": -1.5541597604751587, "loss": 0.7082, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.434877872467041, "rewards/margins": 0.11928168684244156, "rewards/rejected": -1.5541597604751587, "sft_loss": 1.4335283041000366, "step": 600 }, { "epoch": 0.32379996654959026, "grad_norm": 4.074685283282293, "learning_rate": 9.998121808030904e-07, "logits/chosen": -0.20280078053474426, "logits/rejected": -0.12091977894306183, "logps/chosen": -1.5426971912384033, "logps/rejected": -1.742283582687378, "loss": 0.6872, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5426971912384033, "rewards/margins": 0.19958636164665222, "rewards/rejected": -1.742283582687378, "sft_loss": 1.5260711908340454, "step": 605 }, { "epoch": 0.3264759993309918, "grad_norm": 9.705402694855097, "learning_rate": 9.997670727736379e-07, "logits/chosen": -0.10903636366128922, "logits/rejected": 0.03241968899965286, "logps/chosen": -1.4781376123428345, "logps/rejected": -1.6694921255111694, "loss": 0.6948, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4781376123428345, "rewards/margins": 0.1913544237613678, "rewards/rejected": -1.6694921255111694, "sft_loss": 1.4647178649902344, "step": 610 }, { "epoch": 0.32915203211239336, "grad_norm": 1.7131026025783753, "learning_rate": 9.99717116001853e-07, "logits/chosen": -0.18533024191856384, "logits/rejected": -0.08608406037092209, "logps/chosen": -1.4794480800628662, "logps/rejected": -1.7197014093399048, "loss": 0.6895, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4794480800628662, "rewards/margins": 0.24025335907936096, "rewards/rejected": -1.7197014093399048, "sft_loss": 1.4695863723754883, "step": 615 }, { "epoch": 0.33182806489379496, "grad_norm": 3.8641548703644917, "learning_rate": 9.996623109724173e-07, "logits/chosen": -0.09173591434955597, "logits/rejected": -0.02755848690867424, "logps/chosen": -1.5675022602081299, "logps/rejected": -1.7330118417739868, "loss": 0.6939, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.5675022602081299, "rewards/margins": 0.165509432554245, "rewards/rejected": -1.7330118417739868, "sft_loss": 1.5306199789047241, "step": 620 }, { "epoch": 0.3345040976751965, "grad_norm": 3.676252558960378, "learning_rate": 9.996026582170488e-07, "logits/chosen": -0.10520021617412567, "logits/rejected": -0.004293438978493214, "logps/chosen": -1.4628154039382935, "logps/rejected": -1.7190496921539307, "loss": 0.6817, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4628154039382935, "rewards/margins": 0.25623443722724915, "rewards/rejected": -1.7190496921539307, "sft_loss": 1.4512197971343994, "step": 625 }, { "epoch": 0.3371801304565981, "grad_norm": 2.259243380926168, "learning_rate": 9.995381583144996e-07, "logits/chosen": -0.19209852814674377, "logits/rejected": -0.09239007532596588, "logps/chosen": -1.5089333057403564, "logps/rejected": -1.741621732711792, "loss": 0.6898, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5089333057403564, "rewards/margins": 0.23268857598304749, "rewards/rejected": -1.741621732711792, "sft_loss": 1.445467233657837, "step": 630 }, { "epoch": 0.33985616323799966, "grad_norm": 3.8794295408319064, "learning_rate": 9.994688118905471e-07, "logits/chosen": -0.14607372879981995, "logits/rejected": 0.07945680618286133, "logps/chosen": -1.569690227508545, "logps/rejected": -1.7385027408599854, "loss": 0.7069, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.569690227508545, "rewards/margins": 0.16881242394447327, "rewards/rejected": -1.7385027408599854, "sft_loss": 1.5495669841766357, "step": 635 }, { "epoch": 0.3425321960194012, "grad_norm": 3.5814865977797874, "learning_rate": 9.993946196179912e-07, "logits/chosen": -0.22887110710144043, "logits/rejected": -0.044267140328884125, "logps/chosen": -1.5432608127593994, "logps/rejected": -1.735375165939331, "loss": 0.7112, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.5432608127593994, "rewards/margins": 0.1921147108078003, "rewards/rejected": -1.735375165939331, "sft_loss": 1.5614808797836304, "step": 640 }, { "epoch": 0.3452082288008028, "grad_norm": 2.5129021167780103, "learning_rate": 9.993155822166455e-07, "logits/chosen": -0.22399599850177765, "logits/rejected": -0.14003901183605194, "logps/chosen": -1.4522173404693604, "logps/rejected": -1.73953378200531, "loss": 0.6995, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4522173404693604, "rewards/margins": 0.28731635212898254, "rewards/rejected": -1.73953378200531, "sft_loss": 1.4069039821624756, "step": 645 }, { "epoch": 0.34788426158220437, "grad_norm": 3.3412286431292646, "learning_rate": 9.992317004533313e-07, "logits/chosen": -0.18452927470207214, "logits/rejected": -0.04418149217963219, "logps/chosen": -1.625009298324585, "logps/rejected": -1.868293046951294, "loss": 0.7021, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.625009298324585, "rewards/margins": 0.24328365921974182, "rewards/rejected": -1.868293046951294, "sft_loss": 1.5876249074935913, "step": 650 }, { "epoch": 0.350560294363606, "grad_norm": 2.920134399051155, "learning_rate": 9.991429751418696e-07, "logits/chosen": -0.11768593639135361, "logits/rejected": -0.10396134853363037, "logps/chosen": -1.5662527084350586, "logps/rejected": -1.814776062965393, "loss": 0.6946, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5662527084350586, "rewards/margins": 0.24852335453033447, "rewards/rejected": -1.814776062965393, "sft_loss": 1.5303701162338257, "step": 655 }, { "epoch": 0.3532363271450075, "grad_norm": 5.67572610332352, "learning_rate": 9.99049407143074e-07, "logits/chosen": -0.12730535864830017, "logits/rejected": -0.00879682321101427, "logps/chosen": -1.535307765007019, "logps/rejected": -1.6664314270019531, "loss": 0.6939, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.535307765007019, "rewards/margins": 0.1311238706111908, "rewards/rejected": -1.6664314270019531, "sft_loss": 1.5180950164794922, "step": 660 }, { "epoch": 0.35591235992640907, "grad_norm": 4.356303767175146, "learning_rate": 9.989509973647416e-07, "logits/chosen": -0.11556844413280487, "logits/rejected": 0.018486717715859413, "logps/chosen": -1.43712317943573, "logps/rejected": -1.6720008850097656, "loss": 0.6843, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.43712317943573, "rewards/margins": 0.2348775863647461, "rewards/rejected": -1.6720008850097656, "sft_loss": 1.4556975364685059, "step": 665 }, { "epoch": 0.3585883927078107, "grad_norm": 4.5032969292048435, "learning_rate": 9.988477467616445e-07, "logits/chosen": -0.1438135802745819, "logits/rejected": 0.05054495856165886, "logps/chosen": -1.4716922044754028, "logps/rejected": -1.6338465213775635, "loss": 0.6857, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.4716922044754028, "rewards/margins": 0.16215436160564423, "rewards/rejected": -1.6338465213775635, "sft_loss": 1.5467185974121094, "step": 670 }, { "epoch": 0.3612644254892122, "grad_norm": 4.997362924802158, "learning_rate": 9.987396563355205e-07, "logits/chosen": -0.13516470789909363, "logits/rejected": -0.055740244686603546, "logps/chosen": -1.461431860923767, "logps/rejected": -1.7734931707382202, "loss": 0.6841, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.461431860923767, "rewards/margins": 0.3120613992214203, "rewards/rejected": -1.7734931707382202, "sft_loss": 1.4976530075073242, "step": 675 }, { "epoch": 0.36394045827061383, "grad_norm": 12.728909779386376, "learning_rate": 9.986267271350631e-07, "logits/chosen": -0.05813581869006157, "logits/rejected": 0.09768891334533691, "logps/chosen": -1.521172285079956, "logps/rejected": -1.6993563175201416, "loss": 0.7165, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.521172285079956, "rewards/margins": 0.17818418145179749, "rewards/rejected": -1.6993563175201416, "sft_loss": 1.4803167581558228, "step": 680 }, { "epoch": 0.3666164910520154, "grad_norm": 3.256957423420728, "learning_rate": 9.985089602559123e-07, "logits/chosen": -0.10296891629695892, "logits/rejected": 0.04575002193450928, "logps/chosen": -1.4965869188308716, "logps/rejected": -1.6728099584579468, "loss": 0.6928, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4965869188308716, "rewards/margins": 0.1762230098247528, "rewards/rejected": -1.6728099584579468, "sft_loss": 1.4776004552841187, "step": 685 }, { "epoch": 0.369292523833417, "grad_norm": 3.170059736659928, "learning_rate": 9.983863568406428e-07, "logits/chosen": -0.086936354637146, "logits/rejected": -0.05500447005033493, "logps/chosen": -1.4945694208145142, "logps/rejected": -1.7144079208374023, "loss": 0.677, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4945694208145142, "rewards/margins": 0.2198384553194046, "rewards/rejected": -1.7144079208374023, "sft_loss": 1.5325548648834229, "step": 690 }, { "epoch": 0.37196855661481854, "grad_norm": 2.4094839136415516, "learning_rate": 9.982589180787532e-07, "logits/chosen": -0.1298869550228119, "logits/rejected": -0.045692089945077896, "logps/chosen": -1.4005720615386963, "logps/rejected": -1.662145972251892, "loss": 0.6675, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4005720615386963, "rewards/margins": 0.26157405972480774, "rewards/rejected": -1.662145972251892, "sft_loss": 1.422020673751831, "step": 695 }, { "epoch": 0.3746445893962201, "grad_norm": 3.2382070067603257, "learning_rate": 9.981266452066553e-07, "logits/chosen": -0.23172077536582947, "logits/rejected": -0.1004585400223732, "logps/chosen": -1.5804080963134766, "logps/rejected": -1.7512238025665283, "loss": 0.6935, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5804080963134766, "rewards/margins": 0.17081551253795624, "rewards/rejected": -1.7512238025665283, "sft_loss": 1.5292551517486572, "step": 700 }, { "epoch": 0.3773206221776217, "grad_norm": 2.5855757075376427, "learning_rate": 9.979895395076608e-07, "logits/chosen": -0.1970127820968628, "logits/rejected": -0.03319654241204262, "logps/chosen": -1.541736364364624, "logps/rejected": -1.8506510257720947, "loss": 0.6861, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.541736364364624, "rewards/margins": 0.30891457200050354, "rewards/rejected": -1.8506510257720947, "sft_loss": 1.530510663986206, "step": 705 }, { "epoch": 0.37999665495902324, "grad_norm": 3.4405670225413147, "learning_rate": 9.9784760231197e-07, "logits/chosen": -0.06787233799695969, "logits/rejected": 0.027986615896224976, "logps/chosen": -1.4952280521392822, "logps/rejected": -1.7443792819976807, "loss": 0.6854, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4952280521392822, "rewards/margins": 0.24915120005607605, "rewards/rejected": -1.7443792819976807, "sft_loss": 1.4719597101211548, "step": 710 }, { "epoch": 0.38267268774042484, "grad_norm": 3.0786013057975254, "learning_rate": 9.97700834996658e-07, "logits/chosen": -0.13065661489963531, "logits/rejected": 0.029547732323408127, "logps/chosen": -1.6301422119140625, "logps/rejected": -1.8177034854888916, "loss": 0.6997, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6301422119140625, "rewards/margins": 0.1875612437725067, "rewards/rejected": -1.8177034854888916, "sft_loss": 1.5304896831512451, "step": 715 }, { "epoch": 0.3853487205218264, "grad_norm": 2.5010116709016854, "learning_rate": 9.97549238985662e-07, "logits/chosen": -0.06926265358924866, "logits/rejected": 0.11330119520425797, "logps/chosen": -1.645268440246582, "logps/rejected": -1.8578002452850342, "loss": 0.6989, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.645268440246582, "rewards/margins": 0.21253187954425812, "rewards/rejected": -1.8578002452850342, "sft_loss": 1.623335599899292, "step": 720 }, { "epoch": 0.38802475330322794, "grad_norm": 4.336032649046367, "learning_rate": 9.973928157497674e-07, "logits/chosen": -0.1750633716583252, "logits/rejected": -0.03857022523880005, "logps/chosen": -1.452271580696106, "logps/rejected": -1.8203308582305908, "loss": 0.6765, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.452271580696106, "rewards/margins": 0.3680591583251953, "rewards/rejected": -1.8203308582305908, "sft_loss": 1.4848424196243286, "step": 725 }, { "epoch": 0.39070078608462955, "grad_norm": 6.870768364380895, "learning_rate": 9.972315668065927e-07, "logits/chosen": -0.21457481384277344, "logits/rejected": -0.05655393749475479, "logps/chosen": -1.5485544204711914, "logps/rejected": -1.774266004562378, "loss": 0.6866, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5485544204711914, "rewards/margins": 0.225711852312088, "rewards/rejected": -1.774266004562378, "sft_loss": 1.5377953052520752, "step": 730 }, { "epoch": 0.3933768188660311, "grad_norm": 2.839351971144575, "learning_rate": 9.97065493720576e-07, "logits/chosen": -0.1863122284412384, "logits/rejected": -0.0894741415977478, "logps/chosen": -1.5549644231796265, "logps/rejected": -1.7339067459106445, "loss": 0.6869, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5549644231796265, "rewards/margins": 0.17894235253334045, "rewards/rejected": -1.7339067459106445, "sft_loss": 1.5759687423706055, "step": 735 }, { "epoch": 0.3960528516474327, "grad_norm": 4.456809773344875, "learning_rate": 9.968945981029594e-07, "logits/chosen": -0.1679374873638153, "logits/rejected": -0.0004952967283315957, "logps/chosen": -1.6225173473358154, "logps/rejected": -1.7671568393707275, "loss": 0.6921, "rewards/accuracies": 0.53125, "rewards/chosen": -1.6225173473358154, "rewards/margins": 0.14463947713375092, "rewards/rejected": -1.7671568393707275, "sft_loss": 1.6077091693878174, "step": 740 }, { "epoch": 0.39872888442883425, "grad_norm": 3.0879758095157652, "learning_rate": 9.967188816117726e-07, "logits/chosen": -0.045103929936885834, "logits/rejected": 0.02557518519461155, "logps/chosen": -1.6077511310577393, "logps/rejected": -1.9150081872940063, "loss": 0.6895, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6077511310577393, "rewards/margins": 0.3072572350502014, "rewards/rejected": -1.9150081872940063, "sft_loss": 1.5629363059997559, "step": 745 }, { "epoch": 0.4014049172102358, "grad_norm": 8.798708800289246, "learning_rate": 9.965383459518179e-07, "logits/chosen": -0.12965276837348938, "logits/rejected": 0.027466658502817154, "logps/chosen": -1.5537043809890747, "logps/rejected": -1.8158077001571655, "loss": 0.6955, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5537043809890747, "rewards/margins": 0.26210322976112366, "rewards/rejected": -1.8158077001571655, "sft_loss": 1.5344383716583252, "step": 750 }, { "epoch": 0.4040809499916374, "grad_norm": 1.8930806235904913, "learning_rate": 9.963529928746533e-07, "logits/chosen": -0.09079675376415253, "logits/rejected": 0.032065801322460175, "logps/chosen": -1.5689537525177002, "logps/rejected": -1.8230974674224854, "loss": 0.6923, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5689537525177002, "rewards/margins": 0.2541435956954956, "rewards/rejected": -1.8230974674224854, "sft_loss": 1.5655837059020996, "step": 755 }, { "epoch": 0.40675698277303896, "grad_norm": 1.8032216917676425, "learning_rate": 9.961628241785746e-07, "logits/chosen": -0.17655274271965027, "logits/rejected": -0.1094353199005127, "logps/chosen": -1.5864803791046143, "logps/rejected": -1.8785566091537476, "loss": 0.6962, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5864803791046143, "rewards/margins": 0.2920762896537781, "rewards/rejected": -1.8785566091537476, "sft_loss": 1.5739283561706543, "step": 760 }, { "epoch": 0.40943301555444056, "grad_norm": 2.762762201186071, "learning_rate": 9.959678417085998e-07, "logits/chosen": -0.1467718631029129, "logits/rejected": -0.05782966688275337, "logps/chosen": -1.569352388381958, "logps/rejected": -1.7833820581436157, "loss": 0.691, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.569352388381958, "rewards/margins": 0.21402959525585175, "rewards/rejected": -1.7833820581436157, "sft_loss": 1.5140148401260376, "step": 765 }, { "epoch": 0.4121090483358421, "grad_norm": 3.989204115760969, "learning_rate": 9.957680473564493e-07, "logits/chosen": -0.06442215293645859, "logits/rejected": 0.05450627952814102, "logps/chosen": -1.5156924724578857, "logps/rejected": -1.930771827697754, "loss": 0.6583, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5156924724578857, "rewards/margins": 0.41507917642593384, "rewards/rejected": -1.930771827697754, "sft_loss": 1.4957122802734375, "step": 770 }, { "epoch": 0.41478508111724366, "grad_norm": 1.98743781290134, "learning_rate": 9.95563443060529e-07, "logits/chosen": -0.1774439513683319, "logits/rejected": -0.01137492060661316, "logps/chosen": -1.548862099647522, "logps/rejected": -1.8399940729141235, "loss": 0.6824, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.548862099647522, "rewards/margins": 0.2911320626735687, "rewards/rejected": -1.8399940729141235, "sft_loss": 1.480064034461975, "step": 775 }, { "epoch": 0.41746111389864526, "grad_norm": 3.816846659534653, "learning_rate": 9.95354030805911e-07, "logits/chosen": -0.24712248146533966, "logits/rejected": -0.10505137592554092, "logps/chosen": -1.4992997646331787, "logps/rejected": -1.773298978805542, "loss": 0.6846, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4992997646331787, "rewards/margins": 0.27399933338165283, "rewards/rejected": -1.773298978805542, "sft_loss": 1.51784348487854, "step": 780 }, { "epoch": 0.4201371466800468, "grad_norm": 4.540593941105179, "learning_rate": 9.951398126243133e-07, "logits/chosen": -0.11738238483667374, "logits/rejected": 0.006606881506741047, "logps/chosen": -1.467149019241333, "logps/rejected": -1.7952121496200562, "loss": 0.6702, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.467149019241333, "rewards/margins": 0.32806330919265747, "rewards/rejected": -1.7952121496200562, "sft_loss": 1.474399447441101, "step": 785 }, { "epoch": 0.4228131794614484, "grad_norm": 7.143739814074999, "learning_rate": 9.94920790594082e-07, "logits/chosen": -0.16249966621398926, "logits/rejected": -0.037639666348695755, "logps/chosen": -1.5038591623306274, "logps/rejected": -1.7525460720062256, "loss": 0.69, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5038591623306274, "rewards/margins": 0.24868711829185486, "rewards/rejected": -1.7525460720062256, "sft_loss": 1.490281343460083, "step": 790 }, { "epoch": 0.42548921224284997, "grad_norm": 1.8637758614434734, "learning_rate": 9.946969668401696e-07, "logits/chosen": -0.18016555905342102, "logits/rejected": -0.002284090965986252, "logps/chosen": -1.4759074449539185, "logps/rejected": -1.8262622356414795, "loss": 0.6782, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4759074449539185, "rewards/margins": 0.3503546118736267, "rewards/rejected": -1.8262622356414795, "sft_loss": 1.491576075553894, "step": 795 }, { "epoch": 0.4281652450242516, "grad_norm": 3.489424716877646, "learning_rate": 9.944683435341155e-07, "logits/chosen": -0.12133590877056122, "logits/rejected": -0.039609938859939575, "logps/chosen": -1.502015471458435, "logps/rejected": -1.7361009120941162, "loss": 0.6916, "rewards/accuracies": 0.59375, "rewards/chosen": -1.502015471458435, "rewards/margins": 0.23408547043800354, "rewards/rejected": -1.7361009120941162, "sft_loss": 1.4773706197738647, "step": 800 }, { "epoch": 0.4281652450242516, "eval_logits/chosen": 0.2000727504491806, "eval_logits/rejected": 0.2874969244003296, "eval_logps/chosen": -1.5414361953735352, "eval_logps/rejected": -1.8469265699386597, "eval_loss": 0.6821562647819519, "eval_rewards/accuracies": 0.607566773891449, "eval_rewards/chosen": -1.5414361953735352, "eval_rewards/margins": 0.3054904639720917, "eval_rewards/rejected": -1.8469265699386597, "eval_runtime": 43.0492, "eval_samples_per_second": 31.243, "eval_sft_loss": 1.5282062292099, "eval_steps_per_second": 7.828, "step": 800 }, { "epoch": 0.4308412778056531, "grad_norm": 1.475474768839584, "learning_rate": 9.942349228940236e-07, "logits/chosen": -0.1691408008337021, "logits/rejected": -0.016671936959028244, "logps/chosen": -1.5347111225128174, "logps/rejected": -1.9066162109375, "loss": 0.6851, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5347111225128174, "rewards/margins": 0.3719049096107483, "rewards/rejected": -1.9066162109375, "sft_loss": 1.5340166091918945, "step": 805 }, { "epoch": 0.43351731058705467, "grad_norm": 3.0932573552854667, "learning_rate": 9.939967071845424e-07, "logits/chosen": -0.0883767232298851, "logits/rejected": -0.014658985659480095, "logps/chosen": -1.4988377094268799, "logps/rejected": -1.7675899267196655, "loss": 0.6771, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4988377094268799, "rewards/margins": 0.2687521278858185, "rewards/rejected": -1.7675899267196655, "sft_loss": 1.4959461688995361, "step": 810 }, { "epoch": 0.4361933433684563, "grad_norm": 3.792863765233088, "learning_rate": 9.937536987168413e-07, "logits/chosen": -0.07489411532878876, "logits/rejected": 0.04157022386789322, "logps/chosen": -1.438208818435669, "logps/rejected": -1.8774745464324951, "loss": 0.6486, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.438208818435669, "rewards/margins": 0.4392658770084381, "rewards/rejected": -1.8774745464324951, "sft_loss": 1.5085432529449463, "step": 815 }, { "epoch": 0.4388693761498578, "grad_norm": 3.5907793135097483, "learning_rate": 9.935058998485896e-07, "logits/chosen": -0.06854422390460968, "logits/rejected": -0.012159738689661026, "logps/chosen": -1.5659635066986084, "logps/rejected": -1.865256905555725, "loss": 0.695, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5659635066986084, "rewards/margins": 0.2992933392524719, "rewards/rejected": -1.865256905555725, "sft_loss": 1.5442118644714355, "step": 820 }, { "epoch": 0.44154540893125943, "grad_norm": 2.69942669661372, "learning_rate": 9.932533129839333e-07, "logits/chosen": -0.11319144070148468, "logits/rejected": 0.010987621732056141, "logps/chosen": -1.4739656448364258, "logps/rejected": -1.768318772315979, "loss": 0.6781, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4739656448364258, "rewards/margins": 0.29435327649116516, "rewards/rejected": -1.768318772315979, "sft_loss": 1.5512921810150146, "step": 825 }, { "epoch": 0.444221441712661, "grad_norm": 3.602350790071803, "learning_rate": 9.929959405734711e-07, "logits/chosen": -0.023851171135902405, "logits/rejected": 0.13682684302330017, "logps/chosen": -1.5443211793899536, "logps/rejected": -1.7456066608428955, "loss": 0.6904, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.5443211793899536, "rewards/margins": 0.20128539204597473, "rewards/rejected": -1.7456066608428955, "sft_loss": 1.4974839687347412, "step": 830 }, { "epoch": 0.44689747449406253, "grad_norm": 8.314425129967058, "learning_rate": 9.927337851142314e-07, "logits/chosen": -0.06683783233165741, "logits/rejected": 0.05744323879480362, "logps/chosen": -1.4640414714813232, "logps/rejected": -1.708734154701233, "loss": 0.6802, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4640414714813232, "rewards/margins": 0.24469268321990967, "rewards/rejected": -1.708734154701233, "sft_loss": 1.5117323398590088, "step": 835 }, { "epoch": 0.44957350727546413, "grad_norm": 5.804501641676646, "learning_rate": 9.924668491496474e-07, "logits/chosen": -0.08640140295028687, "logits/rejected": 0.07182900607585907, "logps/chosen": -1.510978102684021, "logps/rejected": -1.8227230310440063, "loss": 0.696, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.510978102684021, "rewards/margins": 0.3117448687553406, "rewards/rejected": -1.8227230310440063, "sft_loss": 1.5465790033340454, "step": 840 }, { "epoch": 0.4522495400568657, "grad_norm": 1.7114648252088038, "learning_rate": 9.92195135269533e-07, "logits/chosen": -0.028147101402282715, "logits/rejected": 0.03421204164624214, "logps/chosen": -1.5054690837860107, "logps/rejected": -1.6977351903915405, "loss": 0.6932, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.5054690837860107, "rewards/margins": 0.19226618111133575, "rewards/rejected": -1.6977351903915405, "sft_loss": 1.5626037120819092, "step": 845 }, { "epoch": 0.4549255728382673, "grad_norm": 5.095358497485958, "learning_rate": 9.919186461100574e-07, "logits/chosen": -0.07386619597673416, "logits/rejected": -0.007664171047508717, "logps/chosen": -1.4631752967834473, "logps/rejected": -1.7233270406723022, "loss": 0.678, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4631752967834473, "rewards/margins": 0.2601519227027893, "rewards/rejected": -1.7233270406723022, "sft_loss": 1.4765675067901611, "step": 850 }, { "epoch": 0.45760160561966884, "grad_norm": 2.177709752758681, "learning_rate": 9.9163738435372e-07, "logits/chosen": -0.10711731761693954, "logits/rejected": 0.027393508702516556, "logps/chosen": -1.527491569519043, "logps/rejected": -1.841475248336792, "loss": 0.715, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.527491569519043, "rewards/margins": 0.31398382782936096, "rewards/rejected": -1.841475248336792, "sft_loss": 1.520607829093933, "step": 855 }, { "epoch": 0.4602776384010704, "grad_norm": 1.599740709379549, "learning_rate": 9.913513527293234e-07, "logits/chosen": -0.1450614035129547, "logits/rejected": 0.01107205729931593, "logps/chosen": -1.5596612691879272, "logps/rejected": -1.9219337701797485, "loss": 0.6953, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5596612691879272, "rewards/margins": 0.3622724413871765, "rewards/rejected": -1.9219337701797485, "sft_loss": 1.571467399597168, "step": 860 }, { "epoch": 0.462953671182472, "grad_norm": 4.7574966687692966, "learning_rate": 9.910605540119474e-07, "logits/chosen": -0.04354889690876007, "logits/rejected": 0.05113743618130684, "logps/chosen": -1.4938322305679321, "logps/rejected": -1.8625303506851196, "loss": 0.6779, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4938322305679321, "rewards/margins": 0.36869820952415466, "rewards/rejected": -1.8625303506851196, "sft_loss": 1.4814262390136719, "step": 865 }, { "epoch": 0.46562970396387354, "grad_norm": 2.319826048394682, "learning_rate": 9.907649910229227e-07, "logits/chosen": -0.16301113367080688, "logits/rejected": 0.08414186537265778, "logps/chosen": -1.5025361776351929, "logps/rejected": -1.8213615417480469, "loss": 0.6724, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5025361776351929, "rewards/margins": 0.31882524490356445, "rewards/rejected": -1.8213615417480469, "sft_loss": 1.561790108680725, "step": 870 }, { "epoch": 0.46830573674527515, "grad_norm": 2.5724445594754957, "learning_rate": 9.90464666629803e-07, "logits/chosen": -0.02423214539885521, "logits/rejected": 0.05520091578364372, "logps/chosen": -1.5861196517944336, "logps/rejected": -1.8330475091934204, "loss": 0.7036, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.5861196517944336, "rewards/margins": 0.24692782759666443, "rewards/rejected": -1.8330475091934204, "sft_loss": 1.5468577146530151, "step": 875 }, { "epoch": 0.4709817695266767, "grad_norm": 1.9383173865476142, "learning_rate": 9.901595837463363e-07, "logits/chosen": -0.010085579939186573, "logits/rejected": 0.16605985164642334, "logps/chosen": -1.6291682720184326, "logps/rejected": -1.936402678489685, "loss": 0.6845, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.6291682720184326, "rewards/margins": 0.3072342574596405, "rewards/rejected": -1.936402678489685, "sft_loss": 1.5209325551986694, "step": 880 }, { "epoch": 0.47365780230807825, "grad_norm": 5.317339130080701, "learning_rate": 9.898497453324384e-07, "logits/chosen": -0.10790582746267319, "logits/rejected": -0.022575518116354942, "logps/chosen": -1.5415394306182861, "logps/rejected": -1.8893182277679443, "loss": 0.6734, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5415394306182861, "rewards/margins": 0.3477786183357239, "rewards/rejected": -1.8893182277679443, "sft_loss": 1.5552213191986084, "step": 885 }, { "epoch": 0.47633383508947985, "grad_norm": 3.0959199433553253, "learning_rate": 9.895351543941628e-07, "logits/chosen": -0.20399203896522522, "logits/rejected": -0.07375577837228775, "logps/chosen": -1.563641905784607, "logps/rejected": -1.8401918411254883, "loss": 0.678, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.563641905784607, "rewards/margins": 0.2765499949455261, "rewards/rejected": -1.8401918411254883, "sft_loss": 1.5999377965927124, "step": 890 }, { "epoch": 0.4790098678708814, "grad_norm": 2.5676282566774216, "learning_rate": 9.892158139836724e-07, "logits/chosen": 0.009197077713906765, "logits/rejected": 0.12456144392490387, "logps/chosen": -1.4496952295303345, "logps/rejected": -1.7056611776351929, "loss": 0.6805, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4496952295303345, "rewards/margins": 0.2559662461280823, "rewards/rejected": -1.7056611776351929, "sft_loss": 1.4831942319869995, "step": 895 }, { "epoch": 0.481685900652283, "grad_norm": 2.751949084255799, "learning_rate": 9.88891727199209e-07, "logits/chosen": -0.12211020290851593, "logits/rejected": -0.04600541293621063, "logps/chosen": -1.4672808647155762, "logps/rejected": -1.8262830972671509, "loss": 0.68, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4672808647155762, "rewards/margins": 0.35900211334228516, "rewards/rejected": -1.8262830972671509, "sft_loss": 1.4929606914520264, "step": 900 }, { "epoch": 0.48436193343368455, "grad_norm": 3.6068251039747845, "learning_rate": 9.885628971850641e-07, "logits/chosen": -0.025395523756742477, "logits/rejected": 0.16361042857170105, "logps/chosen": -1.5420572757720947, "logps/rejected": -1.9077465534210205, "loss": 0.6829, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5420572757720947, "rewards/margins": 0.36568912863731384, "rewards/rejected": -1.9077465534210205, "sft_loss": 1.5835001468658447, "step": 905 }, { "epoch": 0.48703796621508616, "grad_norm": 2.4716053388381574, "learning_rate": 9.882293271315481e-07, "logits/chosen": -0.07407438009977341, "logits/rejected": 0.02558054029941559, "logps/chosen": -1.5568764209747314, "logps/rejected": -1.822037696838379, "loss": 0.6959, "rewards/accuracies": 0.53125, "rewards/chosen": -1.5568764209747314, "rewards/margins": 0.2651612162590027, "rewards/rejected": -1.822037696838379, "sft_loss": 1.5309042930603027, "step": 910 }, { "epoch": 0.4897139989964877, "grad_norm": 2.6328372724664586, "learning_rate": 9.878910202749589e-07, "logits/chosen": -0.0730830505490303, "logits/rejected": 0.10969231277704239, "logps/chosen": -1.48826003074646, "logps/rejected": -1.7999156713485718, "loss": 0.678, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.48826003074646, "rewards/margins": 0.3116556406021118, "rewards/rejected": -1.7999156713485718, "sft_loss": 1.4933485984802246, "step": 915 }, { "epoch": 0.49239003177788926, "grad_norm": 4.032568675460934, "learning_rate": 9.875479798975512e-07, "logits/chosen": 0.06822212785482407, "logits/rejected": 0.20987817645072937, "logps/chosen": -1.4264925718307495, "logps/rejected": -1.7993030548095703, "loss": 0.6783, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4264925718307495, "rewards/margins": 0.372810423374176, "rewards/rejected": -1.7993030548095703, "sft_loss": 1.4660489559173584, "step": 920 }, { "epoch": 0.49506606455929086, "grad_norm": 2.776730542659782, "learning_rate": 9.87200209327504e-07, "logits/chosen": -0.09571783244609833, "logits/rejected": 0.06038772314786911, "logps/chosen": -1.5659586191177368, "logps/rejected": -1.7870457172393799, "loss": 0.6844, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5659586191177368, "rewards/margins": 0.22108721733093262, "rewards/rejected": -1.7870457172393799, "sft_loss": 1.5310592651367188, "step": 925 }, { "epoch": 0.4977420973406924, "grad_norm": 9.725427285571993, "learning_rate": 9.868477119388894e-07, "logits/chosen": -0.10902807861566544, "logits/rejected": 0.005884545389562845, "logps/chosen": -1.4849127531051636, "logps/rejected": -1.9125696420669556, "loss": 0.6933, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4849127531051636, "rewards/margins": 0.4276568293571472, "rewards/rejected": -1.9125696420669556, "sft_loss": 1.4972728490829468, "step": 930 }, { "epoch": 0.500418130122094, "grad_norm": 3.6272589230670826, "learning_rate": 9.864904911516383e-07, "logits/chosen": -0.019013594835996628, "logits/rejected": 0.030416369438171387, "logps/chosen": -1.462626576423645, "logps/rejected": -1.7875211238861084, "loss": 0.6793, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.462626576423645, "rewards/margins": 0.3248947858810425, "rewards/rejected": -1.7875211238861084, "sft_loss": 1.503143072128296, "step": 935 }, { "epoch": 0.5030941629034956, "grad_norm": 2.956874199257772, "learning_rate": 9.861285504315084e-07, "logits/chosen": -0.04178246855735779, "logits/rejected": 0.06810219585895538, "logps/chosen": -1.5072715282440186, "logps/rejected": -1.7970716953277588, "loss": 0.6726, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5072715282440186, "rewards/margins": 0.2898002564907074, "rewards/rejected": -1.7970716953277588, "sft_loss": 1.5333386659622192, "step": 940 }, { "epoch": 0.5057701956848971, "grad_norm": 3.264190471072544, "learning_rate": 9.857618932900502e-07, "logits/chosen": -0.10239378362894058, "logits/rejected": 0.022822603583335876, "logps/chosen": -1.488178014755249, "logps/rejected": -1.878474235534668, "loss": 0.675, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.488178014755249, "rewards/margins": 0.39029616117477417, "rewards/rejected": -1.878474235534668, "sft_loss": 1.497314214706421, "step": 945 }, { "epoch": 0.5084462284662987, "grad_norm": 3.043037021365146, "learning_rate": 9.853905232845727e-07, "logits/chosen": -0.08505845814943314, "logits/rejected": 0.07929755002260208, "logps/chosen": -1.5903935432434082, "logps/rejected": -1.8115625381469727, "loss": 0.6943, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.5903935432434082, "rewards/margins": 0.22116902470588684, "rewards/rejected": -1.8115625381469727, "sft_loss": 1.5833137035369873, "step": 950 }, { "epoch": 0.5111222612477003, "grad_norm": 1.7750056065969102, "learning_rate": 9.850144440181095e-07, "logits/chosen": -0.039474982768297195, "logits/rejected": 0.17847755551338196, "logps/chosen": -1.6058772802352905, "logps/rejected": -1.8520838022232056, "loss": 0.6964, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.6058772802352905, "rewards/margins": 0.24620631337165833, "rewards/rejected": -1.8520838022232056, "sft_loss": 1.644765853881836, "step": 955 }, { "epoch": 0.5137982940291018, "grad_norm": 3.8779332519956418, "learning_rate": 9.846336591393832e-07, "logits/chosen": -0.08051494508981705, "logits/rejected": 0.058378733694553375, "logps/chosen": -1.557374119758606, "logps/rejected": -1.7982490062713623, "loss": 0.6883, "rewards/accuracies": 0.625, "rewards/chosen": -1.557374119758606, "rewards/margins": 0.2408749759197235, "rewards/rejected": -1.7982490062713623, "sft_loss": 1.567808747291565, "step": 960 }, { "epoch": 0.5164743268105034, "grad_norm": 5.284646234885677, "learning_rate": 9.842481723427704e-07, "logits/chosen": 0.021580982953310013, "logits/rejected": 0.025302177295088768, "logps/chosen": -1.6421302556991577, "logps/rejected": -1.9848263263702393, "loss": 0.6849, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.6421302556991577, "rewards/margins": 0.3426961302757263, "rewards/rejected": -1.9848263263702393, "sft_loss": 1.6509084701538086, "step": 965 }, { "epoch": 0.519150359591905, "grad_norm": 6.072098892973339, "learning_rate": 9.838579873682658e-07, "logits/chosen": -0.004926004912704229, "logits/rejected": 0.003906804136931896, "logps/chosen": -1.5009397268295288, "logps/rejected": -1.800026535987854, "loss": 0.6973, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5009397268295288, "rewards/margins": 0.29908668994903564, "rewards/rejected": -1.800026535987854, "sft_loss": 1.487810730934143, "step": 970 }, { "epoch": 0.5218263923733065, "grad_norm": 2.0130232142621383, "learning_rate": 9.834631080014457e-07, "logits/chosen": -0.13648810982704163, "logits/rejected": 0.041145212948322296, "logps/chosen": -1.5612967014312744, "logps/rejected": -1.838753342628479, "loss": 0.6881, "rewards/accuracies": 0.625, "rewards/chosen": -1.5612967014312744, "rewards/margins": 0.277456670999527, "rewards/rejected": -1.838753342628479, "sft_loss": 1.5941799879074097, "step": 975 }, { "epoch": 0.5245024251547081, "grad_norm": 10.066727538058064, "learning_rate": 9.830635380734312e-07, "logits/chosen": -0.13426437973976135, "logits/rejected": 0.055471908301115036, "logps/chosen": -1.6321967840194702, "logps/rejected": -1.9175608158111572, "loss": 0.7019, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.6321967840194702, "rewards/margins": 0.28536421060562134, "rewards/rejected": -1.9175608158111572, "sft_loss": 1.6166508197784424, "step": 980 }, { "epoch": 0.5271784579361097, "grad_norm": 4.760869708002439, "learning_rate": 9.826592814608517e-07, "logits/chosen": -0.025986677035689354, "logits/rejected": 0.16338834166526794, "logps/chosen": -1.5891669988632202, "logps/rejected": -1.8759765625, "loss": 0.6823, "rewards/accuracies": 0.625, "rewards/chosen": -1.5891669988632202, "rewards/margins": 0.28680968284606934, "rewards/rejected": -1.8759765625, "sft_loss": 1.6372543573379517, "step": 985 }, { "epoch": 0.5298544907175113, "grad_norm": 3.9398474753155375, "learning_rate": 9.822503420858067e-07, "logits/chosen": 0.008121741004288197, "logits/rejected": 0.047742851078510284, "logps/chosen": -1.3868433237075806, "logps/rejected": -1.7650667428970337, "loss": 0.6691, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3868433237075806, "rewards/margins": 0.3782234787940979, "rewards/rejected": -1.7650667428970337, "sft_loss": 1.4816190004348755, "step": 990 }, { "epoch": 0.5325305234989128, "grad_norm": 6.487750700925487, "learning_rate": 9.818367239158277e-07, "logits/chosen": 0.007622921373695135, "logits/rejected": 0.08368368446826935, "logps/chosen": -1.5163058042526245, "logps/rejected": -1.7254890203475952, "loss": 0.7072, "rewards/accuracies": 0.53125, "rewards/chosen": -1.5163058042526245, "rewards/margins": 0.20918314158916473, "rewards/rejected": -1.7254890203475952, "sft_loss": 1.573646068572998, "step": 995 }, { "epoch": 0.5352065562803144, "grad_norm": 2.4530057893357196, "learning_rate": 9.8141843096384e-07, "logits/chosen": 0.004174326546490192, "logits/rejected": 0.11828531324863434, "logps/chosen": -1.5501234531402588, "logps/rejected": -1.928701400756836, "loss": 0.6713, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.5501234531402588, "rewards/margins": 0.3785778284072876, "rewards/rejected": -1.928701400756836, "sft_loss": 1.5593101978302002, "step": 1000 }, { "epoch": 0.537882589061716, "grad_norm": 7.3330298167234, "learning_rate": 9.809954672881237e-07, "logits/chosen": -0.007203756831586361, "logits/rejected": 0.14894136786460876, "logps/chosen": -1.604166030883789, "logps/rejected": -1.933411955833435, "loss": 0.6907, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.604166030883789, "rewards/margins": 0.32924580574035645, "rewards/rejected": -1.933411955833435, "sft_loss": 1.6360177993774414, "step": 1005 }, { "epoch": 0.5405586218431175, "grad_norm": 1.9102616369453456, "learning_rate": 9.80567836992274e-07, "logits/chosen": -0.05284330993890762, "logits/rejected": 0.12526333332061768, "logps/chosen": -1.4626752138137817, "logps/rejected": -1.8857272863388062, "loss": 0.6693, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4626752138137817, "rewards/margins": 0.4230521619319916, "rewards/rejected": -1.8857272863388062, "sft_loss": 1.4868780374526978, "step": 1010 }, { "epoch": 0.5432346546245191, "grad_norm": 4.945281070667856, "learning_rate": 9.801355442251625e-07, "logits/chosen": -0.06405460834503174, "logits/rejected": 0.10280958563089371, "logps/chosen": -1.4857017993927002, "logps/rejected": -1.8511368036270142, "loss": 0.6692, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4857017993927002, "rewards/margins": 0.3654349148273468, "rewards/rejected": -1.8511368036270142, "sft_loss": 1.5392990112304688, "step": 1015 }, { "epoch": 0.5459106874059207, "grad_norm": 3.4180212766462117, "learning_rate": 9.796985931808949e-07, "logits/chosen": -0.035744160413742065, "logits/rejected": 0.0958310067653656, "logps/chosen": -1.5413570404052734, "logps/rejected": -1.9201008081436157, "loss": 0.6724, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5413570404052734, "rewards/margins": 0.37874364852905273, "rewards/rejected": -1.9201008081436157, "sft_loss": 1.590633511543274, "step": 1020 }, { "epoch": 0.5485867201873222, "grad_norm": 3.377588865739187, "learning_rate": 9.792569880987724e-07, "logits/chosen": -0.08251983672380447, "logits/rejected": 0.03991267830133438, "logps/chosen": -1.5043127536773682, "logps/rejected": -1.9541893005371094, "loss": 0.6588, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5043127536773682, "rewards/margins": 0.44987648725509644, "rewards/rejected": -1.9541893005371094, "sft_loss": 1.514143705368042, "step": 1025 }, { "epoch": 0.5512627529687238, "grad_norm": 3.632024377290369, "learning_rate": 9.788107332632493e-07, "logits/chosen": -0.021055901423096657, "logits/rejected": 0.06645959615707397, "logps/chosen": -1.5638939142227173, "logps/rejected": -1.8698257207870483, "loss": 0.68, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.5638939142227173, "rewards/margins": 0.3059318959712982, "rewards/rejected": -1.8698257207870483, "sft_loss": 1.5990447998046875, "step": 1030 }, { "epoch": 0.5539387857501255, "grad_norm": 2.447338930418058, "learning_rate": 9.783598330038924e-07, "logits/chosen": -0.03157895803451538, "logits/rejected": 0.0898062214255333, "logps/chosen": -1.646627426147461, "logps/rejected": -1.8698803186416626, "loss": 0.6907, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.646627426147461, "rewards/margins": 0.22325296700000763, "rewards/rejected": -1.8698803186416626, "sft_loss": 1.629865050315857, "step": 1035 }, { "epoch": 0.5566148185315271, "grad_norm": 1.9963882491988056, "learning_rate": 9.779042916953376e-07, "logits/chosen": 0.0005223065381869674, "logits/rejected": 0.16107748448848724, "logps/chosen": -1.5309419631958008, "logps/rejected": -1.9671926498413086, "loss": 0.6754, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5309419631958008, "rewards/margins": 0.4362506866455078, "rewards/rejected": -1.9671926498413086, "sft_loss": 1.5817630290985107, "step": 1040 }, { "epoch": 0.5592908513129285, "grad_norm": 3.1737320220160146, "learning_rate": 9.774441137572487e-07, "logits/chosen": -0.06241927295923233, "logits/rejected": 0.08840295672416687, "logps/chosen": -1.5109721422195435, "logps/rejected": -1.9459927082061768, "loss": 0.6927, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5109721422195435, "rewards/margins": 0.4350206255912781, "rewards/rejected": -1.9459927082061768, "sft_loss": 1.587691307067871, "step": 1045 }, { "epoch": 0.5619668840943302, "grad_norm": 17.451231917179324, "learning_rate": 9.76979303654274e-07, "logits/chosen": -0.07376951724290848, "logits/rejected": 0.03201202303171158, "logps/chosen": -1.6037285327911377, "logps/rejected": -2.057170867919922, "loss": 0.6891, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.6037285327911377, "rewards/margins": 0.4534422755241394, "rewards/rejected": -2.057170867919922, "sft_loss": 1.6328027248382568, "step": 1050 }, { "epoch": 0.5646429168757318, "grad_norm": 9.148526949692766, "learning_rate": 9.765098658960035e-07, "logits/chosen": -0.02030854858458042, "logits/rejected": 0.06003781035542488, "logps/chosen": -1.5734418630599976, "logps/rejected": -1.9668676853179932, "loss": 0.6757, "rewards/accuracies": 0.625, "rewards/chosen": -1.5734418630599976, "rewards/margins": 0.3934256434440613, "rewards/rejected": -1.9668676853179932, "sft_loss": 1.5984394550323486, "step": 1055 }, { "epoch": 0.5673189496571333, "grad_norm": 3.534136389187126, "learning_rate": 9.76035805036924e-07, "logits/chosen": 0.031088626012206078, "logits/rejected": 0.21325087547302246, "logps/chosen": -1.666304588317871, "logps/rejected": -1.995410680770874, "loss": 0.6915, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.666304588317871, "rewards/margins": 0.3291061818599701, "rewards/rejected": -1.995410680770874, "sft_loss": 1.626892328262329, "step": 1060 }, { "epoch": 0.5699949824385349, "grad_norm": 3.465326478762225, "learning_rate": 9.755571256763764e-07, "logits/chosen": 0.017907222732901573, "logits/rejected": 0.14953218400478363, "logps/chosen": -1.5562763214111328, "logps/rejected": -2.0238089561462402, "loss": 0.6665, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5562763214111328, "rewards/margins": 0.4675326943397522, "rewards/rejected": -2.0238089561462402, "sft_loss": 1.6350345611572266, "step": 1065 }, { "epoch": 0.5726710152199365, "grad_norm": 2.1403087762463056, "learning_rate": 9.750738324585097e-07, "logits/chosen": -0.11527495086193085, "logits/rejected": 0.11920982599258423, "logps/chosen": -1.592320203781128, "logps/rejected": -1.9931793212890625, "loss": 0.6708, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.592320203781128, "rewards/margins": 0.4008590579032898, "rewards/rejected": -1.9931793212890625, "sft_loss": 1.589946985244751, "step": 1070 }, { "epoch": 0.5753470480013381, "grad_norm": 2.840207538309126, "learning_rate": 9.74585930072237e-07, "logits/chosen": -0.07293342798948288, "logits/rejected": 0.04848995804786682, "logps/chosen": -1.5443998575210571, "logps/rejected": -1.9821274280548096, "loss": 0.6806, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5443998575210571, "rewards/margins": 0.4377274513244629, "rewards/rejected": -1.9821274280548096, "sft_loss": 1.5755616426467896, "step": 1075 }, { "epoch": 0.5780230807827396, "grad_norm": 3.4426385094229985, "learning_rate": 9.740934232511892e-07, "logits/chosen": -0.14516997337341309, "logits/rejected": -0.037937816232442856, "logps/chosen": -1.703258752822876, "logps/rejected": -1.9618574380874634, "loss": 0.693, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.703258752822876, "rewards/margins": 0.2585986852645874, "rewards/rejected": -1.9618574380874634, "sft_loss": 1.6879968643188477, "step": 1080 }, { "epoch": 0.5806991135641412, "grad_norm": 3.3916420368703224, "learning_rate": 9.735963167736698e-07, "logits/chosen": -0.06839573383331299, "logits/rejected": 0.09993009269237518, "logps/chosen": -1.6345523595809937, "logps/rejected": -1.83871328830719, "loss": 0.704, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.6345523595809937, "rewards/margins": 0.20416107773780823, "rewards/rejected": -1.83871328830719, "sft_loss": 1.6142337322235107, "step": 1085 }, { "epoch": 0.5833751463455428, "grad_norm": 2.1757545678128323, "learning_rate": 9.730946154626078e-07, "logits/chosen": -0.07336665689945221, "logits/rejected": 0.030426010489463806, "logps/chosen": -1.6222292184829712, "logps/rejected": -1.806684136390686, "loss": 0.6954, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.6222292184829712, "rewards/margins": 0.18445488810539246, "rewards/rejected": -1.806684136390686, "sft_loss": 1.560254693031311, "step": 1090 }, { "epoch": 0.5860511791269443, "grad_norm": 3.103314219915245, "learning_rate": 9.725883241855117e-07, "logits/chosen": -0.1993558704853058, "logits/rejected": -0.06145832687616348, "logps/chosen": -1.6323795318603516, "logps/rejected": -2.0493452548980713, "loss": 0.6879, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6323795318603516, "rewards/margins": 0.4169657826423645, "rewards/rejected": -2.0493452548980713, "sft_loss": 1.6026633977890015, "step": 1095 }, { "epoch": 0.5887272119083459, "grad_norm": 5.244140816418331, "learning_rate": 9.720774478544218e-07, "logits/chosen": -0.0671815425157547, "logits/rejected": 0.04157133400440216, "logps/chosen": -1.5560808181762695, "logps/rejected": -2.0572400093078613, "loss": 0.6703, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5560808181762695, "rewards/margins": 0.5011593103408813, "rewards/rejected": -2.0572400093078613, "sft_loss": 1.509490966796875, "step": 1100 }, { "epoch": 0.5914032446897475, "grad_norm": 3.360600229736125, "learning_rate": 9.715619914258624e-07, "logits/chosen": -0.13002343475818634, "logits/rejected": -0.04660915583372116, "logps/chosen": -1.6290388107299805, "logps/rejected": -1.9336614608764648, "loss": 0.6895, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.6290388107299805, "rewards/margins": 0.304622620344162, "rewards/rejected": -1.9336614608764648, "sft_loss": 1.5408756732940674, "step": 1105 }, { "epoch": 0.594079277471149, "grad_norm": 4.119757221087749, "learning_rate": 9.710419599007937e-07, "logits/chosen": -0.0894903913140297, "logits/rejected": 0.03421594947576523, "logps/chosen": -1.5715105533599854, "logps/rejected": -1.8307205438613892, "loss": 0.6916, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5715105533599854, "rewards/margins": 0.2592098116874695, "rewards/rejected": -1.8307205438613892, "sft_loss": 1.5369572639465332, "step": 1110 }, { "epoch": 0.5967553102525506, "grad_norm": 9.965157899426226, "learning_rate": 9.705173583245643e-07, "logits/chosen": -0.00039502381696365774, "logits/rejected": 0.12465800344944, "logps/chosen": -1.50301194190979, "logps/rejected": -1.9628212451934814, "loss": 0.6709, "rewards/accuracies": 0.6875, "rewards/chosen": -1.50301194190979, "rewards/margins": 0.45980945229530334, "rewards/rejected": -1.9628212451934814, "sft_loss": 1.4338786602020264, "step": 1115 }, { "epoch": 0.5994313430339522, "grad_norm": 3.5833047947801533, "learning_rate": 9.699881917868609e-07, "logits/chosen": -0.17405982315540314, "logits/rejected": -0.06815730035305023, "logps/chosen": -1.4872372150421143, "logps/rejected": -1.8442754745483398, "loss": 0.6695, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4872372150421143, "rewards/margins": 0.3570381700992584, "rewards/rejected": -1.8442754745483398, "sft_loss": 1.5191495418548584, "step": 1120 }, { "epoch": 0.6021073758153538, "grad_norm": 3.9352368286368864, "learning_rate": 9.694544654216594e-07, "logits/chosen": -0.1691105216741562, "logits/rejected": 0.017339913174510002, "logps/chosen": -1.5508992671966553, "logps/rejected": -1.9966919422149658, "loss": 0.6686, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5508992671966553, "rewards/margins": 0.4457928240299225, "rewards/rejected": -1.9966919422149658, "sft_loss": 1.5330065488815308, "step": 1125 }, { "epoch": 0.6047834085967553, "grad_norm": 2.761349442479446, "learning_rate": 9.689161844071755e-07, "logits/chosen": -0.0050021009519696236, "logits/rejected": 0.06278308480978012, "logps/chosen": -1.5469844341278076, "logps/rejected": -1.8885329961776733, "loss": 0.6849, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5469844341278076, "rewards/margins": 0.34154844284057617, "rewards/rejected": -1.8885329961776733, "sft_loss": 1.4816900491714478, "step": 1130 }, { "epoch": 0.6074594413781569, "grad_norm": 3.9949752862983052, "learning_rate": 9.683733539658138e-07, "logits/chosen": -0.06258795410394669, "logits/rejected": 0.09723483771085739, "logps/chosen": -1.6089904308319092, "logps/rejected": -1.987370252609253, "loss": 0.6762, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6089904308319092, "rewards/margins": 0.37837955355644226, "rewards/rejected": -1.987370252609253, "sft_loss": 1.4937976598739624, "step": 1135 }, { "epoch": 0.6101354741595585, "grad_norm": 6.5844448907344395, "learning_rate": 9.678259793641178e-07, "logits/chosen": -0.07619436085224152, "logits/rejected": -0.03522288054227829, "logps/chosen": -1.5906662940979004, "logps/rejected": -1.7915910482406616, "loss": 0.6957, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.5906662940979004, "rewards/margins": 0.20092466473579407, "rewards/rejected": -1.7915910482406616, "sft_loss": 1.6237761974334717, "step": 1140 }, { "epoch": 0.61281150694096, "grad_norm": 5.35326679908279, "learning_rate": 9.672740659127183e-07, "logits/chosen": -0.2236470729112625, "logits/rejected": -0.10575082153081894, "logps/chosen": -1.545088291168213, "logps/rejected": -1.9672361612319946, "loss": 0.6871, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.545088291168213, "rewards/margins": 0.42214781045913696, "rewards/rejected": -1.9672361612319946, "sft_loss": 1.5959243774414062, "step": 1145 }, { "epoch": 0.6154875397223616, "grad_norm": 4.0563338921443535, "learning_rate": 9.667176189662818e-07, "logits/chosen": -0.2021702527999878, "logits/rejected": -0.0658130794763565, "logps/chosen": -1.4679720401763916, "logps/rejected": -1.866217851638794, "loss": 0.672, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4679720401763916, "rewards/margins": 0.39824575185775757, "rewards/rejected": -1.866217851638794, "sft_loss": 1.447820782661438, "step": 1150 }, { "epoch": 0.6181635725037632, "grad_norm": 3.0364513987292607, "learning_rate": 9.661566439234592e-07, "logits/chosen": -0.05491810292005539, "logits/rejected": 0.03884059563279152, "logps/chosen": -1.555903673171997, "logps/rejected": -1.8066275119781494, "loss": 0.7026, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.555903673171997, "rewards/margins": 0.25072377920150757, "rewards/rejected": -1.8066275119781494, "sft_loss": 1.578338384628296, "step": 1155 }, { "epoch": 0.6208396052851648, "grad_norm": 3.3942088776852763, "learning_rate": 9.655911462268327e-07, "logits/chosen": -0.0013744793832302094, "logits/rejected": 0.10036492347717285, "logps/chosen": -1.4689775705337524, "logps/rejected": -1.851769208908081, "loss": 0.6687, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4689775705337524, "rewards/margins": 0.3827916979789734, "rewards/rejected": -1.851769208908081, "sft_loss": 1.533111333847046, "step": 1160 }, { "epoch": 0.6235156380665663, "grad_norm": 3.560423950376706, "learning_rate": 9.650211313628636e-07, "logits/chosen": -0.06372959911823273, "logits/rejected": 0.011961914598941803, "logps/chosen": -1.4778475761413574, "logps/rejected": -1.7782748937606812, "loss": 0.6678, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4778475761413574, "rewards/margins": 0.3004273772239685, "rewards/rejected": -1.7782748937606812, "sft_loss": 1.4882819652557373, "step": 1165 }, { "epoch": 0.6261916708479679, "grad_norm": 3.47458523618669, "learning_rate": 9.644466048618386e-07, "logits/chosen": -0.10142596065998077, "logits/rejected": 0.052701033651828766, "logps/chosen": -1.748448133468628, "logps/rejected": -2.0238280296325684, "loss": 0.6941, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.748448133468628, "rewards/margins": 0.2753797173500061, "rewards/rejected": -2.0238280296325684, "sft_loss": 1.6605145931243896, "step": 1170 }, { "epoch": 0.6288677036293695, "grad_norm": 2.4810129789735975, "learning_rate": 9.63867572297816e-07, "logits/chosen": -0.07748343050479889, "logits/rejected": 0.10826456546783447, "logps/chosen": -1.5186102390289307, "logps/rejected": -1.8455289602279663, "loss": 0.6808, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5186102390289307, "rewards/margins": 0.32691866159439087, "rewards/rejected": -1.8455289602279663, "sft_loss": 1.5482877492904663, "step": 1175 }, { "epoch": 0.631543736410771, "grad_norm": 1.5032379010693973, "learning_rate": 9.632840392885727e-07, "logits/chosen": -0.10568971931934357, "logits/rejected": 0.030995279550552368, "logps/chosen": -1.6142991781234741, "logps/rejected": -1.9687931537628174, "loss": 0.6828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6142991781234741, "rewards/margins": 0.3544939458370209, "rewards/rejected": -1.9687931537628174, "sft_loss": 1.591963529586792, "step": 1180 }, { "epoch": 0.6342197691921726, "grad_norm": 2.1484957476697897, "learning_rate": 9.626960114955483e-07, "logits/chosen": -0.042309779673814774, "logits/rejected": 0.09029584378004074, "logps/chosen": -1.5777010917663574, "logps/rejected": -2.0055551528930664, "loss": 0.6825, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5777010917663574, "rewards/margins": 0.4278542399406433, "rewards/rejected": -2.0055551528930664, "sft_loss": 1.558754324913025, "step": 1185 }, { "epoch": 0.6368958019735742, "grad_norm": 3.3425717204320717, "learning_rate": 9.621034946237909e-07, "logits/chosen": -0.12501846253871918, "logits/rejected": 0.015322742983698845, "logps/chosen": -1.6131792068481445, "logps/rejected": -2.024101734161377, "loss": 0.6636, "rewards/accuracies": 0.625, "rewards/chosen": -1.6131792068481445, "rewards/margins": 0.4109226167201996, "rewards/rejected": -2.024101734161377, "sft_loss": 1.6081043481826782, "step": 1190 }, { "epoch": 0.6395718347549757, "grad_norm": 5.308022190457999, "learning_rate": 9.615064944219021e-07, "logits/chosen": -0.07765809446573257, "logits/rejected": 0.03634956479072571, "logps/chosen": -1.4784530401229858, "logps/rejected": -1.9112812280654907, "loss": 0.6714, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4784530401229858, "rewards/margins": 0.43282780051231384, "rewards/rejected": -1.9112812280654907, "sft_loss": 1.542213797569275, "step": 1195 }, { "epoch": 0.6422478675363773, "grad_norm": 8.271258182294176, "learning_rate": 9.609050166819803e-07, "logits/chosen": -0.13535340130329132, "logits/rejected": -0.06341275572776794, "logps/chosen": -1.5599576234817505, "logps/rejected": -1.8556772470474243, "loss": 0.6757, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5599576234817505, "rewards/margins": 0.29571956396102905, "rewards/rejected": -1.8556772470474243, "sft_loss": 1.5496803522109985, "step": 1200 }, { "epoch": 0.6422478675363773, "eval_logits/chosen": 0.20428466796875, "eval_logits/rejected": 0.2921755313873291, "eval_logps/chosen": -1.5600173473358154, "eval_logps/rejected": -1.9539307355880737, "eval_loss": 0.6770769357681274, "eval_rewards/accuracies": 0.6216617226600647, "eval_rewards/chosen": -1.5600173473358154, "eval_rewards/margins": 0.39391323924064636, "eval_rewards/rejected": -1.9539307355880737, "eval_runtime": 43.3031, "eval_samples_per_second": 31.06, "eval_sft_loss": 1.5573872327804565, "eval_steps_per_second": 7.782, "step": 1200 }, { "epoch": 0.6449239003177789, "grad_norm": 7.702822915063556, "learning_rate": 9.602990672395653e-07, "logits/chosen": -0.2256489247083664, "logits/rejected": -0.05405454710125923, "logps/chosen": -1.5026648044586182, "logps/rejected": -1.8729171752929688, "loss": 0.6732, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5026648044586182, "rewards/margins": 0.37025216221809387, "rewards/rejected": -1.8729171752929688, "sft_loss": 1.5461900234222412, "step": 1205 }, { "epoch": 0.6475999330991805, "grad_norm": 6.35721552375853, "learning_rate": 9.59688651973581e-07, "logits/chosen": -0.12963464856147766, "logits/rejected": 0.05106347054243088, "logps/chosen": -1.5107980966567993, "logps/rejected": -1.8580278158187866, "loss": 0.6868, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5107980966567993, "rewards/margins": 0.3472297489643097, "rewards/rejected": -1.8580278158187866, "sft_loss": 1.5099369287490845, "step": 1210 }, { "epoch": 0.650275965880582, "grad_norm": 1.866771016177349, "learning_rate": 9.590737768062792e-07, "logits/chosen": -0.20429301261901855, "logits/rejected": -0.09392275661230087, "logps/chosen": -1.5623438358306885, "logps/rejected": -1.8264286518096924, "loss": 0.6989, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5623438358306885, "rewards/margins": 0.26408472657203674, "rewards/rejected": -1.8264286518096924, "sft_loss": 1.5577224493026733, "step": 1215 }, { "epoch": 0.6529519986619836, "grad_norm": 6.372271471250699, "learning_rate": 9.584544477031816e-07, "logits/chosen": 0.006414422299712896, "logits/rejected": 0.10992386192083359, "logps/chosen": -1.464158058166504, "logps/rejected": -1.7496296167373657, "loss": 0.6895, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.464158058166504, "rewards/margins": 0.28547143936157227, "rewards/rejected": -1.7496296167373657, "sft_loss": 1.4565626382827759, "step": 1220 }, { "epoch": 0.6556280314433852, "grad_norm": 3.3112098860934522, "learning_rate": 9.578306706730215e-07, "logits/chosen": -0.22578708827495575, "logits/rejected": -0.021116072311997414, "logps/chosen": -1.5680522918701172, "logps/rejected": -1.8750545978546143, "loss": 0.685, "rewards/accuracies": 0.625, "rewards/chosen": -1.5680522918701172, "rewards/margins": 0.3070022463798523, "rewards/rejected": -1.8750545978546143, "sft_loss": 1.5702797174453735, "step": 1225 }, { "epoch": 0.6583040642247867, "grad_norm": 5.484447391905437, "learning_rate": 9.572024517676865e-07, "logits/chosen": -0.11542798578739166, "logits/rejected": -0.012674192897975445, "logps/chosen": -1.5529954433441162, "logps/rejected": -1.849373459815979, "loss": 0.6977, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5529954433441162, "rewards/margins": 0.2963777482509613, "rewards/rejected": -1.849373459815979, "sft_loss": 1.498388648033142, "step": 1230 }, { "epoch": 0.6609800970061883, "grad_norm": 2.6988517762160065, "learning_rate": 9.565697970821593e-07, "logits/chosen": -0.08795280009508133, "logits/rejected": 0.031953997910022736, "logps/chosen": -1.6094152927398682, "logps/rejected": -1.8868507146835327, "loss": 0.6931, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6094152927398682, "rewards/margins": 0.2774355411529541, "rewards/rejected": -1.8868507146835327, "sft_loss": 1.6308940649032593, "step": 1235 }, { "epoch": 0.6636561297875899, "grad_norm": 3.1410372474228634, "learning_rate": 9.559327127544585e-07, "logits/chosen": -0.23440854251384735, "logits/rejected": -0.10367898643016815, "logps/chosen": -1.6422284841537476, "logps/rejected": -1.9875221252441406, "loss": 0.6862, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6422284841537476, "rewards/margins": 0.34529370069503784, "rewards/rejected": -1.9875221252441406, "sft_loss": 1.636867880821228, "step": 1240 }, { "epoch": 0.6663321625689914, "grad_norm": 3.3956525710849648, "learning_rate": 9.552912049655789e-07, "logits/chosen": -0.14539551734924316, "logits/rejected": 0.03338898345828056, "logps/chosen": -1.6730928421020508, "logps/rejected": -1.9549732208251953, "loss": 0.6896, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.6730928421020508, "rewards/margins": 0.28188052773475647, "rewards/rejected": -1.9549732208251953, "sft_loss": 1.6566932201385498, "step": 1245 }, { "epoch": 0.669008195350393, "grad_norm": 7.15480911411517, "learning_rate": 9.546452799394315e-07, "logits/chosen": -0.1386650800704956, "logits/rejected": 0.05835481360554695, "logps/chosen": -1.7480783462524414, "logps/rejected": -2.050711154937744, "loss": 0.6999, "rewards/accuracies": 0.59375, "rewards/chosen": -1.7480783462524414, "rewards/margins": 0.3026331067085266, "rewards/rejected": -2.050711154937744, "sft_loss": 1.7057584524154663, "step": 1250 }, { "epoch": 0.6716842281317946, "grad_norm": 5.122868397150163, "learning_rate": 9.539949439427846e-07, "logits/chosen": -0.13555511832237244, "logits/rejected": -0.009329566732048988, "logps/chosen": -1.6446784734725952, "logps/rejected": -2.0092029571533203, "loss": 0.6889, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6446784734725952, "rewards/margins": 0.3645244836807251, "rewards/rejected": -2.0092029571533203, "sft_loss": 1.6743896007537842, "step": 1255 }, { "epoch": 0.6743602609131962, "grad_norm": 4.072451407401296, "learning_rate": 9.533402032852002e-07, "logits/chosen": -0.19094884395599365, "logits/rejected": -0.06241123750805855, "logps/chosen": -1.5872955322265625, "logps/rejected": -2.1329524517059326, "loss": 0.6851, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5872955322265625, "rewards/margins": 0.5456571578979492, "rewards/rejected": -2.1329524517059326, "sft_loss": 1.610181450843811, "step": 1260 }, { "epoch": 0.6770362936945977, "grad_norm": 3.5520563975970263, "learning_rate": 9.526810643189754e-07, "logits/chosen": -0.08579371869564056, "logits/rejected": 0.05589524656534195, "logps/chosen": -1.5661556720733643, "logps/rejected": -2.0218825340270996, "loss": 0.6791, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5661556720733643, "rewards/margins": 0.45572710037231445, "rewards/rejected": -2.0218825340270996, "sft_loss": 1.5690847635269165, "step": 1265 }, { "epoch": 0.6797123264759993, "grad_norm": 2.5863182066589667, "learning_rate": 9.52017533439079e-07, "logits/chosen": -0.1772998571395874, "logits/rejected": -0.07537718117237091, "logps/chosen": -1.5737218856811523, "logps/rejected": -2.018108606338501, "loss": 0.674, "rewards/accuracies": 0.625, "rewards/chosen": -1.5737218856811523, "rewards/margins": 0.4443867802619934, "rewards/rejected": -2.018108606338501, "sft_loss": 1.608443021774292, "step": 1270 }, { "epoch": 0.6823883592574009, "grad_norm": 1.7393209176572886, "learning_rate": 9.513496170830909e-07, "logits/chosen": -0.17091652750968933, "logits/rejected": -0.06616854667663574, "logps/chosen": -1.5976166725158691, "logps/rejected": -1.964280366897583, "loss": 0.6779, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5976166725158691, "rewards/margins": 0.3666638135910034, "rewards/rejected": -1.964280366897583, "sft_loss": 1.543792486190796, "step": 1275 }, { "epoch": 0.6850643920388024, "grad_norm": 8.241710987478688, "learning_rate": 9.506773217311382e-07, "logits/chosen": -0.1482701599597931, "logits/rejected": 0.001603972166776657, "logps/chosen": -1.6374969482421875, "logps/rejected": -1.916908621788025, "loss": 0.695, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.6374969482421875, "rewards/margins": 0.2794113755226135, "rewards/rejected": -1.916908621788025, "sft_loss": 1.6501718759536743, "step": 1280 }, { "epoch": 0.687740424820204, "grad_norm": 7.128832532182297, "learning_rate": 9.500006539058334e-07, "logits/chosen": -0.12469017505645752, "logits/rejected": -0.0011989653576165438, "logps/chosen": -1.5048984289169312, "logps/rejected": -1.7811921834945679, "loss": 0.674, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5048984289169312, "rewards/margins": 0.2762937545776367, "rewards/rejected": -1.7811921834945679, "sft_loss": 1.4971561431884766, "step": 1285 }, { "epoch": 0.6904164576016056, "grad_norm": 2.5701358301942276, "learning_rate": 9.493196201722109e-07, "logits/chosen": -0.2623763978481293, "logits/rejected": -0.11838710308074951, "logps/chosen": -1.5606590509414673, "logps/rejected": -1.7689409255981445, "loss": 0.6898, "rewards/accuracies": 0.53125, "rewards/chosen": -1.5606590509414673, "rewards/margins": 0.20828208327293396, "rewards/rejected": -1.7689409255981445, "sft_loss": 1.5570552349090576, "step": 1290 }, { "epoch": 0.6930924903830072, "grad_norm": 2.995443118039142, "learning_rate": 9.486342271376628e-07, "logits/chosen": -0.15557818114757538, "logits/rejected": -0.13853155076503754, "logps/chosen": -1.557213544845581, "logps/rejected": -1.975672721862793, "loss": 0.6714, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.557213544845581, "rewards/margins": 0.4184592366218567, "rewards/rejected": -1.975672721862793, "sft_loss": 1.5470623970031738, "step": 1295 }, { "epoch": 0.6957685231644087, "grad_norm": 3.1857631474864787, "learning_rate": 9.479444814518755e-07, "logits/chosen": -0.15887439250946045, "logits/rejected": 0.08108071982860565, "logps/chosen": -1.5032182931900024, "logps/rejected": -1.9598249197006226, "loss": 0.6751, "rewards/accuracies": 0.625, "rewards/chosen": -1.5032182931900024, "rewards/margins": 0.4566067159175873, "rewards/rejected": -1.9598249197006226, "sft_loss": 1.5519423484802246, "step": 1300 }, { "epoch": 0.6984445559458103, "grad_norm": 2.9846126418031127, "learning_rate": 9.472503898067645e-07, "logits/chosen": -0.020437534898519516, "logits/rejected": 0.03360765427350998, "logps/chosen": -1.5451034307479858, "logps/rejected": -1.8681617975234985, "loss": 0.6791, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5451034307479858, "rewards/margins": 0.323058158159256, "rewards/rejected": -1.8681617975234985, "sft_loss": 1.5251609086990356, "step": 1305 }, { "epoch": 0.701120588727212, "grad_norm": 2.7939468269777143, "learning_rate": 9.465519589364099e-07, "logits/chosen": -0.04316322132945061, "logits/rejected": 0.040403760969638824, "logps/chosen": -1.49526047706604, "logps/rejected": -1.839127779006958, "loss": 0.6796, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.49526047706604, "rewards/margins": 0.34386715292930603, "rewards/rejected": -1.839127779006958, "sft_loss": 1.5057623386383057, "step": 1310 }, { "epoch": 0.7037966215086134, "grad_norm": 2.012780081071824, "learning_rate": 9.458491956169914e-07, "logits/chosen": -0.12500867247581482, "logits/rejected": 0.04848942905664444, "logps/chosen": -1.4530186653137207, "logps/rejected": -1.8870136737823486, "loss": 0.6543, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4530186653137207, "rewards/margins": 0.43399494886398315, "rewards/rejected": -1.8870136737823486, "sft_loss": 1.4522100687026978, "step": 1315 }, { "epoch": 0.706472654290015, "grad_norm": 2.3132974309008896, "learning_rate": 9.451421066667215e-07, "logits/chosen": -0.2322191447019577, "logits/rejected": -0.04070020467042923, "logps/chosen": -1.467179536819458, "logps/rejected": -1.8701632022857666, "loss": 0.6749, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.467179536819458, "rewards/margins": 0.4029836058616638, "rewards/rejected": -1.8701632022857666, "sft_loss": 1.488556146621704, "step": 1320 }, { "epoch": 0.7091486870714167, "grad_norm": 5.872677785280684, "learning_rate": 9.444306989457805e-07, "logits/chosen": -0.08441654592752457, "logits/rejected": 0.020516756922006607, "logps/chosen": -1.551811933517456, "logps/rejected": -1.8700692653656006, "loss": 0.6895, "rewards/accuracies": 0.5625, "rewards/chosen": -1.551811933517456, "rewards/margins": 0.31825751066207886, "rewards/rejected": -1.8700692653656006, "sft_loss": 1.5086562633514404, "step": 1325 }, { "epoch": 0.7118247198528181, "grad_norm": 2.6113187087154888, "learning_rate": 9.437149793562489e-07, "logits/chosen": -0.1095929890871048, "logits/rejected": 0.0020243481267243624, "logps/chosen": -1.5338222980499268, "logps/rejected": -1.770532250404358, "loss": 0.6847, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.5338222980499268, "rewards/margins": 0.23670975863933563, "rewards/rejected": -1.770532250404358, "sft_loss": 1.525867223739624, "step": 1330 }, { "epoch": 0.7145007526342197, "grad_norm": 3.6022868512669715, "learning_rate": 9.429949548420417e-07, "logits/chosen": -0.09152556955814362, "logits/rejected": -0.01481366716325283, "logps/chosen": -1.6264232397079468, "logps/rejected": -1.9128234386444092, "loss": 0.6819, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6264232397079468, "rewards/margins": 0.2864004373550415, "rewards/rejected": -1.9128234386444092, "sft_loss": 1.6053024530410767, "step": 1335 }, { "epoch": 0.7171767854156214, "grad_norm": 2.9818283371238694, "learning_rate": 9.422706323888396e-07, "logits/chosen": -0.09121497720479965, "logits/rejected": -0.053193580359220505, "logps/chosen": -1.5617108345031738, "logps/rejected": -1.851728081703186, "loss": 0.6968, "rewards/accuracies": 0.625, "rewards/chosen": -1.5617108345031738, "rewards/margins": 0.290017306804657, "rewards/rejected": -1.851728081703186, "sft_loss": 1.541918158531189, "step": 1340 }, { "epoch": 0.719852818197023, "grad_norm": 3.2254024286317704, "learning_rate": 9.415420190240225e-07, "logits/chosen": -0.05244135856628418, "logits/rejected": 0.13419947028160095, "logps/chosen": -1.527730941772461, "logps/rejected": -1.9169437885284424, "loss": 0.6789, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.527730941772461, "rewards/margins": 0.38921260833740234, "rewards/rejected": -1.9169437885284424, "sft_loss": 1.5584609508514404, "step": 1345 }, { "epoch": 0.7225288509784245, "grad_norm": 3.184994861200645, "learning_rate": 9.408091218166002e-07, "logits/chosen": -0.04931309074163437, "logits/rejected": 0.011732319369912148, "logps/chosen": -1.5444519519805908, "logps/rejected": -1.713204026222229, "loss": 0.6983, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.5444519519805908, "rewards/margins": 0.1687517762184143, "rewards/rejected": -1.713204026222229, "sft_loss": 1.5547934770584106, "step": 1350 }, { "epoch": 0.7252048837598261, "grad_norm": 2.3196954532425895, "learning_rate": 9.400719478771449e-07, "logits/chosen": -0.07705724984407425, "logits/rejected": 0.20211537182331085, "logps/chosen": -1.6409122943878174, "logps/rejected": -1.981188416481018, "loss": 0.6877, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6409122943878174, "rewards/margins": 0.34027615189552307, "rewards/rejected": -1.981188416481018, "sft_loss": 1.6130729913711548, "step": 1355 }, { "epoch": 0.7278809165412277, "grad_norm": 4.604948427878151, "learning_rate": 9.393305043577209e-07, "logits/chosen": -0.17080985009670258, "logits/rejected": -0.026930373162031174, "logps/chosen": -1.6719783544540405, "logps/rejected": -2.09201979637146, "loss": 0.6854, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.6719783544540405, "rewards/margins": 0.4200412631034851, "rewards/rejected": -2.09201979637146, "sft_loss": 1.6854654550552368, "step": 1360 }, { "epoch": 0.7305569493226292, "grad_norm": 1.9140637251332389, "learning_rate": 9.38584798451817e-07, "logits/chosen": -0.07638595253229141, "logits/rejected": 0.06088540703058243, "logps/chosen": -1.5613833665847778, "logps/rejected": -1.8865737915039062, "loss": 0.6861, "rewards/accuracies": 0.625, "rewards/chosen": -1.5613833665847778, "rewards/margins": 0.32519054412841797, "rewards/rejected": -1.8865737915039062, "sft_loss": 1.5651633739471436, "step": 1365 }, { "epoch": 0.7332329821040308, "grad_norm": 2.003458713392101, "learning_rate": 9.37834837394275e-07, "logits/chosen": -0.06622512638568878, "logits/rejected": 0.049443237483501434, "logps/chosen": -1.6531356573104858, "logps/rejected": -2.1463818550109863, "loss": 0.6654, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6531356573104858, "rewards/margins": 0.49324607849121094, "rewards/rejected": -2.1463818550109863, "sft_loss": 1.633424997329712, "step": 1370 }, { "epoch": 0.7359090148854324, "grad_norm": 3.2176370131268883, "learning_rate": 9.370806284612203e-07, "logits/chosen": -0.11674849689006805, "logits/rejected": 0.030340248718857765, "logps/chosen": -1.5437183380126953, "logps/rejected": -2.0395216941833496, "loss": 0.6638, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5437183380126953, "rewards/margins": 0.49580326676368713, "rewards/rejected": -2.0395216941833496, "sft_loss": 1.5829532146453857, "step": 1375 }, { "epoch": 0.738585047666834, "grad_norm": 4.8124347775749605, "learning_rate": 9.363221789699912e-07, "logits/chosen": -0.15775957703590393, "logits/rejected": -0.0332581028342247, "logps/chosen": -1.5887658596038818, "logps/rejected": -1.8498271703720093, "loss": 0.6951, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.5887658596038818, "rewards/margins": 0.26106134057044983, "rewards/rejected": -1.8498271703720093, "sft_loss": 1.5464975833892822, "step": 1380 }, { "epoch": 0.7412610804482355, "grad_norm": 8.86609371083185, "learning_rate": 9.355594962790682e-07, "logits/chosen": -0.1439005434513092, "logits/rejected": -0.015216085128486156, "logps/chosen": -1.4790098667144775, "logps/rejected": -1.861185073852539, "loss": 0.6906, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4790098667144775, "rewards/margins": 0.382175087928772, "rewards/rejected": -1.861185073852539, "sft_loss": 1.5054775476455688, "step": 1385 }, { "epoch": 0.7439371132296371, "grad_norm": 5.132404998842138, "learning_rate": 9.34792587788002e-07, "logits/chosen": -0.048094429075717926, "logits/rejected": 0.07193388044834137, "logps/chosen": -1.5926799774169922, "logps/rejected": -1.9508346319198608, "loss": 0.6881, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5926799774169922, "rewards/margins": 0.358154833316803, "rewards/rejected": -1.9508346319198608, "sft_loss": 1.6146783828735352, "step": 1390 }, { "epoch": 0.7466131460110387, "grad_norm": 3.231833246170535, "learning_rate": 9.34021460937342e-07, "logits/chosen": 0.00042394845513626933, "logits/rejected": 0.09290830790996552, "logps/chosen": -1.5559685230255127, "logps/rejected": -1.8194761276245117, "loss": 0.6892, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5559685230255127, "rewards/margins": 0.26350778341293335, "rewards/rejected": -1.8194761276245117, "sft_loss": 1.539088487625122, "step": 1395 }, { "epoch": 0.7492891787924402, "grad_norm": 1.4119295681940902, "learning_rate": 9.332461232085646e-07, "logits/chosen": -0.21388454735279083, "logits/rejected": -0.07617992907762527, "logps/chosen": -1.6378538608551025, "logps/rejected": -2.004629373550415, "loss": 0.6852, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.6378538608551025, "rewards/margins": 0.36677560210227966, "rewards/rejected": -2.004629373550415, "sft_loss": 1.642905831336975, "step": 1400 }, { "epoch": 0.7519652115738418, "grad_norm": 2.6573615191238322, "learning_rate": 9.324665821239998e-07, "logits/chosen": -0.11038383096456528, "logits/rejected": 0.070514015853405, "logps/chosen": -1.437596321105957, "logps/rejected": -1.957358717918396, "loss": 0.6812, "rewards/accuracies": 0.625, "rewards/chosen": -1.437596321105957, "rewards/margins": 0.5197626948356628, "rewards/rejected": -1.957358717918396, "sft_loss": 1.4881653785705566, "step": 1405 }, { "epoch": 0.7546412443552434, "grad_norm": 3.3860319941892083, "learning_rate": 9.316828452467583e-07, "logits/chosen": -0.16537480056285858, "logits/rejected": 0.008626123890280724, "logps/chosen": -1.59507155418396, "logps/rejected": -1.9980379343032837, "loss": 0.6697, "rewards/accuracies": 0.59375, "rewards/chosen": -1.59507155418396, "rewards/margins": 0.4029662609100342, "rewards/rejected": -1.9980379343032837, "sft_loss": 1.6412235498428345, "step": 1410 }, { "epoch": 0.7573172771366449, "grad_norm": 5.364636442909648, "learning_rate": 9.30894920180659e-07, "logits/chosen": -0.058947961777448654, "logits/rejected": 0.08670911192893982, "logps/chosen": -1.609555959701538, "logps/rejected": -1.789323091506958, "loss": 0.6968, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.609555959701538, "rewards/margins": 0.17976680397987366, "rewards/rejected": -1.789323091506958, "sft_loss": 1.5581715106964111, "step": 1415 }, { "epoch": 0.7599933099180465, "grad_norm": 2.6038674841340193, "learning_rate": 9.301028145701543e-07, "logits/chosen": -0.045800067484378815, "logits/rejected": 0.08673261106014252, "logps/chosen": -1.5158113241195679, "logps/rejected": -2.089888334274292, "loss": 0.6823, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5158113241195679, "rewards/margins": 0.5740770697593689, "rewards/rejected": -2.089888334274292, "sft_loss": 1.545422911643982, "step": 1420 }, { "epoch": 0.7626693426994481, "grad_norm": 1.9927918386030432, "learning_rate": 9.293065361002563e-07, "logits/chosen": -0.007310047745704651, "logits/rejected": 0.07895542681217194, "logps/chosen": -1.5619844198226929, "logps/rejected": -2.0922622680664062, "loss": 0.6698, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5619844198226929, "rewards/margins": 0.5302778482437134, "rewards/rejected": -2.0922622680664062, "sft_loss": 1.5573543310165405, "step": 1425 }, { "epoch": 0.7653453754808497, "grad_norm": 4.490498215579793, "learning_rate": 9.285060924964622e-07, "logits/chosen": -0.13646551966667175, "logits/rejected": 0.0039533572271466255, "logps/chosen": -1.6439111232757568, "logps/rejected": -1.954990029335022, "loss": 0.6815, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6439111232757568, "rewards/margins": 0.31107890605926514, "rewards/rejected": -1.954990029335022, "sft_loss": 1.572951078414917, "step": 1430 }, { "epoch": 0.7680214082622512, "grad_norm": 2.801289868801277, "learning_rate": 9.277014915246792e-07, "logits/chosen": 0.009105369448661804, "logits/rejected": 0.07025135308504105, "logps/chosen": -1.5027073621749878, "logps/rejected": -2.024282693862915, "loss": 0.6839, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5027073621749878, "rewards/margins": 0.5215753316879272, "rewards/rejected": -2.024282693862915, "sft_loss": 1.5148102045059204, "step": 1435 }, { "epoch": 0.7706974410436528, "grad_norm": 3.395666019700634, "learning_rate": 9.268927409911498e-07, "logits/chosen": -0.0959697738289833, "logits/rejected": 0.015020926482975483, "logps/chosen": -1.5443953275680542, "logps/rejected": -1.8177766799926758, "loss": 0.686, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.5443953275680542, "rewards/margins": 0.27338117361068726, "rewards/rejected": -1.8177766799926758, "sft_loss": 1.5885334014892578, "step": 1440 }, { "epoch": 0.7733734738250544, "grad_norm": 5.468030761727132, "learning_rate": 9.260798487423749e-07, "logits/chosen": -0.14049549400806427, "logits/rejected": 0.07801493257284164, "logps/chosen": -1.670945167541504, "logps/rejected": -2.048828363418579, "loss": 0.676, "rewards/accuracies": 0.59375, "rewards/chosen": -1.670945167541504, "rewards/margins": 0.3778831958770752, "rewards/rejected": -2.048828363418579, "sft_loss": 1.6721107959747314, "step": 1445 }, { "epoch": 0.7760495066064559, "grad_norm": 26.45492077745353, "learning_rate": 9.252628226650389e-07, "logits/chosen": -0.01423375029116869, "logits/rejected": 0.08405263721942902, "logps/chosen": -1.73532235622406, "logps/rejected": -1.9594646692276, "loss": 0.6902, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.73532235622406, "rewards/margins": 0.22414252161979675, "rewards/rejected": -1.9594646692276, "sft_loss": 1.6139285564422607, "step": 1450 }, { "epoch": 0.7787255393878575, "grad_norm": 1.6370612121298826, "learning_rate": 9.244416706859321e-07, "logits/chosen": -0.0711984857916832, "logits/rejected": 0.09706473350524902, "logps/chosen": -1.7322229146957397, "logps/rejected": -2.187854051589966, "loss": 0.676, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7322229146957397, "rewards/margins": 0.4556312561035156, "rewards/rejected": -2.187854051589966, "sft_loss": 1.666243314743042, "step": 1455 }, { "epoch": 0.7814015721692591, "grad_norm": 1.3578427220948361, "learning_rate": 9.23616400771875e-07, "logits/chosen": -0.02941352128982544, "logits/rejected": 0.1412985622882843, "logps/chosen": -1.897125244140625, "logps/rejected": -2.1788127422332764, "loss": 0.688, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.897125244140625, "rewards/margins": 0.2816876471042633, "rewards/rejected": -2.1788127422332764, "sft_loss": 1.6443207263946533, "step": 1460 }, { "epoch": 0.7840776049506607, "grad_norm": 1.1439248265728157, "learning_rate": 9.227870209296395e-07, "logits/chosen": -0.016499606892466545, "logits/rejected": 0.09893260896205902, "logps/chosen": -1.961816430091858, "logps/rejected": -2.18733549118042, "loss": 0.698, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.961816430091858, "rewards/margins": 0.22551891207695007, "rewards/rejected": -2.18733549118042, "sft_loss": 1.7368465662002563, "step": 1465 }, { "epoch": 0.7867536377320622, "grad_norm": 1.2935551708071662, "learning_rate": 9.219535392058728e-07, "logits/chosen": -0.10018036514520645, "logits/rejected": -0.0658569410443306, "logps/chosen": -2.0715017318725586, "logps/rejected": -2.278127431869507, "loss": 0.7069, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.0715017318725586, "rewards/margins": 0.20662562549114227, "rewards/rejected": -2.278127431869507, "sft_loss": 1.8013073205947876, "step": 1470 }, { "epoch": 0.7894296705134638, "grad_norm": 1.3289250577714493, "learning_rate": 9.211159636870181e-07, "logits/chosen": -0.12351454794406891, "logits/rejected": 0.05726455897092819, "logps/chosen": -1.958917260169983, "logps/rejected": -2.243590831756592, "loss": 0.6972, "rewards/accuracies": 0.5625, "rewards/chosen": -1.958917260169983, "rewards/margins": 0.28467339277267456, "rewards/rejected": -2.243590831756592, "sft_loss": 1.6803343296051025, "step": 1475 }, { "epoch": 0.7921057032948654, "grad_norm": 1.482748397618818, "learning_rate": 9.202743024992367e-07, "logits/chosen": -0.02373456209897995, "logits/rejected": 0.0855960100889206, "logps/chosen": -2.0908074378967285, "logps/rejected": -2.3277366161346436, "loss": 0.7072, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.0908074378967285, "rewards/margins": 0.23692938685417175, "rewards/rejected": -2.3277366161346436, "sft_loss": 1.735292673110962, "step": 1480 }, { "epoch": 0.7947817360762669, "grad_norm": 1.643821120380416, "learning_rate": 9.194285638083293e-07, "logits/chosen": -0.027933578938245773, "logits/rejected": 0.1411551535129547, "logps/chosen": -1.9436454772949219, "logps/rejected": -2.229135036468506, "loss": 0.6911, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.9436454772949219, "rewards/margins": 0.2854893207550049, "rewards/rejected": -2.229135036468506, "sft_loss": 1.614961862564087, "step": 1485 }, { "epoch": 0.7974577688576685, "grad_norm": 1.893708518906089, "learning_rate": 9.185787558196562e-07, "logits/chosen": -0.1228971853852272, "logits/rejected": -0.0020554482471197844, "logps/chosen": -1.8631207942962646, "logps/rejected": -2.0909080505371094, "loss": 0.7057, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.8631207942962646, "rewards/margins": 0.2277870923280716, "rewards/rejected": -2.0909080505371094, "sft_loss": 1.654308557510376, "step": 1490 }, { "epoch": 0.8001338016390701, "grad_norm": 1.9499816452217278, "learning_rate": 9.177248867780583e-07, "logits/chosen": -0.11362478882074356, "logits/rejected": -0.0030496090184897184, "logps/chosen": -2.018123149871826, "logps/rejected": -2.1381494998931885, "loss": 0.7083, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.018123149871826, "rewards/margins": 0.12002629041671753, "rewards/rejected": -2.1381494998931885, "sft_loss": 1.8265842199325562, "step": 1495 }, { "epoch": 0.8028098344204716, "grad_norm": 1.6020532710600257, "learning_rate": 9.168669649677769e-07, "logits/chosen": -0.12686367332935333, "logits/rejected": -0.023370999842882156, "logps/chosen": -1.9039256572723389, "logps/rejected": -2.173767566680908, "loss": 0.7034, "rewards/accuracies": 0.53125, "rewards/chosen": -1.9039256572723389, "rewards/margins": 0.2698422074317932, "rewards/rejected": -2.173767566680908, "sft_loss": 1.7249925136566162, "step": 1500 }, { "epoch": 0.8054858672018732, "grad_norm": 2.7728240416119223, "learning_rate": 9.16004998712373e-07, "logits/chosen": -0.07930402457714081, "logits/rejected": -0.011636780574917793, "logps/chosen": -1.7571052312850952, "logps/rejected": -2.135777711868286, "loss": 0.6826, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7571052312850952, "rewards/margins": 0.378672331571579, "rewards/rejected": -2.135777711868286, "sft_loss": 1.5940500497817993, "step": 1505 }, { "epoch": 0.8081618999832748, "grad_norm": 1.6316113332161346, "learning_rate": 9.151389963746472e-07, "logits/chosen": -0.15546007454395294, "logits/rejected": 0.11401049792766571, "logps/chosen": -1.739991545677185, "logps/rejected": -2.1746387481689453, "loss": 0.6812, "rewards/accuracies": 0.6875, "rewards/chosen": -1.739991545677185, "rewards/margins": 0.4346471428871155, "rewards/rejected": -2.1746387481689453, "sft_loss": 1.6523933410644531, "step": 1510 }, { "epoch": 0.8108379327646764, "grad_norm": 2.8595704944653146, "learning_rate": 9.142689663565577e-07, "logits/chosen": -0.08268715441226959, "logits/rejected": -0.013263854198157787, "logps/chosen": -1.6454627513885498, "logps/rejected": -2.0212502479553223, "loss": 0.689, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.6454627513885498, "rewards/margins": 0.3757875859737396, "rewards/rejected": -2.0212502479553223, "sft_loss": 1.592815637588501, "step": 1515 }, { "epoch": 0.8135139655460779, "grad_norm": 4.485806385390208, "learning_rate": 9.133949170991397e-07, "logits/chosen": -0.06977352499961853, "logits/rejected": 0.022602787241339684, "logps/chosen": -1.679253339767456, "logps/rejected": -1.99455988407135, "loss": 0.6847, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.679253339767456, "rewards/margins": 0.3153064250946045, "rewards/rejected": -1.99455988407135, "sft_loss": 1.725643515586853, "step": 1520 }, { "epoch": 0.8161899983274795, "grad_norm": 1.5757725922095123, "learning_rate": 9.125168570824231e-07, "logits/chosen": -0.1212872862815857, "logits/rejected": 0.056601427495479584, "logps/chosen": -1.6112515926361084, "logps/rejected": -1.9420926570892334, "loss": 0.6834, "rewards/accuracies": 0.625, "rewards/chosen": -1.6112515926361084, "rewards/margins": 0.3308410048484802, "rewards/rejected": -1.9420926570892334, "sft_loss": 1.5881706476211548, "step": 1525 }, { "epoch": 0.8188660311088811, "grad_norm": 2.800830308826896, "learning_rate": 9.116347948253496e-07, "logits/chosen": -0.12238524109125137, "logits/rejected": -0.00532518932595849, "logps/chosen": -1.6283352375030518, "logps/rejected": -1.928149938583374, "loss": 0.6924, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6283352375030518, "rewards/margins": 0.2998148798942566, "rewards/rejected": -1.928149938583374, "sft_loss": 1.5983941555023193, "step": 1530 }, { "epoch": 0.8215420638902826, "grad_norm": 2.0165413656926114, "learning_rate": 9.107487388856916e-07, "logits/chosen": -0.12431217730045319, "logits/rejected": 0.05323215574026108, "logps/chosen": -1.5410443544387817, "logps/rejected": -1.9448493719100952, "loss": 0.6646, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5410443544387817, "rewards/margins": 0.4038047790527344, "rewards/rejected": -1.9448493719100952, "sft_loss": 1.5656566619873047, "step": 1535 }, { "epoch": 0.8242180966716842, "grad_norm": 3.6027657530567905, "learning_rate": 9.098586978599673e-07, "logits/chosen": -0.09710733592510223, "logits/rejected": 0.0640190914273262, "logps/chosen": -1.5832998752593994, "logps/rejected": -2.163719892501831, "loss": 0.6669, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5832998752593994, "rewards/margins": 0.5804203748703003, "rewards/rejected": -2.163719892501831, "sft_loss": 1.5556138753890991, "step": 1540 }, { "epoch": 0.8268941294530858, "grad_norm": 4.383434026201945, "learning_rate": 9.089646803833588e-07, "logits/chosen": -0.05685956031084061, "logits/rejected": 0.10388834774494171, "logps/chosen": -1.5632593631744385, "logps/rejected": -1.8989818096160889, "loss": 0.6938, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5632593631744385, "rewards/margins": 0.33572250604629517, "rewards/rejected": -1.8989818096160889, "sft_loss": 1.5883595943450928, "step": 1545 }, { "epoch": 0.8295701622344873, "grad_norm": 3.579654924226385, "learning_rate": 9.080666951296276e-07, "logits/chosen": -0.21645978093147278, "logits/rejected": 0.0461578443646431, "logps/chosen": -1.6038854122161865, "logps/rejected": -2.0388576984405518, "loss": 0.6847, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6038854122161865, "rewards/margins": 0.43497198820114136, "rewards/rejected": -2.0388576984405518, "sft_loss": 1.6251455545425415, "step": 1550 }, { "epoch": 0.8322461950158889, "grad_norm": 1.449373832558233, "learning_rate": 9.071647508110305e-07, "logits/chosen": -0.17601588368415833, "logits/rejected": 0.07178305834531784, "logps/chosen": -1.627812147140503, "logps/rejected": -2.0813026428222656, "loss": 0.6906, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.627812147140503, "rewards/margins": 0.4534904360771179, "rewards/rejected": -2.0813026428222656, "sft_loss": 1.5838168859481812, "step": 1555 }, { "epoch": 0.8349222277972905, "grad_norm": 2.3986835135715947, "learning_rate": 9.062588561782354e-07, "logits/chosen": -0.053179144859313965, "logits/rejected": 0.019892878830432892, "logps/chosen": -1.6524696350097656, "logps/rejected": -1.9607833623886108, "loss": 0.67, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6524696350097656, "rewards/margins": 0.30831378698349, "rewards/rejected": -1.9607833623886108, "sft_loss": 1.6692003011703491, "step": 1560 }, { "epoch": 0.8375982605786921, "grad_norm": 1.8093814907030827, "learning_rate": 9.053490200202358e-07, "logits/chosen": -0.05682498216629028, "logits/rejected": 0.041855137795209885, "logps/chosen": -1.638074278831482, "logps/rejected": -1.9424121379852295, "loss": 0.6873, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.638074278831482, "rewards/margins": 0.30433768033981323, "rewards/rejected": -1.9424121379852295, "sft_loss": 1.6524169445037842, "step": 1565 }, { "epoch": 0.8402742933600936, "grad_norm": 3.979211925428682, "learning_rate": 9.044352511642661e-07, "logits/chosen": -0.025512274354696274, "logits/rejected": -0.005897931754589081, "logps/chosen": -1.5600687265396118, "logps/rejected": -1.8334335088729858, "loss": 0.6876, "rewards/accuracies": 0.625, "rewards/chosen": -1.5600687265396118, "rewards/margins": 0.273364782333374, "rewards/rejected": -1.8334335088729858, "sft_loss": 1.5935394763946533, "step": 1570 }, { "epoch": 0.8429503261414952, "grad_norm": 5.017621580214614, "learning_rate": 9.03517558475716e-07, "logits/chosen": -0.07320438325405121, "logits/rejected": 0.03042033314704895, "logps/chosen": -1.5760712623596191, "logps/rejected": -1.838953971862793, "loss": 0.6916, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5760712623596191, "rewards/margins": 0.2628825902938843, "rewards/rejected": -1.838953971862793, "sft_loss": 1.5679020881652832, "step": 1575 }, { "epoch": 0.8456263589228968, "grad_norm": 2.770920413022084, "learning_rate": 9.025959508580436e-07, "logits/chosen": -0.027145802974700928, "logits/rejected": 0.2162061482667923, "logps/chosen": -1.6395962238311768, "logps/rejected": -2.0249321460723877, "loss": 0.6722, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6395962238311768, "rewards/margins": 0.3853360712528229, "rewards/rejected": -2.0249321460723877, "sft_loss": 1.6166515350341797, "step": 1580 }, { "epoch": 0.8483023917042983, "grad_norm": 2.6484303604459223, "learning_rate": 9.016704372526905e-07, "logits/chosen": -0.08917608112096786, "logits/rejected": 0.06780953705310822, "logps/chosen": -1.50400972366333, "logps/rejected": -2.0031304359436035, "loss": 0.6684, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.50400972366333, "rewards/margins": 0.4991206228733063, "rewards/rejected": -2.0031304359436035, "sft_loss": 1.498430609703064, "step": 1585 }, { "epoch": 0.8509784244856999, "grad_norm": 10.533324718921243, "learning_rate": 9.007410266389934e-07, "logits/chosen": -0.11556092649698257, "logits/rejected": -0.02518743835389614, "logps/chosen": -1.5511956214904785, "logps/rejected": -1.8880646228790283, "loss": 0.6763, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5511956214904785, "rewards/margins": 0.33686885237693787, "rewards/rejected": -1.8880646228790283, "sft_loss": 1.6106898784637451, "step": 1590 }, { "epoch": 0.8536544572671015, "grad_norm": 4.531173815612611, "learning_rate": 8.998077280340981e-07, "logits/chosen": -0.03725462406873703, "logits/rejected": 0.05171867460012436, "logps/chosen": -1.7447688579559326, "logps/rejected": -1.9502300024032593, "loss": 0.7021, "rewards/accuracies": 0.5625, "rewards/chosen": -1.7447688579559326, "rewards/margins": 0.20546121895313263, "rewards/rejected": -1.9502300024032593, "sft_loss": 1.691070556640625, "step": 1595 }, { "epoch": 0.8563304900485031, "grad_norm": 2.398422908898425, "learning_rate": 8.988705504928722e-07, "logits/chosen": -0.16255098581314087, "logits/rejected": 0.03137350454926491, "logps/chosen": -1.6325353384017944, "logps/rejected": -2.1362509727478027, "loss": 0.6744, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.6325353384017944, "rewards/margins": 0.5037158727645874, "rewards/rejected": -2.1362509727478027, "sft_loss": 1.6324422359466553, "step": 1600 }, { "epoch": 0.8563304900485031, "eval_logits/chosen": 0.1912696212530136, "eval_logits/rejected": 0.28272679448127747, "eval_logps/chosen": -1.6092780828475952, "eval_logps/rejected": -2.0407536029815674, "eval_loss": 0.6739305257797241, "eval_rewards/accuracies": 0.6335311532020569, "eval_rewards/chosen": -1.6092780828475952, "eval_rewards/margins": 0.4314754903316498, "eval_rewards/rejected": -2.0407536029815674, "eval_runtime": 46.9151, "eval_samples_per_second": 28.669, "eval_sft_loss": 1.595874309539795, "eval_steps_per_second": 7.183, "step": 1600 }, { "epoch": 0.8590065228299046, "grad_norm": 2.325941888707781, "learning_rate": 8.979295031078157e-07, "logits/chosen": -0.15885820984840393, "logits/rejected": 0.061292119324207306, "logps/chosen": -1.6002050638198853, "logps/rejected": -2.0625171661376953, "loss": 0.6704, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6002050638198853, "rewards/margins": 0.46231213212013245, "rewards/rejected": -2.0625171661376953, "sft_loss": 1.5941084623336792, "step": 1605 }, { "epoch": 0.8616825556113062, "grad_norm": 2.6887603502483137, "learning_rate": 8.969845950089751e-07, "logits/chosen": -0.16491663455963135, "logits/rejected": 0.005011633038520813, "logps/chosen": -1.5596110820770264, "logps/rejected": -2.1306350231170654, "loss": 0.6768, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5596110820770264, "rewards/margins": 0.5710240602493286, "rewards/rejected": -2.1306350231170654, "sft_loss": 1.5888437032699585, "step": 1610 }, { "epoch": 0.8643585883927078, "grad_norm": 3.5086793422707188, "learning_rate": 8.960358353638526e-07, "logits/chosen": -0.11782260239124298, "logits/rejected": -0.012947884388267994, "logps/chosen": -1.6803264617919922, "logps/rejected": -2.164822816848755, "loss": 0.6761, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6803264617919922, "rewards/margins": 0.48449650406837463, "rewards/rejected": -2.164822816848755, "sft_loss": 1.6974128484725952, "step": 1615 }, { "epoch": 0.8670346211741093, "grad_norm": 2.736107063051043, "learning_rate": 8.950832333773184e-07, "logits/chosen": -0.06576915830373764, "logits/rejected": 0.08024446666240692, "logps/chosen": -1.509300947189331, "logps/rejected": -1.94968581199646, "loss": 0.6684, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.509300947189331, "rewards/margins": 0.44038495421409607, "rewards/rejected": -1.94968581199646, "sft_loss": 1.513148546218872, "step": 1620 }, { "epoch": 0.869710653955511, "grad_norm": 2.1306604254638954, "learning_rate": 8.941267982915213e-07, "logits/chosen": -0.003141905413940549, "logits/rejected": 0.04678082466125488, "logps/chosen": -1.6815744638442993, "logps/rejected": -1.9233620166778564, "loss": 0.6948, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6815744638442993, "rewards/margins": 0.24178750813007355, "rewards/rejected": -1.9233620166778564, "sft_loss": 1.6074094772338867, "step": 1625 }, { "epoch": 0.8723866867369126, "grad_norm": 5.170181882532501, "learning_rate": 8.931665393857983e-07, "logits/chosen": -0.0599781759083271, "logits/rejected": 0.08817584812641144, "logps/chosen": -1.5599714517593384, "logps/rejected": -1.941277265548706, "loss": 0.6729, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5599714517593384, "rewards/margins": 0.381305992603302, "rewards/rejected": -1.941277265548706, "sft_loss": 1.5579124689102173, "step": 1630 }, { "epoch": 0.875062719518314, "grad_norm": 5.875083105284614, "learning_rate": 8.922024659765861e-07, "logits/chosen": -0.16240473091602325, "logits/rejected": -0.04438484460115433, "logps/chosen": -1.4770807027816772, "logps/rejected": -1.9241431951522827, "loss": 0.6772, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4770807027816772, "rewards/margins": 0.4470624029636383, "rewards/rejected": -1.9241431951522827, "sft_loss": 1.5148780345916748, "step": 1635 }, { "epoch": 0.8777387522997157, "grad_norm": 2.29849411000814, "learning_rate": 8.912345874173288e-07, "logits/chosen": -0.14882834255695343, "logits/rejected": -0.03332848101854324, "logps/chosen": -1.491604208946228, "logps/rejected": -1.9753957986831665, "loss": 0.6773, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.491604208946228, "rewards/margins": 0.4837915301322937, "rewards/rejected": -1.9753957986831665, "sft_loss": 1.5202728509902954, "step": 1640 }, { "epoch": 0.8804147850811173, "grad_norm": 5.056281533703873, "learning_rate": 8.902629130983885e-07, "logits/chosen": -0.0903366208076477, "logits/rejected": -0.03441023826599121, "logps/chosen": -1.5394830703735352, "logps/rejected": -1.837863564491272, "loss": 0.6886, "rewards/accuracies": 0.625, "rewards/chosen": -1.5394830703735352, "rewards/margins": 0.2983805239200592, "rewards/rejected": -1.837863564491272, "sft_loss": 1.5584405660629272, "step": 1645 }, { "epoch": 0.8830908178625189, "grad_norm": 5.480908431753782, "learning_rate": 8.892874524469537e-07, "logits/chosen": -0.01808544062077999, "logits/rejected": 0.051542092114686966, "logps/chosen": -1.515172004699707, "logps/rejected": -1.8816455602645874, "loss": 0.6712, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.515172004699707, "rewards/margins": 0.36647361516952515, "rewards/rejected": -1.8816455602645874, "sft_loss": 1.4930051565170288, "step": 1650 }, { "epoch": 0.8857668506439204, "grad_norm": 4.592937357483597, "learning_rate": 8.883082149269478e-07, "logits/chosen": -0.14737102389335632, "logits/rejected": -0.03177300840616226, "logps/chosen": -1.5563279390335083, "logps/rejected": -1.9180982112884521, "loss": 0.7019, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5563279390335083, "rewards/margins": 0.3617701232433319, "rewards/rejected": -1.9180982112884521, "sft_loss": 1.5393438339233398, "step": 1655 }, { "epoch": 0.888442883425322, "grad_norm": 3.821892841718335, "learning_rate": 8.873252100389377e-07, "logits/chosen": -0.031492680311203, "logits/rejected": -0.0020574121735990047, "logps/chosen": -1.4627248048782349, "logps/rejected": -1.8346326351165771, "loss": 0.6765, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4627248048782349, "rewards/margins": 0.37190794944763184, "rewards/rejected": -1.8346326351165771, "sft_loss": 1.411965012550354, "step": 1660 }, { "epoch": 0.8911189162067236, "grad_norm": 3.712972200243005, "learning_rate": 8.863384473200411e-07, "logits/chosen": -0.07665841281414032, "logits/rejected": -0.0004884630325250328, "logps/chosen": -1.6424614191055298, "logps/rejected": -1.8931677341461182, "loss": 0.6884, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.6424614191055298, "rewards/margins": 0.2507062554359436, "rewards/rejected": -1.8931677341461182, "sft_loss": 1.5956447124481201, "step": 1665 }, { "epoch": 0.8937949489881251, "grad_norm": 3.8721602308327197, "learning_rate": 8.853479363438342e-07, "logits/chosen": -0.042667657136917114, "logits/rejected": 0.1281338334083557, "logps/chosen": -1.6979026794433594, "logps/rejected": -2.0013856887817383, "loss": 0.7094, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.6979026794433594, "rewards/margins": 0.3034830689430237, "rewards/rejected": -2.0013856887817383, "sft_loss": 1.6016740798950195, "step": 1670 }, { "epoch": 0.8964709817695267, "grad_norm": 2.855384143369643, "learning_rate": 8.843536867202588e-07, "logits/chosen": -0.07494497299194336, "logits/rejected": 0.1362690031528473, "logps/chosen": -1.6354612112045288, "logps/rejected": -2.150341749191284, "loss": 0.6934, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.6354612112045288, "rewards/margins": 0.514880359172821, "rewards/rejected": -2.150341749191284, "sft_loss": 1.6575100421905518, "step": 1675 }, { "epoch": 0.8991470145509283, "grad_norm": 5.104972393628471, "learning_rate": 8.833557080955292e-07, "logits/chosen": -0.1633204072713852, "logits/rejected": -0.05091344565153122, "logps/chosen": -1.6553776264190674, "logps/rejected": -2.027411699295044, "loss": 0.6862, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.6553776264190674, "rewards/margins": 0.3720341622829437, "rewards/rejected": -2.027411699295044, "sft_loss": 1.6414705514907837, "step": 1680 }, { "epoch": 0.9018230473323299, "grad_norm": 3.8592230555707423, "learning_rate": 8.823540101520381e-07, "logits/chosen": -0.12714692950248718, "logits/rejected": 0.11072331666946411, "logps/chosen": -1.6420742273330688, "logps/rejected": -2.115112781524658, "loss": 0.6898, "rewards/accuracies": 0.59375, "rewards/chosen": -1.6420742273330688, "rewards/margins": 0.47303861379623413, "rewards/rejected": -2.115112781524658, "sft_loss": 1.635709524154663, "step": 1685 }, { "epoch": 0.9044990801137314, "grad_norm": 2.9593871241841145, "learning_rate": 8.813486026082637e-07, "logits/chosen": -0.1302056759595871, "logits/rejected": 0.061131738126277924, "logps/chosen": -1.5277483463287354, "logps/rejected": -2.047550916671753, "loss": 0.6678, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5277483463287354, "rewards/margins": 0.5198026895523071, "rewards/rejected": -2.047550916671753, "sft_loss": 1.588708519935608, "step": 1690 }, { "epoch": 0.907175112895133, "grad_norm": 5.614753547947297, "learning_rate": 8.803394952186742e-07, "logits/chosen": -0.2522553503513336, "logits/rejected": -0.10259109735488892, "logps/chosen": -1.6107975244522095, "logps/rejected": -2.0465805530548096, "loss": 0.6766, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6107975244522095, "rewards/margins": 0.4357830584049225, "rewards/rejected": -2.0465805530548096, "sft_loss": 1.6689703464508057, "step": 1695 }, { "epoch": 0.9098511456765346, "grad_norm": 7.2355387166733465, "learning_rate": 8.793266977736342e-07, "logits/chosen": -0.07041691988706589, "logits/rejected": -0.10695245116949081, "logps/chosen": -1.628130316734314, "logps/rejected": -1.8095182180404663, "loss": 0.6951, "rewards/accuracies": 0.53125, "rewards/chosen": -1.628130316734314, "rewards/margins": 0.18138787150382996, "rewards/rejected": -1.8095182180404663, "sft_loss": 1.6410319805145264, "step": 1700 }, { "epoch": 0.9125271784579361, "grad_norm": 2.962926044065382, "learning_rate": 8.783102200993085e-07, "logits/chosen": -0.07322728633880615, "logits/rejected": 0.07346437871456146, "logps/chosen": -1.5739026069641113, "logps/rejected": -1.9100176095962524, "loss": 0.6844, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5739026069641113, "rewards/margins": 0.3361150622367859, "rewards/rejected": -1.9100176095962524, "sft_loss": 1.5770145654678345, "step": 1705 }, { "epoch": 0.9152032112393377, "grad_norm": 2.9598461510264396, "learning_rate": 8.772900720575683e-07, "logits/chosen": -0.11749941110610962, "logits/rejected": -0.03795120120048523, "logps/chosen": -1.5487481355667114, "logps/rejected": -1.897014856338501, "loss": 0.681, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5487481355667114, "rewards/margins": 0.3482665419578552, "rewards/rejected": -1.897014856338501, "sft_loss": 1.5692429542541504, "step": 1710 }, { "epoch": 0.9178792440207393, "grad_norm": 2.147733896750781, "learning_rate": 8.762662635458944e-07, "logits/chosen": -0.11868778616189957, "logits/rejected": 0.09065614640712738, "logps/chosen": -1.6962509155273438, "logps/rejected": -2.0103964805603027, "loss": 0.6931, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.6962509155273438, "rewards/margins": 0.3141458034515381, "rewards/rejected": -2.0103964805603027, "sft_loss": 1.630626916885376, "step": 1715 }, { "epoch": 0.9205552768021408, "grad_norm": 3.1619078999272006, "learning_rate": 8.752388044972811e-07, "logits/chosen": -0.09264856576919556, "logits/rejected": -0.01981327496469021, "logps/chosen": -1.5092804431915283, "logps/rejected": -1.969607949256897, "loss": 0.6664, "rewards/accuracies": 0.625, "rewards/chosen": -1.5092804431915283, "rewards/margins": 0.46032723784446716, "rewards/rejected": -1.969607949256897, "sft_loss": 1.4860968589782715, "step": 1720 }, { "epoch": 0.9232313095835424, "grad_norm": 2.8083488278583673, "learning_rate": 8.74207704880141e-07, "logits/chosen": -0.0894165113568306, "logits/rejected": 0.018659692257642746, "logps/chosen": -1.6026160717010498, "logps/rejected": -2.1429691314697266, "loss": 0.6692, "rewards/accuracies": 0.625, "rewards/chosen": -1.6026160717010498, "rewards/margins": 0.5403528213500977, "rewards/rejected": -2.1429691314697266, "sft_loss": 1.633073091506958, "step": 1725 }, { "epoch": 0.925907342364944, "grad_norm": 7.867167915315164, "learning_rate": 8.731729746982068e-07, "logits/chosen": -0.0508013479411602, "logits/rejected": 0.010919039137661457, "logps/chosen": -1.5411659479141235, "logps/rejected": -1.8381420373916626, "loss": 0.6846, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.5411659479141235, "rewards/margins": 0.2969761788845062, "rewards/rejected": -1.8381420373916626, "sft_loss": 1.5529801845550537, "step": 1730 }, { "epoch": 0.9285833751463456, "grad_norm": 2.748657377959956, "learning_rate": 8.721346239904355e-07, "logits/chosen": -0.18835203349590302, "logits/rejected": -0.023120930418372154, "logps/chosen": -1.5490851402282715, "logps/rejected": -2.185800075531006, "loss": 0.6654, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5490851402282715, "rewards/margins": 0.6367148160934448, "rewards/rejected": -2.185800075531006, "sft_loss": 1.473778486251831, "step": 1735 }, { "epoch": 0.9312594079277471, "grad_norm": 1.7240456217497198, "learning_rate": 8.710926628309101e-07, "logits/chosen": -0.14346204698085785, "logits/rejected": -0.001293714391067624, "logps/chosen": -1.5688560009002686, "logps/rejected": -2.0389304161071777, "loss": 0.6728, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5688560009002686, "rewards/margins": 0.4700745642185211, "rewards/rejected": -2.0389304161071777, "sft_loss": 1.500208854675293, "step": 1740 }, { "epoch": 0.9339354407091487, "grad_norm": 3.8100275253920333, "learning_rate": 8.700471013287424e-07, "logits/chosen": -0.09344641864299774, "logits/rejected": -0.05481022596359253, "logps/chosen": -1.5765666961669922, "logps/rejected": -1.9928195476531982, "loss": 0.6807, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5765666961669922, "rewards/margins": 0.41625308990478516, "rewards/rejected": -1.9928195476531982, "sft_loss": 1.609368920326233, "step": 1745 }, { "epoch": 0.9366114734905503, "grad_norm": 3.2646387720028565, "learning_rate": 8.689979496279746e-07, "logits/chosen": -0.12338890880346298, "logits/rejected": -0.06230727955698967, "logps/chosen": -1.6930673122406006, "logps/rejected": -2.1141300201416016, "loss": 0.6792, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6930673122406006, "rewards/margins": 0.42106279730796814, "rewards/rejected": -2.1141300201416016, "sft_loss": 1.6491073369979858, "step": 1750 }, { "epoch": 0.9392875062719518, "grad_norm": 2.559943077562952, "learning_rate": 8.679452179074811e-07, "logits/chosen": -0.1306913197040558, "logits/rejected": -0.03029678203165531, "logps/chosen": -1.5664455890655518, "logps/rejected": -1.947819709777832, "loss": 0.6754, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5664455890655518, "rewards/margins": 0.38137391209602356, "rewards/rejected": -1.947819709777832, "sft_loss": 1.540327548980713, "step": 1755 }, { "epoch": 0.9419635390533534, "grad_norm": 2.22133408064078, "learning_rate": 8.668889163808698e-07, "logits/chosen": -0.1324535310268402, "logits/rejected": -0.015176964923739433, "logps/chosen": -1.5364208221435547, "logps/rejected": -1.9308210611343384, "loss": 0.6769, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5364208221435547, "rewards/margins": 0.394400417804718, "rewards/rejected": -1.9308210611343384, "sft_loss": 1.5744932889938354, "step": 1760 }, { "epoch": 0.944639571834755, "grad_norm": 3.5146639998653613, "learning_rate": 8.658290552963827e-07, "logits/chosen": -0.05859723687171936, "logits/rejected": -0.03487427160143852, "logps/chosen": -1.556701898574829, "logps/rejected": -1.9352022409439087, "loss": 0.6918, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.556701898574829, "rewards/margins": 0.37850040197372437, "rewards/rejected": -1.9352022409439087, "sft_loss": 1.5356924533843994, "step": 1765 }, { "epoch": 0.9473156046161565, "grad_norm": 2.3927352220671403, "learning_rate": 8.647656449367966e-07, "logits/chosen": -0.07654085010290146, "logits/rejected": 0.0713915079832077, "logps/chosen": -1.6044076681137085, "logps/rejected": -1.892507553100586, "loss": 0.6759, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6044076681137085, "rewards/margins": 0.28809982538223267, "rewards/rejected": -1.892507553100586, "sft_loss": 1.644179105758667, "step": 1770 }, { "epoch": 0.9499916373975581, "grad_norm": 5.443172287517489, "learning_rate": 8.636986956193235e-07, "logits/chosen": -0.12157295644283295, "logits/rejected": -0.029175758361816406, "logps/chosen": -1.4955778121948242, "logps/rejected": -1.8605883121490479, "loss": 0.6741, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4955778121948242, "rewards/margins": 0.36501047015190125, "rewards/rejected": -1.8605883121490479, "sft_loss": 1.5303313732147217, "step": 1775 }, { "epoch": 0.9526676701789597, "grad_norm": 3.711177108393341, "learning_rate": 8.626282176955104e-07, "logits/chosen": -0.14020344614982605, "logits/rejected": -0.01927454024553299, "logps/chosen": -1.5508153438568115, "logps/rejected": -1.9805715084075928, "loss": 0.6793, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5508153438568115, "rewards/margins": 0.4297560751438141, "rewards/rejected": -1.9805715084075928, "sft_loss": 1.4936689138412476, "step": 1780 }, { "epoch": 0.9553437029603613, "grad_norm": 2.444443823624952, "learning_rate": 8.615542215511389e-07, "logits/chosen": -0.027949964627623558, "logits/rejected": 0.04656906798481941, "logps/chosen": -1.5221688747406006, "logps/rejected": -1.7465623617172241, "loss": 0.6931, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5221688747406006, "rewards/margins": 0.22439360618591309, "rewards/rejected": -1.7465623617172241, "sft_loss": 1.4782495498657227, "step": 1785 }, { "epoch": 0.9580197357417628, "grad_norm": 4.113717811033466, "learning_rate": 8.604767176061241e-07, "logits/chosen": -0.008868610486388206, "logits/rejected": 0.04976072162389755, "logps/chosen": -1.5966230630874634, "logps/rejected": -1.966228723526001, "loss": 0.679, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5966230630874634, "rewards/margins": 0.36960554122924805, "rewards/rejected": -1.966228723526001, "sft_loss": 1.589078664779663, "step": 1790 }, { "epoch": 0.9606957685231644, "grad_norm": 3.257044099078646, "learning_rate": 8.593957163144141e-07, "logits/chosen": -0.1507941633462906, "logits/rejected": -0.010530698113143444, "logps/chosen": -1.4972823858261108, "logps/rejected": -2.0315799713134766, "loss": 0.6675, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4972823858261108, "rewards/margins": 0.5342975854873657, "rewards/rejected": -2.0315799713134766, "sft_loss": 1.5237815380096436, "step": 1795 }, { "epoch": 0.963371801304566, "grad_norm": 2.2342951425617086, "learning_rate": 8.58311228163888e-07, "logits/chosen": -0.10327861458063126, "logits/rejected": -0.022930169478058815, "logps/chosen": -1.5314559936523438, "logps/rejected": -1.8455177545547485, "loss": 0.6798, "rewards/accuracies": 0.625, "rewards/chosen": -1.5314559936523438, "rewards/margins": 0.31406185030937195, "rewards/rejected": -1.8455177545547485, "sft_loss": 1.5912697315216064, "step": 1800 }, { "epoch": 0.9660478340859675, "grad_norm": 3.4486990877578068, "learning_rate": 8.57223263676255e-07, "logits/chosen": -0.23526854813098907, "logits/rejected": -0.1062394380569458, "logps/chosen": -1.4703363180160522, "logps/rejected": -2.0053153038024902, "loss": 0.6678, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4703363180160522, "rewards/margins": 0.5349791646003723, "rewards/rejected": -2.0053153038024902, "sft_loss": 1.5218839645385742, "step": 1805 }, { "epoch": 0.9687238668673691, "grad_norm": 5.274171248765637, "learning_rate": 8.561318334069511e-07, "logits/chosen": -0.10490190982818604, "logits/rejected": 0.04200034216046333, "logps/chosen": -1.5042073726654053, "logps/rejected": -1.8845596313476562, "loss": 0.6852, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5042073726654053, "rewards/margins": 0.3803521990776062, "rewards/rejected": -1.8845596313476562, "sft_loss": 1.5251801013946533, "step": 1810 }, { "epoch": 0.9713998996487707, "grad_norm": 3.384379823608095, "learning_rate": 8.550369479450375e-07, "logits/chosen": -0.14682337641716003, "logits/rejected": -0.017906129360198975, "logps/chosen": -1.5452371835708618, "logps/rejected": -1.9544401168823242, "loss": 0.6747, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5452371835708618, "rewards/margins": 0.40920290350914, "rewards/rejected": -1.9544401168823242, "sft_loss": 1.5843769311904907, "step": 1815 }, { "epoch": 0.9740759324301723, "grad_norm": 2.1764985463320174, "learning_rate": 8.539386179130977e-07, "logits/chosen": -0.12845490872859955, "logits/rejected": -0.053113799542188644, "logps/chosen": -1.5860878229141235, "logps/rejected": -1.8734443187713623, "loss": 0.6819, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5860878229141235, "rewards/margins": 0.287356436252594, "rewards/rejected": -1.8734443187713623, "sft_loss": 1.5250016450881958, "step": 1820 }, { "epoch": 0.9767519652115738, "grad_norm": 2.975329281636474, "learning_rate": 8.528368539671347e-07, "logits/chosen": -0.19761589169502258, "logits/rejected": -0.060870569199323654, "logps/chosen": -1.5228192806243896, "logps/rejected": -2.0270698070526123, "loss": 0.6821, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5228192806243896, "rewards/margins": 0.5042504668235779, "rewards/rejected": -2.0270698070526123, "sft_loss": 1.5352400541305542, "step": 1825 }, { "epoch": 0.9794279979929754, "grad_norm": 3.5373981714487654, "learning_rate": 8.51731666796467e-07, "logits/chosen": -0.05435476452112198, "logits/rejected": -0.021604064851999283, "logps/chosen": -1.5740587711334229, "logps/rejected": -1.8411388397216797, "loss": 0.6843, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5740587711334229, "rewards/margins": 0.267080157995224, "rewards/rejected": -1.8411388397216797, "sft_loss": 1.5605738162994385, "step": 1830 }, { "epoch": 0.982104030774377, "grad_norm": 2.5979519570662237, "learning_rate": 8.506230671236254e-07, "logits/chosen": -0.14526249468326569, "logits/rejected": -0.08094936609268188, "logps/chosen": -1.5423524379730225, "logps/rejected": -1.7739944458007812, "loss": 0.6863, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.5423524379730225, "rewards/margins": 0.23164193332195282, "rewards/rejected": -1.7739944458007812, "sft_loss": 1.5814307928085327, "step": 1835 }, { "epoch": 0.9847800635557785, "grad_norm": 3.4902212698196577, "learning_rate": 8.495110657042488e-07, "logits/chosen": -0.08723638206720352, "logits/rejected": 0.01722792722284794, "logps/chosen": -1.5690443515777588, "logps/rejected": -2.032050132751465, "loss": 0.6752, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5690443515777588, "rewards/margins": 0.46300578117370605, "rewards/rejected": -2.032050132751465, "sft_loss": 1.6070778369903564, "step": 1840 }, { "epoch": 0.9874560963371801, "grad_norm": 4.70587093676687, "learning_rate": 8.483956733269799e-07, "logits/chosen": -0.15140441060066223, "logits/rejected": -0.05931438133120537, "logps/chosen": -1.5859469175338745, "logps/rejected": -1.8986486196517944, "loss": 0.6807, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5859469175338745, "rewards/margins": 0.31270185112953186, "rewards/rejected": -1.8986486196517944, "sft_loss": 1.5984251499176025, "step": 1845 }, { "epoch": 0.9901321291185817, "grad_norm": 1.6964966931429128, "learning_rate": 8.472769008133602e-07, "logits/chosen": -0.26614508032798767, "logits/rejected": -0.1424245536327362, "logps/chosen": -1.6399457454681396, "logps/rejected": -1.8817602396011353, "loss": 0.7036, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.6399457454681396, "rewards/margins": 0.241814523935318, "rewards/rejected": -1.8817602396011353, "sft_loss": 1.5788856744766235, "step": 1850 }, { "epoch": 0.9928081618999832, "grad_norm": 3.3134105939667897, "learning_rate": 8.461547590177259e-07, "logits/chosen": -0.16021369397640228, "logits/rejected": -0.05174224451184273, "logps/chosen": -1.5435289144515991, "logps/rejected": -1.9880918264389038, "loss": 0.6745, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5435289144515991, "rewards/margins": 0.4445629119873047, "rewards/rejected": -1.9880918264389038, "sft_loss": 1.5799753665924072, "step": 1855 }, { "epoch": 0.9954841946813848, "grad_norm": 3.543600343763058, "learning_rate": 8.450292588271014e-07, "logits/chosen": -0.14376217126846313, "logits/rejected": -0.06231715530157089, "logps/chosen": -1.702498435974121, "logps/rejected": -2.086392879486084, "loss": 0.6769, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.702498435974121, "rewards/margins": 0.38389450311660767, "rewards/rejected": -2.086392879486084, "sft_loss": 1.6510069370269775, "step": 1860 }, { "epoch": 0.9981602274627864, "grad_norm": 3.735182065545256, "learning_rate": 8.439004111610945e-07, "logits/chosen": -0.1657623052597046, "logits/rejected": -0.08684898167848587, "logps/chosen": -1.603272795677185, "logps/rejected": -2.073019504547119, "loss": 0.6675, "rewards/accuracies": 0.65625, "rewards/chosen": -1.603272795677185, "rewards/margins": 0.46974676847457886, "rewards/rejected": -2.073019504547119, "sft_loss": 1.5904573202133179, "step": 1865 }, { "epoch": 1.000836260244188, "grad_norm": 3.0067181950688338, "learning_rate": 8.427682269717901e-07, "logits/chosen": -0.2079596072435379, "logits/rejected": -0.051432013511657715, "logps/chosen": -1.779923677444458, "logps/rejected": -2.2986772060394287, "loss": 0.6911, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.779923677444458, "rewards/margins": 0.5187537670135498, "rewards/rejected": -2.2986772060394287, "sft_loss": 1.7402454614639282, "step": 1870 }, { "epoch": 1.0035122930255895, "grad_norm": 1.8008577817804836, "learning_rate": 8.416327172436446e-07, "logits/chosen": -0.24255767464637756, "logits/rejected": -0.10895420610904694, "logps/chosen": -1.8133251667022705, "logps/rejected": -2.1065123081207275, "loss": 0.69, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.8133251667022705, "rewards/margins": 0.2931869328022003, "rewards/rejected": -2.1065123081207275, "sft_loss": 1.6673797369003296, "step": 1875 }, { "epoch": 1.0061883258069912, "grad_norm": 3.214189820214113, "learning_rate": 8.404938929933778e-07, "logits/chosen": -0.11197970062494278, "logits/rejected": 0.06118257716298103, "logps/chosen": -1.7502024173736572, "logps/rejected": -2.4015679359436035, "loss": 0.6733, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7502024173736572, "rewards/margins": 0.6513655185699463, "rewards/rejected": -2.4015679359436035, "sft_loss": 1.679987907409668, "step": 1880 }, { "epoch": 1.0088643585883927, "grad_norm": 1.843152406956296, "learning_rate": 8.39351765269868e-07, "logits/chosen": -0.14984950423240662, "logits/rejected": -0.06991840898990631, "logps/chosen": -1.6768118143081665, "logps/rejected": -2.119256019592285, "loss": 0.6709, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6768118143081665, "rewards/margins": 0.44244417548179626, "rewards/rejected": -2.119256019592285, "sft_loss": 1.6225124597549438, "step": 1885 }, { "epoch": 1.0115403913697942, "grad_norm": 3.3615092610738317, "learning_rate": 8.382063451540431e-07, "logits/chosen": -0.15404468774795532, "logits/rejected": 0.045191358774900436, "logps/chosen": -1.6460826396942139, "logps/rejected": -2.1567981243133545, "loss": 0.6888, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6460826396942139, "rewards/margins": 0.5107154846191406, "rewards/rejected": -2.1567981243133545, "sft_loss": 1.706637978553772, "step": 1890 }, { "epoch": 1.014216424151196, "grad_norm": 6.204257382868284, "learning_rate": 8.370576437587742e-07, "logits/chosen": -0.08804275840520859, "logits/rejected": -0.029958754777908325, "logps/chosen": -1.678462028503418, "logps/rejected": -2.050363779067993, "loss": 0.6745, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.678462028503418, "rewards/margins": 0.37190166115760803, "rewards/rejected": -2.050363779067993, "sft_loss": 1.5845019817352295, "step": 1895 }, { "epoch": 1.0168924569325974, "grad_norm": 3.0052606984490824, "learning_rate": 8.359056722287674e-07, "logits/chosen": -0.23166868090629578, "logits/rejected": 0.019547026604413986, "logps/chosen": -1.6740500926971436, "logps/rejected": -2.0996651649475098, "loss": 0.6714, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6740500926971436, "rewards/margins": 0.42561525106430054, "rewards/rejected": -2.0996651649475098, "sft_loss": 1.651079773902893, "step": 1900 }, { "epoch": 1.019568489713999, "grad_norm": 2.599228606225825, "learning_rate": 8.347504417404553e-07, "logits/chosen": -0.129378542304039, "logits/rejected": 0.02050795778632164, "logps/chosen": -1.6524471044540405, "logps/rejected": -1.946677565574646, "loss": 0.6851, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.6524471044540405, "rewards/margins": 0.2942304313182831, "rewards/rejected": -1.946677565574646, "sft_loss": 1.5889383554458618, "step": 1905 }, { "epoch": 1.0222445224954007, "grad_norm": 2.7196530986636303, "learning_rate": 8.335919635018893e-07, "logits/chosen": -0.21800541877746582, "logits/rejected": -0.09155451506376266, "logps/chosen": -1.5629202127456665, "logps/rejected": -1.9020274877548218, "loss": 0.689, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5629202127456665, "rewards/margins": 0.33910730481147766, "rewards/rejected": -1.9020274877548218, "sft_loss": 1.5543124675750732, "step": 1910 }, { "epoch": 1.0249205552768021, "grad_norm": 2.389383825205269, "learning_rate": 8.324302487526303e-07, "logits/chosen": -0.14448337256908417, "logits/rejected": -0.0616963729262352, "logps/chosen": -1.5610359907150269, "logps/rejected": -1.8518545627593994, "loss": 0.6822, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5610359907150269, "rewards/margins": 0.2908182442188263, "rewards/rejected": -1.8518545627593994, "sft_loss": 1.5730682611465454, "step": 1915 }, { "epoch": 1.0275965880582036, "grad_norm": 2.519521640106637, "learning_rate": 8.312653087636398e-07, "logits/chosen": -0.1483561098575592, "logits/rejected": -0.06898792088031769, "logps/chosen": -1.4000225067138672, "logps/rejected": -1.8333406448364258, "loss": 0.654, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4000225067138672, "rewards/margins": 0.4333181381225586, "rewards/rejected": -1.8333406448364258, "sft_loss": 1.432531714439392, "step": 1920 }, { "epoch": 1.0302726208396054, "grad_norm": 2.827697197982811, "learning_rate": 8.300971548371711e-07, "logits/chosen": -0.2625359892845154, "logits/rejected": -0.055143196135759354, "logps/chosen": -1.5686523914337158, "logps/rejected": -1.9178880453109741, "loss": 0.6765, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5686523914337158, "rewards/margins": 0.3492355942726135, "rewards/rejected": -1.9178880453109741, "sft_loss": 1.5924729108810425, "step": 1925 }, { "epoch": 1.0329486536210069, "grad_norm": 3.874785154702929, "learning_rate": 8.289257983066582e-07, "logits/chosen": -0.18027618527412415, "logits/rejected": -0.041930388659238815, "logps/chosen": -1.440549612045288, "logps/rejected": -1.9213539361953735, "loss": 0.6565, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.440549612045288, "rewards/margins": 0.48080435395240784, "rewards/rejected": -1.9213539361953735, "sft_loss": 1.4900104999542236, "step": 1930 }, { "epoch": 1.0356246864024083, "grad_norm": 3.0687794402615913, "learning_rate": 8.277512505366077e-07, "logits/chosen": -0.22330304980278015, "logits/rejected": -0.04363471269607544, "logps/chosen": -1.5808871984481812, "logps/rejected": -2.0564162731170654, "loss": 0.6773, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5808871984481812, "rewards/margins": 0.47552910447120667, "rewards/rejected": -2.0564162731170654, "sft_loss": 1.5744086503982544, "step": 1935 }, { "epoch": 1.03830071918381, "grad_norm": 3.3563856521239184, "learning_rate": 8.265735229224868e-07, "logits/chosen": -0.13265278935432434, "logits/rejected": -0.011842799372971058, "logps/chosen": -1.5311095714569092, "logps/rejected": -2.0399792194366455, "loss": 0.6708, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5311095714569092, "rewards/margins": 0.5088695287704468, "rewards/rejected": -2.0399792194366455, "sft_loss": 1.5216306447982788, "step": 1940 }, { "epoch": 1.0409767519652116, "grad_norm": 3.216162602204211, "learning_rate": 8.253926268906144e-07, "logits/chosen": -0.22170314192771912, "logits/rejected": -0.04994767904281616, "logps/chosen": -1.5834558010101318, "logps/rejected": -2.1816983222961426, "loss": 0.6626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5834558010101318, "rewards/margins": 0.5982425212860107, "rewards/rejected": -2.1816983222961426, "sft_loss": 1.614985704421997, "step": 1945 }, { "epoch": 1.043652784746613, "grad_norm": 2.184799152816952, "learning_rate": 8.242085738980487e-07, "logits/chosen": -0.1607782393693924, "logits/rejected": 0.047008074820041656, "logps/chosen": -1.622387170791626, "logps/rejected": -2.0742850303649902, "loss": 0.6761, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.622387170791626, "rewards/margins": 0.4518980085849762, "rewards/rejected": -2.0742850303649902, "sft_loss": 1.6672132015228271, "step": 1950 }, { "epoch": 1.0463288175280148, "grad_norm": 6.250157245132641, "learning_rate": 8.230213754324772e-07, "logits/chosen": -0.14060020446777344, "logits/rejected": -0.0696086436510086, "logps/chosen": -1.472314476966858, "logps/rejected": -1.8883994817733765, "loss": 0.6626, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.472314476966858, "rewards/margins": 0.4160851538181305, "rewards/rejected": -1.8883994817733765, "sft_loss": 1.487228512763977, "step": 1955 }, { "epoch": 1.0490048503094163, "grad_norm": 1.6480790156412097, "learning_rate": 8.218310430121045e-07, "logits/chosen": -0.19000086188316345, "logits/rejected": -0.15001313388347626, "logps/chosen": -1.524418592453003, "logps/rejected": -1.8568360805511475, "loss": 0.6727, "rewards/accuracies": 0.65625, "rewards/chosen": -1.524418592453003, "rewards/margins": 0.3324173390865326, "rewards/rejected": -1.8568360805511475, "sft_loss": 1.5695242881774902, "step": 1960 }, { "epoch": 1.051680883090818, "grad_norm": 4.760518771846394, "learning_rate": 8.20637588185541e-07, "logits/chosen": -0.1314769834280014, "logits/rejected": -0.06649098545312881, "logps/chosen": -1.4572066068649292, "logps/rejected": -2.0760457515716553, "loss": 0.6541, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4572066068649292, "rewards/margins": 0.6188389658927917, "rewards/rejected": -2.0760457515716553, "sft_loss": 1.508234977722168, "step": 1965 }, { "epoch": 1.0543569158722195, "grad_norm": 3.4784089052305083, "learning_rate": 8.194410225316906e-07, "logits/chosen": -0.1936931610107422, "logits/rejected": -0.057973574846982956, "logps/chosen": -1.4674887657165527, "logps/rejected": -1.873656988143921, "loss": 0.677, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4674887657165527, "rewards/margins": 0.40616822242736816, "rewards/rejected": -1.873656988143921, "sft_loss": 1.5056754350662231, "step": 1970 }, { "epoch": 1.057032948653621, "grad_norm": 2.5732562782095068, "learning_rate": 8.182413576596385e-07, "logits/chosen": -0.10193010419607162, "logits/rejected": -0.017011495307087898, "logps/chosen": -1.4385443925857544, "logps/rejected": -1.8158655166625977, "loss": 0.672, "rewards/accuracies": 0.625, "rewards/chosen": -1.4385443925857544, "rewards/margins": 0.3773210942745209, "rewards/rejected": -1.8158655166625977, "sft_loss": 1.4791452884674072, "step": 1975 }, { "epoch": 1.0597089814350227, "grad_norm": 3.3913885943645914, "learning_rate": 8.170386052085389e-07, "logits/chosen": -0.08598263561725616, "logits/rejected": 0.0331895537674427, "logps/chosen": -1.5553724765777588, "logps/rejected": -1.9766775369644165, "loss": 0.6723, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5553724765777588, "rewards/margins": 0.4213050901889801, "rewards/rejected": -1.9766775369644165, "sft_loss": 1.550366759300232, "step": 1980 }, { "epoch": 1.0623850142164242, "grad_norm": 3.052299834948276, "learning_rate": 8.158327768475008e-07, "logits/chosen": -0.14798256754875183, "logits/rejected": 0.005163169465959072, "logps/chosen": -1.654259443283081, "logps/rejected": -1.9187091588974, "loss": 0.6827, "rewards/accuracies": 0.625, "rewards/chosen": -1.654259443283081, "rewards/margins": 0.2644497752189636, "rewards/rejected": -1.9187091588974, "sft_loss": 1.562561273574829, "step": 1985 }, { "epoch": 1.0650610469978257, "grad_norm": 2.957691286330317, "learning_rate": 8.146238842754767e-07, "logits/chosen": -0.20001676678657532, "logits/rejected": -0.10823854058980942, "logps/chosen": -1.6180994510650635, "logps/rejected": -1.9572150707244873, "loss": 0.676, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6180994510650635, "rewards/margins": 0.3391154706478119, "rewards/rejected": -1.9572150707244873, "sft_loss": 1.6023613214492798, "step": 1990 }, { "epoch": 1.0677370797792274, "grad_norm": 3.5448089859794485, "learning_rate": 8.134119392211476e-07, "logits/chosen": -0.08415170013904572, "logits/rejected": 0.06598736345767975, "logps/chosen": -1.5128848552703857, "logps/rejected": -2.073805093765259, "loss": 0.6516, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5128848552703857, "rewards/margins": 0.5609201192855835, "rewards/rejected": -2.073805093765259, "sft_loss": 1.5354398488998413, "step": 1995 }, { "epoch": 1.0704131125606289, "grad_norm": 11.781259970594936, "learning_rate": 8.121969534428094e-07, "logits/chosen": -0.17054155468940735, "logits/rejected": -0.01642870530486107, "logps/chosen": -1.6963222026824951, "logps/rejected": -2.1025562286376953, "loss": 0.714, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6963222026824951, "rewards/margins": 0.40623408555984497, "rewards/rejected": -2.1025562286376953, "sft_loss": 1.662977933883667, "step": 2000 }, { "epoch": 1.0704131125606289, "eval_logits/chosen": 0.11042266339063644, "eval_logits/rejected": 0.19898873567581177, "eval_logps/chosen": -1.562518835067749, "eval_logps/rejected": -2.0465967655181885, "eval_loss": 0.6718986630439758, "eval_rewards/accuracies": 0.6268545985221863, "eval_rewards/chosen": -1.562518835067749, "eval_rewards/margins": 0.48407796025276184, "eval_rewards/rejected": -2.0465967655181885, "eval_runtime": 47.4282, "eval_samples_per_second": 28.359, "eval_sft_loss": 1.5563881397247314, "eval_steps_per_second": 7.105, "step": 2000 }, { "epoch": 1.0730891453420304, "grad_norm": 3.513893008993406, "learning_rate": 8.109789387282599e-07, "logits/chosen": -0.1328616440296173, "logits/rejected": -0.04587624594569206, "logps/chosen": -1.6134157180786133, "logps/rejected": -2.0356030464172363, "loss": 0.6805, "rewards/accuracies": 0.59375, "rewards/chosen": -1.6134157180786133, "rewards/margins": 0.4221871793270111, "rewards/rejected": -2.0356030464172363, "sft_loss": 1.6057875156402588, "step": 2005 }, { "epoch": 1.075765178123432, "grad_norm": 4.802954296253678, "learning_rate": 8.097579068946827e-07, "logits/chosen": -0.10831431299448013, "logits/rejected": -0.0031776546966284513, "logps/chosen": -1.4763798713684082, "logps/rejected": -1.8952471017837524, "loss": 0.672, "rewards/accuracies": 0.625, "rewards/chosen": -1.4763798713684082, "rewards/margins": 0.41886717081069946, "rewards/rejected": -1.8952471017837524, "sft_loss": 1.5348972082138062, "step": 2010 }, { "epoch": 1.0784412109048336, "grad_norm": 2.858256991560063, "learning_rate": 8.085338697885344e-07, "logits/chosen": -0.18605680763721466, "logits/rejected": -0.04593934118747711, "logps/chosen": -1.5805212259292603, "logps/rejected": -2.0479989051818848, "loss": 0.6793, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5805212259292603, "rewards/margins": 0.4674775004386902, "rewards/rejected": -2.0479989051818848, "sft_loss": 1.506347417831421, "step": 2015 }, { "epoch": 1.081117243686235, "grad_norm": 4.329723234751204, "learning_rate": 8.073068392854282e-07, "logits/chosen": -0.24087968468666077, "logits/rejected": -0.062106020748615265, "logps/chosen": -1.6259753704071045, "logps/rejected": -2.1355271339416504, "loss": 0.6662, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6259753704071045, "rewards/margins": 0.5095517039299011, "rewards/rejected": -2.1355271339416504, "sft_loss": 1.5672664642333984, "step": 2020 }, { "epoch": 1.0837932764676368, "grad_norm": 3.495146311508932, "learning_rate": 8.060768272900193e-07, "logits/chosen": -0.12188255786895752, "logits/rejected": 0.0037569478154182434, "logps/chosen": -1.6243022680282593, "logps/rejected": -2.1850533485412598, "loss": 0.6581, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6243022680282593, "rewards/margins": 0.5607510805130005, "rewards/rejected": -2.1850533485412598, "sft_loss": 1.6390899419784546, "step": 2025 }, { "epoch": 1.0864693092490383, "grad_norm": 3.039671083603455, "learning_rate": 8.0484384573589e-07, "logits/chosen": -0.19984665513038635, "logits/rejected": -0.16852447390556335, "logps/chosen": -1.5463429689407349, "logps/rejected": -1.9309475421905518, "loss": 0.6711, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5463429689407349, "rewards/margins": 0.3846047520637512, "rewards/rejected": -1.9309475421905518, "sft_loss": 1.5829219818115234, "step": 2030 }, { "epoch": 1.0891453420304398, "grad_norm": 2.790160745985205, "learning_rate": 8.03607906585432e-07, "logits/chosen": -0.22109492123126984, "logits/rejected": -0.05157407373189926, "logps/chosen": -1.6182200908660889, "logps/rejected": -2.0302791595458984, "loss": 0.6748, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.6182200908660889, "rewards/margins": 0.4120589792728424, "rewards/rejected": -2.0302791595458984, "sft_loss": 1.6120506525039673, "step": 2035 }, { "epoch": 1.0918213748118415, "grad_norm": 3.315920683366051, "learning_rate": 8.023690218297329e-07, "logits/chosen": -0.2908003330230713, "logits/rejected": -0.2197912484407425, "logps/chosen": -1.6452051401138306, "logps/rejected": -1.9569673538208008, "loss": 0.6658, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.6452051401138306, "rewards/margins": 0.3117622435092926, "rewards/rejected": -1.9569673538208008, "sft_loss": 1.6148096323013306, "step": 2040 }, { "epoch": 1.094497407593243, "grad_norm": 5.462304473351597, "learning_rate": 8.01127203488458e-07, "logits/chosen": -0.14959219098091125, "logits/rejected": -0.11621057987213135, "logps/chosen": -1.5611058473587036, "logps/rejected": -1.9687639474868774, "loss": 0.6683, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5611058473587036, "rewards/margins": 0.40765801072120667, "rewards/rejected": -1.9687639474868774, "sft_loss": 1.4946987628936768, "step": 2045 }, { "epoch": 1.0971734403746445, "grad_norm": 8.662994976379249, "learning_rate": 7.998824636097339e-07, "logits/chosen": -0.24751749634742737, "logits/rejected": -0.1236846074461937, "logps/chosen": -1.6202363967895508, "logps/rejected": -1.980786919593811, "loss": 0.6921, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.6202363967895508, "rewards/margins": 0.36055055260658264, "rewards/rejected": -1.980786919593811, "sft_loss": 1.6707451343536377, "step": 2050 }, { "epoch": 1.0998494731560462, "grad_norm": 4.456899179225936, "learning_rate": 7.986348142700328e-07, "logits/chosen": -0.17690253257751465, "logits/rejected": -0.0510096549987793, "logps/chosen": -1.5945512056350708, "logps/rejected": -2.002166271209717, "loss": 0.6654, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5945512056350708, "rewards/margins": 0.40761512517929077, "rewards/rejected": -2.002166271209717, "sft_loss": 1.628100037574768, "step": 2055 }, { "epoch": 1.1025255059374477, "grad_norm": 4.65806576514881, "learning_rate": 7.973842675740539e-07, "logits/chosen": -0.1397233009338379, "logits/rejected": -0.09292219579219818, "logps/chosen": -1.5926469564437866, "logps/rejected": -2.1180152893066406, "loss": 0.6667, "rewards/accuracies": 0.625, "rewards/chosen": -1.5926469564437866, "rewards/margins": 0.5253681540489197, "rewards/rejected": -2.1180152893066406, "sft_loss": 1.6516544818878174, "step": 2060 }, { "epoch": 1.1052015387188494, "grad_norm": 3.5342850756940387, "learning_rate": 7.961308356546066e-07, "logits/chosen": -0.2090418040752411, "logits/rejected": -0.07881750911474228, "logps/chosen": -1.5486667156219482, "logps/rejected": -1.9510724544525146, "loss": 0.6652, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5486667156219482, "rewards/margins": 0.40240558981895447, "rewards/rejected": -1.9510724544525146, "sft_loss": 1.530504584312439, "step": 2065 }, { "epoch": 1.107877571500251, "grad_norm": 2.832539801553843, "learning_rate": 7.948745306724931e-07, "logits/chosen": -0.20894074440002441, "logits/rejected": -0.06344683468341827, "logps/chosen": -1.530551791191101, "logps/rejected": -2.070535182952881, "loss": 0.6511, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.530551791191101, "rewards/margins": 0.5399833917617798, "rewards/rejected": -2.070535182952881, "sft_loss": 1.517613172531128, "step": 2070 }, { "epoch": 1.1105536042816524, "grad_norm": 3.38191618292415, "learning_rate": 7.936153648163897e-07, "logits/chosen": -0.23570159077644348, "logits/rejected": -0.13040268421173096, "logps/chosen": -1.644474744796753, "logps/rejected": -2.1541659832000732, "loss": 0.6699, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.644474744796753, "rewards/margins": 0.5096911191940308, "rewards/rejected": -2.1541659832000732, "sft_loss": 1.7060950994491577, "step": 2075 }, { "epoch": 1.1132296370630541, "grad_norm": 2.364002055632971, "learning_rate": 7.92353350302729e-07, "logits/chosen": -0.25074702501296997, "logits/rejected": -0.0874941349029541, "logps/chosen": -1.4881782531738281, "logps/rejected": -2.0220913887023926, "loss": 0.6526, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4881782531738281, "rewards/margins": 0.5339129567146301, "rewards/rejected": -2.0220913887023926, "sft_loss": 1.530760407447815, "step": 2080 }, { "epoch": 1.1159056698444556, "grad_norm": 6.815710847699838, "learning_rate": 7.910884993755816e-07, "logits/chosen": -0.20786544680595398, "logits/rejected": -0.0966242104768753, "logps/chosen": -1.5392001867294312, "logps/rejected": -2.1013989448547363, "loss": 0.6623, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5392001867294312, "rewards/margins": 0.5621985793113708, "rewards/rejected": -2.1013989448547363, "sft_loss": 1.576633334159851, "step": 2085 }, { "epoch": 1.118581702625857, "grad_norm": 4.49999671804145, "learning_rate": 7.898208243065367e-07, "logits/chosen": -0.259671151638031, "logits/rejected": -0.24447309970855713, "logps/chosen": -1.5871320962905884, "logps/rejected": -1.8874315023422241, "loss": 0.6689, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5871320962905884, "rewards/margins": 0.3002995252609253, "rewards/rejected": -1.8874315023422241, "sft_loss": 1.626317024230957, "step": 2090 }, { "epoch": 1.1212577354072588, "grad_norm": 2.0323666230061845, "learning_rate": 7.88550337394583e-07, "logits/chosen": -0.24338272213935852, "logits/rejected": -0.10547523200511932, "logps/chosen": -1.7075008153915405, "logps/rejected": -2.0710959434509277, "loss": 0.6956, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7075008153915405, "rewards/margins": 0.36359527707099915, "rewards/rejected": -2.0710959434509277, "sft_loss": 1.6951372623443604, "step": 2095 }, { "epoch": 1.1239337681886603, "grad_norm": 6.765873292332573, "learning_rate": 7.872770509659905e-07, "logits/chosen": -0.16340097784996033, "logits/rejected": -0.11491282284259796, "logps/chosen": -1.7300260066986084, "logps/rejected": -2.051795482635498, "loss": 0.6792, "rewards/accuracies": 0.59375, "rewards/chosen": -1.7300260066986084, "rewards/margins": 0.3217691481113434, "rewards/rejected": -2.051795482635498, "sft_loss": 1.6949584484100342, "step": 2100 }, { "epoch": 1.1266098009700618, "grad_norm": 3.199587380600216, "learning_rate": 7.860009773741896e-07, "logits/chosen": -0.10508096218109131, "logits/rejected": 0.014907196164131165, "logps/chosen": -1.619755506515503, "logps/rejected": -2.115018606185913, "loss": 0.6639, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.619755506515503, "rewards/margins": 0.4952631890773773, "rewards/rejected": -2.115018606185913, "sft_loss": 1.5901477336883545, "step": 2105 }, { "epoch": 1.1292858337514635, "grad_norm": 8.121098754677774, "learning_rate": 7.84722128999652e-07, "logits/chosen": -0.19740596413612366, "logits/rejected": -0.04385928064584732, "logps/chosen": -1.5759496688842773, "logps/rejected": -2.2588329315185547, "loss": 0.6571, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5759496688842773, "rewards/margins": 0.682883083820343, "rewards/rejected": -2.2588329315185547, "sft_loss": 1.6312938928604126, "step": 2110 }, { "epoch": 1.131961866532865, "grad_norm": 4.349147420252072, "learning_rate": 7.834405182497699e-07, "logits/chosen": -0.08879465609788895, "logits/rejected": -0.03234269469976425, "logps/chosen": -1.6562553644180298, "logps/rejected": -2.0740723609924316, "loss": 0.6553, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6562553644180298, "rewards/margins": 0.41781702637672424, "rewards/rejected": -2.0740723609924316, "sft_loss": 1.6364749670028687, "step": 2115 }, { "epoch": 1.1346378993142665, "grad_norm": 3.943676183821946, "learning_rate": 7.821561575587368e-07, "logits/chosen": -0.17019066214561462, "logits/rejected": -0.126693457365036, "logps/chosen": -1.5662453174591064, "logps/rejected": -1.9887056350708008, "loss": 0.6629, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5662453174591064, "rewards/margins": 0.4224603772163391, "rewards/rejected": -1.9887056350708008, "sft_loss": 1.633723497390747, "step": 2120 }, { "epoch": 1.1373139320956682, "grad_norm": 5.473259104854902, "learning_rate": 7.808690593874254e-07, "logits/chosen": -0.14707279205322266, "logits/rejected": -0.08167170733213425, "logps/chosen": -1.516506552696228, "logps/rejected": -2.081937789916992, "loss": 0.6483, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.516506552696228, "rewards/margins": 0.5654313564300537, "rewards/rejected": -2.081937789916992, "sft_loss": 1.5508047342300415, "step": 2125 }, { "epoch": 1.1399899648770697, "grad_norm": 4.218506802957272, "learning_rate": 7.79579236223268e-07, "logits/chosen": -0.10259685665369034, "logits/rejected": 0.1375846266746521, "logps/chosen": -1.573190689086914, "logps/rejected": -2.189675807952881, "loss": 0.6609, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.573190689086914, "rewards/margins": 0.6164848804473877, "rewards/rejected": -2.189675807952881, "sft_loss": 1.6187585592269897, "step": 2130 }, { "epoch": 1.1426659976584714, "grad_norm": 5.6719658150784396, "learning_rate": 7.782867005801346e-07, "logits/chosen": -0.11972524225711823, "logits/rejected": 0.0455777645111084, "logps/chosen": -1.6146034002304077, "logps/rejected": -2.1865992546081543, "loss": 0.6688, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6146034002304077, "rewards/margins": 0.5719958543777466, "rewards/rejected": -2.1865992546081543, "sft_loss": 1.6162227392196655, "step": 2135 }, { "epoch": 1.145342030439873, "grad_norm": 3.953793622265437, "learning_rate": 7.769914649982117e-07, "logits/chosen": -0.15830545127391815, "logits/rejected": -0.015896636992692947, "logps/chosen": -1.6073856353759766, "logps/rejected": -2.148798704147339, "loss": 0.676, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6073856353759766, "rewards/margins": 0.5414127707481384, "rewards/rejected": -2.148798704147339, "sft_loss": 1.6097323894500732, "step": 2140 }, { "epoch": 1.1480180632212744, "grad_norm": 2.7461565163394446, "learning_rate": 7.756935420438803e-07, "logits/chosen": -0.1359499990940094, "logits/rejected": -0.04135057330131531, "logps/chosen": -1.536054253578186, "logps/rejected": -2.0973961353302, "loss": 0.6463, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.536054253578186, "rewards/margins": 0.5613418817520142, "rewards/rejected": -2.0973961353302, "sft_loss": 1.5942299365997314, "step": 2145 }, { "epoch": 1.1506940960026761, "grad_norm": 1.772647002953551, "learning_rate": 7.743929443095951e-07, "logits/chosen": -0.21985526382923126, "logits/rejected": -0.16376951336860657, "logps/chosen": -1.7838491201400757, "logps/rejected": -2.25213885307312, "loss": 0.6791, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.7838491201400757, "rewards/margins": 0.46828994154930115, "rewards/rejected": -2.25213885307312, "sft_loss": 1.699902892112732, "step": 2150 }, { "epoch": 1.1533701287840776, "grad_norm": 1.8222620144441337, "learning_rate": 7.730896844137609e-07, "logits/chosen": -0.1366005688905716, "logits/rejected": -0.07194123417139053, "logps/chosen": -1.7815555334091187, "logps/rejected": -2.1918790340423584, "loss": 0.6683, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.7815555334091187, "rewards/margins": 0.4103233218193054, "rewards/rejected": -2.1918790340423584, "sft_loss": 1.7492650747299194, "step": 2155 }, { "epoch": 1.1560461615654791, "grad_norm": 7.720740241090713, "learning_rate": 7.717837750006106e-07, "logits/chosen": -0.1920839548110962, "logits/rejected": -0.08554677665233612, "logps/chosen": -1.6699411869049072, "logps/rejected": -2.223794937133789, "loss": 0.6466, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6699411869049072, "rewards/margins": 0.5538536310195923, "rewards/rejected": -2.223794937133789, "sft_loss": 1.6859760284423828, "step": 2160 }, { "epoch": 1.1587221943468808, "grad_norm": 1.815984222290429, "learning_rate": 7.704752287400832e-07, "logits/chosen": -0.18362686038017273, "logits/rejected": -0.0012839033734053373, "logps/chosen": -1.6417992115020752, "logps/rejected": -2.2869229316711426, "loss": 0.6707, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6417992115020752, "rewards/margins": 0.6451237797737122, "rewards/rejected": -2.2869229316711426, "sft_loss": 1.6056686639785767, "step": 2165 }, { "epoch": 1.1613982271282823, "grad_norm": 2.811239975892915, "learning_rate": 7.691640583277004e-07, "logits/chosen": -0.17116299271583557, "logits/rejected": -0.00892619788646698, "logps/chosen": -1.5159931182861328, "logps/rejected": -2.1569645404815674, "loss": 0.6551, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5159931182861328, "rewards/margins": 0.640971302986145, "rewards/rejected": -2.1569645404815674, "sft_loss": 1.5590221881866455, "step": 2170 }, { "epoch": 1.1640742599096838, "grad_norm": 3.8606992562661486, "learning_rate": 7.678502764844433e-07, "logits/chosen": -0.2119617462158203, "logits/rejected": -0.04030895233154297, "logps/chosen": -1.575308084487915, "logps/rejected": -2.019559144973755, "loss": 0.67, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.575308084487915, "rewards/margins": 0.4442509114742279, "rewards/rejected": -2.019559144973755, "sft_loss": 1.589772343635559, "step": 2175 }, { "epoch": 1.1667502926910855, "grad_norm": 5.051142678262766, "learning_rate": 7.665338959566288e-07, "logits/chosen": -0.17688782513141632, "logits/rejected": -0.09141966700553894, "logps/chosen": -1.5610404014587402, "logps/rejected": -2.0049726963043213, "loss": 0.656, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5610404014587402, "rewards/margins": 0.4439323842525482, "rewards/rejected": -2.0049726963043213, "sft_loss": 1.5945446491241455, "step": 2180 }, { "epoch": 1.169426325472487, "grad_norm": 3.7442822333080703, "learning_rate": 7.652149295157868e-07, "logits/chosen": -0.11327006667852402, "logits/rejected": 0.024983903393149376, "logps/chosen": -1.6136016845703125, "logps/rejected": -1.9793922901153564, "loss": 0.673, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6136016845703125, "rewards/margins": 0.3657905161380768, "rewards/rejected": -1.9793922901153564, "sft_loss": 1.566320538520813, "step": 2185 }, { "epoch": 1.1721023582538885, "grad_norm": 3.412670516498388, "learning_rate": 7.638933899585354e-07, "logits/chosen": -0.050362229347229004, "logits/rejected": 0.006541428156197071, "logps/chosen": -1.5444839000701904, "logps/rejected": -2.030472993850708, "loss": 0.6607, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5444839000701904, "rewards/margins": 0.48598918318748474, "rewards/rejected": -2.030472993850708, "sft_loss": 1.6165252923965454, "step": 2190 }, { "epoch": 1.1747783910352902, "grad_norm": 3.6855124550664695, "learning_rate": 7.625692901064573e-07, "logits/chosen": -0.12177852541208267, "logits/rejected": -0.029640281572937965, "logps/chosen": -1.5576467514038086, "logps/rejected": -2.102085590362549, "loss": 0.6617, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5576467514038086, "rewards/margins": 0.5444390177726746, "rewards/rejected": -2.102085590362549, "sft_loss": 1.6117064952850342, "step": 2195 }, { "epoch": 1.1774544238166917, "grad_norm": 3.0318364406998137, "learning_rate": 7.61242642805975e-07, "logits/chosen": -0.17540039122104645, "logits/rejected": -0.18315133452415466, "logps/chosen": -1.6013386249542236, "logps/rejected": -2.041822910308838, "loss": 0.6668, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6013386249542236, "rewards/margins": 0.4404842257499695, "rewards/rejected": -2.041822910308838, "sft_loss": 1.660768747329712, "step": 2200 }, { "epoch": 1.1801304565980932, "grad_norm": 4.709270443883367, "learning_rate": 7.599134609282266e-07, "logits/chosen": -0.224747896194458, "logits/rejected": -0.038868196308612823, "logps/chosen": -1.5257173776626587, "logps/rejected": -2.0317251682281494, "loss": 0.667, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5257173776626587, "rewards/margins": 0.506007969379425, "rewards/rejected": -2.0317251682281494, "sft_loss": 1.5080684423446655, "step": 2205 }, { "epoch": 1.182806489379495, "grad_norm": 3.7929751048578484, "learning_rate": 7.585817573689402e-07, "logits/chosen": -0.2617315649986267, "logits/rejected": -0.14034771919250488, "logps/chosen": -1.3956866264343262, "logps/rejected": -2.0973429679870605, "loss": 0.6389, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3956866264343262, "rewards/margins": 0.7016563415527344, "rewards/rejected": -2.0973429679870605, "sft_loss": 1.4621050357818604, "step": 2210 }, { "epoch": 1.1854825221608964, "grad_norm": 3.3729236083614635, "learning_rate": 7.572475450483098e-07, "logits/chosen": -0.21176567673683167, "logits/rejected": -0.13093852996826172, "logps/chosen": -1.7197602987289429, "logps/rejected": -2.2689871788024902, "loss": 0.6712, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7197602987289429, "rewards/margins": 0.5492271184921265, "rewards/rejected": -2.2689871788024902, "sft_loss": 1.6264690160751343, "step": 2215 }, { "epoch": 1.188158554942298, "grad_norm": 3.1390038372846467, "learning_rate": 7.559108369108689e-07, "logits/chosen": -0.26345133781433105, "logits/rejected": -0.1286747008562088, "logps/chosen": -1.534954309463501, "logps/rejected": -1.9984760284423828, "loss": 0.6728, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.534954309463501, "rewards/margins": 0.46352171897888184, "rewards/rejected": -1.9984760284423828, "sft_loss": 1.5593990087509155, "step": 2220 }, { "epoch": 1.1908345877236997, "grad_norm": 3.360746336759213, "learning_rate": 7.54571645925366e-07, "logits/chosen": -0.2391415536403656, "logits/rejected": -0.035399384796619415, "logps/chosen": -1.5200650691986084, "logps/rejected": -2.197875499725342, "loss": 0.6529, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5200650691986084, "rewards/margins": 0.677810549736023, "rewards/rejected": -2.197875499725342, "sft_loss": 1.5431331396102905, "step": 2225 }, { "epoch": 1.1935106205051011, "grad_norm": 21.355656846058746, "learning_rate": 7.532299850846378e-07, "logits/chosen": -0.2608916461467743, "logits/rejected": -0.11797686666250229, "logps/chosen": -1.5888410806655884, "logps/rejected": -2.375009059906006, "loss": 0.6799, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5888410806655884, "rewards/margins": 0.7861679196357727, "rewards/rejected": -2.375009059906006, "sft_loss": 1.5629527568817139, "step": 2230 }, { "epoch": 1.1961866532865026, "grad_norm": 5.423272631066221, "learning_rate": 7.518858674054838e-07, "logits/chosen": -0.2376869022846222, "logits/rejected": -0.05963951349258423, "logps/chosen": -1.5007435083389282, "logps/rejected": -2.112490177154541, "loss": 0.6556, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5007435083389282, "rewards/margins": 0.6117470264434814, "rewards/rejected": -2.112490177154541, "sft_loss": 1.5177980661392212, "step": 2235 }, { "epoch": 1.1988626860679044, "grad_norm": 4.5684604787024155, "learning_rate": 7.505393059285394e-07, "logits/chosen": -0.19655776023864746, "logits/rejected": -0.038988981395959854, "logps/chosen": -1.5234743356704712, "logps/rejected": -2.0442726612091064, "loss": 0.65, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5234743356704712, "rewards/margins": 0.5207983255386353, "rewards/rejected": -2.0442726612091064, "sft_loss": 1.541851282119751, "step": 2240 }, { "epoch": 1.2015387188493059, "grad_norm": 5.618983740559261, "learning_rate": 7.491903137181501e-07, "logits/chosen": -0.1544310748577118, "logits/rejected": -0.10041675716638565, "logps/chosen": -1.4842172861099243, "logps/rejected": -1.9160614013671875, "loss": 0.6643, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4842172861099243, "rewards/margins": 0.4318443238735199, "rewards/rejected": -1.9160614013671875, "sft_loss": 1.5359599590301514, "step": 2245 }, { "epoch": 1.2042147516307076, "grad_norm": 3.503081298811264, "learning_rate": 7.478389038622441e-07, "logits/chosen": -0.08281540125608444, "logits/rejected": -0.056744955480098724, "logps/chosen": -1.470792293548584, "logps/rejected": -2.0276589393615723, "loss": 0.6547, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.470792293548584, "rewards/margins": 0.5568663477897644, "rewards/rejected": -2.0276589393615723, "sft_loss": 1.491330862045288, "step": 2250 }, { "epoch": 1.206890784412109, "grad_norm": 2.6902293197930613, "learning_rate": 7.46485089472206e-07, "logits/chosen": -0.16725048422813416, "logits/rejected": -0.05352597311139107, "logps/chosen": -1.603641152381897, "logps/rejected": -1.9243053197860718, "loss": 0.6768, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.603641152381897, "rewards/margins": 0.3206642270088196, "rewards/rejected": -1.9243053197860718, "sft_loss": 1.5715465545654297, "step": 2255 }, { "epoch": 1.2095668171935106, "grad_norm": 3.188819818525931, "learning_rate": 7.451288836827487e-07, "logits/chosen": -0.09861128032207489, "logits/rejected": -0.10165087878704071, "logps/chosen": -1.4987351894378662, "logps/rejected": -1.8199284076690674, "loss": 0.6648, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4987351894378662, "rewards/margins": 0.3211931884288788, "rewards/rejected": -1.8199284076690674, "sft_loss": 1.538835883140564, "step": 2260 }, { "epoch": 1.2122428499749123, "grad_norm": 3.759510277724772, "learning_rate": 7.437702996517869e-07, "logits/chosen": -0.17928043007850647, "logits/rejected": -0.07855254411697388, "logps/chosen": -1.5587952136993408, "logps/rejected": -1.9538202285766602, "loss": 0.6721, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5587952136993408, "rewards/margins": 0.3950250744819641, "rewards/rejected": -1.9538202285766602, "sft_loss": 1.632261872291565, "step": 2265 }, { "epoch": 1.2149188827563138, "grad_norm": 4.262150922630274, "learning_rate": 7.424093505603087e-07, "logits/chosen": -0.26483815908432007, "logits/rejected": -0.09061449021100998, "logps/chosen": -1.5146794319152832, "logps/rejected": -2.103245735168457, "loss": 0.6441, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5146794319152832, "rewards/margins": 0.5885661840438843, "rewards/rejected": -2.103245735168457, "sft_loss": 1.484770655632019, "step": 2270 }, { "epoch": 1.2175949155377153, "grad_norm": 6.369910309273914, "learning_rate": 7.410460496122482e-07, "logits/chosen": -0.15041914582252502, "logits/rejected": -0.01669810339808464, "logps/chosen": -1.4834332466125488, "logps/rejected": -2.0982134342193604, "loss": 0.641, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4834332466125488, "rewards/margins": 0.614780068397522, "rewards/rejected": -2.0982134342193604, "sft_loss": 1.4943815469741821, "step": 2275 }, { "epoch": 1.220270948319117, "grad_norm": 3.8965884076523145, "learning_rate": 7.396804100343572e-07, "logits/chosen": -0.200050950050354, "logits/rejected": -0.024930734187364578, "logps/chosen": -1.3915150165557861, "logps/rejected": -1.8986610174179077, "loss": 0.6561, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3915150165557861, "rewards/margins": 0.5071460008621216, "rewards/rejected": -1.8986610174179077, "sft_loss": 1.4466291666030884, "step": 2280 }, { "epoch": 1.2229469811005185, "grad_norm": 3.1816520286868695, "learning_rate": 7.383124450760768e-07, "logits/chosen": -0.12159286439418793, "logits/rejected": 0.08534251153469086, "logps/chosen": -1.5690300464630127, "logps/rejected": -2.113259792327881, "loss": 0.659, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5690300464630127, "rewards/margins": 0.5442299246788025, "rewards/rejected": -2.113259792327881, "sft_loss": 1.5607188940048218, "step": 2285 }, { "epoch": 1.22562301388192, "grad_norm": 10.016477380540637, "learning_rate": 7.369421680094091e-07, "logits/chosen": -0.2426680326461792, "logits/rejected": -0.07869232445955276, "logps/chosen": -1.4299724102020264, "logps/rejected": -1.9123938083648682, "loss": 0.6885, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4299724102020264, "rewards/margins": 0.4824213981628418, "rewards/rejected": -1.9123938083648682, "sft_loss": 1.4660427570343018, "step": 2290 }, { "epoch": 1.2282990466633217, "grad_norm": 4.9025494746692875, "learning_rate": 7.355695921287881e-07, "logits/chosen": -0.16157862544059753, "logits/rejected": -0.06997451931238174, "logps/chosen": -1.5286527872085571, "logps/rejected": -2.041487216949463, "loss": 0.6687, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5286527872085571, "rewards/margins": 0.5128344893455505, "rewards/rejected": -2.041487216949463, "sft_loss": 1.6155637502670288, "step": 2295 }, { "epoch": 1.2309750794447232, "grad_norm": 4.784052292853247, "learning_rate": 7.341947307509513e-07, "logits/chosen": -0.1238420233130455, "logits/rejected": 0.01755066215991974, "logps/chosen": -1.559314489364624, "logps/rejected": -1.9448274374008179, "loss": 0.6812, "rewards/accuracies": 0.625, "rewards/chosen": -1.559314489364624, "rewards/margins": 0.385513037443161, "rewards/rejected": -1.9448274374008179, "sft_loss": 1.6126524209976196, "step": 2300 }, { "epoch": 1.233651112226125, "grad_norm": 2.9649760356128496, "learning_rate": 7.328175972148094e-07, "logits/chosen": -0.18504095077514648, "logits/rejected": -0.03801523894071579, "logps/chosen": -1.7024242877960205, "logps/rejected": -2.318399429321289, "loss": 0.659, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7024242877960205, "rewards/margins": 0.6159749627113342, "rewards/rejected": -2.318399429321289, "sft_loss": 1.6507160663604736, "step": 2305 }, { "epoch": 1.2363271450075264, "grad_norm": 9.849081860091095, "learning_rate": 7.314382048813185e-07, "logits/chosen": -0.15487918257713318, "logits/rejected": 0.11469636857509613, "logps/chosen": -1.5609652996063232, "logps/rejected": -2.2454323768615723, "loss": 0.6696, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5609652996063232, "rewards/margins": 0.6844668388366699, "rewards/rejected": -2.2454323768615723, "sft_loss": 1.538551688194275, "step": 2310 }, { "epoch": 1.2390031777889279, "grad_norm": 5.162481401345401, "learning_rate": 7.300565671333486e-07, "logits/chosen": -0.1427270919084549, "logits/rejected": 0.0529114305973053, "logps/chosen": -1.6295175552368164, "logps/rejected": -2.1949503421783447, "loss": 0.6645, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6295175552368164, "rewards/margins": 0.5654329061508179, "rewards/rejected": -2.1949503421783447, "sft_loss": 1.6471545696258545, "step": 2315 }, { "epoch": 1.2416792105703296, "grad_norm": 3.5235516282761807, "learning_rate": 7.286726973755554e-07, "logits/chosen": -0.047027476131916046, "logits/rejected": -0.007574816700071096, "logps/chosen": -1.6011667251586914, "logps/rejected": -2.0129342079162598, "loss": 0.6819, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.6011667251586914, "rewards/margins": 0.41176754236221313, "rewards/rejected": -2.0129342079162598, "sft_loss": 1.5609718561172485, "step": 2320 }, { "epoch": 1.244355243351731, "grad_norm": 3.186771606317666, "learning_rate": 7.272866090342493e-07, "logits/chosen": -0.04279576614499092, "logits/rejected": 0.03906116262078285, "logps/chosen": -1.6204216480255127, "logps/rejected": -2.166551351547241, "loss": 0.673, "rewards/accuracies": 0.625, "rewards/chosen": -1.6204216480255127, "rewards/margins": 0.5461298227310181, "rewards/rejected": -2.166551351547241, "sft_loss": 1.5434439182281494, "step": 2325 }, { "epoch": 1.2470312761331326, "grad_norm": 3.401960753855456, "learning_rate": 7.258983155572656e-07, "logits/chosen": -0.19755621254444122, "logits/rejected": -0.07851502299308777, "logps/chosen": -1.576912522315979, "logps/rejected": -2.0294346809387207, "loss": 0.6715, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.576912522315979, "rewards/margins": 0.452522337436676, "rewards/rejected": -2.0294346809387207, "sft_loss": 1.6274635791778564, "step": 2330 }, { "epoch": 1.2497073089145343, "grad_norm": 4.701880233448562, "learning_rate": 7.245078304138335e-07, "logits/chosen": -0.08824042975902557, "logits/rejected": -0.016690267249941826, "logps/chosen": -1.6092437505722046, "logps/rejected": -2.20479679107666, "loss": 0.6621, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6092437505722046, "rewards/margins": 0.5955528020858765, "rewards/rejected": -2.20479679107666, "sft_loss": 1.614297866821289, "step": 2335 }, { "epoch": 1.2523833416959358, "grad_norm": 2.51524692247004, "learning_rate": 7.231151670944462e-07, "logits/chosen": -0.25718605518341064, "logits/rejected": -0.06741656363010406, "logps/chosen": -1.6128227710723877, "logps/rejected": -2.1121630668640137, "loss": 0.6764, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6128227710723877, "rewards/margins": 0.49934014678001404, "rewards/rejected": -2.1121630668640137, "sft_loss": 1.599364995956421, "step": 2340 }, { "epoch": 1.2550593744773373, "grad_norm": 4.293545314379525, "learning_rate": 7.217203391107291e-07, "logits/chosen": -0.1881771683692932, "logits/rejected": -0.01125810481607914, "logps/chosen": -1.5785601139068604, "logps/rejected": -2.2719714641571045, "loss": 0.652, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5785601139068604, "rewards/margins": 0.6934112310409546, "rewards/rejected": -2.2719714641571045, "sft_loss": 1.5927412509918213, "step": 2345 }, { "epoch": 1.257735407258739, "grad_norm": 3.0130648130551183, "learning_rate": 7.203233599953096e-07, "logits/chosen": -0.17986498773097992, "logits/rejected": -0.025739211589097977, "logps/chosen": -1.6297643184661865, "logps/rejected": -2.0613436698913574, "loss": 0.674, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6297643184661865, "rewards/margins": 0.43157950043678284, "rewards/rejected": -2.0613436698913574, "sft_loss": 1.6227772235870361, "step": 2350 }, { "epoch": 1.2604114400401405, "grad_norm": 3.5574036891629355, "learning_rate": 7.189242433016852e-07, "logits/chosen": -0.14336520433425903, "logits/rejected": 0.007078066468238831, "logps/chosen": -1.4993739128112793, "logps/rejected": -2.249967336654663, "loss": 0.6512, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4993739128112793, "rewards/margins": 0.7505934238433838, "rewards/rejected": -2.249967336654663, "sft_loss": 1.539982557296753, "step": 2355 }, { "epoch": 1.263087472821542, "grad_norm": 4.607600533757477, "learning_rate": 7.17523002604092e-07, "logits/chosen": -0.14995525777339935, "logits/rejected": -0.007469749543815851, "logps/chosen": -1.5812904834747314, "logps/rejected": -2.2936625480651855, "loss": 0.6543, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5812904834747314, "rewards/margins": 0.7123721241950989, "rewards/rejected": -2.2936625480651855, "sft_loss": 1.6329816579818726, "step": 2360 }, { "epoch": 1.2657635056029437, "grad_norm": 2.202312490868307, "learning_rate": 7.161196514973734e-07, "logits/chosen": -0.1573416292667389, "logits/rejected": -0.022275719791650772, "logps/chosen": -1.5800076723098755, "logps/rejected": -2.2222046852111816, "loss": 0.6606, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5800076723098755, "rewards/margins": 0.6421971321105957, "rewards/rejected": -2.2222046852111816, "sft_loss": 1.6377489566802979, "step": 2365 }, { "epoch": 1.2684395383843452, "grad_norm": 5.428563326051036, "learning_rate": 7.147142035968483e-07, "logits/chosen": -0.11440400034189224, "logits/rejected": 0.021290892735123634, "logps/chosen": -1.6060960292816162, "logps/rejected": -2.099410057067871, "loss": 0.664, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6060960292816162, "rewards/margins": 0.49331387877464294, "rewards/rejected": -2.099410057067871, "sft_loss": 1.6460119485855103, "step": 2370 }, { "epoch": 1.2711155711657467, "grad_norm": 2.9292725092051186, "learning_rate": 7.133066725381781e-07, "logits/chosen": -0.2844286561012268, "logits/rejected": -0.10208000987768173, "logps/chosen": -1.4620226621627808, "logps/rejected": -1.8907419443130493, "loss": 0.6757, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4620226621627808, "rewards/margins": 0.4287194609642029, "rewards/rejected": -1.8907419443130493, "sft_loss": 1.501758098602295, "step": 2375 }, { "epoch": 1.2737916039471484, "grad_norm": 4.760229882974814, "learning_rate": 7.118970719772354e-07, "logits/chosen": -0.23014874756336212, "logits/rejected": -0.02316945232450962, "logps/chosen": -1.621044397354126, "logps/rejected": -2.260775327682495, "loss": 0.6514, "rewards/accuracies": 0.65625, "rewards/chosen": -1.621044397354126, "rewards/margins": 0.639731228351593, "rewards/rejected": -2.260775327682495, "sft_loss": 1.6766067743301392, "step": 2380 }, { "epoch": 1.27646763672855, "grad_norm": 4.284439527519819, "learning_rate": 7.104854155899711e-07, "logits/chosen": -0.11533300578594208, "logits/rejected": -0.005840751342475414, "logps/chosen": -1.590293288230896, "logps/rejected": -2.0981833934783936, "loss": 0.6667, "rewards/accuracies": 0.625, "rewards/chosen": -1.590293288230896, "rewards/margins": 0.5078902244567871, "rewards/rejected": -2.0981833934783936, "sft_loss": 1.5520540475845337, "step": 2385 }, { "epoch": 1.2791436695099514, "grad_norm": 3.2520408120851583, "learning_rate": 7.090717170722817e-07, "logits/chosen": -0.13598333299160004, "logits/rejected": -0.06311875581741333, "logps/chosen": -1.5608255863189697, "logps/rejected": -2.1643850803375244, "loss": 0.6505, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5608255863189697, "rewards/margins": 0.6035594940185547, "rewards/rejected": -2.1643850803375244, "sft_loss": 1.5832816362380981, "step": 2390 }, { "epoch": 1.2818197022913531, "grad_norm": 4.450140568534266, "learning_rate": 7.076559901398762e-07, "logits/chosen": -0.29020220041275024, "logits/rejected": -0.15658116340637207, "logps/chosen": -1.4511228799819946, "logps/rejected": -2.0163347721099854, "loss": 0.6668, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4511228799819946, "rewards/margins": 0.5652118921279907, "rewards/rejected": -2.0163347721099854, "sft_loss": 1.5070421695709229, "step": 2395 }, { "epoch": 1.2844957350727546, "grad_norm": 4.555054953431903, "learning_rate": 7.062382485281436e-07, "logits/chosen": -0.1606305092573166, "logits/rejected": -0.038159389048814774, "logps/chosen": -1.4936621189117432, "logps/rejected": -1.953710913658142, "loss": 0.6715, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4936621189117432, "rewards/margins": 0.46004876494407654, "rewards/rejected": -1.953710913658142, "sft_loss": 1.5176284313201904, "step": 2400 }, { "epoch": 1.2844957350727546, "eval_logits/chosen": 0.15357214212417603, "eval_logits/rejected": 0.24874404072761536, "eval_logps/chosen": -1.5844873189926147, "eval_logps/rejected": -2.108262300491333, "eval_loss": 0.6719112396240234, "eval_rewards/accuracies": 0.637982189655304, "eval_rewards/chosen": -1.5844873189926147, "eval_rewards/margins": 0.5237749814987183, "eval_rewards/rejected": -2.108262300491333, "eval_runtime": 46.9387, "eval_samples_per_second": 28.654, "eval_sft_loss": 1.5798624753952026, "eval_steps_per_second": 7.18, "step": 2400 }, { "epoch": 1.287171767854156, "grad_norm": 3.2817263234158642, "learning_rate": 7.048185059920193e-07, "logits/chosen": -0.13636358082294464, "logits/rejected": 0.016568128019571304, "logps/chosen": -1.5544657707214355, "logps/rejected": -2.173511266708374, "loss": 0.6612, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5544657707214355, "rewards/margins": 0.6190455555915833, "rewards/rejected": -2.173511266708374, "sft_loss": 1.5676701068878174, "step": 2405 }, { "epoch": 1.2898478006355578, "grad_norm": 2.4873833817716045, "learning_rate": 7.033967763058516e-07, "logits/chosen": -0.2956455647945404, "logits/rejected": -0.09363798052072525, "logps/chosen": -1.5647292137145996, "logps/rejected": -1.8906043767929077, "loss": 0.685, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5647292137145996, "rewards/margins": 0.3258754014968872, "rewards/rejected": -1.8906043767929077, "sft_loss": 1.5583795309066772, "step": 2410 }, { "epoch": 1.2925238334169593, "grad_norm": 2.266065266546848, "learning_rate": 7.019730732632681e-07, "logits/chosen": -0.10976632684469223, "logits/rejected": -0.03128939867019653, "logps/chosen": -1.4546763896942139, "logps/rejected": -2.143770694732666, "loss": 0.6437, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4546763896942139, "rewards/margins": 0.6890941858291626, "rewards/rejected": -2.143770694732666, "sft_loss": 1.453282356262207, "step": 2415 }, { "epoch": 1.2951998661983608, "grad_norm": 4.142183019556512, "learning_rate": 7.005474106770418e-07, "logits/chosen": -0.2382328063249588, "logits/rejected": -0.10706863552331924, "logps/chosen": -1.585900068283081, "logps/rejected": -2.180056095123291, "loss": 0.6464, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.585900068283081, "rewards/margins": 0.5941557884216309, "rewards/rejected": -2.180056095123291, "sft_loss": 1.6415297985076904, "step": 2420 }, { "epoch": 1.2978758989797625, "grad_norm": 3.1261348241576195, "learning_rate": 6.991198023789577e-07, "logits/chosen": -0.13553068041801453, "logits/rejected": -0.054775677621364594, "logps/chosen": -1.4355140924453735, "logps/rejected": -1.8778979778289795, "loss": 0.6544, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4355140924453735, "rewards/margins": 0.4423840641975403, "rewards/rejected": -1.8778979778289795, "sft_loss": 1.5382192134857178, "step": 2425 }, { "epoch": 1.300551931761164, "grad_norm": 1.567158682723937, "learning_rate": 6.976902622196776e-07, "logits/chosen": -0.11800148338079453, "logits/rejected": -0.0526716485619545, "logps/chosen": -1.696176290512085, "logps/rejected": -2.1083648204803467, "loss": 0.6821, "rewards/accuracies": 0.625, "rewards/chosen": -1.696176290512085, "rewards/margins": 0.41218847036361694, "rewards/rejected": -2.1083648204803467, "sft_loss": 1.6392204761505127, "step": 2430 }, { "epoch": 1.3032279645425655, "grad_norm": 1.957028998509613, "learning_rate": 6.962588040686064e-07, "logits/chosen": -0.10932836681604385, "logits/rejected": 0.0219273678958416, "logps/chosen": -1.5512540340423584, "logps/rejected": -1.9477107524871826, "loss": 0.6736, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5512540340423584, "rewards/margins": 0.39645668864250183, "rewards/rejected": -1.9477107524871826, "sft_loss": 1.5412626266479492, "step": 2435 }, { "epoch": 1.3059039973239672, "grad_norm": 3.552787449098246, "learning_rate": 6.948254418137573e-07, "logits/chosen": -0.22610628604888916, "logits/rejected": -0.1075693815946579, "logps/chosen": -1.5580403804779053, "logps/rejected": -2.112811326980591, "loss": 0.6763, "rewards/accuracies": 0.625, "rewards/chosen": -1.5580403804779053, "rewards/margins": 0.5547709465026855, "rewards/rejected": -2.112811326980591, "sft_loss": 1.492222547531128, "step": 2440 }, { "epoch": 1.3085800301053687, "grad_norm": 2.9339544586198723, "learning_rate": 6.933901893616174e-07, "logits/chosen": -0.1766672432422638, "logits/rejected": -0.027664726600050926, "logps/chosen": -1.6293308734893799, "logps/rejected": -2.0644872188568115, "loss": 0.6705, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.6293308734893799, "rewards/margins": 0.4351561665534973, "rewards/rejected": -2.0644872188568115, "sft_loss": 1.6100572347640991, "step": 2445 }, { "epoch": 1.3112560628867704, "grad_norm": 2.6433000494373906, "learning_rate": 6.919530606370121e-07, "logits/chosen": -0.17485761642456055, "logits/rejected": -0.015716422349214554, "logps/chosen": -1.568189024925232, "logps/rejected": -2.212991714477539, "loss": 0.6637, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.568189024925232, "rewards/margins": 0.6448027491569519, "rewards/rejected": -2.212991714477539, "sft_loss": 1.5443557500839233, "step": 2450 }, { "epoch": 1.313932095668172, "grad_norm": 1.5803897852634907, "learning_rate": 6.905140695829706e-07, "logits/chosen": -0.18889057636260986, "logits/rejected": 0.06050186604261398, "logps/chosen": -1.6265077590942383, "logps/rejected": -2.2065577507019043, "loss": 0.6627, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6265077590942383, "rewards/margins": 0.580049991607666, "rewards/rejected": -2.2065577507019043, "sft_loss": 1.6225849390029907, "step": 2455 }, { "epoch": 1.3166081284495736, "grad_norm": 2.343820078187948, "learning_rate": 6.890732301605904e-07, "logits/chosen": -0.15496715903282166, "logits/rejected": -0.040690433233976364, "logps/chosen": -1.6261556148529053, "logps/rejected": -2.021845579147339, "loss": 0.666, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6261556148529053, "rewards/margins": 0.3956899642944336, "rewards/rejected": -2.021845579147339, "sft_loss": 1.609708547592163, "step": 2460 }, { "epoch": 1.3192841612309751, "grad_norm": 4.181663506103672, "learning_rate": 6.876305563489021e-07, "logits/chosen": -0.14380675554275513, "logits/rejected": -0.05919576808810234, "logps/chosen": -1.571504831314087, "logps/rejected": -2.152155876159668, "loss": 0.6648, "rewards/accuracies": 0.625, "rewards/chosen": -1.571504831314087, "rewards/margins": 0.5806511044502258, "rewards/rejected": -2.152155876159668, "sft_loss": 1.526630163192749, "step": 2465 }, { "epoch": 1.3219601940123766, "grad_norm": 6.117249686412447, "learning_rate": 6.861860621447331e-07, "logits/chosen": -0.28105050325393677, "logits/rejected": -0.14320190250873566, "logps/chosen": -1.5202324390411377, "logps/rejected": -1.9660730361938477, "loss": 0.6713, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5202324390411377, "rewards/margins": 0.4458405375480652, "rewards/rejected": -1.9660730361938477, "sft_loss": 1.5822843313217163, "step": 2470 }, { "epoch": 1.3246362267937783, "grad_norm": 3.4734110417559187, "learning_rate": 6.847397615625725e-07, "logits/chosen": -0.16598021984100342, "logits/rejected": -0.0934697836637497, "logps/chosen": -1.5609283447265625, "logps/rejected": -2.0777783393859863, "loss": 0.6628, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5609283447265625, "rewards/margins": 0.5168498754501343, "rewards/rejected": -2.0777783393859863, "sft_loss": 1.577129602432251, "step": 2475 }, { "epoch": 1.3273122595751798, "grad_norm": 2.954704936714105, "learning_rate": 6.83291668634435e-07, "logits/chosen": -0.30001145601272583, "logits/rejected": -0.11963419616222382, "logps/chosen": -1.5922280550003052, "logps/rejected": -2.2104010581970215, "loss": 0.6462, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5922280550003052, "rewards/margins": 0.6181727647781372, "rewards/rejected": -2.2104010581970215, "sft_loss": 1.666888952255249, "step": 2480 }, { "epoch": 1.3299882923565813, "grad_norm": 2.448905033086133, "learning_rate": 6.818417974097246e-07, "logits/chosen": -0.11784724146127701, "logits/rejected": 0.05674583837389946, "logps/chosen": -1.5779105424880981, "logps/rejected": -2.2519729137420654, "loss": 0.6741, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5779105424880981, "rewards/margins": 0.6740623712539673, "rewards/rejected": -2.2519729137420654, "sft_loss": 1.6603368520736694, "step": 2485 }, { "epoch": 1.332664325137983, "grad_norm": 4.414698044694562, "learning_rate": 6.803901619550981e-07, "logits/chosen": -0.22511418163776398, "logits/rejected": -0.16420219838619232, "logps/chosen": -1.6116619110107422, "logps/rejected": -2.096935749053955, "loss": 0.6599, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6116619110107422, "rewards/margins": 0.48527368903160095, "rewards/rejected": -2.096935749053955, "sft_loss": 1.6449838876724243, "step": 2490 }, { "epoch": 1.3353403579193845, "grad_norm": 3.648026469136519, "learning_rate": 6.789367763543292e-07, "logits/chosen": -0.12327933311462402, "logits/rejected": -0.10511080920696259, "logps/chosen": -1.6098442077636719, "logps/rejected": -2.1295418739318848, "loss": 0.6748, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6098442077636719, "rewards/margins": 0.5196977853775024, "rewards/rejected": -2.1295418739318848, "sft_loss": 1.6177031993865967, "step": 2495 }, { "epoch": 1.338016390700786, "grad_norm": 3.976663031956452, "learning_rate": 6.774816547081714e-07, "logits/chosen": -0.12157398462295532, "logits/rejected": 0.034795455634593964, "logps/chosen": -1.5347206592559814, "logps/rejected": -2.0359296798706055, "loss": 0.6675, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5347206592559814, "rewards/margins": 0.5012091398239136, "rewards/rejected": -2.0359296798706055, "sft_loss": 1.5952433347702026, "step": 2500 }, { "epoch": 1.3406924234821878, "grad_norm": 3.8336725585450404, "learning_rate": 6.760248111342211e-07, "logits/chosen": -0.14082691073417664, "logits/rejected": 0.03473823145031929, "logps/chosen": -1.5047399997711182, "logps/rejected": -2.0923469066619873, "loss": 0.654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5047399997711182, "rewards/margins": 0.5876072645187378, "rewards/rejected": -2.0923469066619873, "sft_loss": 1.5038917064666748, "step": 2505 }, { "epoch": 1.3433684562635893, "grad_norm": 7.052185955639947, "learning_rate": 6.745662597667813e-07, "logits/chosen": -0.21331937611103058, "logits/rejected": -0.07680104672908783, "logps/chosen": -1.4796664714813232, "logps/rejected": -2.0214037895202637, "loss": 0.6581, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4796664714813232, "rewards/margins": 0.5417372584342957, "rewards/rejected": -2.0214037895202637, "sft_loss": 1.5362671613693237, "step": 2510 }, { "epoch": 1.3460444890449907, "grad_norm": 3.397500264067202, "learning_rate": 6.731060147567236e-07, "logits/chosen": -0.1217583566904068, "logits/rejected": 0.0007851526024751365, "logps/chosen": -1.579552412033081, "logps/rejected": -2.0797359943389893, "loss": 0.6742, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.579552412033081, "rewards/margins": 0.5001831650733948, "rewards/rejected": -2.0797359943389893, "sft_loss": 1.6408942937850952, "step": 2515 }, { "epoch": 1.3487205218263925, "grad_norm": 1.8049247352380564, "learning_rate": 6.716440902713515e-07, "logits/chosen": -0.2258315086364746, "logits/rejected": -0.1402186155319214, "logps/chosen": -1.5952908992767334, "logps/rejected": -2.016566514968872, "loss": 0.6693, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5952908992767334, "rewards/margins": 0.4212755560874939, "rewards/rejected": -2.016566514968872, "sft_loss": 1.4963607788085938, "step": 2520 }, { "epoch": 1.351396554607794, "grad_norm": 7.382090470127641, "learning_rate": 6.701805004942627e-07, "logits/chosen": -0.20450296998023987, "logits/rejected": -0.1239570751786232, "logps/chosen": -1.602783203125, "logps/rejected": -2.1566102504730225, "loss": 0.6528, "rewards/accuracies": 0.65625, "rewards/chosen": -1.602783203125, "rewards/margins": 0.553827166557312, "rewards/rejected": -2.1566102504730225, "sft_loss": 1.7019307613372803, "step": 2525 }, { "epoch": 1.3540725873891954, "grad_norm": 9.4844763217049, "learning_rate": 6.687152596252119e-07, "logits/chosen": -0.22892682254314423, "logits/rejected": -0.17251577973365784, "logps/chosen": -1.6106479167938232, "logps/rejected": -2.0148985385894775, "loss": 0.6758, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6106479167938232, "rewards/margins": 0.4042506217956543, "rewards/rejected": -2.0148985385894775, "sft_loss": 1.6035655736923218, "step": 2530 }, { "epoch": 1.3567486201705972, "grad_norm": 3.352412444441813, "learning_rate": 6.672483818799722e-07, "logits/chosen": -0.26767629384994507, "logits/rejected": -0.11004316806793213, "logps/chosen": -1.5166822671890259, "logps/rejected": -2.0801236629486084, "loss": 0.6687, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5166822671890259, "rewards/margins": 0.5634415745735168, "rewards/rejected": -2.0801236629486084, "sft_loss": 1.5298277139663696, "step": 2535 }, { "epoch": 1.3594246529519987, "grad_norm": 14.55625660152246, "learning_rate": 6.657798814901978e-07, "logits/chosen": -0.19522763788700104, "logits/rejected": -0.014734324999153614, "logps/chosen": -1.7307504415512085, "logps/rejected": -2.087800979614258, "loss": 0.6776, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.7307504415512085, "rewards/margins": 0.3570505976676941, "rewards/rejected": -2.087800979614258, "sft_loss": 1.6963192224502563, "step": 2540 }, { "epoch": 1.3621006857334002, "grad_norm": 4.187645415508513, "learning_rate": 6.643097727032863e-07, "logits/chosen": -0.18527694046497345, "logits/rejected": 0.0016733307857066393, "logps/chosen": -1.5385725498199463, "logps/rejected": -2.181457281112671, "loss": 0.6652, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5385725498199463, "rewards/margins": 0.6428850293159485, "rewards/rejected": -2.181457281112671, "sft_loss": 1.5796239376068115, "step": 2545 }, { "epoch": 1.3647767185148019, "grad_norm": 2.373501532859784, "learning_rate": 6.628380697822392e-07, "logits/chosen": -0.19420263171195984, "logits/rejected": -0.027106571942567825, "logps/chosen": -1.5866892337799072, "logps/rejected": -1.9564670324325562, "loss": 0.6795, "rewards/accuracies": 0.625, "rewards/chosen": -1.5866892337799072, "rewards/margins": 0.3697775602340698, "rewards/rejected": -1.9564670324325562, "sft_loss": 1.5613502264022827, "step": 2550 }, { "epoch": 1.3674527512962034, "grad_norm": 3.7834706241934524, "learning_rate": 6.61364787005525e-07, "logits/chosen": -0.15926238894462585, "logits/rejected": -0.0661858469247818, "logps/chosen": -1.4821274280548096, "logps/rejected": -2.1808743476867676, "loss": 0.6528, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4821274280548096, "rewards/margins": 0.698746919631958, "rewards/rejected": -2.1808743476867676, "sft_loss": 1.5234520435333252, "step": 2555 }, { "epoch": 1.3701287840776049, "grad_norm": 3.583960186561033, "learning_rate": 6.598899386669395e-07, "logits/chosen": -0.1691628098487854, "logits/rejected": -0.04530046880245209, "logps/chosen": -1.5531885623931885, "logps/rejected": -2.1100735664367676, "loss": 0.6622, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5531885623931885, "rewards/margins": 0.5568851828575134, "rewards/rejected": -2.1100735664367676, "sft_loss": 1.5481550693511963, "step": 2560 }, { "epoch": 1.3728048168590066, "grad_norm": 13.56875223974927, "learning_rate": 6.584135390754679e-07, "logits/chosen": -0.18169988691806793, "logits/rejected": -0.05722617357969284, "logps/chosen": -1.5489635467529297, "logps/rejected": -2.208829402923584, "loss": 0.6621, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5489635467529297, "rewards/margins": 0.6598660349845886, "rewards/rejected": -2.208829402923584, "sft_loss": 1.5578101873397827, "step": 2565 }, { "epoch": 1.375480849640408, "grad_norm": 3.269721055203262, "learning_rate": 6.569356025551454e-07, "logits/chosen": -0.12570646405220032, "logits/rejected": -0.04687528684735298, "logps/chosen": -1.5527559518814087, "logps/rejected": -2.0612478256225586, "loss": 0.665, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5527559518814087, "rewards/margins": 0.5084918737411499, "rewards/rejected": -2.0612478256225586, "sft_loss": 1.535414457321167, "step": 2570 }, { "epoch": 1.3781568824218096, "grad_norm": 7.6401000061639195, "learning_rate": 6.554561434449186e-07, "logits/chosen": -0.2719722390174866, "logits/rejected": -0.1282080113887787, "logps/chosen": -1.5063210725784302, "logps/rejected": -2.0476253032684326, "loss": 0.6635, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5063210725784302, "rewards/margins": 0.5413042306900024, "rewards/rejected": -2.0476253032684326, "sft_loss": 1.5388996601104736, "step": 2575 }, { "epoch": 1.3808329152032113, "grad_norm": 5.53193179030059, "learning_rate": 6.539751760985063e-07, "logits/chosen": -0.2048530876636505, "logits/rejected": -0.12116770446300507, "logps/chosen": -1.559245228767395, "logps/rejected": -1.8791849613189697, "loss": 0.6719, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.559245228767395, "rewards/margins": 0.31993982195854187, "rewards/rejected": -1.8791849613189697, "sft_loss": 1.5968759059906006, "step": 2580 }, { "epoch": 1.3835089479846128, "grad_norm": 5.8336608420197775, "learning_rate": 6.524927148842602e-07, "logits/chosen": -0.11024793237447739, "logits/rejected": 0.04365754872560501, "logps/chosen": -1.4740869998931885, "logps/rejected": -2.0004656314849854, "loss": 0.6473, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4740869998931885, "rewards/margins": 0.5263785719871521, "rewards/rejected": -2.0004656314849854, "sft_loss": 1.4342414140701294, "step": 2585 }, { "epoch": 1.3861849807660143, "grad_norm": 4.838704544675485, "learning_rate": 6.510087741850254e-07, "logits/chosen": -0.21626317501068115, "logits/rejected": -0.07905010879039764, "logps/chosen": -1.4662044048309326, "logps/rejected": -1.9638229608535767, "loss": 0.667, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4662044048309326, "rewards/margins": 0.49761828780174255, "rewards/rejected": -1.9638229608535767, "sft_loss": 1.5521126985549927, "step": 2590 }, { "epoch": 1.388861013547416, "grad_norm": 5.6321716050921475, "learning_rate": 6.495233683980012e-07, "logits/chosen": -0.1686750203371048, "logits/rejected": -0.11953876912593842, "logps/chosen": -1.5056140422821045, "logps/rejected": -1.9153554439544678, "loss": 0.6725, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5056140422821045, "rewards/margins": 0.4097414016723633, "rewards/rejected": -1.9153554439544678, "sft_loss": 1.4823278188705444, "step": 2595 }, { "epoch": 1.3915370463288175, "grad_norm": 5.256141485844498, "learning_rate": 6.480365119346011e-07, "logits/chosen": -0.07481982558965683, "logits/rejected": 0.05477939918637276, "logps/chosen": -1.5007654428482056, "logps/rejected": -1.840966820716858, "loss": 0.6668, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5007654428482056, "rewards/margins": 0.340201199054718, "rewards/rejected": -1.840966820716858, "sft_loss": 1.5125921964645386, "step": 2600 }, { "epoch": 1.394213079110219, "grad_norm": 2.4415775706067118, "learning_rate": 6.465482192203129e-07, "logits/chosen": -0.0970289334654808, "logits/rejected": -0.04464380070567131, "logps/chosen": -1.5349600315093994, "logps/rejected": -1.9589964151382446, "loss": 0.6791, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5349600315093994, "rewards/margins": 0.42403656244277954, "rewards/rejected": -1.9589964151382446, "sft_loss": 1.5748927593231201, "step": 2605 }, { "epoch": 1.3968891118916207, "grad_norm": 4.649699563839675, "learning_rate": 6.45058504694559e-07, "logits/chosen": -0.07690870761871338, "logits/rejected": -0.000336170953232795, "logps/chosen": -1.5294123888015747, "logps/rejected": -1.9836593866348267, "loss": 0.6744, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5294123888015747, "rewards/margins": 0.4542468190193176, "rewards/rejected": -1.9836593866348267, "sft_loss": 1.5801916122436523, "step": 2610 }, { "epoch": 1.3995651446730222, "grad_norm": 6.0153698780292, "learning_rate": 6.435673828105564e-07, "logits/chosen": -0.1915212720632553, "logits/rejected": -0.04982220381498337, "logps/chosen": -1.463560938835144, "logps/rejected": -2.0906307697296143, "loss": 0.6569, "rewards/accuracies": 0.625, "rewards/chosen": -1.463560938835144, "rewards/margins": 0.6270699501037598, "rewards/rejected": -2.0906307697296143, "sft_loss": 1.5472733974456787, "step": 2615 }, { "epoch": 1.402241177454424, "grad_norm": 4.624166029738389, "learning_rate": 6.420748680351763e-07, "logits/chosen": -0.18850019574165344, "logits/rejected": -0.19020205736160278, "logps/chosen": -1.5914949178695679, "logps/rejected": -1.8966413736343384, "loss": 0.6831, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.5914949178695679, "rewards/margins": 0.3051464855670929, "rewards/rejected": -1.8966413736343384, "sft_loss": 1.6427654027938843, "step": 2620 }, { "epoch": 1.4049172102358254, "grad_norm": 2.0745035306044537, "learning_rate": 6.405809748488032e-07, "logits/chosen": -0.15994513034820557, "logits/rejected": -0.00917014293372631, "logps/chosen": -1.6022281646728516, "logps/rejected": -2.0769033432006836, "loss": 0.6677, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6022281646728516, "rewards/margins": 0.47467517852783203, "rewards/rejected": -2.0769033432006836, "sft_loss": 1.5459787845611572, "step": 2625 }, { "epoch": 1.4075932430172269, "grad_norm": 4.20200577999301, "learning_rate": 6.390857177451956e-07, "logits/chosen": -0.31244519352912903, "logits/rejected": -0.11458522081375122, "logps/chosen": -1.5718451738357544, "logps/rejected": -2.1408886909484863, "loss": 0.6771, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5718451738357544, "rewards/margins": 0.5690435171127319, "rewards/rejected": -2.1408886909484863, "sft_loss": 1.5779874324798584, "step": 2630 }, { "epoch": 1.4102692757986286, "grad_norm": 3.4210463268097926, "learning_rate": 6.375891112313445e-07, "logits/chosen": -0.241429403424263, "logits/rejected": -0.1481015533208847, "logps/chosen": -1.5252244472503662, "logps/rejected": -2.0902552604675293, "loss": 0.6601, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5252244472503662, "rewards/margins": 0.5650309324264526, "rewards/rejected": -2.0902552604675293, "sft_loss": 1.5835373401641846, "step": 2635 }, { "epoch": 1.41294530858003, "grad_norm": 4.097976614384087, "learning_rate": 6.360911698273326e-07, "logits/chosen": -0.16206932067871094, "logits/rejected": -0.07781902700662613, "logps/chosen": -1.6320310831069946, "logps/rejected": -2.0864644050598145, "loss": 0.6729, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6320310831069946, "rewards/margins": 0.4544333517551422, "rewards/rejected": -2.0864644050598145, "sft_loss": 1.5919148921966553, "step": 2640 }, { "epoch": 1.4156213413614318, "grad_norm": 5.291704044136723, "learning_rate": 6.345919080661944e-07, "logits/chosen": -0.1960441768169403, "logits/rejected": -0.11005222797393799, "logps/chosen": -1.5638240575790405, "logps/rejected": -2.1008219718933105, "loss": 0.6674, "rewards/accuracies": 0.625, "rewards/chosen": -1.5638240575790405, "rewards/margins": 0.5369978547096252, "rewards/rejected": -2.1008219718933105, "sft_loss": 1.5387166738510132, "step": 2645 }, { "epoch": 1.4182973741428333, "grad_norm": 5.5685439824404845, "learning_rate": 6.330913404937737e-07, "logits/chosen": -0.2559325098991394, "logits/rejected": -0.10011889785528183, "logps/chosen": -1.5849932432174683, "logps/rejected": -2.3768601417541504, "loss": 0.6778, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5849932432174683, "rewards/margins": 0.791866660118103, "rewards/rejected": -2.3768601417541504, "sft_loss": 1.5990684032440186, "step": 2650 }, { "epoch": 1.4209734069242348, "grad_norm": 8.078595435314279, "learning_rate": 6.315894816685838e-07, "logits/chosen": -0.16147413849830627, "logits/rejected": -0.006514549255371094, "logps/chosen": -1.505885362625122, "logps/rejected": -2.0246315002441406, "loss": 0.6516, "rewards/accuracies": 0.6875, "rewards/chosen": -1.505885362625122, "rewards/margins": 0.5187458992004395, "rewards/rejected": -2.0246315002441406, "sft_loss": 1.5155709981918335, "step": 2655 }, { "epoch": 1.4236494397056365, "grad_norm": 5.341300478947878, "learning_rate": 6.300863461616657e-07, "logits/chosen": -0.11378873884677887, "logits/rejected": -0.06429148465394974, "logps/chosen": -1.5142958164215088, "logps/rejected": -2.185347080230713, "loss": 0.6616, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5142958164215088, "rewards/margins": 0.6710509061813354, "rewards/rejected": -2.185347080230713, "sft_loss": 1.5100260972976685, "step": 2660 }, { "epoch": 1.426325472487038, "grad_norm": 3.176988277206947, "learning_rate": 6.285819485564465e-07, "logits/chosen": -0.2651599049568176, "logits/rejected": -0.13692238926887512, "logps/chosen": -1.6227686405181885, "logps/rejected": -2.251343011856079, "loss": 0.6522, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6227686405181885, "rewards/margins": 0.6285744309425354, "rewards/rejected": -2.251343011856079, "sft_loss": 1.6871635913848877, "step": 2665 }, { "epoch": 1.4290015052684395, "grad_norm": 2.883049496689386, "learning_rate": 6.270763034485986e-07, "logits/chosen": -0.12162493169307709, "logits/rejected": -0.021850673481822014, "logps/chosen": -1.6583884954452515, "logps/rejected": -2.1100821495056152, "loss": 0.6593, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6583884954452515, "rewards/margins": 0.45169371366500854, "rewards/rejected": -2.1100821495056152, "sft_loss": 1.6291186809539795, "step": 2670 }, { "epoch": 1.4316775380498412, "grad_norm": 12.871054148089197, "learning_rate": 6.255694254458972e-07, "logits/chosen": -0.2270089089870453, "logits/rejected": -0.07251622527837753, "logps/chosen": -1.7089569568634033, "logps/rejected": -2.04795503616333, "loss": 0.6807, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.7089569568634033, "rewards/margins": 0.3389982581138611, "rewards/rejected": -2.04795503616333, "sft_loss": 1.5236625671386719, "step": 2675 }, { "epoch": 1.4343535708312427, "grad_norm": 6.283613019960803, "learning_rate": 6.240613291680795e-07, "logits/chosen": -0.2575947046279907, "logits/rejected": -0.08951815217733383, "logps/chosen": -1.582740068435669, "logps/rejected": -2.117541551589966, "loss": 0.6733, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.582740068435669, "rewards/margins": 0.5348014831542969, "rewards/rejected": -2.117541551589966, "sft_loss": 1.5981295108795166, "step": 2680 }, { "epoch": 1.4370296036126442, "grad_norm": 6.264530990495215, "learning_rate": 6.225520292467021e-07, "logits/chosen": -0.20818033814430237, "logits/rejected": 0.002076371107250452, "logps/chosen": -1.5417582988739014, "logps/rejected": -2.0970585346221924, "loss": 0.6658, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5417582988739014, "rewards/margins": 0.5553001761436462, "rewards/rejected": -2.0970585346221924, "sft_loss": 1.5650211572647095, "step": 2685 }, { "epoch": 1.439705636394046, "grad_norm": 1.9064429797923519, "learning_rate": 6.210415403249993e-07, "logits/chosen": -0.3717069625854492, "logits/rejected": -0.1375175565481186, "logps/chosen": -1.6472127437591553, "logps/rejected": -2.391467332839966, "loss": 0.6576, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6472127437591553, "rewards/margins": 0.7442543506622314, "rewards/rejected": -2.391467332839966, "sft_loss": 1.5960744619369507, "step": 2690 }, { "epoch": 1.4423816691754474, "grad_norm": 2.5469317246859338, "learning_rate": 6.195298770577415e-07, "logits/chosen": -0.18331435322761536, "logits/rejected": -0.15281887352466583, "logps/chosen": -1.5887569189071655, "logps/rejected": -2.1782190799713135, "loss": 0.67, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5887569189071655, "rewards/margins": 0.5894622206687927, "rewards/rejected": -2.1782190799713135, "sft_loss": 1.5789341926574707, "step": 2695 }, { "epoch": 1.445057701956849, "grad_norm": 5.17353516640267, "learning_rate": 6.180170541110923e-07, "logits/chosen": -0.2531064450740814, "logits/rejected": -0.06976678222417831, "logps/chosen": -1.6449716091156006, "logps/rejected": -2.1559338569641113, "loss": 0.6767, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.6449716091156006, "rewards/margins": 0.5109620690345764, "rewards/rejected": -2.1559338569641113, "sft_loss": 1.663458228111267, "step": 2700 }, { "epoch": 1.4477337347382506, "grad_norm": 2.741290627931337, "learning_rate": 6.165030861624663e-07, "logits/chosen": -0.25593826174736023, "logits/rejected": -0.043508801609277725, "logps/chosen": -1.453940749168396, "logps/rejected": -2.434394598007202, "loss": 0.6342, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.453940749168396, "rewards/margins": 0.9804538488388062, "rewards/rejected": -2.434394598007202, "sft_loss": 1.415795087814331, "step": 2705 }, { "epoch": 1.4504097675196521, "grad_norm": 3.1028823700547408, "learning_rate": 6.149879879003876e-07, "logits/chosen": -0.14798538386821747, "logits/rejected": -0.11400546878576279, "logps/chosen": -1.5622608661651611, "logps/rejected": -2.07262921333313, "loss": 0.6499, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5622608661651611, "rewards/margins": 0.5103680491447449, "rewards/rejected": -2.07262921333313, "sft_loss": 1.5270684957504272, "step": 2710 }, { "epoch": 1.4530858003010536, "grad_norm": 4.332292650790028, "learning_rate": 6.13471774024346e-07, "logits/chosen": -0.29937106370925903, "logits/rejected": -0.19251208007335663, "logps/chosen": -1.4233131408691406, "logps/rejected": -1.9493812322616577, "loss": 0.6512, "rewards/accuracies": 0.625, "rewards/chosen": -1.4233131408691406, "rewards/margins": 0.5260680913925171, "rewards/rejected": -1.9493812322616577, "sft_loss": 1.5380337238311768, "step": 2715 }, { "epoch": 1.4557618330824553, "grad_norm": 7.4402227586030065, "learning_rate": 6.119544592446551e-07, "logits/chosen": -0.25485068559646606, "logits/rejected": -0.13746242225170135, "logps/chosen": -1.5280907154083252, "logps/rejected": -1.8951940536499023, "loss": 0.6741, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5280907154083252, "rewards/margins": 0.3671031594276428, "rewards/rejected": -1.8951940536499023, "sft_loss": 1.4867727756500244, "step": 2720 }, { "epoch": 1.4584378658638568, "grad_norm": 4.465298263051977, "learning_rate": 6.104360582823096e-07, "logits/chosen": -0.2098223716020584, "logits/rejected": -0.10190926492214203, "logps/chosen": -1.4999598264694214, "logps/rejected": -2.030771255493164, "loss": 0.6535, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4999598264694214, "rewards/margins": 0.5308112502098083, "rewards/rejected": -2.030771255493164, "sft_loss": 1.4926955699920654, "step": 2725 }, { "epoch": 1.4611138986452583, "grad_norm": 5.828776774520263, "learning_rate": 6.089165858688423e-07, "logits/chosen": -0.2677415609359741, "logits/rejected": -0.09359410405158997, "logps/chosen": -1.4975014925003052, "logps/rejected": -2.195223093032837, "loss": 0.66, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4975014925003052, "rewards/margins": 0.6977213621139526, "rewards/rejected": -2.195223093032837, "sft_loss": 1.507609486579895, "step": 2730 }, { "epoch": 1.46378993142666, "grad_norm": 6.085674622259741, "learning_rate": 6.073960567461811e-07, "logits/chosen": -0.25986337661743164, "logits/rejected": -0.06538162380456924, "logps/chosen": -1.4231359958648682, "logps/rejected": -2.107517719268799, "loss": 0.6484, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4231359958648682, "rewards/margins": 0.6843816637992859, "rewards/rejected": -2.107517719268799, "sft_loss": 1.4849355220794678, "step": 2735 }, { "epoch": 1.4664659642080615, "grad_norm": 5.793144292379904, "learning_rate": 6.058744856665065e-07, "logits/chosen": -0.2694844603538513, "logits/rejected": -0.1628873646259308, "logps/chosen": -1.5163061618804932, "logps/rejected": -2.3208680152893066, "loss": 0.6526, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5163061618804932, "rewards/margins": 0.804561972618103, "rewards/rejected": -2.3208680152893066, "sft_loss": 1.5444607734680176, "step": 2740 }, { "epoch": 1.469141996989463, "grad_norm": 4.465041295648728, "learning_rate": 6.043518873921074e-07, "logits/chosen": -0.2704235911369324, "logits/rejected": -0.14123213291168213, "logps/chosen": -1.4960787296295166, "logps/rejected": -1.9699077606201172, "loss": 0.6543, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4960787296295166, "rewards/margins": 0.4738289713859558, "rewards/rejected": -1.9699077606201172, "sft_loss": 1.4975306987762451, "step": 2745 }, { "epoch": 1.4718180297708647, "grad_norm": 3.7597880435407323, "learning_rate": 6.028282766952393e-07, "logits/chosen": -0.24096722900867462, "logits/rejected": -0.13594664633274078, "logps/chosen": -1.5836585760116577, "logps/rejected": -2.232564687728882, "loss": 0.6445, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5836585760116577, "rewards/margins": 0.6489061117172241, "rewards/rejected": -2.232564687728882, "sft_loss": 1.5925681591033936, "step": 2750 }, { "epoch": 1.4744940625522662, "grad_norm": 8.458017615838024, "learning_rate": 6.013036683579798e-07, "logits/chosen": -0.1675444394350052, "logits/rejected": -0.02611861564218998, "logps/chosen": -1.4565662145614624, "logps/rejected": -1.983507513999939, "loss": 0.6629, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4565662145614624, "rewards/margins": 0.5269410610198975, "rewards/rejected": -1.983507513999939, "sft_loss": 1.5315477848052979, "step": 2755 }, { "epoch": 1.4771700953336677, "grad_norm": 3.2438935797505413, "learning_rate": 5.997780771720854e-07, "logits/chosen": -0.2979353070259094, "logits/rejected": -0.12887240946292877, "logps/chosen": -1.5524100065231323, "logps/rejected": -2.2004122734069824, "loss": 0.6605, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5524100065231323, "rewards/margins": 0.6480022072792053, "rewards/rejected": -2.2004122734069824, "sft_loss": 1.546685814857483, "step": 2760 }, { "epoch": 1.4798461281150694, "grad_norm": 4.32323300694089, "learning_rate": 5.982515179388486e-07, "logits/chosen": -0.20667262375354767, "logits/rejected": -0.07070144265890121, "logps/chosen": -1.4959765672683716, "logps/rejected": -1.9927799701690674, "loss": 0.6459, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4959765672683716, "rewards/margins": 0.4968033730983734, "rewards/rejected": -1.9927799701690674, "sft_loss": 1.5539121627807617, "step": 2765 }, { "epoch": 1.482522160896471, "grad_norm": 8.115148746232252, "learning_rate": 5.967240054689541e-07, "logits/chosen": -0.28089088201522827, "logits/rejected": -0.2090112864971161, "logps/chosen": -1.451894760131836, "logps/rejected": -1.7988488674163818, "loss": 0.6661, "rewards/accuracies": 0.65625, "rewards/chosen": -1.451894760131836, "rewards/margins": 0.34695425629615784, "rewards/rejected": -1.7988488674163818, "sft_loss": 1.5190246105194092, "step": 2770 }, { "epoch": 1.4851981936778724, "grad_norm": 2.9528466265049724, "learning_rate": 5.951955545823342e-07, "logits/chosen": -0.24385952949523926, "logits/rejected": -0.19614621996879578, "logps/chosen": -1.504969835281372, "logps/rejected": -2.081859588623047, "loss": 0.6539, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.504969835281372, "rewards/margins": 0.5768897533416748, "rewards/rejected": -2.081859588623047, "sft_loss": 1.5464751720428467, "step": 2775 }, { "epoch": 1.4878742264592741, "grad_norm": 2.556932366962606, "learning_rate": 5.936661801080263e-07, "logits/chosen": -0.23676061630249023, "logits/rejected": -0.1409497857093811, "logps/chosen": -1.6686662435531616, "logps/rejected": -2.109144449234009, "loss": 0.6907, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.6686662435531616, "rewards/margins": 0.44047823548316956, "rewards/rejected": -2.109144449234009, "sft_loss": 1.6145015954971313, "step": 2780 }, { "epoch": 1.4905502592406756, "grad_norm": 6.677454504540487, "learning_rate": 5.92135896884028e-07, "logits/chosen": -0.28398019075393677, "logits/rejected": -0.1465751677751541, "logps/chosen": -1.6540460586547852, "logps/rejected": -2.3016133308410645, "loss": 0.6763, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6540460586547852, "rewards/margins": 0.6475671529769897, "rewards/rejected": -2.3016133308410645, "sft_loss": 1.6323429346084595, "step": 2785 }, { "epoch": 1.4932262920220774, "grad_norm": 5.837506585015335, "learning_rate": 5.906047197571541e-07, "logits/chosen": -0.21055534482002258, "logits/rejected": -0.22708992660045624, "logps/chosen": -1.5476744174957275, "logps/rejected": -2.052155017852783, "loss": 0.6651, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5476744174957275, "rewards/margins": 0.5044804215431213, "rewards/rejected": -2.052155017852783, "sft_loss": 1.6932910680770874, "step": 2790 }, { "epoch": 1.4959023248034788, "grad_norm": 3.794310744789843, "learning_rate": 5.890726635828919e-07, "logits/chosen": -0.11941961199045181, "logits/rejected": -0.1020720824599266, "logps/chosen": -1.5011706352233887, "logps/rejected": -2.019202709197998, "loss": 0.672, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5011706352233887, "rewards/margins": 0.5180321931838989, "rewards/rejected": -2.019202709197998, "sft_loss": 1.484691858291626, "step": 2795 }, { "epoch": 1.4985783575848803, "grad_norm": 3.05795063441273, "learning_rate": 5.875397432252569e-07, "logits/chosen": -0.28632354736328125, "logits/rejected": -0.19636675715446472, "logps/chosen": -1.623683214187622, "logps/rejected": -2.1551148891448975, "loss": 0.6658, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.623683214187622, "rewards/margins": 0.5314315557479858, "rewards/rejected": -2.1551148891448975, "sft_loss": 1.6839704513549805, "step": 2800 }, { "epoch": 1.4985783575848803, "eval_logits/chosen": 0.025673363357782364, "eval_logits/rejected": 0.11081632226705551, "eval_logps/chosen": -1.6197056770324707, "eval_logps/rejected": -2.181790590286255, "eval_loss": 0.6707143783569336, "eval_rewards/accuracies": 0.6454005837440491, "eval_rewards/chosen": -1.6197056770324707, "eval_rewards/margins": 0.5620848536491394, "eval_rewards/rejected": -2.181790590286255, "eval_runtime": 46.4594, "eval_samples_per_second": 28.95, "eval_sft_loss": 1.6055032014846802, "eval_steps_per_second": 7.254, "step": 2800 }, { "epoch": 1.5012543903662818, "grad_norm": 2.0226207098379314, "learning_rate": 5.860059735566491e-07, "logits/chosen": -0.39112040400505066, "logits/rejected": -0.24808260798454285, "logps/chosen": -1.4235303401947021, "logps/rejected": -1.9543339014053345, "loss": 0.6509, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4235303401947021, "rewards/margins": 0.5308033227920532, "rewards/rejected": -1.9543339014053345, "sft_loss": 1.4845623970031738, "step": 2805 }, { "epoch": 1.5039304231476835, "grad_norm": 9.915134760660942, "learning_rate": 5.844713694577087e-07, "logits/chosen": -0.22110262513160706, "logits/rejected": -0.17764215171337128, "logps/chosen": -1.5744935274124146, "logps/rejected": -2.0595247745513916, "loss": 0.6595, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5744935274124146, "rewards/margins": 0.4850311875343323, "rewards/rejected": -2.0595247745513916, "sft_loss": 1.6815903186798096, "step": 2810 }, { "epoch": 1.5066064559290853, "grad_norm": 3.022713285905784, "learning_rate": 5.829359458171714e-07, "logits/chosen": -0.1872178614139557, "logits/rejected": -0.06712029874324799, "logps/chosen": -1.5897722244262695, "logps/rejected": -2.178109645843506, "loss": 0.6587, "rewards/accuracies": 0.625, "rewards/chosen": -1.5897722244262695, "rewards/margins": 0.5883373022079468, "rewards/rejected": -2.178109645843506, "sft_loss": 1.5976924896240234, "step": 2815 }, { "epoch": 1.5092824887104868, "grad_norm": 3.716607098661702, "learning_rate": 5.81399717531724e-07, "logits/chosen": -0.21846675872802734, "logits/rejected": -0.046916164457798004, "logps/chosen": -1.58573317527771, "logps/rejected": -2.0916264057159424, "loss": 0.6722, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.58573317527771, "rewards/margins": 0.5058929324150085, "rewards/rejected": -2.0916264057159424, "sft_loss": 1.5854836702346802, "step": 2820 }, { "epoch": 1.5119585214918883, "grad_norm": 3.4725896292535148, "learning_rate": 5.798626995058602e-07, "logits/chosen": -0.26726624369621277, "logits/rejected": -0.08334805816411972, "logps/chosen": -1.6209615468978882, "logps/rejected": -2.3004868030548096, "loss": 0.6695, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6209615468978882, "rewards/margins": 0.6795251965522766, "rewards/rejected": -2.3004868030548096, "sft_loss": 1.5951423645019531, "step": 2825 }, { "epoch": 1.51463455427329, "grad_norm": 9.544540172332685, "learning_rate": 5.783249066517354e-07, "logits/chosen": -0.21872110664844513, "logits/rejected": -0.06208853796124458, "logps/chosen": -1.643139123916626, "logps/rejected": -2.0365586280822754, "loss": 0.695, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.643139123916626, "rewards/margins": 0.39341965317726135, "rewards/rejected": -2.0365586280822754, "sft_loss": 1.676015853881836, "step": 2830 }, { "epoch": 1.5173105870546915, "grad_norm": 5.649184349410335, "learning_rate": 5.767863538890228e-07, "logits/chosen": -0.2260381281375885, "logits/rejected": -0.06770346313714981, "logps/chosen": -1.5730047225952148, "logps/rejected": -2.150031089782715, "loss": 0.6663, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5730047225952148, "rewards/margins": 0.5770264863967896, "rewards/rejected": -2.150031089782715, "sft_loss": 1.583595871925354, "step": 2835 }, { "epoch": 1.519986619836093, "grad_norm": 7.166373629229626, "learning_rate": 5.75247056144768e-07, "logits/chosen": -0.20473213493824005, "logits/rejected": -0.10485156625509262, "logps/chosen": -1.5947294235229492, "logps/rejected": -1.987549066543579, "loss": 0.6681, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5947294235229492, "rewards/margins": 0.39281970262527466, "rewards/rejected": -1.987549066543579, "sft_loss": 1.6280778646469116, "step": 2840 }, { "epoch": 1.5226626526174947, "grad_norm": 5.79631646232252, "learning_rate": 5.737070283532444e-07, "logits/chosen": -0.17899790406227112, "logits/rejected": -0.0887211412191391, "logps/chosen": -1.6026455163955688, "logps/rejected": -2.146700859069824, "loss": 0.6727, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6026455163955688, "rewards/margins": 0.5440553426742554, "rewards/rejected": -2.146700859069824, "sft_loss": 1.540010690689087, "step": 2845 }, { "epoch": 1.5253386853988962, "grad_norm": 11.689193206305001, "learning_rate": 5.721662854558084e-07, "logits/chosen": -0.2543259561061859, "logits/rejected": -0.17087538540363312, "logps/chosen": -1.5911014080047607, "logps/rejected": -2.1432793140411377, "loss": 0.6798, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5911014080047607, "rewards/margins": 0.5521779656410217, "rewards/rejected": -2.1432793140411377, "sft_loss": 1.6015312671661377, "step": 2850 }, { "epoch": 1.5280147181802977, "grad_norm": 2.7080491563427267, "learning_rate": 5.706248424007545e-07, "logits/chosen": -0.2648487687110901, "logits/rejected": -0.09542088210582733, "logps/chosen": -1.7155059576034546, "logps/rejected": -2.2239832878112793, "loss": 0.6788, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.7155059576034546, "rewards/margins": 0.5084772109985352, "rewards/rejected": -2.2239832878112793, "sft_loss": 1.7024424076080322, "step": 2855 }, { "epoch": 1.5306907509616994, "grad_norm": 2.878698675596129, "learning_rate": 5.690827141431699e-07, "logits/chosen": -0.3109549880027771, "logits/rejected": -0.12017013877630234, "logps/chosen": -1.6655937433242798, "logps/rejected": -2.110398769378662, "loss": 0.6693, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6655937433242798, "rewards/margins": 0.44480523467063904, "rewards/rejected": -2.110398769378662, "sft_loss": 1.5922762155532837, "step": 2860 }, { "epoch": 1.5333667837431009, "grad_norm": 6.031810915102935, "learning_rate": 5.675399156447897e-07, "logits/chosen": -0.31219014525413513, "logits/rejected": -0.16434085369110107, "logps/chosen": -1.5579341650009155, "logps/rejected": -2.12554669380188, "loss": 0.6829, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5579341650009155, "rewards/margins": 0.56761234998703, "rewards/rejected": -2.12554669380188, "sft_loss": 1.6126997470855713, "step": 2865 }, { "epoch": 1.5360428165245024, "grad_norm": 6.298021089512181, "learning_rate": 5.659964618738515e-07, "logits/chosen": -0.22607514262199402, "logits/rejected": -0.09791434556245804, "logps/chosen": -1.5956169366836548, "logps/rejected": -1.9712276458740234, "loss": 0.6828, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5956169366836548, "rewards/margins": 0.3756106197834015, "rewards/rejected": -1.9712276458740234, "sft_loss": 1.5619713068008423, "step": 2870 }, { "epoch": 1.538718849305904, "grad_norm": 3.0670744673020494, "learning_rate": 5.644523678049509e-07, "logits/chosen": -0.23713460564613342, "logits/rejected": -0.12866348028182983, "logps/chosen": -1.5899370908737183, "logps/rejected": -2.066519260406494, "loss": 0.6664, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5899370908737183, "rewards/margins": 0.47658228874206543, "rewards/rejected": -2.066519260406494, "sft_loss": 1.5455102920532227, "step": 2875 }, { "epoch": 1.5413948820873056, "grad_norm": 4.091790425211791, "learning_rate": 5.629076484188952e-07, "logits/chosen": -0.10790850222110748, "logits/rejected": 0.008631653152406216, "logps/chosen": -1.606191635131836, "logps/rejected": -2.1437809467315674, "loss": 0.6585, "rewards/accuracies": 0.71875, "rewards/chosen": -1.606191635131836, "rewards/margins": 0.5375891923904419, "rewards/rejected": -2.1437809467315674, "sft_loss": 1.5784814357757568, "step": 2880 }, { "epoch": 1.544070914868707, "grad_norm": 3.9005798756672756, "learning_rate": 5.613623187025587e-07, "logits/chosen": -0.2036016881465912, "logits/rejected": -0.07130200415849686, "logps/chosen": -1.640968680381775, "logps/rejected": -2.1466243267059326, "loss": 0.6689, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.640968680381775, "rewards/margins": 0.5056557655334473, "rewards/rejected": -2.1466243267059326, "sft_loss": 1.650861144065857, "step": 2885 }, { "epoch": 1.5467469476501088, "grad_norm": 4.24708693894575, "learning_rate": 5.598163936487369e-07, "logits/chosen": -0.2895212471485138, "logits/rejected": -0.09275839477777481, "logps/chosen": -1.5821725130081177, "logps/rejected": -2.12876558303833, "loss": 0.6735, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5821725130081177, "rewards/margins": 0.5465930700302124, "rewards/rejected": -2.12876558303833, "sft_loss": 1.551035761833191, "step": 2890 }, { "epoch": 1.5494229804315103, "grad_norm": 4.237934129093345, "learning_rate": 5.582698882560017e-07, "logits/chosen": -0.28006845712661743, "logits/rejected": -0.1313776671886444, "logps/chosen": -1.5778753757476807, "logps/rejected": -2.209226131439209, "loss": 0.6642, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5778753757476807, "rewards/margins": 0.6313506364822388, "rewards/rejected": -2.209226131439209, "sft_loss": 1.5569813251495361, "step": 2895 }, { "epoch": 1.5520990132129118, "grad_norm": 5.009728052455516, "learning_rate": 5.567228175285549e-07, "logits/chosen": -0.20519180595874786, "logits/rejected": -0.09929974377155304, "logps/chosen": -1.58689284324646, "logps/rejected": -2.2199573516845703, "loss": 0.6682, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.58689284324646, "rewards/margins": 0.6330643892288208, "rewards/rejected": -2.2199573516845703, "sft_loss": 1.5671353340148926, "step": 2900 }, { "epoch": 1.5547750459943135, "grad_norm": 2.918493022779605, "learning_rate": 5.551751964760838e-07, "logits/chosen": -0.12294672429561615, "logits/rejected": -0.09764774888753891, "logps/chosen": -1.5841805934906006, "logps/rejected": -2.107809543609619, "loss": 0.6577, "rewards/accuracies": 0.625, "rewards/chosen": -1.5841805934906006, "rewards/margins": 0.5236291885375977, "rewards/rejected": -2.107809543609619, "sft_loss": 1.6166483163833618, "step": 2905 }, { "epoch": 1.557451078775715, "grad_norm": 2.5726111441709683, "learning_rate": 5.536270401136145e-07, "logits/chosen": -0.1962536871433258, "logits/rejected": -0.08937899023294449, "logps/chosen": -1.4637311697006226, "logps/rejected": -1.9759318828582764, "loss": 0.6521, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4637311697006226, "rewards/margins": 0.5122007131576538, "rewards/rejected": -1.9759318828582764, "sft_loss": 1.5529893636703491, "step": 2910 }, { "epoch": 1.5601271115571165, "grad_norm": 2.5475667771729413, "learning_rate": 5.520783634613667e-07, "logits/chosen": -0.16151954233646393, "logits/rejected": 0.012316593900322914, "logps/chosen": -1.6571800708770752, "logps/rejected": -2.189068555831909, "loss": 0.6674, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6571800708770752, "rewards/margins": 0.5318886041641235, "rewards/rejected": -2.189068555831909, "sft_loss": 1.7012519836425781, "step": 2915 }, { "epoch": 1.5628031443385182, "grad_norm": 1.52301957259393, "learning_rate": 5.505291815446082e-07, "logits/chosen": -0.14033019542694092, "logits/rejected": -0.015837164595723152, "logps/chosen": -1.6222553253173828, "logps/rejected": -2.169674873352051, "loss": 0.6757, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6222553253173828, "rewards/margins": 0.5474194288253784, "rewards/rejected": -2.169674873352051, "sft_loss": 1.618524193763733, "step": 2920 }, { "epoch": 1.5654791771199197, "grad_norm": 4.221517759195312, "learning_rate": 5.489795093935089e-07, "logits/chosen": -0.15154433250427246, "logits/rejected": -0.08596460521221161, "logps/chosen": -1.591717004776001, "logps/rejected": -2.138441324234009, "loss": 0.6492, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.591717004776001, "rewards/margins": 0.5467241406440735, "rewards/rejected": -2.138441324234009, "sft_loss": 1.571844458580017, "step": 2925 }, { "epoch": 1.5681552099013212, "grad_norm": 2.5282174445739853, "learning_rate": 5.474293620429946e-07, "logits/chosen": -0.29474323987960815, "logits/rejected": -0.11456756293773651, "logps/chosen": -1.5818030834197998, "logps/rejected": -2.5438389778137207, "loss": 0.6563, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5818030834197998, "rewards/margins": 0.9620355367660522, "rewards/rejected": -2.5438389778137207, "sft_loss": 1.6236823797225952, "step": 2930 }, { "epoch": 1.570831242682723, "grad_norm": 3.3302415118771487, "learning_rate": 5.458787545326018e-07, "logits/chosen": -0.24398522078990936, "logits/rejected": -0.09933225810527802, "logps/chosen": -1.6292442083358765, "logps/rejected": -2.180849552154541, "loss": 0.662, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.6292442083358765, "rewards/margins": 0.5516052842140198, "rewards/rejected": -2.180849552154541, "sft_loss": 1.6737045049667358, "step": 2935 }, { "epoch": 1.5735072754641244, "grad_norm": 2.447525860866938, "learning_rate": 5.443277019063311e-07, "logits/chosen": -0.24691593647003174, "logits/rejected": -0.07334215939044952, "logps/chosen": -1.5735365152359009, "logps/rejected": -2.3229589462280273, "loss": 0.6428, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5735365152359009, "rewards/margins": 0.7494224309921265, "rewards/rejected": -2.3229589462280273, "sft_loss": 1.6313596963882446, "step": 2940 }, { "epoch": 1.5761833082455259, "grad_norm": 6.579022510254535, "learning_rate": 5.427762192125023e-07, "logits/chosen": -0.23249594867229462, "logits/rejected": -0.0880482941865921, "logps/chosen": -1.6138916015625, "logps/rejected": -2.014530897140503, "loss": 0.6691, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6138916015625, "rewards/margins": 0.40063929557800293, "rewards/rejected": -2.014530897140503, "sft_loss": 1.5987266302108765, "step": 2945 }, { "epoch": 1.5788593410269276, "grad_norm": 4.445224356828172, "learning_rate": 5.41224321503607e-07, "logits/chosen": -0.11989017575979233, "logits/rejected": 0.1379929631948471, "logps/chosen": -1.487082600593567, "logps/rejected": -2.2836666107177734, "loss": 0.6409, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.487082600593567, "rewards/margins": 0.7965839505195618, "rewards/rejected": -2.2836666107177734, "sft_loss": 1.5135908126831055, "step": 2950 }, { "epoch": 1.5815353738083293, "grad_norm": 4.411394972956897, "learning_rate": 5.396720238361637e-07, "logits/chosen": -0.10843691974878311, "logits/rejected": 0.0039287833496928215, "logps/chosen": -1.5481619834899902, "logps/rejected": -2.1295180320739746, "loss": 0.6628, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5481619834899902, "rewards/margins": 0.5813560485839844, "rewards/rejected": -2.1295180320739746, "sft_loss": 1.5984420776367188, "step": 2955 }, { "epoch": 1.5842114065897306, "grad_norm": 2.813638099032081, "learning_rate": 5.381193412705711e-07, "logits/chosen": -0.23725099861621857, "logits/rejected": -0.0956241562962532, "logps/chosen": -1.5859758853912354, "logps/rejected": -2.1242527961730957, "loss": 0.6539, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5859758853912354, "rewards/margins": 0.5382769703865051, "rewards/rejected": -2.1242527961730957, "sft_loss": 1.5380891561508179, "step": 2960 }, { "epoch": 1.5868874393711323, "grad_norm": 5.144347674500833, "learning_rate": 5.365662888709622e-07, "logits/chosen": -0.18725113570690155, "logits/rejected": -0.08537117391824722, "logps/chosen": -1.5215387344360352, "logps/rejected": -2.1008687019348145, "loss": 0.652, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5215387344360352, "rewards/margins": 0.5793299078941345, "rewards/rejected": -2.1008687019348145, "sft_loss": 1.5025901794433594, "step": 2965 }, { "epoch": 1.589563472152534, "grad_norm": 6.205747754627465, "learning_rate": 5.350128817050585e-07, "logits/chosen": -0.18871183693408966, "logits/rejected": 0.00811255443841219, "logps/chosen": -1.598693609237671, "logps/rejected": -2.211864709854126, "loss": 0.6744, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.598693609237671, "rewards/margins": 0.6131712794303894, "rewards/rejected": -2.211864709854126, "sft_loss": 1.6509023904800415, "step": 2970 }, { "epoch": 1.5922395049339353, "grad_norm": 4.2464036589658045, "learning_rate": 5.334591348440229e-07, "logits/chosen": -0.17433153092861176, "logits/rejected": -0.02227271907031536, "logps/chosen": -1.5661596059799194, "logps/rejected": -2.295907497406006, "loss": 0.655, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5661596059799194, "rewards/margins": 0.729748010635376, "rewards/rejected": -2.295907497406006, "sft_loss": 1.6411842107772827, "step": 2975 }, { "epoch": 1.594915537715337, "grad_norm": 6.976785428121013, "learning_rate": 5.319050633623141e-07, "logits/chosen": -0.22953324019908905, "logits/rejected": -0.05818880721926689, "logps/chosen": -1.6724097728729248, "logps/rejected": -2.1595005989074707, "loss": 0.6828, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6724097728729248, "rewards/margins": 0.48709067702293396, "rewards/rejected": -2.1595005989074707, "sft_loss": 1.6616108417510986, "step": 2980 }, { "epoch": 1.5975915704967387, "grad_norm": 3.5408258715654886, "learning_rate": 5.303506823375409e-07, "logits/chosen": -0.21573512256145477, "logits/rejected": -0.017917849123477936, "logps/chosen": -1.65218186378479, "logps/rejected": -2.1986420154571533, "loss": 0.6765, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.65218186378479, "rewards/margins": 0.5464602708816528, "rewards/rejected": -2.1986420154571533, "sft_loss": 1.6009705066680908, "step": 2985 }, { "epoch": 1.60026760327814, "grad_norm": 6.592240750655103, "learning_rate": 5.287960068503143e-07, "logits/chosen": -0.19919119775295258, "logits/rejected": -0.005335810594260693, "logps/chosen": -1.5355892181396484, "logps/rejected": -2.2174465656280518, "loss": 0.6627, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5355892181396484, "rewards/margins": 0.6818572282791138, "rewards/rejected": -2.2174465656280518, "sft_loss": 1.5363190174102783, "step": 2990 }, { "epoch": 1.6029436360595417, "grad_norm": 3.47366638355546, "learning_rate": 5.272410519841032e-07, "logits/chosen": -0.14296242594718933, "logits/rejected": -0.01892923191189766, "logps/chosen": -1.6547492742538452, "logps/rejected": -2.495971202850342, "loss": 0.6519, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6547492742538452, "rewards/margins": 0.8412219882011414, "rewards/rejected": -2.495971202850342, "sft_loss": 1.6823389530181885, "step": 2995 }, { "epoch": 1.6056196688409434, "grad_norm": 4.8846488670917925, "learning_rate": 5.256858328250861e-07, "logits/chosen": -0.19661283493041992, "logits/rejected": -0.03337302431464195, "logps/chosen": -1.6535663604736328, "logps/rejected": -2.164750576019287, "loss": 0.6891, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6535663604736328, "rewards/margins": 0.5111840963363647, "rewards/rejected": -2.164750576019287, "sft_loss": 1.5775151252746582, "step": 3000 }, { "epoch": 1.608295701622345, "grad_norm": 5.160745165468197, "learning_rate": 5.241303644620063e-07, "logits/chosen": -0.2481531798839569, "logits/rejected": -0.08857151120901108, "logps/chosen": -1.5439527034759521, "logps/rejected": -2.017119884490967, "loss": 0.6696, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5439527034759521, "rewards/margins": 0.47316694259643555, "rewards/rejected": -2.017119884490967, "sft_loss": 1.4927449226379395, "step": 3005 }, { "epoch": 1.6109717344037464, "grad_norm": 4.381947958181414, "learning_rate": 5.225746619860248e-07, "logits/chosen": -0.2527027428150177, "logits/rejected": -0.12018553167581558, "logps/chosen": -1.5270473957061768, "logps/rejected": -2.1087257862091064, "loss": 0.6744, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5270473957061768, "rewards/margins": 0.5816782116889954, "rewards/rejected": -2.1087257862091064, "sft_loss": 1.545583724975586, "step": 3010 }, { "epoch": 1.6136477671851481, "grad_norm": 2.9727093199437515, "learning_rate": 5.210187404905735e-07, "logits/chosen": -0.05318199470639229, "logits/rejected": 0.0316009446978569, "logps/chosen": -1.644573450088501, "logps/rejected": -2.1216087341308594, "loss": 0.6565, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.644573450088501, "rewards/margins": 0.4770355224609375, "rewards/rejected": -2.1216087341308594, "sft_loss": 1.639564871788025, "step": 3015 }, { "epoch": 1.6163237999665496, "grad_norm": 3.7309407734006514, "learning_rate": 5.194626150712098e-07, "logits/chosen": -0.2446902096271515, "logits/rejected": -0.08602052927017212, "logps/chosen": -1.5557916164398193, "logps/rejected": -2.083207368850708, "loss": 0.6657, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5557916164398193, "rewards/margins": 0.5274157524108887, "rewards/rejected": -2.083207368850708, "sft_loss": 1.6140753030776978, "step": 3020 }, { "epoch": 1.6189998327479511, "grad_norm": 3.9303733965572953, "learning_rate": 5.179063008254695e-07, "logits/chosen": -0.19570858776569366, "logits/rejected": -0.02743830345571041, "logps/chosen": -1.4865481853485107, "logps/rejected": -1.9939115047454834, "loss": 0.6748, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4865481853485107, "rewards/margins": 0.5073633193969727, "rewards/rejected": -1.9939115047454834, "sft_loss": 1.554677963256836, "step": 3025 }, { "epoch": 1.6216758655293528, "grad_norm": 3.2539139154960326, "learning_rate": 5.163498128527199e-07, "logits/chosen": -0.18052852153778076, "logits/rejected": -0.03774655982851982, "logps/chosen": -1.686672568321228, "logps/rejected": -2.1763644218444824, "loss": 0.6646, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.686672568321228, "rewards/margins": 0.4896920323371887, "rewards/rejected": -2.1763644218444824, "sft_loss": 1.6342140436172485, "step": 3030 }, { "epoch": 1.6243518983107543, "grad_norm": 4.422880196611135, "learning_rate": 5.147931662540144e-07, "logits/chosen": -0.054852940142154694, "logits/rejected": 0.07409236580133438, "logps/chosen": -1.5826961994171143, "logps/rejected": -1.96904718875885, "loss": 0.6725, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5826961994171143, "rewards/margins": 0.3863510191440582, "rewards/rejected": -1.96904718875885, "sft_loss": 1.553628921508789, "step": 3035 }, { "epoch": 1.6270279310921558, "grad_norm": 6.7335124089664795, "learning_rate": 5.132363761319449e-07, "logits/chosen": -0.17560145258903503, "logits/rejected": -0.10570497810840607, "logps/chosen": -1.4192087650299072, "logps/rejected": -2.121539831161499, "loss": 0.6416, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4192087650299072, "rewards/margins": 0.7023310661315918, "rewards/rejected": -2.121539831161499, "sft_loss": 1.4606475830078125, "step": 3040 }, { "epoch": 1.6297039638735575, "grad_norm": 4.315507698172223, "learning_rate": 5.116794575904962e-07, "logits/chosen": -0.20051035284996033, "logits/rejected": -0.09910713881254196, "logps/chosen": -1.5228033065795898, "logps/rejected": -1.9885295629501343, "loss": 0.6692, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5228033065795898, "rewards/margins": 0.46572622656822205, "rewards/rejected": -1.9885295629501343, "sft_loss": 1.5164625644683838, "step": 3045 }, { "epoch": 1.632379996654959, "grad_norm": 4.409635070267732, "learning_rate": 5.101224257348987e-07, "logits/chosen": -0.231184720993042, "logits/rejected": -0.0673697367310524, "logps/chosen": -1.635107398033142, "logps/rejected": -2.1991419792175293, "loss": 0.6605, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.635107398033142, "rewards/margins": 0.5640343427658081, "rewards/rejected": -2.1991419792175293, "sft_loss": 1.6451671123504639, "step": 3050 }, { "epoch": 1.6350560294363605, "grad_norm": 2.2495017944096625, "learning_rate": 5.085652956714823e-07, "logits/chosen": -0.2617799639701843, "logits/rejected": -0.12326967716217041, "logps/chosen": -1.5735450983047485, "logps/rejected": -2.2101778984069824, "loss": 0.6563, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5735450983047485, "rewards/margins": 0.6366329789161682, "rewards/rejected": -2.2101778984069824, "sft_loss": 1.5666234493255615, "step": 3055 }, { "epoch": 1.6377320622177622, "grad_norm": 3.8372559186929984, "learning_rate": 5.070080825075298e-07, "logits/chosen": -0.23476651310920715, "logits/rejected": -0.052711568772792816, "logps/chosen": -1.5595481395721436, "logps/rejected": -2.0955264568328857, "loss": 0.6704, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5595481395721436, "rewards/margins": 0.5359782576560974, "rewards/rejected": -2.0955264568328857, "sft_loss": 1.60107421875, "step": 3060 }, { "epoch": 1.6404080949991637, "grad_norm": 5.453924102853809, "learning_rate": 5.0545080135113e-07, "logits/chosen": -0.12843522429466248, "logits/rejected": -0.07940709590911865, "logps/chosen": -1.5772250890731812, "logps/rejected": -2.2718729972839355, "loss": 0.6515, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5772250890731812, "rewards/margins": 0.6946476697921753, "rewards/rejected": -2.2718729972839355, "sft_loss": 1.6094601154327393, "step": 3065 }, { "epoch": 1.6430841277805652, "grad_norm": 3.152373845146598, "learning_rate": 5.038934673110316e-07, "logits/chosen": -0.2801273763179779, "logits/rejected": -0.1516799032688141, "logps/chosen": -1.5942035913467407, "logps/rejected": -2.177546262741089, "loss": 0.6678, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5942035913467407, "rewards/margins": 0.5833428502082825, "rewards/rejected": -2.177546262741089, "sft_loss": 1.6442228555679321, "step": 3070 }, { "epoch": 1.645760160561967, "grad_norm": 4.23043494021146, "learning_rate": 5.023360954964963e-07, "logits/chosen": -0.26326099038124084, "logits/rejected": -0.1866808533668518, "logps/chosen": -1.5465459823608398, "logps/rejected": -2.1609504222869873, "loss": 0.6618, "rewards/accuracies": 0.625, "rewards/chosen": -1.5465459823608398, "rewards/margins": 0.6144044399261475, "rewards/rejected": -2.1609504222869873, "sft_loss": 1.4961479902267456, "step": 3075 }, { "epoch": 1.6484361933433684, "grad_norm": 6.4660259398849345, "learning_rate": 5.007787010171524e-07, "logits/chosen": -0.3211483359336853, "logits/rejected": -0.11951088905334473, "logps/chosen": -1.4363352060317993, "logps/rejected": -2.0495729446411133, "loss": 0.66, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4363352060317993, "rewards/margins": 0.613237738609314, "rewards/rejected": -2.0495729446411133, "sft_loss": 1.4899227619171143, "step": 3080 }, { "epoch": 1.65111222612477, "grad_norm": 11.36741274310511, "learning_rate": 4.992212989828477e-07, "logits/chosen": -0.1367950141429901, "logits/rejected": -0.1153026595711708, "logps/chosen": -1.4984586238861084, "logps/rejected": -2.107638359069824, "loss": 0.6633, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4984586238861084, "rewards/margins": 0.6091797351837158, "rewards/rejected": -2.107638359069824, "sft_loss": 1.534341812133789, "step": 3085 }, { "epoch": 1.6537882589061716, "grad_norm": 4.08127766292726, "learning_rate": 4.976639045035036e-07, "logits/chosen": -0.11941780894994736, "logits/rejected": -0.041212014853954315, "logps/chosen": -1.5258735418319702, "logps/rejected": -1.9783833026885986, "loss": 0.6681, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5258735418319702, "rewards/margins": 0.45250973105430603, "rewards/rejected": -1.9783833026885986, "sft_loss": 1.6122392416000366, "step": 3090 }, { "epoch": 1.6564642916875731, "grad_norm": 4.142980500759543, "learning_rate": 4.961065326889683e-07, "logits/chosen": -0.16154876351356506, "logits/rejected": -0.008436007425189018, "logps/chosen": -1.5421125888824463, "logps/rejected": -2.1379971504211426, "loss": 0.6642, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5421125888824463, "rewards/margins": 0.5958844423294067, "rewards/rejected": -2.1379971504211426, "sft_loss": 1.5496423244476318, "step": 3095 }, { "epoch": 1.6591403244689746, "grad_norm": 3.3827559336720023, "learning_rate": 4.9454919864887e-07, "logits/chosen": -0.28748637437820435, "logits/rejected": -0.137380450963974, "logps/chosen": -1.5870611667633057, "logps/rejected": -2.0440049171447754, "loss": 0.691, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5870611667633057, "rewards/margins": 0.4569437503814697, "rewards/rejected": -2.0440049171447754, "sft_loss": 1.6794652938842773, "step": 3100 }, { "epoch": 1.6618163572503764, "grad_norm": 3.8765153027358497, "learning_rate": 4.929919174924701e-07, "logits/chosen": -0.2404676377773285, "logits/rejected": -0.04566841199994087, "logps/chosen": -1.5718721151351929, "logps/rejected": -2.0481221675872803, "loss": 0.6709, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5718721151351929, "rewards/margins": 0.4762501120567322, "rewards/rejected": -2.0481221675872803, "sft_loss": 1.6210445165634155, "step": 3105 }, { "epoch": 1.6644923900317778, "grad_norm": 2.7125040680828545, "learning_rate": 4.914347043285177e-07, "logits/chosen": -0.17511853575706482, "logits/rejected": -0.059401821345090866, "logps/chosen": -1.5637508630752563, "logps/rejected": -2.0859580039978027, "loss": 0.6644, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5637508630752563, "rewards/margins": 0.5222072601318359, "rewards/rejected": -2.0859580039978027, "sft_loss": 1.4825503826141357, "step": 3110 }, { "epoch": 1.6671684228131793, "grad_norm": 1.837498071457063, "learning_rate": 4.898775742651013e-07, "logits/chosen": -0.1399889588356018, "logits/rejected": -0.04128175228834152, "logps/chosen": -1.5877896547317505, "logps/rejected": -2.170602321624756, "loss": 0.6475, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5877896547317505, "rewards/margins": 0.5828126072883606, "rewards/rejected": -2.170602321624756, "sft_loss": 1.6309263706207275, "step": 3115 }, { "epoch": 1.669844455594581, "grad_norm": 3.0047021986668674, "learning_rate": 4.883205424095037e-07, "logits/chosen": -0.26999109983444214, "logits/rejected": -0.1168588176369667, "logps/chosen": -1.6641719341278076, "logps/rejected": -2.2500786781311035, "loss": 0.6924, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6641719341278076, "rewards/margins": 0.585906445980072, "rewards/rejected": -2.2500786781311035, "sft_loss": 1.6384944915771484, "step": 3120 }, { "epoch": 1.6725204883759828, "grad_norm": 2.5450039000979245, "learning_rate": 4.86763623868055e-07, "logits/chosen": -0.1542874127626419, "logits/rejected": -0.03618238866329193, "logps/chosen": -1.7048299312591553, "logps/rejected": -2.316401481628418, "loss": 0.675, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7048299312591553, "rewards/margins": 0.6115714311599731, "rewards/rejected": -2.316401481628418, "sft_loss": 1.5900487899780273, "step": 3125 }, { "epoch": 1.675196521157384, "grad_norm": 4.731359554444461, "learning_rate": 4.852068337459856e-07, "logits/chosen": -0.17984583973884583, "logits/rejected": -0.029300883412361145, "logps/chosen": -1.642860770225525, "logps/rejected": -2.2364511489868164, "loss": 0.6764, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.642860770225525, "rewards/margins": 0.5935903787612915, "rewards/rejected": -2.2364511489868164, "sft_loss": 1.6760437488555908, "step": 3130 }, { "epoch": 1.6778725539387858, "grad_norm": 3.5207430962242725, "learning_rate": 4.8365018714728e-07, "logits/chosen": -0.14204739034175873, "logits/rejected": -0.07755346596240997, "logps/chosen": -1.6895830631256104, "logps/rejected": -2.2165913581848145, "loss": 0.6697, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6895830631256104, "rewards/margins": 0.527008593082428, "rewards/rejected": -2.2165913581848145, "sft_loss": 1.5765527486801147, "step": 3135 }, { "epoch": 1.6805485867201875, "grad_norm": 3.6629297139978796, "learning_rate": 4.820936991745304e-07, "logits/chosen": -0.4186740815639496, "logits/rejected": -0.26618799567222595, "logps/chosen": -1.502715826034546, "logps/rejected": -1.992854356765747, "loss": 0.6598, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.502715826034546, "rewards/margins": 0.4901384711265564, "rewards/rejected": -1.992854356765747, "sft_loss": 1.5291118621826172, "step": 3140 }, { "epoch": 1.6832246195015887, "grad_norm": 5.915396790393318, "learning_rate": 4.8053738492879e-07, "logits/chosen": -0.18335725367069244, "logits/rejected": -0.04013916105031967, "logps/chosen": -1.5186660289764404, "logps/rejected": -2.1200027465820312, "loss": 0.663, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5186660289764404, "rewards/margins": 0.6013368964195251, "rewards/rejected": -2.1200027465820312, "sft_loss": 1.5608304738998413, "step": 3145 }, { "epoch": 1.6859006522829905, "grad_norm": 4.3467671734022355, "learning_rate": 4.789812595094265e-07, "logits/chosen": -0.3105720281600952, "logits/rejected": -0.18775293231010437, "logps/chosen": -1.608131766319275, "logps/rejected": -2.0446836948394775, "loss": 0.6691, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.608131766319275, "rewards/margins": 0.4365520477294922, "rewards/rejected": -2.0446836948394775, "sft_loss": 1.558866262435913, "step": 3150 }, { "epoch": 1.6885766850643922, "grad_norm": 8.05953814997876, "learning_rate": 4.774253380139752e-07, "logits/chosen": -0.3036400079727173, "logits/rejected": -0.17522230744361877, "logps/chosen": -1.4665193557739258, "logps/rejected": -2.0359203815460205, "loss": 0.6463, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4665193557739258, "rewards/margins": 0.5694009065628052, "rewards/rejected": -2.0359203815460205, "sft_loss": 1.4616729021072388, "step": 3155 }, { "epoch": 1.6912527178457935, "grad_norm": 4.6680877708152435, "learning_rate": 4.758696355379936e-07, "logits/chosen": -0.23240847885608673, "logits/rejected": -0.21550750732421875, "logps/chosen": -1.5334768295288086, "logps/rejected": -2.082609176635742, "loss": 0.6575, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5334768295288086, "rewards/margins": 0.5491322875022888, "rewards/rejected": -2.082609176635742, "sft_loss": 1.6290900707244873, "step": 3160 }, { "epoch": 1.6939287506271952, "grad_norm": 2.779228534406303, "learning_rate": 4.743141671749138e-07, "logits/chosen": -0.318700909614563, "logits/rejected": -0.2146882265806198, "logps/chosen": -1.573927640914917, "logps/rejected": -1.9949705600738525, "loss": 0.6794, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.573927640914917, "rewards/margins": 0.4210427701473236, "rewards/rejected": -1.9949705600738525, "sft_loss": 1.6389739513397217, "step": 3165 }, { "epoch": 1.6966047834085969, "grad_norm": 3.4045042632324374, "learning_rate": 4.727589480158968e-07, "logits/chosen": -0.2660903334617615, "logits/rejected": -0.1637762188911438, "logps/chosen": -1.5688062906265259, "logps/rejected": -2.2330851554870605, "loss": 0.6723, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5688062906265259, "rewards/margins": 0.664279043674469, "rewards/rejected": -2.2330851554870605, "sft_loss": 1.6072860956192017, "step": 3170 }, { "epoch": 1.6992808161899984, "grad_norm": 6.639672136295296, "learning_rate": 4.712039931496855e-07, "logits/chosen": -0.29735398292541504, "logits/rejected": -0.1823483258485794, "logps/chosen": -1.6251757144927979, "logps/rejected": -1.9575645923614502, "loss": 0.6946, "rewards/accuracies": 0.625, "rewards/chosen": -1.6251757144927979, "rewards/margins": 0.33238908648490906, "rewards/rejected": -1.9575645923614502, "sft_loss": 1.6332619190216064, "step": 3175 }, { "epoch": 1.7019568489713999, "grad_norm": 1.7769204924020565, "learning_rate": 4.6964931766245905e-07, "logits/chosen": -0.1574326455593109, "logits/rejected": -0.10058436542749405, "logps/chosen": -1.608525037765503, "logps/rejected": -2.324282169342041, "loss": 0.6644, "rewards/accuracies": 0.65625, "rewards/chosen": -1.608525037765503, "rewards/margins": 0.7157570123672485, "rewards/rejected": -2.324282169342041, "sft_loss": 1.601483941078186, "step": 3180 }, { "epoch": 1.7046328817528016, "grad_norm": 5.238907987535943, "learning_rate": 4.6809493663768575e-07, "logits/chosen": -0.20695853233337402, "logits/rejected": -0.1775607317686081, "logps/chosen": -1.6417983770370483, "logps/rejected": -1.962201476097107, "loss": 0.6726, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6417983770370483, "rewards/margins": 0.3204033374786377, "rewards/rejected": -1.962201476097107, "sft_loss": 1.6719862222671509, "step": 3185 }, { "epoch": 1.707308914534203, "grad_norm": 3.2650118759743316, "learning_rate": 4.6654086515597716e-07, "logits/chosen": -0.3084166646003723, "logits/rejected": -0.1510084867477417, "logps/chosen": -1.5061914920806885, "logps/rejected": -2.324479579925537, "loss": 0.6452, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5061914920806885, "rewards/margins": 0.8182878494262695, "rewards/rejected": -2.324479579925537, "sft_loss": 1.5005475282669067, "step": 3190 }, { "epoch": 1.7099849473156046, "grad_norm": 3.1120595174991936, "learning_rate": 4.6498711829494154e-07, "logits/chosen": -0.31429943442344666, "logits/rejected": -0.205793097615242, "logps/chosen": -1.5214614868164062, "logps/rejected": -2.128938913345337, "loss": 0.6549, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5214614868164062, "rewards/margins": 0.6074774265289307, "rewards/rejected": -2.128938913345337, "sft_loss": 1.4796510934829712, "step": 3195 }, { "epoch": 1.7126609800970063, "grad_norm": 3.7733075424203335, "learning_rate": 4.6343371112903777e-07, "logits/chosen": -0.1961269974708557, "logits/rejected": -0.05360075831413269, "logps/chosen": -1.6085929870605469, "logps/rejected": -2.3059115409851074, "loss": 0.6709, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6085929870605469, "rewards/margins": 0.6973183751106262, "rewards/rejected": -2.3059115409851074, "sft_loss": 1.6058056354522705, "step": 3200 }, { "epoch": 1.7126609800970063, "eval_logits/chosen": 0.05022057890892029, "eval_logits/rejected": 0.13729551434516907, "eval_logps/chosen": -1.5941189527511597, "eval_logps/rejected": -2.1720833778381348, "eval_loss": 0.6700656414031982, "eval_rewards/accuracies": 0.6476261019706726, "eval_rewards/chosen": -1.5941189527511597, "eval_rewards/margins": 0.577964723110199, "eval_rewards/rejected": -2.1720833778381348, "eval_runtime": 43.8762, "eval_samples_per_second": 30.654, "eval_sft_loss": 1.5844956636428833, "eval_steps_per_second": 7.681, "step": 3200 }, { "epoch": 1.7153370128784078, "grad_norm": 2.3281305995229844, "learning_rate": 4.618806587294291e-07, "logits/chosen": -0.3298807144165039, "logits/rejected": -0.20993852615356445, "logps/chosen": -1.5723832845687866, "logps/rejected": -2.127077102661133, "loss": 0.6648, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5723832845687866, "rewards/margins": 0.5546938180923462, "rewards/rejected": -2.127077102661133, "sft_loss": 1.5659022331237793, "step": 3205 }, { "epoch": 1.7180130456598093, "grad_norm": 4.646094569181479, "learning_rate": 4.603279761638365e-07, "logits/chosen": -0.29386386275291443, "logits/rejected": -0.1871393471956253, "logps/chosen": -1.5456677675247192, "logps/rejected": -2.1093719005584717, "loss": 0.665, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5456677675247192, "rewards/margins": 0.5637043118476868, "rewards/rejected": -2.1093719005584717, "sft_loss": 1.5557401180267334, "step": 3210 }, { "epoch": 1.720689078441211, "grad_norm": 11.118365684823452, "learning_rate": 4.5877567849639315e-07, "logits/chosen": -0.2562563717365265, "logits/rejected": -0.1333739459514618, "logps/chosen": -1.5714125633239746, "logps/rejected": -2.154006242752075, "loss": 0.67, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5714125633239746, "rewards/margins": 0.5825936198234558, "rewards/rejected": -2.154006242752075, "sft_loss": 1.5748682022094727, "step": 3215 }, { "epoch": 1.7233651112226125, "grad_norm": 6.593994824049016, "learning_rate": 4.572237807874979e-07, "logits/chosen": -0.2744905352592468, "logits/rejected": -0.05123863369226456, "logps/chosen": -1.7629566192626953, "logps/rejected": -2.3401918411254883, "loss": 0.6708, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7629566192626953, "rewards/margins": 0.5772350430488586, "rewards/rejected": -2.3401918411254883, "sft_loss": 1.6627269983291626, "step": 3220 }, { "epoch": 1.726041144004014, "grad_norm": 4.439427646218787, "learning_rate": 4.5567229809366895e-07, "logits/chosen": -0.2580259442329407, "logits/rejected": -0.1263856738805771, "logps/chosen": -1.4843460321426392, "logps/rejected": -2.166602849960327, "loss": 0.657, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4843460321426392, "rewards/margins": 0.6822569370269775, "rewards/rejected": -2.166602849960327, "sft_loss": 1.5099159479141235, "step": 3225 }, { "epoch": 1.7287171767854157, "grad_norm": 4.3990206098793045, "learning_rate": 4.541212454673984e-07, "logits/chosen": -0.28104525804519653, "logits/rejected": -0.11986222118139267, "logps/chosen": -1.6033084392547607, "logps/rejected": -2.4513392448425293, "loss": 0.6509, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6033084392547607, "rewards/margins": 0.8480307459831238, "rewards/rejected": -2.4513392448425293, "sft_loss": 1.5755976438522339, "step": 3230 }, { "epoch": 1.7313932095668172, "grad_norm": 4.822929273440541, "learning_rate": 4.525706379570055e-07, "logits/chosen": -0.2546038031578064, "logits/rejected": -0.17843613028526306, "logps/chosen": -1.5555105209350586, "logps/rejected": -2.129605770111084, "loss": 0.6641, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5555105209350586, "rewards/margins": 0.5740953683853149, "rewards/rejected": -2.129605770111084, "sft_loss": 1.5992151498794556, "step": 3235 }, { "epoch": 1.7340692423482187, "grad_norm": 2.790602526801848, "learning_rate": 4.510204906064911e-07, "logits/chosen": -0.1786082237958908, "logits/rejected": -0.07207518815994263, "logps/chosen": -1.6006673574447632, "logps/rejected": -2.2379143238067627, "loss": 0.6486, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6006673574447632, "rewards/margins": 0.6372469067573547, "rewards/rejected": -2.2379143238067627, "sft_loss": 1.4736950397491455, "step": 3240 }, { "epoch": 1.7367452751296204, "grad_norm": 3.527773904971378, "learning_rate": 4.4947081845539177e-07, "logits/chosen": -0.35635730624198914, "logits/rejected": -0.21600675582885742, "logps/chosen": -1.6099275350570679, "logps/rejected": -2.149055242538452, "loss": 0.6725, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6099275350570679, "rewards/margins": 0.5391277074813843, "rewards/rejected": -2.149055242538452, "sft_loss": 1.566629409790039, "step": 3245 }, { "epoch": 1.739421307911022, "grad_norm": 3.5194697362592726, "learning_rate": 4.479216365386333e-07, "logits/chosen": -0.12450122833251953, "logits/rejected": 0.024506190791726112, "logps/chosen": -1.541261076927185, "logps/rejected": -2.20589017868042, "loss": 0.6624, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.541261076927185, "rewards/margins": 0.6646289229393005, "rewards/rejected": -2.20589017868042, "sft_loss": 1.5440499782562256, "step": 3250 }, { "epoch": 1.7420973406924234, "grad_norm": 2.0734042744332535, "learning_rate": 4.4637295988638555e-07, "logits/chosen": -0.17247922718524933, "logits/rejected": -0.0935923159122467, "logps/chosen": -1.6456283330917358, "logps/rejected": -2.0860137939453125, "loss": 0.6677, "rewards/accuracies": 0.625, "rewards/chosen": -1.6456283330917358, "rewards/margins": 0.44038552045822144, "rewards/rejected": -2.0860137939453125, "sft_loss": 1.630672812461853, "step": 3255 }, { "epoch": 1.744773373473825, "grad_norm": 2.6240527895390082, "learning_rate": 4.4482480352391623e-07, "logits/chosen": -0.28131115436553955, "logits/rejected": -0.13464775681495667, "logps/chosen": -1.540771245956421, "logps/rejected": -2.0464205741882324, "loss": 0.6561, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.540771245956421, "rewards/margins": 0.5056491494178772, "rewards/rejected": -2.0464205741882324, "sft_loss": 1.5565450191497803, "step": 3260 }, { "epoch": 1.7474494062552266, "grad_norm": 6.75352646316694, "learning_rate": 4.4327718247144507e-07, "logits/chosen": -0.15234871208667755, "logits/rejected": -0.037437863647937775, "logps/chosen": -1.5263092517852783, "logps/rejected": -2.2033650875091553, "loss": 0.656, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5263092517852783, "rewards/margins": 0.6770559549331665, "rewards/rejected": -2.2033650875091553, "sft_loss": 1.5696706771850586, "step": 3265 }, { "epoch": 1.750125439036628, "grad_norm": 6.6376623109556405, "learning_rate": 4.417301117439984e-07, "logits/chosen": -0.16901914775371552, "logits/rejected": -0.023870373144745827, "logps/chosen": -1.4356871843338013, "logps/rejected": -2.0282955169677734, "loss": 0.647, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4356871843338013, "rewards/margins": 0.5926082134246826, "rewards/rejected": -2.0282955169677734, "sft_loss": 1.4541518688201904, "step": 3270 }, { "epoch": 1.7528014718180298, "grad_norm": 3.6683261434355945, "learning_rate": 4.401836063512631e-07, "logits/chosen": -0.23975209891796112, "logits/rejected": 0.0791751816868782, "logps/chosen": -1.5243529081344604, "logps/rejected": -2.145411252975464, "loss": 0.6541, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5243529081344604, "rewards/margins": 0.6210582852363586, "rewards/rejected": -2.145411252975464, "sft_loss": 1.5562708377838135, "step": 3275 }, { "epoch": 1.7554775045994313, "grad_norm": 4.898270621041198, "learning_rate": 4.386376812974413e-07, "logits/chosen": -0.20579198002815247, "logits/rejected": -0.10366296768188477, "logps/chosen": -1.4743316173553467, "logps/rejected": -2.0218100547790527, "loss": 0.6529, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4743316173553467, "rewards/margins": 0.5474783182144165, "rewards/rejected": -2.0218100547790527, "sft_loss": 1.5755841732025146, "step": 3280 }, { "epoch": 1.7581535373808328, "grad_norm": 3.861413615441136, "learning_rate": 4.370923515811048e-07, "logits/chosen": -0.22842450439929962, "logits/rejected": 0.0003024935722351074, "logps/chosen": -1.4898465871810913, "logps/rejected": -2.079397201538086, "loss": 0.6545, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4898465871810913, "rewards/margins": 0.5895504355430603, "rewards/rejected": -2.079397201538086, "sft_loss": 1.526920199394226, "step": 3285 }, { "epoch": 1.7608295701622345, "grad_norm": 4.130081611622876, "learning_rate": 4.35547632195049e-07, "logits/chosen": -0.18690963089466095, "logits/rejected": -0.06916667520999908, "logps/chosen": -1.5715512037277222, "logps/rejected": -2.125609874725342, "loss": 0.6739, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5715512037277222, "rewards/margins": 0.5540584921836853, "rewards/rejected": -2.125609874725342, "sft_loss": 1.5971730947494507, "step": 3290 }, { "epoch": 1.763505602943636, "grad_norm": 5.680075231329591, "learning_rate": 4.340035381261484e-07, "logits/chosen": -0.1997305452823639, "logits/rejected": -0.12079212814569473, "logps/chosen": -1.6892131567001343, "logps/rejected": -2.155735492706299, "loss": 0.6799, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.6892131567001343, "rewards/margins": 0.4665220379829407, "rewards/rejected": -2.155735492706299, "sft_loss": 1.6960713863372803, "step": 3295 }, { "epoch": 1.7661816357250375, "grad_norm": 2.931451514787955, "learning_rate": 4.324600843552104e-07, "logits/chosen": -0.30347341299057007, "logits/rejected": -0.15210089087486267, "logps/chosen": -1.7506462335586548, "logps/rejected": -2.338987112045288, "loss": 0.6733, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.7506462335586548, "rewards/margins": 0.5883409380912781, "rewards/rejected": -2.338987112045288, "sft_loss": 1.6921055316925049, "step": 3300 }, { "epoch": 1.7688576685064392, "grad_norm": 5.547422306866258, "learning_rate": 4.309172858568302e-07, "logits/chosen": -0.314720094203949, "logits/rejected": -0.16665521264076233, "logps/chosen": -1.5950864553451538, "logps/rejected": -2.1158535480499268, "loss": 0.6813, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5950864553451538, "rewards/margins": 0.520767092704773, "rewards/rejected": -2.1158535480499268, "sft_loss": 1.5926302671432495, "step": 3305 }, { "epoch": 1.771533701287841, "grad_norm": 5.088090669899126, "learning_rate": 4.293751575992455e-07, "logits/chosen": -0.12834122776985168, "logits/rejected": -0.07507321983575821, "logps/chosen": -1.6144376993179321, "logps/rejected": -2.0841517448425293, "loss": 0.67, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6144376993179321, "rewards/margins": 0.4697140157222748, "rewards/rejected": -2.0841517448425293, "sft_loss": 1.6303374767303467, "step": 3310 }, { "epoch": 1.7742097340692422, "grad_norm": 6.267169922611252, "learning_rate": 4.278337145441916e-07, "logits/chosen": -0.32739704847335815, "logits/rejected": -0.17024415731430054, "logps/chosen": -1.5832575559616089, "logps/rejected": -2.1148574352264404, "loss": 0.6732, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5832575559616089, "rewards/margins": 0.5315998196601868, "rewards/rejected": -2.1148574352264404, "sft_loss": 1.5724600553512573, "step": 3315 }, { "epoch": 1.776885766850644, "grad_norm": 3.414609963561676, "learning_rate": 4.262929716467556e-07, "logits/chosen": -0.24356892704963684, "logits/rejected": -0.049546681344509125, "logps/chosen": -1.5284088850021362, "logps/rejected": -2.3344571590423584, "loss": 0.66, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5284088850021362, "rewards/margins": 0.8060482740402222, "rewards/rejected": -2.3344571590423584, "sft_loss": 1.6087051630020142, "step": 3320 }, { "epoch": 1.7795617996320456, "grad_norm": 6.679784628976017, "learning_rate": 4.247529438552321e-07, "logits/chosen": -0.3191532492637634, "logits/rejected": -0.13674207031726837, "logps/chosen": -1.6017239093780518, "logps/rejected": -2.1782643795013428, "loss": 0.6558, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6017239093780518, "rewards/margins": 0.5765405893325806, "rewards/rejected": -2.1782643795013428, "sft_loss": 1.6856014728546143, "step": 3325 }, { "epoch": 1.782237832413447, "grad_norm": 11.060334378583923, "learning_rate": 4.232136461109773e-07, "logits/chosen": -0.20316722989082336, "logits/rejected": -0.08586695045232773, "logps/chosen": -1.4768259525299072, "logps/rejected": -2.1830265522003174, "loss": 0.6401, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4768259525299072, "rewards/margins": 0.7062004208564758, "rewards/rejected": -2.1830265522003174, "sft_loss": 1.5168195962905884, "step": 3330 }, { "epoch": 1.7849138651948486, "grad_norm": 4.1087649701684885, "learning_rate": 4.216750933482646e-07, "logits/chosen": -0.2536730170249939, "logits/rejected": -0.09729192405939102, "logps/chosen": -1.6899553537368774, "logps/rejected": -2.173895835876465, "loss": 0.6668, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.6899553537368774, "rewards/margins": 0.4839404225349426, "rewards/rejected": -2.173895835876465, "sft_loss": 1.6380395889282227, "step": 3335 }, { "epoch": 1.7875898979762503, "grad_norm": 3.157861243925943, "learning_rate": 4.2013730049413986e-07, "logits/chosen": -0.21639016270637512, "logits/rejected": -0.07234219461679459, "logps/chosen": -1.4712656736373901, "logps/rejected": -2.205264091491699, "loss": 0.6525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4712656736373901, "rewards/margins": 0.7339984178543091, "rewards/rejected": -2.205264091491699, "sft_loss": 1.5232994556427002, "step": 3340 }, { "epoch": 1.7902659307576518, "grad_norm": 6.890344570453813, "learning_rate": 4.1860028246827594e-07, "logits/chosen": -0.21207043528556824, "logits/rejected": -0.03308100253343582, "logps/chosen": -1.424570083618164, "logps/rejected": -2.0149474143981934, "loss": 0.6476, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.424570083618164, "rewards/margins": 0.5903773307800293, "rewards/rejected": -2.0149474143981934, "sft_loss": 1.4861271381378174, "step": 3345 }, { "epoch": 1.7929419635390533, "grad_norm": 4.1095339352997335, "learning_rate": 4.170640541828285e-07, "logits/chosen": -0.3412007689476013, "logits/rejected": -0.19217181205749512, "logps/chosen": -1.64543879032135, "logps/rejected": -2.1718764305114746, "loss": 0.6755, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.64543879032135, "rewards/margins": 0.5264378786087036, "rewards/rejected": -2.1718764305114746, "sft_loss": 1.64764404296875, "step": 3350 }, { "epoch": 1.795617996320455, "grad_norm": 9.631527390186543, "learning_rate": 4.1552863054229116e-07, "logits/chosen": -0.12420465797185898, "logits/rejected": -0.07888902723789215, "logps/chosen": -1.750036597251892, "logps/rejected": -2.160815715789795, "loss": 0.6941, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.750036597251892, "rewards/margins": 0.4107791781425476, "rewards/rejected": -2.160815715789795, "sft_loss": 1.6819007396697998, "step": 3355 }, { "epoch": 1.7982940291018565, "grad_norm": 4.0032765855071375, "learning_rate": 4.139940264433508e-07, "logits/chosen": -0.2751021087169647, "logits/rejected": -0.052364956587553024, "logps/chosen": -1.493459701538086, "logps/rejected": -2.2093310356140137, "loss": 0.6606, "rewards/accuracies": 0.65625, "rewards/chosen": -1.493459701538086, "rewards/margins": 0.7158714532852173, "rewards/rejected": -2.2093310356140137, "sft_loss": 1.5034425258636475, "step": 3360 }, { "epoch": 1.800970061883258, "grad_norm": 3.4612579048779377, "learning_rate": 4.1246025677474303e-07, "logits/chosen": -0.25418537855148315, "logits/rejected": -0.09127441793680191, "logps/chosen": -1.59048330783844, "logps/rejected": -2.2793405055999756, "loss": 0.6625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.59048330783844, "rewards/margins": 0.6888570189476013, "rewards/rejected": -2.2793405055999756, "sft_loss": 1.6225292682647705, "step": 3365 }, { "epoch": 1.8036460946646597, "grad_norm": 3.612183359211995, "learning_rate": 4.10927336417108e-07, "logits/chosen": -0.22967293858528137, "logits/rejected": -0.06436924636363983, "logps/chosen": -1.604569673538208, "logps/rejected": -2.123711109161377, "loss": 0.6642, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.604569673538208, "rewards/margins": 0.5191417932510376, "rewards/rejected": -2.123711109161377, "sft_loss": 1.5751338005065918, "step": 3370 }, { "epoch": 1.8063221274460612, "grad_norm": 3.7546805011871975, "learning_rate": 4.093952802428457e-07, "logits/chosen": -0.0654725581407547, "logits/rejected": -0.0029380307532846928, "logps/chosen": -1.7108027935028076, "logps/rejected": -2.1099960803985596, "loss": 0.6938, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.7108027935028076, "rewards/margins": 0.3991931080818176, "rewards/rejected": -2.1099960803985596, "sft_loss": 1.6306812763214111, "step": 3375 }, { "epoch": 1.8089981602274627, "grad_norm": 2.6074496359152914, "learning_rate": 4.0786410311597184e-07, "logits/chosen": -0.3023606538772583, "logits/rejected": -0.15595746040344238, "logps/chosen": -1.5669963359832764, "logps/rejected": -2.1247708797454834, "loss": 0.6666, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5669963359832764, "rewards/margins": 0.5577744245529175, "rewards/rejected": -2.1247708797454834, "sft_loss": 1.5600146055221558, "step": 3380 }, { "epoch": 1.8116741930088645, "grad_norm": 3.5661741439601697, "learning_rate": 4.063338198919737e-07, "logits/chosen": -0.2698851525783539, "logits/rejected": -0.23949381709098816, "logps/chosen": -1.617048978805542, "logps/rejected": -2.055820941925049, "loss": 0.6773, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.617048978805542, "rewards/margins": 0.43877163529396057, "rewards/rejected": -2.055820941925049, "sft_loss": 1.6467010974884033, "step": 3385 }, { "epoch": 1.814350225790266, "grad_norm": 3.7044403708843396, "learning_rate": 4.0480444541766575e-07, "logits/chosen": -0.2357718050479889, "logits/rejected": -0.1072898730635643, "logps/chosen": -1.6923530101776123, "logps/rejected": -2.2098324298858643, "loss": 0.6778, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6923530101776123, "rewards/margins": 0.517479419708252, "rewards/rejected": -2.2098324298858643, "sft_loss": 1.6890838146209717, "step": 3390 }, { "epoch": 1.8170262585716674, "grad_norm": 6.973444286465607, "learning_rate": 4.0327599453104606e-07, "logits/chosen": -0.24866466224193573, "logits/rejected": -0.13903850317001343, "logps/chosen": -1.4910849332809448, "logps/rejected": -2.1087069511413574, "loss": 0.6534, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4910849332809448, "rewards/margins": 0.6176217794418335, "rewards/rejected": -2.1087069511413574, "sft_loss": 1.4826654195785522, "step": 3395 }, { "epoch": 1.8197022913530692, "grad_norm": 4.253394673156057, "learning_rate": 4.017484820611514e-07, "logits/chosen": -0.2572954297065735, "logits/rejected": -0.1391439139842987, "logps/chosen": -1.6212129592895508, "logps/rejected": -2.2261252403259277, "loss": 0.6597, "rewards/accuracies": 0.625, "rewards/chosen": -1.6212129592895508, "rewards/margins": 0.6049124002456665, "rewards/rejected": -2.2261252403259277, "sft_loss": 1.6586673259735107, "step": 3400 }, { "epoch": 1.8223783241344707, "grad_norm": 6.2400965553296155, "learning_rate": 4.002219228279148e-07, "logits/chosen": -0.2455395758152008, "logits/rejected": -0.08400936424732208, "logps/chosen": -1.509826898574829, "logps/rejected": -2.068021774291992, "loss": 0.6773, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.509826898574829, "rewards/margins": 0.5581950545310974, "rewards/rejected": -2.068021774291992, "sft_loss": 1.5645593404769897, "step": 3405 }, { "epoch": 1.8250543569158721, "grad_norm": 4.447093603110371, "learning_rate": 3.9869633164202045e-07, "logits/chosen": -0.25325924158096313, "logits/rejected": -0.03236902132630348, "logps/chosen": -1.681740164756775, "logps/rejected": -2.2139956951141357, "loss": 0.6854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.681740164756775, "rewards/margins": 0.5322555303573608, "rewards/rejected": -2.2139956951141357, "sft_loss": 1.654750108718872, "step": 3410 }, { "epoch": 1.8277303896972739, "grad_norm": 4.005222117059965, "learning_rate": 3.9717172330476077e-07, "logits/chosen": -0.22943027317523956, "logits/rejected": -0.11498402059078217, "logps/chosen": -1.5746691226959229, "logps/rejected": -2.2544422149658203, "loss": 0.6758, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5746691226959229, "rewards/margins": 0.679773211479187, "rewards/rejected": -2.2544422149658203, "sft_loss": 1.6747443675994873, "step": 3415 }, { "epoch": 1.8304064224786754, "grad_norm": 4.146766720366405, "learning_rate": 3.956481126078927e-07, "logits/chosen": -0.17182201147079468, "logits/rejected": -0.042574524879455566, "logps/chosen": -1.6990025043487549, "logps/rejected": -2.359184741973877, "loss": 0.6712, "rewards/accuracies": 0.59375, "rewards/chosen": -1.6990025043487549, "rewards/margins": 0.6601821780204773, "rewards/rejected": -2.359184741973877, "sft_loss": 1.7276928424835205, "step": 3420 }, { "epoch": 1.8330824552600768, "grad_norm": 3.1039010392795388, "learning_rate": 3.941255143334937e-07, "logits/chosen": -0.3011739253997803, "logits/rejected": -0.249053955078125, "logps/chosen": -1.559574842453003, "logps/rejected": -2.173431873321533, "loss": 0.6685, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.559574842453003, "rewards/margins": 0.6138567924499512, "rewards/rejected": -2.173431873321533, "sft_loss": 1.5596319437026978, "step": 3425 }, { "epoch": 1.8357584880414786, "grad_norm": 16.226665192040024, "learning_rate": 3.9260394325381895e-07, "logits/chosen": -0.2586892247200012, "logits/rejected": -0.1233101636171341, "logps/chosen": -1.565425157546997, "logps/rejected": -2.539271354675293, "loss": 0.6813, "rewards/accuracies": 0.65625, "rewards/chosen": -1.565425157546997, "rewards/margins": 0.9738461375236511, "rewards/rejected": -2.539271354675293, "sft_loss": 1.5736205577850342, "step": 3430 }, { "epoch": 1.83843452082288, "grad_norm": 6.192896673655969, "learning_rate": 3.9108341413115784e-07, "logits/chosen": -0.2718811631202698, "logits/rejected": -0.16693784296512604, "logps/chosen": -1.4871734380722046, "logps/rejected": -2.1045596599578857, "loss": 0.6433, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4871734380722046, "rewards/margins": 0.6173862218856812, "rewards/rejected": -2.1045596599578857, "sft_loss": 1.5424994230270386, "step": 3435 }, { "epoch": 1.8411105536042816, "grad_norm": 5.251785598075094, "learning_rate": 3.895639417176905e-07, "logits/chosen": -0.33662140369415283, "logits/rejected": -0.2634265422821045, "logps/chosen": -1.4642804861068726, "logps/rejected": -2.2976937294006348, "loss": 0.6679, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4642804861068726, "rewards/margins": 0.833413302898407, "rewards/rejected": -2.2976937294006348, "sft_loss": 1.498628854751587, "step": 3440 }, { "epoch": 1.8437865863856833, "grad_norm": 5.977642868911891, "learning_rate": 3.8804554075534497e-07, "logits/chosen": -0.29837948083877563, "logits/rejected": -0.07646293193101883, "logps/chosen": -1.5494577884674072, "logps/rejected": -2.1560275554656982, "loss": 0.6637, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5494577884674072, "rewards/margins": 0.6065697073936462, "rewards/rejected": -2.1560275554656982, "sft_loss": 1.5897536277770996, "step": 3445 }, { "epoch": 1.8464626191670848, "grad_norm": 12.649656965505333, "learning_rate": 3.8652822597565403e-07, "logits/chosen": -0.36968958377838135, "logits/rejected": -0.17792022228240967, "logps/chosen": -1.5691545009613037, "logps/rejected": -2.228562355041504, "loss": 0.6608, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5691545009613037, "rewards/margins": 0.6594076156616211, "rewards/rejected": -2.228562355041504, "sft_loss": 1.635911226272583, "step": 3450 }, { "epoch": 1.8491386519484863, "grad_norm": 3.260610776171183, "learning_rate": 3.850120120996123e-07, "logits/chosen": -0.23752427101135254, "logits/rejected": -0.06146562844514847, "logps/chosen": -1.7419025897979736, "logps/rejected": -2.3571693897247314, "loss": 0.6827, "rewards/accuracies": 0.625, "rewards/chosen": -1.7419025897979736, "rewards/margins": 0.6152670979499817, "rewards/rejected": -2.3571693897247314, "sft_loss": 1.741065263748169, "step": 3455 }, { "epoch": 1.851814684729888, "grad_norm": 9.52227662285607, "learning_rate": 3.8349691383753356e-07, "logits/chosen": -0.12958386540412903, "logits/rejected": -0.0022026679944247007, "logps/chosen": -1.5865254402160645, "logps/rejected": -2.2228541374206543, "loss": 0.6833, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5865254402160645, "rewards/margins": 0.6363285779953003, "rewards/rejected": -2.2228541374206543, "sft_loss": 1.5762921571731567, "step": 3460 }, { "epoch": 1.8544907175112895, "grad_norm": 2.263229018573152, "learning_rate": 3.819829458889078e-07, "logits/chosen": -0.30748385190963745, "logits/rejected": -0.1827574074268341, "logps/chosen": -1.4545913934707642, "logps/rejected": -1.9467980861663818, "loss": 0.6628, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4545913934707642, "rewards/margins": 0.49220672249794006, "rewards/rejected": -1.9467980861663818, "sft_loss": 1.4630444049835205, "step": 3465 }, { "epoch": 1.857166750292691, "grad_norm": 2.5218768620989045, "learning_rate": 3.804701229422585e-07, "logits/chosen": -0.31609639525413513, "logits/rejected": -0.21111683547496796, "logps/chosen": -1.7103404998779297, "logps/rejected": -2.197878122329712, "loss": 0.6742, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7103404998779297, "rewards/margins": 0.48753732442855835, "rewards/rejected": -2.197878122329712, "sft_loss": 1.681815505027771, "step": 3470 }, { "epoch": 1.8598427830740927, "grad_norm": 3.065626489258694, "learning_rate": 3.789584596750007e-07, "logits/chosen": -0.3215027451515198, "logits/rejected": -0.2551218867301941, "logps/chosen": -1.5578937530517578, "logps/rejected": -2.097822666168213, "loss": 0.6535, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5578937530517578, "rewards/margins": 0.5399289727210999, "rewards/rejected": -2.097822666168213, "sft_loss": 1.5477797985076904, "step": 3475 }, { "epoch": 1.8625188158554944, "grad_norm": 3.2673106603359585, "learning_rate": 3.77447970753298e-07, "logits/chosen": -0.18195316195487976, "logits/rejected": -0.1400514543056488, "logps/chosen": -1.6033868789672852, "logps/rejected": -2.114414930343628, "loss": 0.6608, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6033868789672852, "rewards/margins": 0.5110281705856323, "rewards/rejected": -2.114414930343628, "sft_loss": 1.5997934341430664, "step": 3480 }, { "epoch": 1.8651948486368957, "grad_norm": 5.970318175906083, "learning_rate": 3.7593867083192057e-07, "logits/chosen": -0.22843074798583984, "logits/rejected": -0.10632483661174774, "logps/chosen": -1.5803484916687012, "logps/rejected": -2.1380927562713623, "loss": 0.6746, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5803484916687012, "rewards/margins": 0.5577442049980164, "rewards/rejected": -2.1380927562713623, "sft_loss": 1.6084749698638916, "step": 3485 }, { "epoch": 1.8678708814182974, "grad_norm": 3.184778905784191, "learning_rate": 3.7443057455410276e-07, "logits/chosen": -0.20204658806324005, "logits/rejected": -0.07352867722511292, "logps/chosen": -1.4979311227798462, "logps/rejected": -2.0505619049072266, "loss": 0.6623, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4979311227798462, "rewards/margins": 0.5526308417320251, "rewards/rejected": -2.0505619049072266, "sft_loss": 1.604762077331543, "step": 3490 }, { "epoch": 1.870546914199699, "grad_norm": 3.286481646732373, "learning_rate": 3.7292369655140145e-07, "logits/chosen": -0.29412025213241577, "logits/rejected": -0.12024509906768799, "logps/chosen": -1.647534728050232, "logps/rejected": -2.1293785572052, "loss": 0.6664, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.647534728050232, "rewards/margins": 0.48184362053871155, "rewards/rejected": -2.1293785572052, "sft_loss": 1.665687918663025, "step": 3495 }, { "epoch": 1.8732229469811004, "grad_norm": 3.353615127880277, "learning_rate": 3.714180514435534e-07, "logits/chosen": -0.19847139716148376, "logits/rejected": -0.034717656672000885, "logps/chosen": -1.6773347854614258, "logps/rejected": -2.419734239578247, "loss": 0.6503, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6773347854614258, "rewards/margins": 0.7423990964889526, "rewards/rejected": -2.419734239578247, "sft_loss": 1.6499170064926147, "step": 3500 }, { "epoch": 1.875898979762502, "grad_norm": 5.210551530691637, "learning_rate": 3.6991365383833426e-07, "logits/chosen": -0.22429361939430237, "logits/rejected": -0.0938660278916359, "logps/chosen": -1.628684401512146, "logps/rejected": -2.2493913173675537, "loss": 0.6737, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.628684401512146, "rewards/margins": 0.6207069158554077, "rewards/rejected": -2.2493913173675537, "sft_loss": 1.616787314414978, "step": 3505 }, { "epoch": 1.8785750125439038, "grad_norm": 5.361114817879042, "learning_rate": 3.684105183314162e-07, "logits/chosen": -0.23827588558197021, "logits/rejected": -0.1531572937965393, "logps/chosen": -1.5106375217437744, "logps/rejected": -2.0941097736358643, "loss": 0.6549, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5106375217437744, "rewards/margins": 0.5834720134735107, "rewards/rejected": -2.0941097736358643, "sft_loss": 1.5395634174346924, "step": 3510 }, { "epoch": 1.881251045325305, "grad_norm": 6.650010887041627, "learning_rate": 3.669086595062263e-07, "logits/chosen": -0.2544296681880951, "logits/rejected": -0.05539902299642563, "logps/chosen": -1.5374723672866821, "logps/rejected": -2.1580734252929688, "loss": 0.6823, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5374723672866821, "rewards/margins": 0.6206012964248657, "rewards/rejected": -2.1580734252929688, "sft_loss": 1.525309443473816, "step": 3515 }, { "epoch": 1.8839270781067068, "grad_norm": 4.947389267072906, "learning_rate": 3.654080919338056e-07, "logits/chosen": -0.299444317817688, "logits/rejected": -0.1548498123884201, "logps/chosen": -1.580509901046753, "logps/rejected": -2.2148938179016113, "loss": 0.6664, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.580509901046753, "rewards/margins": 0.6343838572502136, "rewards/rejected": -2.2148938179016113, "sft_loss": 1.6169801950454712, "step": 3520 }, { "epoch": 1.8866031108881085, "grad_norm": 4.308547829856648, "learning_rate": 3.639088301726673e-07, "logits/chosen": -0.22670379281044006, "logits/rejected": -0.029366493225097656, "logps/chosen": -1.557122826576233, "logps/rejected": -2.1413769721984863, "loss": 0.6637, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.557122826576233, "rewards/margins": 0.584254264831543, "rewards/rejected": -2.1413769721984863, "sft_loss": 1.6209783554077148, "step": 3525 }, { "epoch": 1.88927914366951, "grad_norm": 5.075572874613157, "learning_rate": 3.624108887686556e-07, "logits/chosen": -0.2328222543001175, "logits/rejected": -0.1593681126832962, "logps/chosen": -1.5033090114593506, "logps/rejected": -1.9975850582122803, "loss": 0.6678, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5033090114593506, "rewards/margins": 0.4942760467529297, "rewards/rejected": -1.9975850582122803, "sft_loss": 1.599522352218628, "step": 3530 }, { "epoch": 1.8919551764509115, "grad_norm": 4.517737430236565, "learning_rate": 3.6091428225480433e-07, "logits/chosen": -0.31304430961608887, "logits/rejected": -0.17922961711883545, "logps/chosen": -1.4516620635986328, "logps/rejected": -2.0089287757873535, "loss": 0.6662, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4516620635986328, "rewards/margins": 0.5572668313980103, "rewards/rejected": -2.0089287757873535, "sft_loss": 1.5084701776504517, "step": 3535 }, { "epoch": 1.8946312092323132, "grad_norm": 12.721843344506373, "learning_rate": 3.5941902515119674e-07, "logits/chosen": -0.29551082849502563, "logits/rejected": -0.06938707828521729, "logps/chosen": -1.5136016607284546, "logps/rejected": -1.977852463722229, "loss": 0.6633, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5136016607284546, "rewards/margins": 0.46425071358680725, "rewards/rejected": -1.977852463722229, "sft_loss": 1.5562989711761475, "step": 3540 }, { "epoch": 1.8973072420137147, "grad_norm": 12.960155392502875, "learning_rate": 3.5792513196482373e-07, "logits/chosen": -0.4133186936378479, "logits/rejected": -0.15416371822357178, "logps/chosen": -1.5097110271453857, "logps/rejected": -2.037909507751465, "loss": 0.6673, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5097110271453857, "rewards/margins": 0.5281984806060791, "rewards/rejected": -2.037909507751465, "sft_loss": 1.5186493396759033, "step": 3545 }, { "epoch": 1.8999832747951162, "grad_norm": 3.071835516804881, "learning_rate": 3.5643261718944346e-07, "logits/chosen": -0.22974996268749237, "logits/rejected": -0.1381615251302719, "logps/chosen": -1.5370875597000122, "logps/rejected": -2.025336503982544, "loss": 0.671, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5370875597000122, "rewards/margins": 0.488248735666275, "rewards/rejected": -2.025336503982544, "sft_loss": 1.4795281887054443, "step": 3550 }, { "epoch": 1.902659307576518, "grad_norm": 2.9337359048630907, "learning_rate": 3.5494149530544087e-07, "logits/chosen": -0.3561546206474304, "logits/rejected": -0.22161278128623962, "logps/chosen": -1.4861180782318115, "logps/rejected": -2.141692638397217, "loss": 0.6721, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4861180782318115, "rewards/margins": 0.6555744409561157, "rewards/rejected": -2.141692638397217, "sft_loss": 1.4962797164916992, "step": 3555 }, { "epoch": 1.9053353403579194, "grad_norm": 5.309838370949358, "learning_rate": 3.534517807796871e-07, "logits/chosen": -0.25040486454963684, "logits/rejected": -0.16132552921772003, "logps/chosen": -1.5923874378204346, "logps/rejected": -2.195833683013916, "loss": 0.6695, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5923874378204346, "rewards/margins": 0.6034458875656128, "rewards/rejected": -2.195833683013916, "sft_loss": 1.6128263473510742, "step": 3560 }, { "epoch": 1.908011373139321, "grad_norm": 3.3420280599413, "learning_rate": 3.519634880653988e-07, "logits/chosen": -0.22517654299736023, "logits/rejected": -0.14259059727191925, "logps/chosen": -1.5689737796783447, "logps/rejected": -2.3011300563812256, "loss": 0.6617, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5689737796783447, "rewards/margins": 0.7321562767028809, "rewards/rejected": -2.3011300563812256, "sft_loss": 1.6068799495697021, "step": 3565 }, { "epoch": 1.9106874059207226, "grad_norm": 3.4535664969591804, "learning_rate": 3.504766316019987e-07, "logits/chosen": -0.2836065888404846, "logits/rejected": -0.1251402199268341, "logps/chosen": -1.5470378398895264, "logps/rejected": -2.167536973953247, "loss": 0.6596, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5470378398895264, "rewards/margins": 0.6204993724822998, "rewards/rejected": -2.167536973953247, "sft_loss": 1.5432517528533936, "step": 3570 }, { "epoch": 1.913363438702124, "grad_norm": 3.875237419980098, "learning_rate": 3.489912258149745e-07, "logits/chosen": -0.19648095965385437, "logits/rejected": -0.07422200590372086, "logps/chosen": -1.5497782230377197, "logps/rejected": -2.1988131999969482, "loss": 0.6753, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5497782230377197, "rewards/margins": 0.6490351557731628, "rewards/rejected": -2.1988131999969482, "sft_loss": 1.528440237045288, "step": 3575 }, { "epoch": 1.9160394714835256, "grad_norm": 5.110834671223678, "learning_rate": 3.475072851157397e-07, "logits/chosen": -0.2754908502101898, "logits/rejected": -0.221980482339859, "logps/chosen": -1.5026241540908813, "logps/rejected": -2.2867393493652344, "loss": 0.6447, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5026241540908813, "rewards/margins": 0.7841153144836426, "rewards/rejected": -2.2867393493652344, "sft_loss": 1.5356286764144897, "step": 3580 }, { "epoch": 1.9187155042649273, "grad_norm": 3.5772522043441386, "learning_rate": 3.460248239014936e-07, "logits/chosen": -0.18519827723503113, "logits/rejected": -0.12182275950908661, "logps/chosen": -1.6455638408660889, "logps/rejected": -2.175966739654541, "loss": 0.6552, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6455638408660889, "rewards/margins": 0.5304029583930969, "rewards/rejected": -2.175966739654541, "sft_loss": 1.693975806236267, "step": 3585 }, { "epoch": 1.9213915370463288, "grad_norm": 2.7451754612170163, "learning_rate": 3.4454385655508134e-07, "logits/chosen": -0.22382116317749023, "logits/rejected": -0.1571996957063675, "logps/chosen": -1.6252641677856445, "logps/rejected": -1.9908462762832642, "loss": 0.6766, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6252641677856445, "rewards/margins": 0.36558184027671814, "rewards/rejected": -1.9908462762832642, "sft_loss": 1.6530452966690063, "step": 3590 }, { "epoch": 1.9240675698277303, "grad_norm": 3.577943615855644, "learning_rate": 3.4306439744485447e-07, "logits/chosen": -0.3366868495941162, "logits/rejected": -0.12997911870479584, "logps/chosen": -1.5786283016204834, "logps/rejected": -2.2249855995178223, "loss": 0.6663, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5786283016204834, "rewards/margins": 0.6463571786880493, "rewards/rejected": -2.2249855995178223, "sft_loss": 1.5080569982528687, "step": 3595 }, { "epoch": 1.926743602609132, "grad_norm": 4.96797129383118, "learning_rate": 3.415864609245322e-07, "logits/chosen": -0.18899324536323547, "logits/rejected": -0.011387032456696033, "logps/chosen": -1.5805315971374512, "logps/rejected": -2.2807295322418213, "loss": 0.659, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5805315971374512, "rewards/margins": 0.7001979947090149, "rewards/rejected": -2.2807295322418213, "sft_loss": 1.5917775630950928, "step": 3600 }, { "epoch": 1.926743602609132, "eval_logits/chosen": 0.033155109733343124, "eval_logits/rejected": 0.11886825412511826, "eval_logps/chosen": -1.5548663139343262, "eval_logps/rejected": -2.1383421421051025, "eval_loss": 0.6686433553695679, "eval_rewards/accuracies": 0.6454005837440491, "eval_rewards/chosen": -1.5548663139343262, "eval_rewards/margins": 0.583476185798645, "eval_rewards/rejected": -2.1383421421051025, "eval_runtime": 51.671, "eval_samples_per_second": 26.03, "eval_sft_loss": 1.5567699670791626, "eval_steps_per_second": 6.522, "step": 3600 }, { "epoch": 1.9294196353905335, "grad_norm": 4.6127131848133995, "learning_rate": 3.401100613330605e-07, "logits/chosen": -0.29335904121398926, "logits/rejected": -0.25090211629867554, "logps/chosen": -1.569379210472107, "logps/rejected": -1.9675638675689697, "loss": 0.6762, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.569379210472107, "rewards/margins": 0.3981846868991852, "rewards/rejected": -1.9675638675689697, "sft_loss": 1.5933501720428467, "step": 3605 }, { "epoch": 1.932095668171935, "grad_norm": 3.2191921337472507, "learning_rate": 3.3863521299447514e-07, "logits/chosen": -0.23906013369560242, "logits/rejected": -0.11687849462032318, "logps/chosen": -1.5674329996109009, "logps/rejected": -2.1900410652160645, "loss": 0.6544, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5674329996109009, "rewards/margins": 0.6226081252098083, "rewards/rejected": -2.1900410652160645, "sft_loss": 1.6243540048599243, "step": 3610 }, { "epoch": 1.9347717009533367, "grad_norm": 3.14323697314147, "learning_rate": 3.371619302177609e-07, "logits/chosen": -0.1652398258447647, "logits/rejected": -0.047273315489292145, "logps/chosen": -1.5660088062286377, "logps/rejected": -2.0418858528137207, "loss": 0.6756, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5660088062286377, "rewards/margins": 0.47587698698043823, "rewards/rejected": -2.0418858528137207, "sft_loss": 1.5622031688690186, "step": 3615 }, { "epoch": 1.9374477337347382, "grad_norm": 3.5297502350856966, "learning_rate": 3.3569022729671393e-07, "logits/chosen": -0.23163394629955292, "logits/rejected": -0.1616125851869583, "logps/chosen": -1.597439169883728, "logps/rejected": -2.0517027378082275, "loss": 0.6636, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.597439169883728, "rewards/margins": 0.45426350831985474, "rewards/rejected": -2.0517027378082275, "sft_loss": 1.6499122381210327, "step": 3620 }, { "epoch": 1.9401237665161397, "grad_norm": 2.8994360995806447, "learning_rate": 3.342201185098024e-07, "logits/chosen": -0.15500815212726593, "logits/rejected": -0.12774141132831573, "logps/chosen": -1.534929633140564, "logps/rejected": -1.972755789756775, "loss": 0.6625, "rewards/accuracies": 0.625, "rewards/chosen": -1.534929633140564, "rewards/margins": 0.437826007604599, "rewards/rejected": -1.972755789756775, "sft_loss": 1.5774520635604858, "step": 3625 }, { "epoch": 1.9427997992975414, "grad_norm": 5.11590460585965, "learning_rate": 3.3275161812002807e-07, "logits/chosen": -0.25896185636520386, "logits/rejected": -0.2015591561794281, "logps/chosen": -1.6104027032852173, "logps/rejected": -2.128175735473633, "loss": 0.6787, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.6104027032852173, "rewards/margins": 0.5177728533744812, "rewards/rejected": -2.128175735473633, "sft_loss": 1.7070982456207275, "step": 3630 }, { "epoch": 1.945475832078943, "grad_norm": 6.88697763601533, "learning_rate": 3.312847403747883e-07, "logits/chosen": -0.2818135619163513, "logits/rejected": -0.18838824331760406, "logps/chosen": -1.5450584888458252, "logps/rejected": -2.018775463104248, "loss": 0.6674, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5450584888458252, "rewards/margins": 0.4737167954444885, "rewards/rejected": -2.018775463104248, "sft_loss": 1.5849950313568115, "step": 3635 }, { "epoch": 1.9481518648603444, "grad_norm": 3.560060583655417, "learning_rate": 3.2981949950573733e-07, "logits/chosen": -0.23087699711322784, "logits/rejected": -0.11774490773677826, "logps/chosen": -1.6925204992294312, "logps/rejected": -1.9472980499267578, "loss": 0.6799, "rewards/accuracies": 0.625, "rewards/chosen": -1.6925204992294312, "rewards/margins": 0.2547776997089386, "rewards/rejected": -1.9472980499267578, "sft_loss": 1.6750530004501343, "step": 3640 }, { "epoch": 1.9508278976417461, "grad_norm": 3.4389483248324906, "learning_rate": 3.283559097286486e-07, "logits/chosen": -0.30328264832496643, "logits/rejected": -0.1819484382867813, "logps/chosen": -1.6709623336791992, "logps/rejected": -1.9654725790023804, "loss": 0.6793, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.6709623336791992, "rewards/margins": 0.29451045393943787, "rewards/rejected": -1.9654725790023804, "sft_loss": 1.6666209697723389, "step": 3645 }, { "epoch": 1.9535039304231478, "grad_norm": 4.228768748209242, "learning_rate": 3.268939852432765e-07, "logits/chosen": -0.2856011390686035, "logits/rejected": -0.1793937236070633, "logps/chosen": -1.5447874069213867, "logps/rejected": -2.0078554153442383, "loss": 0.6554, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5447874069213867, "rewards/margins": 0.4630679488182068, "rewards/rejected": -2.0078554153442383, "sft_loss": 1.6015489101409912, "step": 3650 }, { "epoch": 1.9561799632045491, "grad_norm": 10.273668179191976, "learning_rate": 3.254337402332187e-07, "logits/chosen": -0.25895020365715027, "logits/rejected": -0.12569567561149597, "logps/chosen": -1.7206480503082275, "logps/rejected": -2.194127321243286, "loss": 0.6877, "rewards/accuracies": 0.625, "rewards/chosen": -1.7206480503082275, "rewards/margins": 0.47347909212112427, "rewards/rejected": -2.194127321243286, "sft_loss": 1.6576461791992188, "step": 3655 }, { "epoch": 1.9588559959859508, "grad_norm": 4.1471744569265425, "learning_rate": 3.239751888657788e-07, "logits/chosen": -0.32585427165031433, "logits/rejected": -0.20504073798656464, "logps/chosen": -1.5201367139816284, "logps/rejected": -2.1893112659454346, "loss": 0.6626, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5201367139816284, "rewards/margins": 0.6691744327545166, "rewards/rejected": -2.1893112659454346, "sft_loss": 1.5931421518325806, "step": 3660 }, { "epoch": 1.9615320287673526, "grad_norm": 4.687182181868727, "learning_rate": 3.2251834529182856e-07, "logits/chosen": -0.2540055215358734, "logits/rejected": -0.13636021316051483, "logps/chosen": -1.5124223232269287, "logps/rejected": -2.206188201904297, "loss": 0.6558, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5124223232269287, "rewards/margins": 0.6937659978866577, "rewards/rejected": -2.206188201904297, "sft_loss": 1.4735386371612549, "step": 3665 }, { "epoch": 1.9642080615487538, "grad_norm": 2.3357998647710394, "learning_rate": 3.2106322364567075e-07, "logits/chosen": -0.3001280725002289, "logits/rejected": -0.16386644542217255, "logps/chosen": -1.5323134660720825, "logps/rejected": -2.2306857109069824, "loss": 0.6475, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5323134660720825, "rewards/margins": 0.6983723640441895, "rewards/rejected": -2.2306857109069824, "sft_loss": 1.612134337425232, "step": 3670 }, { "epoch": 1.9668840943301555, "grad_norm": 4.424587556968458, "learning_rate": 3.1960983804490183e-07, "logits/chosen": -0.28842639923095703, "logits/rejected": -0.1421942263841629, "logps/chosen": -1.6424224376678467, "logps/rejected": -2.4093194007873535, "loss": 0.6581, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6424224376678467, "rewards/margins": 0.7668969035148621, "rewards/rejected": -2.4093194007873535, "sft_loss": 1.6991245746612549, "step": 3675 }, { "epoch": 1.9695601271115573, "grad_norm": 5.685577126456062, "learning_rate": 3.1815820259027537e-07, "logits/chosen": -0.28574010729789734, "logits/rejected": -0.1689317226409912, "logps/chosen": -1.4668850898742676, "logps/rejected": -2.0467896461486816, "loss": 0.6454, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4668850898742676, "rewards/margins": 0.5799044370651245, "rewards/rejected": -2.0467896461486816, "sft_loss": 1.4965256452560425, "step": 3680 }, { "epoch": 1.9722361598929585, "grad_norm": 4.113095336765455, "learning_rate": 3.16708331365565e-07, "logits/chosen": -0.30331823229789734, "logits/rejected": -0.2203049212694168, "logps/chosen": -1.5306308269500732, "logps/rejected": -2.2132725715637207, "loss": 0.6466, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5306308269500732, "rewards/margins": 0.6826415061950684, "rewards/rejected": -2.2132725715637207, "sft_loss": 1.593216896057129, "step": 3685 }, { "epoch": 1.9749121926743602, "grad_norm": 2.9720597069657355, "learning_rate": 3.152602384374275e-07, "logits/chosen": -0.2730967104434967, "logits/rejected": -0.1127859354019165, "logps/chosen": -1.5642287731170654, "logps/rejected": -2.2336513996124268, "loss": 0.6702, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5642287731170654, "rewards/margins": 0.6694225072860718, "rewards/rejected": -2.2336513996124268, "sft_loss": 1.5468215942382812, "step": 3690 }, { "epoch": 1.977588225455762, "grad_norm": 2.1520133586927157, "learning_rate": 3.1381393785526697e-07, "logits/chosen": -0.23968365788459778, "logits/rejected": -0.18710513412952423, "logps/chosen": -1.6503604650497437, "logps/rejected": -2.2875962257385254, "loss": 0.6614, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6503604650497437, "rewards/margins": 0.6372357606887817, "rewards/rejected": -2.2875962257385254, "sft_loss": 1.695789098739624, "step": 3695 }, { "epoch": 1.9802642582371635, "grad_norm": 5.8010732155099225, "learning_rate": 3.123694436510979e-07, "logits/chosen": -0.21912772953510284, "logits/rejected": -0.10847660154104233, "logps/chosen": -1.5232489109039307, "logps/rejected": -2.078616142272949, "loss": 0.6552, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5232489109039307, "rewards/margins": 0.5553671717643738, "rewards/rejected": -2.078616142272949, "sft_loss": 1.5513193607330322, "step": 3700 }, { "epoch": 1.982940291018565, "grad_norm": 4.062617243352844, "learning_rate": 3.1092676983940946e-07, "logits/chosen": -0.26170676946640015, "logits/rejected": -0.18718138337135315, "logps/chosen": -1.5802628993988037, "logps/rejected": -2.1678214073181152, "loss": 0.668, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5802628993988037, "rewards/margins": 0.5875582098960876, "rewards/rejected": -2.1678214073181152, "sft_loss": 1.5466351509094238, "step": 3705 }, { "epoch": 1.9856163237999667, "grad_norm": 4.399706829417717, "learning_rate": 3.094859304170293e-07, "logits/chosen": -0.11614084243774414, "logits/rejected": -0.049933575093746185, "logps/chosen": -1.5520092248916626, "logps/rejected": -2.0661747455596924, "loss": 0.6619, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5520092248916626, "rewards/margins": 0.5141655206680298, "rewards/rejected": -2.0661747455596924, "sft_loss": 1.6254345178604126, "step": 3710 }, { "epoch": 1.9882923565813682, "grad_norm": 3.131404415148893, "learning_rate": 3.0804693936298795e-07, "logits/chosen": -0.2017994225025177, "logits/rejected": -0.12663564085960388, "logps/chosen": -1.5812351703643799, "logps/rejected": -2.2547194957733154, "loss": 0.6603, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5812351703643799, "rewards/margins": 0.6734842658042908, "rewards/rejected": -2.2547194957733154, "sft_loss": 1.6243197917938232, "step": 3715 }, { "epoch": 1.9909683893627697, "grad_norm": 3.4885571992111593, "learning_rate": 3.066098106383826e-07, "logits/chosen": -0.2505125403404236, "logits/rejected": -0.1667691022157669, "logps/chosen": -1.5788905620574951, "logps/rejected": -2.0330827236175537, "loss": 0.6847, "rewards/accuracies": 0.625, "rewards/chosen": -1.5788905620574951, "rewards/margins": 0.45419207215309143, "rewards/rejected": -2.0330827236175537, "sft_loss": 1.519217848777771, "step": 3720 }, { "epoch": 1.9936444221441714, "grad_norm": 3.5935337638493237, "learning_rate": 3.0517455818624263e-07, "logits/chosen": -0.2921850085258484, "logits/rejected": -0.1891443431377411, "logps/chosen": -1.5669046640396118, "logps/rejected": -2.1437056064605713, "loss": 0.6698, "rewards/accuracies": 0.625, "rewards/chosen": -1.5669046640396118, "rewards/margins": 0.576801061630249, "rewards/rejected": -2.1437056064605713, "sft_loss": 1.6299469470977783, "step": 3725 }, { "epoch": 1.9963204549255729, "grad_norm": 3.83568295626339, "learning_rate": 3.037411959313936e-07, "logits/chosen": -0.20006486773490906, "logits/rejected": -0.07037347555160522, "logps/chosen": -1.504563570022583, "logps/rejected": -2.0518879890441895, "loss": 0.6536, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.504563570022583, "rewards/margins": 0.5473244786262512, "rewards/rejected": -2.0518879890441895, "sft_loss": 1.5179142951965332, "step": 3730 }, { "epoch": 1.9989964877069744, "grad_norm": 4.007642145196086, "learning_rate": 3.023097377803224e-07, "logits/chosen": -0.17184345424175262, "logits/rejected": -0.08726246654987335, "logps/chosen": -1.6674957275390625, "logps/rejected": -1.9999866485595703, "loss": 0.691, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.6674957275390625, "rewards/margins": 0.3324908912181854, "rewards/rejected": -1.9999866485595703, "sft_loss": 1.6672461032867432, "step": 3735 }, { "epoch": 2.001672520488376, "grad_norm": 2.6207362127955203, "learning_rate": 3.008801976210423e-07, "logits/chosen": -0.1908237785100937, "logits/rejected": -0.13750803470611572, "logps/chosen": -1.671579360961914, "logps/rejected": -2.0861868858337402, "loss": 0.6707, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.671579360961914, "rewards/margins": 0.41460782289505005, "rewards/rejected": -2.0861868858337402, "sft_loss": 1.5895977020263672, "step": 3740 }, { "epoch": 2.0043485532697773, "grad_norm": 3.4651381445485896, "learning_rate": 2.994525893229581e-07, "logits/chosen": -0.24086709320545197, "logits/rejected": -0.149921253323555, "logps/chosen": -1.6103401184082031, "logps/rejected": -2.1344754695892334, "loss": 0.6629, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6103401184082031, "rewards/margins": 0.5241352915763855, "rewards/rejected": -2.1344754695892334, "sft_loss": 1.5900582075119019, "step": 3745 }, { "epoch": 2.007024586051179, "grad_norm": 2.9554036013361262, "learning_rate": 2.98026926736732e-07, "logits/chosen": -0.30406874418258667, "logits/rejected": -0.21627768874168396, "logps/chosen": -1.4910343885421753, "logps/rejected": -2.131004810333252, "loss": 0.6288, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4910343885421753, "rewards/margins": 0.6399704217910767, "rewards/rejected": -2.131004810333252, "sft_loss": 1.4939466714859009, "step": 3750 }, { "epoch": 2.0097006188325808, "grad_norm": 5.576360948284245, "learning_rate": 2.9660322369414846e-07, "logits/chosen": -0.25901031494140625, "logits/rejected": -0.1560325026512146, "logps/chosen": -1.5033503770828247, "logps/rejected": -2.2636451721191406, "loss": 0.6319, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5033503770828247, "rewards/margins": 0.7602945566177368, "rewards/rejected": -2.2636451721191406, "sft_loss": 1.546751618385315, "step": 3755 }, { "epoch": 2.0123766516139825, "grad_norm": 3.1028811255439384, "learning_rate": 2.9518149400798063e-07, "logits/chosen": -0.34134721755981445, "logits/rejected": -0.30794721841812134, "logps/chosen": -1.512274980545044, "logps/rejected": -2.315218687057495, "loss": 0.638, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.512274980545044, "rewards/margins": 0.8029438853263855, "rewards/rejected": -2.315218687057495, "sft_loss": 1.582011342048645, "step": 3760 }, { "epoch": 2.0150526843953838, "grad_norm": 11.087816656768059, "learning_rate": 2.9376175147185633e-07, "logits/chosen": -0.20825648307800293, "logits/rejected": -0.02212923765182495, "logps/chosen": -1.614130973815918, "logps/rejected": -2.3036086559295654, "loss": 0.6606, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.614130973815918, "rewards/margins": 0.6894776821136475, "rewards/rejected": -2.3036086559295654, "sft_loss": 1.5822635889053345, "step": 3765 }, { "epoch": 2.0177287171767855, "grad_norm": 4.456838947656932, "learning_rate": 2.9234400986012376e-07, "logits/chosen": -0.35738760232925415, "logits/rejected": -0.19936344027519226, "logps/chosen": -1.4727087020874023, "logps/rejected": -2.3376715183258057, "loss": 0.6227, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4727087020874023, "rewards/margins": 0.8649626970291138, "rewards/rejected": -2.3376715183258057, "sft_loss": 1.5102074146270752, "step": 3770 }, { "epoch": 2.020404749958187, "grad_norm": 3.416057918617279, "learning_rate": 2.9092828292771817e-07, "logits/chosen": -0.2814391255378723, "logits/rejected": -0.2211610972881317, "logps/chosen": -1.594720721244812, "logps/rejected": -2.1970386505126953, "loss": 0.6653, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.594720721244812, "rewards/margins": 0.6023179292678833, "rewards/rejected": -2.1970386505126953, "sft_loss": 1.5907560586929321, "step": 3775 }, { "epoch": 2.0230807827395885, "grad_norm": 4.021249068139565, "learning_rate": 2.8951458441002875e-07, "logits/chosen": -0.21834734082221985, "logits/rejected": -0.17740324139595032, "logps/chosen": -1.569536566734314, "logps/rejected": -2.200033664703369, "loss": 0.6593, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.569536566734314, "rewards/margins": 0.6304970979690552, "rewards/rejected": -2.200033664703369, "sft_loss": 1.605539321899414, "step": 3780 }, { "epoch": 2.02575681552099, "grad_norm": 3.621760161352139, "learning_rate": 2.881029280227643e-07, "logits/chosen": -0.28344467282295227, "logits/rejected": -0.1576431542634964, "logps/chosen": -1.5895583629608154, "logps/rejected": -2.3175249099731445, "loss": 0.6516, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5895583629608154, "rewards/margins": 0.7279663681983948, "rewards/rejected": -2.3175249099731445, "sft_loss": 1.5516364574432373, "step": 3785 }, { "epoch": 2.028432848302392, "grad_norm": 3.4096565110748682, "learning_rate": 2.8669332746182177e-07, "logits/chosen": -0.3263254165649414, "logits/rejected": -0.15622875094413757, "logps/chosen": -1.5148825645446777, "logps/rejected": -2.298409938812256, "loss": 0.6225, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5148825645446777, "rewards/margins": 0.7835271954536438, "rewards/rejected": -2.298409938812256, "sft_loss": 1.5991499423980713, "step": 3790 }, { "epoch": 2.031108881083793, "grad_norm": 3.0479720277715945, "learning_rate": 2.8528579640315156e-07, "logits/chosen": -0.24460425972938538, "logits/rejected": -0.2142985314130783, "logps/chosen": -1.5362884998321533, "logps/rejected": -2.007601261138916, "loss": 0.6638, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5362884998321533, "rewards/margins": 0.47131237387657166, "rewards/rejected": -2.007601261138916, "sft_loss": 1.5806416273117065, "step": 3795 }, { "epoch": 2.033784913865195, "grad_norm": 3.5161509907695585, "learning_rate": 2.8388034850262646e-07, "logits/chosen": -0.27440527081489563, "logits/rejected": -0.15014337003231049, "logps/chosen": -1.641608476638794, "logps/rejected": -2.256495237350464, "loss": 0.646, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.641608476638794, "rewards/margins": 0.6148865818977356, "rewards/rejected": -2.256495237350464, "sft_loss": 1.6694062948226929, "step": 3800 }, { "epoch": 2.0364609466465966, "grad_norm": 10.42396913663668, "learning_rate": 2.824769973959079e-07, "logits/chosen": -0.2544073164463043, "logits/rejected": -0.14254014194011688, "logps/chosen": -1.496002435684204, "logps/rejected": -2.034226417541504, "loss": 0.6467, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.496002435684204, "rewards/margins": 0.5382241010665894, "rewards/rejected": -2.034226417541504, "sft_loss": 1.5158672332763672, "step": 3805 }, { "epoch": 2.039136979427998, "grad_norm": 4.383190990614828, "learning_rate": 2.81075756698315e-07, "logits/chosen": -0.16585926711559296, "logits/rejected": -0.07460494339466095, "logps/chosen": -1.492472767829895, "logps/rejected": -2.326233386993408, "loss": 0.6423, "rewards/accuracies": 0.75, "rewards/chosen": -1.492472767829895, "rewards/margins": 0.8337606191635132, "rewards/rejected": -2.326233386993408, "sft_loss": 1.4879860877990723, "step": 3810 }, { "epoch": 2.0418130122093996, "grad_norm": 3.167380791186394, "learning_rate": 2.7967664000469035e-07, "logits/chosen": -0.3844323754310608, "logits/rejected": -0.26187294721603394, "logps/chosen": -1.649530053138733, "logps/rejected": -2.1097311973571777, "loss": 0.6691, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.649530053138733, "rewards/margins": 0.46020111441612244, "rewards/rejected": -2.1097311973571777, "sft_loss": 1.5694371461868286, "step": 3815 }, { "epoch": 2.0444890449908013, "grad_norm": 2.1309857741540585, "learning_rate": 2.7827966088927095e-07, "logits/chosen": -0.3505423665046692, "logits/rejected": -0.15348538756370544, "logps/chosen": -1.6408374309539795, "logps/rejected": -2.3227627277374268, "loss": 0.6696, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6408374309539795, "rewards/margins": 0.6819251775741577, "rewards/rejected": -2.3227627277374268, "sft_loss": 1.650193452835083, "step": 3820 }, { "epoch": 2.0471650777722026, "grad_norm": 6.177280648723578, "learning_rate": 2.768848329055538e-07, "logits/chosen": -0.2825758159160614, "logits/rejected": -0.17822478711605072, "logps/chosen": -1.496321439743042, "logps/rejected": -2.104548931121826, "loss": 0.6507, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.496321439743042, "rewards/margins": 0.608227550983429, "rewards/rejected": -2.104548931121826, "sft_loss": 1.5814179182052612, "step": 3825 }, { "epoch": 2.0498411105536043, "grad_norm": 3.385853799454223, "learning_rate": 2.7549216958616657e-07, "logits/chosen": -0.3522363603115082, "logits/rejected": -0.20810142159461975, "logps/chosen": -1.589411973953247, "logps/rejected": -2.328275680541992, "loss": 0.6449, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.589411973953247, "rewards/margins": 0.7388638854026794, "rewards/rejected": -2.328275680541992, "sft_loss": 1.5857503414154053, "step": 3830 }, { "epoch": 2.052517143335006, "grad_norm": 5.855398115427702, "learning_rate": 2.741016844427344e-07, "logits/chosen": -0.2700735330581665, "logits/rejected": -0.11895928531885147, "logps/chosen": -1.561927080154419, "logps/rejected": -2.117443561553955, "loss": 0.6631, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.561927080154419, "rewards/margins": 0.5555165410041809, "rewards/rejected": -2.117443561553955, "sft_loss": 1.5860936641693115, "step": 3835 }, { "epoch": 2.0551931761164073, "grad_norm": 3.8412171916270657, "learning_rate": 2.7271339096575073e-07, "logits/chosen": -0.23423747718334198, "logits/rejected": -0.12965528666973114, "logps/chosen": -1.4988791942596436, "logps/rejected": -2.19246768951416, "loss": 0.6334, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4988791942596436, "rewards/margins": 0.6935886144638062, "rewards/rejected": -2.19246768951416, "sft_loss": 1.516981840133667, "step": 3840 }, { "epoch": 2.057869208897809, "grad_norm": 3.476287578265999, "learning_rate": 2.713273026244446e-07, "logits/chosen": -0.3897637724876404, "logits/rejected": -0.16315045952796936, "logps/chosen": -1.5861146450042725, "logps/rejected": -2.204552412033081, "loss": 0.6569, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5861146450042725, "rewards/margins": 0.618437647819519, "rewards/rejected": -2.204552412033081, "sft_loss": 1.6054118871688843, "step": 3845 }, { "epoch": 2.0605452416792107, "grad_norm": 4.297658138812751, "learning_rate": 2.6994343286665156e-07, "logits/chosen": -0.28043732047080994, "logits/rejected": -0.11735528707504272, "logps/chosen": -1.5826841592788696, "logps/rejected": -2.1900174617767334, "loss": 0.6648, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5826841592788696, "rewards/margins": 0.6073335409164429, "rewards/rejected": -2.1900174617767334, "sft_loss": 1.5941132307052612, "step": 3850 }, { "epoch": 2.063221274460612, "grad_norm": 3.9597984184611454, "learning_rate": 2.6856179511868156e-07, "logits/chosen": -0.2457316368818283, "logits/rejected": -0.06879222393035889, "logps/chosen": -1.5499293804168701, "logps/rejected": -2.3246564865112305, "loss": 0.6629, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5499293804168701, "rewards/margins": 0.7747267484664917, "rewards/rejected": -2.3246564865112305, "sft_loss": 1.531903624534607, "step": 3855 }, { "epoch": 2.0658973072420137, "grad_norm": 2.403023847709752, "learning_rate": 2.6718240278519056e-07, "logits/chosen": -0.27923932671546936, "logits/rejected": -0.1388339400291443, "logps/chosen": -1.5517905950546265, "logps/rejected": -2.3513119220733643, "loss": 0.6612, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5517905950546265, "rewards/margins": 0.7995215654373169, "rewards/rejected": -2.3513119220733643, "sft_loss": 1.5326491594314575, "step": 3860 }, { "epoch": 2.0685733400234154, "grad_norm": 4.016783042699588, "learning_rate": 2.6580526924904866e-07, "logits/chosen": -0.3711521625518799, "logits/rejected": -0.19031788408756256, "logps/chosen": -1.6756954193115234, "logps/rejected": -2.082862615585327, "loss": 0.6621, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.6756954193115234, "rewards/margins": 0.40716734528541565, "rewards/rejected": -2.082862615585327, "sft_loss": 1.6878407001495361, "step": 3865 }, { "epoch": 2.0712493728048167, "grad_norm": 3.024186762527514, "learning_rate": 2.6443040787121186e-07, "logits/chosen": -0.28381603956222534, "logits/rejected": -0.22497475147247314, "logps/chosen": -1.4522531032562256, "logps/rejected": -2.0683815479278564, "loss": 0.6459, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4522531032562256, "rewards/margins": 0.6161283850669861, "rewards/rejected": -2.0683815479278564, "sft_loss": 1.4766418933868408, "step": 3870 }, { "epoch": 2.0739254055862184, "grad_norm": 4.156505428484555, "learning_rate": 2.6305783199059084e-07, "logits/chosen": -0.2825126647949219, "logits/rejected": -0.17621538043022156, "logps/chosen": -1.5356099605560303, "logps/rejected": -2.134596586227417, "loss": 0.6389, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5356099605560303, "rewards/margins": 0.5989863276481628, "rewards/rejected": -2.134596586227417, "sft_loss": 1.6044524908065796, "step": 3875 }, { "epoch": 2.07660143836762, "grad_norm": 2.3562440156593265, "learning_rate": 2.6168755492392324e-07, "logits/chosen": -0.29220303893089294, "logits/rejected": -0.130177840590477, "logps/chosen": -1.3875385522842407, "logps/rejected": -2.1555233001708984, "loss": 0.636, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3875385522842407, "rewards/margins": 0.7679846882820129, "rewards/rejected": -2.1555233001708984, "sft_loss": 1.4205989837646484, "step": 3880 }, { "epoch": 2.0792774711490214, "grad_norm": 3.358840986599196, "learning_rate": 2.6031958996564274e-07, "logits/chosen": -0.327506422996521, "logits/rejected": -0.18542982637882233, "logps/chosen": -1.4863008260726929, "logps/rejected": -2.287219762802124, "loss": 0.6454, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4863008260726929, "rewards/margins": 0.8009187579154968, "rewards/rejected": -2.287219762802124, "sft_loss": 1.52559494972229, "step": 3885 }, { "epoch": 2.081953503930423, "grad_norm": 5.410217880305303, "learning_rate": 2.589539503877518e-07, "logits/chosen": -0.22475166618824005, "logits/rejected": -0.12966808676719666, "logps/chosen": -1.5296286344528198, "logps/rejected": -2.1420018672943115, "loss": 0.6536, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5296286344528198, "rewards/margins": 0.6123733520507812, "rewards/rejected": -2.1420018672943115, "sft_loss": 1.5668294429779053, "step": 3890 }, { "epoch": 2.084629536711825, "grad_norm": 3.9021667690936757, "learning_rate": 2.5759064943969125e-07, "logits/chosen": -0.31892210245132446, "logits/rejected": -0.08537305891513824, "logps/chosen": -1.5146350860595703, "logps/rejected": -2.2054097652435303, "loss": 0.6532, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5146350860595703, "rewards/margins": 0.6907747983932495, "rewards/rejected": -2.2054097652435303, "sft_loss": 1.554931640625, "step": 3895 }, { "epoch": 2.087305569493226, "grad_norm": 3.063283540434889, "learning_rate": 2.562297003482131e-07, "logits/chosen": -0.180180162191391, "logits/rejected": -0.1619834005832672, "logps/chosen": -1.5252013206481934, "logps/rejected": -2.1712911128997803, "loss": 0.6483, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5252013206481934, "rewards/margins": 0.6460900902748108, "rewards/rejected": -2.1712911128997803, "sft_loss": 1.5667892694473267, "step": 3900 }, { "epoch": 2.089981602274628, "grad_norm": 14.199748316575338, "learning_rate": 2.548711163172512e-07, "logits/chosen": -0.246128648519516, "logits/rejected": -0.15304933488368988, "logps/chosen": -1.6168506145477295, "logps/rejected": -2.1350083351135254, "loss": 0.6645, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6168506145477295, "rewards/margins": 0.518157422542572, "rewards/rejected": -2.1350083351135254, "sft_loss": 1.5538456439971924, "step": 3905 }, { "epoch": 2.0926576350560295, "grad_norm": 2.8839854578578246, "learning_rate": 2.53514910527794e-07, "logits/chosen": -0.22317853569984436, "logits/rejected": -0.10486292839050293, "logps/chosen": -1.4757004976272583, "logps/rejected": -2.0389535427093506, "loss": 0.6426, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4757004976272583, "rewards/margins": 0.5632533431053162, "rewards/rejected": -2.0389535427093506, "sft_loss": 1.4846004247665405, "step": 3910 }, { "epoch": 2.095333667837431, "grad_norm": 2.2850158307723807, "learning_rate": 2.5216109613775573e-07, "logits/chosen": -0.30444300174713135, "logits/rejected": -0.14454534649848938, "logps/chosen": -1.5955278873443604, "logps/rejected": -2.248922824859619, "loss": 0.6674, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5955278873443604, "rewards/margins": 0.6533945798873901, "rewards/rejected": -2.248922824859619, "sft_loss": 1.6477130651474, "step": 3915 }, { "epoch": 2.0980097006188325, "grad_norm": 2.2209000145296645, "learning_rate": 2.5080968628184993e-07, "logits/chosen": -0.3080112934112549, "logits/rejected": -0.15310274064540863, "logps/chosen": -1.5924137830734253, "logps/rejected": -2.4808449745178223, "loss": 0.6519, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5924137830734253, "rewards/margins": 0.8884310722351074, "rewards/rejected": -2.4808449745178223, "sft_loss": 1.5781221389770508, "step": 3920 }, { "epoch": 2.1006857334002342, "grad_norm": 5.045439828682632, "learning_rate": 2.494606940714605e-07, "logits/chosen": -0.2884170711040497, "logits/rejected": -0.18335895240306854, "logps/chosen": -1.4607820510864258, "logps/rejected": -2.164994716644287, "loss": 0.6311, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4607820510864258, "rewards/margins": 0.7042129039764404, "rewards/rejected": -2.164994716644287, "sft_loss": 1.5072251558303833, "step": 3925 }, { "epoch": 2.103361766181636, "grad_norm": 2.3941074380032408, "learning_rate": 2.4811413259451625e-07, "logits/chosen": -0.3501512110233307, "logits/rejected": -0.2043345868587494, "logps/chosen": -1.5407905578613281, "logps/rejected": -2.1239986419677734, "loss": 0.6568, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5407905578613281, "rewards/margins": 0.5832081437110901, "rewards/rejected": -2.1239986419677734, "sft_loss": 1.5432384014129639, "step": 3930 }, { "epoch": 2.106037798963037, "grad_norm": 3.2289831570641385, "learning_rate": 2.46770014915362e-07, "logits/chosen": -0.23360323905944824, "logits/rejected": -0.16891071200370789, "logps/chosen": -1.556929349899292, "logps/rejected": -2.2463905811309814, "loss": 0.6374, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.556929349899292, "rewards/margins": 0.689461350440979, "rewards/rejected": -2.2463905811309814, "sft_loss": 1.5343983173370361, "step": 3935 }, { "epoch": 2.108713831744439, "grad_norm": 6.043689603162791, "learning_rate": 2.45428354074634e-07, "logits/chosen": -0.2577422857284546, "logits/rejected": -0.20512935519218445, "logps/chosen": -1.484359860420227, "logps/rejected": -2.2721476554870605, "loss": 0.6306, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.484359860420227, "rewards/margins": 0.7877878546714783, "rewards/rejected": -2.2721476554870605, "sft_loss": 1.422821283340454, "step": 3940 }, { "epoch": 2.1113898645258407, "grad_norm": 3.7634708186300014, "learning_rate": 2.4408916308913105e-07, "logits/chosen": -0.26054519414901733, "logits/rejected": -0.09344630688428879, "logps/chosen": -1.6033008098602295, "logps/rejected": -2.1131608486175537, "loss": 0.6645, "rewards/accuracies": 0.625, "rewards/chosen": -1.6033008098602295, "rewards/margins": 0.5098596215248108, "rewards/rejected": -2.1131608486175537, "sft_loss": 1.6177575588226318, "step": 3945 }, { "epoch": 2.114065897307242, "grad_norm": 5.82072101056835, "learning_rate": 2.4275245495169025e-07, "logits/chosen": -0.20669682323932648, "logits/rejected": -0.05159702152013779, "logps/chosen": -1.501251459121704, "logps/rejected": -2.1043949127197266, "loss": 0.6557, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.501251459121704, "rewards/margins": 0.6031434535980225, "rewards/rejected": -2.1043949127197266, "sft_loss": 1.5311315059661865, "step": 3950 }, { "epoch": 2.1167419300886436, "grad_norm": 3.6290893610895165, "learning_rate": 2.414182426310597e-07, "logits/chosen": -0.3596315085887909, "logits/rejected": -0.27861496806144714, "logps/chosen": -1.4975078105926514, "logps/rejected": -2.2116639614105225, "loss": 0.6398, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4975078105926514, "rewards/margins": 0.7141561508178711, "rewards/rejected": -2.2116639614105225, "sft_loss": 1.5466364622116089, "step": 3955 }, { "epoch": 2.1194179628700454, "grad_norm": 5.50609314622216, "learning_rate": 2.400865390717734e-07, "logits/chosen": -0.23720446228981018, "logits/rejected": -0.12143871933221817, "logps/chosen": -1.564563274383545, "logps/rejected": -2.4740467071533203, "loss": 0.6358, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.564563274383545, "rewards/margins": 0.9094836115837097, "rewards/rejected": -2.4740467071533203, "sft_loss": 1.6424548625946045, "step": 3960 }, { "epoch": 2.1220939956514466, "grad_norm": 5.574264917815816, "learning_rate": 2.3875735719402475e-07, "logits/chosen": -0.21673135459423065, "logits/rejected": -0.10895593464374542, "logps/chosen": -1.4948537349700928, "logps/rejected": -2.264178514480591, "loss": 0.6406, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4948537349700928, "rewards/margins": 0.769324779510498, "rewards/rejected": -2.264178514480591, "sft_loss": 1.5858509540557861, "step": 3965 }, { "epoch": 2.1247700284328483, "grad_norm": 2.995368540835511, "learning_rate": 2.3743070989354258e-07, "logits/chosen": -0.26554349064826965, "logits/rejected": -0.17646172642707825, "logps/chosen": -1.5605789422988892, "logps/rejected": -2.2075772285461426, "loss": 0.6429, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5605789422988892, "rewards/margins": 0.6469985842704773, "rewards/rejected": -2.2075772285461426, "sft_loss": 1.6384786367416382, "step": 3970 }, { "epoch": 2.12744606121425, "grad_norm": 4.42950380392317, "learning_rate": 2.3610661004146454e-07, "logits/chosen": -0.19056487083435059, "logits/rejected": -0.092097207903862, "logps/chosen": -1.4302465915679932, "logps/rejected": -1.9939210414886475, "loss": 0.6379, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4302465915679932, "rewards/margins": 0.5636744499206543, "rewards/rejected": -1.9939210414886475, "sft_loss": 1.4193260669708252, "step": 3975 }, { "epoch": 2.1301220939956513, "grad_norm": 3.2045560843805427, "learning_rate": 2.3478507048421314e-07, "logits/chosen": -0.29241254925727844, "logits/rejected": -0.217814639210701, "logps/chosen": -1.4759756326675415, "logps/rejected": -2.192446231842041, "loss": 0.6197, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4759756326675415, "rewards/margins": 0.7164705395698547, "rewards/rejected": -2.192446231842041, "sft_loss": 1.5651328563690186, "step": 3980 }, { "epoch": 2.132798126777053, "grad_norm": 3.104383102470415, "learning_rate": 2.334661040433713e-07, "logits/chosen": -0.33143699169158936, "logits/rejected": -0.2266085147857666, "logps/chosen": -1.6010643243789673, "logps/rejected": -2.2874233722686768, "loss": 0.6492, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6010643243789673, "rewards/margins": 0.6863590478897095, "rewards/rejected": -2.2874233722686768, "sft_loss": 1.561039686203003, "step": 3985 }, { "epoch": 2.1354741595584548, "grad_norm": 4.58052162712318, "learning_rate": 2.321497235155568e-07, "logits/chosen": -0.36959215998649597, "logits/rejected": -0.23951435089111328, "logps/chosen": -1.376029372215271, "logps/rejected": -2.1417298316955566, "loss": 0.6264, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.376029372215271, "rewards/margins": 0.7657004594802856, "rewards/rejected": -2.1417298316955566, "sft_loss": 1.4588136672973633, "step": 3990 }, { "epoch": 2.138150192339856, "grad_norm": 2.7058154520054187, "learning_rate": 2.3083594167229965e-07, "logits/chosen": -0.417828232049942, "logits/rejected": -0.17566026747226715, "logps/chosen": -1.5764210224151611, "logps/rejected": -2.2656896114349365, "loss": 0.6617, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5764210224151611, "rewards/margins": 0.6892686486244202, "rewards/rejected": -2.2656896114349365, "sft_loss": 1.5459423065185547, "step": 3995 }, { "epoch": 2.1408262251212578, "grad_norm": 12.57124495807514, "learning_rate": 2.295247712599167e-07, "logits/chosen": -0.27160823345184326, "logits/rejected": -0.17148330807685852, "logps/chosen": -1.4522453546524048, "logps/rejected": -2.23818302154541, "loss": 0.6241, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4522453546524048, "rewards/margins": 0.7859378457069397, "rewards/rejected": -2.23818302154541, "sft_loss": 1.5470503568649292, "step": 4000 }, { "epoch": 2.1408262251212578, "eval_logits/chosen": 0.09168983995914459, "eval_logits/rejected": 0.1840367317199707, "eval_logps/chosen": -1.5837377309799194, "eval_logps/rejected": -2.1770241260528564, "eval_loss": 0.6688703298568726, "eval_rewards/accuracies": 0.6454005837440491, "eval_rewards/chosen": -1.5837377309799194, "eval_rewards/margins": 0.5932866930961609, "eval_rewards/rejected": -2.1770241260528564, "eval_runtime": 43.8362, "eval_samples_per_second": 30.682, "eval_sft_loss": 1.5858901739120483, "eval_steps_per_second": 7.688, "step": 4000 }, { "epoch": 2.1435022579026595, "grad_norm": 2.5281695533267596, "learning_rate": 2.2821622499938948e-07, "logits/chosen": -0.2702171206474304, "logits/rejected": -0.056992627680301666, "logps/chosen": -1.7079509496688843, "logps/rejected": -2.2261626720428467, "loss": 0.6732, "rewards/accuracies": 0.625, "rewards/chosen": -1.7079509496688843, "rewards/margins": 0.5182119607925415, "rewards/rejected": -2.2261626720428467, "sft_loss": 1.642205834388733, "step": 4005 }, { "epoch": 2.1461782906840607, "grad_norm": 3.0027996441018887, "learning_rate": 2.269103155862391e-07, "logits/chosen": -0.30804362893104553, "logits/rejected": -0.21121926605701447, "logps/chosen": -1.5518161058425903, "logps/rejected": -2.1368958950042725, "loss": 0.6447, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5518161058425903, "rewards/margins": 0.5850798487663269, "rewards/rejected": -2.1368958950042725, "sft_loss": 1.5524580478668213, "step": 4010 }, { "epoch": 2.1488543234654625, "grad_norm": 4.040083493049222, "learning_rate": 2.2560705569040483e-07, "logits/chosen": -0.3009427487850189, "logits/rejected": -0.051604628562927246, "logps/chosen": -1.5372031927108765, "logps/rejected": -2.121778964996338, "loss": 0.662, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5372031927108765, "rewards/margins": 0.5845755338668823, "rewards/rejected": -2.121778964996338, "sft_loss": 1.578824758529663, "step": 4015 }, { "epoch": 2.151530356246864, "grad_norm": 3.234270915114087, "learning_rate": 2.2430645795611963e-07, "logits/chosen": -0.3834315538406372, "logits/rejected": -0.23764514923095703, "logps/chosen": -1.5670106410980225, "logps/rejected": -2.0982117652893066, "loss": 0.667, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5670106410980225, "rewards/margins": 0.5312010049819946, "rewards/rejected": -2.0982117652893066, "sft_loss": 1.6243221759796143, "step": 4020 }, { "epoch": 2.1542063890282654, "grad_norm": 3.270374268099964, "learning_rate": 2.230085350017884e-07, "logits/chosen": -0.2835990786552429, "logits/rejected": -0.18782752752304077, "logps/chosen": -1.4929298162460327, "logps/rejected": -2.2040629386901855, "loss": 0.654, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4929298162460327, "rewards/margins": 0.7111331820487976, "rewards/rejected": -2.2040629386901855, "sft_loss": 1.5205652713775635, "step": 4025 }, { "epoch": 2.156882421809667, "grad_norm": 4.252244719400194, "learning_rate": 2.2171329941986554e-07, "logits/chosen": -0.3299658000469208, "logits/rejected": -0.2453681230545044, "logps/chosen": -1.530896782875061, "logps/rejected": -2.152971029281616, "loss": 0.6482, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.530896782875061, "rewards/margins": 0.6220741868019104, "rewards/rejected": -2.152971029281616, "sft_loss": 1.5590530633926392, "step": 4030 }, { "epoch": 2.159558454591069, "grad_norm": 8.025842772881083, "learning_rate": 2.2042076377673202e-07, "logits/chosen": -0.2988958954811096, "logits/rejected": -0.25589337944984436, "logps/chosen": -1.4339474439620972, "logps/rejected": -1.829405426979065, "loss": 0.6578, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4339474439620972, "rewards/margins": 0.3954579532146454, "rewards/rejected": -1.829405426979065, "sft_loss": 1.5186035633087158, "step": 4035 }, { "epoch": 2.16223448737247, "grad_norm": 3.6713172453330567, "learning_rate": 2.1913094061257476e-07, "logits/chosen": -0.30180278420448303, "logits/rejected": -0.2629424035549164, "logps/chosen": -1.4053932428359985, "logps/rejected": -2.0193417072296143, "loss": 0.6537, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4053932428359985, "rewards/margins": 0.613948404788971, "rewards/rejected": -2.0193417072296143, "sft_loss": 1.430471658706665, "step": 4040 }, { "epoch": 2.164910520153872, "grad_norm": 3.6517684799723407, "learning_rate": 2.178438424412633e-07, "logits/chosen": -0.22194473445415497, "logits/rejected": -0.08967797458171844, "logps/chosen": -1.540501356124878, "logps/rejected": -2.0949809551239014, "loss": 0.6628, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.540501356124878, "rewards/margins": 0.554479718208313, "rewards/rejected": -2.0949809551239014, "sft_loss": 1.6057672500610352, "step": 4045 }, { "epoch": 2.1675865529352736, "grad_norm": 3.402377140438689, "learning_rate": 2.165594817502302e-07, "logits/chosen": -0.32742300629615784, "logits/rejected": -0.22545237839221954, "logps/chosen": -1.543937087059021, "logps/rejected": -1.9740203619003296, "loss": 0.655, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.543937087059021, "rewards/margins": 0.43008336424827576, "rewards/rejected": -1.9740203619003296, "sft_loss": 1.6045637130737305, "step": 4050 }, { "epoch": 2.170262585716675, "grad_norm": 2.534355319341989, "learning_rate": 2.1527787100034806e-07, "logits/chosen": -0.21653883159160614, "logits/rejected": -0.16043604910373688, "logps/chosen": -1.5343761444091797, "logps/rejected": -1.8853505849838257, "loss": 0.6673, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5343761444091797, "rewards/margins": 0.3509746193885803, "rewards/rejected": -1.8853505849838257, "sft_loss": 1.550243854522705, "step": 4055 }, { "epoch": 2.1729386184980766, "grad_norm": 3.171312253268627, "learning_rate": 2.1399902262581037e-07, "logits/chosen": -0.12519961595535278, "logits/rejected": -0.020199311897158623, "logps/chosen": -1.4559218883514404, "logps/rejected": -2.0748279094696045, "loss": 0.6433, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4559218883514404, "rewards/margins": 0.6189061999320984, "rewards/rejected": -2.0748279094696045, "sft_loss": 1.5062953233718872, "step": 4060 }, { "epoch": 2.1756146512794783, "grad_norm": 2.7594609214548056, "learning_rate": 2.127229490340094e-07, "logits/chosen": -0.358981192111969, "logits/rejected": -0.2713465690612793, "logps/chosen": -1.5313631296157837, "logps/rejected": -2.1783242225646973, "loss": 0.6448, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5313631296157837, "rewards/margins": 0.6469610333442688, "rewards/rejected": -2.1783242225646973, "sft_loss": 1.5525610446929932, "step": 4065 }, { "epoch": 2.1782906840608796, "grad_norm": 6.121162843444098, "learning_rate": 2.1144966260541698e-07, "logits/chosen": -0.2529239058494568, "logits/rejected": -0.05141264200210571, "logps/chosen": -1.5103384256362915, "logps/rejected": -2.1851370334625244, "loss": 0.6413, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5103384256362915, "rewards/margins": 0.6747983694076538, "rewards/rejected": -2.1851370334625244, "sft_loss": 1.5632578134536743, "step": 4070 }, { "epoch": 2.1809667168422813, "grad_norm": 2.9500682885294665, "learning_rate": 2.1017917569346332e-07, "logits/chosen": -0.300545334815979, "logits/rejected": -0.12688735127449036, "logps/chosen": -1.6066920757293701, "logps/rejected": -2.146780014038086, "loss": 0.6537, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6066920757293701, "rewards/margins": 0.5400879383087158, "rewards/rejected": -2.146780014038086, "sft_loss": 1.584996223449707, "step": 4075 }, { "epoch": 2.183642749623683, "grad_norm": 5.603556259094692, "learning_rate": 2.0891150062441837e-07, "logits/chosen": -0.29074740409851074, "logits/rejected": -0.15702712535858154, "logps/chosen": -1.6081063747406006, "logps/rejected": -2.3623788356781006, "loss": 0.6541, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6081063747406006, "rewards/margins": 0.7542725801467896, "rewards/rejected": -2.3623788356781006, "sft_loss": 1.6760278940200806, "step": 4080 }, { "epoch": 2.1863187824050843, "grad_norm": 4.607702897541227, "learning_rate": 2.0764664969727086e-07, "logits/chosen": -0.24949641525745392, "logits/rejected": -0.14471934735774994, "logps/chosen": -1.5316954851150513, "logps/rejected": -2.07235050201416, "loss": 0.648, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5316954851150513, "rewards/margins": 0.540655255317688, "rewards/rejected": -2.07235050201416, "sft_loss": 1.5349326133728027, "step": 4085 }, { "epoch": 2.188994815186486, "grad_norm": 3.898894706432375, "learning_rate": 2.0638463518361033e-07, "logits/chosen": -0.3402459919452667, "logits/rejected": -0.14765772223472595, "logps/chosen": -1.5174437761306763, "logps/rejected": -2.1393513679504395, "loss": 0.6519, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5174437761306763, "rewards/margins": 0.6219078302383423, "rewards/rejected": -2.1393513679504395, "sft_loss": 1.5297417640686035, "step": 4090 }, { "epoch": 2.1916708479678877, "grad_norm": 3.829847486281691, "learning_rate": 2.0512546932750702e-07, "logits/chosen": -0.30611926317214966, "logits/rejected": -0.22039365768432617, "logps/chosen": -1.5752049684524536, "logps/rejected": -2.075282335281372, "loss": 0.6605, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5752049684524536, "rewards/margins": 0.5000771284103394, "rewards/rejected": -2.075282335281372, "sft_loss": 1.624929428100586, "step": 4095 }, { "epoch": 2.194346880749289, "grad_norm": 4.689857036556933, "learning_rate": 2.0386916434539343e-07, "logits/chosen": -0.22249610722064972, "logits/rejected": -0.08447030931711197, "logps/chosen": -1.3815150260925293, "logps/rejected": -2.1207103729248047, "loss": 0.6157, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3815150260925293, "rewards/margins": 0.7391951680183411, "rewards/rejected": -2.1207103729248047, "sft_loss": 1.515148401260376, "step": 4100 }, { "epoch": 2.1970229135306907, "grad_norm": 3.130096858285971, "learning_rate": 2.0261573242594627e-07, "logits/chosen": -0.2264455258846283, "logits/rejected": -0.024552499875426292, "logps/chosen": -1.628535270690918, "logps/rejected": -2.1242098808288574, "loss": 0.6589, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.628535270690918, "rewards/margins": 0.4956747591495514, "rewards/rejected": -2.1242098808288574, "sft_loss": 1.5905323028564453, "step": 4105 }, { "epoch": 2.1996989463120924, "grad_norm": 6.957922549826785, "learning_rate": 2.0136518572996724e-07, "logits/chosen": -0.2159980833530426, "logits/rejected": -0.037352751940488815, "logps/chosen": -1.5755895376205444, "logps/rejected": -2.074293375015259, "loss": 0.6552, "rewards/accuracies": 0.625, "rewards/chosen": -1.5755895376205444, "rewards/margins": 0.4987037777900696, "rewards/rejected": -2.074293375015259, "sft_loss": 1.5881279706954956, "step": 4110 }, { "epoch": 2.202374979093494, "grad_norm": 4.253885036940023, "learning_rate": 2.0011753639026617e-07, "logits/chosen": -0.20722968876361847, "logits/rejected": -0.1629769206047058, "logps/chosen": -1.5342538356781006, "logps/rejected": -2.29413104057312, "loss": 0.6493, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5342538356781006, "rewards/margins": 0.7598771452903748, "rewards/rejected": -2.29413104057312, "sft_loss": 1.5792230367660522, "step": 4115 }, { "epoch": 2.2050510118748954, "grad_norm": 3.799657751145327, "learning_rate": 1.988727965115421e-07, "logits/chosen": -0.26215869188308716, "logits/rejected": -0.1872785985469818, "logps/chosen": -1.4091601371765137, "logps/rejected": -2.145296812057495, "loss": 0.6212, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4091601371765137, "rewards/margins": 0.736136794090271, "rewards/rejected": -2.145296812057495, "sft_loss": 1.5159685611724854, "step": 4120 }, { "epoch": 2.207727044656297, "grad_norm": 3.359631802849689, "learning_rate": 1.9763097817026713e-07, "logits/chosen": -0.3127399981021881, "logits/rejected": -0.10671690851449966, "logps/chosen": -1.4820116758346558, "logps/rejected": -2.274005174636841, "loss": 0.6364, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4820116758346558, "rewards/margins": 0.7919936180114746, "rewards/rejected": -2.274005174636841, "sft_loss": 1.5597707033157349, "step": 4125 }, { "epoch": 2.210403077437699, "grad_norm": 4.013079286547791, "learning_rate": 1.9639209341456796e-07, "logits/chosen": -0.2609347999095917, "logits/rejected": -0.17760607600212097, "logps/chosen": -1.5475456714630127, "logps/rejected": -2.2146785259246826, "loss": 0.6566, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5475456714630127, "rewards/margins": 0.6671331524848938, "rewards/rejected": -2.2146785259246826, "sft_loss": 1.6043469905853271, "step": 4130 }, { "epoch": 2.2130791102191, "grad_norm": 5.388820016376569, "learning_rate": 1.951561542641102e-07, "logits/chosen": -0.25289368629455566, "logits/rejected": -0.2507743835449219, "logps/chosen": -1.6624103784561157, "logps/rejected": -2.370983600616455, "loss": 0.6575, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6624103784561157, "rewards/margins": 0.7085734605789185, "rewards/rejected": -2.370983600616455, "sft_loss": 1.6707814931869507, "step": 4135 }, { "epoch": 2.215755143000502, "grad_norm": 3.4805850054249725, "learning_rate": 1.939231727099806e-07, "logits/chosen": -0.3757517337799072, "logits/rejected": -0.29435932636260986, "logps/chosen": -1.5222690105438232, "logps/rejected": -2.18015718460083, "loss": 0.6552, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5222690105438232, "rewards/margins": 0.6578881740570068, "rewards/rejected": -2.18015718460083, "sft_loss": 1.5597175359725952, "step": 4140 }, { "epoch": 2.2184311757819035, "grad_norm": 4.618202865883473, "learning_rate": 1.926931607145719e-07, "logits/chosen": -0.15316875278949738, "logits/rejected": -0.01105407066643238, "logps/chosen": -1.7164844274520874, "logps/rejected": -2.389507293701172, "loss": 0.6654, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7164844274520874, "rewards/margins": 0.6730228662490845, "rewards/rejected": -2.389507293701172, "sft_loss": 1.6216520071029663, "step": 4145 }, { "epoch": 2.221107208563305, "grad_norm": 4.228533631865109, "learning_rate": 1.9146613021146564e-07, "logits/chosen": -0.2290269434452057, "logits/rejected": -0.138050839304924, "logps/chosen": -1.4907031059265137, "logps/rejected": -2.2492947578430176, "loss": 0.6385, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4907031059265137, "rewards/margins": 0.7585914731025696, "rewards/rejected": -2.2492947578430176, "sft_loss": 1.5580991506576538, "step": 4150 }, { "epoch": 2.2237832413447065, "grad_norm": 3.554780898138054, "learning_rate": 1.9024209310531736e-07, "logits/chosen": -0.19468359649181366, "logits/rejected": -0.20953765511512756, "logps/chosen": -1.5502973794937134, "logps/rejected": -2.0763301849365234, "loss": 0.6427, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5502973794937134, "rewards/margins": 0.5260329246520996, "rewards/rejected": -2.0763301849365234, "sft_loss": 1.5064983367919922, "step": 4155 }, { "epoch": 2.2264592741261082, "grad_norm": 3.635594365625625, "learning_rate": 1.890210612717401e-07, "logits/chosen": -0.27188238501548767, "logits/rejected": -0.12990307807922363, "logps/chosen": -1.6696773767471313, "logps/rejected": -2.155397415161133, "loss": 0.6537, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6696773767471313, "rewards/margins": 0.48572009801864624, "rewards/rejected": -2.155397415161133, "sft_loss": 1.6960710287094116, "step": 4160 }, { "epoch": 2.2291353069075095, "grad_norm": 3.9050399786833174, "learning_rate": 1.8780304655719054e-07, "logits/chosen": -0.3017478287220001, "logits/rejected": -0.20494893193244934, "logps/chosen": -1.5754562616348267, "logps/rejected": -2.265209197998047, "loss": 0.6462, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5754562616348267, "rewards/margins": 0.6897529363632202, "rewards/rejected": -2.265209197998047, "sft_loss": 1.5912220478057861, "step": 4165 }, { "epoch": 2.231811339688911, "grad_norm": 4.382291005981211, "learning_rate": 1.865880607788523e-07, "logits/chosen": -0.19715535640716553, "logits/rejected": -0.13701540231704712, "logps/chosen": -1.5412023067474365, "logps/rejected": -2.240354061126709, "loss": 0.6442, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5412023067474365, "rewards/margins": 0.6991516947746277, "rewards/rejected": -2.240354061126709, "sft_loss": 1.5862661600112915, "step": 4170 }, { "epoch": 2.234487372470313, "grad_norm": 4.050601551719255, "learning_rate": 1.8537611572452316e-07, "logits/chosen": -0.312357634305954, "logits/rejected": -0.21406638622283936, "logps/chosen": -1.5535626411437988, "logps/rejected": -1.9893906116485596, "loss": 0.6473, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5535626411437988, "rewards/margins": 0.4358278214931488, "rewards/rejected": -1.9893906116485596, "sft_loss": 1.5592749118804932, "step": 4175 }, { "epoch": 2.237163405251714, "grad_norm": 5.3727559137210115, "learning_rate": 1.84167223152499e-07, "logits/chosen": -0.3214295506477356, "logits/rejected": -0.11621556431055069, "logps/chosen": -1.5065670013427734, "logps/rejected": -2.185896396636963, "loss": 0.6436, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5065670013427734, "rewards/margins": 0.679329514503479, "rewards/rejected": -2.185896396636963, "sft_loss": 1.5351276397705078, "step": 4180 }, { "epoch": 2.239839438033116, "grad_norm": 12.061432178974616, "learning_rate": 1.8296139479146112e-07, "logits/chosen": -0.34014302492141724, "logits/rejected": -0.3240959048271179, "logps/chosen": -1.446756362915039, "logps/rejected": -2.0364651679992676, "loss": 0.6389, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.446756362915039, "rewards/margins": 0.589708685874939, "rewards/rejected": -2.0364651679992676, "sft_loss": 1.487903356552124, "step": 4185 }, { "epoch": 2.2425154708145176, "grad_norm": 6.0753641029472485, "learning_rate": 1.8175864234036132e-07, "logits/chosen": -0.20700252056121826, "logits/rejected": -0.11087401211261749, "logps/chosen": -1.531902551651001, "logps/rejected": -2.245913505554199, "loss": 0.6461, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.531902551651001, "rewards/margins": 0.7140110731124878, "rewards/rejected": -2.245913505554199, "sft_loss": 1.5783272981643677, "step": 4190 }, { "epoch": 2.245191503595919, "grad_norm": 2.6050887282150343, "learning_rate": 1.805589774683094e-07, "logits/chosen": -0.3839908242225647, "logits/rejected": -0.23379337787628174, "logps/chosen": -1.4995805025100708, "logps/rejected": -2.030235767364502, "loss": 0.6446, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4995805025100708, "rewards/margins": 0.5306550860404968, "rewards/rejected": -2.030235767364502, "sft_loss": 1.5718003511428833, "step": 4195 }, { "epoch": 2.2478675363773206, "grad_norm": 2.2359474207528915, "learning_rate": 1.79362411814459e-07, "logits/chosen": -0.17385700345039368, "logits/rejected": -0.183371439576149, "logps/chosen": -1.6142679452896118, "logps/rejected": -2.1658880710601807, "loss": 0.6418, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6142679452896118, "rewards/margins": 0.5516203045845032, "rewards/rejected": -2.1658880710601807, "sft_loss": 1.6339337825775146, "step": 4200 }, { "epoch": 2.2505435691587223, "grad_norm": 2.9839216622314373, "learning_rate": 1.7816895698789552e-07, "logits/chosen": -0.3360547423362732, "logits/rejected": -0.25803929567337036, "logps/chosen": -1.5377719402313232, "logps/rejected": -2.0626683235168457, "loss": 0.646, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5377719402313232, "rewards/margins": 0.5248963832855225, "rewards/rejected": -2.0626683235168457, "sft_loss": 1.5281660556793213, "step": 4205 }, { "epoch": 2.2532196019401236, "grad_norm": 5.017980829186697, "learning_rate": 1.7697862456752271e-07, "logits/chosen": -0.3281833529472351, "logits/rejected": -0.21899166703224182, "logps/chosen": -1.560179591178894, "logps/rejected": -2.4935262203216553, "loss": 0.6466, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.560179591178894, "rewards/margins": 0.9333463907241821, "rewards/rejected": -2.4935262203216553, "sft_loss": 1.6043787002563477, "step": 4210 }, { "epoch": 2.2558956347215253, "grad_norm": 3.5454291356920855, "learning_rate": 1.7579142610195124e-07, "logits/chosen": -0.28248411417007446, "logits/rejected": -0.14311420917510986, "logps/chosen": -1.6289985179901123, "logps/rejected": -2.2365944385528564, "loss": 0.6501, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6289985179901123, "rewards/margins": 0.607595682144165, "rewards/rejected": -2.2365944385528564, "sft_loss": 1.574914574623108, "step": 4215 }, { "epoch": 2.258571667502927, "grad_norm": 3.0073974431466333, "learning_rate": 1.7460737310938568e-07, "logits/chosen": -0.34595996141433716, "logits/rejected": -0.15653648972511292, "logps/chosen": -1.5226856470108032, "logps/rejected": -2.2048676013946533, "loss": 0.641, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5226856470108032, "rewards/margins": 0.6821819543838501, "rewards/rejected": -2.2048676013946533, "sft_loss": 1.5713951587677002, "step": 4220 }, { "epoch": 2.2612477002843283, "grad_norm": 4.209084915617624, "learning_rate": 1.734264770775133e-07, "logits/chosen": -0.3309343755245209, "logits/rejected": -0.15454891324043274, "logps/chosen": -1.5566303730010986, "logps/rejected": -2.306165933609009, "loss": 0.6392, "rewards/accuracies": 0.75, "rewards/chosen": -1.5566303730010986, "rewards/margins": 0.7495354413986206, "rewards/rejected": -2.306165933609009, "sft_loss": 1.5894476175308228, "step": 4225 }, { "epoch": 2.26392373306573, "grad_norm": 5.114930891343891, "learning_rate": 1.7224874946339241e-07, "logits/chosen": -0.3384936451911926, "logits/rejected": -0.25529247522354126, "logps/chosen": -1.7155176401138306, "logps/rejected": -2.2585268020629883, "loss": 0.6576, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7155176401138306, "rewards/margins": 0.5430089831352234, "rewards/rejected": -2.2585268020629883, "sft_loss": 1.6163198947906494, "step": 4230 }, { "epoch": 2.2665997658471317, "grad_norm": 2.306857615936622, "learning_rate": 1.7107420169334186e-07, "logits/chosen": -0.2622672915458679, "logits/rejected": -0.1470358669757843, "logps/chosen": -1.6400104761123657, "logps/rejected": -2.070843458175659, "loss": 0.6734, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6400104761123657, "rewards/margins": 0.43083301186561584, "rewards/rejected": -2.070843458175659, "sft_loss": 1.6886297464370728, "step": 4235 }, { "epoch": 2.269275798628533, "grad_norm": 2.8919252340955417, "learning_rate": 1.6990284516282893e-07, "logits/chosen": -0.28206485509872437, "logits/rejected": -0.16051602363586426, "logps/chosen": -1.50666344165802, "logps/rejected": -2.0285463333129883, "loss": 0.6582, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.50666344165802, "rewards/margins": 0.5218832492828369, "rewards/rejected": -2.0285463333129883, "sft_loss": 1.5844346284866333, "step": 4240 }, { "epoch": 2.2719518314099347, "grad_norm": 4.013146099551089, "learning_rate": 1.687346912363602e-07, "logits/chosen": -0.3432479500770569, "logits/rejected": -0.22454170882701874, "logps/chosen": -1.561184287071228, "logps/rejected": -2.172863721847534, "loss": 0.6492, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.561184287071228, "rewards/margins": 0.6116796135902405, "rewards/rejected": -2.172863721847534, "sft_loss": 1.589215636253357, "step": 4245 }, { "epoch": 2.2746278641913364, "grad_norm": 1.7215037172081569, "learning_rate": 1.675697512473697e-07, "logits/chosen": -0.26819589734077454, "logits/rejected": -0.09419532120227814, "logps/chosen": -1.6141561269760132, "logps/rejected": -2.2761244773864746, "loss": 0.6564, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6141561269760132, "rewards/margins": 0.6619683504104614, "rewards/rejected": -2.2761244773864746, "sft_loss": 1.5889050960540771, "step": 4250 }, { "epoch": 2.2773038969727377, "grad_norm": 3.426816636652408, "learning_rate": 1.6640803649811087e-07, "logits/chosen": -0.29647761583328247, "logits/rejected": -0.06999702751636505, "logps/chosen": -1.6192327737808228, "logps/rejected": -2.3246865272521973, "loss": 0.6454, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6192327737808228, "rewards/margins": 0.7054537534713745, "rewards/rejected": -2.3246865272521973, "sft_loss": 1.6010668277740479, "step": 4255 }, { "epoch": 2.2799799297541394, "grad_norm": 4.059369893080733, "learning_rate": 1.6524955825954472e-07, "logits/chosen": -0.2631566524505615, "logits/rejected": -0.15566667914390564, "logps/chosen": -1.4722065925598145, "logps/rejected": -2.1321847438812256, "loss": 0.6334, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4722065925598145, "rewards/margins": 0.6599782705307007, "rewards/rejected": -2.1321847438812256, "sft_loss": 1.4477260112762451, "step": 4260 }, { "epoch": 2.282655962535541, "grad_norm": 2.8264964738448075, "learning_rate": 1.6409432777123277e-07, "logits/chosen": -0.38454973697662354, "logits/rejected": -0.2207675725221634, "logps/chosen": -1.5005874633789062, "logps/rejected": -2.2709789276123047, "loss": 0.6458, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5005874633789062, "rewards/margins": 0.770391583442688, "rewards/rejected": -2.2709789276123047, "sft_loss": 1.544205665588379, "step": 4265 }, { "epoch": 2.285331995316943, "grad_norm": 3.7044371198908705, "learning_rate": 1.6294235624122577e-07, "logits/chosen": -0.20141199231147766, "logits/rejected": 0.038046397268772125, "logps/chosen": -1.5491031408309937, "logps/rejected": -2.337373733520508, "loss": 0.6471, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5491031408309937, "rewards/margins": 0.7882707715034485, "rewards/rejected": -2.337373733520508, "sft_loss": 1.5153697729110718, "step": 4270 }, { "epoch": 2.288008028098344, "grad_norm": 4.801419996115539, "learning_rate": 1.6179365484595697e-07, "logits/chosen": -0.2652028203010559, "logits/rejected": -0.18785782158374786, "logps/chosen": -1.613633394241333, "logps/rejected": -2.23420786857605, "loss": 0.6655, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.613633394241333, "rewards/margins": 0.6205744743347168, "rewards/rejected": -2.23420786857605, "sft_loss": 1.6279270648956299, "step": 4275 }, { "epoch": 2.290684060879746, "grad_norm": 4.468703765027423, "learning_rate": 1.60648234730132e-07, "logits/chosen": -0.29716956615448, "logits/rejected": -0.20758691430091858, "logps/chosen": -1.5046494007110596, "logps/rejected": -2.1827492713928223, "loss": 0.6342, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5046494007110596, "rewards/margins": 0.6780996322631836, "rewards/rejected": -2.1827492713928223, "sft_loss": 1.498509407043457, "step": 4280 }, { "epoch": 2.293360093661147, "grad_norm": 6.282108459199361, "learning_rate": 1.595061070066222e-07, "logits/chosen": -0.19836069643497467, "logits/rejected": -0.20581555366516113, "logps/chosen": -1.507200002670288, "logps/rejected": -2.1410250663757324, "loss": 0.6457, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.507200002670288, "rewards/margins": 0.6338250041007996, "rewards/rejected": -2.1410250663757324, "sft_loss": 1.5645170211791992, "step": 4285 }, { "epoch": 2.296036126442549, "grad_norm": 4.765230507949718, "learning_rate": 1.5836728275635542e-07, "logits/chosen": -0.3440048098564148, "logits/rejected": -0.18142908811569214, "logps/chosen": -1.604383111000061, "logps/rejected": -2.225277900695801, "loss": 0.6648, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.604383111000061, "rewards/margins": 0.6208949089050293, "rewards/rejected": -2.225277900695801, "sft_loss": 1.6145012378692627, "step": 4290 }, { "epoch": 2.2987121592239506, "grad_norm": 3.376587341931551, "learning_rate": 1.5723177302820984e-07, "logits/chosen": -0.31696051359176636, "logits/rejected": -0.2621343433856964, "logps/chosen": -1.5680984258651733, "logps/rejected": -2.136096954345703, "loss": 0.668, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5680984258651733, "rewards/margins": 0.5679982900619507, "rewards/rejected": -2.136096954345703, "sft_loss": 1.5849583148956299, "step": 4295 }, { "epoch": 2.3013881920053523, "grad_norm": 3.195937393196168, "learning_rate": 1.5609958883890544e-07, "logits/chosen": -0.25740814208984375, "logits/rejected": -0.13126549124717712, "logps/chosen": -1.63578200340271, "logps/rejected": -2.066324472427368, "loss": 0.6535, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.63578200340271, "rewards/margins": 0.4305424094200134, "rewards/rejected": -2.066324472427368, "sft_loss": 1.5332118272781372, "step": 4300 }, { "epoch": 2.3040642247867535, "grad_norm": 3.299213352267835, "learning_rate": 1.5497074117289865e-07, "logits/chosen": -0.3346925675868988, "logits/rejected": -0.21511289477348328, "logps/chosen": -1.5720036029815674, "logps/rejected": -2.3347842693328857, "loss": 0.6268, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5720036029815674, "rewards/margins": 0.7627805471420288, "rewards/rejected": -2.3347842693328857, "sft_loss": 1.6377277374267578, "step": 4305 }, { "epoch": 2.3067402575681553, "grad_norm": 5.568185476548528, "learning_rate": 1.5384524098227402e-07, "logits/chosen": -0.28449270129203796, "logits/rejected": -0.08388768136501312, "logps/chosen": -1.519416093826294, "logps/rejected": -2.3788251876831055, "loss": 0.6451, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.519416093826294, "rewards/margins": 0.8594093322753906, "rewards/rejected": -2.3788251876831055, "sft_loss": 1.5618746280670166, "step": 4310 }, { "epoch": 2.3094162903495565, "grad_norm": 3.8036289931086484, "learning_rate": 1.5272309918663974e-07, "logits/chosen": -0.2648676931858063, "logits/rejected": -0.10690250247716904, "logps/chosen": -1.6507768630981445, "logps/rejected": -2.0753867626190186, "loss": 0.6671, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6507768630981445, "rewards/margins": 0.4246101379394531, "rewards/rejected": -2.0753867626190186, "sft_loss": 1.6561870574951172, "step": 4315 }, { "epoch": 2.3120923231309582, "grad_norm": 3.505938051751801, "learning_rate": 1.516043266730201e-07, "logits/chosen": -0.29961520433425903, "logits/rejected": -0.16704608500003815, "logps/chosen": -1.6481430530548096, "logps/rejected": -2.1766676902770996, "loss": 0.6463, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6481430530548096, "rewards/margins": 0.5285248756408691, "rewards/rejected": -2.1766676902770996, "sft_loss": 1.6171963214874268, "step": 4320 }, { "epoch": 2.31476835591236, "grad_norm": 5.308196788764888, "learning_rate": 1.504889342957512e-07, "logits/chosen": -0.2850746810436249, "logits/rejected": -0.12523740530014038, "logps/chosen": -1.6003822088241577, "logps/rejected": -2.354501247406006, "loss": 0.6449, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6003822088241577, "rewards/margins": 0.7541190385818481, "rewards/rejected": -2.354501247406006, "sft_loss": 1.6053550243377686, "step": 4325 }, { "epoch": 2.3174443886937617, "grad_norm": 3.782695455254811, "learning_rate": 1.4937693287637453e-07, "logits/chosen": -0.26460617780685425, "logits/rejected": -0.1535172164440155, "logps/chosen": -1.7259595394134521, "logps/rejected": -2.2482681274414062, "loss": 0.6807, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.7259595394134521, "rewards/margins": 0.5223087072372437, "rewards/rejected": -2.2482681274414062, "sft_loss": 1.6770446300506592, "step": 4330 }, { "epoch": 2.320120421475163, "grad_norm": 4.0863537745622045, "learning_rate": 1.4826833320353305e-07, "logits/chosen": -0.24232418835163116, "logits/rejected": -0.17023006081581116, "logps/chosen": -1.721494436264038, "logps/rejected": -2.3115336894989014, "loss": 0.6655, "rewards/accuracies": 0.65625, "rewards/chosen": -1.721494436264038, "rewards/margins": 0.5900392532348633, "rewards/rejected": -2.3115336894989014, "sft_loss": 1.5996655225753784, "step": 4335 }, { "epoch": 2.3227964542565647, "grad_norm": 5.08775407845278, "learning_rate": 1.4716314603286528e-07, "logits/chosen": -0.28603631258010864, "logits/rejected": -0.11851084232330322, "logps/chosen": -1.4622247219085693, "logps/rejected": -2.409731149673462, "loss": 0.6344, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4622247219085693, "rewards/margins": 0.9475065469741821, "rewards/rejected": -2.409731149673462, "sft_loss": 1.5201201438903809, "step": 4340 }, { "epoch": 2.3254724870379664, "grad_norm": 3.3750509387088394, "learning_rate": 1.4606138208690233e-07, "logits/chosen": -0.31978458166122437, "logits/rejected": -0.24338774383068085, "logps/chosen": -1.8174854516983032, "logps/rejected": -2.1474390029907227, "loss": 0.6798, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.8174854516983032, "rewards/margins": 0.329953670501709, "rewards/rejected": -2.1474390029907227, "sft_loss": 1.6977739334106445, "step": 4345 }, { "epoch": 2.3281485198193677, "grad_norm": 3.0296914825809864, "learning_rate": 1.4496305205496251e-07, "logits/chosen": -0.2576034665107727, "logits/rejected": -0.19173723459243774, "logps/chosen": -1.6291992664337158, "logps/rejected": -2.3262884616851807, "loss": 0.6595, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6291992664337158, "rewards/margins": 0.6970890760421753, "rewards/rejected": -2.3262884616851807, "sft_loss": 1.6249767541885376, "step": 4350 }, { "epoch": 2.3308245526007694, "grad_norm": 3.385482536388645, "learning_rate": 1.4386816659304895e-07, "logits/chosen": -0.32519760727882385, "logits/rejected": -0.2157323807477951, "logps/chosen": -1.5996949672698975, "logps/rejected": -2.1245455741882324, "loss": 0.6469, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5996949672698975, "rewards/margins": 0.5248507261276245, "rewards/rejected": -2.1245455741882324, "sft_loss": 1.616973876953125, "step": 4355 }, { "epoch": 2.333500585382171, "grad_norm": 3.036633704742196, "learning_rate": 1.4277673632374492e-07, "logits/chosen": -0.38313087821006775, "logits/rejected": -0.1867683082818985, "logps/chosen": -1.6491420269012451, "logps/rejected": -2.2258849143981934, "loss": 0.6628, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6491420269012451, "rewards/margins": 0.5767428278923035, "rewards/rejected": -2.2258849143981934, "sft_loss": 1.6371958255767822, "step": 4360 }, { "epoch": 2.3361766181635724, "grad_norm": 3.1469095206902136, "learning_rate": 1.416887718361119e-07, "logits/chosen": -0.21445195376873016, "logits/rejected": -0.216725155711174, "logps/chosen": -1.5153484344482422, "logps/rejected": -2.1543962955474854, "loss": 0.6457, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5153484344482422, "rewards/margins": 0.6390475630760193, "rewards/rejected": -2.1543962955474854, "sft_loss": 1.5688116550445557, "step": 4365 }, { "epoch": 2.338852650944974, "grad_norm": 4.9960979584716325, "learning_rate": 1.406042836855859e-07, "logits/chosen": -0.2531622350215912, "logits/rejected": -0.13175725936889648, "logps/chosen": -1.4258439540863037, "logps/rejected": -2.2913336753845215, "loss": 0.625, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4258439540863037, "rewards/margins": 0.8654899597167969, "rewards/rejected": -2.2913336753845215, "sft_loss": 1.4867799282073975, "step": 4370 }, { "epoch": 2.341528683726376, "grad_norm": 5.347358695623874, "learning_rate": 1.3952328239387595e-07, "logits/chosen": -0.3808794617652893, "logits/rejected": -0.19544386863708496, "logps/chosen": -1.6057853698730469, "logps/rejected": -2.25998854637146, "loss": 0.6425, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6057853698730469, "rewards/margins": 0.6542031168937683, "rewards/rejected": -2.25998854637146, "sft_loss": 1.6914764642715454, "step": 4375 }, { "epoch": 2.344204716507777, "grad_norm": 3.692579308133713, "learning_rate": 1.3844577844886109e-07, "logits/chosen": -0.27982017397880554, "logits/rejected": -0.08442778885364532, "logps/chosen": -1.5698671340942383, "logps/rejected": -2.2227370738983154, "loss": 0.6657, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5698671340942383, "rewards/margins": 0.6528701782226562, "rewards/rejected": -2.2227370738983154, "sft_loss": 1.5974746942520142, "step": 4380 }, { "epoch": 2.346880749289179, "grad_norm": 4.484161356916124, "learning_rate": 1.3737178230448955e-07, "logits/chosen": -0.3287569582462311, "logits/rejected": -0.1841241419315338, "logps/chosen": -1.5247882604599, "logps/rejected": -2.231912136077881, "loss": 0.6458, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5247882604599, "rewards/margins": 0.7071238160133362, "rewards/rejected": -2.231912136077881, "sft_loss": 1.5711970329284668, "step": 4385 }, { "epoch": 2.3495567820705805, "grad_norm": 2.888037570851758, "learning_rate": 1.363013043806764e-07, "logits/chosen": -0.2781091034412384, "logits/rejected": -0.14470918476581573, "logps/chosen": -1.5172996520996094, "logps/rejected": -2.0534207820892334, "loss": 0.6601, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5172996520996094, "rewards/margins": 0.536121129989624, "rewards/rejected": -2.0534207820892334, "sft_loss": 1.5588241815567017, "step": 4390 }, { "epoch": 2.3522328148519818, "grad_norm": 3.4537566419987793, "learning_rate": 1.352343550632034e-07, "logits/chosen": -0.2543310523033142, "logits/rejected": -0.10968288034200668, "logps/chosen": -1.5393258333206177, "logps/rejected": -2.4675350189208984, "loss": 0.6527, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5393258333206177, "rewards/margins": 0.9282093048095703, "rewards/rejected": -2.4675350189208984, "sft_loss": 1.5896894931793213, "step": 4395 }, { "epoch": 2.3549088476333835, "grad_norm": 4.520510904522744, "learning_rate": 1.3417094470361722e-07, "logits/chosen": -0.3551029562950134, "logits/rejected": -0.2078694850206375, "logps/chosen": -1.5725879669189453, "logps/rejected": -2.226203441619873, "loss": 0.6443, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5725879669189453, "rewards/margins": 0.6536153554916382, "rewards/rejected": -2.226203441619873, "sft_loss": 1.5988163948059082, "step": 4400 }, { "epoch": 2.3549088476333835, "eval_logits/chosen": -0.03984348848462105, "eval_logits/rejected": 0.04256761446595192, "eval_logps/chosen": -1.6001262664794922, "eval_logps/rejected": -2.216757297515869, "eval_loss": 0.66923588514328, "eval_rewards/accuracies": 0.6461424231529236, "eval_rewards/chosen": -1.6001262664794922, "eval_rewards/margins": 0.6166310906410217, "eval_rewards/rejected": -2.216757297515869, "eval_runtime": 44.4182, "eval_samples_per_second": 30.28, "eval_sft_loss": 1.5919395685195923, "eval_steps_per_second": 7.587, "step": 4400 }, { "epoch": 2.357584880414785, "grad_norm": 5.03762010664131, "learning_rate": 1.3311108361913015e-07, "logits/chosen": -0.36141958832740784, "logits/rejected": -0.3205726444721222, "logps/chosen": -1.536834478378296, "logps/rejected": -2.015439510345459, "loss": 0.6594, "rewards/accuracies": 0.6875, "rewards/chosen": -1.536834478378296, "rewards/margins": 0.4786049425601959, "rewards/rejected": -2.015439510345459, "sft_loss": 1.5660918951034546, "step": 4405 }, { "epoch": 2.3602609131961865, "grad_norm": 2.9750365763093645, "learning_rate": 1.3205478209251874e-07, "logits/chosen": -0.27190545201301575, "logits/rejected": -0.22115862369537354, "logps/chosen": -1.6559785604476929, "logps/rejected": -2.380772829055786, "loss": 0.6574, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6559785604476929, "rewards/margins": 0.724794328212738, "rewards/rejected": -2.380772829055786, "sft_loss": 1.6743500232696533, "step": 4410 }, { "epoch": 2.362936945977588, "grad_norm": 3.183733452919812, "learning_rate": 1.310020503720254e-07, "logits/chosen": -0.2622893750667572, "logits/rejected": -0.09581082314252853, "logps/chosen": -1.6069536209106445, "logps/rejected": -2.1633763313293457, "loss": 0.6639, "rewards/accuracies": 0.625, "rewards/chosen": -1.6069536209106445, "rewards/margins": 0.5564228296279907, "rewards/rejected": -2.1633763313293457, "sft_loss": 1.6053905487060547, "step": 4415 }, { "epoch": 2.36561297875899, "grad_norm": 4.6850878320413205, "learning_rate": 1.2995289867125752e-07, "logits/chosen": -0.27317818999290466, "logits/rejected": -0.18975523114204407, "logps/chosen": -1.6047519445419312, "logps/rejected": -2.055290699005127, "loss": 0.6648, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6047519445419312, "rewards/margins": 0.45053917169570923, "rewards/rejected": -2.055290699005127, "sft_loss": 1.58820641040802, "step": 4420 }, { "epoch": 2.368289011540391, "grad_norm": 4.483761518462918, "learning_rate": 1.2890733716908986e-07, "logits/chosen": -0.2564404010772705, "logits/rejected": -0.15235671401023865, "logps/chosen": -1.4038350582122803, "logps/rejected": -2.0266623497009277, "loss": 0.6178, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4038350582122803, "rewards/margins": 0.6228272318840027, "rewards/rejected": -2.0266623497009277, "sft_loss": 1.5363707542419434, "step": 4425 }, { "epoch": 2.370965044321793, "grad_norm": 3.053354746961691, "learning_rate": 1.2786537600956454e-07, "logits/chosen": -0.31015071272850037, "logits/rejected": -0.14849421381950378, "logps/chosen": -1.5916285514831543, "logps/rejected": -2.3258919715881348, "loss": 0.6485, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5916285514831543, "rewards/margins": 0.7342632412910461, "rewards/rejected": -2.3258919715881348, "sft_loss": 1.6185725927352905, "step": 4430 }, { "epoch": 2.3736410771031946, "grad_norm": 2.828988858883003, "learning_rate": 1.268270253017933e-07, "logits/chosen": -0.38335493206977844, "logits/rejected": -0.17762592434883118, "logps/chosen": -1.4943480491638184, "logps/rejected": -2.273847818374634, "loss": 0.6411, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4943480491638184, "rewards/margins": 0.7794996500015259, "rewards/rejected": -2.273847818374634, "sft_loss": 1.5764858722686768, "step": 4435 }, { "epoch": 2.376317109884596, "grad_norm": 5.391975532843501, "learning_rate": 1.257922951198591e-07, "logits/chosen": -0.36591753363609314, "logits/rejected": -0.14763380587100983, "logps/chosen": -1.5593401193618774, "logps/rejected": -2.1390182971954346, "loss": 0.6635, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5593401193618774, "rewards/margins": 0.5796783566474915, "rewards/rejected": -2.1390182971954346, "sft_loss": 1.6117656230926514, "step": 4440 }, { "epoch": 2.3789931426659976, "grad_norm": 6.870915257727768, "learning_rate": 1.24761195502719e-07, "logits/chosen": -0.33582472801208496, "logits/rejected": -0.15217497944831848, "logps/chosen": -1.5762287378311157, "logps/rejected": -2.1126303672790527, "loss": 0.6549, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5762287378311157, "rewards/margins": 0.5364019274711609, "rewards/rejected": -2.1126303672790527, "sft_loss": 1.6227718591690063, "step": 4445 }, { "epoch": 2.3816691754473993, "grad_norm": 3.9406011916032324, "learning_rate": 1.2373373645410573e-07, "logits/chosen": -0.2725081443786621, "logits/rejected": -0.13785557448863983, "logps/chosen": -1.7564218044281006, "logps/rejected": -2.3719990253448486, "loss": 0.6558, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7564218044281006, "rewards/margins": 0.6155771017074585, "rewards/rejected": -2.3719990253448486, "sft_loss": 1.6522209644317627, "step": 4450 }, { "epoch": 2.384345208228801, "grad_norm": 2.935103695680747, "learning_rate": 1.2270992794243175e-07, "logits/chosen": -0.36804503202438354, "logits/rejected": -0.2704111337661743, "logps/chosen": -1.593177080154419, "logps/rejected": -2.316232442855835, "loss": 0.6544, "rewards/accuracies": 0.71875, "rewards/chosen": -1.593177080154419, "rewards/margins": 0.7230552434921265, "rewards/rejected": -2.316232442855835, "sft_loss": 1.633183240890503, "step": 4455 }, { "epoch": 2.3870212410102023, "grad_norm": 3.7772885916093477, "learning_rate": 1.2168977990069147e-07, "logits/chosen": -0.3856295943260193, "logits/rejected": -0.16961362957954407, "logps/chosen": -1.5195658206939697, "logps/rejected": -2.4012515544891357, "loss": 0.6425, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5195658206939697, "rewards/margins": 0.8816859126091003, "rewards/rejected": -2.4012515544891357, "sft_loss": 1.5922685861587524, "step": 4460 }, { "epoch": 2.389697273791604, "grad_norm": 2.457379225337726, "learning_rate": 1.206733022263659e-07, "logits/chosen": -0.38077300786972046, "logits/rejected": -0.2020655870437622, "logps/chosen": -1.6155993938446045, "logps/rejected": -2.188157796859741, "loss": 0.6811, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6155993938446045, "rewards/margins": 0.5725584626197815, "rewards/rejected": -2.188157796859741, "sft_loss": 1.614628553390503, "step": 4465 }, { "epoch": 2.3923733065730053, "grad_norm": 4.384140130616499, "learning_rate": 1.1966050478132572e-07, "logits/chosen": -0.2263101041316986, "logits/rejected": -0.14471349120140076, "logps/chosen": -1.472354769706726, "logps/rejected": -2.0752902030944824, "loss": 0.6482, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.472354769706726, "rewards/margins": 0.6029354929924011, "rewards/rejected": -2.0752902030944824, "sft_loss": 1.5009453296661377, "step": 4470 }, { "epoch": 2.395049339354407, "grad_norm": 2.914679887842728, "learning_rate": 1.1865139739173635e-07, "logits/chosen": -0.3159586489200592, "logits/rejected": -0.10197826474905014, "logps/chosen": -1.5932174921035767, "logps/rejected": -2.2627813816070557, "loss": 0.6444, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5932174921035767, "rewards/margins": 0.669563889503479, "rewards/rejected": -2.2627813816070557, "sft_loss": 1.5354959964752197, "step": 4475 }, { "epoch": 2.3977253721358087, "grad_norm": 2.401984695844185, "learning_rate": 1.1764598984796187e-07, "logits/chosen": -0.30270200967788696, "logits/rejected": -0.21523793041706085, "logps/chosen": -1.456754446029663, "logps/rejected": -2.046170473098755, "loss": 0.6512, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.456754446029663, "rewards/margins": 0.5894161462783813, "rewards/rejected": -2.046170473098755, "sft_loss": 1.4913976192474365, "step": 4480 }, { "epoch": 2.4004014049172104, "grad_norm": 6.470755790004308, "learning_rate": 1.1664429190447095e-07, "logits/chosen": -0.30092233419418335, "logits/rejected": -0.20879137516021729, "logps/chosen": -1.5819059610366821, "logps/rejected": -2.2939720153808594, "loss": 0.6551, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5819059610366821, "rewards/margins": 0.7120662331581116, "rewards/rejected": -2.2939720153808594, "sft_loss": 1.5738495588302612, "step": 4485 }, { "epoch": 2.4030774376986117, "grad_norm": 3.2800575699036085, "learning_rate": 1.1564631327974122e-07, "logits/chosen": -0.35377007722854614, "logits/rejected": -0.14289887249469757, "logps/chosen": -1.5346969366073608, "logps/rejected": -2.4079833030700684, "loss": 0.6284, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5346969366073608, "rewards/margins": 0.8732865452766418, "rewards/rejected": -2.4079833030700684, "sft_loss": 1.5910435914993286, "step": 4490 }, { "epoch": 2.4057534704800134, "grad_norm": 4.253805285650443, "learning_rate": 1.1465206365616587e-07, "logits/chosen": -0.39720767736434937, "logits/rejected": -0.1897422969341278, "logps/chosen": -1.523134469985962, "logps/rejected": -2.223334312438965, "loss": 0.6535, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.523134469985962, "rewards/margins": 0.7001999020576477, "rewards/rejected": -2.223334312438965, "sft_loss": 1.5863049030303955, "step": 4495 }, { "epoch": 2.408429503261415, "grad_norm": 3.4154369606317903, "learning_rate": 1.1366155267995887e-07, "logits/chosen": -0.21695657074451447, "logits/rejected": -0.21344704926013947, "logps/chosen": -1.5514976978302002, "logps/rejected": -2.1682536602020264, "loss": 0.6479, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5514976978302002, "rewards/margins": 0.6167561411857605, "rewards/rejected": -2.1682536602020264, "sft_loss": 1.590196132659912, "step": 4500 }, { "epoch": 2.4111055360428164, "grad_norm": 5.205115407141752, "learning_rate": 1.1267478996106228e-07, "logits/chosen": -0.35822954773902893, "logits/rejected": -0.21628502011299133, "logps/chosen": -1.5886913537979126, "logps/rejected": -2.2281494140625, "loss": 0.6517, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5886913537979126, "rewards/margins": 0.6394580006599426, "rewards/rejected": -2.2281494140625, "sft_loss": 1.6002895832061768, "step": 4505 }, { "epoch": 2.413781568824218, "grad_norm": 7.7046151520557355, "learning_rate": 1.116917850730521e-07, "logits/chosen": -0.37813323736190796, "logits/rejected": -0.251869797706604, "logps/chosen": -1.553252935409546, "logps/rejected": -2.095944404602051, "loss": 0.654, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.553252935409546, "rewards/margins": 0.5426915287971497, "rewards/rejected": -2.095944404602051, "sft_loss": 1.5301573276519775, "step": 4510 }, { "epoch": 2.41645760160562, "grad_norm": 3.344080421762281, "learning_rate": 1.1071254755304637e-07, "logits/chosen": -0.34831443428993225, "logits/rejected": -0.2738765478134155, "logps/chosen": -1.5086171627044678, "logps/rejected": -2.204338788986206, "loss": 0.6431, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5086171627044678, "rewards/margins": 0.6957216858863831, "rewards/rejected": -2.204338788986206, "sft_loss": 1.5479824542999268, "step": 4515 }, { "epoch": 2.419133634387021, "grad_norm": 3.905727187770449, "learning_rate": 1.0973708690161143e-07, "logits/chosen": -0.2951086461544037, "logits/rejected": -0.2081606686115265, "logps/chosen": -1.4997297525405884, "logps/rejected": -2.3100061416625977, "loss": 0.6229, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.4997297525405884, "rewards/margins": 0.8102763891220093, "rewards/rejected": -2.3100061416625977, "sft_loss": 1.5412787199020386, "step": 4520 }, { "epoch": 2.421809667168423, "grad_norm": 3.289702012753996, "learning_rate": 1.0876541258267119e-07, "logits/chosen": -0.36814767122268677, "logits/rejected": -0.1877470761537552, "logps/chosen": -1.6343269348144531, "logps/rejected": -2.331125497817993, "loss": 0.6513, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6343269348144531, "rewards/margins": 0.6967986226081848, "rewards/rejected": -2.331125497817993, "sft_loss": 1.6918836832046509, "step": 4525 }, { "epoch": 2.4244856999498245, "grad_norm": 3.1599141021190102, "learning_rate": 1.0779753402341379e-07, "logits/chosen": -0.3575204014778137, "logits/rejected": -0.2661263942718506, "logps/chosen": -1.6031525135040283, "logps/rejected": -2.1195476055145264, "loss": 0.6732, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6031525135040283, "rewards/margins": 0.5163952708244324, "rewards/rejected": -2.1195476055145264, "sft_loss": 1.5328352451324463, "step": 4530 }, { "epoch": 2.427161732731226, "grad_norm": 5.177464157174504, "learning_rate": 1.0683346061420157e-07, "logits/chosen": -0.19036388397216797, "logits/rejected": -0.07992489635944366, "logps/chosen": -1.512872338294983, "logps/rejected": -2.2547543048858643, "loss": 0.6504, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.512872338294983, "rewards/margins": 0.7418821454048157, "rewards/rejected": -2.2547543048858643, "sft_loss": 1.5622663497924805, "step": 4535 }, { "epoch": 2.4298377655126275, "grad_norm": 3.7895314021271957, "learning_rate": 1.0587320170847874e-07, "logits/chosen": -0.2304213047027588, "logits/rejected": -0.15390145778656006, "logps/chosen": -1.4937970638275146, "logps/rejected": -2.169123649597168, "loss": 0.6532, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4937970638275146, "rewards/margins": 0.6753265261650085, "rewards/rejected": -2.169123649597168, "sft_loss": 1.5163295269012451, "step": 4540 }, { "epoch": 2.4325137982940293, "grad_norm": 3.9449040606215453, "learning_rate": 1.0491676662268156e-07, "logits/chosen": -0.2762044072151184, "logits/rejected": -0.1484527587890625, "logps/chosen": -1.5513827800750732, "logps/rejected": -2.183565855026245, "loss": 0.6705, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5513827800750732, "rewards/margins": 0.6321829557418823, "rewards/rejected": -2.183565855026245, "sft_loss": 1.5500465631484985, "step": 4545 }, { "epoch": 2.4351898310754305, "grad_norm": 5.804362394565838, "learning_rate": 1.0396416463614732e-07, "logits/chosen": -0.3430117964744568, "logits/rejected": -0.23104672133922577, "logps/chosen": -1.4433432817459106, "logps/rejected": -2.170539379119873, "loss": 0.6434, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4433432817459106, "rewards/margins": 0.727196455001831, "rewards/rejected": -2.170539379119873, "sft_loss": 1.4824066162109375, "step": 4550 }, { "epoch": 2.4378658638568322, "grad_norm": 2.827950745351294, "learning_rate": 1.0301540499102479e-07, "logits/chosen": -0.31212273240089417, "logits/rejected": -0.24187254905700684, "logps/chosen": -1.669054627418518, "logps/rejected": -2.145447254180908, "loss": 0.663, "rewards/accuracies": 0.625, "rewards/chosen": -1.669054627418518, "rewards/margins": 0.4763926565647125, "rewards/rejected": -2.145447254180908, "sft_loss": 1.6982654333114624, "step": 4555 }, { "epoch": 2.440541896638234, "grad_norm": 4.434657094438286, "learning_rate": 1.0207049689218405e-07, "logits/chosen": -0.3376582860946655, "logits/rejected": -0.16643747687339783, "logps/chosen": -1.5501635074615479, "logps/rejected": -2.198604106903076, "loss": 0.646, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5501635074615479, "rewards/margins": 0.6484406590461731, "rewards/rejected": -2.198604106903076, "sft_loss": 1.4974991083145142, "step": 4560 }, { "epoch": 2.4432179294196352, "grad_norm": 6.878050594650564, "learning_rate": 1.0112944950712782e-07, "logits/chosen": -0.2798961102962494, "logits/rejected": -0.18283778429031372, "logps/chosen": -1.583674669265747, "logps/rejected": -2.2180368900299072, "loss": 0.6497, "rewards/accuracies": 0.71875, "rewards/chosen": -1.583674669265747, "rewards/margins": 0.6343621015548706, "rewards/rejected": -2.2180368900299072, "sft_loss": 1.567760944366455, "step": 4565 }, { "epoch": 2.445893962201037, "grad_norm": 3.5233828625827037, "learning_rate": 1.0019227196590174e-07, "logits/chosen": -0.22706007957458496, "logits/rejected": -0.0819624587893486, "logps/chosen": -1.5315229892730713, "logps/rejected": -2.269984245300293, "loss": 0.6477, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5315229892730713, "rewards/margins": 0.7384613752365112, "rewards/rejected": -2.269984245300293, "sft_loss": 1.5649573802947998, "step": 4570 }, { "epoch": 2.4485699949824387, "grad_norm": 3.1923233685494954, "learning_rate": 9.925897336100664e-08, "logits/chosen": -0.2299167662858963, "logits/rejected": -0.17413330078125, "logps/chosen": -1.5149710178375244, "logps/rejected": -2.1643295288085938, "loss": 0.6469, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5149710178375244, "rewards/margins": 0.6493586301803589, "rewards/rejected": -2.1643295288085938, "sft_loss": 1.5522096157073975, "step": 4575 }, { "epoch": 2.45124602776384, "grad_norm": 3.8568871271134166, "learning_rate": 9.832956274730946e-08, "logits/chosen": -0.2824332118034363, "logits/rejected": -0.2337716519832611, "logps/chosen": -1.5122730731964111, "logps/rejected": -2.1140122413635254, "loss": 0.6332, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5122730731964111, "rewards/margins": 0.6017390489578247, "rewards/rejected": -2.1140122413635254, "sft_loss": 1.5447158813476562, "step": 4580 }, { "epoch": 2.4539220605452416, "grad_norm": 3.2764156874574533, "learning_rate": 9.740404914195633e-08, "logits/chosen": -0.23593628406524658, "logits/rejected": -0.09226083010435104, "logps/chosen": -1.5803115367889404, "logps/rejected": -2.2210307121276855, "loss": 0.6574, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5803115367889404, "rewards/margins": 0.6407192945480347, "rewards/rejected": -2.2210307121276855, "sft_loss": 1.6641523838043213, "step": 4585 }, { "epoch": 2.4565980933266434, "grad_norm": 5.246348662800345, "learning_rate": 9.648244152428392e-08, "logits/chosen": -0.35039710998535156, "logits/rejected": -0.1667160540819168, "logps/chosen": -1.4865039587020874, "logps/rejected": -2.0124218463897705, "loss": 0.6528, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4865039587020874, "rewards/margins": 0.5259180665016174, "rewards/rejected": -2.0124218463897705, "sft_loss": 1.5617542266845703, "step": 4590 }, { "epoch": 2.4592741261080446, "grad_norm": 3.664602000177623, "learning_rate": 9.556474883573379e-08, "logits/chosen": -0.3106224834918976, "logits/rejected": -0.190143883228302, "logps/chosen": -1.510183572769165, "logps/rejected": -2.31964111328125, "loss": 0.6512, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.510183572769165, "rewards/margins": 0.8094574809074402, "rewards/rejected": -2.31964111328125, "sft_loss": 1.547892451286316, "step": 4595 }, { "epoch": 2.4619501588894463, "grad_norm": 4.362679956691847, "learning_rate": 9.465097997976412e-08, "logits/chosen": -0.3063965439796448, "logits/rejected": -0.0520622618496418, "logps/chosen": -1.5510295629501343, "logps/rejected": -2.333533763885498, "loss": 0.6483, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5510295629501343, "rewards/margins": 0.7825039029121399, "rewards/rejected": -2.333533763885498, "sft_loss": 1.6380821466445923, "step": 4600 }, { "epoch": 2.464626191670848, "grad_norm": 4.317027417531272, "learning_rate": 9.374114382176457e-08, "logits/chosen": -0.2685374617576599, "logits/rejected": -0.127393901348114, "logps/chosen": -1.544651746749878, "logps/rejected": -2.279839277267456, "loss": 0.6565, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.544651746749878, "rewards/margins": 0.7351876497268677, "rewards/rejected": -2.279839277267456, "sft_loss": 1.5730217695236206, "step": 4605 }, { "epoch": 2.46730222445225, "grad_norm": 4.669673690973595, "learning_rate": 9.283524918896945e-08, "logits/chosen": -0.3165430426597595, "logits/rejected": -0.1727561503648758, "logps/chosen": -1.566685676574707, "logps/rejected": -2.318009376525879, "loss": 0.6452, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.566685676574707, "rewards/margins": 0.7513237595558167, "rewards/rejected": -2.318009376525879, "sft_loss": 1.5769126415252686, "step": 4610 }, { "epoch": 2.469978257233651, "grad_norm": 4.072456844305052, "learning_rate": 9.193330487037232e-08, "logits/chosen": -0.25783801078796387, "logits/rejected": -0.12196172773838043, "logps/chosen": -1.5721653699874878, "logps/rejected": -2.546551465988159, "loss": 0.6502, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5721653699874878, "rewards/margins": 0.9743859171867371, "rewards/rejected": -2.546551465988159, "sft_loss": 1.6333370208740234, "step": 4615 }, { "epoch": 2.4726542900150528, "grad_norm": 4.227785346418531, "learning_rate": 9.103531961664118e-08, "logits/chosen": -0.27418097853660583, "logits/rejected": -0.07549251616001129, "logps/chosen": -1.4618358612060547, "logps/rejected": -2.13948130607605, "loss": 0.6386, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4618358612060547, "rewards/margins": 0.6776456832885742, "rewards/rejected": -2.13948130607605, "sft_loss": 1.5798118114471436, "step": 4620 }, { "epoch": 2.475330322796454, "grad_norm": 2.4850185145803105, "learning_rate": 9.014130214003269e-08, "logits/chosen": -0.35881730914115906, "logits/rejected": -0.30776986479759216, "logps/chosen": -1.5527098178863525, "logps/rejected": -2.214914321899414, "loss": 0.6546, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5527098178863525, "rewards/margins": 0.6622045636177063, "rewards/rejected": -2.214914321899414, "sft_loss": 1.5944491624832153, "step": 4625 }, { "epoch": 2.4780063555778558, "grad_norm": 19.21874409478125, "learning_rate": 8.925126111430848e-08, "logits/chosen": -0.2559207081794739, "logits/rejected": -0.17347490787506104, "logps/chosen": -1.482177495956421, "logps/rejected": -2.1850438117980957, "loss": 0.6479, "rewards/accuracies": 0.6875, "rewards/chosen": -1.482177495956421, "rewards/margins": 0.7028661966323853, "rewards/rejected": -2.1850438117980957, "sft_loss": 1.5517394542694092, "step": 4630 }, { "epoch": 2.4806823883592575, "grad_norm": 7.228158952382859, "learning_rate": 8.83652051746504e-08, "logits/chosen": -0.16527710855007172, "logits/rejected": -0.008183039724826813, "logps/chosen": -1.592286467552185, "logps/rejected": -2.4563093185424805, "loss": 0.6491, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.592286467552185, "rewards/margins": 0.8640230298042297, "rewards/rejected": -2.4563093185424805, "sft_loss": 1.557730793952942, "step": 4635 }, { "epoch": 2.483358421140659, "grad_norm": 3.149954021366626, "learning_rate": 8.748314291757696e-08, "logits/chosen": -0.19139009714126587, "logits/rejected": -0.07891600579023361, "logps/chosen": -1.568982481956482, "logps/rejected": -2.2609806060791016, "loss": 0.6486, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.568982481956482, "rewards/margins": 0.6919980049133301, "rewards/rejected": -2.2609806060791016, "sft_loss": 1.5659347772598267, "step": 4640 }, { "epoch": 2.4860344539220605, "grad_norm": 6.69841358488078, "learning_rate": 8.660508290086032e-08, "logits/chosen": -0.2480025738477707, "logits/rejected": -0.10911725461483002, "logps/chosen": -1.4874827861785889, "logps/rejected": -2.092480182647705, "loss": 0.6445, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4874827861785889, "rewards/margins": 0.6049972772598267, "rewards/rejected": -2.092480182647705, "sft_loss": 1.551493763923645, "step": 4645 }, { "epoch": 2.488710486703462, "grad_norm": 3.597871974349541, "learning_rate": 8.573103364344231e-08, "logits/chosen": -0.2941132187843323, "logits/rejected": -0.055225301533937454, "logps/chosen": -1.5422605276107788, "logps/rejected": -2.309134006500244, "loss": 0.6447, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5422605276107788, "rewards/margins": 0.7668734788894653, "rewards/rejected": -2.309134006500244, "sft_loss": 1.5496307611465454, "step": 4650 }, { "epoch": 2.4913865194848634, "grad_norm": 2.8976921514005363, "learning_rate": 8.486100362535292e-08, "logits/chosen": -0.30288466811180115, "logits/rejected": -0.15188676118850708, "logps/chosen": -1.5327774286270142, "logps/rejected": -2.0788652896881104, "loss": 0.6624, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5327774286270142, "rewards/margins": 0.5460880398750305, "rewards/rejected": -2.0788652896881104, "sft_loss": 1.603165626525879, "step": 4655 }, { "epoch": 2.494062552266265, "grad_norm": 3.0556336510613438, "learning_rate": 8.399500128762693e-08, "logits/chosen": -0.3329222500324249, "logits/rejected": -0.20975284278392792, "logps/chosen": -1.6097650527954102, "logps/rejected": -2.1826891899108887, "loss": 0.652, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6097650527954102, "rewards/margins": 0.5729240775108337, "rewards/rejected": -2.1826891899108887, "sft_loss": 1.6203798055648804, "step": 4660 }, { "epoch": 2.496738585047667, "grad_norm": 3.6312464079223603, "learning_rate": 8.313303503222313e-08, "logits/chosen": -0.2509586215019226, "logits/rejected": -0.17944104969501495, "logps/chosen": -1.7537086009979248, "logps/rejected": -2.327963352203369, "loss": 0.6712, "rewards/accuracies": 0.625, "rewards/chosen": -1.7537086009979248, "rewards/margins": 0.5742546319961548, "rewards/rejected": -2.327963352203369, "sft_loss": 1.6698287725448608, "step": 4665 }, { "epoch": 2.4994146178290686, "grad_norm": 3.6172161673241825, "learning_rate": 8.227511322194164e-08, "logits/chosen": -0.3168428838253021, "logits/rejected": -0.19337889552116394, "logps/chosen": -1.560003399848938, "logps/rejected": -2.045964241027832, "loss": 0.6544, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.560003399848938, "rewards/margins": 0.48596081137657166, "rewards/rejected": -2.045964241027832, "sft_loss": 1.607225775718689, "step": 4670 }, { "epoch": 2.50209065061047, "grad_norm": 2.6556568106703256, "learning_rate": 8.142124418034385e-08, "logits/chosen": -0.2559506595134735, "logits/rejected": -0.11300382763147354, "logps/chosen": -1.5347226858139038, "logps/rejected": -2.135934591293335, "loss": 0.6385, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5347226858139038, "rewards/margins": 0.6012121438980103, "rewards/rejected": -2.135934591293335, "sft_loss": 1.4947535991668701, "step": 4675 }, { "epoch": 2.5047666833918716, "grad_norm": 3.3277907557458306, "learning_rate": 8.057143619167073e-08, "logits/chosen": -0.21113601326942444, "logits/rejected": -0.10068633407354355, "logps/chosen": -1.5916776657104492, "logps/rejected": -2.350574254989624, "loss": 0.6509, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5916776657104492, "rewards/margins": 0.7588964700698853, "rewards/rejected": -2.350574254989624, "sft_loss": 1.557253122329712, "step": 4680 }, { "epoch": 2.507442716173273, "grad_norm": 3.050861899566008, "learning_rate": 7.97256975007633e-08, "logits/chosen": -0.3704945743083954, "logits/rejected": -0.17528492212295532, "logps/chosen": -1.4895609617233276, "logps/rejected": -2.116403818130493, "loss": 0.6472, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4895609617233276, "rewards/margins": 0.6268427968025208, "rewards/rejected": -2.116403818130493, "sft_loss": 1.5278401374816895, "step": 4685 }, { "epoch": 2.5101187489546746, "grad_norm": 4.411901521813038, "learning_rate": 7.888403631298186e-08, "logits/chosen": -0.25644347071647644, "logits/rejected": -0.20167987048625946, "logps/chosen": -1.5314282178878784, "logps/rejected": -2.092921733856201, "loss": 0.6582, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5314282178878784, "rewards/margins": 0.5614933967590332, "rewards/rejected": -2.092921733856201, "sft_loss": 1.516566276550293, "step": 4690 }, { "epoch": 2.5127947817360763, "grad_norm": 2.577157103525146, "learning_rate": 7.804646079412719e-08, "logits/chosen": -0.256502628326416, "logits/rejected": -0.0664641335606575, "logps/chosen": -1.5892064571380615, "logps/rejected": -2.223823070526123, "loss": 0.643, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5892064571380615, "rewards/margins": 0.6346170902252197, "rewards/rejected": -2.223823070526123, "sft_loss": 1.6083158254623413, "step": 4695 }, { "epoch": 2.515470814517478, "grad_norm": 2.958201859488952, "learning_rate": 7.72129790703604e-08, "logits/chosen": -0.36429911851882935, "logits/rejected": -0.24181333184242249, "logps/chosen": -1.4857892990112305, "logps/rejected": -2.1328506469726562, "loss": 0.6482, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4857892990112305, "rewards/margins": 0.6470614075660706, "rewards/rejected": -2.1328506469726562, "sft_loss": 1.5447801351547241, "step": 4700 }, { "epoch": 2.5181468472988793, "grad_norm": 4.75676160474035, "learning_rate": 7.638359922812504e-08, "logits/chosen": -0.23873957991600037, "logits/rejected": -0.19696664810180664, "logps/chosen": -1.6221033334732056, "logps/rejected": -2.195286273956299, "loss": 0.6488, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6221033334732056, "rewards/margins": 0.5731828808784485, "rewards/rejected": -2.195286273956299, "sft_loss": 1.5847132205963135, "step": 4705 }, { "epoch": 2.520822880080281, "grad_norm": 10.625975645164717, "learning_rate": 7.555832931406774e-08, "logits/chosen": -0.3477099537849426, "logits/rejected": -0.20050649344921112, "logps/chosen": -1.6094754934310913, "logps/rejected": -2.2957520484924316, "loss": 0.6557, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6094754934310913, "rewards/margins": 0.6862764954566956, "rewards/rejected": -2.2957520484924316, "sft_loss": 1.6209688186645508, "step": 4710 }, { "epoch": 2.5234989128616827, "grad_norm": 2.7541151173206897, "learning_rate": 7.47371773349611e-08, "logits/chosen": -0.294449120759964, "logits/rejected": -0.25376224517822266, "logps/chosen": -1.6765820980072021, "logps/rejected": -2.2641329765319824, "loss": 0.6513, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6765820980072021, "rewards/margins": 0.5875504612922668, "rewards/rejected": -2.2641329765319824, "sft_loss": 1.7093673944473267, "step": 4715 }, { "epoch": 2.526174945643084, "grad_norm": 5.381113186903328, "learning_rate": 7.392015125762496e-08, "logits/chosen": -0.2696743607521057, "logits/rejected": -0.16265803575515747, "logps/chosen": -1.4993641376495361, "logps/rejected": -2.1580071449279785, "loss": 0.641, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4993641376495361, "rewards/margins": 0.6586429476737976, "rewards/rejected": -2.1580071449279785, "sft_loss": 1.501667857170105, "step": 4720 }, { "epoch": 2.5288509784244857, "grad_norm": 4.363832899390693, "learning_rate": 7.310725900885018e-08, "logits/chosen": -0.3282429873943329, "logits/rejected": -0.2649852931499481, "logps/chosen": -1.5814213752746582, "logps/rejected": -2.1092419624328613, "loss": 0.656, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5814213752746582, "rewards/margins": 0.527820348739624, "rewards/rejected": -2.1092419624328613, "sft_loss": 1.6161930561065674, "step": 4725 }, { "epoch": 2.5315270112058874, "grad_norm": 5.399454559214379, "learning_rate": 7.229850847532076e-08, "logits/chosen": -0.27031904458999634, "logits/rejected": -0.14918789267539978, "logps/chosen": -1.4463913440704346, "logps/rejected": -2.188873529434204, "loss": 0.6323, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4463913440704346, "rewards/margins": 0.7424818277359009, "rewards/rejected": -2.188873529434204, "sft_loss": 1.5468406677246094, "step": 4730 }, { "epoch": 2.5342030439872887, "grad_norm": 3.542438782265644, "learning_rate": 7.149390750353779e-08, "logits/chosen": -0.24759738147258759, "logits/rejected": -0.2921431064605713, "logps/chosen": -1.5594487190246582, "logps/rejected": -2.0642504692077637, "loss": 0.6454, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5594487190246582, "rewards/margins": 0.504801869392395, "rewards/rejected": -2.0642504692077637, "sft_loss": 1.593975305557251, "step": 4735 }, { "epoch": 2.5368790767686904, "grad_norm": 4.184646108234178, "learning_rate": 7.069346389974374e-08, "logits/chosen": -0.3225509226322174, "logits/rejected": -0.180209219455719, "logps/chosen": -1.5613961219787598, "logps/rejected": -2.246628522872925, "loss": 0.6549, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5613961219787598, "rewards/margins": 0.6852323412895203, "rewards/rejected": -2.246628522872925, "sft_loss": 1.5798487663269043, "step": 4740 }, { "epoch": 2.539555109550092, "grad_norm": 3.6110635278658036, "learning_rate": 6.989718542984563e-08, "logits/chosen": -0.2740827202796936, "logits/rejected": -0.22729679942131042, "logps/chosen": -1.5970016717910767, "logps/rejected": -2.1766319274902344, "loss": 0.6458, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5970016717910767, "rewards/margins": 0.5796300172805786, "rewards/rejected": -2.1766319274902344, "sft_loss": 1.6187547445297241, "step": 4745 }, { "epoch": 2.5422311423314934, "grad_norm": 3.311222532309807, "learning_rate": 6.9105079819341e-08, "logits/chosen": -0.213700532913208, "logits/rejected": -0.0047538997605443, "logps/chosen": -1.5098040103912354, "logps/rejected": -2.4965951442718506, "loss": 0.6444, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5098040103912354, "rewards/margins": 0.9867914319038391, "rewards/rejected": -2.4965951442718506, "sft_loss": 1.5469787120819092, "step": 4750 }, { "epoch": 2.544907175112895, "grad_norm": 2.110121081515731, "learning_rate": 6.831715475324163e-08, "logits/chosen": -0.31291213631629944, "logits/rejected": -0.1478758156299591, "logps/chosen": -1.4469785690307617, "logps/rejected": -2.3485617637634277, "loss": 0.63, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4469785690307617, "rewards/margins": 0.9015833139419556, "rewards/rejected": -2.3485617637634277, "sft_loss": 1.5024610757827759, "step": 4755 }, { "epoch": 2.547583207894297, "grad_norm": 5.092788021650294, "learning_rate": 6.753341787600026e-08, "logits/chosen": -0.3487478792667389, "logits/rejected": -0.2094249725341797, "logps/chosen": -1.4646135568618774, "logps/rejected": -2.1586451530456543, "loss": 0.6525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4646135568618774, "rewards/margins": 0.6940317153930664, "rewards/rejected": -2.1586451530456543, "sft_loss": 1.5558288097381592, "step": 4760 }, { "epoch": 2.5502592406756985, "grad_norm": 3.2332751465828644, "learning_rate": 6.67538767914353e-08, "logits/chosen": -0.3419414460659027, "logits/rejected": -0.15927313268184662, "logps/chosen": -1.6466041803359985, "logps/rejected": -2.3158504962921143, "loss": 0.6687, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6466041803359985, "rewards/margins": 0.6692460775375366, "rewards/rejected": -2.3158504962921143, "sft_loss": 1.6631864309310913, "step": 4765 }, { "epoch": 2.5529352734571, "grad_norm": 3.737944816854884, "learning_rate": 6.597853906265793e-08, "logits/chosen": -0.28476816415786743, "logits/rejected": -0.17611289024353027, "logps/chosen": -1.5729347467422485, "logps/rejected": -2.3755788803100586, "loss": 0.6585, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5729347467422485, "rewards/margins": 0.8026441335678101, "rewards/rejected": -2.3755788803100586, "sft_loss": 1.5690858364105225, "step": 4770 }, { "epoch": 2.5556113062385015, "grad_norm": 2.389334041003902, "learning_rate": 6.5207412211998e-08, "logits/chosen": -0.17536935210227966, "logits/rejected": -0.07199620455503464, "logps/chosen": -1.5184999704360962, "logps/rejected": -2.2542495727539062, "loss": 0.6439, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5184999704360962, "rewards/margins": 0.7357496023178101, "rewards/rejected": -2.2542495727539062, "sft_loss": 1.4695793390274048, "step": 4775 }, { "epoch": 2.558287339019903, "grad_norm": 2.9252819329362207, "learning_rate": 6.444050372093186e-08, "logits/chosen": -0.26403623819351196, "logits/rejected": -0.163148432970047, "logps/chosen": -1.5481630563735962, "logps/rejected": -2.1773335933685303, "loss": 0.6527, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5481630563735962, "rewards/margins": 0.6291707754135132, "rewards/rejected": -2.1773335933685303, "sft_loss": 1.5882800817489624, "step": 4780 }, { "epoch": 2.5609633718013045, "grad_norm": 5.040500759338175, "learning_rate": 6.367782103000873e-08, "logits/chosen": -0.2527233958244324, "logits/rejected": -0.19683003425598145, "logps/chosen": -1.5815021991729736, "logps/rejected": -2.067279815673828, "loss": 0.668, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5815021991729736, "rewards/margins": 0.4857775568962097, "rewards/rejected": -2.067279815673828, "sft_loss": 1.5577093362808228, "step": 4785 }, { "epoch": 2.5636394045827062, "grad_norm": 3.006823313342819, "learning_rate": 6.29193715387798e-08, "logits/chosen": -0.34018540382385254, "logits/rejected": -0.20170053839683533, "logps/chosen": -1.576545000076294, "logps/rejected": -2.3532755374908447, "loss": 0.6513, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.576545000076294, "rewards/margins": 0.776730477809906, "rewards/rejected": -2.3532755374908447, "sft_loss": 1.5583680868148804, "step": 4790 }, { "epoch": 2.566315437364108, "grad_norm": 4.808664555754724, "learning_rate": 6.216516260572502e-08, "logits/chosen": -0.231277734041214, "logits/rejected": -0.15128371119499207, "logps/chosen": -1.6274127960205078, "logps/rejected": -2.1662192344665527, "loss": 0.6609, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6274127960205078, "rewards/margins": 0.5388065576553345, "rewards/rejected": -2.1662192344665527, "sft_loss": 1.6247822046279907, "step": 4795 }, { "epoch": 2.568991470145509, "grad_norm": 2.4118704077225996, "learning_rate": 6.141520154818297e-08, "logits/chosen": -0.2833016514778137, "logits/rejected": -0.17131468653678894, "logps/chosen": -1.4958717823028564, "logps/rejected": -2.128330707550049, "loss": 0.6356, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4958717823028564, "rewards/margins": 0.6324589848518372, "rewards/rejected": -2.128330707550049, "sft_loss": 1.573540449142456, "step": 4800 }, { "epoch": 2.568991470145509, "eval_logits/chosen": 0.022585922852158546, "eval_logits/rejected": 0.11062997579574585, "eval_logps/chosen": -1.59637451171875, "eval_logps/rejected": -2.2215960025787354, "eval_loss": 0.6686033606529236, "eval_rewards/accuracies": 0.6483679413795471, "eval_rewards/chosen": -1.59637451171875, "eval_rewards/margins": 0.6252216696739197, "eval_rewards/rejected": -2.2215960025787354, "eval_runtime": 44.1182, "eval_samples_per_second": 30.486, "eval_sft_loss": 1.5864040851593018, "eval_steps_per_second": 7.639, "step": 4800 }, { "epoch": 2.571667502926911, "grad_norm": 4.713763181183661, "learning_rate": 6.066949564227897e-08, "logits/chosen": -0.33478087186813354, "logits/rejected": -0.20814982056617737, "logps/chosen": -1.5814807415008545, "logps/rejected": -2.2600340843200684, "loss": 0.6477, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5814807415008545, "rewards/margins": 0.6785534620285034, "rewards/rejected": -2.2600340843200684, "sft_loss": 1.5960379838943481, "step": 4805 }, { "epoch": 2.574343535708312, "grad_norm": 3.0658788495930707, "learning_rate": 5.992805212285523e-08, "logits/chosen": -0.24046310782432556, "logits/rejected": -0.1067015677690506, "logps/chosen": -1.618259072303772, "logps/rejected": -2.225800037384033, "loss": 0.6609, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.618259072303772, "rewards/margins": 0.6075407266616821, "rewards/rejected": -2.225800037384033, "sft_loss": 1.616916298866272, "step": 4810 }, { "epoch": 2.577019568489714, "grad_norm": 7.482433847006735, "learning_rate": 5.9190878183399684e-08, "logits/chosen": -0.25230371952056885, "logits/rejected": -0.123440682888031, "logps/chosen": -1.4187307357788086, "logps/rejected": -2.219444751739502, "loss": 0.6197, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4187307357788086, "rewards/margins": 0.800714373588562, "rewards/rejected": -2.219444751739502, "sft_loss": 1.478771448135376, "step": 4815 }, { "epoch": 2.5796956012711156, "grad_norm": 9.582868855705456, "learning_rate": 5.845798097597748e-08, "logits/chosen": -0.22999422252178192, "logits/rejected": -0.15210556983947754, "logps/chosen": -1.6584303379058838, "logps/rejected": -2.1190333366394043, "loss": 0.6575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6584303379058838, "rewards/margins": 0.46060290932655334, "rewards/rejected": -2.1190333366394043, "sft_loss": 1.6076551675796509, "step": 4820 }, { "epoch": 2.5823716340525174, "grad_norm": 8.24557548071507, "learning_rate": 5.772936761116026e-08, "logits/chosen": -0.2544562518596649, "logits/rejected": -0.12751419842243195, "logps/chosen": -1.5638656616210938, "logps/rejected": -2.0885910987854004, "loss": 0.6527, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5638656616210938, "rewards/margins": 0.524725615978241, "rewards/rejected": -2.0885910987854004, "sft_loss": 1.5143884420394897, "step": 4825 }, { "epoch": 2.5850476668339186, "grad_norm": 2.765376813488303, "learning_rate": 5.700504515795829e-08, "logits/chosen": -0.31105369329452515, "logits/rejected": -0.1702210158109665, "logps/chosen": -1.6085840463638306, "logps/rejected": -2.1430881023406982, "loss": 0.658, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6085840463638306, "rewards/margins": 0.5345041751861572, "rewards/rejected": -2.1430881023406982, "sft_loss": 1.6004174947738647, "step": 4830 }, { "epoch": 2.5877236996153203, "grad_norm": 6.023616839080402, "learning_rate": 5.628502064375101e-08, "logits/chosen": -0.39601072669029236, "logits/rejected": -0.20983512699604034, "logps/chosen": -1.5357941389083862, "logps/rejected": -2.263760566711426, "loss": 0.6555, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5357941389083862, "rewards/margins": 0.7279661893844604, "rewards/rejected": -2.263760566711426, "sft_loss": 1.5607926845550537, "step": 4835 }, { "epoch": 2.5903997323967216, "grad_norm": 4.578710871693104, "learning_rate": 5.55693010542197e-08, "logits/chosen": -0.34746941924095154, "logits/rejected": -0.15091314911842346, "logps/chosen": -1.5270739793777466, "logps/rejected": -2.2461137771606445, "loss": 0.6446, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5270739793777466, "rewards/margins": 0.7190399169921875, "rewards/rejected": -2.2461137771606445, "sft_loss": 1.5467076301574707, "step": 4840 }, { "epoch": 2.5930757651781233, "grad_norm": 3.077774959202608, "learning_rate": 5.485789333327856e-08, "logits/chosen": -0.3103768229484558, "logits/rejected": -0.20531435310840607, "logps/chosen": -1.5353167057037354, "logps/rejected": -2.1882166862487793, "loss": 0.6514, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5353167057037354, "rewards/margins": 0.6528998613357544, "rewards/rejected": -2.1882166862487793, "sft_loss": 1.6176245212554932, "step": 4845 }, { "epoch": 2.595751797959525, "grad_norm": 5.342418138387475, "learning_rate": 5.4150804383008675e-08, "logits/chosen": -0.439974844455719, "logits/rejected": -0.2942073345184326, "logps/chosen": -1.5611355304718018, "logps/rejected": -2.311400890350342, "loss": 0.6555, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5611355304718018, "rewards/margins": 0.7502651214599609, "rewards/rejected": -2.311400890350342, "sft_loss": 1.564321517944336, "step": 4850 }, { "epoch": 2.5984278307409268, "grad_norm": 5.070791428130677, "learning_rate": 5.344804106359002e-08, "logits/chosen": -0.2757100760936737, "logits/rejected": -0.12967705726623535, "logps/chosen": -1.4793365001678467, "logps/rejected": -2.2874951362609863, "loss": 0.6413, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4793365001678467, "rewards/margins": 0.8081587553024292, "rewards/rejected": -2.2874951362609863, "sft_loss": 1.540814995765686, "step": 4855 }, { "epoch": 2.601103863522328, "grad_norm": 10.029441019921531, "learning_rate": 5.274961019323559e-08, "logits/chosen": -0.2699975371360779, "logits/rejected": -0.20369283854961395, "logps/chosen": -1.400427222251892, "logps/rejected": -2.2184746265411377, "loss": 0.6167, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.400427222251892, "rewards/margins": 0.8180474042892456, "rewards/rejected": -2.2184746265411377, "sft_loss": 1.4655218124389648, "step": 4860 }, { "epoch": 2.6037798963037297, "grad_norm": 2.957964263025511, "learning_rate": 5.205551854812451e-08, "logits/chosen": -0.3764112591743469, "logits/rejected": -0.2846323549747467, "logps/chosen": -1.508000373840332, "logps/rejected": -2.2652816772460938, "loss": 0.6397, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.508000373840332, "rewards/margins": 0.7572811841964722, "rewards/rejected": -2.2652816772460938, "sft_loss": 1.5318291187286377, "step": 4865 }, { "epoch": 2.606455929085131, "grad_norm": 2.8734137531975854, "learning_rate": 5.1365772862337177e-08, "logits/chosen": -0.2513507008552551, "logits/rejected": -0.12049970775842667, "logps/chosen": -1.5877680778503418, "logps/rejected": -2.249692678451538, "loss": 0.6478, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5877680778503418, "rewards/margins": 0.6619247198104858, "rewards/rejected": -2.249692678451538, "sft_loss": 1.567791223526001, "step": 4870 }, { "epoch": 2.6091319618665327, "grad_norm": 3.5603848715105464, "learning_rate": 5.068037982778905e-08, "logits/chosen": -0.11695779860019684, "logits/rejected": -0.03770308196544647, "logps/chosen": -1.512600064277649, "logps/rejected": -2.315598726272583, "loss": 0.6438, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.512600064277649, "rewards/margins": 0.8029987215995789, "rewards/rejected": -2.315598726272583, "sft_loss": 1.5583285093307495, "step": 4875 }, { "epoch": 2.6118079946479344, "grad_norm": 5.285715997291284, "learning_rate": 4.999934609416656e-08, "logits/chosen": -0.18301086127758026, "logits/rejected": -0.06374426931142807, "logps/chosen": -1.464905023574829, "logps/rejected": -2.3175206184387207, "loss": 0.6273, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.464905023574829, "rewards/margins": 0.8526156544685364, "rewards/rejected": -2.3175206184387207, "sft_loss": 1.5341923236846924, "step": 4880 }, { "epoch": 2.614484027429336, "grad_norm": 4.094377413513966, "learning_rate": 4.932267826886183e-08, "logits/chosen": -0.18760551512241364, "logits/rejected": -0.11009053885936737, "logps/chosen": -1.7325265407562256, "logps/rejected": -2.3709464073181152, "loss": 0.6565, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7325265407562256, "rewards/margins": 0.6384199261665344, "rewards/rejected": -2.3709464073181152, "sft_loss": 1.7828750610351562, "step": 4885 }, { "epoch": 2.6171600602107374, "grad_norm": 3.609069401041533, "learning_rate": 4.8650382916909206e-08, "logits/chosen": -0.3653804659843445, "logits/rejected": -0.2060328722000122, "logps/chosen": -1.5337392091751099, "logps/rejected": -2.2383885383605957, "loss": 0.6428, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5337392091751099, "rewards/margins": 0.7046495676040649, "rewards/rejected": -2.2383885383605957, "sft_loss": 1.5887818336486816, "step": 4890 }, { "epoch": 2.619836092992139, "grad_norm": 2.8786237616867143, "learning_rate": 4.7982466560920976e-08, "logits/chosen": -0.29676735401153564, "logits/rejected": -0.205804705619812, "logps/chosen": -1.6045278310775757, "logps/rejected": -2.143934965133667, "loss": 0.6551, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6045278310775757, "rewards/margins": 0.53940749168396, "rewards/rejected": -2.143934965133667, "sft_loss": 1.7019850015640259, "step": 4895 }, { "epoch": 2.622512125773541, "grad_norm": 2.655176451010275, "learning_rate": 4.7318935681024685e-08, "logits/chosen": -0.2333211600780487, "logits/rejected": -0.09696978330612183, "logps/chosen": -1.5167087316513062, "logps/rejected": -2.274289131164551, "loss": 0.641, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5167087316513062, "rewards/margins": 0.7575803995132446, "rewards/rejected": -2.274289131164551, "sft_loss": 1.5980942249298096, "step": 4900 }, { "epoch": 2.625188158554942, "grad_norm": 2.8007372584673638, "learning_rate": 4.6659796714799745e-08, "logits/chosen": -0.2577126622200012, "logits/rejected": -0.10144130885601044, "logps/chosen": -1.5728174448013306, "logps/rejected": -2.333540678024292, "loss": 0.6518, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5728174448013306, "rewards/margins": 0.7607229948043823, "rewards/rejected": -2.333540678024292, "sft_loss": 1.6706300973892212, "step": 4905 }, { "epoch": 2.627864191336344, "grad_norm": 3.31920416819512, "learning_rate": 4.60050560572155e-08, "logits/chosen": -0.30460885167121887, "logits/rejected": -0.31871408224105835, "logps/chosen": -1.6616413593292236, "logps/rejected": -2.6286277770996094, "loss": 0.6606, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6616413593292236, "rewards/margins": 0.9669864773750305, "rewards/rejected": -2.6286277770996094, "sft_loss": 1.6571128368377686, "step": 4910 }, { "epoch": 2.6305402241177456, "grad_norm": 5.451558355471818, "learning_rate": 4.535472006056834e-08, "logits/chosen": -0.21932478249073029, "logits/rejected": -0.10749228298664093, "logps/chosen": -1.426175594329834, "logps/rejected": -2.144705295562744, "loss": 0.6305, "rewards/accuracies": 0.71875, "rewards/chosen": -1.426175594329834, "rewards/margins": 0.7185295820236206, "rewards/rejected": -2.144705295562744, "sft_loss": 1.535812497138977, "step": 4915 }, { "epoch": 2.6332162568991473, "grad_norm": 3.13030607289851, "learning_rate": 4.470879503442132e-08, "logits/chosen": -0.23966650664806366, "logits/rejected": -0.1604257971048355, "logps/chosen": -1.6174768209457397, "logps/rejected": -2.1807265281677246, "loss": 0.6631, "rewards/accuracies": 0.625, "rewards/chosen": -1.6174768209457397, "rewards/margins": 0.5632495880126953, "rewards/rejected": -2.1807265281677246, "sft_loss": 1.5944290161132812, "step": 4920 }, { "epoch": 2.6358922896805486, "grad_norm": 3.3190440778545014, "learning_rate": 4.406728724554154e-08, "logits/chosen": -0.40968436002731323, "logits/rejected": -0.16236819326877594, "logps/chosen": -1.4841177463531494, "logps/rejected": -2.236159086227417, "loss": 0.6345, "rewards/accuracies": 0.75, "rewards/chosen": -1.4841177463531494, "rewards/margins": 0.7520411014556885, "rewards/rejected": -2.236159086227417, "sft_loss": 1.5622247457504272, "step": 4925 }, { "epoch": 2.6385683224619503, "grad_norm": 2.270307742945154, "learning_rate": 4.3430202917840664e-08, "logits/chosen": -0.22066080570220947, "logits/rejected": -0.06292378902435303, "logps/chosen": -1.7009875774383545, "logps/rejected": -2.5794739723205566, "loss": 0.6494, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7009875774383545, "rewards/margins": 0.8784863352775574, "rewards/rejected": -2.5794739723205566, "sft_loss": 1.628734827041626, "step": 4930 }, { "epoch": 2.6412443552433515, "grad_norm": 3.9659339990265505, "learning_rate": 4.279754823231346e-08, "logits/chosen": -0.3291738033294678, "logits/rejected": -0.17241686582565308, "logps/chosen": -1.5517101287841797, "logps/rejected": -2.16611909866333, "loss": 0.6478, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5517101287841797, "rewards/margins": 0.6144087910652161, "rewards/rejected": -2.16611909866333, "sft_loss": 1.5594476461410522, "step": 4935 }, { "epoch": 2.6439203880247533, "grad_norm": 2.5452526134457325, "learning_rate": 4.216932932697859e-08, "logits/chosen": -0.2720089554786682, "logits/rejected": -0.19060388207435608, "logps/chosen": -1.5684664249420166, "logps/rejected": -1.973120927810669, "loss": 0.65, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5684664249420166, "rewards/margins": 0.40465426445007324, "rewards/rejected": -1.973120927810669, "sft_loss": 1.5459164381027222, "step": 4940 }, { "epoch": 2.646596420806155, "grad_norm": 4.884060085397572, "learning_rate": 4.154555229681844e-08, "logits/chosen": -0.3061191439628601, "logits/rejected": -0.1086500734090805, "logps/chosen": -1.523645043373108, "logps/rejected": -2.4154305458068848, "loss": 0.6398, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.523645043373108, "rewards/margins": 0.8917851448059082, "rewards/rejected": -2.4154305458068848, "sft_loss": 1.5471687316894531, "step": 4945 }, { "epoch": 2.6492724535875567, "grad_norm": 6.943349502257102, "learning_rate": 4.092622319372069e-08, "logits/chosen": -0.3145865797996521, "logits/rejected": -0.19390031695365906, "logps/chosen": -1.5767168998718262, "logps/rejected": -2.212101936340332, "loss": 0.6458, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5767168998718262, "rewards/margins": 0.6353851556777954, "rewards/rejected": -2.212101936340332, "sft_loss": 1.5280985832214355, "step": 4950 }, { "epoch": 2.651948486368958, "grad_norm": 5.359155241881496, "learning_rate": 4.031134802641889e-08, "logits/chosen": -0.2595178484916687, "logits/rejected": -0.2513166666030884, "logps/chosen": -1.63980233669281, "logps/rejected": -2.196504831314087, "loss": 0.6431, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.63980233669281, "rewards/margins": 0.5567022562026978, "rewards/rejected": -2.196504831314087, "sft_loss": 1.6192525625228882, "step": 4955 }, { "epoch": 2.6546245191503597, "grad_norm": 2.2051574072073836, "learning_rate": 3.970093276043468e-08, "logits/chosen": -0.18522128462791443, "logits/rejected": -0.08331882953643799, "logps/chosen": -1.5592795610427856, "logps/rejected": -2.1713385581970215, "loss": 0.6496, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5592795610427856, "rewards/margins": 0.6120591759681702, "rewards/rejected": -2.1713385581970215, "sft_loss": 1.558884859085083, "step": 4960 }, { "epoch": 2.657300551931761, "grad_norm": 6.925666390173548, "learning_rate": 3.9094983318019584e-08, "logits/chosen": -0.310133159160614, "logits/rejected": -0.18919309973716736, "logps/chosen": -1.3932650089263916, "logps/rejected": -2.144735813140869, "loss": 0.625, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3932650089263916, "rewards/margins": 0.7514708638191223, "rewards/rejected": -2.144735813140869, "sft_loss": 1.5189520120620728, "step": 4965 }, { "epoch": 2.6599765847131627, "grad_norm": 3.1649653830969893, "learning_rate": 3.849350557809789e-08, "logits/chosen": -0.1915835440158844, "logits/rejected": -0.11682520061731339, "logps/chosen": -1.5142855644226074, "logps/rejected": -2.183206558227539, "loss": 0.641, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5142855644226074, "rewards/margins": 0.6689208149909973, "rewards/rejected": -2.183206558227539, "sft_loss": 1.4711930751800537, "step": 4970 }, { "epoch": 2.6626526174945644, "grad_norm": 2.4456036606950793, "learning_rate": 3.789650537620903e-08, "logits/chosen": -0.2733498215675354, "logits/rejected": -0.2143729031085968, "logps/chosen": -1.6624078750610352, "logps/rejected": -2.2634522914886475, "loss": 0.67, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6624078750610352, "rewards/margins": 0.6010446548461914, "rewards/rejected": -2.2634522914886475, "sft_loss": 1.6090848445892334, "step": 4975 }, { "epoch": 2.665328650275966, "grad_norm": 9.539019846441738, "learning_rate": 3.730398850445182e-08, "logits/chosen": -0.13736586272716522, "logits/rejected": -0.07004425674676895, "logps/chosen": -1.7409322261810303, "logps/rejected": -2.433462142944336, "loss": 0.673, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7409322261810303, "rewards/margins": 0.6925297975540161, "rewards/rejected": -2.433462142944336, "sft_loss": 1.5974147319793701, "step": 4980 }, { "epoch": 2.6680046830573674, "grad_norm": 7.167630686802431, "learning_rate": 3.671596071142735e-08, "logits/chosen": -0.23968443274497986, "logits/rejected": -0.06813397258520126, "logps/chosen": -1.5134305953979492, "logps/rejected": -2.390659809112549, "loss": 0.6366, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5134305953979492, "rewards/margins": 0.8772293925285339, "rewards/rejected": -2.390659809112549, "sft_loss": 1.5120487213134766, "step": 4985 }, { "epoch": 2.670680715838769, "grad_norm": 2.8825014543320644, "learning_rate": 3.6132427702183996e-08, "logits/chosen": -0.34571391344070435, "logits/rejected": -0.13643576204776764, "logps/chosen": -1.5214189291000366, "logps/rejected": -2.2927017211914062, "loss": 0.6436, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5214189291000366, "rewards/margins": 0.7712829113006592, "rewards/rejected": -2.2927017211914062, "sft_loss": 1.5628582239151, "step": 4990 }, { "epoch": 2.6733567486201704, "grad_norm": 3.6960718669777375, "learning_rate": 3.555339513816147e-08, "logits/chosen": -0.3258179724216461, "logits/rejected": -0.30619877576828003, "logps/chosen": -1.6004127264022827, "logps/rejected": -2.1896750926971436, "loss": 0.6655, "rewards/accuracies": 0.625, "rewards/chosen": -1.6004127264022827, "rewards/margins": 0.5892623662948608, "rewards/rejected": -2.1896750926971436, "sft_loss": 1.6143624782562256, "step": 4995 }, { "epoch": 2.676032781401572, "grad_norm": 6.243950321723729, "learning_rate": 3.497886863713639e-08, "logits/chosen": -0.2789207398891449, "logits/rejected": -0.25094786286354065, "logps/chosen": -1.6025774478912354, "logps/rejected": -2.304051160812378, "loss": 0.6485, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6025774478912354, "rewards/margins": 0.7014739513397217, "rewards/rejected": -2.304051160812378, "sft_loss": 1.595412015914917, "step": 5000 }, { "epoch": 2.678708814182974, "grad_norm": 4.093118605787219, "learning_rate": 3.440885377316721e-08, "logits/chosen": -0.23231148719787598, "logits/rejected": -0.18588271737098694, "logps/chosen": -1.567158818244934, "logps/rejected": -2.132018566131592, "loss": 0.652, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.567158818244934, "rewards/margins": 0.5648595094680786, "rewards/rejected": -2.132018566131592, "sft_loss": 1.5321518182754517, "step": 5005 }, { "epoch": 2.6813848469643755, "grad_norm": 3.6018307312522753, "learning_rate": 3.384335607654082e-08, "logits/chosen": -0.2296139895915985, "logits/rejected": -0.12728151679039001, "logps/chosen": -1.7475757598876953, "logps/rejected": -2.402459144592285, "loss": 0.6692, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.7475757598876953, "rewards/margins": 0.6548832654953003, "rewards/rejected": -2.402459144592285, "sft_loss": 1.7332561016082764, "step": 5010 }, { "epoch": 2.684060879745777, "grad_norm": 3.343841512434198, "learning_rate": 3.328238103371811e-08, "logits/chosen": -0.32258838415145874, "logits/rejected": -0.25735121965408325, "logps/chosen": -1.661834478378296, "logps/rejected": -2.3188681602478027, "loss": 0.6459, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.661834478378296, "rewards/margins": 0.6570338010787964, "rewards/rejected": -2.3188681602478027, "sft_loss": 1.5429569482803345, "step": 5015 }, { "epoch": 2.6867369125271785, "grad_norm": 2.69101166999373, "learning_rate": 3.272593408728169e-08, "logits/chosen": -0.3427257239818573, "logits/rejected": -0.15235450863838196, "logps/chosen": -1.5064865350723267, "logps/rejected": -2.1955904960632324, "loss": 0.6611, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5064865350723267, "rewards/margins": 0.6891041398048401, "rewards/rejected": -2.1955904960632324, "sft_loss": 1.537213921546936, "step": 5020 }, { "epoch": 2.6894129453085798, "grad_norm": 2.8557599983363597, "learning_rate": 3.217402063588204e-08, "logits/chosen": -0.31604811549186707, "logits/rejected": -0.17826204001903534, "logps/chosen": -1.597011685371399, "logps/rejected": -2.380256175994873, "loss": 0.662, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.597011685371399, "rewards/margins": 0.7832446098327637, "rewards/rejected": -2.380256175994873, "sft_loss": 1.5641099214553833, "step": 5025 }, { "epoch": 2.6920889780899815, "grad_norm": 3.686247803705324, "learning_rate": 3.162664603418608e-08, "logits/chosen": -0.2612699568271637, "logits/rejected": -0.2027551680803299, "logps/chosen": -1.5545036792755127, "logps/rejected": -2.373471975326538, "loss": 0.6556, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5545036792755127, "rewards/margins": 0.8189681768417358, "rewards/rejected": -2.373471975326538, "sft_loss": 1.5277230739593506, "step": 5030 }, { "epoch": 2.694765010871383, "grad_norm": 4.738638244496703, "learning_rate": 3.1083815592824416e-08, "logits/chosen": -0.3124659061431885, "logits/rejected": -0.18985530734062195, "logps/chosen": -1.681979775428772, "logps/rejected": -2.28306245803833, "loss": 0.66, "rewards/accuracies": 0.65625, "rewards/chosen": -1.681979775428772, "rewards/margins": 0.6010830402374268, "rewards/rejected": -2.28306245803833, "sft_loss": 1.6827268600463867, "step": 5035 }, { "epoch": 2.697441043652785, "grad_norm": 4.123633534097979, "learning_rate": 3.054553457834053e-08, "logits/chosen": -0.09936905652284622, "logits/rejected": -0.13657012581825256, "logps/chosen": -1.5900905132293701, "logps/rejected": -2.274933338165283, "loss": 0.6433, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5900905132293701, "rewards/margins": 0.6848429441452026, "rewards/rejected": -2.274933338165283, "sft_loss": 1.5598783493041992, "step": 5040 }, { "epoch": 2.700117076434186, "grad_norm": 3.126735515936529, "learning_rate": 3.0011808213139036e-08, "logits/chosen": -0.20340998470783234, "logits/rejected": -0.17745666205883026, "logps/chosen": -1.6050293445587158, "logps/rejected": -2.101975917816162, "loss": 0.647, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6050293445587158, "rewards/margins": 0.4969464838504791, "rewards/rejected": -2.101975917816162, "sft_loss": 1.584077000617981, "step": 5045 }, { "epoch": 2.702793109215588, "grad_norm": 3.5855929394333246, "learning_rate": 2.948264167543568e-08, "logits/chosen": -0.26367872953414917, "logits/rejected": -0.19293320178985596, "logps/chosen": -1.3915940523147583, "logps/rejected": -2.081035852432251, "loss": 0.6271, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3915940523147583, "rewards/margins": 0.6894418597221375, "rewards/rejected": -2.081035852432251, "sft_loss": 1.4362269639968872, "step": 5050 }, { "epoch": 2.7054691419969896, "grad_norm": 5.509628511652226, "learning_rate": 2.8958040099206216e-08, "logits/chosen": -0.3857537806034088, "logits/rejected": -0.30278491973876953, "logps/chosen": -1.4894828796386719, "logps/rejected": -2.191249370574951, "loss": 0.6392, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4894828796386719, "rewards/margins": 0.7017661929130554, "rewards/rejected": -2.191249370574951, "sft_loss": 1.4975861310958862, "step": 5055 }, { "epoch": 2.708145174778391, "grad_norm": 4.008088307428305, "learning_rate": 2.843800857413775e-08, "logits/chosen": -0.2717474400997162, "logits/rejected": -0.20423254370689392, "logps/chosen": -1.5966397523880005, "logps/rejected": -2.134685754776001, "loss": 0.6378, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5966397523880005, "rewards/margins": 0.5380457639694214, "rewards/rejected": -2.134685754776001, "sft_loss": 1.6769453287124634, "step": 5060 }, { "epoch": 2.7108212075597926, "grad_norm": 5.957312484952148, "learning_rate": 2.7922552145578203e-08, "logits/chosen": -0.32032904028892517, "logits/rejected": -0.0922280102968216, "logps/chosen": -1.5731478929519653, "logps/rejected": -2.2426846027374268, "loss": 0.6453, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5731478929519653, "rewards/margins": 0.6695364713668823, "rewards/rejected": -2.2426846027374268, "sft_loss": 1.5867424011230469, "step": 5065 }, { "epoch": 2.7134972403411943, "grad_norm": 3.89727463675, "learning_rate": 2.7411675814488277e-08, "logits/chosen": -0.15399877727031708, "logits/rejected": 0.004412566777318716, "logps/chosen": -1.4596203565597534, "logps/rejected": -1.9406630992889404, "loss": 0.6556, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4596203565597534, "rewards/margins": 0.4810427725315094, "rewards/rejected": -1.9406630992889404, "sft_loss": 1.4862662553787231, "step": 5070 }, { "epoch": 2.7161732731225956, "grad_norm": 2.960012412141032, "learning_rate": 2.690538453739216e-08, "logits/chosen": -0.22320058941841125, "logits/rejected": -0.14689429104328156, "logps/chosen": -1.5327249765396118, "logps/rejected": -2.014547824859619, "loss": 0.6622, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5327249765396118, "rewards/margins": 0.4818227291107178, "rewards/rejected": -2.014547824859619, "sft_loss": 1.5514531135559082, "step": 5075 }, { "epoch": 2.7188493059039973, "grad_norm": 3.490855056544133, "learning_rate": 2.6403683226330298e-08, "logits/chosen": -0.2697138786315918, "logits/rejected": -0.13029912114143372, "logps/chosen": -1.5601483583450317, "logps/rejected": -2.2007036209106445, "loss": 0.6519, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5601483583450317, "rewards/margins": 0.6405550241470337, "rewards/rejected": -2.2007036209106445, "sft_loss": 1.5582091808319092, "step": 5080 }, { "epoch": 2.721525338685399, "grad_norm": 7.90813154898587, "learning_rate": 2.5906576748810804e-08, "logits/chosen": -0.30239084362983704, "logits/rejected": -0.17908033728599548, "logps/chosen": -1.4057493209838867, "logps/rejected": -2.244786262512207, "loss": 0.6209, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4057493209838867, "rewards/margins": 0.839036762714386, "rewards/rejected": -2.244786262512207, "sft_loss": 1.4612065553665161, "step": 5085 }, { "epoch": 2.7242013714668003, "grad_norm": 11.903186469798271, "learning_rate": 2.5414069927763016e-08, "logits/chosen": -0.3835442364215851, "logits/rejected": -0.2295791357755661, "logps/chosen": -1.5785081386566162, "logps/rejected": -2.267824649810791, "loss": 0.6608, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5785081386566162, "rewards/margins": 0.68931645154953, "rewards/rejected": -2.267824649810791, "sft_loss": 1.6034950017929077, "step": 5090 }, { "epoch": 2.726877404248202, "grad_norm": 3.276626505410632, "learning_rate": 2.4926167541490185e-08, "logits/chosen": -0.3765779435634613, "logits/rejected": -0.17551617324352264, "logps/chosen": -1.5655916929244995, "logps/rejected": -2.552032470703125, "loss": 0.6348, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5655916929244995, "rewards/margins": 0.9864408373832703, "rewards/rejected": -2.552032470703125, "sft_loss": 1.5733572244644165, "step": 5095 }, { "epoch": 2.7295534370296037, "grad_norm": 3.8762685392197835, "learning_rate": 2.4442874323623574e-08, "logits/chosen": -0.20498760044574738, "logits/rejected": -0.05653483793139458, "logps/chosen": -1.575437307357788, "logps/rejected": -2.520442008972168, "loss": 0.6411, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.575437307357788, "rewards/margins": 0.9450046420097351, "rewards/rejected": -2.520442008972168, "sft_loss": 1.5051355361938477, "step": 5100 }, { "epoch": 2.7322294698110055, "grad_norm": 6.6542950500767954, "learning_rate": 2.396419496307589e-08, "logits/chosen": -0.24930492043495178, "logits/rejected": -0.08691352605819702, "logps/chosen": -1.600701093673706, "logps/rejected": -2.2791550159454346, "loss": 0.6538, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.600701093673706, "rewards/margins": 0.678453803062439, "rewards/rejected": -2.2791550159454346, "sft_loss": 1.5768768787384033, "step": 5105 }, { "epoch": 2.7349055025924067, "grad_norm": 6.431115821488516, "learning_rate": 2.349013410399653e-08, "logits/chosen": -0.32295745611190796, "logits/rejected": -0.18493905663490295, "logps/chosen": -1.6067241430282593, "logps/rejected": -2.3083348274230957, "loss": 0.6458, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6067241430282593, "rewards/margins": 0.701610803604126, "rewards/rejected": -2.3083348274230957, "sft_loss": 1.5453039407730103, "step": 5110 }, { "epoch": 2.7375815353738084, "grad_norm": 3.5398598206435405, "learning_rate": 2.3020696345725954e-08, "logits/chosen": -0.3503361642360687, "logits/rejected": -0.14796528220176697, "logps/chosen": -1.5512923002243042, "logps/rejected": -2.379483461380005, "loss": 0.6448, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5512923002243042, "rewards/margins": 0.8281909823417664, "rewards/rejected": -2.379483461380005, "sft_loss": 1.6102874279022217, "step": 5115 }, { "epoch": 2.7402575681552097, "grad_norm": 4.563942521186379, "learning_rate": 2.2555886242751398e-08, "logits/chosen": -0.3200659155845642, "logits/rejected": -0.25785502791404724, "logps/chosen": -1.613032579421997, "logps/rejected": -2.366720676422119, "loss": 0.6446, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.613032579421997, "rewards/margins": 0.7536881566047668, "rewards/rejected": -2.366720676422119, "sft_loss": 1.608844518661499, "step": 5120 }, { "epoch": 2.7429336009366114, "grad_norm": 4.0383245647248085, "learning_rate": 2.2095708304662453e-08, "logits/chosen": -0.35934972763061523, "logits/rejected": -0.1265702247619629, "logps/chosen": -1.5786548852920532, "logps/rejected": -2.128849744796753, "loss": 0.6373, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5786548852920532, "rewards/margins": 0.5501947999000549, "rewards/rejected": -2.128849744796753, "sft_loss": 1.5914679765701294, "step": 5125 }, { "epoch": 2.745609633718013, "grad_norm": 3.159487481443361, "learning_rate": 2.16401669961076e-08, "logits/chosen": -0.41795772314071655, "logits/rejected": -0.2262589931488037, "logps/chosen": -1.5015418529510498, "logps/rejected": -2.3633315563201904, "loss": 0.642, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5015418529510498, "rewards/margins": 0.861789882183075, "rewards/rejected": -2.3633315563201904, "sft_loss": 1.5934960842132568, "step": 5130 }, { "epoch": 2.748285666499415, "grad_norm": 6.020860195660031, "learning_rate": 2.1189266736750532e-08, "logits/chosen": -0.1939457505941391, "logits/rejected": -0.1189827099442482, "logps/chosen": -1.5646998882293701, "logps/rejected": -2.241270065307617, "loss": 0.6504, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5646998882293701, "rewards/margins": 0.6765701770782471, "rewards/rejected": -2.241270065307617, "sft_loss": 1.5589029788970947, "step": 5135 }, { "epoch": 2.750961699280816, "grad_norm": 6.348729922378617, "learning_rate": 2.0743011901227623e-08, "logits/chosen": -0.23039841651916504, "logits/rejected": -0.09529824554920197, "logps/chosen": -1.6141326427459717, "logps/rejected": -2.1430962085723877, "loss": 0.6735, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.6141326427459717, "rewards/margins": 0.5289635062217712, "rewards/rejected": -2.1430962085723877, "sft_loss": 1.6168859004974365, "step": 5140 }, { "epoch": 2.753637732062218, "grad_norm": 7.055913054164829, "learning_rate": 2.030140681910508e-08, "logits/chosen": -0.29964110255241394, "logits/rejected": -0.1342334747314453, "logps/chosen": -1.4779428243637085, "logps/rejected": -2.107789993286133, "loss": 0.6531, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4779428243637085, "rewards/margins": 0.6298472285270691, "rewards/rejected": -2.107789993286133, "sft_loss": 1.533432960510254, "step": 5145 }, { "epoch": 2.756313764843619, "grad_norm": 3.2676365993423557, "learning_rate": 1.986445577483753e-08, "logits/chosen": -0.34181922674179077, "logits/rejected": -0.2150876522064209, "logps/chosen": -1.4962215423583984, "logps/rejected": -2.098529100418091, "loss": 0.6449, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4962215423583984, "rewards/margins": 0.6023076176643372, "rewards/rejected": -2.098529100418091, "sft_loss": 1.5299503803253174, "step": 5150 }, { "epoch": 2.758989797625021, "grad_norm": 4.71361619614656, "learning_rate": 1.9432163007725765e-08, "logits/chosen": -0.3603561222553253, "logits/rejected": -0.25522202253341675, "logps/chosen": -1.6200759410858154, "logps/rejected": -2.1835179328918457, "loss": 0.6421, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6200759410858154, "rewards/margins": 0.563442051410675, "rewards/rejected": -2.1835179328918457, "sft_loss": 1.6475378274917603, "step": 5155 }, { "epoch": 2.7616658304064226, "grad_norm": 2.5972991715549223, "learning_rate": 1.9004532711876297e-08, "logits/chosen": -0.2789638042449951, "logits/rejected": -0.211712047457695, "logps/chosen": -1.5185253620147705, "logps/rejected": -2.2027554512023926, "loss": 0.6123, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5185253620147705, "rewards/margins": 0.6842299699783325, "rewards/rejected": -2.2027554512023926, "sft_loss": 1.5905386209487915, "step": 5160 }, { "epoch": 2.7643418631878243, "grad_norm": 2.6839215043247804, "learning_rate": 1.8581569036159928e-08, "logits/chosen": -0.27316179871559143, "logits/rejected": -0.0884641632437706, "logps/chosen": -1.455093264579773, "logps/rejected": -2.277247905731201, "loss": 0.6307, "rewards/accuracies": 0.71875, "rewards/chosen": -1.455093264579773, "rewards/margins": 0.8221546411514282, "rewards/rejected": -2.277247905731201, "sft_loss": 1.4999747276306152, "step": 5165 }, { "epoch": 2.7670178959692255, "grad_norm": 3.8919432235391316, "learning_rate": 1.8163276084172285e-08, "logits/chosen": -0.2556915879249573, "logits/rejected": -0.11450406163930893, "logps/chosen": -1.6157286167144775, "logps/rejected": -2.233842134475708, "loss": 0.6536, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6157286167144775, "rewards/margins": 0.6181135177612305, "rewards/rejected": -2.233842134475708, "sft_loss": 1.6188828945159912, "step": 5170 }, { "epoch": 2.7696939287506273, "grad_norm": 5.318265389016087, "learning_rate": 1.7749657914193194e-08, "logits/chosen": -0.31146538257598877, "logits/rejected": -0.20772738754749298, "logps/chosen": -1.5671066045761108, "logps/rejected": -2.3081376552581787, "loss": 0.6414, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5671066045761108, "rewards/margins": 0.7410310506820679, "rewards/rejected": -2.3081376552581787, "sft_loss": 1.5558583736419678, "step": 5175 }, { "epoch": 2.7723699615320285, "grad_norm": 3.857002508565315, "learning_rate": 1.7340718539148203e-08, "logits/chosen": -0.22554603219032288, "logits/rejected": -0.17537787556648254, "logps/chosen": -1.6106573343276978, "logps/rejected": -2.254077196121216, "loss": 0.6622, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6106573343276978, "rewards/margins": 0.6434198617935181, "rewards/rejected": -2.254077196121216, "sft_loss": 1.6596205234527588, "step": 5180 }, { "epoch": 2.7750459943134302, "grad_norm": 5.61133508602783, "learning_rate": 1.6936461926568724e-08, "logits/chosen": -0.26040753722190857, "logits/rejected": -0.1419600248336792, "logps/chosen": -1.4490015506744385, "logps/rejected": -2.20499849319458, "loss": 0.6401, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4490015506744385, "rewards/margins": 0.755996823310852, "rewards/rejected": -2.20499849319458, "sft_loss": 1.4934262037277222, "step": 5185 }, { "epoch": 2.777722027094832, "grad_norm": 4.324923819447895, "learning_rate": 1.6536891998554346e-08, "logits/chosen": -0.4055728316307068, "logits/rejected": -0.24443522095680237, "logps/chosen": -1.5296590328216553, "logps/rejected": -2.17419695854187, "loss": 0.631, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5296590328216553, "rewards/margins": 0.6445378661155701, "rewards/rejected": -2.17419695854187, "sft_loss": 1.6248109340667725, "step": 5190 }, { "epoch": 2.7803980598762337, "grad_norm": 3.325761392550191, "learning_rate": 1.6142012631734093e-08, "logits/chosen": -0.2833782732486725, "logits/rejected": -0.15649259090423584, "logps/chosen": -1.5566611289978027, "logps/rejected": -2.14329195022583, "loss": 0.6519, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5566611289978027, "rewards/margins": 0.5866307020187378, "rewards/rejected": -2.14329195022583, "sft_loss": 1.547494649887085, "step": 5195 }, { "epoch": 2.783074092657635, "grad_norm": 6.33032465055404, "learning_rate": 1.575182765722949e-08, "logits/chosen": -0.3567900061607361, "logits/rejected": -0.20585401356220245, "logps/chosen": -1.4549974203109741, "logps/rejected": -2.2079615592956543, "loss": 0.6448, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4549974203109741, "rewards/margins": 0.7529643774032593, "rewards/rejected": -2.2079615592956543, "sft_loss": 1.535797357559204, "step": 5200 }, { "epoch": 2.783074092657635, "eval_logits/chosen": 0.01046951673924923, "eval_logits/rejected": 0.09744537621736526, "eval_logps/chosen": -1.5994479656219482, "eval_logps/rejected": -2.2308409214019775, "eval_loss": 0.6683368682861328, "eval_rewards/accuracies": 0.6505934596061707, "eval_rewards/chosen": -1.5994479656219482, "eval_rewards/margins": 0.6313928961753845, "eval_rewards/rejected": -2.2308409214019775, "eval_runtime": 43.2663, "eval_samples_per_second": 31.087, "eval_sft_loss": 1.5882415771484375, "eval_steps_per_second": 7.789, "step": 5200 }, { "epoch": 2.7857501254390367, "grad_norm": 2.680162601967427, "learning_rate": 1.536634086061672e-08, "logits/chosen": -0.24441878497600555, "logits/rejected": -0.2090197503566742, "logps/chosen": -1.5445456504821777, "logps/rejected": -2.1508004665374756, "loss": 0.6597, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5445456504821777, "rewards/margins": 0.6062547564506531, "rewards/rejected": -2.1508004665374756, "sft_loss": 1.5353953838348389, "step": 5205 }, { "epoch": 2.788426158220438, "grad_norm": 3.054166573705804, "learning_rate": 1.4985555981890495e-08, "logits/chosen": -0.3028886914253235, "logits/rejected": -0.2101995050907135, "logps/chosen": -1.6469351053237915, "logps/rejected": -2.328545093536377, "loss": 0.6422, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6469351053237915, "rewards/margins": 0.6816102266311646, "rewards/rejected": -2.328545093536377, "sft_loss": 1.56937837600708, "step": 5210 }, { "epoch": 2.7911021910018396, "grad_norm": 4.641939170377754, "learning_rate": 1.4609476715427226e-08, "logits/chosen": -0.24724645912647247, "logits/rejected": -0.1400899440050125, "logps/chosen": -1.4612640142440796, "logps/rejected": -2.281329870223999, "loss": 0.6408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4612640142440796, "rewards/margins": 0.820065975189209, "rewards/rejected": -2.281329870223999, "sft_loss": 1.5288423299789429, "step": 5215 }, { "epoch": 2.7937782237832414, "grad_norm": 6.570214132928371, "learning_rate": 1.4238106709949792e-08, "logits/chosen": -0.2952974736690521, "logits/rejected": -0.21536913514137268, "logps/chosen": -1.4964368343353271, "logps/rejected": -2.307354688644409, "loss": 0.6324, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4964368343353271, "rewards/margins": 0.8109177350997925, "rewards/rejected": -2.307354688644409, "sft_loss": 1.5774792432785034, "step": 5220 }, { "epoch": 2.796454256564643, "grad_norm": 6.497025295805081, "learning_rate": 1.3871449568491511e-08, "logits/chosen": -0.2488592565059662, "logits/rejected": -0.11200263351202011, "logps/chosen": -1.6183297634124756, "logps/rejected": -2.316483974456787, "loss": 0.6522, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6183297634124756, "rewards/margins": 0.6981542706489563, "rewards/rejected": -2.316483974456787, "sft_loss": 1.6047000885009766, "step": 5225 }, { "epoch": 2.7991302893460444, "grad_norm": 4.641487498277689, "learning_rate": 1.3509508848361606e-08, "logits/chosen": -0.39767542481422424, "logits/rejected": -0.25908923149108887, "logps/chosen": -1.5914344787597656, "logps/rejected": -2.2321314811706543, "loss": 0.6467, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5914344787597656, "rewards/margins": 0.6406969428062439, "rewards/rejected": -2.2321314811706543, "sft_loss": 1.5355947017669678, "step": 5230 }, { "epoch": 2.801806322127446, "grad_norm": 3.0558847707427317, "learning_rate": 1.3152288061110517e-08, "logits/chosen": -0.3713652491569519, "logits/rejected": -0.2548134922981262, "logps/chosen": -1.5258245468139648, "logps/rejected": -2.2273030281066895, "loss": 0.6565, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5258245468139648, "rewards/margins": 0.7014786601066589, "rewards/rejected": -2.2273030281066895, "sft_loss": 1.5043542385101318, "step": 5235 }, { "epoch": 2.804482354908848, "grad_norm": 6.559227207256044, "learning_rate": 1.2799790672495814e-08, "logits/chosen": -0.3284551799297333, "logits/rejected": -0.12331485748291016, "logps/chosen": -1.5666601657867432, "logps/rejected": -2.338242769241333, "loss": 0.6515, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5666601657867432, "rewards/margins": 0.7715827226638794, "rewards/rejected": -2.338242769241333, "sft_loss": 1.5562536716461182, "step": 5240 }, { "epoch": 2.807158387690249, "grad_norm": 2.4036030650374505, "learning_rate": 1.2452020102448835e-08, "logits/chosen": -0.24320153892040253, "logits/rejected": -0.19215801358222961, "logps/chosen": -1.5605636835098267, "logps/rejected": -2.0627856254577637, "loss": 0.6685, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5605636835098267, "rewards/margins": 0.5022218823432922, "rewards/rejected": -2.0627856254577637, "sft_loss": 1.5639190673828125, "step": 5245 }, { "epoch": 2.8098344204716508, "grad_norm": 8.49748018591424, "learning_rate": 1.2108979725041103e-08, "logits/chosen": -0.3559146523475647, "logits/rejected": -0.218179389834404, "logps/chosen": -1.5596989393234253, "logps/rejected": -2.18450665473938, "loss": 0.6614, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5596989393234253, "rewards/margins": 0.6248075366020203, "rewards/rejected": -2.18450665473938, "sft_loss": 1.591348648071289, "step": 5250 }, { "epoch": 2.8125104532530525, "grad_norm": 4.364225513311659, "learning_rate": 1.1770672868451958e-08, "logits/chosen": -0.3184386193752289, "logits/rejected": -0.11092986166477203, "logps/chosen": -1.5821802616119385, "logps/rejected": -2.2117114067077637, "loss": 0.6499, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5821802616119385, "rewards/margins": 0.6295314431190491, "rewards/rejected": -2.2117114067077637, "sft_loss": 1.5408474206924438, "step": 5255 }, { "epoch": 2.8151864860344538, "grad_norm": 4.4509052230402215, "learning_rate": 1.1437102814935872e-08, "logits/chosen": -0.2785467803478241, "logits/rejected": -0.2110917866230011, "logps/chosen": -1.5520164966583252, "logps/rejected": -2.3556294441223145, "loss": 0.6417, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5520164966583252, "rewards/margins": 0.8036128878593445, "rewards/rejected": -2.3556294441223145, "sft_loss": 1.6278345584869385, "step": 5260 }, { "epoch": 2.8178625188158555, "grad_norm": 3.909993815706318, "learning_rate": 1.1108272800791018e-08, "logits/chosen": -0.416201651096344, "logits/rejected": -0.20776471495628357, "logps/chosen": -1.7517368793487549, "logps/rejected": -2.3358230590820312, "loss": 0.6689, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7517368793487549, "rewards/margins": 0.5840864181518555, "rewards/rejected": -2.3358230590820312, "sft_loss": 1.7384824752807617, "step": 5265 }, { "epoch": 2.820538551597257, "grad_norm": 7.9290690308624425, "learning_rate": 1.078418601632769e-08, "logits/chosen": -0.2663486897945404, "logits/rejected": -0.11282068490982056, "logps/chosen": -1.5236700773239136, "logps/rejected": -2.178757905960083, "loss": 0.6383, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5236700773239136, "rewards/margins": 0.6550878286361694, "rewards/rejected": -2.178757905960083, "sft_loss": 1.5293155908584595, "step": 5270 }, { "epoch": 2.8232145843786585, "grad_norm": 4.591601960097695, "learning_rate": 1.0464845605837159e-08, "logits/chosen": -0.26813387870788574, "logits/rejected": -0.1137576550245285, "logps/chosen": -1.5806580781936646, "logps/rejected": -2.180480480194092, "loss": 0.6528, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5806580781936646, "rewards/margins": 0.5998224020004272, "rewards/rejected": -2.180480480194092, "sft_loss": 1.561107873916626, "step": 5275 }, { "epoch": 2.82589061716006, "grad_norm": 3.5423652427077172, "learning_rate": 1.0150254667561642e-08, "logits/chosen": -0.26632776856422424, "logits/rejected": -0.09625057876110077, "logps/chosen": -1.7073192596435547, "logps/rejected": -2.460038423538208, "loss": 0.6569, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7073192596435547, "rewards/margins": 0.7527190446853638, "rewards/rejected": -2.460038423538208, "sft_loss": 1.6452124118804932, "step": 5280 }, { "epoch": 2.828566649941462, "grad_norm": 2.8404674053871117, "learning_rate": 9.840416253663719e-09, "logits/chosen": -0.33919811248779297, "logits/rejected": -0.23609662055969238, "logps/chosen": -1.4722113609313965, "logps/rejected": -2.3265535831451416, "loss": 0.6373, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4722113609313965, "rewards/margins": 0.854341983795166, "rewards/rejected": -2.3265535831451416, "sft_loss": 1.4935623407363892, "step": 5285 }, { "epoch": 2.8312426827228636, "grad_norm": 4.0797872044318915, "learning_rate": 9.535333370197074e-09, "logits/chosen": -0.32204926013946533, "logits/rejected": -0.1875072419643402, "logps/chosen": -1.5559985637664795, "logps/rejected": -2.128537654876709, "loss": 0.6622, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5559985637664795, "rewards/margins": 0.572539210319519, "rewards/rejected": -2.128537654876709, "sft_loss": 1.6035436391830444, "step": 5290 }, { "epoch": 2.833918715504265, "grad_norm": 2.8017841765647, "learning_rate": 9.23500897707713e-09, "logits/chosen": -0.36441072821617126, "logits/rejected": -0.18555672466754913, "logps/chosen": -1.6992461681365967, "logps/rejected": -2.3326878547668457, "loss": 0.6643, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6992461681365967, "rewards/margins": 0.6334417462348938, "rewards/rejected": -2.3326878547668457, "sft_loss": 1.6844333410263062, "step": 5295 }, { "epoch": 2.8365947482856666, "grad_norm": 3.571625497941765, "learning_rate": 8.939445988052574e-09, "logits/chosen": -0.3157997727394104, "logits/rejected": -0.2676517367362976, "logps/chosen": -1.5811196565628052, "logps/rejected": -2.3758318424224854, "loss": 0.645, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5811196565628052, "rewards/margins": 0.7947121858596802, "rewards/rejected": -2.3758318424224854, "sft_loss": 1.5407793521881104, "step": 5300 }, { "epoch": 2.839270781067068, "grad_norm": 4.93834901122263, "learning_rate": 8.648647270676656e-09, "logits/chosen": -0.2959749102592468, "logits/rejected": -0.17174866795539856, "logps/chosen": -1.5706027746200562, "logps/rejected": -2.3317575454711914, "loss": 0.6561, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5706027746200562, "rewards/margins": 0.7611549496650696, "rewards/rejected": -2.3317575454711914, "sft_loss": 1.6460769176483154, "step": 5305 }, { "epoch": 2.8419468138484696, "grad_norm": 3.762595430783216, "learning_rate": 8.362615646279991e-09, "logits/chosen": -0.4393877387046814, "logits/rejected": -0.2043670117855072, "logps/chosen": -1.5172808170318604, "logps/rejected": -2.394310474395752, "loss": 0.6391, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5172808170318604, "rewards/margins": 0.8770295977592468, "rewards/rejected": -2.394310474395752, "sft_loss": 1.5674129724502563, "step": 5310 }, { "epoch": 2.8446228466298713, "grad_norm": 4.443768579457408, "learning_rate": 8.081353889942466e-09, "logits/chosen": -0.19146893918514252, "logits/rejected": -0.07724063098430634, "logps/chosen": -1.553026795387268, "logps/rejected": -2.1316425800323486, "loss": 0.6546, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.553026795387268, "rewards/margins": 0.5786157846450806, "rewards/rejected": -2.1316425800323486, "sft_loss": 1.562693476676941, "step": 5315 }, { "epoch": 2.847298879411273, "grad_norm": 5.722678383489239, "learning_rate": 7.804864730467042e-09, "logits/chosen": -0.2044384479522705, "logits/rejected": -0.11913935840129852, "logps/chosen": -1.5690629482269287, "logps/rejected": -2.0477561950683594, "loss": 0.6522, "rewards/accuracies": 0.625, "rewards/chosen": -1.5690629482269287, "rewards/margins": 0.4786931872367859, "rewards/rejected": -2.0477561950683594, "sft_loss": 1.4781545400619507, "step": 5320 }, { "epoch": 2.8499749121926743, "grad_norm": 2.0470696594380793, "learning_rate": 7.533150850352665e-09, "logits/chosen": -0.24658894538879395, "logits/rejected": -0.10288085043430328, "logps/chosen": -1.6159712076187134, "logps/rejected": -2.430453062057495, "loss": 0.6456, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6159712076187134, "rewards/margins": 0.8144820332527161, "rewards/rejected": -2.430453062057495, "sft_loss": 1.5644042491912842, "step": 5325 }, { "epoch": 2.852650944974076, "grad_norm": 7.680564951023092, "learning_rate": 7.2662148857686175e-09, "logits/chosen": -0.2537384331226349, "logits/rejected": -0.186565563082695, "logps/chosen": -1.5757337808609009, "logps/rejected": -2.265808582305908, "loss": 0.6431, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5757337808609009, "rewards/margins": 0.6900747418403625, "rewards/rejected": -2.265808582305908, "sft_loss": 1.5722882747650146, "step": 5330 }, { "epoch": 2.8553269777554773, "grad_norm": 8.408067887122359, "learning_rate": 7.0040594265287635e-09, "logits/chosen": -0.21966715157032013, "logits/rejected": -0.2432653158903122, "logps/chosen": -1.5844430923461914, "logps/rejected": -2.055832624435425, "loss": 0.6625, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5844430923461914, "rewards/margins": 0.4713897109031677, "rewards/rejected": -2.055832624435425, "sft_loss": 1.5317051410675049, "step": 5335 }, { "epoch": 2.858003010536879, "grad_norm": 4.190014485730588, "learning_rate": 6.746687016066566e-09, "logits/chosen": -0.2685621380805969, "logits/rejected": -0.22150039672851562, "logps/chosen": -1.5397356748580933, "logps/rejected": -2.119022846221924, "loss": 0.6605, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5397356748580933, "rewards/margins": 0.5792871713638306, "rewards/rejected": -2.119022846221924, "sft_loss": 1.502520203590393, "step": 5340 }, { "epoch": 2.8606790433182807, "grad_norm": 2.41810357458596, "learning_rate": 6.494100151410276e-09, "logits/chosen": -0.4025016725063324, "logits/rejected": -0.2326141893863678, "logps/chosen": -1.4536174535751343, "logps/rejected": -2.1672511100769043, "loss": 0.6297, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4536174535751343, "rewards/margins": 0.71363365650177, "rewards/rejected": -2.1672511100769043, "sft_loss": 1.5110516548156738, "step": 5345 }, { "epoch": 2.8633550760996824, "grad_norm": 4.798281016538979, "learning_rate": 6.246301283158728e-09, "logits/chosen": -0.1976149082183838, "logits/rejected": -0.21479268372058868, "logps/chosen": -1.6265045404434204, "logps/rejected": -2.204802989959717, "loss": 0.6641, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6265045404434204, "rewards/margins": 0.5782985687255859, "rewards/rejected": -2.204802989959717, "sft_loss": 1.5611135959625244, "step": 5350 }, { "epoch": 2.8660311088810837, "grad_norm": 3.3184133275680985, "learning_rate": 6.0032928154576944e-09, "logits/chosen": -0.2918176054954529, "logits/rejected": -0.20298846065998077, "logps/chosen": -1.5935795307159424, "logps/rejected": -2.1652448177337646, "loss": 0.6706, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5935795307159424, "rewards/margins": 0.5716655850410461, "rewards/rejected": -2.1652448177337646, "sft_loss": 1.602667212486267, "step": 5355 }, { "epoch": 2.8687071416624854, "grad_norm": 9.405439850958698, "learning_rate": 5.76507710597629e-09, "logits/chosen": -0.2645604908466339, "logits/rejected": -0.06947880983352661, "logps/chosen": -1.5236592292785645, "logps/rejected": -2.236105442047119, "loss": 0.6433, "rewards/accuracies": 0.625, "rewards/chosen": -1.5236592292785645, "rewards/margins": 0.7124462127685547, "rewards/rejected": -2.236105442047119, "sft_loss": 1.584325909614563, "step": 5360 }, { "epoch": 2.8713831744438867, "grad_norm": 3.9460809975459488, "learning_rate": 5.531656465884438e-09, "logits/chosen": -0.33310556411743164, "logits/rejected": -0.16928938031196594, "logps/chosen": -1.6103332042694092, "logps/rejected": -2.4057180881500244, "loss": 0.6489, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6103332042694092, "rewards/margins": 0.7953847646713257, "rewards/rejected": -2.4057180881500244, "sft_loss": 1.6243871450424194, "step": 5365 }, { "epoch": 2.8740592072252884, "grad_norm": 3.3978433077924657, "learning_rate": 5.303033159830217e-09, "logits/chosen": -0.18408358097076416, "logits/rejected": -0.14971300959587097, "logps/chosen": -1.610769510269165, "logps/rejected": -1.966650366783142, "loss": 0.6748, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.610769510269165, "rewards/margins": 0.355881005525589, "rewards/rejected": -1.966650366783142, "sft_loss": 1.5604798793792725, "step": 5370 }, { "epoch": 2.87673524000669, "grad_norm": 4.7975969213514515, "learning_rate": 5.079209405917939e-09, "logits/chosen": -0.24301142990589142, "logits/rejected": -0.14734292030334473, "logps/chosen": -1.503179907798767, "logps/rejected": -2.5005712509155273, "loss": 0.6303, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.503179907798767, "rewards/margins": 0.9973915219306946, "rewards/rejected": -2.5005712509155273, "sft_loss": 1.5697084665298462, "step": 5375 }, { "epoch": 2.879411272788092, "grad_norm": 5.333869239307722, "learning_rate": 4.860187375686664e-09, "logits/chosen": -0.33797964453697205, "logits/rejected": -0.12401552498340607, "logps/chosen": -1.6644957065582275, "logps/rejected": -2.3786120414733887, "loss": 0.6483, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6644957065582275, "rewards/margins": 0.7141159772872925, "rewards/rejected": -2.3786120414733887, "sft_loss": 1.7237141132354736, "step": 5380 }, { "epoch": 2.882087305569493, "grad_norm": 4.832180713170385, "learning_rate": 4.64596919408905e-09, "logits/chosen": -0.22082212567329407, "logits/rejected": -0.1354362666606903, "logps/chosen": -1.6065336465835571, "logps/rejected": -2.0319314002990723, "loss": 0.6486, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6065336465835571, "rewards/margins": 0.42539745569229126, "rewards/rejected": -2.0319314002990723, "sft_loss": 1.5567100048065186, "step": 5385 }, { "epoch": 2.884763338350895, "grad_norm": 4.212411607729048, "learning_rate": 4.436556939470814e-09, "logits/chosen": -0.2250661551952362, "logits/rejected": -0.0926482304930687, "logps/chosen": -1.6468353271484375, "logps/rejected": -2.17008638381958, "loss": 0.648, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6468353271484375, "rewards/margins": 0.5232512950897217, "rewards/rejected": -2.17008638381958, "sft_loss": 1.6684173345565796, "step": 5390 }, { "epoch": 2.887439371132296, "grad_norm": 2.5706071635517014, "learning_rate": 4.23195264355064e-09, "logits/chosen": -0.3760332465171814, "logits/rejected": -0.17019043862819672, "logps/chosen": -1.448594331741333, "logps/rejected": -2.1622211933135986, "loss": 0.6252, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.448594331741333, "rewards/margins": 0.7136268615722656, "rewards/rejected": -2.1622211933135986, "sft_loss": 1.4942363500595093, "step": 5395 }, { "epoch": 2.890115403913698, "grad_norm": 4.723585039176921, "learning_rate": 4.032158291400245e-09, "logits/chosen": -0.27923649549484253, "logits/rejected": -0.02440541796386242, "logps/chosen": -1.5308659076690674, "logps/rejected": -2.5761771202087402, "loss": 0.6419, "rewards/accuracies": 0.75, "rewards/chosen": -1.5308659076690674, "rewards/margins": 1.0453112125396729, "rewards/rejected": -2.5761771202087402, "sft_loss": 1.5155452489852905, "step": 5400 }, { "epoch": 2.8927914366950995, "grad_norm": 2.891451796857847, "learning_rate": 3.837175821425398e-09, "logits/chosen": -0.21183153986930847, "logits/rejected": -0.15966220200061798, "logps/chosen": -1.8173433542251587, "logps/rejected": -2.3437368869781494, "loss": 0.6572, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.8173433542251587, "rewards/margins": 0.5263934135437012, "rewards/rejected": -2.3437368869781494, "sft_loss": 1.6783323287963867, "step": 5405 }, { "epoch": 2.8954674694765012, "grad_norm": 3.0268618418878908, "learning_rate": 3.6470071253467683e-09, "logits/chosen": -0.20779451727867126, "logits/rejected": -0.08706004917621613, "logps/chosen": -1.557328701019287, "logps/rejected": -2.4320130348205566, "loss": 0.6405, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.557328701019287, "rewards/margins": 0.8746845126152039, "rewards/rejected": -2.4320130348205566, "sft_loss": 1.577407956123352, "step": 5410 }, { "epoch": 2.8981435022579025, "grad_norm": 2.970437881892552, "learning_rate": 3.461654048181939e-09, "logits/chosen": -0.28720521926879883, "logits/rejected": -0.10532665252685547, "logps/chosen": -1.5876381397247314, "logps/rejected": -2.1869492530822754, "loss": 0.6588, "rewards/accuracies": 0.625, "rewards/chosen": -1.5876381397247314, "rewards/margins": 0.5993111729621887, "rewards/rejected": -2.1869492530822754, "sft_loss": 1.6944866180419922, "step": 5415 }, { "epoch": 2.9008195350393042, "grad_norm": 5.731162383629648, "learning_rate": 3.281118388227255e-09, "logits/chosen": -0.22121545672416687, "logits/rejected": -0.1469089388847351, "logps/chosen": -1.5260721445083618, "logps/rejected": -2.099198579788208, "loss": 0.6441, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5260721445083618, "rewards/margins": 0.5731264352798462, "rewards/rejected": -2.099198579788208, "sft_loss": 1.5268216133117676, "step": 5420 }, { "epoch": 2.903495567820706, "grad_norm": 5.047704291212816, "learning_rate": 3.1054018970405048e-09, "logits/chosen": -0.2476317435503006, "logits/rejected": -0.11390645802021027, "logps/chosen": -1.580430030822754, "logps/rejected": -2.254305601119995, "loss": 0.65, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.580430030822754, "rewards/margins": 0.6738757491111755, "rewards/rejected": -2.254305601119995, "sft_loss": 1.5548816919326782, "step": 5425 }, { "epoch": 2.906171600602107, "grad_norm": 4.722096505210587, "learning_rate": 2.9345062794238207e-09, "logits/chosen": -0.29212015867233276, "logits/rejected": -0.12381164729595184, "logps/chosen": -1.5768067836761475, "logps/rejected": -2.2491636276245117, "loss": 0.6373, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5768067836761475, "rewards/margins": 0.6723569631576538, "rewards/rejected": -2.2491636276245117, "sft_loss": 1.6210720539093018, "step": 5430 }, { "epoch": 2.908847633383509, "grad_norm": 3.2095242212092137, "learning_rate": 2.7684331934072492e-09, "logits/chosen": -0.4141046404838562, "logits/rejected": -0.3132175803184509, "logps/chosen": -1.5623668432235718, "logps/rejected": -2.2315099239349365, "loss": 0.6424, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5623668432235718, "rewards/margins": 0.6691430807113647, "rewards/rejected": -2.2315099239349365, "sft_loss": 1.5677772760391235, "step": 5435 }, { "epoch": 2.9115236661649107, "grad_norm": 3.237316941256485, "learning_rate": 2.6071842502326526e-09, "logits/chosen": -0.29903578758239746, "logits/rejected": -0.1807752251625061, "logps/chosen": -1.5542609691619873, "logps/rejected": -2.1173808574676514, "loss": 0.649, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5542609691619873, "rewards/margins": 0.5631201863288879, "rewards/rejected": -2.1173808574676514, "sft_loss": 1.5871899127960205, "step": 5440 }, { "epoch": 2.9141996989463124, "grad_norm": 4.449145241684852, "learning_rate": 2.450761014337888e-09, "logits/chosen": -0.10690195858478546, "logits/rejected": -0.09125228226184845, "logps/chosen": -1.6235891580581665, "logps/rejected": -2.516434907913208, "loss": 0.6415, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6235891580581665, "rewards/margins": 0.892845630645752, "rewards/rejected": -2.516434907913208, "sft_loss": 1.5800119638442993, "step": 5445 }, { "epoch": 2.9168757317277136, "grad_norm": 4.611714592430611, "learning_rate": 2.299165003341985e-09, "logits/chosen": -0.14552733302116394, "logits/rejected": -0.04473976045846939, "logps/chosen": -1.5664246082305908, "logps/rejected": -2.4397082328796387, "loss": 0.6506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5664246082305908, "rewards/margins": 0.8732835054397583, "rewards/rejected": -2.4397082328796387, "sft_loss": 1.5751209259033203, "step": 5450 }, { "epoch": 2.9195517645091154, "grad_norm": 5.547178095937166, "learning_rate": 2.1523976880299945e-09, "logits/chosen": -0.28409165143966675, "logits/rejected": -0.10616005957126617, "logps/chosen": -1.579883098602295, "logps/rejected": -2.171001672744751, "loss": 0.666, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.579883098602295, "rewards/margins": 0.591118574142456, "rewards/rejected": -2.171001672744751, "sft_loss": 1.5925939083099365, "step": 5455 }, { "epoch": 2.9222277972905166, "grad_norm": 4.930655705283452, "learning_rate": 2.010460492339161e-09, "logits/chosen": -0.2638966143131256, "logits/rejected": -0.16020557284355164, "logps/chosen": -1.5293127298355103, "logps/rejected": -2.3441619873046875, "loss": 0.6344, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5293127298355103, "rewards/margins": 0.8148494958877563, "rewards/rejected": -2.3441619873046875, "sft_loss": 1.5903923511505127, "step": 5460 }, { "epoch": 2.9249038300719183, "grad_norm": 5.135646130457434, "learning_rate": 1.8733547933446614e-09, "logits/chosen": -0.36130863428115845, "logits/rejected": -0.17520561814308167, "logps/chosen": -1.6114094257354736, "logps/rejected": -2.241234064102173, "loss": 0.6592, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6114094257354736, "rewards/margins": 0.6298244595527649, "rewards/rejected": -2.241234064102173, "sft_loss": 1.5910862684249878, "step": 5465 }, { "epoch": 2.92757986285332, "grad_norm": 4.723087213112723, "learning_rate": 1.7410819212467231e-09, "logits/chosen": -0.22094795107841492, "logits/rejected": -0.12475217878818512, "logps/chosen": -1.5482748746871948, "logps/rejected": -2.1302969455718994, "loss": 0.6556, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5482748746871948, "rewards/margins": 0.582021951675415, "rewards/rejected": -2.1302969455718994, "sft_loss": 1.5279250144958496, "step": 5470 }, { "epoch": 2.9302558956347218, "grad_norm": 3.8916001177332555, "learning_rate": 1.613643159357192e-09, "logits/chosen": -0.20356476306915283, "logits/rejected": -0.24255618453025818, "logps/chosen": -1.5652469396591187, "logps/rejected": -2.1905503273010254, "loss": 0.6414, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5652469396591187, "rewards/margins": 0.6253035068511963, "rewards/rejected": -2.1905503273010254, "sft_loss": 1.6226780414581299, "step": 5475 }, { "epoch": 2.932931928416123, "grad_norm": 4.672538576812152, "learning_rate": 1.4910397440875967e-09, "logits/chosen": -0.26059651374816895, "logits/rejected": -0.15020112693309784, "logps/chosen": -1.5707706212997437, "logps/rejected": -2.1416842937469482, "loss": 0.6543, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5707706212997437, "rewards/margins": 0.5709136724472046, "rewards/rejected": -2.1416842937469482, "sft_loss": 1.595155119895935, "step": 5480 }, { "epoch": 2.9356079611975248, "grad_norm": 7.175122015176713, "learning_rate": 1.3732728649368253e-09, "logits/chosen": -0.2035883218050003, "logits/rejected": -0.031918738037347794, "logps/chosen": -1.4992424249649048, "logps/rejected": -2.2319529056549072, "loss": 0.642, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4992424249649048, "rewards/margins": 0.7327104806900024, "rewards/rejected": -2.2319529056549072, "sft_loss": 1.5359947681427002, "step": 5485 }, { "epoch": 2.938283993978926, "grad_norm": 4.796408365540654, "learning_rate": 1.260343664479524e-09, "logits/chosen": -0.23860719799995422, "logits/rejected": -0.19544881582260132, "logps/chosen": -1.5336408615112305, "logps/rejected": -2.178316593170166, "loss": 0.6522, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5336408615112305, "rewards/margins": 0.6446754336357117, "rewards/rejected": -2.178316593170166, "sft_loss": 1.5777490139007568, "step": 5490 }, { "epoch": 2.9409600267603278, "grad_norm": 4.451082691730898, "learning_rate": 1.1522532383554384e-09, "logits/chosen": -0.3357131779193878, "logits/rejected": -0.1376783400774002, "logps/chosen": -1.5098403692245483, "logps/rejected": -2.2403979301452637, "loss": 0.6342, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5098403692245483, "rewards/margins": 0.7305575609207153, "rewards/rejected": -2.2403979301452637, "sft_loss": 1.5593976974487305, "step": 5495 }, { "epoch": 2.9436360595417295, "grad_norm": 2.2502258639724393, "learning_rate": 1.049002635258256e-09, "logits/chosen": -0.2203347384929657, "logits/rejected": -0.10753805935382843, "logps/chosen": -1.7016786336898804, "logps/rejected": -2.338911771774292, "loss": 0.6609, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.7016786336898804, "rewards/margins": 0.6372330188751221, "rewards/rejected": -2.338911771774292, "sft_loss": 1.6395244598388672, "step": 5500 }, { "epoch": 2.946312092323131, "grad_norm": 2.1470051189006942, "learning_rate": 9.505928569258358e-10, "logits/chosen": -0.19624938070774078, "logits/rejected": -0.18421564996242523, "logps/chosen": -1.5481547117233276, "logps/rejected": -2.177351474761963, "loss": 0.6356, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5481547117233276, "rewards/margins": 0.6291965246200562, "rewards/rejected": -2.177351474761963, "sft_loss": 1.6014915704727173, "step": 5505 }, { "epoch": 2.9489881251045325, "grad_norm": 2.8801187629736114, "learning_rate": 8.57024858130273e-10, "logits/chosen": -0.2564873993396759, "logits/rejected": -0.12604650855064392, "logps/chosen": -1.596642255783081, "logps/rejected": -2.6083335876464844, "loss": 0.635, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.596642255783081, "rewards/margins": 1.0116910934448242, "rewards/rejected": -2.6083335876464844, "sft_loss": 1.5695760250091553, "step": 5510 }, { "epoch": 2.951664157885934, "grad_norm": 5.366481336133112, "learning_rate": 7.682995466686826e-10, "logits/chosen": -0.3548617959022522, "logits/rejected": -0.2135559767484665, "logps/chosen": -1.679685354232788, "logps/rejected": -2.35361647605896, "loss": 0.6378, "rewards/accuracies": 0.65625, "rewards/chosen": -1.679685354232788, "rewards/margins": 0.6739312410354614, "rewards/rejected": -2.35361647605896, "sft_loss": 1.6176130771636963, "step": 5515 }, { "epoch": 2.9543401906673354, "grad_norm": 7.126941759915519, "learning_rate": 6.844177833543741e-10, "logits/chosen": -0.24436382949352264, "logits/rejected": -0.18135519325733185, "logps/chosen": -1.5110137462615967, "logps/rejected": -2.1283164024353027, "loss": 0.6535, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5110137462615967, "rewards/margins": 0.6173027753829956, "rewards/rejected": -2.1283164024353027, "sft_loss": 1.5346163511276245, "step": 5520 }, { "epoch": 2.957016223448737, "grad_norm": 4.35177873431574, "learning_rate": 6.053803820087467e-10, "logits/chosen": -0.25509151816368103, "logits/rejected": -0.12994591891765594, "logps/chosen": -1.594662070274353, "logps/rejected": -2.4333889484405518, "loss": 0.6349, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.594662070274353, "rewards/margins": 0.8387266993522644, "rewards/rejected": -2.4333889484405518, "sft_loss": 1.6135175228118896, "step": 5525 }, { "epoch": 2.959692256230139, "grad_norm": 3.6754757430630676, "learning_rate": 5.311881094528514e-10, "logits/chosen": -0.3149816393852234, "logits/rejected": -0.08832015097141266, "logps/chosen": -1.7405153512954712, "logps/rejected": -2.200068473815918, "loss": 0.6767, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7405153512954712, "rewards/margins": 0.4595530927181244, "rewards/rejected": -2.200068473815918, "sft_loss": 1.6633392572402954, "step": 5530 }, { "epoch": 2.9623682890115406, "grad_norm": 6.369769662890393, "learning_rate": 4.6184168550050806e-10, "logits/chosen": -0.27111494541168213, "logits/rejected": -0.218735933303833, "logps/chosen": -1.5510780811309814, "logps/rejected": -2.116690158843994, "loss": 0.6484, "rewards/accuracies": 0.625, "rewards/chosen": -1.5510780811309814, "rewards/margins": 0.5656118988990784, "rewards/rejected": -2.116690158843994, "sft_loss": 1.630822777748108, "step": 5535 }, { "epoch": 2.965044321792942, "grad_norm": 6.366283924334156, "learning_rate": 3.973417829510328e-10, "logits/chosen": -0.38514214754104614, "logits/rejected": -0.23634997010231018, "logps/chosen": -1.619696855545044, "logps/rejected": -2.1308228969573975, "loss": 0.6604, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.619696855545044, "rewards/margins": 0.5111261606216431, "rewards/rejected": -2.1308228969573975, "sft_loss": 1.5566799640655518, "step": 5540 }, { "epoch": 2.9677203545743436, "grad_norm": 5.181437814631028, "learning_rate": 3.3768902758274377e-10, "logits/chosen": -0.2535046637058258, "logits/rejected": -0.14234676957130432, "logps/chosen": -1.4892795085906982, "logps/rejected": -2.0935254096984863, "loss": 0.6488, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4892795085906982, "rewards/margins": 0.6042462587356567, "rewards/rejected": -2.0935254096984863, "sft_loss": 1.5298943519592285, "step": 5545 }, { "epoch": 2.970396387355745, "grad_norm": 2.7048965748061162, "learning_rate": 2.8288399814691e-10, "logits/chosen": -0.14938172698020935, "logits/rejected": -0.03671771287918091, "logps/chosen": -1.6389738321304321, "logps/rejected": -2.2659668922424316, "loss": 0.6439, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6389738321304321, "rewards/margins": 0.6269931793212891, "rewards/rejected": -2.2659668922424316, "sft_loss": 1.6594352722167969, "step": 5550 }, { "epoch": 2.9730724201371466, "grad_norm": 4.614081575961701, "learning_rate": 2.3292722636220066e-10, "logits/chosen": -0.25345319509506226, "logits/rejected": -0.04088393598794937, "logps/chosen": -1.6478992700576782, "logps/rejected": -2.488649606704712, "loss": 0.6508, "rewards/accuracies": 0.625, "rewards/chosen": -1.6478992700576782, "rewards/margins": 0.8407502174377441, "rewards/rejected": -2.488649606704712, "sft_loss": 1.6653321981430054, "step": 5555 }, { "epoch": 2.9757484529185483, "grad_norm": 5.103698961424684, "learning_rate": 1.8781919690946668e-10, "logits/chosen": -0.17153751850128174, "logits/rejected": -0.15024232864379883, "logps/chosen": -1.629041314125061, "logps/rejected": -2.036198377609253, "loss": 0.6636, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.629041314125061, "rewards/margins": 0.4071568548679352, "rewards/rejected": -2.036198377609253, "sft_loss": 1.6581542491912842, "step": 5560 }, { "epoch": 2.97842448569995, "grad_norm": 7.034656102261009, "learning_rate": 1.4756034742696711e-10, "logits/chosen": -0.2786411643028259, "logits/rejected": -0.21582278609275818, "logps/chosen": -1.4769775867462158, "logps/rejected": -2.1230950355529785, "loss": 0.6537, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4769775867462158, "rewards/margins": 0.646117627620697, "rewards/rejected": -2.1230950355529785, "sft_loss": 1.5027004480361938, "step": 5565 }, { "epoch": 2.9811005184813513, "grad_norm": 4.850132598841623, "learning_rate": 1.12151068506261e-10, "logits/chosen": -0.1930471956729889, "logits/rejected": -0.05508570745587349, "logps/chosen": -1.4973084926605225, "logps/rejected": -2.397449016571045, "loss": 0.6252, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4973084926605225, "rewards/margins": 0.9001402854919434, "rewards/rejected": -2.397449016571045, "sft_loss": 1.560623049736023, "step": 5570 }, { "epoch": 2.983776551262753, "grad_norm": 7.094611040703844, "learning_rate": 8.159170368826629e-11, "logits/chosen": -0.23316261172294617, "logits/rejected": -0.07678346335887909, "logps/chosen": -1.532983660697937, "logps/rejected": -2.2914743423461914, "loss": 0.6309, "rewards/accuracies": 0.75, "rewards/chosen": -1.532983660697937, "rewards/margins": 0.7584906816482544, "rewards/rejected": -2.2914743423461914, "sft_loss": 1.5512092113494873, "step": 5575 }, { "epoch": 2.9864525840441547, "grad_norm": 3.6689372276733128, "learning_rate": 5.588254946015114e-11, "logits/chosen": -0.32375577092170715, "logits/rejected": -0.057362549006938934, "logps/chosen": -1.4568369388580322, "logps/rejected": -2.2207484245300293, "loss": 0.6377, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4568369388580322, "rewards/margins": 0.7639115452766418, "rewards/rejected": -2.2207484245300293, "sft_loss": 1.5315742492675781, "step": 5580 }, { "epoch": 2.989128616825556, "grad_norm": 3.901840869174015, "learning_rate": 3.502385525216978e-11, "logits/chosen": -0.27934569120407104, "logits/rejected": -0.11083509773015976, "logps/chosen": -1.56743586063385, "logps/rejected": -2.3566184043884277, "loss": 0.6448, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.56743586063385, "rewards/margins": 0.7891825437545776, "rewards/rejected": -2.3566184043884277, "sft_loss": 1.6707375049591064, "step": 5585 }, { "epoch": 2.9918046496069577, "grad_norm": 2.7744122100860062, "learning_rate": 1.901582343555308e-11, "logits/chosen": -0.19820816814899445, "logits/rejected": -0.13724127411842346, "logps/chosen": -1.6933910846710205, "logps/rejected": -2.307555913925171, "loss": 0.6645, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6933910846710205, "rewards/margins": 0.6141648292541504, "rewards/rejected": -2.307555913925171, "sft_loss": 1.6180213689804077, "step": 5590 }, { "epoch": 2.9944806823883594, "grad_norm": 4.146374408259338, "learning_rate": 7.858609320232634e-12, "logits/chosen": -0.22346720099449158, "logits/rejected": -0.0670088678598404, "logps/chosen": -1.53458571434021, "logps/rejected": -2.1950907707214355, "loss": 0.6368, "rewards/accuracies": 0.6875, "rewards/chosen": -1.53458571434021, "rewards/margins": 0.6605050563812256, "rewards/rejected": -2.1950907707214355, "sft_loss": 1.5120246410369873, "step": 5595 }, { "epoch": 2.9971567151697607, "grad_norm": 4.666700730222403, "learning_rate": 1.5523211535639624e-12, "logits/chosen": -0.21801479160785675, "logits/rejected": -0.1040673479437828, "logps/chosen": -1.5110822916030884, "logps/rejected": -2.4874258041381836, "loss": 0.6368, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5110822916030884, "rewards/margins": 0.9763437509536743, "rewards/rejected": -2.4874258041381836, "sft_loss": 1.6090261936187744, "step": 5600 }, { "epoch": 2.9971567151697607, "eval_logits/chosen": 0.11780353635549545, "eval_logits/rejected": 0.21422991156578064, "eval_logps/chosen": -1.6016647815704346, "eval_logps/rejected": -2.2330212593078613, "eval_loss": 0.6684653759002686, "eval_rewards/accuracies": 0.6505934596061707, "eval_rewards/chosen": -1.6016647815704346, "eval_rewards/margins": 0.6313564777374268, "eval_rewards/rejected": -2.2330212593078613, "eval_runtime": 43.1555, "eval_samples_per_second": 31.166, "eval_sft_loss": 1.5896871089935303, "eval_steps_per_second": 7.809, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, "train_loss": 0.6685084665272595, "train_runtime": 33774.3849, "train_samples_per_second": 5.311, "train_steps_per_second": 0.166 } ], "logging_steps": 5, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }