diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,23868 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 15284, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 3.270111183780249e-09, + "logits/chosen": -2.634561777114868, + "logits/rejected": -2.673060417175293, + "logps/chosen": -207.5323944091797, + "logps/rejected": -286.9266052246094, + "loss": 0.0999, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 3.270111183780249e-08, + "logits/chosen": -2.2176363468170166, + "logits/rejected": -1.965279459953308, + "logps/chosen": -185.93206787109375, + "logps/rejected": -165.39874267578125, + "loss": 0.0677, + "rewards/accuracies": 0.2777777910232544, + "rewards/chosen": -5.8274250477552414e-05, + "rewards/margins": -0.0002318211190868169, + "rewards/rejected": 0.0001735468686092645, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 6.540222367560497e-08, + "logits/chosen": -2.4317684173583984, + "logits/rejected": -2.2229201793670654, + "logps/chosen": -232.4213409423828, + "logps/rejected": -231.39962768554688, + "loss": 0.0519, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.00013405675417743623, + "rewards/margins": 1.7013855540426448e-05, + "rewards/rejected": 0.00011704283679137006, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 9.810333551340746e-08, + "logits/chosen": -2.258877754211426, + "logits/rejected": -2.162977695465088, + "logps/chosen": -197.35354614257812, + "logps/rejected": -219.13766479492188, + "loss": 0.0565, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.00035292975371703506, + "rewards/margins": 0.0005842813989147544, + "rewards/rejected": -0.00023135158699005842, + "step": 30 + }, + { + "epoch": 0.0, + "learning_rate": 1.3080444735120995e-07, + "logits/chosen": -2.2115323543548584, + "logits/rejected": -2.2505645751953125, + "logps/chosen": -276.000244140625, + "logps/rejected": -265.74652099609375, + "loss": 0.0472, + "rewards/accuracies": 0.375, + "rewards/chosen": -6.294570630416274e-05, + "rewards/margins": -0.00012715658522211015, + "rewards/rejected": 6.421087891794741e-05, + "step": 40 + }, + { + "epoch": 0.0, + "learning_rate": 1.6350555918901243e-07, + "logits/chosen": -2.348076581954956, + "logits/rejected": -2.141223192214966, + "logps/chosen": -204.8376007080078, + "logps/rejected": -184.71292114257812, + "loss": 0.0762, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.00016025640070438385, + "rewards/margins": -7.07902800058946e-05, + "rewards/rejected": -8.946609159465879e-05, + "step": 50 + }, + { + "epoch": 0.0, + "learning_rate": 1.9620667102681492e-07, + "logits/chosen": -2.3073477745056152, + "logits/rejected": -2.067842483520508, + "logps/chosen": -209.7217254638672, + "logps/rejected": -185.87832641601562, + "loss": 0.0891, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": 0.00042088530608452857, + "rewards/margins": -0.00010498131450731307, + "rewards/rejected": 0.000525866576936096, + "step": 60 + }, + { + "epoch": 0.0, + "learning_rate": 2.289077828646174e-07, + "logits/chosen": -2.2689318656921387, + "logits/rejected": -2.1558871269226074, + "logps/chosen": -218.03952026367188, + "logps/rejected": -207.98361206054688, + "loss": 0.0432, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.00035006407415494323, + "rewards/margins": 6.475891859736294e-05, + "rewards/rejected": 0.0002853051701094955, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 2.616088947024199e-07, + "logits/chosen": -2.5095014572143555, + "logits/rejected": -2.228682041168213, + "logps/chosen": -258.87213134765625, + "logps/rejected": -213.68508911132812, + "loss": 0.0631, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0003914732369594276, + "rewards/margins": -9.930254600476474e-05, + "rewards/rejected": 0.0004907757975161076, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 2.943100065402224e-07, + "logits/chosen": -2.258244037628174, + "logits/rejected": -2.173391819000244, + "logps/chosen": -184.66891479492188, + "logps/rejected": -165.49636840820312, + "loss": 0.0224, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.0005794145399704576, + "rewards/margins": 0.0002798104251269251, + "rewards/rejected": 0.00029960396932438016, + "step": 90 + }, + { + "epoch": 0.01, + "learning_rate": 3.2701111837802487e-07, + "logits/chosen": -2.431363344192505, + "logits/rejected": -2.425813674926758, + "logps/chosen": -168.80804443359375, + "logps/rejected": -183.89962768554688, + "loss": 0.0706, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.0006342270062305033, + "rewards/margins": -0.0003130243276245892, + "rewards/rejected": 0.0009472514502704144, + "step": 100 + }, + { + "epoch": 0.01, + "eval_logits/chosen": -2.3487260341644287, + "eval_logits/rejected": -2.1602516174316406, + "eval_logps/chosen": -231.7681884765625, + "eval_logps/rejected": -211.45697021484375, + "eval_loss": 0.05356210842728615, + "eval_rewards/accuracies": 0.4925000071525574, + "eval_rewards/chosen": 0.0011838467326015234, + "eval_rewards/margins": 0.0004092271556146443, + "eval_rewards/rejected": 0.0007746195769868791, + "eval_runtime": 712.6016, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.403, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 3.5971223021582736e-07, + "logits/chosen": -2.3473939895629883, + "logits/rejected": -1.9998290538787842, + "logps/chosen": -222.42153930664062, + "logps/rejected": -167.03335571289062, + "loss": 0.0746, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.0018700523069128394, + "rewards/margins": 0.000863722525537014, + "rewards/rejected": 0.0010063296649605036, + "step": 110 + }, + { + "epoch": 0.01, + "learning_rate": 3.9241334205362984e-07, + "logits/chosen": -2.340332508087158, + "logits/rejected": -2.245119571685791, + "logps/chosen": -224.01327514648438, + "logps/rejected": -234.0855255126953, + "loss": 0.0405, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": 0.0013942383229732513, + "rewards/margins": -3.433373785810545e-05, + "rewards/rejected": 0.0014285718789324164, + "step": 120 + }, + { + "epoch": 0.01, + "learning_rate": 4.251144538914324e-07, + "logits/chosen": -2.261899471282959, + "logits/rejected": -2.217729091644287, + "logps/chosen": -149.3909454345703, + "logps/rejected": -148.33004760742188, + "loss": 0.0442, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.0012747000437229872, + "rewards/margins": 0.0008335200254805386, + "rewards/rejected": 0.0004411800764501095, + "step": 130 + }, + { + "epoch": 0.01, + "learning_rate": 4.578155657292348e-07, + "logits/chosen": -2.3222341537475586, + "logits/rejected": -2.223220109939575, + "logps/chosen": -225.5640106201172, + "logps/rejected": -159.45921325683594, + "loss": 0.0515, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0019747335463762283, + "rewards/margins": 0.000815176113974303, + "rewards/rejected": 0.0011595574906095862, + "step": 140 + }, + { + "epoch": 0.01, + "learning_rate": 4.905166775670374e-07, + "logits/chosen": -2.367952585220337, + "logits/rejected": -2.1589996814727783, + "logps/chosen": -231.03799438476562, + "logps/rejected": -229.10598754882812, + "loss": 0.097, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0015882834559306502, + "rewards/margins": 0.001329472055658698, + "rewards/rejected": 0.0002588116331025958, + "step": 150 + }, + { + "epoch": 0.01, + "learning_rate": 5.232177894048398e-07, + "logits/chosen": -2.213862180709839, + "logits/rejected": -2.2266170978546143, + "logps/chosen": -260.2805480957031, + "logps/rejected": -224.8443145751953, + "loss": 0.0642, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.002939788158982992, + "rewards/margins": 0.002163815079256892, + "rewards/rejected": 0.0007759730797261, + "step": 160 + }, + { + "epoch": 0.01, + "learning_rate": 5.559189012426422e-07, + "logits/chosen": -2.314908981323242, + "logits/rejected": -2.0375542640686035, + "logps/chosen": -180.51136779785156, + "logps/rejected": -156.74420166015625, + "loss": 0.0532, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0017061032122001052, + "rewards/margins": 0.0009744632989168167, + "rewards/rejected": 0.0007316397386603057, + "step": 170 + }, + { + "epoch": 0.01, + "learning_rate": 5.886200130804448e-07, + "logits/chosen": -2.3991408348083496, + "logits/rejected": -2.3391599655151367, + "logps/chosen": -217.56588745117188, + "logps/rejected": -198.68756103515625, + "loss": 0.0484, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0033133826218545437, + "rewards/margins": 0.0027288985438644886, + "rewards/rejected": 0.0005844842526130378, + "step": 180 + }, + { + "epoch": 0.01, + "learning_rate": 6.213211249182473e-07, + "logits/chosen": -2.0700626373291016, + "logits/rejected": -2.175288677215576, + "logps/chosen": -190.9939727783203, + "logps/rejected": -208.7053985595703, + "loss": 0.0427, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0025113599840551615, + "rewards/margins": 0.0027750071603804827, + "rewards/rejected": -0.000263646972598508, + "step": 190 + }, + { + "epoch": 0.01, + "learning_rate": 6.540222367560497e-07, + "logits/chosen": -2.2762703895568848, + "logits/rejected": -2.2404098510742188, + "logps/chosen": -146.71702575683594, + "logps/rejected": -177.6888427734375, + "loss": 0.0614, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0024750891607254744, + "rewards/margins": 0.003276436123996973, + "rewards/rejected": -0.0008013470214791596, + "step": 200 + }, + { + "epoch": 0.01, + "eval_logits/chosen": -2.349438428878784, + "eval_logits/rejected": -2.1609809398651123, + "eval_logps/chosen": -231.4156036376953, + "eval_logps/rejected": -211.6427459716797, + "eval_loss": 0.05235092341899872, + "eval_rewards/accuracies": 0.5889999866485596, + "eval_rewards/chosen": 0.0029466315172612667, + "eval_rewards/margins": 0.0031008960213512182, + "eval_rewards/rejected": -0.00015426499885506928, + "eval_runtime": 715.0358, + "eval_samples_per_second": 2.797, + "eval_steps_per_second": 1.399, + "step": 200 + }, + { + "epoch": 0.01, + "learning_rate": 6.867233485938523e-07, + "logits/chosen": -2.4264657497406006, + "logits/rejected": -2.202401638031006, + "logps/chosen": -218.64462280273438, + "logps/rejected": -188.3276824951172, + "loss": 0.0449, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.004444460850208998, + "rewards/margins": 0.005253319162875414, + "rewards/rejected": -0.000808858429081738, + "step": 210 + }, + { + "epoch": 0.01, + "learning_rate": 7.194244604316547e-07, + "logits/chosen": -2.2130541801452637, + "logits/rejected": -2.040802240371704, + "logps/chosen": -182.76869201660156, + "logps/rejected": -174.8404083251953, + "loss": 0.0287, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.002838475862517953, + "rewards/margins": 0.0023240107111632824, + "rewards/rejected": 0.000514464860316366, + "step": 220 + }, + { + "epoch": 0.02, + "learning_rate": 7.521255722694571e-07, + "logits/chosen": -2.4207396507263184, + "logits/rejected": -2.0251927375793457, + "logps/chosen": -278.82373046875, + "logps/rejected": -184.07823181152344, + "loss": 0.0582, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.004778166767209768, + "rewards/margins": 0.0035492833703756332, + "rewards/rejected": 0.001228884095326066, + "step": 230 + }, + { + "epoch": 0.02, + "learning_rate": 7.848266841072597e-07, + "logits/chosen": -2.2012083530426025, + "logits/rejected": -2.1659531593322754, + "logps/chosen": -214.45327758789062, + "logps/rejected": -206.27804565429688, + "loss": 0.0648, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.006988098379224539, + "rewards/margins": 0.0069609819911420345, + "rewards/rejected": 2.711568959057331e-05, + "step": 240 + }, + { + "epoch": 0.02, + "learning_rate": 8.175277959450622e-07, + "logits/chosen": -2.168360471725464, + "logits/rejected": -2.3281311988830566, + "logps/chosen": -217.9949493408203, + "logps/rejected": -220.6697540283203, + "loss": 0.0215, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.012156028300523758, + "rewards/margins": 0.005674418993294239, + "rewards/rejected": 0.0064816102385520935, + "step": 250 + }, + { + "epoch": 0.02, + "learning_rate": 8.502289077828648e-07, + "logits/chosen": -2.5064778327941895, + "logits/rejected": -2.1448421478271484, + "logps/chosen": -254.0000457763672, + "logps/rejected": -188.9385986328125, + "loss": 0.0525, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.015774715691804886, + "rewards/margins": 0.006485472433269024, + "rewards/rejected": 0.009289243258535862, + "step": 260 + }, + { + "epoch": 0.02, + "learning_rate": 8.829300196206672e-07, + "logits/chosen": -2.423475742340088, + "logits/rejected": -2.1279618740081787, + "logps/chosen": -246.0577392578125, + "logps/rejected": -230.53659057617188, + "loss": 0.0873, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.013765650801360607, + "rewards/margins": 0.008541886694729328, + "rewards/rejected": 0.005223765503615141, + "step": 270 + }, + { + "epoch": 0.02, + "learning_rate": 9.156311314584696e-07, + "logits/chosen": -2.3068556785583496, + "logits/rejected": -2.193019151687622, + "logps/chosen": -159.5777130126953, + "logps/rejected": -146.2738494873047, + "loss": 0.0397, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.013053646311163902, + "rewards/margins": 0.0058270045556128025, + "rewards/rejected": 0.007226643152534962, + "step": 280 + }, + { + "epoch": 0.02, + "learning_rate": 9.483322432962722e-07, + "logits/chosen": -2.554816722869873, + "logits/rejected": -2.160402536392212, + "logps/chosen": -281.8065185546875, + "logps/rejected": -225.3990478515625, + "loss": 0.0275, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.016861405223608017, + "rewards/margins": 0.005430780816823244, + "rewards/rejected": 0.011430625803768635, + "step": 290 + }, + { + "epoch": 0.02, + "learning_rate": 9.810333551340747e-07, + "logits/chosen": -2.341587543487549, + "logits/rejected": -2.1964211463928223, + "logps/chosen": -264.63897705078125, + "logps/rejected": -238.7764129638672, + "loss": 0.0495, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.020103512331843376, + "rewards/margins": 0.01393399853259325, + "rewards/rejected": 0.006169513799250126, + "step": 300 + }, + { + "epoch": 0.02, + "eval_logits/chosen": -2.3531229496002197, + "eval_logits/rejected": -2.1645307540893555, + "eval_logps/chosen": -228.2109832763672, + "eval_logps/rejected": -209.68736267089844, + "eval_loss": 0.049944277852773666, + "eval_rewards/accuracies": 0.5805000066757202, + "eval_rewards/chosen": 0.018969887867569923, + "eval_rewards/margins": 0.009347214363515377, + "eval_rewards/rejected": 0.009622674435377121, + "eval_runtime": 711.7456, + "eval_samples_per_second": 2.81, + "eval_steps_per_second": 1.405, + "step": 300 + }, + { + "epoch": 0.02, + "learning_rate": 1.0137344669718771e-06, + "logits/chosen": -2.350700855255127, + "logits/rejected": -2.365297317504883, + "logps/chosen": -166.68600463867188, + "logps/rejected": -155.6295623779297, + "loss": 0.0358, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.014021609909832478, + "rewards/margins": 0.002970766741782427, + "rewards/rejected": 0.011050843633711338, + "step": 310 + }, + { + "epoch": 0.02, + "learning_rate": 1.0464355788096796e-06, + "logits/chosen": -2.456861972808838, + "logits/rejected": -2.061683177947998, + "logps/chosen": -220.7519073486328, + "logps/rejected": -191.7381591796875, + "loss": 0.0499, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.022065896540880203, + "rewards/margins": 0.008685490116477013, + "rewards/rejected": 0.013380405493080616, + "step": 320 + }, + { + "epoch": 0.02, + "learning_rate": 1.079136690647482e-06, + "logits/chosen": -2.437948703765869, + "logits/rejected": -2.2054593563079834, + "logps/chosen": -202.8629150390625, + "logps/rejected": -175.33372497558594, + "loss": 0.0886, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02774173393845558, + "rewards/margins": 0.013089954853057861, + "rewards/rejected": 0.014651775360107422, + "step": 330 + }, + { + "epoch": 0.02, + "learning_rate": 1.1118378024852844e-06, + "logits/chosen": -2.1878809928894043, + "logits/rejected": -2.354051113128662, + "logps/chosen": -150.0569610595703, + "logps/rejected": -177.16937255859375, + "loss": 0.0381, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.018287161365151405, + "rewards/margins": -0.0016521146753802896, + "rewards/rejected": 0.019939277321100235, + "step": 340 + }, + { + "epoch": 0.02, + "learning_rate": 1.144538914323087e-06, + "logits/chosen": -2.4268805980682373, + "logits/rejected": -1.9881579875946045, + "logps/chosen": -317.0654602050781, + "logps/rejected": -247.81021118164062, + "loss": 0.0528, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.025226067751646042, + "rewards/margins": 0.017956417053937912, + "rewards/rejected": 0.007269649300724268, + "step": 350 + }, + { + "epoch": 0.02, + "learning_rate": 1.1772400261608895e-06, + "logits/chosen": -2.4913463592529297, + "logits/rejected": -2.197359085083008, + "logps/chosen": -219.1552276611328, + "logps/rejected": -192.1640167236328, + "loss": 0.0422, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.03162415325641632, + "rewards/margins": 0.01586632803082466, + "rewards/rejected": 0.01575782522559166, + "step": 360 + }, + { + "epoch": 0.02, + "learning_rate": 1.2099411379986922e-06, + "logits/chosen": -2.1705100536346436, + "logits/rejected": -2.2467100620269775, + "logps/chosen": -191.59078979492188, + "logps/rejected": -205.70358276367188, + "loss": 0.0569, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": 0.029529735445976257, + "rewards/margins": 0.006725566927343607, + "rewards/rejected": 0.022804168984293938, + "step": 370 + }, + { + "epoch": 0.02, + "learning_rate": 1.2426422498364946e-06, + "logits/chosen": -2.332655429840088, + "logits/rejected": -2.044221878051758, + "logps/chosen": -215.96444702148438, + "logps/rejected": -161.74484252929688, + "loss": 0.0443, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.02841884456574917, + "rewards/margins": 0.016788840293884277, + "rewards/rejected": 0.011630003340542316, + "step": 380 + }, + { + "epoch": 0.03, + "learning_rate": 1.2753433616742968e-06, + "logits/chosen": -2.329526424407959, + "logits/rejected": -2.2495083808898926, + "logps/chosen": -180.93544006347656, + "logps/rejected": -244.7272186279297, + "loss": 0.0764, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.02182791940867901, + "rewards/margins": 0.01554826833307743, + "rewards/rejected": 0.006279653403908014, + "step": 390 + }, + { + "epoch": 0.03, + "learning_rate": 1.3080444735120995e-06, + "logits/chosen": -2.489765167236328, + "logits/rejected": -2.1206138134002686, + "logps/chosen": -219.66854858398438, + "logps/rejected": -179.7932891845703, + "loss": 0.065, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0291206743568182, + "rewards/margins": 0.026874784380197525, + "rewards/rejected": 0.002245891373604536, + "step": 400 + }, + { + "epoch": 0.03, + "eval_logits/chosen": -2.3541910648345947, + "eval_logits/rejected": -2.1655361652374268, + "eval_logps/chosen": -227.16796875, + "eval_logps/rejected": -210.1238555908203, + "eval_loss": 0.04696543887257576, + "eval_rewards/accuracies": 0.5979999899864197, + "eval_rewards/chosen": 0.024184904992580414, + "eval_rewards/margins": 0.016744675114750862, + "eval_rewards/rejected": 0.007440229412168264, + "eval_runtime": 711.4016, + "eval_samples_per_second": 2.811, + "eval_steps_per_second": 1.406, + "step": 400 + }, + { + "epoch": 0.03, + "learning_rate": 1.3407455853499021e-06, + "logits/chosen": -2.4746851921081543, + "logits/rejected": -2.323978900909424, + "logps/chosen": -256.46856689453125, + "logps/rejected": -223.9653778076172, + "loss": 0.0368, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.029423344880342484, + "rewards/margins": 0.01946599781513214, + "rewards/rejected": 0.009957349859178066, + "step": 410 + }, + { + "epoch": 0.03, + "learning_rate": 1.3734466971877046e-06, + "logits/chosen": -2.3022310733795166, + "logits/rejected": -2.201465606689453, + "logps/chosen": -176.49359130859375, + "logps/rejected": -173.44068908691406, + "loss": 0.0444, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0271303653717041, + "rewards/margins": 0.022079220041632652, + "rewards/rejected": 0.005051146261394024, + "step": 420 + }, + { + "epoch": 0.03, + "learning_rate": 1.406147809025507e-06, + "logits/chosen": -2.2948622703552246, + "logits/rejected": -2.107673168182373, + "logps/chosen": -209.0362091064453, + "logps/rejected": -181.45956420898438, + "loss": 0.0591, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.03055526874959469, + "rewards/margins": 0.024613896384835243, + "rewards/rejected": 0.005941365379840136, + "step": 430 + }, + { + "epoch": 0.03, + "learning_rate": 1.4388489208633094e-06, + "logits/chosen": -2.394108295440674, + "logits/rejected": -2.107898235321045, + "logps/chosen": -250.1384735107422, + "logps/rejected": -222.7085418701172, + "loss": 0.045, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.021793970838189125, + "rewards/margins": 0.012408947572112083, + "rewards/rejected": 0.009385021403431892, + "step": 440 + }, + { + "epoch": 0.03, + "learning_rate": 1.471550032701112e-06, + "logits/chosen": -2.413292169570923, + "logits/rejected": -2.2094132900238037, + "logps/chosen": -185.10714721679688, + "logps/rejected": -198.40866088867188, + "loss": 0.0766, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.022703688591718674, + "rewards/margins": 0.0375007800757885, + "rewards/rejected": -0.014797091484069824, + "step": 450 + }, + { + "epoch": 0.03, + "learning_rate": 1.5042511445389143e-06, + "logits/chosen": -2.156865358352661, + "logits/rejected": -2.204716920852661, + "logps/chosen": -146.9668426513672, + "logps/rejected": -214.1243133544922, + "loss": 0.0952, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.01986575871706009, + "rewards/margins": 0.01865716651082039, + "rewards/rejected": 0.0012085925554856658, + "step": 460 + }, + { + "epoch": 0.03, + "learning_rate": 1.536952256376717e-06, + "logits/chosen": -2.046077013015747, + "logits/rejected": -2.108354330062866, + "logps/chosen": -197.14544677734375, + "logps/rejected": -252.22900390625, + "loss": 0.0563, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01892588473856449, + "rewards/margins": 0.04759754613041878, + "rewards/rejected": -0.028671661391854286, + "step": 470 + }, + { + "epoch": 0.03, + "learning_rate": 1.5696533682145194e-06, + "logits/chosen": -2.4412786960601807, + "logits/rejected": -2.2085611820220947, + "logps/chosen": -178.488037109375, + "logps/rejected": -153.7083740234375, + "loss": 0.0382, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.02879374288022518, + "rewards/margins": 0.03139546141028404, + "rewards/rejected": -0.0026017185300588608, + "step": 480 + }, + { + "epoch": 0.03, + "learning_rate": 1.602354480052322e-06, + "logits/chosen": -2.4102816581726074, + "logits/rejected": -2.3218040466308594, + "logps/chosen": -260.2879638671875, + "logps/rejected": -209.857177734375, + "loss": 0.0475, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.03175920993089676, + "rewards/margins": 0.02260466292500496, + "rewards/rejected": 0.00915454886853695, + "step": 490 + }, + { + "epoch": 0.03, + "learning_rate": 1.6350555918901245e-06, + "logits/chosen": -2.1970908641815186, + "logits/rejected": -1.8694721460342407, + "logps/chosen": -218.60458374023438, + "logps/rejected": -216.32119750976562, + "loss": 0.04, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0026772418059408665, + "rewards/margins": 0.061300646513700485, + "rewards/rejected": -0.06397788971662521, + "step": 500 + }, + { + "epoch": 0.03, + "eval_logits/chosen": -2.3290927410125732, + "eval_logits/rejected": -2.1417860984802246, + "eval_logps/chosen": -242.22723388671875, + "eval_logps/rejected": -229.62611389160156, + "eval_loss": 0.04155317693948746, + "eval_rewards/accuracies": 0.612500011920929, + "eval_rewards/chosen": -0.05111142620444298, + "eval_rewards/margins": 0.0389595590531826, + "eval_rewards/rejected": -0.09007100015878677, + "eval_runtime": 715.9988, + "eval_samples_per_second": 2.793, + "eval_steps_per_second": 1.397, + "step": 500 + }, + { + "epoch": 0.03, + "learning_rate": 1.6677567037279269e-06, + "logits/chosen": -2.4629549980163574, + "logits/rejected": -2.116116523742676, + "logps/chosen": -308.35797119140625, + "logps/rejected": -279.0551452636719, + "loss": 0.015, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.060471516102552414, + "rewards/margins": 0.055410999804735184, + "rewards/rejected": -0.1158825010061264, + "step": 510 + }, + { + "epoch": 0.03, + "learning_rate": 1.7004578155657295e-06, + "logits/chosen": -2.2604610919952393, + "logits/rejected": -2.2810425758361816, + "logps/chosen": -222.1859588623047, + "logps/rejected": -210.0410614013672, + "loss": 0.0338, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07733304798603058, + "rewards/margins": 0.04137866944074631, + "rewards/rejected": -0.11871170997619629, + "step": 520 + }, + { + "epoch": 0.03, + "learning_rate": 1.7331589274035318e-06, + "logits/chosen": -2.134936809539795, + "logits/rejected": -2.0191311836242676, + "logps/chosen": -194.81320190429688, + "logps/rejected": -204.94418334960938, + "loss": 0.047, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05443320795893669, + "rewards/margins": 0.05342613533139229, + "rewards/rejected": -0.10785935074090958, + "step": 530 + }, + { + "epoch": 0.04, + "learning_rate": 1.7658600392413344e-06, + "logits/chosen": -2.3616204261779785, + "logits/rejected": -2.203824520111084, + "logps/chosen": -219.65774536132812, + "logps/rejected": -208.9588165283203, + "loss": 0.0389, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.08401162922382355, + "rewards/margins": 0.0333532877266407, + "rewards/rejected": -0.11736490577459335, + "step": 540 + }, + { + "epoch": 0.04, + "learning_rate": 1.7985611510791368e-06, + "logits/chosen": -2.3590781688690186, + "logits/rejected": -1.9083945751190186, + "logps/chosen": -275.25823974609375, + "logps/rejected": -266.81097412109375, + "loss": 0.034, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.058542896062135696, + "rewards/margins": 0.03970601409673691, + "rewards/rejected": -0.0982489138841629, + "step": 550 + }, + { + "epoch": 0.04, + "learning_rate": 1.8312622629169393e-06, + "logits/chosen": -2.269632577896118, + "logits/rejected": -2.1327080726623535, + "logps/chosen": -279.70147705078125, + "logps/rejected": -247.4019012451172, + "loss": 0.0447, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.08123533427715302, + "rewards/margins": 0.011105300858616829, + "rewards/rejected": -0.0923406332731247, + "step": 560 + }, + { + "epoch": 0.04, + "learning_rate": 1.8639633747547417e-06, + "logits/chosen": -2.358708620071411, + "logits/rejected": -2.187506675720215, + "logps/chosen": -244.6337127685547, + "logps/rejected": -213.9905548095703, + "loss": 0.0687, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07744290679693222, + "rewards/margins": 0.024968545883893967, + "rewards/rejected": -0.10241146385669708, + "step": 570 + }, + { + "epoch": 0.04, + "learning_rate": 1.8966644865925443e-06, + "logits/chosen": -2.209902286529541, + "logits/rejected": -2.238129138946533, + "logps/chosen": -255.62509155273438, + "logps/rejected": -270.8184814453125, + "loss": 0.0611, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.08208145946264267, + "rewards/margins": 0.018994811922311783, + "rewards/rejected": -0.10107628256082535, + "step": 580 + }, + { + "epoch": 0.04, + "learning_rate": 1.9293655984303466e-06, + "logits/chosen": -2.6335816383361816, + "logits/rejected": -2.211545467376709, + "logps/chosen": -296.67333984375, + "logps/rejected": -218.890869140625, + "loss": 0.062, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06853550672531128, + "rewards/margins": 0.026111140847206116, + "rewards/rejected": -0.0946466475725174, + "step": 590 + }, + { + "epoch": 0.04, + "learning_rate": 1.9620667102681494e-06, + "logits/chosen": -2.366116762161255, + "logits/rejected": -2.3787477016448975, + "logps/chosen": -199.81320190429688, + "logps/rejected": -188.07069396972656, + "loss": 0.0313, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.052803684026002884, + "rewards/margins": 0.012890547513961792, + "rewards/rejected": -0.06569422781467438, + "step": 600 + }, + { + "epoch": 0.04, + "eval_logits/chosen": -2.3278696537017822, + "eval_logits/rejected": -2.140028715133667, + "eval_logps/chosen": -241.02426147460938, + "eval_logps/rejected": -227.2308807373047, + "eval_loss": 0.041324835270643234, + "eval_rewards/accuracies": 0.6159999966621399, + "eval_rewards/chosen": -0.04509655386209488, + "eval_rewards/margins": 0.032998330891132355, + "eval_rewards/rejected": -0.07809487730264664, + "eval_runtime": 714.8275, + "eval_samples_per_second": 2.798, + "eval_steps_per_second": 1.399, + "step": 600 + }, + { + "epoch": 0.04, + "learning_rate": 1.994767822105952e-06, + "logits/chosen": -2.27354097366333, + "logits/rejected": -2.180142402648926, + "logps/chosen": -182.80899047851562, + "logps/rejected": -197.1210174560547, + "loss": 0.0782, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.038026656955480576, + "rewards/margins": 0.03938506916165352, + "rewards/rejected": -0.0774117261171341, + "step": 610 + }, + { + "epoch": 0.04, + "learning_rate": 2.0274689339437543e-06, + "logits/chosen": -2.2067208290100098, + "logits/rejected": -1.9959625005722046, + "logps/chosen": -282.072509765625, + "logps/rejected": -252.33084106445312, + "loss": 0.0265, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.016913847997784615, + "rewards/margins": 0.0280628465116024, + "rewards/rejected": -0.044976696372032166, + "step": 620 + }, + { + "epoch": 0.04, + "learning_rate": 2.0601700457815567e-06, + "logits/chosen": -2.360585927963257, + "logits/rejected": -1.9948257207870483, + "logps/chosen": -266.9506530761719, + "logps/rejected": -228.4451141357422, + "loss": 0.0394, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03797965496778488, + "rewards/margins": 0.025038376450538635, + "rewards/rejected": -0.06301803141832352, + "step": 630 + }, + { + "epoch": 0.04, + "learning_rate": 2.092871157619359e-06, + "logits/chosen": -2.409940004348755, + "logits/rejected": -2.2259554862976074, + "logps/chosen": -180.29090881347656, + "logps/rejected": -202.35862731933594, + "loss": 0.0327, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.018143683671951294, + "rewards/margins": 0.04178124666213989, + "rewards/rejected": -0.059924930334091187, + "step": 640 + }, + { + "epoch": 0.04, + "learning_rate": 2.1255722694571616e-06, + "logits/chosen": -2.4271957874298096, + "logits/rejected": -2.0304219722747803, + "logps/chosen": -279.4947204589844, + "logps/rejected": -195.42269897460938, + "loss": 0.0465, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.003000382799655199, + "rewards/margins": 0.04447781294584274, + "rewards/rejected": -0.04747819900512695, + "step": 650 + }, + { + "epoch": 0.04, + "learning_rate": 2.158273381294964e-06, + "logits/chosen": -2.3339033126831055, + "logits/rejected": -2.228659152984619, + "logps/chosen": -211.0321044921875, + "logps/rejected": -219.0637664794922, + "loss": 0.041, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.006974533200263977, + "rewards/margins": 0.012601134367287159, + "rewards/rejected": -0.005626601167023182, + "step": 660 + }, + { + "epoch": 0.04, + "learning_rate": 2.190974493132767e-06, + "logits/chosen": -2.330313205718994, + "logits/rejected": -2.0267796516418457, + "logps/chosen": -235.9166259765625, + "logps/rejected": -188.2518768310547, + "loss": 0.0471, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.008812380954623222, + "rewards/margins": 0.042960282415151596, + "rewards/rejected": -0.034147895872592926, + "step": 670 + }, + { + "epoch": 0.04, + "learning_rate": 2.223675604970569e-06, + "logits/chosen": -2.39384388923645, + "logits/rejected": -2.1898155212402344, + "logps/chosen": -208.85214233398438, + "logps/rejected": -198.39894104003906, + "loss": 0.0353, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.009410612285137177, + "rewards/margins": 0.034385450184345245, + "rewards/rejected": -0.02497483789920807, + "step": 680 + }, + { + "epoch": 0.05, + "learning_rate": 2.2563767168083718e-06, + "logits/chosen": -2.4716198444366455, + "logits/rejected": -1.982600212097168, + "logps/chosen": -253.2783660888672, + "logps/rejected": -198.31503295898438, + "loss": 0.0571, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.019866937771439552, + "rewards/margins": 0.04034467041492462, + "rewards/rejected": -0.02047773264348507, + "step": 690 + }, + { + "epoch": 0.05, + "learning_rate": 2.289077828646174e-06, + "logits/chosen": -2.3059027194976807, + "logits/rejected": -2.070786952972412, + "logps/chosen": -238.1765594482422, + "logps/rejected": -219.28604125976562, + "loss": 0.0519, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.004421982914209366, + "rewards/margins": 0.029075268656015396, + "rewards/rejected": -0.024653282016515732, + "step": 700 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -2.3373641967773438, + "eval_logits/rejected": -2.1490402221679688, + "eval_logps/chosen": -224.81228637695312, + "eval_logps/rejected": -211.1453399658203, + "eval_loss": 0.04084743186831474, + "eval_rewards/accuracies": 0.6154999732971191, + "eval_rewards/chosen": 0.035963330417871475, + "eval_rewards/margins": 0.03363055735826492, + "eval_rewards/rejected": 0.0023327735252678394, + "eval_runtime": 711.8752, + "eval_samples_per_second": 2.809, + "eval_steps_per_second": 1.405, + "step": 700 + }, + { + "epoch": 0.05, + "learning_rate": 2.3217789404839766e-06, + "logits/chosen": -2.1930441856384277, + "logits/rejected": -2.278282880783081, + "logps/chosen": -153.32977294921875, + "logps/rejected": -205.1284637451172, + "loss": 0.0277, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.01750679686665535, + "rewards/margins": 0.024280013516545296, + "rewards/rejected": -0.006773218512535095, + "step": 710 + }, + { + "epoch": 0.05, + "learning_rate": 2.354480052321779e-06, + "logits/chosen": -2.521277666091919, + "logits/rejected": -2.0904335975646973, + "logps/chosen": -249.94601440429688, + "logps/rejected": -201.07015991210938, + "loss": 0.0492, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.042217105627059937, + "rewards/margins": 0.05382740497589111, + "rewards/rejected": -0.0116103021427989, + "step": 720 + }, + { + "epoch": 0.05, + "learning_rate": 2.3871811641595815e-06, + "logits/chosen": -2.344003200531006, + "logits/rejected": -2.1647818088531494, + "logps/chosen": -242.52621459960938, + "logps/rejected": -190.82498168945312, + "loss": 0.0245, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.021331200376152992, + "rewards/margins": 0.03477324917912483, + "rewards/rejected": -0.013442049734294415, + "step": 730 + }, + { + "epoch": 0.05, + "learning_rate": 2.4198822759973843e-06, + "logits/chosen": -2.177978992462158, + "logits/rejected": -2.1955149173736572, + "logps/chosen": -192.08859252929688, + "logps/rejected": -215.31790161132812, + "loss": 0.0619, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.007412579841911793, + "rewards/margins": 0.046483904123306274, + "rewards/rejected": -0.05389648675918579, + "step": 740 + }, + { + "epoch": 0.05, + "learning_rate": 2.4525833878351864e-06, + "logits/chosen": -2.4476373195648193, + "logits/rejected": -2.2447783946990967, + "logps/chosen": -247.8760528564453, + "logps/rejected": -192.84451293945312, + "loss": 0.0299, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02462439239025116, + "rewards/margins": 0.056242309510707855, + "rewards/rejected": -0.031617920845746994, + "step": 750 + }, + { + "epoch": 0.05, + "learning_rate": 2.4852844996729892e-06, + "logits/chosen": -2.2039551734924316, + "logits/rejected": -2.108445405960083, + "logps/chosen": -238.0047607421875, + "logps/rejected": -252.89584350585938, + "loss": 0.0629, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.005597786512225866, + "rewards/margins": 0.0671648383140564, + "rewards/rejected": -0.061567049473524094, + "step": 760 + }, + { + "epoch": 0.05, + "learning_rate": 2.5179856115107916e-06, + "logits/chosen": -2.3416836261749268, + "logits/rejected": -2.0228421688079834, + "logps/chosen": -284.6044006347656, + "logps/rejected": -243.303955078125, + "loss": 0.0465, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05554069206118584, + "rewards/margins": 0.06323963403701782, + "rewards/rejected": -0.11878032982349396, + "step": 770 + }, + { + "epoch": 0.05, + "learning_rate": 2.5506867233485937e-06, + "logits/chosen": -2.4263076782226562, + "logits/rejected": -2.002530336380005, + "logps/chosen": -286.38623046875, + "logps/rejected": -245.4661407470703, + "loss": 0.0476, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1034514307975769, + "rewards/margins": 0.08289273083209991, + "rewards/rejected": -0.18634414672851562, + "step": 780 + }, + { + "epoch": 0.05, + "learning_rate": 2.5833878351863965e-06, + "logits/chosen": -2.4578182697296143, + "logits/rejected": -2.389268159866333, + "logps/chosen": -260.0544738769531, + "logps/rejected": -270.0205993652344, + "loss": 0.0349, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04028266668319702, + "rewards/margins": 0.06293262541294098, + "rewards/rejected": -0.103215292096138, + "step": 790 + }, + { + "epoch": 0.05, + "learning_rate": 2.616088947024199e-06, + "logits/chosen": -2.1505801677703857, + "logits/rejected": -2.310819387435913, + "logps/chosen": -200.5609588623047, + "logps/rejected": -234.65188598632812, + "loss": 0.034, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.020357560366392136, + "rewards/margins": 0.047822583466768265, + "rewards/rejected": -0.0681801438331604, + "step": 800 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -2.322722911834717, + "eval_logits/rejected": -2.133898973464966, + "eval_logps/chosen": -236.33895874023438, + "eval_logps/rejected": -226.1372528076172, + "eval_loss": 0.03693564981222153, + "eval_rewards/accuracies": 0.612500011920929, + "eval_rewards/chosen": -0.021670011803507805, + "eval_rewards/margins": 0.05095669999718666, + "eval_rewards/rejected": -0.07262670993804932, + "eval_runtime": 714.003, + "eval_samples_per_second": 2.801, + "eval_steps_per_second": 1.401, + "step": 800 + }, + { + "epoch": 0.05, + "learning_rate": 2.6487900588620014e-06, + "logits/chosen": -2.0959088802337646, + "logits/rejected": -1.863910436630249, + "logps/chosen": -205.66055297851562, + "logps/rejected": -168.29428100585938, + "loss": 0.0523, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.027724791318178177, + "rewards/margins": 0.02442805841565132, + "rewards/rejected": -0.0521528497338295, + "step": 810 + }, + { + "epoch": 0.05, + "learning_rate": 2.6814911706998042e-06, + "logits/chosen": -2.2854485511779785, + "logits/rejected": -2.113757371902466, + "logps/chosen": -228.0641326904297, + "logps/rejected": -201.61636352539062, + "loss": 0.0299, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.013624387793242931, + "rewards/margins": 0.02153131738305092, + "rewards/rejected": -0.035155706107616425, + "step": 820 + }, + { + "epoch": 0.05, + "learning_rate": 2.7141922825376067e-06, + "logits/chosen": -2.1840600967407227, + "logits/rejected": -2.1884942054748535, + "logps/chosen": -256.2579040527344, + "logps/rejected": -279.2230529785156, + "loss": 0.0252, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00833116751164198, + "rewards/margins": 0.0651707872748375, + "rewards/rejected": -0.05683961510658264, + "step": 830 + }, + { + "epoch": 0.05, + "learning_rate": 2.746893394375409e-06, + "logits/chosen": -2.3274707794189453, + "logits/rejected": -2.1984059810638428, + "logps/chosen": -228.68453979492188, + "logps/rejected": -234.34878540039062, + "loss": 0.0293, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.02854933775961399, + "rewards/margins": 0.03722766786813736, + "rewards/rejected": -0.008678330108523369, + "step": 840 + }, + { + "epoch": 0.06, + "learning_rate": 2.779594506213211e-06, + "logits/chosen": -2.22013258934021, + "logits/rejected": -1.9830286502838135, + "logps/chosen": -189.00003051757812, + "logps/rejected": -197.3227081298828, + "loss": 0.0422, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004294353537261486, + "rewards/margins": 0.0431797169148922, + "rewards/rejected": -0.038885362446308136, + "step": 850 + }, + { + "epoch": 0.06, + "learning_rate": 2.812295618051014e-06, + "logits/chosen": -2.4056875705718994, + "logits/rejected": -2.2440567016601562, + "logps/chosen": -275.1996154785156, + "logps/rejected": -223.43252563476562, + "loss": 0.0238, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02880767546594143, + "rewards/margins": 0.04980029910802841, + "rewards/rejected": -0.020992618054151535, + "step": 860 + }, + { + "epoch": 0.06, + "learning_rate": 2.8449967298888164e-06, + "logits/chosen": -2.2810773849487305, + "logits/rejected": -2.0782113075256348, + "logps/chosen": -177.49105834960938, + "logps/rejected": -155.1948699951172, + "loss": 0.0286, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.003335345070809126, + "rewards/margins": 0.028810903429985046, + "rewards/rejected": -0.03214624896645546, + "step": 870 + }, + { + "epoch": 0.06, + "learning_rate": 2.877697841726619e-06, + "logits/chosen": -2.36423921585083, + "logits/rejected": -2.2662618160247803, + "logps/chosen": -217.5684814453125, + "logps/rejected": -207.69131469726562, + "loss": 0.04, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.006336563732475042, + "rewards/margins": 0.0346880666911602, + "rewards/rejected": -0.028351500630378723, + "step": 880 + }, + { + "epoch": 0.06, + "learning_rate": 2.9103989535644217e-06, + "logits/chosen": -2.288578748703003, + "logits/rejected": -2.4027316570281982, + "logps/chosen": -210.6328125, + "logps/rejected": -246.1505126953125, + "loss": 0.0351, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02425410784780979, + "rewards/margins": 0.02493629790842533, + "rewards/rejected": -0.0006821897113695741, + "step": 890 + }, + { + "epoch": 0.06, + "learning_rate": 2.943100065402224e-06, + "logits/chosen": -2.3284528255462646, + "logits/rejected": -2.18131685256958, + "logps/chosen": -289.2096862792969, + "logps/rejected": -278.4032897949219, + "loss": 0.0343, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.01716744527220726, + "rewards/margins": 0.022551458328962326, + "rewards/rejected": -0.0053840139880776405, + "step": 900 + }, + { + "epoch": 0.06, + "eval_logits/chosen": -2.3029963970184326, + "eval_logits/rejected": -2.1160192489624023, + "eval_logps/chosen": -231.2078094482422, + "eval_logps/rejected": -220.38314819335938, + "eval_loss": 0.03613131120800972, + "eval_rewards/accuracies": 0.6004999876022339, + "eval_rewards/chosen": 0.0039856997318565845, + "eval_rewards/margins": 0.0478418804705143, + "eval_rewards/rejected": -0.043856181204319, + "eval_runtime": 711.5047, + "eval_samples_per_second": 2.811, + "eval_steps_per_second": 1.405, + "step": 900 + }, + { + "epoch": 0.06, + "learning_rate": 2.9758011772400266e-06, + "logits/chosen": -2.226459264755249, + "logits/rejected": -2.2192370891571045, + "logps/chosen": -258.35595703125, + "logps/rejected": -271.165283203125, + "loss": 0.0186, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.005233976989984512, + "rewards/margins": 0.027173664420843124, + "rewards/rejected": -0.032407645136117935, + "step": 910 + }, + { + "epoch": 0.06, + "learning_rate": 3.0085022890778286e-06, + "logits/chosen": -2.2650656700134277, + "logits/rejected": -2.0100250244140625, + "logps/chosen": -182.3318634033203, + "logps/rejected": -164.67588806152344, + "loss": 0.0392, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.032961465418338776, + "rewards/margins": 0.04633763059973717, + "rewards/rejected": -0.07929908484220505, + "step": 920 + }, + { + "epoch": 0.06, + "learning_rate": 3.0412034009156314e-06, + "logits/chosen": -2.1937007904052734, + "logits/rejected": -2.340569019317627, + "logps/chosen": -255.5501708984375, + "logps/rejected": -256.684814453125, + "loss": 0.0342, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09931950271129608, + "rewards/margins": 0.026981692761182785, + "rewards/rejected": -0.12630119919776917, + "step": 930 + }, + { + "epoch": 0.06, + "learning_rate": 3.073904512753434e-06, + "logits/chosen": -2.3723065853118896, + "logits/rejected": -2.013484477996826, + "logps/chosen": -247.80746459960938, + "logps/rejected": -222.70068359375, + "loss": 0.0387, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.09079430997371674, + "rewards/margins": 0.04910730570554733, + "rewards/rejected": -0.13990160822868347, + "step": 940 + }, + { + "epoch": 0.06, + "learning_rate": 3.1066056245912363e-06, + "logits/chosen": -2.237149953842163, + "logits/rejected": -2.311701774597168, + "logps/chosen": -274.0514221191406, + "logps/rejected": -253.67379760742188, + "loss": 0.0369, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21533706784248352, + "rewards/margins": 0.05790703371167183, + "rewards/rejected": -0.27324408292770386, + "step": 950 + }, + { + "epoch": 0.06, + "learning_rate": 3.1393067364290387e-06, + "logits/chosen": -2.2728891372680664, + "logits/rejected": -2.0648486614227295, + "logps/chosen": -269.82293701171875, + "logps/rejected": -241.70669555664062, + "loss": 0.05, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22348091006278992, + "rewards/margins": 0.05234675481915474, + "rewards/rejected": -0.27582764625549316, + "step": 960 + }, + { + "epoch": 0.06, + "learning_rate": 3.1720078482668416e-06, + "logits/chosen": -2.210095167160034, + "logits/rejected": -1.9791282415390015, + "logps/chosen": -241.1488494873047, + "logps/rejected": -205.7684326171875, + "loss": 0.0277, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1478911191225052, + "rewards/margins": 0.05863233655691147, + "rewards/rejected": -0.20652346312999725, + "step": 970 + }, + { + "epoch": 0.06, + "learning_rate": 3.204708960104644e-06, + "logits/chosen": -2.2816624641418457, + "logits/rejected": -1.940813422203064, + "logps/chosen": -230.817138671875, + "logps/rejected": -211.7601776123047, + "loss": 0.0295, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06707186251878738, + "rewards/margins": 0.06765398383140564, + "rewards/rejected": -0.13472583889961243, + "step": 980 + }, + { + "epoch": 0.06, + "learning_rate": 3.237410071942446e-06, + "logits/chosen": -2.326986312866211, + "logits/rejected": -2.061952829360962, + "logps/chosen": -221.90771484375, + "logps/rejected": -188.48504638671875, + "loss": 0.0291, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.07773331552743912, + "rewards/margins": 0.0007057116599753499, + "rewards/rejected": -0.07843901962041855, + "step": 990 + }, + { + "epoch": 0.07, + "learning_rate": 3.270111183780249e-06, + "logits/chosen": -2.259770393371582, + "logits/rejected": -2.0372934341430664, + "logps/chosen": -218.0242156982422, + "logps/rejected": -207.89443969726562, + "loss": 0.0482, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.09000325947999954, + "rewards/margins": 0.03936609625816345, + "rewards/rejected": -0.1293693482875824, + "step": 1000 + }, + { + "epoch": 0.07, + "eval_logits/chosen": -2.2986462116241455, + "eval_logits/rejected": -2.1121134757995605, + "eval_logps/chosen": -251.03607177734375, + "eval_logps/rejected": -241.4310760498047, + "eval_loss": 0.03604179993271828, + "eval_rewards/accuracies": 0.6100000143051147, + "eval_rewards/chosen": -0.09515552967786789, + "eval_rewards/margins": 0.053940195590257645, + "eval_rewards/rejected": -0.14909571409225464, + "eval_runtime": 713.0944, + "eval_samples_per_second": 2.805, + "eval_steps_per_second": 1.402, + "step": 1000 + }, + { + "epoch": 0.07, + "learning_rate": 3.3028122956180513e-06, + "logits/chosen": -2.163093328475952, + "logits/rejected": -2.220931053161621, + "logps/chosen": -231.88125610351562, + "logps/rejected": -268.298828125, + "loss": 0.0426, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0989166647195816, + "rewards/margins": 0.044743381440639496, + "rewards/rejected": -0.1436600387096405, + "step": 1010 + }, + { + "epoch": 0.07, + "learning_rate": 3.3355134074558538e-06, + "logits/chosen": -2.179591655731201, + "logits/rejected": -2.0796704292297363, + "logps/chosen": -249.61349487304688, + "logps/rejected": -233.9440155029297, + "loss": 0.0365, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.14440378546714783, + "rewards/margins": 0.049590133130550385, + "rewards/rejected": -0.1939939260482788, + "step": 1020 + }, + { + "epoch": 0.07, + "learning_rate": 3.368214519293656e-06, + "logits/chosen": -2.1318297386169434, + "logits/rejected": -1.919557809829712, + "logps/chosen": -225.65652465820312, + "logps/rejected": -212.3909912109375, + "loss": 0.0425, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13343092799186707, + "rewards/margins": 0.038164906203746796, + "rewards/rejected": -0.17159582674503326, + "step": 1030 + }, + { + "epoch": 0.07, + "learning_rate": 3.400915631131459e-06, + "logits/chosen": -2.105020046234131, + "logits/rejected": -2.1336417198181152, + "logps/chosen": -227.83447265625, + "logps/rejected": -270.49041748046875, + "loss": 0.057, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.14931000769138336, + "rewards/margins": 0.07002317160367966, + "rewards/rejected": -0.2193332016468048, + "step": 1040 + }, + { + "epoch": 0.07, + "learning_rate": 3.4336167429692615e-06, + "logits/chosen": -2.3600306510925293, + "logits/rejected": -2.140617609024048, + "logps/chosen": -239.68075561523438, + "logps/rejected": -232.2493133544922, + "loss": 0.0248, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.12565191090106964, + "rewards/margins": 0.04411160573363304, + "rewards/rejected": -0.16976352035999298, + "step": 1050 + }, + { + "epoch": 0.07, + "learning_rate": 3.4663178548070635e-06, + "logits/chosen": -2.173041582107544, + "logits/rejected": -2.229940176010132, + "logps/chosen": -227.50448608398438, + "logps/rejected": -222.6018524169922, + "loss": 0.0397, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.06275273859500885, + "rewards/margins": 0.05769491195678711, + "rewards/rejected": -0.12044765800237656, + "step": 1060 + }, + { + "epoch": 0.07, + "learning_rate": 3.499018966644866e-06, + "logits/chosen": -2.328925609588623, + "logits/rejected": -2.1343941688537598, + "logps/chosen": -204.39268493652344, + "logps/rejected": -208.36209106445312, + "loss": 0.0302, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06283830106258392, + "rewards/margins": 0.053908735513687134, + "rewards/rejected": -0.11674702167510986, + "step": 1070 + }, + { + "epoch": 0.07, + "learning_rate": 3.531720078482669e-06, + "logits/chosen": -2.2606778144836426, + "logits/rejected": -1.9218966960906982, + "logps/chosen": -233.92880249023438, + "logps/rejected": -235.57632446289062, + "loss": 0.0652, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.040295444428920746, + "rewards/margins": 0.06439373642206192, + "rewards/rejected": -0.10468918085098267, + "step": 1080 + }, + { + "epoch": 0.07, + "learning_rate": 3.5644211903204712e-06, + "logits/chosen": -2.2620184421539307, + "logits/rejected": -2.3110594749450684, + "logps/chosen": -203.2140655517578, + "logps/rejected": -214.1947784423828, + "loss": 0.0469, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02165791392326355, + "rewards/margins": 0.040836118161678314, + "rewards/rejected": -0.062494028359651566, + "step": 1090 + }, + { + "epoch": 0.07, + "learning_rate": 3.5971223021582737e-06, + "logits/chosen": -2.4198689460754395, + "logits/rejected": -2.280808925628662, + "logps/chosen": -265.135009765625, + "logps/rejected": -214.5912322998047, + "loss": 0.0316, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04667113721370697, + "rewards/margins": 0.03285752236843109, + "rewards/rejected": -0.07952866703271866, + "step": 1100 + }, + { + "epoch": 0.07, + "eval_logits/chosen": -2.2966220378875732, + "eval_logits/rejected": -2.11063551902771, + "eval_logps/chosen": -243.27603149414062, + "eval_logps/rejected": -236.5680694580078, + "eval_loss": 0.04146208241581917, + "eval_rewards/accuracies": 0.6044999957084656, + "eval_rewards/chosen": -0.056355465203523636, + "eval_rewards/margins": 0.06842530518770218, + "eval_rewards/rejected": -0.12478075921535492, + "eval_runtime": 716.3476, + "eval_samples_per_second": 2.792, + "eval_steps_per_second": 1.396, + "step": 1100 + }, + { + "epoch": 0.07, + "learning_rate": 3.6298234139960765e-06, + "logits/chosen": -2.3681674003601074, + "logits/rejected": -2.0149314403533936, + "logps/chosen": -232.5952606201172, + "logps/rejected": -194.56239318847656, + "loss": 0.0579, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07413096725940704, + "rewards/margins": 0.05819229036569595, + "rewards/rejected": -0.1323232501745224, + "step": 1110 + }, + { + "epoch": 0.07, + "learning_rate": 3.6625245258338785e-06, + "logits/chosen": -2.1520678997039795, + "logits/rejected": -2.0752205848693848, + "logps/chosen": -262.2667541503906, + "logps/rejected": -357.5063171386719, + "loss": 0.0465, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.11744166910648346, + "rewards/margins": 0.09389489889144897, + "rewards/rejected": -0.21133653819561005, + "step": 1120 + }, + { + "epoch": 0.07, + "learning_rate": 3.695225637671681e-06, + "logits/chosen": -2.3903768062591553, + "logits/rejected": -2.191129684448242, + "logps/chosen": -222.4154510498047, + "logps/rejected": -201.04275512695312, + "loss": 0.0436, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0756942480802536, + "rewards/margins": 0.05851646512746811, + "rewards/rejected": -0.1342107057571411, + "step": 1130 + }, + { + "epoch": 0.07, + "learning_rate": 3.7279267495094834e-06, + "logits/chosen": -2.305600166320801, + "logits/rejected": -2.173811435699463, + "logps/chosen": -168.5917205810547, + "logps/rejected": -197.49795532226562, + "loss": 0.0342, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0408240482211113, + "rewards/margins": 0.09390485286712646, + "rewards/rejected": -0.13472887873649597, + "step": 1140 + }, + { + "epoch": 0.08, + "learning_rate": 3.7606278613472863e-06, + "logits/chosen": -2.372283458709717, + "logits/rejected": -2.028932571411133, + "logps/chosen": -291.4856262207031, + "logps/rejected": -222.21963500976562, + "loss": 0.0498, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04607173055410385, + "rewards/margins": 0.04151051491498947, + "rewards/rejected": -0.08758225291967392, + "step": 1150 + }, + { + "epoch": 0.08, + "learning_rate": 3.7933289731850887e-06, + "logits/chosen": -2.1950457096099854, + "logits/rejected": -1.9749637842178345, + "logps/chosen": -226.8663787841797, + "logps/rejected": -215.56277465820312, + "loss": 0.0176, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.008471393026411533, + "rewards/margins": 0.08477363735437393, + "rewards/rejected": -0.07630225270986557, + "step": 1160 + }, + { + "epoch": 0.08, + "learning_rate": 3.826030085022891e-06, + "logits/chosen": -2.3531956672668457, + "logits/rejected": -2.1216812133789062, + "logps/chosen": -250.2491912841797, + "logps/rejected": -225.8344268798828, + "loss": 0.0404, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.012996921315789223, + "rewards/margins": 0.05868647247552872, + "rewards/rejected": -0.0716833844780922, + "step": 1170 + }, + { + "epoch": 0.08, + "learning_rate": 3.858731196860693e-06, + "logits/chosen": -2.401371479034424, + "logits/rejected": -1.9539715051651, + "logps/chosen": -254.2443389892578, + "logps/rejected": -216.92532348632812, + "loss": 0.0393, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.017054477706551552, + "rewards/margins": 0.024582907557487488, + "rewards/rejected": -0.04163738340139389, + "step": 1180 + }, + { + "epoch": 0.08, + "learning_rate": 3.891432308698496e-06, + "logits/chosen": -2.0956592559814453, + "logits/rejected": -2.03969669342041, + "logps/chosen": -193.86561584472656, + "logps/rejected": -230.97763061523438, + "loss": 0.05, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006823881063610315, + "rewards/margins": 0.05933377146720886, + "rewards/rejected": -0.05250988528132439, + "step": 1190 + }, + { + "epoch": 0.08, + "learning_rate": 3.924133420536299e-06, + "logits/chosen": -2.20378041267395, + "logits/rejected": -2.013878345489502, + "logps/chosen": -191.03379821777344, + "logps/rejected": -170.40415954589844, + "loss": 0.0326, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.044445477426052094, + "rewards/margins": 0.0638650581240654, + "rewards/rejected": -0.10831055790185928, + "step": 1200 + }, + { + "epoch": 0.08, + "eval_logits/chosen": -2.2876923084259033, + "eval_logits/rejected": -2.102726697921753, + "eval_logps/chosen": -243.8900909423828, + "eval_logps/rejected": -237.54696655273438, + "eval_loss": 0.03310655057430267, + "eval_rewards/accuracies": 0.640500009059906, + "eval_rewards/chosen": -0.05942576006054878, + "eval_rewards/margins": 0.07024962455034256, + "eval_rewards/rejected": -0.12967538833618164, + "eval_runtime": 711.2733, + "eval_samples_per_second": 2.812, + "eval_steps_per_second": 1.406, + "step": 1200 + }, + { + "epoch": 0.08, + "learning_rate": 3.956834532374101e-06, + "logits/chosen": -2.400481700897217, + "logits/rejected": -2.0159413814544678, + "logps/chosen": -216.20870971679688, + "logps/rejected": -194.70791625976562, + "loss": 0.0294, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.05473031476140022, + "rewards/margins": 0.11225248873233795, + "rewards/rejected": -0.16698279976844788, + "step": 1210 + }, + { + "epoch": 0.08, + "learning_rate": 3.989535644211904e-06, + "logits/chosen": -2.2711410522460938, + "logits/rejected": -2.007876396179199, + "logps/chosen": -228.341552734375, + "logps/rejected": -223.26144409179688, + "loss": 0.0488, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05607137084007263, + "rewards/margins": 0.08770473301410675, + "rewards/rejected": -0.1437760889530182, + "step": 1220 + }, + { + "epoch": 0.08, + "learning_rate": 4.022236756049706e-06, + "logits/chosen": -2.4662814140319824, + "logits/rejected": -2.0836684703826904, + "logps/chosen": -278.93316650390625, + "logps/rejected": -255.7671356201172, + "loss": 0.0194, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.008979836478829384, + "rewards/margins": 0.046547479927539825, + "rewards/rejected": -0.05552731081843376, + "step": 1230 + }, + { + "epoch": 0.08, + "learning_rate": 4.054937867887509e-06, + "logits/chosen": -2.339547634124756, + "logits/rejected": -1.9021472930908203, + "logps/chosen": -223.43222045898438, + "logps/rejected": -222.7755889892578, + "loss": 0.0203, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.004172854125499725, + "rewards/margins": 0.050659067928791046, + "rewards/rejected": -0.04648621380329132, + "step": 1240 + }, + { + "epoch": 0.08, + "learning_rate": 4.087638979725311e-06, + "logits/chosen": -2.250339984893799, + "logits/rejected": -2.3162214756011963, + "logps/chosen": -226.05819702148438, + "logps/rejected": -234.34072875976562, + "loss": 0.029, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.008907961659133434, + "rewards/margins": 0.034137699753046036, + "rewards/rejected": -0.02522973157465458, + "step": 1250 + }, + { + "epoch": 0.08, + "learning_rate": 4.1203400915631135e-06, + "logits/chosen": -2.1639037132263184, + "logits/rejected": -2.088408946990967, + "logps/chosen": -240.08273315429688, + "logps/rejected": -220.23916625976562, + "loss": 0.0764, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.013758843764662743, + "rewards/margins": 0.042185962200164795, + "rewards/rejected": -0.05594480782747269, + "step": 1260 + }, + { + "epoch": 0.08, + "learning_rate": 4.153041203400916e-06, + "logits/chosen": -2.3826661109924316, + "logits/rejected": -2.2230827808380127, + "logps/chosen": -258.5919189453125, + "logps/rejected": -240.993408203125, + "loss": 0.0076, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.02915186807513237, + "rewards/margins": 0.03518051654100418, + "rewards/rejected": -0.0060286447405815125, + "step": 1270 + }, + { + "epoch": 0.08, + "learning_rate": 4.185742315238718e-06, + "logits/chosen": -2.2890408039093018, + "logits/rejected": -2.0284957885742188, + "logps/chosen": -194.86972045898438, + "logps/rejected": -194.9738006591797, + "loss": 0.0293, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.01615927927196026, + "rewards/margins": 0.05985509231686592, + "rewards/rejected": -0.04369581490755081, + "step": 1280 + }, + { + "epoch": 0.08, + "learning_rate": 4.218443427076521e-06, + "logits/chosen": -2.233799457550049, + "logits/rejected": -2.0911786556243896, + "logps/chosen": -184.27633666992188, + "logps/rejected": -192.11636352539062, + "loss": 0.0532, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.00758881401270628, + "rewards/margins": 0.03184492141008377, + "rewards/rejected": -0.039433736354112625, + "step": 1290 + }, + { + "epoch": 0.09, + "learning_rate": 4.251144538914323e-06, + "logits/chosen": -2.207096576690674, + "logits/rejected": -2.070316791534424, + "logps/chosen": -224.5314178466797, + "logps/rejected": -222.47225952148438, + "loss": 0.0313, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.019705668091773987, + "rewards/margins": 0.05568262189626694, + "rewards/rejected": -0.07538828998804092, + "step": 1300 + }, + { + "epoch": 0.09, + "eval_logits/chosen": -2.299447774887085, + "eval_logits/rejected": -2.113861560821533, + "eval_logps/chosen": -232.5399627685547, + "eval_logps/rejected": -223.8951873779297, + "eval_loss": 0.0312928780913353, + "eval_rewards/accuracies": 0.6320000290870667, + "eval_rewards/chosen": -0.002674978692084551, + "eval_rewards/margins": 0.058741528540849686, + "eval_rewards/rejected": -0.06141650676727295, + "eval_runtime": 715.0773, + "eval_samples_per_second": 2.797, + "eval_steps_per_second": 1.398, + "step": 1300 + }, + { + "epoch": 0.09, + "learning_rate": 4.283845650752126e-06, + "logits/chosen": -2.3264384269714355, + "logits/rejected": -2.1164462566375732, + "logps/chosen": -287.3799743652344, + "logps/rejected": -236.7661895751953, + "loss": 0.0243, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.005840280093252659, + "rewards/margins": 0.06788822263479233, + "rewards/rejected": -0.07372850924730301, + "step": 1310 + }, + { + "epoch": 0.09, + "learning_rate": 4.316546762589928e-06, + "logits/chosen": -2.2621026039123535, + "logits/rejected": -2.1790411472320557, + "logps/chosen": -217.2468719482422, + "logps/rejected": -196.61880493164062, + "loss": 0.055, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.046596817672252655, + "rewards/margins": 0.0683503970503807, + "rewards/rejected": -0.11494722217321396, + "step": 1320 + }, + { + "epoch": 0.09, + "learning_rate": 4.349247874427731e-06, + "logits/chosen": -2.310753345489502, + "logits/rejected": -2.022246837615967, + "logps/chosen": -238.8385467529297, + "logps/rejected": -293.2956237792969, + "loss": 0.0251, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.061522673815488815, + "rewards/margins": 0.07187248766422272, + "rewards/rejected": -0.13339515030384064, + "step": 1330 + }, + { + "epoch": 0.09, + "learning_rate": 4.381948986265534e-06, + "logits/chosen": -2.4638831615448, + "logits/rejected": -2.203160285949707, + "logps/chosen": -264.6558532714844, + "logps/rejected": -270.74285888671875, + "loss": 0.0286, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.00905582308769226, + "rewards/margins": 0.09546998143196106, + "rewards/rejected": -0.10452580451965332, + "step": 1340 + }, + { + "epoch": 0.09, + "learning_rate": 4.414650098103336e-06, + "logits/chosen": -2.6136481761932373, + "logits/rejected": -2.3436598777770996, + "logps/chosen": -261.24114990234375, + "logps/rejected": -248.14596557617188, + "loss": 0.0305, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0035567854065448046, + "rewards/margins": 0.04664462059736252, + "rewards/rejected": -0.043087832629680634, + "step": 1350 + }, + { + "epoch": 0.09, + "learning_rate": 4.447351209941138e-06, + "logits/chosen": -2.295720338821411, + "logits/rejected": -2.031731367111206, + "logps/chosen": -219.7317657470703, + "logps/rejected": -209.2897186279297, + "loss": 0.0456, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05033557862043381, + "rewards/margins": 0.03849739953875542, + "rewards/rejected": -0.08883297443389893, + "step": 1360 + }, + { + "epoch": 0.09, + "learning_rate": 4.480052321778941e-06, + "logits/chosen": -2.247638463973999, + "logits/rejected": -2.257481575012207, + "logps/chosen": -226.5643768310547, + "logps/rejected": -217.638671875, + "loss": 0.0375, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0017294908175244927, + "rewards/margins": 0.05885141342878342, + "rewards/rejected": -0.05712192505598068, + "step": 1370 + }, + { + "epoch": 0.09, + "learning_rate": 4.5127534336167435e-06, + "logits/chosen": -2.31245493888855, + "logits/rejected": -2.042786121368408, + "logps/chosen": -251.3003692626953, + "logps/rejected": -216.68307495117188, + "loss": 0.042, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.020632009953260422, + "rewards/margins": 0.04521378129720688, + "rewards/rejected": -0.0658458024263382, + "step": 1380 + }, + { + "epoch": 0.09, + "learning_rate": 4.5454545454545455e-06, + "logits/chosen": -2.2685723304748535, + "logits/rejected": -2.216639280319214, + "logps/chosen": -173.1841583251953, + "logps/rejected": -176.2059783935547, + "loss": 0.0509, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.01862729713320732, + "rewards/margins": 0.06615440547466278, + "rewards/rejected": -0.084781713783741, + "step": 1390 + }, + { + "epoch": 0.09, + "learning_rate": 4.578155657292348e-06, + "logits/chosen": -2.2430644035339355, + "logits/rejected": -2.2024948596954346, + "logps/chosen": -255.028076171875, + "logps/rejected": -279.7099304199219, + "loss": 0.0345, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0018879823619499803, + "rewards/margins": 0.07879987359046936, + "rewards/rejected": -0.07691188901662827, + "step": 1400 + }, + { + "epoch": 0.09, + "eval_logits/chosen": -2.2307913303375244, + "eval_logits/rejected": -2.05076265335083, + "eval_logps/chosen": -234.12046813964844, + "eval_logps/rejected": -224.87071228027344, + "eval_loss": 0.033073075115680695, + "eval_rewards/accuracies": 0.6274999976158142, + "eval_rewards/chosen": -0.010577631182968616, + "eval_rewards/margins": 0.05571650341153145, + "eval_rewards/rejected": -0.0662941262125969, + "eval_runtime": 712.6064, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.403, + "step": 1400 + }, + { + "epoch": 0.09, + "learning_rate": 4.610856769130151e-06, + "logits/chosen": -2.3056623935699463, + "logits/rejected": -2.176156520843506, + "logps/chosen": -241.0175323486328, + "logps/rejected": -222.33029174804688, + "loss": 0.0122, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.009366462007164955, + "rewards/margins": 0.05407185107469559, + "rewards/rejected": -0.0634383112192154, + "step": 1410 + }, + { + "epoch": 0.09, + "learning_rate": 4.643557880967953e-06, + "logits/chosen": -2.2978873252868652, + "logits/rejected": -2.1032276153564453, + "logps/chosen": -203.88720703125, + "logps/rejected": -210.26089477539062, + "loss": 0.0356, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.025442058220505714, + "rewards/margins": 0.04436718672513962, + "rewards/rejected": -0.06980924308300018, + "step": 1420 + }, + { + "epoch": 0.09, + "learning_rate": 4.676258992805755e-06, + "logits/chosen": -2.2503609657287598, + "logits/rejected": -1.983534812927246, + "logps/chosen": -273.1319580078125, + "logps/rejected": -244.86367797851562, + "loss": 0.0223, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.01756000705063343, + "rewards/margins": 0.06940056383609772, + "rewards/rejected": -0.086960569024086, + "step": 1430 + }, + { + "epoch": 0.09, + "learning_rate": 4.708960104643558e-06, + "logits/chosen": -2.248351573944092, + "logits/rejected": -2.2094709873199463, + "logps/chosen": -309.40234375, + "logps/rejected": -283.4943542480469, + "loss": 0.0197, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.015962861478328705, + "rewards/margins": 0.05426085740327835, + "rewards/rejected": -0.07022371143102646, + "step": 1440 + }, + { + "epoch": 0.09, + "learning_rate": 4.741661216481361e-06, + "logits/chosen": -2.1934053897857666, + "logits/rejected": -2.1544718742370605, + "logps/chosen": -239.0615234375, + "logps/rejected": -268.44793701171875, + "loss": 0.0252, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.013579867780208588, + "rewards/margins": 0.09070058166980743, + "rewards/rejected": -0.10428045690059662, + "step": 1450 + }, + { + "epoch": 0.1, + "learning_rate": 4.774362328319163e-06, + "logits/chosen": -2.1640710830688477, + "logits/rejected": -2.009326696395874, + "logps/chosen": -220.5846710205078, + "logps/rejected": -196.79995727539062, + "loss": 0.0201, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.023972881957888603, + "rewards/margins": 0.03082362748682499, + "rewards/rejected": -0.05479650944471359, + "step": 1460 + }, + { + "epoch": 0.1, + "learning_rate": 4.807063440156966e-06, + "logits/chosen": -2.1977009773254395, + "logits/rejected": -1.9679205417633057, + "logps/chosen": -264.31158447265625, + "logps/rejected": -225.3589324951172, + "loss": 0.0354, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.005010455381125212, + "rewards/margins": 0.11208884418010712, + "rewards/rejected": -0.11709930747747421, + "step": 1470 + }, + { + "epoch": 0.1, + "learning_rate": 4.839764551994769e-06, + "logits/chosen": -2.254770278930664, + "logits/rejected": -1.957808256149292, + "logps/chosen": -258.8214111328125, + "logps/rejected": -239.7161407470703, + "loss": 0.0432, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05056464672088623, + "rewards/margins": 0.09314145147800446, + "rewards/rejected": -0.1437060832977295, + "step": 1480 + }, + { + "epoch": 0.1, + "learning_rate": 4.872465663832571e-06, + "logits/chosen": -2.11043119430542, + "logits/rejected": -1.9940143823623657, + "logps/chosen": -223.37466430664062, + "logps/rejected": -210.67971801757812, + "loss": 0.0507, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.07808069884777069, + "rewards/margins": 0.038826070725917816, + "rewards/rejected": -0.1169067770242691, + "step": 1490 + }, + { + "epoch": 0.1, + "learning_rate": 4.905166775670373e-06, + "logits/chosen": -2.224799394607544, + "logits/rejected": -1.9152275323867798, + "logps/chosen": -230.69625854492188, + "logps/rejected": -216.72360229492188, + "loss": 0.0629, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04468151926994324, + "rewards/margins": 0.06710448861122131, + "rewards/rejected": -0.11178600788116455, + "step": 1500 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -2.177633285522461, + "eval_logits/rejected": -2.000908136367798, + "eval_logps/chosen": -242.75218200683594, + "eval_logps/rejected": -237.29566955566406, + "eval_loss": 0.03400981053709984, + "eval_rewards/accuracies": 0.6439999938011169, + "eval_rewards/chosen": -0.053736183792352676, + "eval_rewards/margins": 0.0746825784444809, + "eval_rewards/rejected": -0.12841875851154327, + "eval_runtime": 714.8353, + "eval_samples_per_second": 2.798, + "eval_steps_per_second": 1.399, + "step": 1500 + }, + { + "epoch": 0.1, + "learning_rate": 4.9378678875081756e-06, + "logits/chosen": -2.325528860092163, + "logits/rejected": -2.0406839847564697, + "logps/chosen": -222.1483917236328, + "logps/rejected": -206.7466583251953, + "loss": 0.0351, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03702390938997269, + "rewards/margins": 0.07803212106227875, + "rewards/rejected": -0.11505603790283203, + "step": 1510 + }, + { + "epoch": 0.1, + "learning_rate": 4.9705689993459784e-06, + "logits/chosen": -2.305387496948242, + "logits/rejected": -1.8941253423690796, + "logps/chosen": -218.213623046875, + "logps/rejected": -181.945556640625, + "loss": 0.0471, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06644146144390106, + "rewards/margins": 0.10722502321004868, + "rewards/rejected": -0.17366649210453033, + "step": 1520 + }, + { + "epoch": 0.1, + "learning_rate": 4.999999934793849e-06, + "logits/chosen": -2.258678674697876, + "logits/rejected": -2.1702895164489746, + "logps/chosen": -255.1768035888672, + "logps/rejected": -231.4876708984375, + "loss": 0.0354, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03611770272254944, + "rewards/margins": 0.028636153787374496, + "rewards/rejected": -0.06475386023521423, + "step": 1530 + }, + { + "epoch": 0.1, + "learning_rate": 4.999992110059814e-06, + "logits/chosen": -2.24711275100708, + "logits/rejected": -2.2306511402130127, + "logps/chosen": -278.0412902832031, + "logps/rejected": -269.61517333984375, + "loss": 0.0262, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0017689462983980775, + "rewards/margins": 0.05843646451830864, + "rewards/rejected": -0.060205407440662384, + "step": 1540 + }, + { + "epoch": 0.1, + "learning_rate": 4.999971244142299e-06, + "logits/chosen": -2.398763418197632, + "logits/rejected": -2.106553554534912, + "logps/chosen": -274.9897766113281, + "logps/rejected": -249.78518676757812, + "loss": 0.0168, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.007683471776545048, + "rewards/margins": 0.07020819187164307, + "rewards/rejected": -0.07789166271686554, + "step": 1550 + }, + { + "epoch": 0.1, + "learning_rate": 4.999937337150149e-06, + "logits/chosen": -2.0848517417907715, + "logits/rejected": -2.0338871479034424, + "logps/chosen": -237.2061309814453, + "logps/rejected": -231.5194091796875, + "loss": 0.0411, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.013456342741847038, + "rewards/margins": 0.048470061272382736, + "rewards/rejected": -0.061926405876874924, + "step": 1560 + }, + { + "epoch": 0.1, + "learning_rate": 4.99989038926024e-06, + "logits/chosen": -2.029017925262451, + "logits/rejected": -2.1551504135131836, + "logps/chosen": -208.94229125976562, + "logps/rejected": -225.96371459960938, + "loss": 0.0249, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0525711365044117, + "rewards/margins": 0.07409703731536865, + "rewards/rejected": -0.12666817009449005, + "step": 1570 + }, + { + "epoch": 0.1, + "learning_rate": 4.999830400717476e-06, + "logits/chosen": -2.213772773742676, + "logits/rejected": -2.084939479827881, + "logps/chosen": -295.6604309082031, + "logps/rejected": -295.1983947753906, + "loss": 0.0152, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04136674478650093, + "rewards/margins": 0.0790671780705452, + "rewards/rejected": -0.12043392658233643, + "step": 1580 + }, + { + "epoch": 0.1, + "learning_rate": 4.999757371834787e-06, + "logits/chosen": -2.088376045227051, + "logits/rejected": -2.008604049682617, + "logps/chosen": -254.0402374267578, + "logps/rejected": -268.31842041015625, + "loss": 0.021, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.07110683619976044, + "rewards/margins": 0.1489681601524353, + "rewards/rejected": -0.22007498145103455, + "step": 1590 + }, + { + "epoch": 0.1, + "learning_rate": 4.999671302993125e-06, + "logits/chosen": -2.0464565753936768, + "logits/rejected": -1.986285924911499, + "logps/chosen": -263.01983642578125, + "logps/rejected": -294.6103515625, + "loss": 0.0313, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.07219687849283218, + "rewards/margins": 0.08101221174001694, + "rewards/rejected": -0.15320907533168793, + "step": 1600 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -2.2403852939605713, + "eval_logits/rejected": -2.057793378829956, + "eval_logps/chosen": -244.79440307617188, + "eval_logps/rejected": -239.94647216796875, + "eval_loss": 0.031098267063498497, + "eval_rewards/accuracies": 0.6309999823570251, + "eval_rewards/chosen": -0.06394734233617783, + "eval_rewards/margins": 0.07772543281316757, + "eval_rewards/rejected": -0.1416727900505066, + "eval_runtime": 713.2681, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 1.402, + "step": 1600 + }, + { + "epoch": 0.11, + "learning_rate": 4.999572194641471e-06, + "logits/chosen": -2.2111706733703613, + "logits/rejected": -2.0525684356689453, + "logps/chosen": -289.5194091796875, + "logps/rejected": -258.9083557128906, + "loss": 0.0393, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07000543922185898, + "rewards/margins": 0.109294593334198, + "rewards/rejected": -0.17930002510547638, + "step": 1610 + }, + { + "epoch": 0.11, + "learning_rate": 4.999460047296819e-06, + "logits/chosen": -2.1922264099121094, + "logits/rejected": -2.0641796588897705, + "logps/chosen": -238.1181640625, + "logps/rejected": -233.39846801757812, + "loss": 0.0221, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11257002502679825, + "rewards/margins": 0.08094724267721176, + "rewards/rejected": -0.1935172826051712, + "step": 1620 + }, + { + "epoch": 0.11, + "learning_rate": 4.999334861544186e-06, + "logits/chosen": -2.312042474746704, + "logits/rejected": -1.9850116968154907, + "logps/chosen": -240.84591674804688, + "logps/rejected": -206.13882446289062, + "loss": 0.038, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05696337670087814, + "rewards/margins": 0.10648103058338165, + "rewards/rejected": -0.1634444147348404, + "step": 1630 + }, + { + "epoch": 0.11, + "learning_rate": 4.999196638036604e-06, + "logits/chosen": -2.379241943359375, + "logits/rejected": -2.183292865753174, + "logps/chosen": -300.8702087402344, + "logps/rejected": -272.499267578125, + "loss": 0.0068, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.06727579981088638, + "rewards/margins": 0.034327052533626556, + "rewards/rejected": -0.10160285234451294, + "step": 1640 + }, + { + "epoch": 0.11, + "learning_rate": 4.999045377495111e-06, + "logits/chosen": -2.0479378700256348, + "logits/rejected": -2.301999568939209, + "logps/chosen": -191.126708984375, + "logps/rejected": -295.76654052734375, + "loss": 0.0358, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08396010845899582, + "rewards/margins": 0.08428677171468735, + "rewards/rejected": -0.16824688017368317, + "step": 1650 + }, + { + "epoch": 0.11, + "learning_rate": 4.998881080708759e-06, + "logits/chosen": -2.2068817615509033, + "logits/rejected": -2.130096673965454, + "logps/chosen": -253.0929718017578, + "logps/rejected": -226.85226440429688, + "loss": 0.0386, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0638570636510849, + "rewards/margins": 0.02352083846926689, + "rewards/rejected": -0.0873778909444809, + "step": 1660 + }, + { + "epoch": 0.11, + "learning_rate": 4.998703748534599e-06, + "logits/chosen": -2.0564498901367188, + "logits/rejected": -1.8033416271209717, + "logps/chosen": -246.2660675048828, + "logps/rejected": -203.96421813964844, + "loss": 0.0612, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.02818126417696476, + "rewards/margins": 0.041943810880184174, + "rewards/rejected": -0.07012508064508438, + "step": 1670 + }, + { + "epoch": 0.11, + "learning_rate": 4.998513381897683e-06, + "logits/chosen": -2.287282943725586, + "logits/rejected": -2.057615041732788, + "logps/chosen": -242.29696655273438, + "logps/rejected": -190.59234619140625, + "loss": 0.0348, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.015266014263033867, + "rewards/margins": 0.02644859254360199, + "rewards/rejected": -0.041714608669281006, + "step": 1680 + }, + { + "epoch": 0.11, + "learning_rate": 4.9983099817910565e-06, + "logits/chosen": -2.205864191055298, + "logits/rejected": -2.0282130241394043, + "logps/chosen": -252.55630493164062, + "logps/rejected": -259.43914794921875, + "loss": 0.0346, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.036472074687480927, + "rewards/margins": 0.056804411113262177, + "rewards/rejected": -0.0932764858007431, + "step": 1690 + }, + { + "epoch": 0.11, + "learning_rate": 4.998093549275754e-06, + "logits/chosen": -2.187556505203247, + "logits/rejected": -2.182443141937256, + "logps/chosen": -263.99151611328125, + "logps/rejected": -301.42822265625, + "loss": 0.0287, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.01828739047050476, + "rewards/margins": 0.07821901887655258, + "rewards/rejected": -0.09650642424821854, + "step": 1700 + }, + { + "epoch": 0.11, + "eval_logits/chosen": -2.286701202392578, + "eval_logits/rejected": -2.1022112369537354, + "eval_logps/chosen": -237.6215057373047, + "eval_logps/rejected": -230.36648559570312, + "eval_loss": 0.030306359753012657, + "eval_rewards/accuracies": 0.6359999775886536, + "eval_rewards/chosen": -0.02808281220495701, + "eval_rewards/margins": 0.0656900480389595, + "eval_rewards/rejected": -0.09377285838127136, + "eval_runtime": 712.8613, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 1700 + }, + { + "epoch": 0.11, + "learning_rate": 4.997864085480794e-06, + "logits/chosen": -2.333749771118164, + "logits/rejected": -2.161771774291992, + "logps/chosen": -271.87738037109375, + "logps/rejected": -267.2635192871094, + "loss": 0.0136, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.027655865997076035, + "rewards/margins": 0.06961538642644882, + "rewards/rejected": -0.09727124869823456, + "step": 1710 + }, + { + "epoch": 0.11, + "learning_rate": 4.997621591603171e-06, + "logits/chosen": -2.2935166358947754, + "logits/rejected": -2.1049695014953613, + "logps/chosen": -166.76449584960938, + "logps/rejected": -180.1530303955078, + "loss": 0.0556, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04365837201476097, + "rewards/margins": 0.07036669552326202, + "rewards/rejected": -0.1140250712633133, + "step": 1720 + }, + { + "epoch": 0.11, + "learning_rate": 4.997366068907853e-06, + "logits/chosen": -2.275902271270752, + "logits/rejected": -2.221653461456299, + "logps/chosen": -260.5364074707031, + "logps/rejected": -249.9969024658203, + "loss": 0.0358, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.010633264668285847, + "rewards/margins": 0.046677958220243454, + "rewards/rejected": -0.05731121823191643, + "step": 1730 + }, + { + "epoch": 0.11, + "learning_rate": 4.997097518727771e-06, + "logits/chosen": -2.3315846920013428, + "logits/rejected": -2.088407039642334, + "logps/chosen": -227.2711181640625, + "logps/rejected": -207.3436737060547, + "loss": 0.0383, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.021573388949036598, + "rewards/margins": 0.06597500294446945, + "rewards/rejected": -0.0875483900308609, + "step": 1740 + }, + { + "epoch": 0.11, + "learning_rate": 4.9968159424638155e-06, + "logits/chosen": -2.1644344329833984, + "logits/rejected": -2.3732848167419434, + "logps/chosen": -221.4119415283203, + "logps/rejected": -281.27215576171875, + "loss": 0.024, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.01902773603796959, + "rewards/margins": 0.040075235068798065, + "rewards/rejected": -0.059102971106767654, + "step": 1750 + }, + { + "epoch": 0.12, + "learning_rate": 4.9965213415848235e-06, + "logits/chosen": -2.2339000701904297, + "logits/rejected": -1.8621151447296143, + "logps/chosen": -237.12387084960938, + "logps/rejected": -214.28848266601562, + "loss": 0.0263, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04359662905335426, + "rewards/margins": 0.07012289762496948, + "rewards/rejected": -0.11371952295303345, + "step": 1760 + }, + { + "epoch": 0.12, + "learning_rate": 4.9962137176275805e-06, + "logits/chosen": -2.3237321376800537, + "logits/rejected": -2.1237401962280273, + "logps/chosen": -233.67996215820312, + "logps/rejected": -243.418212890625, + "loss": 0.0125, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0030772259924560785, + "rewards/margins": 0.05148119851946831, + "rewards/rejected": -0.04840397089719772, + "step": 1770 + }, + { + "epoch": 0.12, + "learning_rate": 4.9958930721968015e-06, + "logits/chosen": -2.193448543548584, + "logits/rejected": -2.2796072959899902, + "logps/chosen": -215.48025512695312, + "logps/rejected": -235.7574005126953, + "loss": 0.0294, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.01722542755305767, + "rewards/margins": 0.04387689009308815, + "rewards/rejected": -0.061102308332920074, + "step": 1780 + }, + { + "epoch": 0.12, + "learning_rate": 4.995559406965132e-06, + "logits/chosen": -2.4174957275390625, + "logits/rejected": -2.064757823944092, + "logps/chosen": -232.0088348388672, + "logps/rejected": -215.20706176757812, + "loss": 0.0262, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.000467650213977322, + "rewards/margins": 0.06406942009925842, + "rewards/rejected": -0.06453706324100494, + "step": 1790 + }, + { + "epoch": 0.12, + "learning_rate": 4.995212723673131e-06, + "logits/chosen": -2.374816417694092, + "logits/rejected": -2.1405506134033203, + "logps/chosen": -227.5681610107422, + "logps/rejected": -192.76089477539062, + "loss": 0.0335, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.013174636289477348, + "rewards/margins": 0.07587815076112747, + "rewards/rejected": -0.06270351260900497, + "step": 1800 + }, + { + "epoch": 0.12, + "eval_logits/chosen": -2.3010594844818115, + "eval_logits/rejected": -2.115757465362549, + "eval_logps/chosen": -231.86740112304688, + "eval_logps/rejected": -222.27854919433594, + "eval_loss": 0.03156345337629318, + "eval_rewards/accuracies": 0.6259999871253967, + "eval_rewards/chosen": 0.0006877080886624753, + "eval_rewards/margins": 0.054020799696445465, + "eval_rewards/rejected": -0.0533330924808979, + "eval_runtime": 711.5953, + "eval_samples_per_second": 2.811, + "eval_steps_per_second": 1.405, + "step": 1800 + }, + { + "epoch": 0.12, + "learning_rate": 4.99485302412927e-06, + "logits/chosen": -2.0493340492248535, + "logits/rejected": -1.9822384119033813, + "logps/chosen": -206.83642578125, + "logps/rejected": -222.2565155029297, + "loss": 0.0412, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.005873044487088919, + "rewards/margins": 0.07363691926002502, + "rewards/rejected": -0.06776387244462967, + "step": 1810 + }, + { + "epoch": 0.12, + "learning_rate": 4.994480310209918e-06, + "logits/chosen": -2.2592854499816895, + "logits/rejected": -2.417466402053833, + "logps/chosen": -239.39797973632812, + "logps/rejected": -262.6976623535156, + "loss": 0.0275, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.012925192713737488, + "rewards/margins": 0.0477270744740963, + "rewards/rejected": -0.03480188176035881, + "step": 1820 + }, + { + "epoch": 0.12, + "learning_rate": 4.994094583859332e-06, + "logits/chosen": -2.2714853286743164, + "logits/rejected": -2.022733211517334, + "logps/chosen": -160.02210998535156, + "logps/rejected": -203.35848999023438, + "loss": 0.0416, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.02445300668478012, + "rewards/margins": 0.05653420090675354, + "rewards/rejected": -0.03208119422197342, + "step": 1830 + }, + { + "epoch": 0.12, + "learning_rate": 4.9936958470896525e-06, + "logits/chosen": -2.2610347270965576, + "logits/rejected": -2.020301580429077, + "logps/chosen": -229.99380493164062, + "logps/rejected": -208.194091796875, + "loss": 0.0402, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03124316595494747, + "rewards/margins": 0.10551712661981583, + "rewards/rejected": -0.13676029443740845, + "step": 1840 + }, + { + "epoch": 0.12, + "learning_rate": 4.993284101980883e-06, + "logits/chosen": -2.201612949371338, + "logits/rejected": -2.017625331878662, + "logps/chosen": -261.00958251953125, + "logps/rejected": -254.49404907226562, + "loss": 0.0373, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09257099777460098, + "rewards/margins": 0.16558702290058136, + "rewards/rejected": -0.25815802812576294, + "step": 1850 + }, + { + "epoch": 0.12, + "learning_rate": 4.9928593506808885e-06, + "logits/chosen": -2.3184077739715576, + "logits/rejected": -2.123654365539551, + "logps/chosen": -275.52569580078125, + "logps/rejected": -253.30953979492188, + "loss": 0.0606, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0935564711689949, + "rewards/margins": 0.07380557060241699, + "rewards/rejected": -0.1673620492219925, + "step": 1860 + }, + { + "epoch": 0.12, + "learning_rate": 4.992421595405381e-06, + "logits/chosen": -2.2751145362854004, + "logits/rejected": -2.027334690093994, + "logps/chosen": -239.30380249023438, + "logps/rejected": -177.33480834960938, + "loss": 0.054, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.05555707961320877, + "rewards/margins": 0.04438935965299606, + "rewards/rejected": -0.09994643181562424, + "step": 1870 + }, + { + "epoch": 0.12, + "learning_rate": 4.991970838437905e-06, + "logits/chosen": -2.213554620742798, + "logits/rejected": -2.11810040473938, + "logps/chosen": -239.0460968017578, + "logps/rejected": -277.94354248046875, + "loss": 0.0483, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06858281046152115, + "rewards/margins": 0.053963709622621536, + "rewards/rejected": -0.12254651635885239, + "step": 1880 + }, + { + "epoch": 0.12, + "learning_rate": 4.9915070821298294e-06, + "logits/chosen": -2.317843198776245, + "logits/rejected": -1.995692491531372, + "logps/chosen": -180.86282348632812, + "logps/rejected": -177.86102294921875, + "loss": 0.0228, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.06946498900651932, + "rewards/margins": 0.0304332934319973, + "rewards/rejected": -0.09989828616380692, + "step": 1890 + }, + { + "epoch": 0.12, + "learning_rate": 4.991030328900336e-06, + "logits/chosen": -2.239023208618164, + "logits/rejected": -1.9848921298980713, + "logps/chosen": -294.36920166015625, + "logps/rejected": -237.84078979492188, + "loss": 0.0209, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04899997264146805, + "rewards/margins": 0.06753884255886078, + "rewards/rejected": -0.11653882265090942, + "step": 1900 + }, + { + "epoch": 0.12, + "eval_logits/chosen": -2.289330244064331, + "eval_logits/rejected": -2.104606866836548, + "eval_logps/chosen": -244.22193908691406, + "eval_logps/rejected": -234.0950164794922, + "eval_loss": 0.033284034579992294, + "eval_rewards/accuracies": 0.6399999856948853, + "eval_rewards/chosen": -0.06108501926064491, + "eval_rewards/margins": 0.051330603659152985, + "eval_rewards/rejected": -0.1124156191945076, + "eval_runtime": 714.8462, + "eval_samples_per_second": 2.798, + "eval_steps_per_second": 1.399, + "step": 1900 + }, + { + "epoch": 0.12, + "learning_rate": 4.9905405812364014e-06, + "logits/chosen": -2.2534711360931396, + "logits/rejected": -2.238356828689575, + "logps/chosen": -213.00283813476562, + "logps/rejected": -226.8634490966797, + "loss": 0.0351, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05353314429521561, + "rewards/margins": 0.05697651952505112, + "rewards/rejected": -0.11050967127084732, + "step": 1910 + }, + { + "epoch": 0.13, + "learning_rate": 4.990037841692791e-06, + "logits/chosen": -2.214384078979492, + "logits/rejected": -1.9799760580062866, + "logps/chosen": -217.17306518554688, + "logps/rejected": -192.190185546875, + "loss": 0.046, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05566094070672989, + "rewards/margins": 0.09150851517915726, + "rewards/rejected": -0.14716944098472595, + "step": 1920 + }, + { + "epoch": 0.13, + "learning_rate": 4.989522112892039e-06, + "logits/chosen": -2.2701072692871094, + "logits/rejected": -2.235013246536255, + "logps/chosen": -212.87271118164062, + "logps/rejected": -225.2215118408203, + "loss": 0.0411, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.07621657103300095, + "rewards/margins": 0.06619422137737274, + "rewards/rejected": -0.1424107849597931, + "step": 1930 + }, + { + "epoch": 0.13, + "learning_rate": 4.98899339752444e-06, + "logits/chosen": -2.3187501430511475, + "logits/rejected": -2.0728302001953125, + "logps/chosen": -236.3536834716797, + "logps/rejected": -225.99551391601562, + "loss": 0.0484, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0337567999958992, + "rewards/margins": 0.09292588382959366, + "rewards/rejected": -0.12668268382549286, + "step": 1940 + }, + { + "epoch": 0.13, + "learning_rate": 4.988451698348033e-06, + "logits/chosen": -2.243349552154541, + "logits/rejected": -2.227822780609131, + "logps/chosen": -185.9561309814453, + "logps/rejected": -213.9745330810547, + "loss": 0.0379, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.038142912089824677, + "rewards/margins": 0.03874513879418373, + "rewards/rejected": -0.0768880546092987, + "step": 1950 + }, + { + "epoch": 0.13, + "learning_rate": 4.987897018188585e-06, + "logits/chosen": -2.235739231109619, + "logits/rejected": -1.9942162036895752, + "logps/chosen": -229.75186157226562, + "logps/rejected": -184.48373413085938, + "loss": 0.0256, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03566255420446396, + "rewards/margins": 0.05703993886709213, + "rewards/rejected": -0.09270249307155609, + "step": 1960 + }, + { + "epoch": 0.13, + "learning_rate": 4.9873293599395814e-06, + "logits/chosen": -2.2663984298706055, + "logits/rejected": -2.1350181102752686, + "logps/chosen": -200.5733184814453, + "logps/rejected": -212.0470733642578, + "loss": 0.0495, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03570770472288132, + "rewards/margins": 0.07728613913059235, + "rewards/rejected": -0.11299383640289307, + "step": 1970 + }, + { + "epoch": 0.13, + "learning_rate": 4.986748726562203e-06, + "logits/chosen": -2.293747663497925, + "logits/rejected": -2.1642115116119385, + "logps/chosen": -221.1876983642578, + "logps/rejected": -205.5155029296875, + "loss": 0.0261, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.026666143909096718, + "rewards/margins": 0.046100765466690063, + "rewards/rejected": -0.07276691496372223, + "step": 1980 + }, + { + "epoch": 0.13, + "learning_rate": 4.98615512108532e-06, + "logits/chosen": -2.384824275970459, + "logits/rejected": -2.2395741939544678, + "logps/chosen": -225.223388671875, + "logps/rejected": -237.7670135498047, + "loss": 0.0459, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.018563272431492805, + "rewards/margins": 0.05380113795399666, + "rewards/rejected": -0.07236441224813461, + "step": 1990 + }, + { + "epoch": 0.13, + "learning_rate": 4.985548546605469e-06, + "logits/chosen": -2.1302530765533447, + "logits/rejected": -2.267808437347412, + "logps/chosen": -227.83688354492188, + "logps/rejected": -253.89321899414062, + "loss": 0.0183, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07365544140338898, + "rewards/margins": 0.0420791395008564, + "rewards/rejected": -0.11573459208011627, + "step": 2000 + }, + { + "epoch": 0.13, + "eval_logits/chosen": -2.3084776401519775, + "eval_logits/rejected": -2.121295928955078, + "eval_logps/chosen": -244.44662475585938, + "eval_logps/rejected": -238.9348907470703, + "eval_loss": 0.030150586739182472, + "eval_rewards/accuracies": 0.6499999761581421, + "eval_rewards/chosen": -0.062208425253629684, + "eval_rewards/margins": 0.07440651953220367, + "eval_rewards/rejected": -0.13661494851112366, + "eval_runtime": 712.1385, + "eval_samples_per_second": 2.808, + "eval_steps_per_second": 1.404, + "step": 2000 + }, + { + "epoch": 0.13, + "learning_rate": 4.984929006286838e-06, + "logits/chosen": -2.146847724914551, + "logits/rejected": -2.093820333480835, + "logps/chosen": -222.1940460205078, + "logps/rejected": -230.22390747070312, + "loss": 0.0533, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.06440014392137527, + "rewards/margins": 0.022867945954203606, + "rewards/rejected": -0.08726808428764343, + "step": 2010 + }, + { + "epoch": 0.13, + "learning_rate": 4.984296503361256e-06, + "logits/chosen": -2.3741021156311035, + "logits/rejected": -2.0217337608337402, + "logps/chosen": -210.68020629882812, + "logps/rejected": -181.44540405273438, + "loss": 0.0185, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04380156099796295, + "rewards/margins": 0.06384517252445221, + "rewards/rejected": -0.10764674842357635, + "step": 2020 + }, + { + "epoch": 0.13, + "learning_rate": 4.9836510411281645e-06, + "logits/chosen": -2.2192230224609375, + "logits/rejected": -2.1009650230407715, + "logps/chosen": -286.519287109375, + "logps/rejected": -271.2211608886719, + "loss": 0.028, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.03993731737136841, + "rewards/margins": 0.12052154541015625, + "rewards/rejected": -0.16045884788036346, + "step": 2030 + }, + { + "epoch": 0.13, + "learning_rate": 4.982992622954613e-06, + "logits/chosen": -2.352003812789917, + "logits/rejected": -2.058192729949951, + "logps/chosen": -291.65899658203125, + "logps/rejected": -192.23446655273438, + "loss": 0.0452, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0406540110707283, + "rewards/margins": 0.07372018694877625, + "rewards/rejected": -0.11437419801950455, + "step": 2040 + }, + { + "epoch": 0.13, + "learning_rate": 4.9823212522752325e-06, + "logits/chosen": -2.445817470550537, + "logits/rejected": -2.1775293350219727, + "logps/chosen": -280.30621337890625, + "logps/rejected": -267.19482421875, + "loss": 0.0254, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.012462446466088295, + "rewards/margins": 0.10462311655282974, + "rewards/rejected": -0.11708555370569229, + "step": 2050 + }, + { + "epoch": 0.13, + "learning_rate": 4.981636932592222e-06, + "logits/chosen": -2.201385021209717, + "logits/rejected": -2.096060276031494, + "logps/chosen": -209.18569946289062, + "logps/rejected": -218.5018768310547, + "loss": 0.0178, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.018636580556631088, + "rewards/margins": 0.06668587028980255, + "rewards/rejected": -0.04804928973317146, + "step": 2060 + }, + { + "epoch": 0.14, + "learning_rate": 4.980939667475328e-06, + "logits/chosen": -2.4133598804473877, + "logits/rejected": -2.060161590576172, + "logps/chosen": -271.41278076171875, + "logps/rejected": -221.82763671875, + "loss": 0.0222, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01381192822009325, + "rewards/margins": 0.055034469813108444, + "rewards/rejected": -0.041222553700208664, + "step": 2070 + }, + { + "epoch": 0.14, + "learning_rate": 4.980229460561826e-06, + "logits/chosen": -2.2730422019958496, + "logits/rejected": -2.2006144523620605, + "logps/chosen": -217.75338745117188, + "logps/rejected": -225.47537231445312, + "loss": 0.0177, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.001639772206544876, + "rewards/margins": 0.11514203250408173, + "rewards/rejected": -0.11350226402282715, + "step": 2080 + }, + { + "epoch": 0.14, + "learning_rate": 4.979506315556503e-06, + "logits/chosen": -2.2308144569396973, + "logits/rejected": -1.9013233184814453, + "logps/chosen": -286.318359375, + "logps/rejected": -250.24365234375, + "loss": 0.013, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0008690860122442245, + "rewards/margins": 0.07456686347723007, + "rewards/rejected": -0.0736977756023407, + "step": 2090 + }, + { + "epoch": 0.14, + "learning_rate": 4.9787702362316395e-06, + "logits/chosen": -2.2909300327301025, + "logits/rejected": -2.505516529083252, + "logps/chosen": -195.52487182617188, + "logps/rejected": -230.25, + "loss": 0.0235, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.02348853088915348, + "rewards/margins": 0.04605535790324211, + "rewards/rejected": -0.06954388320446014, + "step": 2100 + }, + { + "epoch": 0.14, + "eval_logits/chosen": -2.278724431991577, + "eval_logits/rejected": -2.0933449268341064, + "eval_logps/chosen": -239.6732940673828, + "eval_logps/rejected": -234.6192626953125, + "eval_loss": 0.028895698487758636, + "eval_rewards/accuracies": 0.6474999785423279, + "eval_rewards/chosen": -0.0383417047560215, + "eval_rewards/margins": 0.07669514417648315, + "eval_rewards/rejected": -0.11503685265779495, + "eval_runtime": 712.4011, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.404, + "step": 2100 + }, + { + "epoch": 0.14, + "learning_rate": 4.9780212264269835e-06, + "logits/chosen": -2.210439443588257, + "logits/rejected": -1.9650099277496338, + "logps/chosen": -195.4822998046875, + "logps/rejected": -188.45339965820312, + "loss": 0.0189, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.05331619456410408, + "rewards/margins": 0.05053389072418213, + "rewards/rejected": -0.10385008901357651, + "step": 2110 + }, + { + "epoch": 0.14, + "learning_rate": 4.977259290049739e-06, + "logits/chosen": -2.466681480407715, + "logits/rejected": -1.907947301864624, + "logps/chosen": -293.2063293457031, + "logps/rejected": -251.5894012451172, + "loss": 0.0152, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.031319208443164825, + "rewards/margins": 0.13950778543949127, + "rewards/rejected": -0.1708270013332367, + "step": 2120 + }, + { + "epoch": 0.14, + "learning_rate": 4.976484431074538e-06, + "logits/chosen": -2.2047533988952637, + "logits/rejected": -2.1507277488708496, + "logps/chosen": -201.1926727294922, + "logps/rejected": -195.83145141601562, + "loss": 0.051, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03693586587905884, + "rewards/margins": 0.06120552867650986, + "rewards/rejected": -0.09814140945672989, + "step": 2130 + }, + { + "epoch": 0.14, + "learning_rate": 4.975696653543425e-06, + "logits/chosen": -2.2936761379241943, + "logits/rejected": -2.0483224391937256, + "logps/chosen": -258.71966552734375, + "logps/rejected": -264.5902404785156, + "loss": 0.0289, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04535426199436188, + "rewards/margins": 0.09219890832901001, + "rewards/rejected": -0.13755318522453308, + "step": 2140 + }, + { + "epoch": 0.14, + "learning_rate": 4.974895961565835e-06, + "logits/chosen": -2.20737886428833, + "logits/rejected": -1.882495641708374, + "logps/chosen": -188.53988647460938, + "logps/rejected": -210.54006958007812, + "loss": 0.0309, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06644473224878311, + "rewards/margins": 0.08386100828647614, + "rewards/rejected": -0.15030571818351746, + "step": 2150 + }, + { + "epoch": 0.14, + "learning_rate": 4.974082359318566e-06, + "logits/chosen": -2.206138849258423, + "logits/rejected": -2.036780595779419, + "logps/chosen": -264.5321960449219, + "logps/rejected": -241.1393585205078, + "loss": 0.0221, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.054410744458436966, + "rewards/margins": 0.11043145507574081, + "rewards/rejected": -0.16484220325946808, + "step": 2160 + }, + { + "epoch": 0.14, + "learning_rate": 4.973255851045769e-06, + "logits/chosen": -2.209059238433838, + "logits/rejected": -2.216773509979248, + "logps/chosen": -225.8814697265625, + "logps/rejected": -204.2803955078125, + "loss": 0.038, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04261628910899162, + "rewards/margins": 0.07573209702968597, + "rewards/rejected": -0.11834839731454849, + "step": 2170 + }, + { + "epoch": 0.14, + "learning_rate": 4.972416441058915e-06, + "logits/chosen": -2.144322633743286, + "logits/rejected": -1.9922664165496826, + "logps/chosen": -242.1470947265625, + "logps/rejected": -235.1527862548828, + "loss": 0.0328, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.06195644289255142, + "rewards/margins": 0.09823786467313766, + "rewards/rejected": -0.1601943075656891, + "step": 2180 + }, + { + "epoch": 0.14, + "learning_rate": 4.971564133736777e-06, + "logits/chosen": -2.0853326320648193, + "logits/rejected": -1.9092786312103271, + "logps/chosen": -185.86534118652344, + "logps/rejected": -210.11111450195312, + "loss": 0.0469, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03932350128889084, + "rewards/margins": 0.09632328152656555, + "rewards/rejected": -0.135646790266037, + "step": 2190 + }, + { + "epoch": 0.14, + "learning_rate": 4.970698933525409e-06, + "logits/chosen": -2.394420862197876, + "logits/rejected": -2.1105847358703613, + "logps/chosen": -295.5723571777344, + "logps/rejected": -272.0895080566406, + "loss": 0.0401, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0811271220445633, + "rewards/margins": 0.08094374090433121, + "rewards/rejected": -0.1620708405971527, + "step": 2200 + }, + { + "epoch": 0.14, + "eval_logits/chosen": -2.2756831645965576, + "eval_logits/rejected": -2.089794397354126, + "eval_logps/chosen": -243.54811096191406, + "eval_logps/rejected": -238.95559692382812, + "eval_loss": 0.02840990014374256, + "eval_rewards/accuracies": 0.6370000243186951, + "eval_rewards/chosen": -0.057715822011232376, + "eval_rewards/margins": 0.07900260388851166, + "eval_rewards/rejected": -0.13671842217445374, + "eval_runtime": 711.0099, + "eval_samples_per_second": 2.813, + "eval_steps_per_second": 1.406, + "step": 2200 + }, + { + "epoch": 0.14, + "learning_rate": 4.969820844938118e-06, + "logits/chosen": -2.3965530395507812, + "logits/rejected": -2.073350429534912, + "logps/chosen": -232.9188232421875, + "logps/rejected": -195.14671325683594, + "loss": 0.0208, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05157778412103653, + "rewards/margins": 0.08065281808376312, + "rewards/rejected": -0.13223060965538025, + "step": 2210 + }, + { + "epoch": 0.15, + "learning_rate": 4.968929872555444e-06, + "logits/chosen": -1.928214430809021, + "logits/rejected": -2.0975098609924316, + "logps/chosen": -236.0997772216797, + "logps/rejected": -281.67559814453125, + "loss": 0.033, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.09482688456773758, + "rewards/margins": 0.049563802778720856, + "rewards/rejected": -0.14439070224761963, + "step": 2220 + }, + { + "epoch": 0.15, + "learning_rate": 4.968026021025137e-06, + "logits/chosen": -2.3393139839172363, + "logits/rejected": -2.1214444637298584, + "logps/chosen": -209.51931762695312, + "logps/rejected": -189.69772338867188, + "loss": 0.0226, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.039772000163793564, + "rewards/margins": 0.1012188047170639, + "rewards/rejected": -0.14099080860614777, + "step": 2230 + }, + { + "epoch": 0.15, + "learning_rate": 4.967109295062128e-06, + "logits/chosen": -2.1921162605285645, + "logits/rejected": -1.9794528484344482, + "logps/chosen": -234.14883422851562, + "logps/rejected": -273.10064697265625, + "loss": 0.0219, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04573686048388481, + "rewards/margins": 0.09015445411205292, + "rewards/rejected": -0.13589131832122803, + "step": 2240 + }, + { + "epoch": 0.15, + "learning_rate": 4.966179699448509e-06, + "logits/chosen": -2.1765666007995605, + "logits/rejected": -1.9855120182037354, + "logps/chosen": -201.95956420898438, + "logps/rejected": -190.47109985351562, + "loss": 0.042, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05602306127548218, + "rewards/margins": 0.03168831020593643, + "rewards/rejected": -0.0877113789319992, + "step": 2250 + }, + { + "epoch": 0.15, + "learning_rate": 4.965237239033506e-06, + "logits/chosen": -2.3550071716308594, + "logits/rejected": -2.1704366207122803, + "logps/chosen": -295.1136169433594, + "logps/rejected": -280.4603271484375, + "loss": 0.0318, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.02727457322180271, + "rewards/margins": 0.150642991065979, + "rewards/rejected": -0.17791756987571716, + "step": 2260 + }, + { + "epoch": 0.15, + "learning_rate": 4.964281918733453e-06, + "logits/chosen": -2.2895469665527344, + "logits/rejected": -2.046518325805664, + "logps/chosen": -194.8838653564453, + "logps/rejected": -209.9123077392578, + "loss": 0.0393, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.05884706974029541, + "rewards/margins": 0.09742596000432968, + "rewards/rejected": -0.1562730371952057, + "step": 2270 + }, + { + "epoch": 0.15, + "learning_rate": 4.9633137435317715e-06, + "logits/chosen": -2.292795181274414, + "logits/rejected": -1.662411093711853, + "logps/chosen": -234.4496612548828, + "logps/rejected": -186.72970581054688, + "loss": 0.0241, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.05777313560247421, + "rewards/margins": 0.10032601654529572, + "rewards/rejected": -0.15809914469718933, + "step": 2280 + }, + { + "epoch": 0.15, + "learning_rate": 4.9623327184789355e-06, + "logits/chosen": -2.3812899589538574, + "logits/rejected": -2.300156831741333, + "logps/chosen": -227.98934936523438, + "logps/rejected": -233.48876953125, + "loss": 0.0191, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05124448984861374, + "rewards/margins": 0.05162835866212845, + "rewards/rejected": -0.10287284851074219, + "step": 2290 + }, + { + "epoch": 0.15, + "learning_rate": 4.9613388486924525e-06, + "logits/chosen": -1.9750115871429443, + "logits/rejected": -2.1227240562438965, + "logps/chosen": -191.4607391357422, + "logps/rejected": -226.56640625, + "loss": 0.0257, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05663357302546501, + "rewards/margins": 0.088627889752388, + "rewards/rejected": -0.1452614665031433, + "step": 2300 + }, + { + "epoch": 0.15, + "eval_logits/chosen": -2.2834084033966064, + "eval_logits/rejected": -2.0974628925323486, + "eval_logps/chosen": -236.52151489257812, + "eval_logps/rejected": -233.99488830566406, + "eval_loss": 0.03041422739624977, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -0.022582821547985077, + "eval_rewards/margins": 0.08933208882808685, + "eval_rewards/rejected": -0.11191490292549133, + "eval_runtime": 712.7513, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 2300 + }, + { + "epoch": 0.15, + "learning_rate": 4.960332139356834e-06, + "logits/chosen": -2.243394136428833, + "logits/rejected": -2.0578529834747314, + "logps/chosen": -214.17446899414062, + "logps/rejected": -208.0134735107422, + "loss": 0.0534, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.026787936687469482, + "rewards/margins": 0.10139461606740952, + "rewards/rejected": -0.1281825602054596, + "step": 2310 + }, + { + "epoch": 0.15, + "learning_rate": 4.95931259572357e-06, + "logits/chosen": -2.3460609912872314, + "logits/rejected": -1.9933313131332397, + "logps/chosen": -236.15878295898438, + "logps/rejected": -278.4571838378906, + "loss": 0.0318, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.009593973867595196, + "rewards/margins": 0.08312395960092545, + "rewards/rejected": -0.09271793067455292, + "step": 2320 + }, + { + "epoch": 0.15, + "learning_rate": 4.9582802231111e-06, + "logits/chosen": -2.1679487228393555, + "logits/rejected": -2.2463414669036865, + "logps/chosen": -211.53768920898438, + "logps/rejected": -200.51268005371094, + "loss": 0.0361, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.001682019210420549, + "rewards/margins": 0.0792674571275711, + "rewards/rejected": -0.07758542895317078, + "step": 2330 + }, + { + "epoch": 0.15, + "learning_rate": 4.957235026904782e-06, + "logits/chosen": -2.340217113494873, + "logits/rejected": -2.0187582969665527, + "logps/chosen": -258.4695129394531, + "logps/rejected": -216.89511108398438, + "loss": 0.0192, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.018414536491036415, + "rewards/margins": 0.04770408570766449, + "rewards/rejected": -0.029289543628692627, + "step": 2340 + }, + { + "epoch": 0.15, + "learning_rate": 4.956177012556875e-06, + "logits/chosen": -2.4300289154052734, + "logits/rejected": -2.1952102184295654, + "logps/chosen": -245.9125213623047, + "logps/rejected": -190.65444946289062, + "loss": 0.0307, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008598506450653076, + "rewards/margins": 0.08154983818531036, + "rewards/rejected": -0.09014834463596344, + "step": 2350 + }, + { + "epoch": 0.15, + "learning_rate": 4.9551061855864976e-06, + "logits/chosen": -2.0655150413513184, + "logits/rejected": -2.118483066558838, + "logps/chosen": -196.32266235351562, + "logps/rejected": -212.44137573242188, + "loss": 0.0327, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.01747271604835987, + "rewards/margins": 0.0595405288040638, + "rewards/rejected": -0.07701323926448822, + "step": 2360 + }, + { + "epoch": 0.16, + "learning_rate": 4.95402255157961e-06, + "logits/chosen": -2.1108739376068115, + "logits/rejected": -2.167656898498535, + "logps/chosen": -183.8750762939453, + "logps/rejected": -268.9769287109375, + "loss": 0.035, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005905964411795139, + "rewards/margins": 0.10277839750051498, + "rewards/rejected": -0.09687243402004242, + "step": 2370 + }, + { + "epoch": 0.16, + "learning_rate": 4.952926116188977e-06, + "logits/chosen": -2.406338691711426, + "logits/rejected": -2.3419036865234375, + "logps/chosen": -189.09738159179688, + "logps/rejected": -236.28744506835938, + "loss": 0.0557, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.028600236400961876, + "rewards/margins": 0.039927661418914795, + "rewards/rejected": -0.06852789968252182, + "step": 2380 + }, + { + "epoch": 0.16, + "learning_rate": 4.951816885134143e-06, + "logits/chosen": -2.2776050567626953, + "logits/rejected": -2.2614612579345703, + "logps/chosen": -205.91110229492188, + "logps/rejected": -219.66207885742188, + "loss": 0.0334, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02179075963795185, + "rewards/margins": 0.07305854558944702, + "rewards/rejected": -0.09484930336475372, + "step": 2390 + }, + { + "epoch": 0.16, + "learning_rate": 4.950694864201399e-06, + "logits/chosen": -2.29520845413208, + "logits/rejected": -2.2112789154052734, + "logps/chosen": -237.30136108398438, + "logps/rejected": -254.004638671875, + "loss": 0.0339, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.006902826018631458, + "rewards/margins": 0.07287013530731201, + "rewards/rejected": -0.06596730649471283, + "step": 2400 + }, + { + "epoch": 0.16, + "eval_logits/chosen": -2.317573070526123, + "eval_logits/rejected": -2.1317877769470215, + "eval_logps/chosen": -230.5072784423828, + "eval_logps/rejected": -222.4461212158203, + "eval_loss": 0.030627042055130005, + "eval_rewards/accuracies": 0.6349999904632568, + "eval_rewards/chosen": 0.0074883149936795235, + "eval_rewards/margins": 0.06165945902466774, + "eval_rewards/rejected": -0.05417114123702049, + "eval_runtime": 714.8382, + "eval_samples_per_second": 2.798, + "eval_steps_per_second": 1.399, + "step": 2400 + }, + { + "epoch": 0.16, + "learning_rate": 4.9495600592437575e-06, + "logits/chosen": -2.366551399230957, + "logits/rejected": -2.2003931999206543, + "logps/chosen": -234.5296173095703, + "logps/rejected": -245.6568145751953, + "loss": 0.0414, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.038175903260707855, + "rewards/margins": 0.03972792625427246, + "rewards/rejected": -0.07790382206439972, + "step": 2410 + }, + { + "epoch": 0.16, + "learning_rate": 4.948412476180917e-06, + "logits/chosen": -2.272407054901123, + "logits/rejected": -1.991207480430603, + "logps/chosen": -190.45693969726562, + "logps/rejected": -184.49008178710938, + "loss": 0.0254, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.031905245035886765, + "rewards/margins": 0.08384934067726135, + "rewards/rejected": -0.11575458198785782, + "step": 2420 + }, + { + "epoch": 0.16, + "learning_rate": 4.947252120999232e-06, + "logits/chosen": -2.3069729804992676, + "logits/rejected": -2.0320534706115723, + "logps/chosen": -272.39654541015625, + "logps/rejected": -220.7366180419922, + "loss": 0.0386, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02645879052579403, + "rewards/margins": 0.03556925058364868, + "rewards/rejected": -0.06202805042266846, + "step": 2430 + }, + { + "epoch": 0.16, + "learning_rate": 4.946078999751683e-06, + "logits/chosen": -2.2218480110168457, + "logits/rejected": -2.1495237350463867, + "logps/chosen": -176.87838745117188, + "logps/rejected": -168.43165588378906, + "loss": 0.0289, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.013743760995566845, + "rewards/margins": 0.06554745137691498, + "rewards/rejected": -0.051803696900606155, + "step": 2440 + }, + { + "epoch": 0.16, + "learning_rate": 4.944893118557847e-06, + "logits/chosen": -2.143193244934082, + "logits/rejected": -2.1038451194763184, + "logps/chosen": -206.0088348388672, + "logps/rejected": -165.9881591796875, + "loss": 0.0371, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.002378863049671054, + "rewards/margins": 0.06834892183542252, + "rewards/rejected": -0.06597007066011429, + "step": 2450 + }, + { + "epoch": 0.16, + "learning_rate": 4.943694483603861e-06, + "logits/chosen": -2.440500259399414, + "logits/rejected": -2.0264945030212402, + "logps/chosen": -224.66738891601562, + "logps/rejected": -187.57005310058594, + "loss": 0.0249, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.006683288607746363, + "rewards/margins": 0.06940829753875732, + "rewards/rejected": -0.0627250224351883, + "step": 2460 + }, + { + "epoch": 0.16, + "learning_rate": 4.9424831011423914e-06, + "logits/chosen": -2.3970818519592285, + "logits/rejected": -2.305779218673706, + "logps/chosen": -292.70465087890625, + "logps/rejected": -259.5212097167969, + "loss": 0.0325, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.021390587091445923, + "rewards/margins": 0.019155841320753098, + "rewards/rejected": -0.04054642841219902, + "step": 2470 + }, + { + "epoch": 0.16, + "learning_rate": 4.9412589774926015e-06, + "logits/chosen": -2.3957889080047607, + "logits/rejected": -2.0970237255096436, + "logps/chosen": -278.4657287597656, + "logps/rejected": -246.1154327392578, + "loss": 0.0562, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.021379968151450157, + "rewards/margins": 0.0913659855723381, + "rewards/rejected": -0.11274596303701401, + "step": 2480 + }, + { + "epoch": 0.16, + "learning_rate": 4.940022119040121e-06, + "logits/chosen": -2.4432716369628906, + "logits/rejected": -2.1419849395751953, + "logps/chosen": -291.38311767578125, + "logps/rejected": -272.6324462890625, + "loss": 0.0321, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.002664142055436969, + "rewards/margins": 0.04533766582608223, + "rewards/rejected": -0.04267352074384689, + "step": 2490 + }, + { + "epoch": 0.16, + "learning_rate": 4.93877253223701e-06, + "logits/chosen": -2.3866477012634277, + "logits/rejected": -2.1521363258361816, + "logps/chosen": -288.61883544921875, + "logps/rejected": -267.2017822265625, + "loss": 0.0132, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.018266785889863968, + "rewards/margins": 0.05077819898724556, + "rewards/rejected": -0.03251141309738159, + "step": 2500 + }, + { + "epoch": 0.16, + "eval_logits/chosen": -2.3256454467773438, + "eval_logits/rejected": -2.139024496078491, + "eval_logps/chosen": -232.43704223632812, + "eval_logps/rejected": -223.72633361816406, + "eval_loss": 0.03118470311164856, + "eval_rewards/accuracies": 0.6349999904632568, + "eval_rewards/chosen": -0.002160488162189722, + "eval_rewards/margins": 0.0584116131067276, + "eval_rewards/rejected": -0.060572102665901184, + "eval_runtime": 711.145, + "eval_samples_per_second": 2.812, + "eval_steps_per_second": 1.406, + "step": 2500 + }, + { + "epoch": 0.16, + "learning_rate": 4.937510223601725e-06, + "logits/chosen": -2.5465502738952637, + "logits/rejected": -2.379866123199463, + "logps/chosen": -258.18267822265625, + "logps/rejected": -223.7237091064453, + "loss": 0.0302, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0217165295034647, + "rewards/margins": 0.038762204349040985, + "rewards/rejected": -0.01704566739499569, + "step": 2510 + }, + { + "epoch": 0.16, + "learning_rate": 4.936235199719085e-06, + "logits/chosen": -2.3703575134277344, + "logits/rejected": -2.2414231300354004, + "logps/chosen": -171.20152282714844, + "logps/rejected": -155.46945190429688, + "loss": 0.0306, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.005502406507730484, + "rewards/margins": 0.07223564386367798, + "rewards/rejected": -0.0667332261800766, + "step": 2520 + }, + { + "epoch": 0.17, + "learning_rate": 4.93494746724024e-06, + "logits/chosen": -2.3545944690704346, + "logits/rejected": -2.0969889163970947, + "logps/chosen": -224.9967498779297, + "logps/rejected": -256.32989501953125, + "loss": 0.0275, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.005961322691291571, + "rewards/margins": 0.06936588883399963, + "rewards/rejected": -0.07532721757888794, + "step": 2530 + }, + { + "epoch": 0.17, + "learning_rate": 4.933647032882635e-06, + "logits/chosen": -2.496915340423584, + "logits/rejected": -2.186249256134033, + "logps/chosen": -244.2103729248047, + "logps/rejected": -214.40121459960938, + "loss": 0.0251, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0032573645003139973, + "rewards/margins": 0.07305190712213516, + "rewards/rejected": -0.06979455053806305, + "step": 2540 + }, + { + "epoch": 0.17, + "learning_rate": 4.932333903429969e-06, + "logits/chosen": -2.1586389541625977, + "logits/rejected": -2.011641263961792, + "logps/chosen": -192.63626098632812, + "logps/rejected": -168.6728515625, + "loss": 0.0251, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.002231207210570574, + "rewards/margins": 0.009213193319737911, + "rewards/rejected": -0.006981985177844763, + "step": 2550 + }, + { + "epoch": 0.17, + "learning_rate": 4.931008085732172e-06, + "logits/chosen": -2.3497612476348877, + "logits/rejected": -1.9341261386871338, + "logps/chosen": -200.062255859375, + "logps/rejected": -161.38348388671875, + "loss": 0.0282, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0138033302500844, + "rewards/margins": 0.056612949818372726, + "rewards/rejected": -0.0428096204996109, + "step": 2560 + }, + { + "epoch": 0.17, + "learning_rate": 4.9296695867053565e-06, + "logits/chosen": -2.30879807472229, + "logits/rejected": -2.090181827545166, + "logps/chosen": -297.36773681640625, + "logps/rejected": -243.02755737304688, + "loss": 0.0177, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.016169043257832527, + "rewards/margins": 0.05809883400797844, + "rewards/rejected": -0.041929781436920166, + "step": 2570 + }, + { + "epoch": 0.17, + "learning_rate": 4.928318413331791e-06, + "logits/chosen": -2.3878073692321777, + "logits/rejected": -2.1812081336975098, + "logps/chosen": -211.0451202392578, + "logps/rejected": -205.4989776611328, + "loss": 0.0377, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.000909944239538163, + "rewards/margins": 0.05219554901123047, + "rewards/rejected": -0.051285602152347565, + "step": 2580 + }, + { + "epoch": 0.17, + "learning_rate": 4.926954572659855e-06, + "logits/chosen": -2.166869878768921, + "logits/rejected": -2.1766741275787354, + "logps/chosen": -237.266357421875, + "logps/rejected": -273.09771728515625, + "loss": 0.0334, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.024646395817399025, + "rewards/margins": 0.09322737157344818, + "rewards/rejected": -0.06858097016811371, + "step": 2590 + }, + { + "epoch": 0.17, + "learning_rate": 4.925578071804013e-06, + "logits/chosen": -2.1840600967407227, + "logits/rejected": -2.1330018043518066, + "logps/chosen": -234.78652954101562, + "logps/rejected": -307.0347900390625, + "loss": 0.0196, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.025403816252946854, + "rewards/margins": 0.07418211549520493, + "rewards/rejected": -0.09958592802286148, + "step": 2600 + }, + { + "epoch": 0.17, + "eval_logits/chosen": -2.287087917327881, + "eval_logits/rejected": -2.1024510860443115, + "eval_logps/chosen": -233.3758544921875, + "eval_logps/rejected": -227.7709503173828, + "eval_loss": 0.028082743287086487, + "eval_rewards/accuracies": 0.6499999761581421, + "eval_rewards/chosen": -0.006854598876088858, + "eval_rewards/margins": 0.07394073158502579, + "eval_rewards/rejected": -0.08079533278942108, + "eval_runtime": 714.8887, + "eval_samples_per_second": 2.798, + "eval_steps_per_second": 1.399, + "step": 2600 + }, + { + "epoch": 0.17, + "learning_rate": 4.924188917944763e-06, + "logits/chosen": -2.354065418243408, + "logits/rejected": -2.1704587936401367, + "logps/chosen": -219.37435913085938, + "logps/rejected": -207.4625701904297, + "loss": 0.0297, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.003153681056573987, + "rewards/margins": 0.10804332792758942, + "rewards/rejected": -0.1048896461725235, + "step": 2610 + }, + { + "epoch": 0.17, + "learning_rate": 4.922787118328617e-06, + "logits/chosen": -2.376201629638672, + "logits/rejected": -2.0523414611816406, + "logps/chosen": -235.42984008789062, + "logps/rejected": -170.25169372558594, + "loss": 0.0351, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.027060797438025475, + "rewards/margins": 0.055673640221357346, + "rewards/rejected": -0.08273444324731827, + "step": 2620 + }, + { + "epoch": 0.17, + "learning_rate": 4.921372680268045e-06, + "logits/chosen": -2.3491272926330566, + "logits/rejected": -2.028884172439575, + "logps/chosen": -239.7322540283203, + "logps/rejected": -218.17904663085938, + "loss": 0.0337, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.048637766391038895, + "rewards/margins": 0.044077835977077484, + "rewards/rejected": -0.09271560609340668, + "step": 2630 + }, + { + "epoch": 0.17, + "learning_rate": 4.919945611141451e-06, + "logits/chosen": -2.4218690395355225, + "logits/rejected": -2.043612003326416, + "logps/chosen": -225.4250030517578, + "logps/rejected": -177.67538452148438, + "loss": 0.0324, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0009518010774627328, + "rewards/margins": 0.05384860187768936, + "rewards/rejected": -0.054800402373075485, + "step": 2640 + }, + { + "epoch": 0.17, + "learning_rate": 4.918505918393125e-06, + "logits/chosen": -2.279812812805176, + "logits/rejected": -2.1329543590545654, + "logps/chosen": -170.7287139892578, + "logps/rejected": -207.5980224609375, + "loss": 0.0429, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.00849010981619358, + "rewards/margins": 0.06747962534427643, + "rewards/rejected": -0.07596974074840546, + "step": 2650 + }, + { + "epoch": 0.17, + "learning_rate": 4.91705360953321e-06, + "logits/chosen": -2.3256969451904297, + "logits/rejected": -2.0999441146850586, + "logps/chosen": -254.2303924560547, + "logps/rejected": -237.78878784179688, + "loss": 0.0325, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04371384158730507, + "rewards/margins": 0.08106885105371475, + "rewards/rejected": -0.12478268146514893, + "step": 2660 + }, + { + "epoch": 0.17, + "learning_rate": 4.9155886921376615e-06, + "logits/chosen": -2.2445991039276123, + "logits/rejected": -2.2022550106048584, + "logps/chosen": -216.46865844726562, + "logps/rejected": -250.0487060546875, + "loss": 0.0522, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05250120908021927, + "rewards/margins": 0.05955871194601059, + "rewards/rejected": -0.11205992847681046, + "step": 2670 + }, + { + "epoch": 0.18, + "learning_rate": 4.914111173848205e-06, + "logits/chosen": -2.3046090602874756, + "logits/rejected": -2.2570502758026123, + "logps/chosen": -238.754150390625, + "logps/rejected": -233.6313934326172, + "loss": 0.0217, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.028612712398171425, + "rewards/margins": 0.05147603899240494, + "rewards/rejected": -0.08008874207735062, + "step": 2680 + }, + { + "epoch": 0.18, + "learning_rate": 4.9126210623723e-06, + "logits/chosen": -2.085294246673584, + "logits/rejected": -2.254727840423584, + "logps/chosen": -202.56884765625, + "logps/rejected": -253.7330322265625, + "loss": 0.023, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.010977232828736305, + "rewards/margins": 0.08696094900369644, + "rewards/rejected": -0.0979381650686264, + "step": 2690 + }, + { + "epoch": 0.18, + "learning_rate": 4.911118365483098e-06, + "logits/chosen": -2.181164264678955, + "logits/rejected": -2.2666800022125244, + "logps/chosen": -209.20834350585938, + "logps/rejected": -235.704833984375, + "loss": 0.0317, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.0372706837952137, + "rewards/margins": 0.08535125106573105, + "rewards/rejected": -0.12262193113565445, + "step": 2700 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -2.294182062149048, + "eval_logits/rejected": -2.1089656352996826, + "eval_logps/chosen": -238.58056640625, + "eval_logps/rejected": -232.78578186035156, + "eval_loss": 0.027993008494377136, + "eval_rewards/accuracies": 0.6545000076293945, + "eval_rewards/chosen": -0.03287803754210472, + "eval_rewards/margins": 0.07299138605594635, + "eval_rewards/rejected": -0.10586943477392197, + "eval_runtime": 713.5186, + "eval_samples_per_second": 2.803, + "eval_steps_per_second": 1.402, + "step": 2700 + }, + { + "epoch": 0.18, + "learning_rate": 4.909603091019403e-06, + "logits/chosen": -2.470644235610962, + "logits/rejected": -2.1014533042907715, + "logps/chosen": -237.65869140625, + "logps/rejected": -215.0699005126953, + "loss": 0.0133, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.006406778935343027, + "rewards/margins": 0.0701574832201004, + "rewards/rejected": -0.07656426727771759, + "step": 2710 + }, + { + "epoch": 0.18, + "learning_rate": 4.908075246885626e-06, + "logits/chosen": -2.247979164123535, + "logits/rejected": -2.1756398677825928, + "logps/chosen": -155.73165893554688, + "logps/rejected": -134.16079711914062, + "loss": 0.0646, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02502075955271721, + "rewards/margins": 0.0291023887693882, + "rewards/rejected": -0.054123152047395706, + "step": 2720 + }, + { + "epoch": 0.18, + "learning_rate": 4.906534841051755e-06, + "logits/chosen": -2.1049163341522217, + "logits/rejected": -2.167664051055908, + "logps/chosen": -247.0775604248047, + "logps/rejected": -268.4331970214844, + "loss": 0.0212, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.018370602279901505, + "rewards/margins": 0.051600076258182526, + "rewards/rejected": -0.06997067481279373, + "step": 2730 + }, + { + "epoch": 0.18, + "learning_rate": 4.904981881553297e-06, + "logits/chosen": -2.3743033409118652, + "logits/rejected": -2.0446043014526367, + "logps/chosen": -228.78414916992188, + "logps/rejected": -174.08529663085938, + "loss": 0.0243, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.022939365357160568, + "rewards/margins": 0.0470966100692749, + "rewards/rejected": -0.07003597170114517, + "step": 2740 + }, + { + "epoch": 0.18, + "learning_rate": 4.903416376491252e-06, + "logits/chosen": -2.369668960571289, + "logits/rejected": -1.9818544387817383, + "logps/chosen": -283.78900146484375, + "logps/rejected": -263.84686279296875, + "loss": 0.026, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.027483994141221046, + "rewards/margins": 0.10324974358081818, + "rewards/rejected": -0.13073372840881348, + "step": 2750 + }, + { + "epoch": 0.18, + "learning_rate": 4.90183833403206e-06, + "logits/chosen": -2.4412343502044678, + "logits/rejected": -2.286303997039795, + "logps/chosen": -268.1914978027344, + "logps/rejected": -252.0909423828125, + "loss": 0.0257, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.019891971722245216, + "rewards/margins": 0.10076215118169785, + "rewards/rejected": -0.12065412104129791, + "step": 2760 + }, + { + "epoch": 0.18, + "learning_rate": 4.900247762407564e-06, + "logits/chosen": -2.191983938217163, + "logits/rejected": -1.9838281869888306, + "logps/chosen": -181.60751342773438, + "logps/rejected": -223.0465850830078, + "loss": 0.0216, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04657771438360214, + "rewards/margins": 0.09698217362165451, + "rewards/rejected": -0.14355987310409546, + "step": 2770 + }, + { + "epoch": 0.18, + "learning_rate": 4.898644669914965e-06, + "logits/chosen": -2.253765821456909, + "logits/rejected": -2.1682329177856445, + "logps/chosen": -232.3789825439453, + "logps/rejected": -233.66531372070312, + "loss": 0.0279, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04153277724981308, + "rewards/margins": 0.07556191831827164, + "rewards/rejected": -0.11709471046924591, + "step": 2780 + }, + { + "epoch": 0.18, + "learning_rate": 4.897029064916778e-06, + "logits/chosen": -2.0696494579315186, + "logits/rejected": -1.8728134632110596, + "logps/chosen": -218.97836303710938, + "logps/rejected": -221.40316772460938, + "loss": 0.0279, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.054142218083143234, + "rewards/margins": 0.07406821846961975, + "rewards/rejected": -0.1282104253768921, + "step": 2790 + }, + { + "epoch": 0.18, + "learning_rate": 4.895400955840791e-06, + "logits/chosen": -2.437290668487549, + "logits/rejected": -1.7853336334228516, + "logps/chosen": -222.3488006591797, + "logps/rejected": -196.37698364257812, + "loss": 0.036, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.009987411089241505, + "rewards/margins": 0.07862985879182816, + "rewards/rejected": -0.08861726522445679, + "step": 2800 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -2.2897098064422607, + "eval_logits/rejected": -2.105027437210083, + "eval_logps/chosen": -235.5567169189453, + "eval_logps/rejected": -228.9888153076172, + "eval_loss": 0.02788878232240677, + "eval_rewards/accuracies": 0.6365000009536743, + "eval_rewards/chosen": -0.017758876085281372, + "eval_rewards/margins": 0.06912563741207123, + "eval_rewards/rejected": -0.0868845209479332, + "eval_runtime": 712.3028, + "eval_samples_per_second": 2.808, + "eval_steps_per_second": 1.404, + "step": 2800 + }, + { + "epoch": 0.18, + "learning_rate": 4.893760351180018e-06, + "logits/chosen": -2.28792142868042, + "logits/rejected": -2.2265536785125732, + "logps/chosen": -203.72744750976562, + "logps/rejected": -219.5680389404297, + "loss": 0.0225, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.025930937379598618, + "rewards/margins": 0.05369790643453598, + "rewards/rejected": -0.0796288475394249, + "step": 2810 + }, + { + "epoch": 0.18, + "learning_rate": 4.892107259492657e-06, + "logits/chosen": -2.2479248046875, + "logits/rejected": -2.009706974029541, + "logps/chosen": -243.56875610351562, + "logps/rejected": -256.754638671875, + "loss": 0.0203, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0185473021119833, + "rewards/margins": 0.035834766924381256, + "rewards/rejected": -0.054382067173719406, + "step": 2820 + }, + { + "epoch": 0.19, + "learning_rate": 4.890441689402042e-06, + "logits/chosen": -2.3838446140289307, + "logits/rejected": -2.2034523487091064, + "logps/chosen": -338.8334655761719, + "logps/rejected": -308.4311218261719, + "loss": 0.0111, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.01664288341999054, + "rewards/margins": 0.08798809349536896, + "rewards/rejected": -0.1046309843659401, + "step": 2830 + }, + { + "epoch": 0.19, + "learning_rate": 4.888763649596606e-06, + "logits/chosen": -2.3975675106048584, + "logits/rejected": -2.17059326171875, + "logps/chosen": -214.5126190185547, + "logps/rejected": -220.7166290283203, + "loss": 0.0575, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.021754657849669456, + "rewards/margins": 0.07502701133489609, + "rewards/rejected": -0.0967816710472107, + "step": 2840 + }, + { + "epoch": 0.19, + "learning_rate": 4.887073148829824e-06, + "logits/chosen": -2.308504581451416, + "logits/rejected": -2.1595101356506348, + "logps/chosen": -265.9504089355469, + "logps/rejected": -261.0517578125, + "loss": 0.0261, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004765903111547232, + "rewards/margins": 0.09018461406230927, + "rewards/rejected": -0.0854187160730362, + "step": 2850 + }, + { + "epoch": 0.19, + "learning_rate": 4.885370195920177e-06, + "logits/chosen": -2.1859519481658936, + "logits/rejected": -2.124957323074341, + "logps/chosen": -198.3850860595703, + "logps/rejected": -204.52525329589844, + "loss": 0.0366, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06530681997537613, + "rewards/margins": 0.07356669753789902, + "rewards/rejected": -0.13887353241443634, + "step": 2860 + }, + { + "epoch": 0.19, + "learning_rate": 4.883654799751101e-06, + "logits/chosen": -2.1068179607391357, + "logits/rejected": -2.3324809074401855, + "logps/chosen": -222.74520874023438, + "logps/rejected": -269.09893798828125, + "loss": 0.0439, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.035180218517780304, + "rewards/margins": 0.05806810408830643, + "rewards/rejected": -0.09324832260608673, + "step": 2870 + }, + { + "epoch": 0.19, + "learning_rate": 4.8819269692709435e-06, + "logits/chosen": -2.4083595275878906, + "logits/rejected": -2.236450672149658, + "logps/chosen": -275.53363037109375, + "logps/rejected": -219.38052368164062, + "loss": 0.016, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04243696108460426, + "rewards/margins": 0.08104520291090012, + "rewards/rejected": -0.12348216772079468, + "step": 2880 + }, + { + "epoch": 0.19, + "learning_rate": 4.880186713492915e-06, + "logits/chosen": -2.250185251235962, + "logits/rejected": -2.0641236305236816, + "logps/chosen": -243.86288452148438, + "logps/rejected": -200.46041870117188, + "loss": 0.02, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08072765171527863, + "rewards/margins": 0.051097385585308075, + "rewards/rejected": -0.1318250447511673, + "step": 2890 + }, + { + "epoch": 0.19, + "learning_rate": 4.878434041495041e-06, + "logits/chosen": -2.289459466934204, + "logits/rejected": -2.408504009246826, + "logps/chosen": -245.63339233398438, + "logps/rejected": -264.4200439453125, + "loss": 0.0353, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.04233827441930771, + "rewards/margins": 0.08462206274271011, + "rewards/rejected": -0.12696032226085663, + "step": 2900 + }, + { + "epoch": 0.19, + "eval_logits/chosen": -2.284769296646118, + "eval_logits/rejected": -2.0999910831451416, + "eval_logps/chosen": -240.31153869628906, + "eval_logps/rejected": -233.4534912109375, + "eval_loss": 0.02789762057363987, + "eval_rewards/accuracies": 0.6445000171661377, + "eval_rewards/chosen": -0.04153289273381233, + "eval_rewards/margins": 0.06767502427101135, + "eval_rewards/rejected": -0.10920792073011398, + "eval_runtime": 712.6645, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 2900 + }, + { + "epoch": 0.19, + "learning_rate": 4.876668962420117e-06, + "logits/chosen": -2.2638027667999268, + "logits/rejected": -2.0080816745758057, + "logps/chosen": -292.7586975097656, + "logps/rejected": -245.6000518798828, + "loss": 0.0416, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.015538054518401623, + "rewards/margins": 0.062287408858537674, + "rewards/rejected": -0.07782547175884247, + "step": 2910 + }, + { + "epoch": 0.19, + "learning_rate": 4.87489148547566e-06, + "logits/chosen": -2.274672031402588, + "logits/rejected": -2.128838539123535, + "logps/chosen": -270.25103759765625, + "logps/rejected": -244.59774780273438, + "loss": 0.0387, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.05196281149983406, + "rewards/margins": 0.03759818524122238, + "rewards/rejected": -0.08956098556518555, + "step": 2920 + }, + { + "epoch": 0.19, + "learning_rate": 4.873101619933862e-06, + "logits/chosen": -2.487917900085449, + "logits/rejected": -2.1313300132751465, + "logps/chosen": -270.21759033203125, + "logps/rejected": -234.5707550048828, + "loss": 0.0343, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.02845207415521145, + "rewards/margins": 0.06834385544061661, + "rewards/rejected": -0.0967959314584732, + "step": 2930 + }, + { + "epoch": 0.19, + "learning_rate": 4.8712993751315385e-06, + "logits/chosen": -2.271152973175049, + "logits/rejected": -2.21158504486084, + "logps/chosen": -124.94087219238281, + "logps/rejected": -136.87136840820312, + "loss": 0.048, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.02141464687883854, + "rewards/margins": 0.04014318436384201, + "rewards/rejected": -0.0615578293800354, + "step": 2940 + }, + { + "epoch": 0.19, + "learning_rate": 4.869484760470079e-06, + "logits/chosen": -2.31473970413208, + "logits/rejected": -2.109999418258667, + "logps/chosen": -193.41445922851562, + "logps/rejected": -172.94094848632812, + "loss": 0.0164, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.02376813441514969, + "rewards/margins": 0.07912831753492355, + "rewards/rejected": -0.10289645195007324, + "step": 2950 + }, + { + "epoch": 0.19, + "learning_rate": 4.867657785415404e-06, + "logits/chosen": -2.241927146911621, + "logits/rejected": -1.9641263484954834, + "logps/chosen": -258.5636291503906, + "logps/rejected": -236.51513671875, + "loss": 0.0319, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04732293635606766, + "rewards/margins": 0.09848068654537201, + "rewards/rejected": -0.14580364525318146, + "step": 2960 + }, + { + "epoch": 0.19, + "learning_rate": 4.865818459497911e-06, + "logits/chosen": -2.4856972694396973, + "logits/rejected": -2.0244452953338623, + "logps/chosen": -294.99627685546875, + "logps/rejected": -221.91909790039062, + "loss": 0.0242, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05408806353807449, + "rewards/margins": 0.05860002711415291, + "rewards/rejected": -0.1126880869269371, + "step": 2970 + }, + { + "epoch": 0.19, + "learning_rate": 4.863966792312423e-06, + "logits/chosen": -2.355971336364746, + "logits/rejected": -2.1137917041778564, + "logps/chosen": -245.83828735351562, + "logps/rejected": -227.0188446044922, + "loss": 0.0206, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02189687080681324, + "rewards/margins": 0.11115972697734833, + "rewards/rejected": -0.1330566108226776, + "step": 2980 + }, + { + "epoch": 0.2, + "learning_rate": 4.862102793518145e-06, + "logits/chosen": -2.2027151584625244, + "logits/rejected": -2.2636585235595703, + "logps/chosen": -205.7113494873047, + "logps/rejected": -231.49172973632812, + "loss": 0.0363, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05714733526110649, + "rewards/margins": 0.09427683800458908, + "rewards/rejected": -0.15142419934272766, + "step": 2990 + }, + { + "epoch": 0.2, + "learning_rate": 4.8602264728386075e-06, + "logits/chosen": -2.3041369915008545, + "logits/rejected": -2.1546449661254883, + "logps/chosen": -260.479736328125, + "logps/rejected": -273.4519958496094, + "loss": 0.0259, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03990109637379646, + "rewards/margins": 0.0819544792175293, + "rewards/rejected": -0.12185557186603546, + "step": 3000 + }, + { + "epoch": 0.2, + "eval_logits/chosen": -2.274144172668457, + "eval_logits/rejected": -2.088632345199585, + "eval_logps/chosen": -239.57725524902344, + "eval_logps/rejected": -235.87315368652344, + "eval_loss": 0.028896113857626915, + "eval_rewards/accuracies": 0.6504999995231628, + "eval_rewards/chosen": -0.037861473858356476, + "eval_rewards/margins": 0.08344479650259018, + "eval_rewards/rejected": -0.12130627781152725, + "eval_runtime": 714.9786, + "eval_samples_per_second": 2.797, + "eval_steps_per_second": 1.399, + "step": 3000 + }, + { + "epoch": 0.2, + "learning_rate": 4.858337840061616e-06, + "logits/chosen": -2.265939712524414, + "logits/rejected": -2.188263416290283, + "logps/chosen": -187.27552795410156, + "logps/rejected": -255.88009643554688, + "loss": 0.0266, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02097577229142189, + "rewards/margins": 0.07477692514657974, + "rewards/rejected": -0.09575269371271133, + "step": 3010 + }, + { + "epoch": 0.2, + "learning_rate": 4.856436905039208e-06, + "logits/chosen": -2.2863521575927734, + "logits/rejected": -2.1258704662323, + "logps/chosen": -214.4219970703125, + "logps/rejected": -198.88104248046875, + "loss": 0.0256, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.016084687784314156, + "rewards/margins": 0.09423195570707321, + "rewards/rejected": -0.11031664907932281, + "step": 3020 + }, + { + "epoch": 0.2, + "learning_rate": 4.854523677687588e-06, + "logits/chosen": -2.148622989654541, + "logits/rejected": -2.218465566635132, + "logps/chosen": -191.50128173828125, + "logps/rejected": -225.01953125, + "loss": 0.0247, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0512787401676178, + "rewards/margins": 0.07560839504003525, + "rewards/rejected": -0.12688712775707245, + "step": 3030 + }, + { + "epoch": 0.2, + "learning_rate": 4.85259816798709e-06, + "logits/chosen": -2.374782085418701, + "logits/rejected": -1.833749532699585, + "logps/chosen": -297.7157897949219, + "logps/rejected": -238.33712768554688, + "loss": 0.0243, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.049768656492233276, + "rewards/margins": 0.10588622093200684, + "rewards/rejected": -0.1556548923254013, + "step": 3040 + }, + { + "epoch": 0.2, + "learning_rate": 4.850660385982114e-06, + "logits/chosen": -2.360474109649658, + "logits/rejected": -2.190361499786377, + "logps/chosen": -258.0581359863281, + "logps/rejected": -216.6773223876953, + "loss": 0.047, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.051694877445697784, + "rewards/margins": 0.06950264424085617, + "rewards/rejected": -0.12119752168655396, + "step": 3050 + }, + { + "epoch": 0.2, + "learning_rate": 4.848710341781081e-06, + "logits/chosen": -2.0902516841888428, + "logits/rejected": -2.1962881088256836, + "logps/chosen": -201.42294311523438, + "logps/rejected": -206.622314453125, + "loss": 0.0326, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.13760720193386078, + "rewards/margins": 0.05463032051920891, + "rewards/rejected": -0.1922374963760376, + "step": 3060 + }, + { + "epoch": 0.2, + "learning_rate": 4.846748045556377e-06, + "logits/chosen": -2.2907516956329346, + "logits/rejected": -1.9787200689315796, + "logps/chosen": -262.0197448730469, + "logps/rejected": -215.56582641601562, + "loss": 0.0412, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11266320943832397, + "rewards/margins": 0.0702851191163063, + "rewards/rejected": -0.18294832110404968, + "step": 3070 + }, + { + "epoch": 0.2, + "learning_rate": 4.8447735075442995e-06, + "logits/chosen": -2.1806905269622803, + "logits/rejected": -2.2396817207336426, + "logps/chosen": -227.7964324951172, + "logps/rejected": -241.79281616210938, + "loss": 0.0333, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15117108821868896, + "rewards/margins": 0.08698441833257675, + "rewards/rejected": -0.2381555140018463, + "step": 3080 + }, + { + "epoch": 0.2, + "learning_rate": 4.8427867380450075e-06, + "logits/chosen": -2.3623485565185547, + "logits/rejected": -1.9704208374023438, + "logps/chosen": -254.1536865234375, + "logps/rejected": -228.27432250976562, + "loss": 0.0302, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.11907199770212173, + "rewards/margins": 0.10185201466083527, + "rewards/rejected": -0.2209240198135376, + "step": 3090 + }, + { + "epoch": 0.2, + "learning_rate": 4.840787747422462e-06, + "logits/chosen": -2.329294443130493, + "logits/rejected": -2.075838804244995, + "logps/chosen": -215.3441925048828, + "logps/rejected": -199.28408813476562, + "loss": 0.0362, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.09844044595956802, + "rewards/margins": 0.07098730653524399, + "rewards/rejected": -0.169427752494812, + "step": 3100 + }, + { + "epoch": 0.2, + "eval_logits/chosen": -2.277235269546509, + "eval_logits/rejected": -2.0925114154815674, + "eval_logps/chosen": -254.0054931640625, + "eval_logps/rejected": -249.93931579589844, + "eval_loss": 0.028891319409012794, + "eval_rewards/accuracies": 0.6485000252723694, + "eval_rewards/chosen": -0.1100027933716774, + "eval_rewards/margins": 0.08163423091173172, + "eval_rewards/rejected": -0.19163702428340912, + "eval_runtime": 711.301, + "eval_samples_per_second": 2.812, + "eval_steps_per_second": 1.406, + "step": 3100 + }, + { + "epoch": 0.2, + "learning_rate": 4.838776546104378e-06, + "logits/chosen": -2.254002332687378, + "logits/rejected": -2.2489466667175293, + "logps/chosen": -300.79296875, + "logps/rejected": -279.73236083984375, + "loss": 0.0139, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09919861704111099, + "rewards/margins": 0.09670659154653549, + "rewards/rejected": -0.19590522348880768, + "step": 3110 + }, + { + "epoch": 0.2, + "learning_rate": 4.836753144582168e-06, + "logits/chosen": -2.2163474559783936, + "logits/rejected": -1.9966824054718018, + "logps/chosen": -262.107666015625, + "logps/rejected": -259.97833251953125, + "loss": 0.0334, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.09938045591115952, + "rewards/margins": 0.10991636663675308, + "rewards/rejected": -0.2092967927455902, + "step": 3120 + }, + { + "epoch": 0.2, + "learning_rate": 4.834717553410884e-06, + "logits/chosen": -2.2825305461883545, + "logits/rejected": -2.0499181747436523, + "logps/chosen": -204.13722229003906, + "logps/rejected": -237.09097290039062, + "loss": 0.0197, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.07430613785982132, + "rewards/margins": 0.09642286598682404, + "rewards/rejected": -0.17072899639606476, + "step": 3130 + }, + { + "epoch": 0.21, + "learning_rate": 4.832669783209167e-06, + "logits/chosen": -2.1582999229431152, + "logits/rejected": -2.2071380615234375, + "logps/chosen": -259.179931640625, + "logps/rejected": -264.71295166015625, + "loss": 0.022, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07376949489116669, + "rewards/margins": 0.02492070011794567, + "rewards/rejected": -0.09869018942117691, + "step": 3140 + }, + { + "epoch": 0.21, + "learning_rate": 4.8306098446591895e-06, + "logits/chosen": -1.8651392459869385, + "logits/rejected": -2.0267395973205566, + "logps/chosen": -192.75889587402344, + "logps/rejected": -228.6290740966797, + "loss": 0.0408, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.08245015144348145, + "rewards/margins": 0.05469425767660141, + "rewards/rejected": -0.13714441657066345, + "step": 3150 + }, + { + "epoch": 0.21, + "learning_rate": 4.828537748506601e-06, + "logits/chosen": -2.396944522857666, + "logits/rejected": -2.1304783821105957, + "logps/chosen": -284.6820068359375, + "logps/rejected": -235.306884765625, + "loss": 0.0164, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05641879513859749, + "rewards/margins": 0.034355297684669495, + "rewards/rejected": -0.09077408909797668, + "step": 3160 + }, + { + "epoch": 0.21, + "learning_rate": 4.826453505560469e-06, + "logits/chosen": -2.073132038116455, + "logits/rejected": -2.0445899963378906, + "logps/chosen": -204.7270965576172, + "logps/rejected": -200.5258331298828, + "loss": 0.0339, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.06873993575572968, + "rewards/margins": 0.04747641831636429, + "rewards/rejected": -0.11621636152267456, + "step": 3170 + }, + { + "epoch": 0.21, + "learning_rate": 4.824357126693226e-06, + "logits/chosen": -2.1617677211761475, + "logits/rejected": -1.79251229763031, + "logps/chosen": -271.9390869140625, + "logps/rejected": -233.08486938476562, + "loss": 0.0251, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.06597967445850372, + "rewards/margins": 0.049365781247615814, + "rewards/rejected": -0.11534545570611954, + "step": 3180 + }, + { + "epoch": 0.21, + "learning_rate": 4.8222486228406105e-06, + "logits/chosen": -2.34342622756958, + "logits/rejected": -2.0599923133850098, + "logps/chosen": -221.85867309570312, + "logps/rejected": -201.22842407226562, + "loss": 0.0179, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05309159681200981, + "rewards/margins": 0.07025385648012161, + "rewards/rejected": -0.12334545701742172, + "step": 3190 + }, + { + "epoch": 0.21, + "learning_rate": 4.820128005001612e-06, + "logits/chosen": -2.009340763092041, + "logits/rejected": -1.9286329746246338, + "logps/chosen": -225.3570556640625, + "logps/rejected": -231.0119171142578, + "loss": 0.0319, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.031817398965358734, + "rewards/margins": 0.13598847389221191, + "rewards/rejected": -0.16780588030815125, + "step": 3200 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -2.2569305896759033, + "eval_logits/rejected": -2.074082374572754, + "eval_logps/chosen": -242.5390625, + "eval_logps/rejected": -238.02999877929688, + "eval_loss": 0.028289152309298515, + "eval_rewards/accuracies": 0.6384999752044678, + "eval_rewards/chosen": -0.05267051234841347, + "eval_rewards/margins": 0.07942002266645432, + "eval_rewards/rejected": -0.1320905238389969, + "eval_runtime": 711.7306, + "eval_samples_per_second": 2.81, + "eval_steps_per_second": 1.405, + "step": 3200 + }, + { + "epoch": 0.21, + "learning_rate": 4.817995284238412e-06, + "logits/chosen": -2.0534753799438477, + "logits/rejected": -2.1341710090637207, + "logps/chosen": -207.1750030517578, + "logps/rejected": -253.8484344482422, + "loss": 0.0204, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05325322598218918, + "rewards/margins": 0.09594316780567169, + "rewards/rejected": -0.14919638633728027, + "step": 3210 + }, + { + "epoch": 0.21, + "learning_rate": 4.815850471676327e-06, + "logits/chosen": -2.1988625526428223, + "logits/rejected": -2.0713298320770264, + "logps/chosen": -246.859619140625, + "logps/rejected": -262.5884704589844, + "loss": 0.0291, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04222807660698891, + "rewards/margins": 0.09988073259592056, + "rewards/rejected": -0.14210879802703857, + "step": 3220 + }, + { + "epoch": 0.21, + "learning_rate": 4.813693578503751e-06, + "logits/chosen": -2.2338128089904785, + "logits/rejected": -2.0544989109039307, + "logps/chosen": -305.856201171875, + "logps/rejected": -265.7468566894531, + "loss": 0.016, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04674705117940903, + "rewards/margins": 0.076741524040699, + "rewards/rejected": -0.12348856776952744, + "step": 3230 + }, + { + "epoch": 0.21, + "learning_rate": 4.811524615972093e-06, + "logits/chosen": -2.2481729984283447, + "logits/rejected": -2.1175618171691895, + "logps/chosen": -239.32760620117188, + "logps/rejected": -264.2997131347656, + "loss": 0.0413, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.046959999948740005, + "rewards/margins": 0.08384885638952255, + "rewards/rejected": -0.13080886006355286, + "step": 3240 + }, + { + "epoch": 0.21, + "learning_rate": 4.809343595395724e-06, + "logits/chosen": -2.4664251804351807, + "logits/rejected": -2.2723288536071777, + "logps/chosen": -202.58583068847656, + "logps/rejected": -181.0174102783203, + "loss": 0.0444, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06765086203813553, + "rewards/margins": 0.04510151222348213, + "rewards/rejected": -0.11275237798690796, + "step": 3250 + }, + { + "epoch": 0.21, + "learning_rate": 4.807150528151918e-06, + "logits/chosen": -2.2760791778564453, + "logits/rejected": -2.1086182594299316, + "logps/chosen": -177.58126831054688, + "logps/rejected": -218.3994140625, + "loss": 0.0188, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05444352701306343, + "rewards/margins": 0.11475624144077301, + "rewards/rejected": -0.16919977962970734, + "step": 3260 + }, + { + "epoch": 0.21, + "learning_rate": 4.804945425680787e-06, + "logits/chosen": -2.236807346343994, + "logits/rejected": -2.308253526687622, + "logps/chosen": -207.42776489257812, + "logps/rejected": -198.57290649414062, + "loss": 0.0331, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0949227437376976, + "rewards/margins": 0.05611693114042282, + "rewards/rejected": -0.15103968977928162, + "step": 3270 + }, + { + "epoch": 0.21, + "learning_rate": 4.802728299485225e-06, + "logits/chosen": -2.0927319526672363, + "logits/rejected": -2.0800490379333496, + "logps/chosen": -170.92227172851562, + "logps/rejected": -203.00479125976562, + "loss": 0.0283, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10158933699131012, + "rewards/margins": 0.052754104137420654, + "rewards/rejected": -0.15434344112873077, + "step": 3280 + }, + { + "epoch": 0.22, + "learning_rate": 4.8004991611308495e-06, + "logits/chosen": -2.391540050506592, + "logits/rejected": -2.102921485900879, + "logps/chosen": -247.2994842529297, + "logps/rejected": -247.5079803466797, + "loss": 0.0127, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.028728622943162918, + "rewards/margins": 0.08129361271858215, + "rewards/rejected": -0.11002223193645477, + "step": 3290 + }, + { + "epoch": 0.22, + "learning_rate": 4.798258022245937e-06, + "logits/chosen": -2.3174631595611572, + "logits/rejected": -1.9177128076553345, + "logps/chosen": -228.5331268310547, + "logps/rejected": -206.4042205810547, + "loss": 0.0333, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.054633207619190216, + "rewards/margins": 0.08168105781078339, + "rewards/rejected": -0.136314257979393, + "step": 3300 + }, + { + "epoch": 0.22, + "eval_logits/chosen": -2.252087354660034, + "eval_logits/rejected": -2.0689594745635986, + "eval_logps/chosen": -242.1912841796875, + "eval_logps/rejected": -239.54627990722656, + "eval_loss": 0.027968592941761017, + "eval_rewards/accuracies": 0.6535000205039978, + "eval_rewards/chosen": -0.050931625068187714, + "eval_rewards/margins": 0.08874025195837021, + "eval_rewards/rejected": -0.13967186212539673, + "eval_runtime": 712.4607, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.404, + "step": 3300 + }, + { + "epoch": 0.22, + "learning_rate": 4.796004894521365e-06, + "logits/chosen": -2.2396187782287598, + "logits/rejected": -2.054224729537964, + "logps/chosen": -238.64242553710938, + "logps/rejected": -279.61846923828125, + "loss": 0.0388, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.050363294780254364, + "rewards/margins": 0.09859765321016312, + "rewards/rejected": -0.14896094799041748, + "step": 3310 + }, + { + "epoch": 0.22, + "learning_rate": 4.7937397897105545e-06, + "logits/chosen": -2.2537460327148438, + "logits/rejected": -2.175597667694092, + "logps/chosen": -211.47531127929688, + "logps/rejected": -194.06646728515625, + "loss": 0.0226, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03121241368353367, + "rewards/margins": 0.03902296721935272, + "rewards/rejected": -0.07023537904024124, + "step": 3320 + }, + { + "epoch": 0.22, + "learning_rate": 4.791462719629399e-06, + "logits/chosen": -2.275513172149658, + "logits/rejected": -2.131743907928467, + "logps/chosen": -192.20516967773438, + "logps/rejected": -191.20184326171875, + "loss": 0.017, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03240009397268295, + "rewards/margins": 0.10704060643911362, + "rewards/rejected": -0.13944070041179657, + "step": 3330 + }, + { + "epoch": 0.22, + "learning_rate": 4.789173696156212e-06, + "logits/chosen": -2.2883973121643066, + "logits/rejected": -1.9347209930419922, + "logps/chosen": -281.06097412109375, + "logps/rejected": -289.59222412109375, + "loss": 0.0234, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02310190722346306, + "rewards/margins": 0.13590338826179504, + "rewards/rejected": -0.1590052843093872, + "step": 3340 + }, + { + "epoch": 0.22, + "learning_rate": 4.786872731231662e-06, + "logits/chosen": -2.317486047744751, + "logits/rejected": -2.1974265575408936, + "logps/chosen": -227.2639923095703, + "logps/rejected": -233.25320434570312, + "loss": 0.0278, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05186513066291809, + "rewards/margins": 0.08679147809743881, + "rewards/rejected": -0.1386566013097763, + "step": 3350 + }, + { + "epoch": 0.22, + "learning_rate": 4.784559836858709e-06, + "logits/chosen": -2.2945048809051514, + "logits/rejected": -1.8284685611724854, + "logps/chosen": -242.6492919921875, + "logps/rejected": -225.825927734375, + "loss": 0.0113, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04366091638803482, + "rewards/margins": 0.06798774749040604, + "rewards/rejected": -0.11164864152669907, + "step": 3360 + }, + { + "epoch": 0.22, + "learning_rate": 4.782235025102542e-06, + "logits/chosen": -2.2957305908203125, + "logits/rejected": -2.2282955646514893, + "logps/chosen": -235.95361328125, + "logps/rejected": -237.4597625732422, + "loss": 0.0292, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04817141592502594, + "rewards/margins": 0.09493356943130493, + "rewards/rejected": -0.14310500025749207, + "step": 3370 + }, + { + "epoch": 0.22, + "learning_rate": 4.779898308090519e-06, + "logits/chosen": -2.2512454986572266, + "logits/rejected": -2.0390985012054443, + "logps/chosen": -286.148681640625, + "logps/rejected": -264.2273864746094, + "loss": 0.0383, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05735722929239273, + "rewards/margins": 0.07454714924097061, + "rewards/rejected": -0.13190439343452454, + "step": 3380 + }, + { + "epoch": 0.22, + "learning_rate": 4.777549698012101e-06, + "logits/chosen": -2.1888813972473145, + "logits/rejected": -2.029754161834717, + "logps/chosen": -253.3921356201172, + "logps/rejected": -250.53225708007812, + "loss": 0.0201, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.050877444446086884, + "rewards/margins": 0.0814988762140274, + "rewards/rejected": -0.1323763132095337, + "step": 3390 + }, + { + "epoch": 0.22, + "learning_rate": 4.775189207118787e-06, + "logits/chosen": -2.2187693119049072, + "logits/rejected": -2.02724027633667, + "logps/chosen": -279.66302490234375, + "logps/rejected": -274.4883728027344, + "loss": 0.0347, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03922683373093605, + "rewards/margins": 0.08581504225730896, + "rewards/rejected": -0.1250418722629547, + "step": 3400 + }, + { + "epoch": 0.22, + "eval_logits/chosen": -2.2767276763916016, + "eval_logits/rejected": -2.093092203140259, + "eval_logps/chosen": -240.41015625, + "eval_logps/rejected": -234.529296875, + "eval_loss": 0.028484875336289406, + "eval_rewards/accuracies": 0.6420000195503235, + "eval_rewards/chosen": -0.042025960981845856, + "eval_rewards/margins": 0.07256097346544266, + "eval_rewards/rejected": -0.11458693444728851, + "eval_runtime": 713.4014, + "eval_samples_per_second": 2.803, + "eval_steps_per_second": 1.402, + "step": 3400 + }, + { + "epoch": 0.22, + "learning_rate": 4.772816847724054e-06, + "logits/chosen": -2.382833957672119, + "logits/rejected": -2.1141469478607178, + "logps/chosen": -230.86587524414062, + "logps/rejected": -238.9117889404297, + "loss": 0.0544, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.041556429117918015, + "rewards/margins": 0.04274109750986099, + "rewards/rejected": -0.08429752290248871, + "step": 3410 + }, + { + "epoch": 0.22, + "learning_rate": 4.770432632203294e-06, + "logits/chosen": -2.1297221183776855, + "logits/rejected": -2.0429019927978516, + "logps/chosen": -253.8579864501953, + "logps/rejected": -211.4426727294922, + "loss": 0.0206, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05105436593294144, + "rewards/margins": 0.03951476141810417, + "rewards/rejected": -0.09056912362575531, + "step": 3420 + }, + { + "epoch": 0.22, + "learning_rate": 4.768036572993738e-06, + "logits/chosen": -2.190948724746704, + "logits/rejected": -2.286194086074829, + "logps/chosen": -287.060546875, + "logps/rejected": -284.55426025390625, + "loss": 0.0192, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03520800173282623, + "rewards/margins": 0.065467968583107, + "rewards/rejected": -0.10067595541477203, + "step": 3430 + }, + { + "epoch": 0.23, + "learning_rate": 4.765628682594409e-06, + "logits/chosen": -2.364724636077881, + "logits/rejected": -2.186753988265991, + "logps/chosen": -246.08349609375, + "logps/rejected": -238.25064086914062, + "loss": 0.0221, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.01163232047110796, + "rewards/margins": 0.08034755289554596, + "rewards/rejected": -0.09197986871004105, + "step": 3440 + }, + { + "epoch": 0.23, + "learning_rate": 4.763208973566041e-06, + "logits/chosen": -2.1567559242248535, + "logits/rejected": -2.1840717792510986, + "logps/chosen": -192.0625, + "logps/rejected": -228.4545440673828, + "loss": 0.0075, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.039664216339588165, + "rewards/margins": 0.0883837565779686, + "rewards/rejected": -0.12804797291755676, + "step": 3450 + }, + { + "epoch": 0.23, + "learning_rate": 4.76077745853102e-06, + "logits/chosen": -2.3962085247039795, + "logits/rejected": -2.249481439590454, + "logps/chosen": -259.07061767578125, + "logps/rejected": -278.41705322265625, + "loss": 0.0226, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04508228972554207, + "rewards/margins": 0.09110657125711441, + "rewards/rejected": -0.13618886470794678, + "step": 3460 + }, + { + "epoch": 0.23, + "learning_rate": 4.758334150173322e-06, + "logits/chosen": -2.2859044075012207, + "logits/rejected": -2.0987162590026855, + "logps/chosen": -261.6954040527344, + "logps/rejected": -249.35256958007812, + "loss": 0.0216, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.006942708045244217, + "rewards/margins": 0.06355363130569458, + "rewards/rejected": -0.05661092326045036, + "step": 3470 + }, + { + "epoch": 0.23, + "learning_rate": 4.755879061238439e-06, + "logits/chosen": -2.3577704429626465, + "logits/rejected": -2.13626766204834, + "logps/chosen": -255.3122100830078, + "logps/rejected": -252.277587890625, + "loss": 0.0192, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.004305616952478886, + "rewards/margins": 0.048012375831604004, + "rewards/rejected": -0.04370676726102829, + "step": 3480 + }, + { + "epoch": 0.23, + "learning_rate": 4.753412204533317e-06, + "logits/chosen": -2.486016273498535, + "logits/rejected": -2.0144317150115967, + "logps/chosen": -263.31915283203125, + "logps/rejected": -236.2627716064453, + "loss": 0.0158, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0023054243065416813, + "rewards/margins": 0.0916750431060791, + "rewards/rejected": -0.09398047626018524, + "step": 3490 + }, + { + "epoch": 0.23, + "learning_rate": 4.750933592926292e-06, + "logits/chosen": -2.3504817485809326, + "logits/rejected": -2.0278964042663574, + "logps/chosen": -220.9480438232422, + "logps/rejected": -212.4861602783203, + "loss": 0.025, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.011101900599896908, + "rewards/margins": 0.0937727838754654, + "rewards/rejected": -0.10487468540668488, + "step": 3500 + }, + { + "epoch": 0.23, + "eval_logits/chosen": -2.251335382461548, + "eval_logits/rejected": -2.068535566329956, + "eval_logps/chosen": -234.412109375, + "eval_logps/rejected": -230.6712646484375, + "eval_loss": 0.027730992063879967, + "eval_rewards/accuracies": 0.656000018119812, + "eval_rewards/chosen": -0.012035808525979519, + "eval_rewards/margins": 0.08326105773448944, + "eval_rewards/rejected": -0.09529686719179153, + "eval_runtime": 714.0893, + "eval_samples_per_second": 2.801, + "eval_steps_per_second": 1.4, + "step": 3500 + }, + { + "epoch": 0.23, + "learning_rate": 4.7484432393470124e-06, + "logits/chosen": -2.4342000484466553, + "logits/rejected": -1.919426679611206, + "logps/chosen": -203.8688507080078, + "logps/rejected": -177.36282348632812, + "loss": 0.0271, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.008934552781283855, + "rewards/margins": 0.14445406198501587, + "rewards/rejected": -0.1533885896205902, + "step": 3510 + }, + { + "epoch": 0.23, + "learning_rate": 4.745941156786385e-06, + "logits/chosen": -2.020981788635254, + "logits/rejected": -1.9922730922698975, + "logps/chosen": -159.48507690429688, + "logps/rejected": -218.340576171875, + "loss": 0.0586, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03222980350255966, + "rewards/margins": 0.1400715559720993, + "rewards/rejected": -0.17230133712291718, + "step": 3520 + }, + { + "epoch": 0.23, + "learning_rate": 4.743427358296497e-06, + "logits/chosen": -2.16739559173584, + "logits/rejected": -1.9707978963851929, + "logps/chosen": -193.88075256347656, + "logps/rejected": -241.9403533935547, + "loss": 0.0358, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.020862499251961708, + "rewards/margins": 0.18376120924949646, + "rewards/rejected": -0.2046237289905548, + "step": 3530 + }, + { + "epoch": 0.23, + "learning_rate": 4.740901856990553e-06, + "logits/chosen": -2.110822916030884, + "logits/rejected": -1.9231901168823242, + "logps/chosen": -258.498291015625, + "logps/rejected": -227.58938598632812, + "loss": 0.0458, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.009031767025589943, + "rewards/margins": 0.06581844389438629, + "rewards/rejected": -0.07485021650791168, + "step": 3540 + }, + { + "epoch": 0.23, + "learning_rate": 4.738364666042804e-06, + "logits/chosen": -2.356178045272827, + "logits/rejected": -1.9351288080215454, + "logps/chosen": -288.6064453125, + "logps/rejected": -238.495849609375, + "loss": 0.0293, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.015692852437496185, + "rewards/margins": 0.06764644384384155, + "rewards/rejected": -0.051953595131635666, + "step": 3550 + }, + { + "epoch": 0.23, + "learning_rate": 4.735815798688483e-06, + "logits/chosen": -2.3009400367736816, + "logits/rejected": -2.0884649753570557, + "logps/chosen": -197.21116638183594, + "logps/rejected": -235.48648071289062, + "loss": 0.0198, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.005604019854217768, + "rewards/margins": 0.09068088978528976, + "rewards/rejected": -0.08507686853408813, + "step": 3560 + }, + { + "epoch": 0.23, + "learning_rate": 4.7332552682237285e-06, + "logits/chosen": -2.3205363750457764, + "logits/rejected": -1.8733360767364502, + "logps/chosen": -172.0836639404297, + "logps/rejected": -176.2654266357422, + "loss": 0.0245, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.008381395600736141, + "rewards/margins": 0.09977124631404877, + "rewards/rejected": -0.09138984978199005, + "step": 3570 + }, + { + "epoch": 0.23, + "learning_rate": 4.7306830880055234e-06, + "logits/chosen": -2.2690162658691406, + "logits/rejected": -2.1997575759887695, + "logps/chosen": -200.47381591796875, + "logps/rejected": -222.2290496826172, + "loss": 0.0227, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0486999973654747, + "rewards/margins": 0.06709084659814835, + "rewards/rejected": -0.11579084396362305, + "step": 3580 + }, + { + "epoch": 0.23, + "learning_rate": 4.728099271451619e-06, + "logits/chosen": -2.3077304363250732, + "logits/rejected": -2.2104690074920654, + "logps/chosen": -195.2092742919922, + "logps/rejected": -206.08236694335938, + "loss": 0.0267, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.021006273105740547, + "rewards/margins": 0.08912817388772964, + "rewards/rejected": -0.11013443768024445, + "step": 3590 + }, + { + "epoch": 0.24, + "learning_rate": 4.725503832040466e-06, + "logits/chosen": -2.1276111602783203, + "logits/rejected": -2.127262592315674, + "logps/chosen": -153.59677124023438, + "logps/rejected": -194.3944549560547, + "loss": 0.0305, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.022194508463144302, + "rewards/margins": 0.07355144619941711, + "rewards/rejected": -0.09574595093727112, + "step": 3600 + }, + { + "epoch": 0.24, + "eval_logits/chosen": -2.2770073413848877, + "eval_logits/rejected": -2.092477321624756, + "eval_logps/chosen": -234.256103515625, + "eval_logps/rejected": -230.46665954589844, + "eval_loss": 0.02762630395591259, + "eval_rewards/accuracies": 0.6520000100135803, + "eval_rewards/chosen": -0.011255734600126743, + "eval_rewards/margins": 0.08301801979541779, + "eval_rewards/rejected": -0.09427376091480255, + "eval_runtime": 711.701, + "eval_samples_per_second": 2.81, + "eval_steps_per_second": 1.405, + "step": 3600 + }, + { + "epoch": 0.24, + "learning_rate": 4.722896783311152e-06, + "logits/chosen": -2.2544467449188232, + "logits/rejected": -2.148684024810791, + "logps/chosen": -263.32867431640625, + "logps/rejected": -328.82470703125, + "loss": 0.0268, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02139856480062008, + "rewards/margins": 0.07181017845869064, + "rewards/rejected": -0.09320874512195587, + "step": 3610 + }, + { + "epoch": 0.24, + "learning_rate": 4.720278138863318e-06, + "logits/chosen": -2.400420665740967, + "logits/rejected": -2.189044952392578, + "logps/chosen": -192.6852569580078, + "logps/rejected": -173.32254028320312, + "loss": 0.0393, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.012817641720175743, + "rewards/margins": 0.06222205236554146, + "rewards/rejected": -0.07503969967365265, + "step": 3620 + }, + { + "epoch": 0.24, + "learning_rate": 4.717647912357095e-06, + "logits/chosen": -2.3465585708618164, + "logits/rejected": -2.433465003967285, + "logps/chosen": -279.39959716796875, + "logps/rejected": -293.50396728515625, + "loss": 0.0238, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.03461218997836113, + "rewards/margins": 0.005859696306288242, + "rewards/rejected": -0.0404718853533268, + "step": 3630 + }, + { + "epoch": 0.24, + "learning_rate": 4.715006117513035e-06, + "logits/chosen": -2.462501287460327, + "logits/rejected": -2.224883556365967, + "logps/chosen": -319.16229248046875, + "logps/rejected": -278.135498046875, + "loss": 0.0307, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02481684461236, + "rewards/margins": 0.07076647132635117, + "rewards/rejected": -0.045949630439281464, + "step": 3640 + }, + { + "epoch": 0.24, + "learning_rate": 4.7123527681120326e-06, + "logits/chosen": -2.2726309299468994, + "logits/rejected": -2.1317145824432373, + "logps/chosen": -245.8816680908203, + "logps/rejected": -234.3596954345703, + "loss": 0.034, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.006447536405175924, + "rewards/margins": 0.0831485316157341, + "rewards/rejected": -0.07670100033283234, + "step": 3650 + }, + { + "epoch": 0.24, + "learning_rate": 4.7096878779952594e-06, + "logits/chosen": -2.351346254348755, + "logits/rejected": -2.2983498573303223, + "logps/chosen": -278.7234191894531, + "logps/rejected": -286.8404846191406, + "loss": 0.022, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01655019074678421, + "rewards/margins": 0.053498972207307816, + "rewards/rejected": -0.07004915922880173, + "step": 3660 + }, + { + "epoch": 0.24, + "learning_rate": 4.707011461064086e-06, + "logits/chosen": -2.145847797393799, + "logits/rejected": -1.9123668670654297, + "logps/chosen": -309.30865478515625, + "logps/rejected": -283.8884582519531, + "loss": 0.0306, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0022481470368802547, + "rewards/margins": 0.09385238587856293, + "rewards/rejected": -0.09610055387020111, + "step": 3670 + }, + { + "epoch": 0.24, + "learning_rate": 4.704323531280016e-06, + "logits/chosen": -2.2220005989074707, + "logits/rejected": -2.047095775604248, + "logps/chosen": -328.34033203125, + "logps/rejected": -258.6874694824219, + "loss": 0.0157, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.005644065327942371, + "rewards/margins": 0.06771357357501984, + "rewards/rejected": -0.07335765659809113, + "step": 3680 + }, + { + "epoch": 0.24, + "learning_rate": 4.701624102664606e-06, + "logits/chosen": -2.3654987812042236, + "logits/rejected": -2.0208535194396973, + "logps/chosen": -267.5926208496094, + "logps/rejected": -228.2905731201172, + "loss": 0.0243, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03238372504711151, + "rewards/margins": 0.0739908367395401, + "rewards/rejected": -0.10637456178665161, + "step": 3690 + }, + { + "epoch": 0.24, + "learning_rate": 4.698913189299399e-06, + "logits/chosen": -2.1872456073760986, + "logits/rejected": -2.3029673099517822, + "logps/chosen": -196.64865112304688, + "logps/rejected": -241.5049591064453, + "loss": 0.0331, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05248425155878067, + "rewards/margins": 0.06174800917506218, + "rewards/rejected": -0.11423225700855255, + "step": 3700 + }, + { + "epoch": 0.24, + "eval_logits/chosen": -2.2712066173553467, + "eval_logits/rejected": -2.087006092071533, + "eval_logps/chosen": -245.2156524658203, + "eval_logps/rejected": -241.05653381347656, + "eval_loss": 0.028332557529211044, + "eval_rewards/accuracies": 0.6434999704360962, + "eval_rewards/chosen": -0.06605348736047745, + "eval_rewards/margins": 0.08116975426673889, + "eval_rewards/rejected": -0.14722324907779694, + "eval_runtime": 712.4623, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.404, + "step": 3700 + }, + { + "epoch": 0.24, + "learning_rate": 4.696190805325847e-06, + "logits/chosen": -2.2979846000671387, + "logits/rejected": -2.1545467376708984, + "logps/chosen": -217.24502563476562, + "logps/rejected": -207.59347534179688, + "loss": 0.0129, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.054330892860889435, + "rewards/margins": 0.08733570575714111, + "rewards/rejected": -0.14166659116744995, + "step": 3710 + }, + { + "epoch": 0.24, + "learning_rate": 4.693456964945239e-06, + "logits/chosen": -2.4013776779174805, + "logits/rejected": -1.927514672279358, + "logps/chosen": -308.4510192871094, + "logps/rejected": -226.40151977539062, + "loss": 0.0362, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05172725394368172, + "rewards/margins": 0.09521359205245972, + "rewards/rejected": -0.14694085717201233, + "step": 3720 + }, + { + "epoch": 0.24, + "learning_rate": 4.6907116824186245e-06, + "logits/chosen": -2.332817792892456, + "logits/rejected": -2.287806749343872, + "logps/chosen": -233.67672729492188, + "logps/rejected": -244.431640625, + "loss": 0.0258, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.03174605220556259, + "rewards/margins": 0.05475563928484917, + "rewards/rejected": -0.08650169521570206, + "step": 3730 + }, + { + "epoch": 0.24, + "learning_rate": 4.687954972066742e-06, + "logits/chosen": -2.222794532775879, + "logits/rejected": -1.9519582986831665, + "logps/chosen": -229.9512939453125, + "logps/rejected": -241.53585815429688, + "loss": 0.0305, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0006751194596290588, + "rewards/margins": 0.16227775812149048, + "rewards/rejected": -0.16295287013053894, + "step": 3740 + }, + { + "epoch": 0.25, + "learning_rate": 4.685186848269944e-06, + "logits/chosen": -2.2037625312805176, + "logits/rejected": -2.076504945755005, + "logps/chosen": -213.8433074951172, + "logps/rejected": -188.55340576171875, + "loss": 0.0391, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.009854817762970924, + "rewards/margins": 0.0604451522231102, + "rewards/rejected": -0.07029997557401657, + "step": 3750 + }, + { + "epoch": 0.25, + "learning_rate": 4.682407325468119e-06, + "logits/chosen": -2.3056480884552, + "logits/rejected": -1.9488176107406616, + "logps/chosen": -216.91049194335938, + "logps/rejected": -203.93606567382812, + "loss": 0.0189, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.002542532980442047, + "rewards/margins": 0.10327658802270889, + "rewards/rejected": -0.10073405504226685, + "step": 3760 + }, + { + "epoch": 0.25, + "learning_rate": 4.67961641816062e-06, + "logits/chosen": -2.28625226020813, + "logits/rejected": -2.084827423095703, + "logps/chosen": -275.9200134277344, + "logps/rejected": -245.2871551513672, + "loss": 0.0316, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0006162314675748348, + "rewards/margins": 0.053486168384552, + "rewards/rejected": -0.054102398455142975, + "step": 3770 + }, + { + "epoch": 0.25, + "learning_rate": 4.676814140906188e-06, + "logits/chosen": -2.165799617767334, + "logits/rejected": -2.029515504837036, + "logps/chosen": -249.5352020263672, + "logps/rejected": -234.1592559814453, + "loss": 0.0308, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.049995046108961105, + "rewards/margins": 0.07614854723215103, + "rewards/rejected": -0.12614358961582184, + "step": 3780 + }, + { + "epoch": 0.25, + "learning_rate": 4.674000508322872e-06, + "logits/chosen": -2.044060230255127, + "logits/rejected": -2.1126906871795654, + "logps/chosen": -225.5679473876953, + "logps/rejected": -250.17538452148438, + "loss": 0.0311, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.04016520455479622, + "rewards/margins": 0.07711917161941528, + "rewards/rejected": -0.1172843724489212, + "step": 3790 + }, + { + "epoch": 0.25, + "learning_rate": 4.671175535087959e-06, + "logits/chosen": -2.2188408374786377, + "logits/rejected": -2.1730470657348633, + "logps/chosen": -298.844482421875, + "logps/rejected": -315.20123291015625, + "loss": 0.0351, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03758421167731285, + "rewards/margins": 0.09794172644615173, + "rewards/rejected": -0.1355259269475937, + "step": 3800 + }, + { + "epoch": 0.25, + "eval_logits/chosen": -2.305997848510742, + "eval_logits/rejected": -2.119776964187622, + "eval_logps/chosen": -238.6973876953125, + "eval_logps/rejected": -231.64306640625, + "eval_loss": 0.029111526906490326, + "eval_rewards/accuracies": 0.640999972820282, + "eval_rewards/chosen": -0.033462151885032654, + "eval_rewards/margins": 0.06669372320175171, + "eval_rewards/rejected": -0.10015588253736496, + "eval_runtime": 710.7935, + "eval_samples_per_second": 2.814, + "eval_steps_per_second": 1.407, + "step": 3800 + }, + { + "epoch": 0.25, + "learning_rate": 4.6683392359378924e-06, + "logits/chosen": -2.188131809234619, + "logits/rejected": -2.008528232574463, + "logps/chosen": -240.27590942382812, + "logps/rejected": -228.5452117919922, + "loss": 0.0122, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02563432976603508, + "rewards/margins": 0.06892909854650497, + "rewards/rejected": -0.09456343948841095, + "step": 3810 + }, + { + "epoch": 0.25, + "learning_rate": 4.665491625668198e-06, + "logits/chosen": -2.103173017501831, + "logits/rejected": -2.1502113342285156, + "logps/chosen": -164.92218017578125, + "logps/rejected": -204.31317138671875, + "loss": 0.0308, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04337451234459877, + "rewards/margins": 0.07917975634336472, + "rewards/rejected": -0.12255426496267319, + "step": 3820 + }, + { + "epoch": 0.25, + "learning_rate": 4.662632719133407e-06, + "logits/chosen": -2.383805274963379, + "logits/rejected": -2.0930488109588623, + "logps/chosen": -229.5814971923828, + "logps/rejected": -176.27459716796875, + "loss": 0.0228, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.006080122198909521, + "rewards/margins": 0.06850259006023407, + "rewards/rejected": -0.06242247670888901, + "step": 3830 + }, + { + "epoch": 0.25, + "learning_rate": 4.659762531246974e-06, + "logits/chosen": -2.2730250358581543, + "logits/rejected": -2.1138651371002197, + "logps/chosen": -219.82144165039062, + "logps/rejected": -198.40353393554688, + "loss": 0.0275, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02621351182460785, + "rewards/margins": 0.06269388645887375, + "rewards/rejected": -0.0889073982834816, + "step": 3840 + }, + { + "epoch": 0.25, + "learning_rate": 4.656881076981207e-06, + "logits/chosen": -2.335495948791504, + "logits/rejected": -2.1931393146514893, + "logps/chosen": -215.53335571289062, + "logps/rejected": -209.0745391845703, + "loss": 0.0295, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01174996793270111, + "rewards/margins": 0.05719948932528496, + "rewards/rejected": -0.06894946098327637, + "step": 3850 + }, + { + "epoch": 0.25, + "learning_rate": 4.653988371367183e-06, + "logits/chosen": -2.2694575786590576, + "logits/rejected": -2.0069797039031982, + "logps/chosen": -240.78125, + "logps/rejected": -191.01919555664062, + "loss": 0.0385, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.00011368747800588608, + "rewards/margins": 0.05722982436418533, + "rewards/rejected": -0.05711613968014717, + "step": 3860 + }, + { + "epoch": 0.25, + "learning_rate": 4.651084429494671e-06, + "logits/chosen": -2.3504996299743652, + "logits/rejected": -2.065788745880127, + "logps/chosen": -274.817138671875, + "logps/rejected": -206.2301788330078, + "loss": 0.0206, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.002669256180524826, + "rewards/margins": 0.05468413233757019, + "rewards/rejected": -0.05201487988233566, + "step": 3870 + }, + { + "epoch": 0.25, + "learning_rate": 4.648169266512053e-06, + "logits/chosen": -2.4072623252868652, + "logits/rejected": -2.1304709911346436, + "logps/chosen": -220.14645385742188, + "logps/rejected": -188.8533935546875, + "loss": 0.0185, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.025908267125487328, + "rewards/margins": 0.0683053508400917, + "rewards/rejected": -0.04239708185195923, + "step": 3880 + }, + { + "epoch": 0.25, + "learning_rate": 4.6452428976262505e-06, + "logits/chosen": -2.2137064933776855, + "logits/rejected": -1.9928970336914062, + "logps/chosen": -199.99948120117188, + "logps/rejected": -183.72735595703125, + "loss": 0.0277, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01895982213318348, + "rewards/margins": 0.14025332033634186, + "rewards/rejected": -0.12129350006580353, + "step": 3890 + }, + { + "epoch": 0.26, + "learning_rate": 4.642305338102633e-06, + "logits/chosen": -2.261868715286255, + "logits/rejected": -2.3374361991882324, + "logps/chosen": -161.90980529785156, + "logps/rejected": -198.79263305664062, + "loss": 0.0164, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0008012913167476654, + "rewards/margins": 0.09368561953306198, + "rewards/rejected": -0.09448691457509995, + "step": 3900 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -2.2779669761657715, + "eval_logits/rejected": -2.0932793617248535, + "eval_logps/chosen": -233.7920684814453, + "eval_logps/rejected": -229.32948303222656, + "eval_loss": 0.027970939874649048, + "eval_rewards/accuracies": 0.6340000033378601, + "eval_rewards/chosen": -0.008935615420341492, + "eval_rewards/margins": 0.0796523168683052, + "eval_rewards/rejected": -0.0885879322886467, + "eval_runtime": 712.9093, + "eval_samples_per_second": 2.805, + "eval_steps_per_second": 1.403, + "step": 3900 + }, + { + "epoch": 0.26, + "learning_rate": 4.639356603264953e-06, + "logits/chosen": -2.3225855827331543, + "logits/rejected": -2.0903942584991455, + "logps/chosen": -243.5463409423828, + "logps/rejected": -226.7582550048828, + "loss": 0.0167, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008572788909077644, + "rewards/margins": 0.04471360892057419, + "rewards/rejected": -0.05328640341758728, + "step": 3910 + }, + { + "epoch": 0.26, + "learning_rate": 4.636396708495255e-06, + "logits/chosen": -2.161994695663452, + "logits/rejected": -2.1441612243652344, + "logps/chosen": -231.1266326904297, + "logps/rejected": -218.8634490966797, + "loss": 0.0179, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.009657839313149452, + "rewards/margins": 0.06290359795093536, + "rewards/rejected": -0.07256142795085907, + "step": 3920 + }, + { + "epoch": 0.26, + "learning_rate": 4.633425669233799e-06, + "logits/chosen": -2.2521562576293945, + "logits/rejected": -2.273059606552124, + "logps/chosen": -233.33676147460938, + "logps/rejected": -245.6598663330078, + "loss": 0.0197, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.007449137978255749, + "rewards/margins": 0.08385193347930908, + "rewards/rejected": -0.07640279829502106, + "step": 3930 + }, + { + "epoch": 0.26, + "learning_rate": 4.6304435009789825e-06, + "logits/chosen": -2.275355815887451, + "logits/rejected": -2.0570201873779297, + "logps/chosen": -236.5664825439453, + "logps/rejected": -185.60482788085938, + "loss": 0.0252, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.00470371451228857, + "rewards/margins": 0.09974116086959839, + "rewards/rejected": -0.09503744542598724, + "step": 3940 + }, + { + "epoch": 0.26, + "learning_rate": 4.627450219287256e-06, + "logits/chosen": -2.3196358680725098, + "logits/rejected": -2.17457914352417, + "logps/chosen": -184.38966369628906, + "logps/rejected": -173.1756134033203, + "loss": 0.0339, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.014152881689369678, + "rewards/margins": 0.05648508667945862, + "rewards/rejected": -0.07063796371221542, + "step": 3950 + }, + { + "epoch": 0.26, + "learning_rate": 4.624445839773042e-06, + "logits/chosen": -2.2729854583740234, + "logits/rejected": -2.2008156776428223, + "logps/chosen": -174.69515991210938, + "logps/rejected": -179.75315856933594, + "loss": 0.0541, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.023419296368956566, + "rewards/margins": 0.03060915693640709, + "rewards/rejected": -0.054028451442718506, + "step": 3960 + }, + { + "epoch": 0.26, + "learning_rate": 4.621430378108656e-06, + "logits/chosen": -2.2478737831115723, + "logits/rejected": -2.0963854789733887, + "logps/chosen": -260.9545593261719, + "logps/rejected": -273.7813415527344, + "loss": 0.0124, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.015981189906597137, + "rewards/margins": 0.09795571863651276, + "rewards/rejected": -0.11393691599369049, + "step": 3970 + }, + { + "epoch": 0.26, + "learning_rate": 4.618403850024223e-06, + "logits/chosen": -2.166574478149414, + "logits/rejected": -1.9380964040756226, + "logps/chosen": -255.82699584960938, + "logps/rejected": -221.594482421875, + "loss": 0.0273, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.001684072194620967, + "rewards/margins": 0.062149059027433395, + "rewards/rejected": -0.06383313238620758, + "step": 3980 + }, + { + "epoch": 0.26, + "learning_rate": 4.615366271307598e-06, + "logits/chosen": -2.309037923812866, + "logits/rejected": -2.1538918018341064, + "logps/chosen": -198.5775604248047, + "logps/rejected": -200.23765563964844, + "loss": 0.0227, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03317294269800186, + "rewards/margins": 0.06776221096515656, + "rewards/rejected": -0.10093514621257782, + "step": 3990 + }, + { + "epoch": 0.26, + "learning_rate": 4.612317657804277e-06, + "logits/chosen": -2.1642422676086426, + "logits/rejected": -2.2179436683654785, + "logps/chosen": -157.81178283691406, + "logps/rejected": -228.7684783935547, + "loss": 0.0445, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.05289377644658089, + "rewards/margins": 0.09804403781890869, + "rewards/rejected": -0.15093779563903809, + "step": 4000 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -2.270484209060669, + "eval_logits/rejected": -2.0859897136688232, + "eval_logps/chosen": -239.2251434326172, + "eval_logps/rejected": -236.20582580566406, + "eval_loss": 0.0270866546779871, + "eval_rewards/accuracies": 0.6389999985694885, + "eval_rewards/chosen": -0.03610089048743248, + "eval_rewards/margins": 0.08686867356300354, + "eval_rewards/rejected": -0.12296956777572632, + "eval_runtime": 712.7526, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 4000 + }, + { + "epoch": 0.26, + "learning_rate": 4.6092580254173236e-06, + "logits/chosen": -2.1679720878601074, + "logits/rejected": -1.9357163906097412, + "logps/chosen": -266.56781005859375, + "logps/rejected": -269.0433044433594, + "loss": 0.029, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04622514545917511, + "rewards/margins": 0.10571815818548203, + "rewards/rejected": -0.15194329619407654, + "step": 4010 + }, + { + "epoch": 0.26, + "learning_rate": 4.606187390107277e-06, + "logits/chosen": -2.138845443725586, + "logits/rejected": -1.956048607826233, + "logps/chosen": -242.793701171875, + "logps/rejected": -218.82632446289062, + "loss": 0.0396, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08959133177995682, + "rewards/margins": 0.07512683421373367, + "rewards/rejected": -0.1647181510925293, + "step": 4020 + }, + { + "epoch": 0.26, + "learning_rate": 4.603105767892077e-06, + "logits/chosen": -2.262580394744873, + "logits/rejected": -2.1853690147399902, + "logps/chosen": -207.2705841064453, + "logps/rejected": -239.85885620117188, + "loss": 0.0174, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0629027932882309, + "rewards/margins": 0.07317514717578888, + "rewards/rejected": -0.13607792556285858, + "step": 4030 + }, + { + "epoch": 0.26, + "learning_rate": 4.6000131748469725e-06, + "logits/chosen": -2.3523991107940674, + "logits/rejected": -1.9695736169815063, + "logps/chosen": -258.4837646484375, + "logps/rejected": -197.4902801513672, + "loss": 0.0485, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0417175218462944, + "rewards/margins": 0.06662425398826599, + "rewards/rejected": -0.10834179073572159, + "step": 4040 + }, + { + "epoch": 0.26, + "learning_rate": 4.596909627104445e-06, + "logits/chosen": -2.3855977058410645, + "logits/rejected": -2.309062957763672, + "logps/chosen": -263.0769348144531, + "logps/rejected": -246.50241088867188, + "loss": 0.0143, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.07169332355260849, + "rewards/margins": 0.08029817044734955, + "rewards/rejected": -0.15199150145053864, + "step": 4050 + }, + { + "epoch": 0.27, + "learning_rate": 4.5937951408541215e-06, + "logits/chosen": -2.4470372200012207, + "logits/rejected": -1.9213218688964844, + "logps/chosen": -263.614990234375, + "logps/rejected": -240.8369598388672, + "loss": 0.0283, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05488234758377075, + "rewards/margins": 0.11299531161785126, + "rewards/rejected": -0.167877659201622, + "step": 4060 + }, + { + "epoch": 0.27, + "learning_rate": 4.590669732342685e-06, + "logits/chosen": -2.1618571281433105, + "logits/rejected": -2.0170841217041016, + "logps/chosen": -222.08682250976562, + "logps/rejected": -243.6232452392578, + "loss": 0.0521, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04969602823257446, + "rewards/margins": 0.09821876138448715, + "rewards/rejected": -0.1479147970676422, + "step": 4070 + }, + { + "epoch": 0.27, + "learning_rate": 4.587533417873799e-06, + "logits/chosen": -2.2454047203063965, + "logits/rejected": -2.341275453567505, + "logps/chosen": -205.1430206298828, + "logps/rejected": -281.76739501953125, + "loss": 0.0161, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.059894900768995285, + "rewards/margins": 0.0846036747097969, + "rewards/rejected": -0.1444985717535019, + "step": 4080 + }, + { + "epoch": 0.27, + "learning_rate": 4.584386213808016e-06, + "logits/chosen": -2.246037006378174, + "logits/rejected": -1.9014495611190796, + "logps/chosen": -232.6446990966797, + "logps/rejected": -198.4827880859375, + "loss": 0.0454, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.054828494787216187, + "rewards/margins": 0.05807109922170639, + "rewards/rejected": -0.11289960145950317, + "step": 4090 + }, + { + "epoch": 0.27, + "learning_rate": 4.581228136562693e-06, + "logits/chosen": -2.132202625274658, + "logits/rejected": -2.2550837993621826, + "logps/chosen": -247.276611328125, + "logps/rejected": -227.5145263671875, + "loss": 0.0176, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04548081010580063, + "rewards/margins": 0.02961266040802002, + "rewards/rejected": -0.07509347796440125, + "step": 4100 + }, + { + "epoch": 0.27, + "eval_logits/chosen": -2.3252735137939453, + "eval_logits/rejected": -2.1376659870147705, + "eval_logps/chosen": -241.9268798828125, + "eval_logps/rejected": -234.1422882080078, + "eval_loss": 0.028915749862790108, + "eval_rewards/accuracies": 0.6470000147819519, + "eval_rewards/chosen": -0.049609627574682236, + "eval_rewards/margins": 0.06304233521223068, + "eval_rewards/rejected": -0.11265195906162262, + "eval_runtime": 712.9992, + "eval_samples_per_second": 2.805, + "eval_steps_per_second": 1.403, + "step": 4100 + }, + { + "epoch": 0.27, + "learning_rate": 4.578059202611909e-06, + "logits/chosen": -2.345768690109253, + "logits/rejected": -2.1116530895233154, + "logps/chosen": -263.25579833984375, + "logps/rejected": -259.384765625, + "loss": 0.0196, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.030484404414892197, + "rewards/margins": 0.05442474037408829, + "rewards/rejected": -0.08490914851427078, + "step": 4110 + }, + { + "epoch": 0.27, + "learning_rate": 4.574879428486376e-06, + "logits/chosen": -2.3218655586242676, + "logits/rejected": -2.0297176837921143, + "logps/chosen": -221.8189697265625, + "logps/rejected": -232.28121948242188, + "loss": 0.011, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04451214522123337, + "rewards/margins": 0.0669221431016922, + "rewards/rejected": -0.11143428087234497, + "step": 4120 + }, + { + "epoch": 0.27, + "learning_rate": 4.571688830773352e-06, + "logits/chosen": -2.349295139312744, + "logits/rejected": -2.2383410930633545, + "logps/chosen": -230.47598266601562, + "logps/rejected": -217.79031372070312, + "loss": 0.0294, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.042038775980472565, + "rewards/margins": 0.0359666645526886, + "rewards/rejected": -0.07800544798374176, + "step": 4130 + }, + { + "epoch": 0.27, + "learning_rate": 4.568487426116559e-06, + "logits/chosen": -2.281845808029175, + "logits/rejected": -2.297541856765747, + "logps/chosen": -179.35195922851562, + "logps/rejected": -179.69200134277344, + "loss": 0.0405, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0369006022810936, + "rewards/margins": 0.03936711698770523, + "rewards/rejected": -0.07626771181821823, + "step": 4140 + }, + { + "epoch": 0.27, + "learning_rate": 4.565275231216092e-06, + "logits/chosen": -2.1967597007751465, + "logits/rejected": -2.1704554557800293, + "logps/chosen": -155.69821166992188, + "logps/rejected": -211.3583526611328, + "loss": 0.0191, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0217736829072237, + "rewards/margins": 0.0483512282371521, + "rewards/rejected": -0.07012491673231125, + "step": 4150 + }, + { + "epoch": 0.27, + "learning_rate": 4.562052262828331e-06, + "logits/chosen": -2.2314505577087402, + "logits/rejected": -2.086259603500366, + "logps/chosen": -208.48379516601562, + "logps/rejected": -217.4300537109375, + "loss": 0.0477, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04836384207010269, + "rewards/margins": 0.07111117243766785, + "rewards/rejected": -0.11947502195835114, + "step": 4160 + }, + { + "epoch": 0.27, + "learning_rate": 4.558818537765861e-06, + "logits/chosen": -2.4081733226776123, + "logits/rejected": -2.198742151260376, + "logps/chosen": -245.94277954101562, + "logps/rejected": -225.3533477783203, + "loss": 0.0385, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04964191094040871, + "rewards/margins": 0.07139302790164948, + "rewards/rejected": -0.12103494256734848, + "step": 4170 + }, + { + "epoch": 0.27, + "learning_rate": 4.555574072897374e-06, + "logits/chosen": -2.3245797157287598, + "logits/rejected": -2.3242580890655518, + "logps/chosen": -213.04324340820312, + "logps/rejected": -226.7286376953125, + "loss": 0.0309, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.055409859865903854, + "rewards/margins": 0.0797853022813797, + "rewards/rejected": -0.13519516587257385, + "step": 4180 + }, + { + "epoch": 0.27, + "learning_rate": 4.552318885147589e-06, + "logits/chosen": -2.433678388595581, + "logits/rejected": -2.0744783878326416, + "logps/chosen": -249.4149627685547, + "logps/rejected": -204.31393432617188, + "loss": 0.0253, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04248654097318649, + "rewards/margins": 0.07144276797771454, + "rewards/rejected": -0.11392930895090103, + "step": 4190 + }, + { + "epoch": 0.27, + "learning_rate": 4.549052991497159e-06, + "logits/chosen": -2.293544292449951, + "logits/rejected": -2.262787103652954, + "logps/chosen": -190.35092163085938, + "logps/rejected": -205.44454956054688, + "loss": 0.0244, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.050918687134981155, + "rewards/margins": 0.0733739361166954, + "rewards/rejected": -0.12429263442754745, + "step": 4200 + }, + { + "epoch": 0.27, + "eval_logits/chosen": -2.312159776687622, + "eval_logits/rejected": -2.1259677410125732, + "eval_logps/chosen": -237.25537109375, + "eval_logps/rejected": -231.3834991455078, + "eval_loss": 0.029266033321619034, + "eval_rewards/accuracies": 0.6424999833106995, + "eval_rewards/chosen": -0.026252107694745064, + "eval_rewards/margins": 0.07260581851005554, + "eval_rewards/rejected": -0.09885792434215546, + "eval_runtime": 712.4499, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.404, + "step": 4200 + }, + { + "epoch": 0.28, + "learning_rate": 4.545776408982585e-06, + "logits/chosen": -2.2285044193267822, + "logits/rejected": -2.2255208492279053, + "logps/chosen": -234.24560546875, + "logps/rejected": -239.6706085205078, + "loss": 0.0283, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.012507572770118713, + "rewards/margins": 0.07219143211841583, + "rewards/rejected": -0.08469899743795395, + "step": 4210 + }, + { + "epoch": 0.28, + "learning_rate": 4.542489154696128e-06, + "logits/chosen": -2.4383578300476074, + "logits/rejected": -2.077122688293457, + "logps/chosen": -265.59991455078125, + "logps/rejected": -217.50387573242188, + "loss": 0.0137, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.010602285154163837, + "rewards/margins": 0.06577347218990326, + "rewards/rejected": -0.0551711842417717, + "step": 4220 + }, + { + "epoch": 0.28, + "learning_rate": 4.5391912457857145e-06, + "logits/chosen": -2.3199622631073, + "logits/rejected": -2.06068754196167, + "logps/chosen": -266.38726806640625, + "logps/rejected": -232.57876586914062, + "loss": 0.0256, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0070883058942854404, + "rewards/margins": 0.06650589406490326, + "rewards/rejected": -0.07359420508146286, + "step": 4230 + }, + { + "epoch": 0.28, + "learning_rate": 4.535882699454854e-06, + "logits/chosen": -2.3102855682373047, + "logits/rejected": -2.185673713684082, + "logps/chosen": -274.146240234375, + "logps/rejected": -315.23004150390625, + "loss": 0.0245, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.015475980937480927, + "rewards/margins": 0.09710557758808136, + "rewards/rejected": -0.11258156597614288, + "step": 4240 + }, + { + "epoch": 0.28, + "learning_rate": 4.532563532962546e-06, + "logits/chosen": -2.357006072998047, + "logits/rejected": -2.4395248889923096, + "logps/chosen": -199.3505859375, + "logps/rejected": -236.73263549804688, + "loss": 0.0324, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0472816601395607, + "rewards/margins": 0.07893550395965576, + "rewards/rejected": -0.12621717154979706, + "step": 4250 + }, + { + "epoch": 0.28, + "learning_rate": 4.529233763623187e-06, + "logits/chosen": -2.3164355754852295, + "logits/rejected": -2.028876304626465, + "logps/chosen": -213.58236694335938, + "logps/rejected": -181.50094604492188, + "loss": 0.0295, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.057409435510635376, + "rewards/margins": 0.07939110696315765, + "rewards/rejected": -0.13680054247379303, + "step": 4260 + }, + { + "epoch": 0.28, + "learning_rate": 4.5258934088064854e-06, + "logits/chosen": -2.202575206756592, + "logits/rejected": -1.7992494106292725, + "logps/chosen": -235.61380004882812, + "logps/rejected": -203.1489715576172, + "loss": 0.0227, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07816511392593384, + "rewards/margins": 0.1111634373664856, + "rewards/rejected": -0.18932855129241943, + "step": 4270 + }, + { + "epoch": 0.28, + "learning_rate": 4.522542485937369e-06, + "logits/chosen": -2.3277671337127686, + "logits/rejected": -2.0657238960266113, + "logps/chosen": -298.01959228515625, + "logps/rejected": -221.0654754638672, + "loss": 0.0125, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.03683526813983917, + "rewards/margins": 0.09843374788761139, + "rewards/rejected": -0.13526901602745056, + "step": 4280 + }, + { + "epoch": 0.28, + "learning_rate": 4.519181012495892e-06, + "logits/chosen": -2.32460355758667, + "logits/rejected": -2.201780080795288, + "logps/chosen": -246.9171905517578, + "logps/rejected": -235.5345458984375, + "loss": 0.0312, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02348960004746914, + "rewards/margins": 0.08845724910497665, + "rewards/rejected": -0.11194685846567154, + "step": 4290 + }, + { + "epoch": 0.28, + "learning_rate": 4.515809006017147e-06, + "logits/chosen": -2.2573161125183105, + "logits/rejected": -1.95000422000885, + "logps/chosen": -232.3976287841797, + "logps/rejected": -212.3801727294922, + "loss": 0.0378, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.005059923976659775, + "rewards/margins": 0.0774591714143753, + "rewards/rejected": -0.07239924371242523, + "step": 4300 + }, + { + "epoch": 0.28, + "eval_logits/chosen": -2.2686073780059814, + "eval_logits/rejected": -2.0843474864959717, + "eval_logps/chosen": -231.23663330078125, + "eval_logps/rejected": -226.155029296875, + "eval_loss": 0.026677994057536125, + "eval_rewards/accuracies": 0.6439999938011169, + "eval_rewards/chosen": 0.0038415947929024696, + "eval_rewards/margins": 0.07655727863311768, + "eval_rewards/rejected": -0.07271569967269897, + "eval_runtime": 711.9298, + "eval_samples_per_second": 2.809, + "eval_steps_per_second": 1.405, + "step": 4300 + }, + { + "epoch": 0.28, + "learning_rate": 4.512426484091171e-06, + "logits/chosen": -2.40342116355896, + "logits/rejected": -2.0684916973114014, + "logps/chosen": -271.4616394042969, + "logps/rejected": -245.02908325195312, + "loss": 0.0383, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.030597597360610962, + "rewards/margins": 0.049302391707897186, + "rewards/rejected": -0.018704798072576523, + "step": 4310 + }, + { + "epoch": 0.28, + "learning_rate": 4.509033464362858e-06, + "logits/chosen": -2.1147308349609375, + "logits/rejected": -2.1781909465789795, + "logps/chosen": -236.3292999267578, + "logps/rejected": -266.4777526855469, + "loss": 0.0186, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.024682987481355667, + "rewards/margins": 0.07958771288394928, + "rewards/rejected": -0.05490473657846451, + "step": 4320 + }, + { + "epoch": 0.28, + "learning_rate": 4.505629964531857e-06, + "logits/chosen": -2.3749783039093018, + "logits/rejected": -2.1724915504455566, + "logps/chosen": -222.01260375976562, + "logps/rejected": -207.80874633789062, + "loss": 0.0288, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.005958900786936283, + "rewards/margins": 0.0830468013882637, + "rewards/rejected": -0.0770878940820694, + "step": 4330 + }, + { + "epoch": 0.28, + "learning_rate": 4.502216002352492e-06, + "logits/chosen": -2.385356903076172, + "logits/rejected": -2.1611905097961426, + "logps/chosen": -165.23251342773438, + "logps/rejected": -158.66131591796875, + "loss": 0.0438, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.010958021506667137, + "rewards/margins": 0.06421489268541336, + "rewards/rejected": -0.07517291605472565, + "step": 4340 + }, + { + "epoch": 0.28, + "learning_rate": 4.498791595633663e-06, + "logits/chosen": -2.237499952316284, + "logits/rejected": -1.8516120910644531, + "logps/chosen": -258.93536376953125, + "logps/rejected": -184.10693359375, + "loss": 0.0285, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01724466308951378, + "rewards/margins": 0.061860036104917526, + "rewards/rejected": -0.04461536929011345, + "step": 4350 + }, + { + "epoch": 0.29, + "learning_rate": 4.495356762238751e-06, + "logits/chosen": -2.484279155731201, + "logits/rejected": -2.0068490505218506, + "logps/chosen": -278.0464782714844, + "logps/rejected": -193.75340270996094, + "loss": 0.0151, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.02575724944472313, + "rewards/margins": 0.06579472124576569, + "rewards/rejected": -0.04003746062517166, + "step": 4360 + }, + { + "epoch": 0.29, + "learning_rate": 4.491911520085532e-06, + "logits/chosen": -2.06331205368042, + "logits/rejected": -1.9536161422729492, + "logps/chosen": -196.34120178222656, + "logps/rejected": -222.7769317626953, + "loss": 0.024, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.020332731306552887, + "rewards/margins": 0.07831554114818573, + "rewards/rejected": -0.05798282101750374, + "step": 4370 + }, + { + "epoch": 0.29, + "learning_rate": 4.488455887146075e-06, + "logits/chosen": -2.1633567810058594, + "logits/rejected": -2.1489412784576416, + "logps/chosen": -166.67652893066406, + "logps/rejected": -202.71359252929688, + "loss": 0.0312, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.017177890986204147, + "rewards/margins": 0.11436694860458374, + "rewards/rejected": -0.0971890538930893, + "step": 4380 + }, + { + "epoch": 0.29, + "learning_rate": 4.484989881446654e-06, + "logits/chosen": -2.419445276260376, + "logits/rejected": -2.214813470840454, + "logps/chosen": -202.06753540039062, + "logps/rejected": -191.85040283203125, + "loss": 0.0379, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0020423284731805325, + "rewards/margins": 0.03863789141178131, + "rewards/rejected": -0.03659556061029434, + "step": 4390 + }, + { + "epoch": 0.29, + "learning_rate": 4.481513521067654e-06, + "logits/chosen": -2.3927712440490723, + "logits/rejected": -2.035768747329712, + "logps/chosen": -226.9812469482422, + "logps/rejected": -210.1649627685547, + "loss": 0.0135, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.011033494956791401, + "rewards/margins": 0.09299737960100174, + "rewards/rejected": -0.10403086990118027, + "step": 4400 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -2.2857038974761963, + "eval_logits/rejected": -2.099775791168213, + "eval_logps/chosen": -236.32452392578125, + "eval_logps/rejected": -232.56199645996094, + "eval_loss": 0.02732119709253311, + "eval_rewards/accuracies": 0.6434999704360962, + "eval_rewards/chosen": -0.021597841754555702, + "eval_rewards/margins": 0.083152636885643, + "eval_rewards/rejected": -0.10475046932697296, + "eval_runtime": 713.6572, + "eval_samples_per_second": 2.802, + "eval_steps_per_second": 1.401, + "step": 4400 + }, + { + "epoch": 0.29, + "learning_rate": 4.478026824143473e-06, + "logits/chosen": -2.292775869369507, + "logits/rejected": -2.1565842628479004, + "logps/chosen": -269.4031677246094, + "logps/rejected": -234.95068359375, + "loss": 0.0236, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.016707628965377808, + "rewards/margins": 0.11057498306035995, + "rewards/rejected": -0.12728263437747955, + "step": 4410 + }, + { + "epoch": 0.29, + "learning_rate": 4.474529808862429e-06, + "logits/chosen": -2.1790809631347656, + "logits/rejected": -2.103480577468872, + "logps/chosen": -191.47373962402344, + "logps/rejected": -225.77734375, + "loss": 0.0434, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.013267258182168007, + "rewards/margins": 0.08339644223451614, + "rewards/rejected": -0.096663698554039, + "step": 4420 + }, + { + "epoch": 0.29, + "learning_rate": 4.471022493466669e-06, + "logits/chosen": -2.2934823036193848, + "logits/rejected": -1.964529037475586, + "logps/chosen": -304.9217529296875, + "logps/rejected": -239.77035522460938, + "loss": 0.0235, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.014913300052285194, + "rewards/margins": 0.05929947644472122, + "rewards/rejected": -0.07421278208494186, + "step": 4430 + }, + { + "epoch": 0.29, + "learning_rate": 4.467504896252066e-06, + "logits/chosen": -2.30965256690979, + "logits/rejected": -2.1960270404815674, + "logps/chosen": -252.5443115234375, + "logps/rejected": -247.04800415039062, + "loss": 0.0238, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.014214863069355488, + "rewards/margins": 0.11622963845729828, + "rewards/rejected": -0.1304444968700409, + "step": 4440 + }, + { + "epoch": 0.29, + "learning_rate": 4.463977035568132e-06, + "logits/chosen": -2.175062656402588, + "logits/rejected": -2.41196346282959, + "logps/chosen": -215.9466094970703, + "logps/rejected": -276.2236633300781, + "loss": 0.0215, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.017819250002503395, + "rewards/margins": 0.04285722225904465, + "rewards/rejected": -0.060676466673612595, + "step": 4450 + }, + { + "epoch": 0.29, + "learning_rate": 4.460438929817914e-06, + "logits/chosen": -2.243072986602783, + "logits/rejected": -2.0973448753356934, + "logps/chosen": -207.04464721679688, + "logps/rejected": -216.0046844482422, + "loss": 0.0273, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.00312446360476315, + "rewards/margins": 0.06624321639537811, + "rewards/rejected": -0.0693676769733429, + "step": 4460 + }, + { + "epoch": 0.29, + "learning_rate": 4.456890597457907e-06, + "logits/chosen": -2.094747543334961, + "logits/rejected": -2.140568256378174, + "logps/chosen": -218.9592742919922, + "logps/rejected": -252.67184448242188, + "loss": 0.0241, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03411801904439926, + "rewards/margins": 0.07754088193178177, + "rewards/rejected": -0.11165890842676163, + "step": 4470 + }, + { + "epoch": 0.29, + "learning_rate": 4.453332056997951e-06, + "logits/chosen": -2.213270664215088, + "logits/rejected": -2.2702724933624268, + "logps/chosen": -181.20423889160156, + "logps/rejected": -197.28903198242188, + "loss": 0.0182, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0062237693928182125, + "rewards/margins": 0.10204926878213882, + "rewards/rejected": -0.10827304422855377, + "step": 4480 + }, + { + "epoch": 0.29, + "learning_rate": 4.449763327001134e-06, + "logits/chosen": -2.249997615814209, + "logits/rejected": -2.156466007232666, + "logps/chosen": -191.21969604492188, + "logps/rejected": -233.1490478515625, + "loss": 0.0268, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.012287397868931293, + "rewards/margins": 0.06756951659917831, + "rewards/rejected": -0.07985690981149673, + "step": 4490 + }, + { + "epoch": 0.29, + "learning_rate": 4.446184426083702e-06, + "logits/chosen": -2.2206637859344482, + "logits/rejected": -1.9963423013687134, + "logps/chosen": -199.42367553710938, + "logps/rejected": -231.6684112548828, + "loss": 0.0143, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.029296617954969406, + "rewards/margins": 0.11938565969467163, + "rewards/rejected": -0.14868226647377014, + "step": 4500 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -2.284329652786255, + "eval_logits/rejected": -2.0987653732299805, + "eval_logps/chosen": -238.03781127929688, + "eval_logps/rejected": -232.74058532714844, + "eval_loss": 0.026792826130986214, + "eval_rewards/accuracies": 0.637499988079071, + "eval_rewards/chosen": -0.030164305120706558, + "eval_rewards/margins": 0.07547909766435623, + "eval_rewards/rejected": -0.10564339905977249, + "eval_runtime": 712.6767, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 4500 + }, + { + "epoch": 0.3, + "learning_rate": 4.442595372914954e-06, + "logits/chosen": -2.3305094242095947, + "logits/rejected": -2.0600168704986572, + "logps/chosen": -239.54110717773438, + "logps/rejected": -170.85906982421875, + "loss": 0.0155, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.012803817167878151, + "rewards/margins": 0.08296112716197968, + "rewards/rejected": -0.09576494991779327, + "step": 4510 + }, + { + "epoch": 0.3, + "learning_rate": 4.43899618621715e-06, + "logits/chosen": -2.2653181552886963, + "logits/rejected": -2.0612456798553467, + "logps/chosen": -261.11163330078125, + "logps/rejected": -285.72015380859375, + "loss": 0.0401, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0502888560295105, + "rewards/margins": 0.10705902427434921, + "rewards/rejected": -0.1573478728532791, + "step": 4520 + }, + { + "epoch": 0.3, + "learning_rate": 4.4353868847654105e-06, + "logits/chosen": -2.385094165802002, + "logits/rejected": -2.149104595184326, + "logps/chosen": -249.2441864013672, + "logps/rejected": -231.818359375, + "loss": 0.0321, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.016929741948843002, + "rewards/margins": 0.0632086917757988, + "rewards/rejected": -0.08013845235109329, + "step": 4530 + }, + { + "epoch": 0.3, + "learning_rate": 4.43176748738762e-06, + "logits/chosen": -2.2918667793273926, + "logits/rejected": -2.0501952171325684, + "logps/chosen": -240.1075897216797, + "logps/rejected": -264.6262512207031, + "loss": 0.0155, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04079819470643997, + "rewards/margins": 0.10065152496099472, + "rewards/rejected": -0.1414497196674347, + "step": 4540 + }, + { + "epoch": 0.3, + "learning_rate": 4.4281380129643295e-06, + "logits/chosen": -2.201007127761841, + "logits/rejected": -2.040647506713867, + "logps/chosen": -233.6107940673828, + "logps/rejected": -243.1392822265625, + "loss": 0.0275, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.01930316351354122, + "rewards/margins": 0.10001279413700104, + "rewards/rejected": -0.11931595951318741, + "step": 4550 + }, + { + "epoch": 0.3, + "learning_rate": 4.424498480428654e-06, + "logits/chosen": -2.240109443664551, + "logits/rejected": -2.133995771408081, + "logps/chosen": -252.0578155517578, + "logps/rejected": -216.51962280273438, + "loss": 0.0364, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02392945997416973, + "rewards/margins": 0.02243615873157978, + "rewards/rejected": -0.04636561498045921, + "step": 4560 + }, + { + "epoch": 0.3, + "learning_rate": 4.420848908766178e-06, + "logits/chosen": -2.35581636428833, + "logits/rejected": -2.28523588180542, + "logps/chosen": -208.04129028320312, + "logps/rejected": -228.0386199951172, + "loss": 0.0295, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.004072178620845079, + "rewards/margins": 0.05558561533689499, + "rewards/rejected": -0.05965778976678848, + "step": 4570 + }, + { + "epoch": 0.3, + "learning_rate": 4.417189317014855e-06, + "logits/chosen": -2.1987602710723877, + "logits/rejected": -2.439194917678833, + "logps/chosen": -203.37686157226562, + "logps/rejected": -242.19515991210938, + "loss": 0.0299, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.012904942035675049, + "rewards/margins": 0.04060705006122589, + "rewards/rejected": -0.05351199582219124, + "step": 4580 + }, + { + "epoch": 0.3, + "learning_rate": 4.41351972426491e-06, + "logits/chosen": -2.1169800758361816, + "logits/rejected": -2.1452808380126953, + "logps/chosen": -253.7069854736328, + "logps/rejected": -320.93609619140625, + "loss": 0.0138, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03483807295560837, + "rewards/margins": 0.06588040292263031, + "rewards/rejected": -0.10071848332881927, + "step": 4590 + }, + { + "epoch": 0.3, + "learning_rate": 4.409840149658735e-06, + "logits/chosen": -2.2177650928497314, + "logits/rejected": -1.9569743871688843, + "logps/chosen": -286.93402099609375, + "logps/rejected": -251.4782257080078, + "loss": 0.0268, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.012083860114216805, + "rewards/margins": 0.07347994297742844, + "rewards/rejected": -0.08556380867958069, + "step": 4600 + }, + { + "epoch": 0.3, + "eval_logits/chosen": -2.2916858196258545, + "eval_logits/rejected": -2.106189489364624, + "eval_logps/chosen": -236.19700622558594, + "eval_logps/rejected": -229.6837615966797, + "eval_loss": 0.027024326846003532, + "eval_rewards/accuracies": 0.6399999856948853, + "eval_rewards/chosen": -0.02096020244061947, + "eval_rewards/margins": 0.06939905881881714, + "eval_rewards/rejected": -0.09035927057266235, + "eval_runtime": 715.4942, + "eval_samples_per_second": 2.795, + "eval_steps_per_second": 1.398, + "step": 4600 + }, + { + "epoch": 0.3, + "learning_rate": 4.4061506123907925e-06, + "logits/chosen": -2.209913969039917, + "logits/rejected": -2.0499978065490723, + "logps/chosen": -270.0508728027344, + "logps/rejected": -241.28359985351562, + "loss": 0.0317, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.029795369133353233, + "rewards/margins": 0.05769450217485428, + "rewards/rejected": -0.08748986572027206, + "step": 4610 + }, + { + "epoch": 0.3, + "learning_rate": 4.402451131707519e-06, + "logits/chosen": -2.403371810913086, + "logits/rejected": -1.9444561004638672, + "logps/chosen": -213.96875, + "logps/rejected": -159.59783935546875, + "loss": 0.0126, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.022516760975122452, + "rewards/margins": 0.10555239021778107, + "rewards/rejected": -0.12806914746761322, + "step": 4620 + }, + { + "epoch": 0.3, + "learning_rate": 4.398741726907215e-06, + "logits/chosen": -2.4314961433410645, + "logits/rejected": -2.115912437438965, + "logps/chosen": -281.1767272949219, + "logps/rejected": -257.37591552734375, + "loss": 0.023, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.009596886113286018, + "rewards/margins": 0.08262918889522552, + "rewards/rejected": -0.0922260731458664, + "step": 4630 + }, + { + "epoch": 0.3, + "learning_rate": 4.395022417339955e-06, + "logits/chosen": -2.1874938011169434, + "logits/rejected": -2.2346978187561035, + "logps/chosen": -219.12521362304688, + "logps/rejected": -241.1179656982422, + "loss": 0.0379, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.06250091642141342, + "rewards/margins": 0.06751805543899536, + "rewards/rejected": -0.13001897931098938, + "step": 4640 + }, + { + "epoch": 0.3, + "learning_rate": 4.391293222407479e-06, + "logits/chosen": -2.2946298122406006, + "logits/rejected": -2.295693874359131, + "logps/chosen": -140.21095275878906, + "logps/rejected": -170.3481903076172, + "loss": 0.0228, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.018268903717398643, + "rewards/margins": 0.059536147862672806, + "rewards/rejected": -0.0778050571680069, + "step": 4650 + }, + { + "epoch": 0.3, + "learning_rate": 4.387554161563094e-06, + "logits/chosen": -2.291577100753784, + "logits/rejected": -2.2055227756500244, + "logps/chosen": -207.60568237304688, + "logps/rejected": -217.01821899414062, + "loss": 0.0232, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04370753467082977, + "rewards/margins": 0.10616093873977661, + "rewards/rejected": -0.1498684585094452, + "step": 4660 + }, + { + "epoch": 0.31, + "learning_rate": 4.383805254311575e-06, + "logits/chosen": -2.48099684715271, + "logits/rejected": -2.1088850498199463, + "logps/chosen": -266.7301330566406, + "logps/rejected": -234.53079223632812, + "loss": 0.0447, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0529455728828907, + "rewards/margins": 0.06683431565761566, + "rewards/rejected": -0.11977989971637726, + "step": 4670 + }, + { + "epoch": 0.31, + "learning_rate": 4.380046520209056e-06, + "logits/chosen": -2.330359935760498, + "logits/rejected": -1.9653619527816772, + "logps/chosen": -211.83743286132812, + "logps/rejected": -205.95068359375, + "loss": 0.0206, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.057855475693941116, + "rewards/margins": 0.08976194262504578, + "rewards/rejected": -0.1476174145936966, + "step": 4680 + }, + { + "epoch": 0.31, + "learning_rate": 4.376277978862936e-06, + "logits/chosen": -2.192985773086548, + "logits/rejected": -1.9010270833969116, + "logps/chosen": -238.52273559570312, + "logps/rejected": -210.4039764404297, + "loss": 0.0262, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06267054378986359, + "rewards/margins": 0.06206362694501877, + "rewards/rejected": -0.12473416328430176, + "step": 4690 + }, + { + "epoch": 0.31, + "learning_rate": 4.372499649931774e-06, + "logits/chosen": -2.141416072845459, + "logits/rejected": -2.292172908782959, + "logps/chosen": -223.7112579345703, + "logps/rejected": -259.6845397949219, + "loss": 0.026, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.07309852540493011, + "rewards/margins": 0.14008431136608124, + "rewards/rejected": -0.21318283677101135, + "step": 4700 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -2.2986583709716797, + "eval_logits/rejected": -2.1118977069854736, + "eval_logps/chosen": -248.163818359375, + "eval_logps/rejected": -244.79864501953125, + "eval_loss": 0.02719779685139656, + "eval_rewards/accuracies": 0.6495000123977661, + "eval_rewards/chosen": -0.08079422265291214, + "eval_rewards/margins": 0.08513953536748886, + "eval_rewards/rejected": -0.165933758020401, + "eval_runtime": 715.3905, + "eval_samples_per_second": 2.796, + "eval_steps_per_second": 1.398, + "step": 4700 + }, + { + "epoch": 0.31, + "learning_rate": 4.368711553125185e-06, + "logits/chosen": -2.4827535152435303, + "logits/rejected": -2.2455661296844482, + "logps/chosen": -292.4847106933594, + "logps/rejected": -248.26742553710938, + "loss": 0.0258, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07647337019443512, + "rewards/margins": 0.07113082706928253, + "rewards/rejected": -0.14760419726371765, + "step": 4710 + }, + { + "epoch": 0.31, + "learning_rate": 4.364913708203734e-06, + "logits/chosen": -2.3889904022216797, + "logits/rejected": -2.042200803756714, + "logps/chosen": -302.0526428222656, + "logps/rejected": -241.18408203125, + "loss": 0.019, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.08578182011842728, + "rewards/margins": 0.0727434754371643, + "rewards/rejected": -0.15852530300617218, + "step": 4720 + }, + { + "epoch": 0.31, + "learning_rate": 4.361106134978844e-06, + "logits/chosen": -2.2721707820892334, + "logits/rejected": -2.069014072418213, + "logps/chosen": -283.931884765625, + "logps/rejected": -284.26580810546875, + "loss": 0.0276, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06750717014074326, + "rewards/margins": 0.06642328202724457, + "rewards/rejected": -0.13393045961856842, + "step": 4730 + }, + { + "epoch": 0.31, + "learning_rate": 4.357288853312681e-06, + "logits/chosen": -2.3525614738464355, + "logits/rejected": -2.2721951007843018, + "logps/chosen": -298.66815185546875, + "logps/rejected": -304.68682861328125, + "loss": 0.0145, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.07595177739858627, + "rewards/margins": 0.05377800017595291, + "rewards/rejected": -0.12972977757453918, + "step": 4740 + }, + { + "epoch": 0.31, + "learning_rate": 4.353461883118056e-06, + "logits/chosen": -2.251840591430664, + "logits/rejected": -2.1155953407287598, + "logps/chosen": -244.6503143310547, + "logps/rejected": -230.669921875, + "loss": 0.0379, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0767986848950386, + "rewards/margins": 0.03585362061858177, + "rewards/rejected": -0.11265231668949127, + "step": 4750 + }, + { + "epoch": 0.31, + "learning_rate": 4.34962524435832e-06, + "logits/chosen": -2.1269595623016357, + "logits/rejected": -2.0253615379333496, + "logps/chosen": -231.0450439453125, + "logps/rejected": -213.3219757080078, + "loss": 0.0403, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.056823186576366425, + "rewards/margins": 0.09555148333311081, + "rewards/rejected": -0.15237466990947723, + "step": 4760 + }, + { + "epoch": 0.31, + "learning_rate": 4.34577895704726e-06, + "logits/chosen": -2.380775213241577, + "logits/rejected": -2.2097785472869873, + "logps/chosen": -275.146240234375, + "logps/rejected": -261.49993896484375, + "loss": 0.0325, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06766113638877869, + "rewards/margins": 0.06038862466812134, + "rewards/rejected": -0.12804976105690002, + "step": 4770 + }, + { + "epoch": 0.31, + "learning_rate": 4.3419230412489954e-06, + "logits/chosen": -2.479979991912842, + "logits/rejected": -2.248835563659668, + "logps/chosen": -301.73004150390625, + "logps/rejected": -234.8883514404297, + "loss": 0.0449, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06181095913052559, + "rewards/margins": 0.046111006289720535, + "rewards/rejected": -0.10792195796966553, + "step": 4780 + }, + { + "epoch": 0.31, + "learning_rate": 4.338057517077872e-06, + "logits/chosen": -2.4250409603118896, + "logits/rejected": -2.017948627471924, + "logps/chosen": -203.47496032714844, + "logps/rejected": -193.13108825683594, + "loss": 0.0374, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.04716876894235611, + "rewards/margins": 0.16504201292991638, + "rewards/rejected": -0.21221080422401428, + "step": 4790 + }, + { + "epoch": 0.31, + "learning_rate": 4.334182404698356e-06, + "logits/chosen": -2.419837236404419, + "logits/rejected": -1.9811451435089111, + "logps/chosen": -246.45455932617188, + "logps/rejected": -179.24266052246094, + "loss": 0.0447, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0817376896739006, + "rewards/margins": 0.05593956634402275, + "rewards/rejected": -0.13767726719379425, + "step": 4800 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -2.2995717525482178, + "eval_logits/rejected": -2.113132953643799, + "eval_logps/chosen": -244.67567443847656, + "eval_logps/rejected": -240.1879425048828, + "eval_loss": 0.02654874138534069, + "eval_rewards/accuracies": 0.6464999914169312, + "eval_rewards/chosen": -0.06335365027189255, + "eval_rewards/margins": 0.07952655106782913, + "eval_rewards/rejected": -0.14288020133972168, + "eval_runtime": 712.5859, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.403, + "step": 4800 + }, + { + "epoch": 0.31, + "learning_rate": 4.330297724324933e-06, + "logits/chosen": -2.548460006713867, + "logits/rejected": -2.0144965648651123, + "logps/chosen": -317.345947265625, + "logps/rejected": -234.26913452148438, + "loss": 0.0118, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.044672705233097076, + "rewards/margins": 0.09061713516712189, + "rewards/rejected": -0.13528983294963837, + "step": 4810 + }, + { + "epoch": 0.32, + "learning_rate": 4.326403496221999e-06, + "logits/chosen": -2.212214469909668, + "logits/rejected": -2.1176552772521973, + "logps/chosen": -172.85696411132812, + "logps/rejected": -159.83572387695312, + "loss": 0.0338, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05713977664709091, + "rewards/margins": 0.06879337877035141, + "rewards/rejected": -0.12593314051628113, + "step": 4820 + }, + { + "epoch": 0.32, + "learning_rate": 4.322499740703755e-06, + "logits/chosen": -2.1856634616851807, + "logits/rejected": -2.288818120956421, + "logps/chosen": -200.39944458007812, + "logps/rejected": -236.4174041748047, + "loss": 0.0269, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.04150126129388809, + "rewards/margins": 0.05747325345873833, + "rewards/rejected": -0.09897451102733612, + "step": 4830 + }, + { + "epoch": 0.32, + "learning_rate": 4.318586478134101e-06, + "logits/chosen": -2.2040696144104004, + "logits/rejected": -2.1780283451080322, + "logps/chosen": -198.16943359375, + "logps/rejected": -173.2099151611328, + "loss": 0.0309, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.027509670704603195, + "rewards/margins": 0.07780478149652481, + "rewards/rejected": -0.10531443357467651, + "step": 4840 + }, + { + "epoch": 0.32, + "learning_rate": 4.314663728926534e-06, + "logits/chosen": -2.4546444416046143, + "logits/rejected": -2.205855131149292, + "logps/chosen": -269.7536315917969, + "logps/rejected": -273.2572326660156, + "loss": 0.0329, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06438954174518585, + "rewards/margins": 0.07040944695472717, + "rewards/rejected": -0.13479898869991302, + "step": 4850 + }, + { + "epoch": 0.32, + "learning_rate": 4.310731513544033e-06, + "logits/chosen": -2.256805896759033, + "logits/rejected": -2.0916640758514404, + "logps/chosen": -253.5753173828125, + "logps/rejected": -223.19534301757812, + "loss": 0.0317, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.053587764501571655, + "rewards/margins": 0.07849793136119843, + "rewards/rejected": -0.13208571076393127, + "step": 4860 + }, + { + "epoch": 0.32, + "learning_rate": 4.30678985249896e-06, + "logits/chosen": -2.2457427978515625, + "logits/rejected": -2.185237169265747, + "logps/chosen": -167.4801025390625, + "logps/rejected": -205.831787109375, + "loss": 0.0595, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.051650386303663254, + "rewards/margins": 0.10044083744287491, + "rewards/rejected": -0.15209123492240906, + "step": 4870 + }, + { + "epoch": 0.32, + "learning_rate": 4.302838766352952e-06, + "logits/chosen": -2.257594347000122, + "logits/rejected": -2.0400383472442627, + "logps/chosen": -265.0000915527344, + "logps/rejected": -244.8212890625, + "loss": 0.0288, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04703570157289505, + "rewards/margins": 0.07806356251239777, + "rewards/rejected": -0.12509925663471222, + "step": 4880 + }, + { + "epoch": 0.32, + "learning_rate": 4.298878275716806e-06, + "logits/chosen": -2.1690163612365723, + "logits/rejected": -2.1535024642944336, + "logps/chosen": -202.11068725585938, + "logps/rejected": -219.4486083984375, + "loss": 0.0347, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.051156263798475266, + "rewards/margins": 0.1006745845079422, + "rewards/rejected": -0.15183086693286896, + "step": 4890 + }, + { + "epoch": 0.32, + "learning_rate": 4.294908401250386e-06, + "logits/chosen": -2.38234281539917, + "logits/rejected": -1.940474271774292, + "logps/chosen": -226.0990753173828, + "logps/rejected": -201.27613830566406, + "loss": 0.0311, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05678023770451546, + "rewards/margins": 0.09854695945978165, + "rewards/rejected": -0.1553271859884262, + "step": 4900 + }, + { + "epoch": 0.32, + "eval_logits/chosen": -2.2562031745910645, + "eval_logits/rejected": -2.072751045227051, + "eval_logps/chosen": -240.35696411132812, + "eval_logps/rejected": -237.6829833984375, + "eval_loss": 0.026891114190220833, + "eval_rewards/accuracies": 0.6470000147819519, + "eval_rewards/chosen": -0.04176010936498642, + "eval_rewards/margins": 0.08859530091285706, + "eval_rewards/rejected": -0.13035540282726288, + "eval_runtime": 715.6727, + "eval_samples_per_second": 2.795, + "eval_steps_per_second": 1.397, + "step": 4900 + }, + { + "epoch": 0.32, + "learning_rate": 4.290929163662498e-06, + "logits/chosen": -2.119206428527832, + "logits/rejected": -1.9020423889160156, + "logps/chosen": -276.1395568847656, + "logps/rejected": -236.4208221435547, + "loss": 0.03, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.032021451741456985, + "rewards/margins": 0.08998885005712509, + "rewards/rejected": -0.12201030552387238, + "step": 4910 + }, + { + "epoch": 0.32, + "learning_rate": 4.286940583710796e-06, + "logits/chosen": -2.294553279876709, + "logits/rejected": -2.1739342212677, + "logps/chosen": -303.06158447265625, + "logps/rejected": -267.4081115722656, + "loss": 0.0132, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.051564522087574005, + "rewards/margins": 0.10488219559192657, + "rewards/rejected": -0.15644671022891998, + "step": 4920 + }, + { + "epoch": 0.32, + "learning_rate": 4.282942682201667e-06, + "logits/chosen": -2.1719789505004883, + "logits/rejected": -1.9019883871078491, + "logps/chosen": -264.94378662109375, + "logps/rejected": -242.8894500732422, + "loss": 0.0372, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06417985260486603, + "rewards/margins": 0.08766107261180878, + "rewards/rejected": -0.15184089541435242, + "step": 4930 + }, + { + "epoch": 0.32, + "learning_rate": 4.278935479990123e-06, + "logits/chosen": -2.463348388671875, + "logits/rejected": -2.2190678119659424, + "logps/chosen": -219.1842498779297, + "logps/rejected": -188.9755401611328, + "loss": 0.0271, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07260443270206451, + "rewards/margins": 0.0638137012720108, + "rewards/rejected": -0.1364181488752365, + "step": 4940 + }, + { + "epoch": 0.32, + "learning_rate": 4.274918997979695e-06, + "logits/chosen": -2.2152819633483887, + "logits/rejected": -2.24706768989563, + "logps/chosen": -211.72219848632812, + "logps/rejected": -226.01937866210938, + "loss": 0.0393, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.09043584764003754, + "rewards/margins": 0.06280899047851562, + "rewards/rejected": -0.15324482321739197, + "step": 4950 + }, + { + "epoch": 0.32, + "learning_rate": 4.270893257122319e-06, + "logits/chosen": -2.1276659965515137, + "logits/rejected": -1.8986365795135498, + "logps/chosen": -242.81045532226562, + "logps/rejected": -299.8619689941406, + "loss": 0.0196, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.07044224441051483, + "rewards/margins": 0.13771095871925354, + "rewards/rejected": -0.20815320312976837, + "step": 4960 + }, + { + "epoch": 0.33, + "learning_rate": 4.266858278418232e-06, + "logits/chosen": -2.1353728771209717, + "logits/rejected": -1.8892120122909546, + "logps/chosen": -251.4339141845703, + "logps/rejected": -242.8282470703125, + "loss": 0.0278, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07414842396974564, + "rewards/margins": 0.05872755125164986, + "rewards/rejected": -0.1328759789466858, + "step": 4970 + }, + { + "epoch": 0.33, + "learning_rate": 4.26281408291586e-06, + "logits/chosen": -2.371889591217041, + "logits/rejected": -2.068842887878418, + "logps/chosen": -251.660888671875, + "logps/rejected": -242.34890747070312, + "loss": 0.0345, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.046618033200502396, + "rewards/margins": 0.1011015996336937, + "rewards/rejected": -0.14771965146064758, + "step": 4980 + }, + { + "epoch": 0.33, + "learning_rate": 4.258760691711706e-06, + "logits/chosen": -2.270484685897827, + "logits/rejected": -2.146216630935669, + "logps/chosen": -211.3184051513672, + "logps/rejected": -222.37344360351562, + "loss": 0.0197, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06928422302007675, + "rewards/margins": 0.07823914289474487, + "rewards/rejected": -0.14752335846424103, + "step": 4990 + }, + { + "epoch": 0.33, + "learning_rate": 4.254698125950247e-06, + "logits/chosen": -2.5137898921966553, + "logits/rejected": -2.2680342197418213, + "logps/chosen": -311.6083679199219, + "logps/rejected": -276.7689208984375, + "loss": 0.0241, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05296919494867325, + "rewards/margins": 0.05913100391626358, + "rewards/rejected": -0.11210019886493683, + "step": 5000 + }, + { + "epoch": 0.33, + "eval_logits/chosen": -2.270616292953491, + "eval_logits/rejected": -2.0851943492889404, + "eval_logps/chosen": -244.52313232421875, + "eval_logps/rejected": -241.18063354492188, + "eval_loss": 0.02674746699631214, + "eval_rewards/accuracies": 0.6424999833106995, + "eval_rewards/chosen": -0.06259080767631531, + "eval_rewards/margins": 0.08525291085243225, + "eval_rewards/rejected": -0.14784371852874756, + "eval_runtime": 711.8804, + "eval_samples_per_second": 2.809, + "eval_steps_per_second": 1.405, + "step": 5000 + }, + { + "epoch": 0.33, + "learning_rate": 4.250626406823815e-06, + "logits/chosen": -2.325876235961914, + "logits/rejected": -2.0665838718414307, + "logps/chosen": -231.2372589111328, + "logps/rejected": -277.8995056152344, + "loss": 0.0406, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07187283039093018, + "rewards/margins": 0.15242110192775726, + "rewards/rejected": -0.22429391741752625, + "step": 5010 + }, + { + "epoch": 0.33, + "learning_rate": 4.246545555572489e-06, + "logits/chosen": -2.245234727859497, + "logits/rejected": -2.11665678024292, + "logps/chosen": -170.9512481689453, + "logps/rejected": -217.70150756835938, + "loss": 0.0252, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.08377556502819061, + "rewards/margins": 0.1089911088347435, + "rewards/rejected": -0.1927666962146759, + "step": 5020 + }, + { + "epoch": 0.33, + "learning_rate": 4.242455593483992e-06, + "logits/chosen": -2.338961362838745, + "logits/rejected": -2.139385223388672, + "logps/chosen": -236.36843872070312, + "logps/rejected": -198.01046752929688, + "loss": 0.0224, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0860726535320282, + "rewards/margins": 0.056757472455501556, + "rewards/rejected": -0.14283011853694916, + "step": 5030 + }, + { + "epoch": 0.33, + "learning_rate": 4.238356541893567e-06, + "logits/chosen": -2.221163511276245, + "logits/rejected": -2.0838024616241455, + "logps/chosen": -210.2323760986328, + "logps/rejected": -213.17843627929688, + "loss": 0.0203, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10049609839916229, + "rewards/margins": 0.09244342148303986, + "rewards/rejected": -0.19293954968452454, + "step": 5040 + }, + { + "epoch": 0.33, + "learning_rate": 4.234248422183876e-06, + "logits/chosen": -2.0850110054016113, + "logits/rejected": -2.282813310623169, + "logps/chosen": -258.5987548828125, + "logps/rejected": -266.34283447265625, + "loss": 0.0421, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07832027971744537, + "rewards/margins": 0.06150209158658981, + "rewards/rejected": -0.13982237875461578, + "step": 5050 + }, + { + "epoch": 0.33, + "learning_rate": 4.230131255784884e-06, + "logits/chosen": -2.5053064823150635, + "logits/rejected": -2.1963772773742676, + "logps/chosen": -271.65020751953125, + "logps/rejected": -268.30572509765625, + "loss": 0.0258, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06883566081523895, + "rewards/margins": 0.0772596225142479, + "rewards/rejected": -0.14609530568122864, + "step": 5060 + }, + { + "epoch": 0.33, + "learning_rate": 4.226005064173748e-06, + "logits/chosen": -2.295358180999756, + "logits/rejected": -2.1440882682800293, + "logps/chosen": -276.32818603515625, + "logps/rejected": -307.0455627441406, + "loss": 0.0157, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.058419596403837204, + "rewards/margins": 0.06130147725343704, + "rewards/rejected": -0.11972107738256454, + "step": 5070 + }, + { + "epoch": 0.33, + "learning_rate": 4.2218698688747035e-06, + "logits/chosen": -2.116386890411377, + "logits/rejected": -1.962472915649414, + "logps/chosen": -252.8600311279297, + "logps/rejected": -227.2456817626953, + "loss": 0.0202, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1007535308599472, + "rewards/margins": 0.09757916629314423, + "rewards/rejected": -0.19833272695541382, + "step": 5080 + }, + { + "epoch": 0.33, + "learning_rate": 4.217725691458957e-06, + "logits/chosen": -2.4443917274475098, + "logits/rejected": -2.2689287662506104, + "logps/chosen": -199.50445556640625, + "logps/rejected": -245.4992218017578, + "loss": 0.0274, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07640162855386734, + "rewards/margins": 0.09524401277303696, + "rewards/rejected": -0.1716456115245819, + "step": 5090 + }, + { + "epoch": 0.33, + "learning_rate": 4.213572553544565e-06, + "logits/chosen": -2.3300154209136963, + "logits/rejected": -2.0967860221862793, + "logps/chosen": -254.84469604492188, + "logps/rejected": -264.0647277832031, + "loss": 0.0183, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06936436146497726, + "rewards/margins": 0.08292285352945328, + "rewards/rejected": -0.15228721499443054, + "step": 5100 + }, + { + "epoch": 0.33, + "eval_logits/chosen": -2.284034252166748, + "eval_logits/rejected": -2.097978353500366, + "eval_logps/chosen": -243.7823944091797, + "eval_logps/rejected": -239.09414672851562, + "eval_loss": 0.026595089584589005, + "eval_rewards/accuracies": 0.6414999961853027, + "eval_rewards/chosen": -0.05888722091913223, + "eval_rewards/margins": 0.07852396368980408, + "eval_rewards/rejected": -0.1374111771583557, + "eval_runtime": 712.368, + "eval_samples_per_second": 2.808, + "eval_steps_per_second": 1.404, + "step": 5100 + }, + { + "epoch": 0.33, + "learning_rate": 4.209410476796331e-06, + "logits/chosen": -2.2061429023742676, + "logits/rejected": -2.149639844894409, + "logps/chosen": -185.64755249023438, + "logps/rejected": -192.6439666748047, + "loss": 0.0358, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07278040796518326, + "rewards/margins": 0.08492831885814667, + "rewards/rejected": -0.15770871937274933, + "step": 5110 + }, + { + "epoch": 0.33, + "learning_rate": 4.205239482925686e-06, + "logits/chosen": -2.0843310356140137, + "logits/rejected": -2.1315224170684814, + "logps/chosen": -200.5720672607422, + "logps/rejected": -232.9461669921875, + "loss": 0.0355, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06378359347581863, + "rewards/margins": 0.051238518208265305, + "rewards/rejected": -0.11502210050821304, + "step": 5120 + }, + { + "epoch": 0.34, + "learning_rate": 4.201059593690577e-06, + "logits/chosen": -2.3437514305114746, + "logits/rejected": -2.255739688873291, + "logps/chosen": -237.38357543945312, + "logps/rejected": -225.99667358398438, + "loss": 0.0105, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05897898226976395, + "rewards/margins": 0.07021085917949677, + "rewards/rejected": -0.1291898488998413, + "step": 5130 + }, + { + "epoch": 0.34, + "learning_rate": 4.196870830895354e-06, + "logits/chosen": -2.1586525440216064, + "logits/rejected": -2.1691572666168213, + "logps/chosen": -270.2343444824219, + "logps/rejected": -332.2071228027344, + "loss": 0.0195, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0624338760972023, + "rewards/margins": 0.06232718750834465, + "rewards/rejected": -0.12476108223199844, + "step": 5140 + }, + { + "epoch": 0.34, + "learning_rate": 4.192673216390657e-06, + "logits/chosen": -2.3356637954711914, + "logits/rejected": -2.072592258453369, + "logps/chosen": -243.0529327392578, + "logps/rejected": -221.76089477539062, + "loss": 0.0356, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.05039996653795242, + "rewards/margins": 0.08517764508724213, + "rewards/rejected": -0.13557760417461395, + "step": 5150 + }, + { + "epoch": 0.34, + "learning_rate": 4.188466772073296e-06, + "logits/chosen": -2.4300613403320312, + "logits/rejected": -2.1226162910461426, + "logps/chosen": -234.875732421875, + "logps/rejected": -223.30734252929688, + "loss": 0.0189, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07175593078136444, + "rewards/margins": 0.04596634581685066, + "rewards/rejected": -0.1177222728729248, + "step": 5160 + }, + { + "epoch": 0.34, + "learning_rate": 4.184251519886148e-06, + "logits/chosen": -2.170525074005127, + "logits/rejected": -2.264866828918457, + "logps/chosen": -217.9490203857422, + "logps/rejected": -259.9535827636719, + "loss": 0.0347, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.10920746624469757, + "rewards/margins": 0.06674760580062866, + "rewards/rejected": -0.17595505714416504, + "step": 5170 + }, + { + "epoch": 0.34, + "learning_rate": 4.180027481818033e-06, + "logits/chosen": -2.2939534187316895, + "logits/rejected": -2.2727739810943604, + "logps/chosen": -285.91156005859375, + "logps/rejected": -258.90972900390625, + "loss": 0.0255, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.09606721997261047, + "rewards/margins": 0.0541587769985199, + "rewards/rejected": -0.15022599697113037, + "step": 5180 + }, + { + "epoch": 0.34, + "learning_rate": 4.175794679903602e-06, + "logits/chosen": -2.3346500396728516, + "logits/rejected": -2.097072124481201, + "logps/chosen": -246.05990600585938, + "logps/rejected": -191.774169921875, + "loss": 0.0407, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0920952558517456, + "rewards/margins": 0.09035644680261612, + "rewards/rejected": -0.18245169520378113, + "step": 5190 + }, + { + "epoch": 0.34, + "learning_rate": 4.171553136223222e-06, + "logits/chosen": -2.28583025932312, + "logits/rejected": -2.2817013263702393, + "logps/chosen": -294.59600830078125, + "logps/rejected": -325.22100830078125, + "loss": 0.0196, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1363758146762848, + "rewards/margins": 0.09957583248615265, + "rewards/rejected": -0.23595163226127625, + "step": 5200 + }, + { + "epoch": 0.34, + "eval_logits/chosen": -2.30310320854187, + "eval_logits/rejected": -2.115068197250366, + "eval_logps/chosen": -256.06915283203125, + "eval_logps/rejected": -251.5511932373047, + "eval_loss": 0.028050120919942856, + "eval_rewards/accuracies": 0.6439999938011169, + "eval_rewards/chosen": -0.12032100558280945, + "eval_rewards/margins": 0.07937540858983994, + "eval_rewards/rejected": -0.1996964067220688, + "eval_runtime": 712.3698, + "eval_samples_per_second": 2.808, + "eval_steps_per_second": 1.404, + "step": 5200 + }, + { + "epoch": 0.34, + "learning_rate": 4.167302872902865e-06, + "logits/chosen": -2.3044514656066895, + "logits/rejected": -2.2063915729522705, + "logps/chosen": -279.16839599609375, + "logps/rejected": -289.25299072265625, + "loss": 0.0286, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.13850130140781403, + "rewards/margins": 0.11011314392089844, + "rewards/rejected": -0.24861443042755127, + "step": 5210 + }, + { + "epoch": 0.34, + "learning_rate": 4.163043912113985e-06, + "logits/chosen": -2.3407082557678223, + "logits/rejected": -2.1088497638702393, + "logps/chosen": -279.55242919921875, + "logps/rejected": -257.5092468261719, + "loss": 0.0235, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.10256153345108032, + "rewards/margins": 0.05975686386227608, + "rewards/rejected": -0.1623183935880661, + "step": 5220 + }, + { + "epoch": 0.34, + "learning_rate": 4.15877627607341e-06, + "logits/chosen": -2.1317286491394043, + "logits/rejected": -2.0381064414978027, + "logps/chosen": -235.8717803955078, + "logps/rejected": -225.30429077148438, + "loss": 0.0238, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09638473391532898, + "rewards/margins": 0.06827443093061447, + "rewards/rejected": -0.16465915739536285, + "step": 5230 + }, + { + "epoch": 0.34, + "learning_rate": 4.154499987043217e-06, + "logits/chosen": -2.360839366912842, + "logits/rejected": -2.13840913772583, + "logps/chosen": -242.048095703125, + "logps/rejected": -244.0096893310547, + "loss": 0.0115, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08922268450260162, + "rewards/margins": 0.12070658057928085, + "rewards/rejected": -0.20992930233478546, + "step": 5240 + }, + { + "epoch": 0.34, + "learning_rate": 4.150215067330625e-06, + "logits/chosen": -2.1910433769226074, + "logits/rejected": -2.1369471549987793, + "logps/chosen": -230.94845581054688, + "logps/rejected": -262.9044494628906, + "loss": 0.0371, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.10949836671352386, + "rewards/margins": 0.08959327638149261, + "rewards/rejected": -0.19909165799617767, + "step": 5250 + }, + { + "epoch": 0.34, + "learning_rate": 4.145921539287876e-06, + "logits/chosen": -2.2304296493530273, + "logits/rejected": -1.951674222946167, + "logps/chosen": -209.9062042236328, + "logps/rejected": -213.408935546875, + "loss": 0.022, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09656783193349838, + "rewards/margins": 0.11753030121326447, + "rewards/rejected": -0.21409812569618225, + "step": 5260 + }, + { + "epoch": 0.34, + "learning_rate": 4.141619425312115e-06, + "logits/chosen": -2.3175554275512695, + "logits/rejected": -1.9529327154159546, + "logps/chosen": -228.7890625, + "logps/rejected": -215.639892578125, + "loss": 0.021, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.10068907588720322, + "rewards/margins": 0.056463442742824554, + "rewards/rejected": -0.15715253353118896, + "step": 5270 + }, + { + "epoch": 0.35, + "learning_rate": 4.1373087478452735e-06, + "logits/chosen": -2.446063756942749, + "logits/rejected": -2.040144205093384, + "logps/chosen": -240.7342071533203, + "logps/rejected": -214.44467163085938, + "loss": 0.0402, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.07746803760528564, + "rewards/margins": 0.14162200689315796, + "rewards/rejected": -0.2190900295972824, + "step": 5280 + }, + { + "epoch": 0.35, + "learning_rate": 4.132989529373959e-06, + "logits/chosen": -2.3407349586486816, + "logits/rejected": -1.9027103185653687, + "logps/chosen": -274.0345458984375, + "logps/rejected": -211.4494171142578, + "loss": 0.021, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.09060852229595184, + "rewards/margins": 0.08206693828105927, + "rewards/rejected": -0.17267544567584991, + "step": 5290 + }, + { + "epoch": 0.35, + "learning_rate": 4.128661792429331e-06, + "logits/chosen": -2.353055477142334, + "logits/rejected": -2.1825497150421143, + "logps/chosen": -269.4848327636719, + "logps/rejected": -285.455322265625, + "loss": 0.0218, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0785713791847229, + "rewards/margins": 0.05537046119570732, + "rewards/rejected": -0.13394184410572052, + "step": 5300 + }, + { + "epoch": 0.35, + "eval_logits/chosen": -2.3077661991119385, + "eval_logits/rejected": -2.119899034500122, + "eval_logps/chosen": -249.09693908691406, + "eval_logps/rejected": -245.1140594482422, + "eval_loss": 0.02842918410897255, + "eval_rewards/accuracies": 0.6445000171661377, + "eval_rewards/chosen": -0.08545980602502823, + "eval_rewards/margins": 0.08205102384090424, + "eval_rewards/rejected": -0.16751083731651306, + "eval_runtime": 714.4322, + "eval_samples_per_second": 2.799, + "eval_steps_per_second": 1.4, + "step": 5300 + }, + { + "epoch": 0.35, + "learning_rate": 4.124325559586985e-06, + "logits/chosen": -2.063711643218994, + "logits/rejected": -2.1065452098846436, + "logps/chosen": -213.73178100585938, + "logps/rejected": -230.61123657226562, + "loss": 0.0493, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.12837538123130798, + "rewards/margins": 0.01977720856666565, + "rewards/rejected": -0.14815255999565125, + "step": 5310 + }, + { + "epoch": 0.35, + "learning_rate": 4.119980853466835e-06, + "logits/chosen": -2.285341262817383, + "logits/rejected": -1.893385648727417, + "logps/chosen": -224.71261596679688, + "logps/rejected": -217.3481903076172, + "loss": 0.0409, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0777909979224205, + "rewards/margins": 0.09874869883060455, + "rewards/rejected": -0.17653970420360565, + "step": 5320 + }, + { + "epoch": 0.35, + "learning_rate": 4.115627696732997e-06, + "logits/chosen": -2.2187039852142334, + "logits/rejected": -2.0215742588043213, + "logps/chosen": -204.32298278808594, + "logps/rejected": -198.72189331054688, + "loss": 0.0249, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06454373896121979, + "rewards/margins": 0.06859930604696274, + "rewards/rejected": -0.13314305245876312, + "step": 5330 + }, + { + "epoch": 0.35, + "learning_rate": 4.111266112093668e-06, + "logits/chosen": -2.3168303966522217, + "logits/rejected": -2.1125686168670654, + "logps/chosen": -218.3974151611328, + "logps/rejected": -258.51153564453125, + "loss": 0.0219, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.07464858889579773, + "rewards/margins": 0.10858140140771866, + "rewards/rejected": -0.1832299828529358, + "step": 5340 + }, + { + "epoch": 0.35, + "learning_rate": 4.1068961223010115e-06, + "logits/chosen": -2.292290687561035, + "logits/rejected": -2.0046138763427734, + "logps/chosen": -283.95220947265625, + "logps/rejected": -281.2190856933594, + "loss": 0.0281, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0662577822804451, + "rewards/margins": 0.09971605241298676, + "rewards/rejected": -0.16597382724285126, + "step": 5350 + }, + { + "epoch": 0.35, + "learning_rate": 4.102517750151034e-06, + "logits/chosen": -2.3518424034118652, + "logits/rejected": -2.109513282775879, + "logps/chosen": -303.15936279296875, + "logps/rejected": -241.0292205810547, + "loss": 0.0306, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.04057258740067482, + "rewards/margins": 0.04591400548815727, + "rewards/rejected": -0.08648659288883209, + "step": 5360 + }, + { + "epoch": 0.35, + "learning_rate": 4.09813101848347e-06, + "logits/chosen": -2.2338290214538574, + "logits/rejected": -2.2780566215515137, + "logps/chosen": -224.85452270507812, + "logps/rejected": -255.304931640625, + "loss": 0.0207, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02934291400015354, + "rewards/margins": 0.06648119539022446, + "rewards/rejected": -0.09582411497831345, + "step": 5370 + }, + { + "epoch": 0.35, + "learning_rate": 4.093735950181659e-06, + "logits/chosen": -2.1906943321228027, + "logits/rejected": -2.094470500946045, + "logps/chosen": -227.29638671875, + "logps/rejected": -271.4776306152344, + "loss": 0.0146, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.009115161374211311, + "rewards/margins": 0.09818967431783676, + "rewards/rejected": -0.10730484873056412, + "step": 5380 + }, + { + "epoch": 0.35, + "learning_rate": 4.0893325681724326e-06, + "logits/chosen": -2.3018577098846436, + "logits/rejected": -2.2464919090270996, + "logps/chosen": -268.96588134765625, + "logps/rejected": -270.78460693359375, + "loss": 0.0357, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.052720922976732254, + "rewards/margins": 0.07049056887626648, + "rewards/rejected": -0.12321150302886963, + "step": 5390 + }, + { + "epoch": 0.35, + "learning_rate": 4.084920895425988e-06, + "logits/chosen": -2.249305009841919, + "logits/rejected": -2.2471258640289307, + "logps/chosen": -249.9848175048828, + "logps/rejected": -277.1879577636719, + "loss": 0.0392, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06310076266527176, + "rewards/margins": 0.06779120117425919, + "rewards/rejected": -0.13089194893836975, + "step": 5400 + }, + { + "epoch": 0.35, + "eval_logits/chosen": -2.3067572116851807, + "eval_logits/rejected": -2.1202468872070312, + "eval_logps/chosen": -236.23129272460938, + "eval_logps/rejected": -229.63597106933594, + "eval_loss": 0.027629448100924492, + "eval_rewards/accuracies": 0.6320000290870667, + "eval_rewards/chosen": -0.021131761372089386, + "eval_rewards/margins": 0.06898857653141022, + "eval_rewards/rejected": -0.0901203379034996, + "eval_runtime": 712.2572, + "eval_samples_per_second": 2.808, + "eval_steps_per_second": 1.404, + "step": 5400 + }, + { + "epoch": 0.35, + "learning_rate": 4.080500954955769e-06, + "logits/chosen": -2.2228169441223145, + "logits/rejected": -1.9339510202407837, + "logps/chosen": -263.3153991699219, + "logps/rejected": -259.1683349609375, + "loss": 0.0337, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03721725195646286, + "rewards/margins": 0.07069588452577591, + "rewards/rejected": -0.10791312158107758, + "step": 5410 + }, + { + "epoch": 0.35, + "learning_rate": 4.076072769818354e-06, + "logits/chosen": -2.476381301879883, + "logits/rejected": -2.027029514312744, + "logps/chosen": -248.3318328857422, + "logps/rejected": -200.37460327148438, + "loss": 0.012, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.023047439754009247, + "rewards/margins": 0.07004229724407196, + "rewards/rejected": -0.0930897444486618, + "step": 5420 + }, + { + "epoch": 0.36, + "learning_rate": 4.071636363113323e-06, + "logits/chosen": -2.0812182426452637, + "logits/rejected": -2.0051701068878174, + "logps/chosen": -270.99114990234375, + "logps/rejected": -228.74618530273438, + "loss": 0.0269, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.014305288903415203, + "rewards/margins": 0.05898820608854294, + "rewards/rejected": -0.07329348474740982, + "step": 5430 + }, + { + "epoch": 0.36, + "learning_rate": 4.067191757983146e-06, + "logits/chosen": -2.0510897636413574, + "logits/rejected": -2.1016201972961426, + "logps/chosen": -237.39096069335938, + "logps/rejected": -252.74520874023438, + "loss": 0.0322, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0390765555202961, + "rewards/margins": 0.09284119307994843, + "rewards/rejected": -0.13191775977611542, + "step": 5440 + }, + { + "epoch": 0.36, + "learning_rate": 4.062738977613063e-06, + "logits/chosen": -2.253300666809082, + "logits/rejected": -2.1287381649017334, + "logps/chosen": -238.584228515625, + "logps/rejected": -205.4900665283203, + "loss": 0.0258, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.020539533346891403, + "rewards/margins": 0.07649553567171097, + "rewards/rejected": -0.09703507274389267, + "step": 5450 + }, + { + "epoch": 0.36, + "learning_rate": 4.058278045230957e-06, + "logits/chosen": -2.1818337440490723, + "logits/rejected": -2.1889472007751465, + "logps/chosen": -224.8425750732422, + "logps/rejected": -237.52249145507812, + "loss": 0.0316, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.032378651201725006, + "rewards/margins": 0.055461399257183075, + "rewards/rejected": -0.08784005045890808, + "step": 5460 + }, + { + "epoch": 0.36, + "learning_rate": 4.053808984107235e-06, + "logits/chosen": -2.401970863342285, + "logits/rejected": -2.076896905899048, + "logps/chosen": -234.11154174804688, + "logps/rejected": -203.02137756347656, + "loss": 0.046, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.006216132082045078, + "rewards/margins": 0.04601501673460007, + "rewards/rejected": -0.05223115161061287, + "step": 5470 + }, + { + "epoch": 0.36, + "learning_rate": 4.04933181755471e-06, + "logits/chosen": -2.3881890773773193, + "logits/rejected": -2.3108270168304443, + "logps/chosen": -207.8488006591797, + "logps/rejected": -216.2948760986328, + "loss": 0.0403, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.006340887397527695, + "rewards/margins": 0.07829299569129944, + "rewards/rejected": -0.08463388681411743, + "step": 5480 + }, + { + "epoch": 0.36, + "learning_rate": 4.044846568928477e-06, + "logits/chosen": -2.273111343383789, + "logits/rejected": -2.347074270248413, + "logps/chosen": -264.272216796875, + "logps/rejected": -267.3084411621094, + "loss": 0.0372, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01374004315584898, + "rewards/margins": 0.059470999985933304, + "rewards/rejected": -0.07321105152368546, + "step": 5490 + }, + { + "epoch": 0.36, + "learning_rate": 4.040353261625788e-06, + "logits/chosen": -2.439272403717041, + "logits/rejected": -2.0590574741363525, + "logps/chosen": -275.3080139160156, + "logps/rejected": -258.0126037597656, + "loss": 0.0095, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.006186123006045818, + "rewards/margins": 0.10667552798986435, + "rewards/rejected": -0.10048942267894745, + "step": 5500 + }, + { + "epoch": 0.36, + "eval_logits/chosen": -2.3002686500549316, + "eval_logits/rejected": -2.1143617630004883, + "eval_logps/chosen": -234.17333984375, + "eval_logps/rejected": -228.3682861328125, + "eval_loss": 0.027793213725090027, + "eval_rewards/accuracies": 0.6365000009536743, + "eval_rewards/chosen": -0.010841944254934788, + "eval_rewards/margins": 0.07293994724750519, + "eval_rewards/rejected": -0.083781898021698, + "eval_runtime": 713.3002, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 1.402, + "step": 5500 + }, + { + "epoch": 0.36, + "learning_rate": 4.035851919085936e-06, + "logits/chosen": -2.2740674018859863, + "logits/rejected": -2.147181510925293, + "logps/chosen": -273.53179931640625, + "logps/rejected": -211.1166229248047, + "loss": 0.0156, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.030688535422086716, + "rewards/margins": 0.08148294687271118, + "rewards/rejected": -0.1121714860200882, + "step": 5510 + }, + { + "epoch": 0.36, + "learning_rate": 4.031342564790128e-06, + "logits/chosen": -2.2105610370635986, + "logits/rejected": -2.0671443939208984, + "logps/chosen": -207.53897094726562, + "logps/rejected": -222.29736328125, + "loss": 0.0381, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.007836338132619858, + "rewards/margins": 0.09765578806400299, + "rewards/rejected": -0.10549211502075195, + "step": 5520 + }, + { + "epoch": 0.36, + "learning_rate": 4.026825222261367e-06, + "logits/chosen": -2.228231906890869, + "logits/rejected": -1.981793761253357, + "logps/chosen": -187.2061767578125, + "logps/rejected": -192.12667846679688, + "loss": 0.0467, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07065559923648834, + "rewards/margins": 0.06031841039657593, + "rewards/rejected": -0.13097400963306427, + "step": 5530 + }, + { + "epoch": 0.36, + "learning_rate": 4.022299915064321e-06, + "logits/chosen": -2.298172950744629, + "logits/rejected": -2.099940299987793, + "logps/chosen": -309.3241882324219, + "logps/rejected": -271.7220458984375, + "loss": 0.0359, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.017381075769662857, + "rewards/margins": 0.0775168314576149, + "rewards/rejected": -0.09489791095256805, + "step": 5540 + }, + { + "epoch": 0.36, + "learning_rate": 4.017766666805213e-06, + "logits/chosen": -2.104163646697998, + "logits/rejected": -2.0608696937561035, + "logps/chosen": -222.52157592773438, + "logps/rejected": -203.38827514648438, + "loss": 0.0323, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.045006077736616135, + "rewards/margins": 0.081141397356987, + "rewards/rejected": -0.12614747881889343, + "step": 5550 + }, + { + "epoch": 0.36, + "learning_rate": 4.013225501131684e-06, + "logits/chosen": -2.3281373977661133, + "logits/rejected": -2.0518734455108643, + "logps/chosen": -221.41348266601562, + "logps/rejected": -207.80209350585938, + "loss": 0.0223, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.02017935737967491, + "rewards/margins": 0.06680725514888763, + "rewards/rejected": -0.08698661625385284, + "step": 5560 + }, + { + "epoch": 0.36, + "learning_rate": 4.008676441732679e-06, + "logits/chosen": -2.27862548828125, + "logits/rejected": -1.9543254375457764, + "logps/chosen": -214.8693084716797, + "logps/rejected": -177.15866088867188, + "loss": 0.0424, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.04287482053041458, + "rewards/margins": 0.051451247185468674, + "rewards/rejected": -0.09432607889175415, + "step": 5570 + }, + { + "epoch": 0.37, + "learning_rate": 4.00411951233832e-06, + "logits/chosen": -2.4121203422546387, + "logits/rejected": -2.0738954544067383, + "logps/chosen": -225.67214965820312, + "logps/rejected": -198.25326538085938, + "loss": 0.0355, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02973489835858345, + "rewards/margins": 0.08627339452505112, + "rewards/rejected": -0.11600829660892487, + "step": 5580 + }, + { + "epoch": 0.37, + "learning_rate": 3.999554736719785e-06, + "logits/chosen": -2.138333797454834, + "logits/rejected": -2.0238735675811768, + "logps/chosen": -297.84124755859375, + "logps/rejected": -269.1844177246094, + "loss": 0.0249, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02932184934616089, + "rewards/margins": 0.07430565357208252, + "rewards/rejected": -0.10362748801708221, + "step": 5590 + }, + { + "epoch": 0.37, + "learning_rate": 3.994982138689177e-06, + "logits/chosen": -2.40543794631958, + "logits/rejected": -2.21651029586792, + "logps/chosen": -243.3259735107422, + "logps/rejected": -252.36026000976562, + "loss": 0.0199, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02962910570204258, + "rewards/margins": 0.054427288472652435, + "rewards/rejected": -0.08405639231204987, + "step": 5600 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -2.2606699466705322, + "eval_logits/rejected": -2.076427459716797, + "eval_logps/chosen": -241.3705596923828, + "eval_logps/rejected": -237.5136260986328, + "eval_loss": 0.027888035401701927, + "eval_rewards/accuracies": 0.6430000066757202, + "eval_rewards/chosen": -0.0468280129134655, + "eval_rewards/margins": 0.0826805830001831, + "eval_rewards/rejected": -0.1295085847377777, + "eval_runtime": 712.0846, + "eval_samples_per_second": 2.809, + "eval_steps_per_second": 1.404, + "step": 5600 + }, + { + "epoch": 0.37, + "learning_rate": 3.990401742099408e-06, + "logits/chosen": -2.0736844539642334, + "logits/rejected": -2.093573570251465, + "logps/chosen": -187.79244995117188, + "logps/rejected": -190.85166931152344, + "loss": 0.0315, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.044646747410297394, + "rewards/margins": 0.05204001069068909, + "rewards/rejected": -0.09668676555156708, + "step": 5610 + }, + { + "epoch": 0.37, + "learning_rate": 3.985813570844072e-06, + "logits/chosen": -2.210111379623413, + "logits/rejected": -2.069014310836792, + "logps/chosen": -307.0541687011719, + "logps/rejected": -293.98638916015625, + "loss": 0.0468, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0633372887969017, + "rewards/margins": 0.07241859287023544, + "rewards/rejected": -0.13575588166713715, + "step": 5620 + }, + { + "epoch": 0.37, + "learning_rate": 3.981217648857316e-06, + "logits/chosen": -2.2960948944091797, + "logits/rejected": -2.1048855781555176, + "logps/chosen": -179.8003692626953, + "logps/rejected": -198.77951049804688, + "loss": 0.0142, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.036299627274274826, + "rewards/margins": 0.08598671853542328, + "rewards/rejected": -0.1222863644361496, + "step": 5630 + }, + { + "epoch": 0.37, + "learning_rate": 3.97661400011372e-06, + "logits/chosen": -2.14565110206604, + "logits/rejected": -2.1863856315612793, + "logps/chosen": -250.01345825195312, + "logps/rejected": -246.5854034423828, + "loss": 0.0405, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03493080660700798, + "rewards/margins": 0.039973922073841095, + "rewards/rejected": -0.07490471750497818, + "step": 5640 + }, + { + "epoch": 0.37, + "learning_rate": 3.972002648628174e-06, + "logits/chosen": -2.1732308864593506, + "logits/rejected": -1.8485368490219116, + "logps/chosen": -284.80145263671875, + "logps/rejected": -247.41360473632812, + "loss": 0.0194, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.038853127509355545, + "rewards/margins": 0.05152001231908798, + "rewards/rejected": -0.09037313610315323, + "step": 5650 + }, + { + "epoch": 0.37, + "learning_rate": 3.967383618455743e-06, + "logits/chosen": -2.3239212036132812, + "logits/rejected": -2.153209924697876, + "logps/chosen": -239.8135528564453, + "logps/rejected": -271.5573425292969, + "loss": 0.0481, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.06346921622753143, + "rewards/margins": 0.06776650995016098, + "rewards/rejected": -0.13123571872711182, + "step": 5660 + }, + { + "epoch": 0.37, + "learning_rate": 3.9627569336915515e-06, + "logits/chosen": -2.4477696418762207, + "logits/rejected": -2.1409733295440674, + "logps/chosen": -251.18179321289062, + "logps/rejected": -213.980712890625, + "loss": 0.0304, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.018595131114125252, + "rewards/margins": 0.09737774729728699, + "rewards/rejected": -0.1159728616476059, + "step": 5670 + }, + { + "epoch": 0.37, + "learning_rate": 3.9581226184706555e-06, + "logits/chosen": -2.294837713241577, + "logits/rejected": -2.39911150932312, + "logps/chosen": -196.95144653320312, + "logps/rejected": -279.7997131347656, + "loss": 0.0149, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.011430233716964722, + "rewards/margins": 0.06075301766395569, + "rewards/rejected": -0.07218325138092041, + "step": 5680 + }, + { + "epoch": 0.37, + "learning_rate": 3.953480696967912e-06, + "logits/chosen": -1.9522489309310913, + "logits/rejected": -2.1829464435577393, + "logps/chosen": -219.80093383789062, + "logps/rejected": -271.7539978027344, + "loss": 0.0209, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.05214661359786987, + "rewards/margins": 0.07753698527812958, + "rewards/rejected": -0.12968358397483826, + "step": 5690 + }, + { + "epoch": 0.37, + "learning_rate": 3.948831193397857e-06, + "logits/chosen": -2.1683051586151123, + "logits/rejected": -2.150635242462158, + "logps/chosen": -172.88192749023438, + "logps/rejected": -194.87937927246094, + "loss": 0.0237, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.034332599490880966, + "rewards/margins": 0.08085787296295166, + "rewards/rejected": -0.11519046872854233, + "step": 5700 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -2.2279889583587646, + "eval_logits/rejected": -2.0451738834381104, + "eval_logps/chosen": -238.47406005859375, + "eval_logps/rejected": -235.90609741210938, + "eval_loss": 0.02667395770549774, + "eval_rewards/accuracies": 0.6445000171661377, + "eval_rewards/chosen": -0.0323454923927784, + "eval_rewards/margins": 0.0891253724694252, + "eval_rewards/rejected": -0.1214708611369133, + "eval_runtime": 713.1838, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 1.402, + "step": 5700 + }, + { + "epoch": 0.37, + "learning_rate": 3.94417413201458e-06, + "logits/chosen": -2.1256191730499268, + "logits/rejected": -1.9788223505020142, + "logps/chosen": -214.668212890625, + "logps/rejected": -214.07028198242188, + "loss": 0.055, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.015483707189559937, + "rewards/margins": 0.08584414422512054, + "rewards/rejected": -0.10132785141468048, + "step": 5710 + }, + { + "epoch": 0.37, + "learning_rate": 3.9395095371115935e-06, + "logits/chosen": -2.283236265182495, + "logits/rejected": -2.00763201713562, + "logps/chosen": -215.9113311767578, + "logps/rejected": -224.1573028564453, + "loss": 0.0335, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.01627127267420292, + "rewards/margins": 0.0942857414484024, + "rewards/rejected": -0.11055700480937958, + "step": 5720 + }, + { + "epoch": 0.37, + "learning_rate": 3.93483743302171e-06, + "logits/chosen": -2.221068859100342, + "logits/rejected": -1.9899393320083618, + "logps/chosen": -215.34814453125, + "logps/rejected": -204.9816131591797, + "loss": 0.0291, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.012782298028469086, + "rewards/margins": 0.06684406101703644, + "rewards/rejected": -0.07962635904550552, + "step": 5730 + }, + { + "epoch": 0.38, + "learning_rate": 3.930157844116913e-06, + "logits/chosen": -2.0747952461242676, + "logits/rejected": -2.009860038757324, + "logps/chosen": -208.75759887695312, + "logps/rejected": -207.610595703125, + "loss": 0.024, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.004998432006686926, + "rewards/margins": 0.06785848736763, + "rewards/rejected": -0.06286005675792694, + "step": 5740 + }, + { + "epoch": 0.38, + "learning_rate": 3.925470794808229e-06, + "logits/chosen": -2.2149665355682373, + "logits/rejected": -1.8618929386138916, + "logps/chosen": -245.27243041992188, + "logps/rejected": -229.793701171875, + "loss": 0.0308, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03142361342906952, + "rewards/margins": 0.08669319748878479, + "rewards/rejected": -0.1181168183684349, + "step": 5750 + }, + { + "epoch": 0.38, + "learning_rate": 3.920776309545606e-06, + "logits/chosen": -2.2878258228302, + "logits/rejected": -2.1229512691497803, + "logps/chosen": -152.00323486328125, + "logps/rejected": -158.4375457763672, + "loss": 0.034, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.0035188309848308563, + "rewards/margins": 0.06633396446704865, + "rewards/rejected": -0.06281514465808868, + "step": 5760 + }, + { + "epoch": 0.38, + "learning_rate": 3.916074412817778e-06, + "logits/chosen": -2.1815593242645264, + "logits/rejected": -1.8238589763641357, + "logps/chosen": -244.6201629638672, + "logps/rejected": -267.8778381347656, + "loss": 0.0247, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.006430828478187323, + "rewards/margins": 0.11367674916982651, + "rewards/rejected": -0.12010756880044937, + "step": 5770 + }, + { + "epoch": 0.38, + "learning_rate": 3.911365129152139e-06, + "logits/chosen": -2.3041155338287354, + "logits/rejected": -2.15433669090271, + "logps/chosen": -232.7111358642578, + "logps/rejected": -244.7630615234375, + "loss": 0.0237, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.001386212301440537, + "rewards/margins": 0.0879589095711708, + "rewards/rejected": -0.08934511989355087, + "step": 5780 + }, + { + "epoch": 0.38, + "learning_rate": 3.906648483114623e-06, + "logits/chosen": -2.1890604496002197, + "logits/rejected": -2.0869388580322266, + "logps/chosen": -200.27606201171875, + "logps/rejected": -189.9747314453125, + "loss": 0.0298, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.014762152917683125, + "rewards/margins": 0.11141836643218994, + "rewards/rejected": -0.1261805295944214, + "step": 5790 + }, + { + "epoch": 0.38, + "learning_rate": 3.901924499309564e-06, + "logits/chosen": -2.1342368125915527, + "logits/rejected": -1.9452743530273438, + "logps/chosen": -234.81478881835938, + "logps/rejected": -219.6333770751953, + "loss": 0.0323, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03054950200021267, + "rewards/margins": 0.08712270855903625, + "rewards/rejected": -0.11767220497131348, + "step": 5800 + }, + { + "epoch": 0.38, + "eval_logits/chosen": -2.2194855213165283, + "eval_logits/rejected": -2.036980152130127, + "eval_logps/chosen": -240.2529754638672, + "eval_logps/rejected": -237.3893280029297, + "eval_loss": 0.026860052719712257, + "eval_rewards/accuracies": 0.6460000276565552, + "eval_rewards/chosen": -0.04124004766345024, + "eval_rewards/margins": 0.08764705806970596, + "eval_rewards/rejected": -0.1288871020078659, + "eval_runtime": 712.9972, + "eval_samples_per_second": 2.805, + "eval_steps_per_second": 1.403, + "step": 5800 + }, + { + "epoch": 0.38, + "learning_rate": 3.897193202379575e-06, + "logits/chosen": -2.221252202987671, + "logits/rejected": -2.055716037750244, + "logps/chosen": -207.3871307373047, + "logps/rejected": -208.7563018798828, + "loss": 0.0246, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03454715758562088, + "rewards/margins": 0.0917096808552742, + "rewards/rejected": -0.12625685334205627, + "step": 5810 + }, + { + "epoch": 0.38, + "learning_rate": 3.8924546170054215e-06, + "logits/chosen": -2.160278081893921, + "logits/rejected": -2.1069438457489014, + "logps/chosen": -222.0353546142578, + "logps/rejected": -218.66311645507812, + "loss": 0.0191, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03063853643834591, + "rewards/margins": 0.0805058628320694, + "rewards/rejected": -0.11114440113306046, + "step": 5820 + }, + { + "epoch": 0.38, + "learning_rate": 3.887708767905883e-06, + "logits/chosen": -2.4319489002227783, + "logits/rejected": -2.033052921295166, + "logps/chosen": -251.15188598632812, + "logps/rejected": -199.21560668945312, + "loss": 0.0255, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03692341595888138, + "rewards/margins": 0.06953327357769012, + "rewards/rejected": -0.1064566820859909, + "step": 5830 + }, + { + "epoch": 0.38, + "learning_rate": 3.882955679837636e-06, + "logits/chosen": -2.1963651180267334, + "logits/rejected": -2.087002992630005, + "logps/chosen": -248.7950439453125, + "logps/rejected": -263.8435974121094, + "loss": 0.0359, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04709015041589737, + "rewards/margins": 0.06336608529090881, + "rewards/rejected": -0.11045622825622559, + "step": 5840 + }, + { + "epoch": 0.38, + "learning_rate": 3.878195377595113e-06, + "logits/chosen": -2.285632610321045, + "logits/rejected": -2.1098172664642334, + "logps/chosen": -242.3900909423828, + "logps/rejected": -256.57550048828125, + "loss": 0.0336, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03242162987589836, + "rewards/margins": 0.09874467551708221, + "rewards/rejected": -0.13116630911827087, + "step": 5850 + }, + { + "epoch": 0.38, + "learning_rate": 3.873427886010384e-06, + "logits/chosen": -2.236074686050415, + "logits/rejected": -2.089146137237549, + "logps/chosen": -196.53201293945312, + "logps/rejected": -196.675048828125, + "loss": 0.0205, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0304003544151783, + "rewards/margins": 0.09616055339574814, + "rewards/rejected": -0.12656089663505554, + "step": 5860 + }, + { + "epoch": 0.38, + "learning_rate": 3.868653229953021e-06, + "logits/chosen": -2.3070778846740723, + "logits/rejected": -2.0845189094543457, + "logps/chosen": -242.5325469970703, + "logps/rejected": -262.4086608886719, + "loss": 0.0129, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.026588618755340576, + "rewards/margins": 0.10969982296228409, + "rewards/rejected": -0.13628843426704407, + "step": 5870 + }, + { + "epoch": 0.38, + "learning_rate": 3.8638714343299675e-06, + "logits/chosen": -2.209580183029175, + "logits/rejected": -2.1250576972961426, + "logps/chosen": -225.6443634033203, + "logps/rejected": -257.268798828125, + "loss": 0.013, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.03650829195976257, + "rewards/margins": 0.08254338055849075, + "rewards/rejected": -0.11905165761709213, + "step": 5880 + }, + { + "epoch": 0.39, + "learning_rate": 3.859082524085414e-06, + "logits/chosen": -2.2187929153442383, + "logits/rejected": -1.8690040111541748, + "logps/chosen": -280.08441162109375, + "logps/rejected": -234.6243133544922, + "loss": 0.0263, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03870617598295212, + "rewards/margins": 0.07389514148235321, + "rewards/rejected": -0.11260131746530533, + "step": 5890 + }, + { + "epoch": 0.39, + "learning_rate": 3.854286524200659e-06, + "logits/chosen": -2.360055923461914, + "logits/rejected": -2.096848249435425, + "logps/chosen": -282.25677490234375, + "logps/rejected": -241.24325561523438, + "loss": 0.0242, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.029196584597229958, + "rewards/margins": 0.04811464250087738, + "rewards/rejected": -0.07731121778488159, + "step": 5900 + }, + { + "epoch": 0.39, + "eval_logits/chosen": -2.2253873348236084, + "eval_logits/rejected": -2.042678117752075, + "eval_logps/chosen": -238.05584716796875, + "eval_logps/rejected": -233.9046630859375, + "eval_loss": 0.02604703977704048, + "eval_rewards/accuracies": 0.6455000042915344, + "eval_rewards/chosen": -0.03025444969534874, + "eval_rewards/margins": 0.08120942115783691, + "eval_rewards/rejected": -0.11146386712789536, + "eval_runtime": 712.5903, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.403, + "step": 5900 + }, + { + "epoch": 0.39, + "learning_rate": 3.849483459693991e-06, + "logits/chosen": -2.299449920654297, + "logits/rejected": -2.1660971641540527, + "logps/chosen": -215.0598602294922, + "logps/rejected": -200.59132385253906, + "loss": 0.0136, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.026915917173027992, + "rewards/margins": 0.12100283801555634, + "rewards/rejected": -0.14791876077651978, + "step": 5910 + }, + { + "epoch": 0.39, + "learning_rate": 3.844673355620544e-06, + "logits/chosen": -2.1884491443634033, + "logits/rejected": -2.05025315284729, + "logps/chosen": -252.08255004882812, + "logps/rejected": -233.2479248046875, + "loss": 0.0182, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.027548307552933693, + "rewards/margins": 0.09725430607795715, + "rewards/rejected": -0.1248026043176651, + "step": 5920 + }, + { + "epoch": 0.39, + "learning_rate": 3.839856237072178e-06, + "logits/chosen": -2.044630527496338, + "logits/rejected": -2.0448217391967773, + "logps/chosen": -190.75665283203125, + "logps/rejected": -232.75241088867188, + "loss": 0.0352, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.040912725031375885, + "rewards/margins": 0.12401840835809708, + "rewards/rejected": -0.16493113338947296, + "step": 5930 + }, + { + "epoch": 0.39, + "learning_rate": 3.8350321291773455e-06, + "logits/chosen": -1.9816339015960693, + "logits/rejected": -1.9150645732879639, + "logps/chosen": -202.12307739257812, + "logps/rejected": -175.07394409179688, + "loss": 0.0247, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0019943893421441317, + "rewards/margins": 0.08573590219020844, + "rewards/rejected": -0.08374151587486267, + "step": 5940 + }, + { + "epoch": 0.39, + "learning_rate": 3.830201057100953e-06, + "logits/chosen": -2.2745325565338135, + "logits/rejected": -2.305018424987793, + "logps/chosen": -193.10279846191406, + "logps/rejected": -228.12277221679688, + "loss": 0.0178, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.006648970302194357, + "rewards/margins": 0.0976361334323883, + "rewards/rejected": -0.1042850986123085, + "step": 5950 + }, + { + "epoch": 0.39, + "learning_rate": 3.82536304604424e-06, + "logits/chosen": -2.071976900100708, + "logits/rejected": -2.0189526081085205, + "logps/chosen": -233.9423065185547, + "logps/rejected": -217.19754028320312, + "loss": 0.0426, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.011109702289104462, + "rewards/margins": 0.07495688647031784, + "rewards/rejected": -0.06384718418121338, + "step": 5960 + }, + { + "epoch": 0.39, + "learning_rate": 3.8205181212446435e-06, + "logits/chosen": -2.460482358932495, + "logits/rejected": -2.2313225269317627, + "logps/chosen": -266.06683349609375, + "logps/rejected": -237.21945190429688, + "loss": 0.0216, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.02246047556400299, + "rewards/margins": 0.05778072029352188, + "rewards/rejected": -0.03532024100422859, + "step": 5970 + }, + { + "epoch": 0.39, + "learning_rate": 3.815666307975664e-06, + "logits/chosen": -2.259298801422119, + "logits/rejected": -2.1296067237854004, + "logps/chosen": -235.4974822998047, + "logps/rejected": -227.8106231689453, + "loss": 0.0199, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.009101573377847672, + "rewards/margins": 0.05747541785240173, + "rewards/rejected": -0.04837384819984436, + "step": 5980 + }, + { + "epoch": 0.39, + "learning_rate": 3.8108076315467346e-06, + "logits/chosen": -2.3414454460144043, + "logits/rejected": -2.2081501483917236, + "logps/chosen": -259.45941162109375, + "logps/rejected": -199.7333221435547, + "loss": 0.0243, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.010883106850087643, + "rewards/margins": 0.07500231266021729, + "rewards/rejected": -0.0858854204416275, + "step": 5990 + }, + { + "epoch": 0.39, + "learning_rate": 3.805942117303093e-06, + "logits/chosen": -2.5289244651794434, + "logits/rejected": -2.1916663646698, + "logps/chosen": -315.30548095703125, + "logps/rejected": -283.5316467285156, + "loss": 0.0239, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0036502063740044832, + "rewards/margins": 0.07374846190214157, + "rewards/rejected": -0.07009825855493546, + "step": 6000 + }, + { + "epoch": 0.39, + "eval_logits/chosen": -2.2698471546173096, + "eval_logits/rejected": -2.0839905738830566, + "eval_logps/chosen": -233.28067016601562, + "eval_logps/rejected": -227.20497131347656, + "eval_loss": 0.026537323370575905, + "eval_rewards/accuracies": 0.6395000219345093, + "eval_rewards/chosen": -0.006378578953444958, + "eval_rewards/margins": 0.07158681005239487, + "eval_rewards/rejected": -0.0779653862118721, + "eval_runtime": 713.6836, + "eval_samples_per_second": 2.802, + "eval_steps_per_second": 1.401, + "step": 6000 + }, + { + "epoch": 0.39, + "learning_rate": 3.8010697906256446e-06, + "logits/chosen": -2.1027355194091797, + "logits/rejected": -2.0898165702819824, + "logps/chosen": -216.5233917236328, + "logps/rejected": -203.8676300048828, + "loss": 0.0466, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.049134716391563416, + "rewards/margins": 0.07587228715419769, + "rewards/rejected": -0.1250070035457611, + "step": 6010 + }, + { + "epoch": 0.39, + "learning_rate": 3.7961906769308323e-06, + "logits/chosen": -2.1587841510772705, + "logits/rejected": -1.971003770828247, + "logps/chosen": -212.7795867919922, + "logps/rejected": -236.19271850585938, + "loss": 0.0211, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03272194415330887, + "rewards/margins": 0.06669095903635025, + "rewards/rejected": -0.09941292554140091, + "step": 6020 + }, + { + "epoch": 0.39, + "learning_rate": 3.7913048016705028e-06, + "logits/chosen": -2.2142512798309326, + "logits/rejected": -2.1702721118927, + "logps/chosen": -262.4501647949219, + "logps/rejected": -271.28912353515625, + "loss": 0.0117, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.012552952393889427, + "rewards/margins": 0.0544348768889904, + "rewards/rejected": -0.06698782742023468, + "step": 6030 + }, + { + "epoch": 0.4, + "learning_rate": 3.786412190331775e-06, + "logits/chosen": -2.408029556274414, + "logits/rejected": -2.0841169357299805, + "logps/chosen": -205.4874267578125, + "logps/rejected": -183.05186462402344, + "loss": 0.0236, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.013134591281414032, + "rewards/margins": 0.07331900298595428, + "rewards/rejected": -0.08645360171794891, + "step": 6040 + }, + { + "epoch": 0.4, + "learning_rate": 3.781512868436906e-06, + "logits/chosen": -2.389183521270752, + "logits/rejected": -2.2467615604400635, + "logps/chosen": -133.06936645507812, + "logps/rejected": -151.9151611328125, + "loss": 0.0152, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0022931471467018127, + "rewards/margins": 0.05528401583433151, + "rewards/rejected": -0.057577162981033325, + "step": 6050 + }, + { + "epoch": 0.4, + "learning_rate": 3.7766068615431605e-06, + "logits/chosen": -2.2205705642700195, + "logits/rejected": -2.1680896282196045, + "logps/chosen": -256.7353820800781, + "logps/rejected": -226.5559539794922, + "loss": 0.0427, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0017584555316716433, + "rewards/margins": 0.09227786213159561, + "rewards/rejected": -0.09051939845085144, + "step": 6060 + }, + { + "epoch": 0.4, + "learning_rate": 3.771694195242671e-06, + "logits/chosen": -2.439492702484131, + "logits/rejected": -2.000755786895752, + "logps/chosen": -306.957763671875, + "logps/rejected": -208.7428436279297, + "loss": 0.0417, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03889502212405205, + "rewards/margins": 0.06758001446723938, + "rewards/rejected": -0.10647504031658173, + "step": 6070 + }, + { + "epoch": 0.4, + "learning_rate": 3.766774895162314e-06, + "logits/chosen": -2.2616937160491943, + "logits/rejected": -2.159468412399292, + "logps/chosen": -256.6993713378906, + "logps/rejected": -209.25021362304688, + "loss": 0.0183, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.050798166543245316, + "rewards/margins": 0.04343273863196373, + "rewards/rejected": -0.09423090517520905, + "step": 6080 + }, + { + "epoch": 0.4, + "learning_rate": 3.7618489869635666e-06, + "logits/chosen": -2.184147357940674, + "logits/rejected": -2.1318554878234863, + "logps/chosen": -261.07501220703125, + "logps/rejected": -250.76461791992188, + "loss": 0.0379, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.07024045288562775, + "rewards/margins": 0.047162558883428574, + "rewards/rejected": -0.11740299314260483, + "step": 6090 + }, + { + "epoch": 0.4, + "learning_rate": 3.756916496342379e-06, + "logits/chosen": -2.2132625579833984, + "logits/rejected": -2.2690443992614746, + "logps/chosen": -196.7451629638672, + "logps/rejected": -219.06893920898438, + "loss": 0.0246, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0493663027882576, + "rewards/margins": 0.06935496628284454, + "rewards/rejected": -0.11872126907110214, + "step": 6100 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -2.2834253311157227, + "eval_logits/rejected": -2.0963640213012695, + "eval_logps/chosen": -241.33116149902344, + "eval_logps/rejected": -235.506591796875, + "eval_loss": 0.02662130817770958, + "eval_rewards/accuracies": 0.6474999785423279, + "eval_rewards/chosen": -0.046630993485450745, + "eval_rewards/margins": 0.07284247130155563, + "eval_rewards/rejected": -0.11947345733642578, + "eval_runtime": 713.7437, + "eval_samples_per_second": 2.802, + "eval_steps_per_second": 1.401, + "step": 6100 + }, + { + "epoch": 0.4, + "learning_rate": 3.751977449029039e-06, + "logits/chosen": -1.9438211917877197, + "logits/rejected": -1.9466642141342163, + "logps/chosen": -272.6602783203125, + "logps/rejected": -251.07455444335938, + "loss": 0.0397, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06291915476322174, + "rewards/margins": 0.07904963940382004, + "rewards/rejected": -0.14196878671646118, + "step": 6110 + }, + { + "epoch": 0.4, + "learning_rate": 3.747031870788037e-06, + "logits/chosen": -2.4532384872436523, + "logits/rejected": -2.0979743003845215, + "logps/chosen": -320.97210693359375, + "logps/rejected": -253.79415893554688, + "loss": 0.0284, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.021094132214784622, + "rewards/margins": 0.0762069970369339, + "rewards/rejected": -0.09730114042758942, + "step": 6120 + }, + { + "epoch": 0.4, + "learning_rate": 3.7420797874179326e-06, + "logits/chosen": -2.219827175140381, + "logits/rejected": -1.9641387462615967, + "logps/chosen": -245.064453125, + "logps/rejected": -202.33700561523438, + "loss": 0.0222, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.039774827659130096, + "rewards/margins": 0.08080819994211197, + "rewards/rejected": -0.12058302015066147, + "step": 6130 + }, + { + "epoch": 0.4, + "learning_rate": 3.7371212247512167e-06, + "logits/chosen": -2.567937135696411, + "logits/rejected": -2.2279791831970215, + "logps/chosen": -326.991455078125, + "logps/rejected": -280.57086181640625, + "loss": 0.027, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0029847125988453627, + "rewards/margins": 0.09343814849853516, + "rewards/rejected": -0.09642285853624344, + "step": 6140 + }, + { + "epoch": 0.4, + "learning_rate": 3.7321562086541817e-06, + "logits/chosen": -2.321716070175171, + "logits/rejected": -2.215344190597534, + "logps/chosen": -258.36090087890625, + "logps/rejected": -273.3643798828125, + "loss": 0.0194, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.026903927326202393, + "rewards/margins": 0.07106615602970123, + "rewards/rejected": -0.09797009080648422, + "step": 6150 + }, + { + "epoch": 0.4, + "learning_rate": 3.7271847650267834e-06, + "logits/chosen": -2.1101181507110596, + "logits/rejected": -2.0075926780700684, + "logps/chosen": -210.41098022460938, + "logps/rejected": -219.27383422851562, + "loss": 0.0448, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.05238168314099312, + "rewards/margins": 0.048765551298856735, + "rewards/rejected": -0.10114721953868866, + "step": 6160 + }, + { + "epoch": 0.4, + "learning_rate": 3.7222069198025086e-06, + "logits/chosen": -2.085695743560791, + "logits/rejected": -1.8926283121109009, + "logps/chosen": -220.03958129882812, + "logps/rejected": -218.895263671875, + "loss": 0.0201, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.05748562142252922, + "rewards/margins": 0.09723970293998718, + "rewards/rejected": -0.1547253280878067, + "step": 6170 + }, + { + "epoch": 0.4, + "learning_rate": 3.7172226989482353e-06, + "logits/chosen": -2.1056084632873535, + "logits/rejected": -1.8976036310195923, + "logps/chosen": -221.66238403320312, + "logps/rejected": -232.70401000976562, + "loss": 0.0279, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05844768136739731, + "rewards/margins": 0.06593415886163712, + "rewards/rejected": -0.12438184022903442, + "step": 6180 + }, + { + "epoch": 0.4, + "learning_rate": 3.7122321284641007e-06, + "logits/chosen": -2.3730039596557617, + "logits/rejected": -1.9491113424301147, + "logps/chosen": -376.61395263671875, + "logps/rejected": -285.7323913574219, + "loss": 0.0208, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06913654506206512, + "rewards/margins": 0.09804163873195648, + "rewards/rejected": -0.1671781986951828, + "step": 6190 + }, + { + "epoch": 0.41, + "learning_rate": 3.707235234383365e-06, + "logits/chosen": -2.2920448780059814, + "logits/rejected": -1.9482746124267578, + "logps/chosen": -265.9564514160156, + "logps/rejected": -194.7156524658203, + "loss": 0.0109, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.023512501269578934, + "rewards/margins": 0.07769031822681427, + "rewards/rejected": -0.1012028232216835, + "step": 6200 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -2.244328260421753, + "eval_logits/rejected": -2.05887770652771, + "eval_logps/chosen": -239.60328674316406, + "eval_logps/rejected": -234.93077087402344, + "eval_loss": 0.025902314111590385, + "eval_rewards/accuracies": 0.6420000195503235, + "eval_rewards/chosen": -0.03799163177609444, + "eval_rewards/margins": 0.0786026194691658, + "eval_rewards/rejected": -0.11659426242113113, + "eval_runtime": 714.619, + "eval_samples_per_second": 2.799, + "eval_steps_per_second": 1.399, + "step": 6200 + }, + { + "epoch": 0.41, + "learning_rate": 3.702232042772277e-06, + "logits/chosen": -2.0883724689483643, + "logits/rejected": -2.028675079345703, + "logps/chosen": -214.29074096679688, + "logps/rejected": -213.12255859375, + "loss": 0.0247, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.058722980320453644, + "rewards/margins": 0.10593204200267792, + "rewards/rejected": -0.16465502977371216, + "step": 6210 + }, + { + "epoch": 0.41, + "learning_rate": 3.6972225797299325e-06, + "logits/chosen": -2.2090511322021484, + "logits/rejected": -2.240442991256714, + "logps/chosen": -264.33868408203125, + "logps/rejected": -264.9607849121094, + "loss": 0.0349, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06915035843849182, + "rewards/margins": 0.08278807252645493, + "rewards/rejected": -0.15193842351436615, + "step": 6220 + }, + { + "epoch": 0.41, + "learning_rate": 3.692206871388147e-06, + "logits/chosen": -2.308846950531006, + "logits/rejected": -1.8605448007583618, + "logps/chosen": -243.57913208007812, + "logps/rejected": -225.1941680908203, + "loss": 0.0174, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05595209077000618, + "rewards/margins": 0.11422081291675568, + "rewards/rejected": -0.17017289996147156, + "step": 6230 + }, + { + "epoch": 0.41, + "learning_rate": 3.6871849439113115e-06, + "logits/chosen": -1.9982774257659912, + "logits/rejected": -1.9305871725082397, + "logps/chosen": -241.31790161132812, + "logps/rejected": -243.810302734375, + "loss": 0.0352, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.05703529715538025, + "rewards/margins": 0.06020113080739975, + "rewards/rejected": -0.1172364354133606, + "step": 6240 + }, + { + "epoch": 0.41, + "learning_rate": 3.682156823496259e-06, + "logits/chosen": -2.2610268592834473, + "logits/rejected": -1.9768097400665283, + "logps/chosen": -214.890380859375, + "logps/rejected": -204.16453552246094, + "loss": 0.0189, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.004882182460278273, + "rewards/margins": 0.10819858312606812, + "rewards/rejected": -0.11308076232671738, + "step": 6250 + }, + { + "epoch": 0.41, + "learning_rate": 3.67712253637213e-06, + "logits/chosen": -2.2968246936798096, + "logits/rejected": -2.0636565685272217, + "logps/chosen": -300.6327209472656, + "logps/rejected": -225.7855224609375, + "loss": 0.0191, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0416911318898201, + "rewards/margins": 0.07532685250043869, + "rewards/rejected": -0.11701799929141998, + "step": 6260 + }, + { + "epoch": 0.41, + "learning_rate": 3.672082108800231e-06, + "logits/chosen": -2.1016416549682617, + "logits/rejected": -1.9039815664291382, + "logps/chosen": -231.9393768310547, + "logps/rejected": -213.286376953125, + "loss": 0.037, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09061791747808456, + "rewards/margins": 0.07693218439817429, + "rewards/rejected": -0.16755011677742004, + "step": 6270 + }, + { + "epoch": 0.41, + "learning_rate": 3.6670355670739012e-06, + "logits/chosen": -2.216489791870117, + "logits/rejected": -2.027721881866455, + "logps/chosen": -168.08026123046875, + "logps/rejected": -186.23406982421875, + "loss": 0.0119, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.030634235590696335, + "rewards/margins": 0.10396245867013931, + "rewards/rejected": -0.13459669053554535, + "step": 6280 + }, + { + "epoch": 0.41, + "learning_rate": 3.6619829375183745e-06, + "logits/chosen": -2.369748115539551, + "logits/rejected": -2.175687789916992, + "logps/chosen": -232.9136505126953, + "logps/rejected": -240.1250457763672, + "loss": 0.0362, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03455563634634018, + "rewards/margins": 0.11641144752502441, + "rewards/rejected": -0.1509670913219452, + "step": 6290 + }, + { + "epoch": 0.41, + "learning_rate": 3.6569242464906427e-06, + "logits/chosen": -2.2498772144317627, + "logits/rejected": -2.073350429534912, + "logps/chosen": -209.7557373046875, + "logps/rejected": -250.68405151367188, + "loss": 0.0289, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.010133450850844383, + "rewards/margins": 0.09939597547054291, + "rewards/rejected": -0.10952942073345184, + "step": 6300 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -2.2404847145080566, + "eval_logits/rejected": -2.0557026863098145, + "eval_logps/chosen": -237.73385620117188, + "eval_logps/rejected": -233.16732788085938, + "eval_loss": 0.025808749720454216, + "eval_rewards/accuracies": 0.6524999737739563, + "eval_rewards/chosen": -0.028644531965255737, + "eval_rewards/margins": 0.07913253456354141, + "eval_rewards/rejected": -0.10777706652879715, + "eval_runtime": 713.9294, + "eval_samples_per_second": 2.801, + "eval_steps_per_second": 1.401, + "step": 6300 + }, + { + "epoch": 0.41, + "learning_rate": 3.6518595203793156e-06, + "logits/chosen": -2.092099905014038, + "logits/rejected": -2.105501651763916, + "logps/chosen": -260.3985900878906, + "logps/rejected": -294.7735290527344, + "loss": 0.0148, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.008421550504863262, + "rewards/margins": 0.10450099408626556, + "rewards/rejected": -0.1129225492477417, + "step": 6310 + }, + { + "epoch": 0.41, + "learning_rate": 3.646788785604485e-06, + "logits/chosen": -2.234051465988159, + "logits/rejected": -2.093923330307007, + "logps/chosen": -207.9916229248047, + "logps/rejected": -217.1757354736328, + "loss": 0.0149, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.011666452512145042, + "rewards/margins": 0.057145290076732635, + "rewards/rejected": -0.06881174445152283, + "step": 6320 + }, + { + "epoch": 0.41, + "learning_rate": 3.641712068617588e-06, + "logits/chosen": -2.2444276809692383, + "logits/rejected": -2.1274402141571045, + "logps/chosen": -265.61273193359375, + "logps/rejected": -224.0839385986328, + "loss": 0.0239, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.021263372153043747, + "rewards/margins": 0.06007848307490349, + "rewards/rejected": -0.08134184777736664, + "step": 6330 + }, + { + "epoch": 0.41, + "learning_rate": 3.6366293959012673e-06, + "logits/chosen": -2.182690143585205, + "logits/rejected": -1.9510301351547241, + "logps/chosen": -184.07785034179688, + "logps/rejected": -183.8928985595703, + "loss": 0.0297, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0143792275339365, + "rewards/margins": 0.09669135510921478, + "rewards/rejected": -0.11107059568166733, + "step": 6340 + }, + { + "epoch": 0.42, + "learning_rate": 3.631540793969233e-06, + "logits/chosen": -2.3409228324890137, + "logits/rejected": -2.3132266998291016, + "logps/chosen": -191.70883178710938, + "logps/rejected": -205.50570678710938, + "loss": 0.0242, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.015455419197678566, + "rewards/margins": 0.06059306114912033, + "rewards/rejected": -0.07604847848415375, + "step": 6350 + }, + { + "epoch": 0.42, + "learning_rate": 3.626446289366127e-06, + "logits/chosen": -2.5035297870635986, + "logits/rejected": -2.0659027099609375, + "logps/chosen": -228.3686981201172, + "logps/rejected": -163.69943237304688, + "loss": 0.0265, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04109755903482437, + "rewards/margins": 0.034247688949108124, + "rewards/rejected": -0.0753452405333519, + "step": 6360 + }, + { + "epoch": 0.42, + "learning_rate": 3.6213459086673786e-06, + "logits/chosen": -2.24674654006958, + "logits/rejected": -2.307798147201538, + "logps/chosen": -177.19583129882812, + "logps/rejected": -206.07693481445312, + "loss": 0.0266, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03129131719470024, + "rewards/margins": 0.09753891080617905, + "rewards/rejected": -0.128830224275589, + "step": 6370 + }, + { + "epoch": 0.42, + "learning_rate": 3.6162396784790737e-06, + "logits/chosen": -2.1218373775482178, + "logits/rejected": -2.14566707611084, + "logps/chosen": -232.34432983398438, + "logps/rejected": -245.09262084960938, + "loss": 0.0282, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04668625444173813, + "rewards/margins": 0.06453945487737656, + "rewards/rejected": -0.11122570931911469, + "step": 6380 + }, + { + "epoch": 0.42, + "learning_rate": 3.6111276254378095e-06, + "logits/chosen": -2.2318406105041504, + "logits/rejected": -2.222268581390381, + "logps/chosen": -229.4927520751953, + "logps/rejected": -246.83206176757812, + "loss": 0.0172, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.007679253816604614, + "rewards/margins": 0.10876430571079254, + "rewards/rejected": -0.11644355952739716, + "step": 6390 + }, + { + "epoch": 0.42, + "learning_rate": 3.606009776210559e-06, + "logits/chosen": -2.221876621246338, + "logits/rejected": -1.990378975868225, + "logps/chosen": -252.7962646484375, + "logps/rejected": -225.5522918701172, + "loss": 0.0287, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.060862988233566284, + "rewards/margins": 0.08207409828901291, + "rewards/rejected": -0.1429370939731598, + "step": 6400 + }, + { + "epoch": 0.42, + "eval_logits/chosen": -2.252486228942871, + "eval_logits/rejected": -2.066436290740967, + "eval_logps/chosen": -237.19186401367188, + "eval_logps/rejected": -234.72076416015625, + "eval_loss": 0.0266865361481905, + "eval_rewards/accuracies": 0.6430000066757202, + "eval_rewards/chosen": -0.025934524834156036, + "eval_rewards/margins": 0.0896097868680954, + "eval_rewards/rejected": -0.11554431915283203, + "eval_runtime": 716.7463, + "eval_samples_per_second": 2.79, + "eval_steps_per_second": 1.395, + "step": 6400 + }, + { + "epoch": 0.42, + "learning_rate": 3.600886157494531e-06, + "logits/chosen": -2.3591103553771973, + "logits/rejected": -2.2368922233581543, + "logps/chosen": -268.14385986328125, + "logps/rejected": -274.5497741699219, + "loss": 0.02, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02643665112555027, + "rewards/margins": 0.09094007313251495, + "rewards/rejected": -0.11737672984600067, + "step": 6410 + }, + { + "epoch": 0.42, + "learning_rate": 3.5957567960170304e-06, + "logits/chosen": -2.455496311187744, + "logits/rejected": -1.7506864070892334, + "logps/chosen": -294.2264709472656, + "logps/rejected": -194.97206115722656, + "loss": 0.0274, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.015206987969577312, + "rewards/margins": 0.09701627492904663, + "rewards/rejected": -0.11222325265407562, + "step": 6420 + }, + { + "epoch": 0.42, + "learning_rate": 3.590621718535319e-06, + "logits/chosen": -2.0646634101867676, + "logits/rejected": -1.8767610788345337, + "logps/chosen": -206.53396606445312, + "logps/rejected": -228.14224243164062, + "loss": 0.028, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.049057986587285995, + "rewards/margins": 0.1125042662024498, + "rewards/rejected": -0.1615622490644455, + "step": 6430 + }, + { + "epoch": 0.42, + "learning_rate": 3.5854809518364775e-06, + "logits/chosen": -2.335057020187378, + "logits/rejected": -2.030397415161133, + "logps/chosen": -244.3311767578125, + "logps/rejected": -221.5835418701172, + "loss": 0.0283, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.009078029543161392, + "rewards/margins": 0.10882551968097687, + "rewards/rejected": -0.09974746406078339, + "step": 6440 + }, + { + "epoch": 0.42, + "learning_rate": 3.580334522737262e-06, + "logits/chosen": -2.247253894805908, + "logits/rejected": -1.964611291885376, + "logps/chosen": -199.90805053710938, + "logps/rejected": -181.4126739501953, + "loss": 0.0169, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.010181794874370098, + "rewards/margins": 0.07577107846736908, + "rewards/rejected": -0.06558927893638611, + "step": 6450 + }, + { + "epoch": 0.42, + "learning_rate": 3.575182458083968e-06, + "logits/chosen": -2.191323757171631, + "logits/rejected": -2.1385130882263184, + "logps/chosen": -243.88778686523438, + "logps/rejected": -236.3755340576172, + "loss": 0.0201, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.015413627028465271, + "rewards/margins": 0.10246391594409943, + "rewards/rejected": -0.1178775280714035, + "step": 6460 + }, + { + "epoch": 0.42, + "learning_rate": 3.5700247847522883e-06, + "logits/chosen": -2.296088218688965, + "logits/rejected": -2.210977554321289, + "logps/chosen": -201.45184326171875, + "logps/rejected": -220.83975219726562, + "loss": 0.0375, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.006730721797794104, + "rewards/margins": 0.09703059494495392, + "rewards/rejected": -0.09029986709356308, + "step": 6470 + }, + { + "epoch": 0.42, + "learning_rate": 3.5648615296471743e-06, + "logits/chosen": -2.0994935035705566, + "logits/rejected": -2.067354440689087, + "logps/chosen": -200.0526123046875, + "logps/rejected": -260.09405517578125, + "loss": 0.0226, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.013449016027152538, + "rewards/margins": 0.10983811318874359, + "rewards/rejected": -0.1232871264219284, + "step": 6480 + }, + { + "epoch": 0.42, + "learning_rate": 3.559692719702693e-06, + "logits/chosen": -2.1190197467803955, + "logits/rejected": -1.8080532550811768, + "logps/chosen": -292.5730285644531, + "logps/rejected": -252.4334716796875, + "loss": 0.0388, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.032139308750629425, + "rewards/margins": 0.08294972777366638, + "rewards/rejected": -0.1150890588760376, + "step": 6490 + }, + { + "epoch": 0.43, + "learning_rate": 3.55451838188189e-06, + "logits/chosen": -2.243839979171753, + "logits/rejected": -2.140303134918213, + "logps/chosen": -263.24530029296875, + "logps/rejected": -299.83380126953125, + "loss": 0.0631, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.00979236327111721, + "rewards/margins": 0.07327703386545181, + "rewards/rejected": -0.08306938409805298, + "step": 6500 + }, + { + "epoch": 0.43, + "eval_logits/chosen": -2.275908946990967, + "eval_logits/rejected": -2.089535713195801, + "eval_logps/chosen": -238.27188110351562, + "eval_logps/rejected": -233.43910217285156, + "eval_loss": 0.025913719087839127, + "eval_rewards/accuracies": 0.6460000276565552, + "eval_rewards/chosen": -0.03133460506796837, + "eval_rewards/margins": 0.0778014212846756, + "eval_rewards/rejected": -0.10913601517677307, + "eval_runtime": 714.2995, + "eval_samples_per_second": 2.8, + "eval_steps_per_second": 1.4, + "step": 6500 + }, + { + "epoch": 0.43, + "learning_rate": 3.549338543176645e-06, + "logits/chosen": -2.296164035797119, + "logits/rejected": -2.033365249633789, + "logps/chosen": -311.88177490234375, + "logps/rejected": -292.5616149902344, + "loss": 0.03, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.026966029778122902, + "rewards/margins": 0.06516522169113159, + "rewards/rejected": -0.09213124215602875, + "step": 6510 + }, + { + "epoch": 0.43, + "learning_rate": 3.5441532306075342e-06, + "logits/chosen": -2.252589464187622, + "logits/rejected": -2.237151622772217, + "logps/chosen": -240.11624145507812, + "logps/rejected": -296.4466857910156, + "loss": 0.0173, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04348507523536682, + "rewards/margins": 0.04882603511214256, + "rewards/rejected": -0.09231110662221909, + "step": 6520 + }, + { + "epoch": 0.43, + "learning_rate": 3.5389624712236894e-06, + "logits/chosen": -2.253516674041748, + "logits/rejected": -2.0285708904266357, + "logps/chosen": -217.43795776367188, + "logps/rejected": -196.87884521484375, + "loss": 0.0244, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.017010662704706192, + "rewards/margins": 0.03184361383318901, + "rewards/rejected": -0.0488542802631855, + "step": 6530 + }, + { + "epoch": 0.43, + "learning_rate": 3.533766292102653e-06, + "logits/chosen": -2.2152111530303955, + "logits/rejected": -2.077603816986084, + "logps/chosen": -216.4482421875, + "logps/rejected": -216.7845916748047, + "loss": 0.0563, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.033018387854099274, + "rewards/margins": 0.06012535095214844, + "rewards/rejected": -0.09314373880624771, + "step": 6540 + }, + { + "epoch": 0.43, + "learning_rate": 3.5285647203502404e-06, + "logits/chosen": -2.4234461784362793, + "logits/rejected": -2.296536922454834, + "logps/chosen": -257.1291809082031, + "logps/rejected": -229.7969512939453, + "loss": 0.0223, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0007092708838172257, + "rewards/margins": 0.052169036120176315, + "rewards/rejected": -0.051459766924381256, + "step": 6550 + }, + { + "epoch": 0.43, + "learning_rate": 3.5233577831003983e-06, + "logits/chosen": -2.2396888732910156, + "logits/rejected": -2.0937328338623047, + "logps/chosen": -254.46728515625, + "logps/rejected": -243.6793670654297, + "loss": 0.0214, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.011456744745373726, + "rewards/margins": 0.06440354883670807, + "rewards/rejected": -0.07586028426885605, + "step": 6560 + }, + { + "epoch": 0.43, + "learning_rate": 3.5181455075150628e-06, + "logits/chosen": -2.1576619148254395, + "logits/rejected": -1.7788465023040771, + "logps/chosen": -184.81137084960938, + "logps/rejected": -151.5428009033203, + "loss": 0.0257, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.014326018281280994, + "rewards/margins": 0.06677161902189255, + "rewards/rejected": -0.08109764009714127, + "step": 6570 + }, + { + "epoch": 0.43, + "learning_rate": 3.512927920784016e-06, + "logits/chosen": -2.3142216205596924, + "logits/rejected": -2.175314426422119, + "logps/chosen": -226.6049346923828, + "logps/rejected": -230.943359375, + "loss": 0.0201, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004654319025576115, + "rewards/margins": 0.12159235775470734, + "rewards/rejected": -0.12624667584896088, + "step": 6580 + }, + { + "epoch": 0.43, + "learning_rate": 3.5077050501247457e-06, + "logits/chosen": -2.39911150932312, + "logits/rejected": -1.9505914449691772, + "logps/chosen": -276.9880676269531, + "logps/rejected": -229.661865234375, + "loss": 0.0215, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.020134396851062775, + "rewards/margins": 0.10031189024448395, + "rewards/rejected": -0.08017749339342117, + "step": 6590 + }, + { + "epoch": 0.43, + "learning_rate": 3.5024769227823042e-06, + "logits/chosen": -2.296597957611084, + "logits/rejected": -2.161756992340088, + "logps/chosen": -169.4569854736328, + "logps/rejected": -144.53469848632812, + "loss": 0.037, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.032928384840488434, + "rewards/margins": 0.07937689125537872, + "rewards/rejected": -0.11230529844760895, + "step": 6600 + }, + { + "epoch": 0.43, + "eval_logits/chosen": -2.2867612838745117, + "eval_logits/rejected": -2.0996828079223633, + "eval_logps/chosen": -233.88197326660156, + "eval_logps/rejected": -229.03372192382812, + "eval_loss": 0.02600272372364998, + "eval_rewards/accuracies": 0.6489999890327454, + "eval_rewards/chosen": -0.009385163895785809, + "eval_rewards/margins": 0.0777239203453064, + "eval_rewards/rejected": -0.08710909634828568, + "eval_runtime": 714.3371, + "eval_samples_per_second": 2.8, + "eval_steps_per_second": 1.4, + "step": 6600 + }, + { + "epoch": 0.43, + "learning_rate": 3.4972435660291646e-06, + "logits/chosen": -2.376971483230591, + "logits/rejected": -2.2304623126983643, + "logps/chosen": -247.54150390625, + "logps/rejected": -238.1759796142578, + "loss": 0.0183, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.021986085921525955, + "rewards/margins": 0.07787088304758072, + "rewards/rejected": -0.09985697269439697, + "step": 6610 + }, + { + "epoch": 0.43, + "learning_rate": 3.492005007165079e-06, + "logits/chosen": -2.251359701156616, + "logits/rejected": -2.004368305206299, + "logps/chosen": -225.4510498046875, + "logps/rejected": -245.10220336914062, + "loss": 0.0322, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.02042979560792446, + "rewards/margins": 0.06747279316186905, + "rewards/rejected": -0.04704299941658974, + "step": 6620 + }, + { + "epoch": 0.43, + "learning_rate": 3.4867612735169377e-06, + "logits/chosen": -2.396332263946533, + "logits/rejected": -2.0499672889709473, + "logps/chosen": -221.58731079101562, + "logps/rejected": -163.45327758789062, + "loss": 0.0251, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.017235388979315758, + "rewards/margins": 0.1004711240530014, + "rewards/rejected": -0.0832357257604599, + "step": 6630 + }, + { + "epoch": 0.43, + "learning_rate": 3.4815123924386226e-06, + "logits/chosen": -2.5589592456817627, + "logits/rejected": -2.251009225845337, + "logps/chosen": -304.1456604003906, + "logps/rejected": -247.13693237304688, + "loss": 0.0176, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.011978634633123875, + "rewards/margins": 0.06331901252269745, + "rewards/rejected": -0.05134038254618645, + "step": 6640 + }, + { + "epoch": 0.44, + "learning_rate": 3.4762583913108696e-06, + "logits/chosen": -2.1234230995178223, + "logits/rejected": -1.8880468606948853, + "logps/chosen": -269.94964599609375, + "logps/rejected": -243.2335205078125, + "loss": 0.0241, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0009564816718921065, + "rewards/margins": 0.07099533826112747, + "rewards/rejected": -0.07195182889699936, + "step": 6650 + }, + { + "epoch": 0.44, + "learning_rate": 3.4709992975411217e-06, + "logits/chosen": -2.250683069229126, + "logits/rejected": -1.7695224285125732, + "logps/chosen": -257.90576171875, + "logps/rejected": -223.98779296875, + "loss": 0.0305, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.014731831848621368, + "rewards/margins": 0.09174492210149765, + "rewards/rejected": -0.10647676140069962, + "step": 6660 + }, + { + "epoch": 0.44, + "learning_rate": 3.4657351385633886e-06, + "logits/chosen": -2.366260528564453, + "logits/rejected": -2.0163345336914062, + "logps/chosen": -187.8857879638672, + "logps/rejected": -198.3480224609375, + "loss": 0.0281, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.012975359335541725, + "rewards/margins": 0.113312266767025, + "rewards/rejected": -0.12628760933876038, + "step": 6670 + }, + { + "epoch": 0.44, + "learning_rate": 3.4604659418381024e-06, + "logits/chosen": -2.276533603668213, + "logits/rejected": -1.8742822408676147, + "logps/chosen": -230.89651489257812, + "logps/rejected": -220.8121337890625, + "loss": 0.0349, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06421110033988953, + "rewards/margins": 0.09723123162984848, + "rewards/rejected": -0.1614423394203186, + "step": 6680 + }, + { + "epoch": 0.44, + "learning_rate": 3.4551917348519744e-06, + "logits/chosen": -2.3929922580718994, + "logits/rejected": -2.1257662773132324, + "logps/chosen": -286.9535827636719, + "logps/rejected": -255.3826904296875, + "loss": 0.0357, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02727435901761055, + "rewards/margins": 0.07814844697713852, + "rewards/rejected": -0.10542280972003937, + "step": 6690 + }, + { + "epoch": 0.44, + "learning_rate": 3.4499125451178505e-06, + "logits/chosen": -1.9648650884628296, + "logits/rejected": -2.052551746368408, + "logps/chosen": -216.1411590576172, + "logps/rejected": -250.99026489257812, + "loss": 0.0296, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05924935266375542, + "rewards/margins": 0.05118199437856674, + "rewards/rejected": -0.11043135076761246, + "step": 6700 + }, + { + "epoch": 0.44, + "eval_logits/chosen": -2.2902636528015137, + "eval_logits/rejected": -2.1026463508605957, + "eval_logps/chosen": -240.9244384765625, + "eval_logps/rejected": -237.36306762695312, + "eval_loss": 0.026407985016703606, + "eval_rewards/accuracies": 0.656499981880188, + "eval_rewards/chosen": -0.04459746181964874, + "eval_rewards/margins": 0.08415839076042175, + "eval_rewards/rejected": -0.1287558525800705, + "eval_runtime": 713.723, + "eval_samples_per_second": 2.802, + "eval_steps_per_second": 1.401, + "step": 6700 + }, + { + "epoch": 0.44, + "learning_rate": 3.4446284001745723e-06, + "logits/chosen": -2.1049177646636963, + "logits/rejected": -1.856149673461914, + "logps/chosen": -226.0932159423828, + "logps/rejected": -256.2013854980469, + "loss": 0.0309, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.09929800033569336, + "rewards/margins": 0.06680585443973541, + "rewards/rejected": -0.16610386967658997, + "step": 6710 + }, + { + "epoch": 0.44, + "learning_rate": 3.439339327586827e-06, + "logits/chosen": -2.2584190368652344, + "logits/rejected": -2.199280261993408, + "logps/chosen": -194.5971221923828, + "logps/rejected": -206.33316040039062, + "loss": 0.0202, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.022896986454725266, + "rewards/margins": 0.08549851924180984, + "rewards/rejected": -0.10839549452066422, + "step": 6720 + }, + { + "epoch": 0.44, + "learning_rate": 3.434045354945008e-06, + "logits/chosen": -2.346409559249878, + "logits/rejected": -2.0874228477478027, + "logps/chosen": -289.5050964355469, + "logps/rejected": -296.0002746582031, + "loss": 0.0305, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07740263640880585, + "rewards/margins": 0.05615498498082161, + "rewards/rejected": -0.13355764746665955, + "step": 6730 + }, + { + "epoch": 0.44, + "learning_rate": 3.4287465098650713e-06, + "logits/chosen": -2.3473329544067383, + "logits/rejected": -2.3483903408050537, + "logps/chosen": -259.7020568847656, + "logps/rejected": -255.46054077148438, + "loss": 0.0245, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05863260477781296, + "rewards/margins": 0.0530003197491169, + "rewards/rejected": -0.11163292080163956, + "step": 6740 + }, + { + "epoch": 0.44, + "learning_rate": 3.423442819988387e-06, + "logits/chosen": -2.1913259029388428, + "logits/rejected": -2.08129620552063, + "logps/chosen": -202.03309631347656, + "logps/rejected": -206.5372772216797, + "loss": 0.0554, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0932292714715004, + "rewards/margins": 0.05846773460507393, + "rewards/rejected": -0.15169700980186462, + "step": 6750 + }, + { + "epoch": 0.44, + "learning_rate": 3.4181343129816e-06, + "logits/chosen": -2.378754138946533, + "logits/rejected": -2.0726230144500732, + "logps/chosen": -184.03543090820312, + "logps/rejected": -183.15615844726562, + "loss": 0.0331, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03836870938539505, + "rewards/margins": 0.0670543760061264, + "rewards/rejected": -0.10542309284210205, + "step": 6760 + }, + { + "epoch": 0.44, + "learning_rate": 3.4128210165364837e-06, + "logits/chosen": -2.111802339553833, + "logits/rejected": -2.1186046600341797, + "logps/chosen": -197.4541778564453, + "logps/rejected": -229.36154174804688, + "loss": 0.0212, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.030224764719605446, + "rewards/margins": 0.11691973358392715, + "rewards/rejected": -0.14714448153972626, + "step": 6770 + }, + { + "epoch": 0.44, + "learning_rate": 3.407502958369795e-06, + "logits/chosen": -2.22578763961792, + "logits/rejected": -2.080477476119995, + "logps/chosen": -253.41683959960938, + "logps/rejected": -243.12594604492188, + "loss": 0.0367, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.05484537035226822, + "rewards/margins": 0.09460707008838654, + "rewards/rejected": -0.14945244789123535, + "step": 6780 + }, + { + "epoch": 0.44, + "learning_rate": 3.4021801662231297e-06, + "logits/chosen": -2.2658119201660156, + "logits/rejected": -2.1124956607818604, + "logps/chosen": -263.18060302734375, + "logps/rejected": -253.9446258544922, + "loss": 0.0304, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07151900231838226, + "rewards/margins": 0.06519722938537598, + "rewards/rejected": -0.13671624660491943, + "step": 6790 + }, + { + "epoch": 0.44, + "learning_rate": 3.3968526678627793e-06, + "logits/chosen": -2.2123255729675293, + "logits/rejected": -1.8768161535263062, + "logps/chosen": -276.95269775390625, + "logps/rejected": -247.5306396484375, + "loss": 0.038, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.059611111879348755, + "rewards/margins": 0.07340748608112335, + "rewards/rejected": -0.1330185830593109, + "step": 6800 + }, + { + "epoch": 0.44, + "eval_logits/chosen": -2.273937940597534, + "eval_logits/rejected": -2.087099075317383, + "eval_logps/chosen": -245.886474609375, + "eval_logps/rejected": -241.46580505371094, + "eval_loss": 0.02618832141160965, + "eval_rewards/accuracies": 0.656499981880188, + "eval_rewards/chosen": -0.06940756738185883, + "eval_rewards/margins": 0.07986201345920563, + "eval_rewards/rejected": -0.14926959574222565, + "eval_runtime": 712.4632, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.404, + "step": 6800 + }, + { + "epoch": 0.45, + "learning_rate": 3.391520491079586e-06, + "logits/chosen": -2.3683648109436035, + "logits/rejected": -2.3810901641845703, + "logps/chosen": -207.74728393554688, + "logps/rejected": -188.58126831054688, + "loss": 0.0582, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05892329663038254, + "rewards/margins": 0.05681822821497917, + "rewards/rejected": -0.1157415360212326, + "step": 6810 + }, + { + "epoch": 0.45, + "learning_rate": 3.3861836636887936e-06, + "logits/chosen": -2.2756943702697754, + "logits/rejected": -2.0937790870666504, + "logps/chosen": -277.7412109375, + "logps/rejected": -248.2487335205078, + "loss": 0.0142, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.054611772298812866, + "rewards/margins": 0.08493000268936157, + "rewards/rejected": -0.13954177498817444, + "step": 6820 + }, + { + "epoch": 0.45, + "learning_rate": 3.3808422135299106e-06, + "logits/chosen": -2.2582032680511475, + "logits/rejected": -2.343784809112549, + "logps/chosen": -297.9847106933594, + "logps/rejected": -366.10028076171875, + "loss": 0.0126, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.061534445732831955, + "rewards/margins": 0.07520034909248352, + "rewards/rejected": -0.13673481345176697, + "step": 6830 + }, + { + "epoch": 0.45, + "learning_rate": 3.375496168466556e-06, + "logits/chosen": -2.4298951625823975, + "logits/rejected": -1.9825141429901123, + "logps/chosen": -222.685546875, + "logps/rejected": -177.19808959960938, + "loss": 0.0184, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03873712942004204, + "rewards/margins": 0.05219585821032524, + "rewards/rejected": -0.09093298763036728, + "step": 6840 + }, + { + "epoch": 0.45, + "learning_rate": 3.3701455563863205e-06, + "logits/chosen": -2.488891363143921, + "logits/rejected": -2.0427050590515137, + "logps/chosen": -303.811279296875, + "logps/rejected": -287.64788818359375, + "loss": 0.0249, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05968303605914116, + "rewards/margins": 0.10077802836894989, + "rewards/rejected": -0.16046105325222015, + "step": 6850 + }, + { + "epoch": 0.45, + "learning_rate": 3.3647904052006174e-06, + "logits/chosen": -2.2914767265319824, + "logits/rejected": -2.227921962738037, + "logps/chosen": -273.80303955078125, + "logps/rejected": -294.72979736328125, + "loss": 0.0212, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04252370074391365, + "rewards/margins": 0.08960084617137909, + "rewards/rejected": -0.13212454319000244, + "step": 6860 + }, + { + "epoch": 0.45, + "learning_rate": 3.3594307428445383e-06, + "logits/chosen": -2.4990713596343994, + "logits/rejected": -2.1006529331207275, + "logps/chosen": -338.07830810546875, + "logps/rejected": -319.7018737792969, + "loss": 0.0127, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.024137722328305244, + "rewards/margins": 0.06520069390535355, + "rewards/rejected": -0.08933840692043304, + "step": 6870 + }, + { + "epoch": 0.45, + "learning_rate": 3.354066597276707e-06, + "logits/chosen": -2.1523618698120117, + "logits/rejected": -2.1465184688568115, + "logps/chosen": -231.46969604492188, + "logps/rejected": -280.14129638671875, + "loss": 0.0284, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.03952707722783089, + "rewards/margins": 0.06655522435903549, + "rewards/rejected": -0.10608228296041489, + "step": 6880 + }, + { + "epoch": 0.45, + "learning_rate": 3.348697996479136e-06, + "logits/chosen": -2.3686251640319824, + "logits/rejected": -2.1159777641296387, + "logps/chosen": -237.6735382080078, + "logps/rejected": -196.40487670898438, + "loss": 0.0188, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04932292550802231, + "rewards/margins": 0.06101522967219353, + "rewards/rejected": -0.11033815145492554, + "step": 6890 + }, + { + "epoch": 0.45, + "learning_rate": 3.3433249684570757e-06, + "logits/chosen": -2.1756784915924072, + "logits/rejected": -2.0409185886383057, + "logps/chosen": -185.26063537597656, + "logps/rejected": -158.40994262695312, + "loss": 0.0458, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03715535253286362, + "rewards/margins": 0.09433852136135101, + "rewards/rejected": -0.13149388134479523, + "step": 6900 + }, + { + "epoch": 0.45, + "eval_logits/chosen": -2.279813051223755, + "eval_logits/rejected": -2.0924649238586426, + "eval_logps/chosen": -239.05287170410156, + "eval_logps/rejected": -234.09742736816406, + "eval_loss": 0.02606966905295849, + "eval_rewards/accuracies": 0.6524999737739563, + "eval_rewards/chosen": -0.03523955121636391, + "eval_rewards/margins": 0.07718797028064728, + "eval_rewards/rejected": -0.11242751032114029, + "eval_runtime": 712.5848, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.403, + "step": 6900 + }, + { + "epoch": 0.45, + "learning_rate": 3.3379475412388724e-06, + "logits/chosen": -2.364109516143799, + "logits/rejected": -2.200424909591675, + "logps/chosen": -247.0403594970703, + "logps/rejected": -235.1184844970703, + "loss": 0.0379, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.025426704436540604, + "rewards/margins": 0.10888361930847168, + "rewards/rejected": -0.13431032001972198, + "step": 6910 + }, + { + "epoch": 0.45, + "learning_rate": 3.3325657428758207e-06, + "logits/chosen": -2.161489248275757, + "logits/rejected": -2.1392319202423096, + "logps/chosen": -249.1950225830078, + "logps/rejected": -270.0361328125, + "loss": 0.0307, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02759755589067936, + "rewards/margins": 0.1000722274184227, + "rewards/rejected": -0.1276697814464569, + "step": 6920 + }, + { + "epoch": 0.45, + "learning_rate": 3.3271796014420175e-06, + "logits/chosen": -2.262923002243042, + "logits/rejected": -2.2007548809051514, + "logps/chosen": -223.37106323242188, + "logps/rejected": -228.798095703125, + "loss": 0.0319, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05460399389266968, + "rewards/margins": 0.11567654460668564, + "rewards/rejected": -0.1702805608510971, + "step": 6930 + }, + { + "epoch": 0.45, + "learning_rate": 3.3217891450342142e-06, + "logits/chosen": -2.259061574935913, + "logits/rejected": -1.9373016357421875, + "logps/chosen": -268.7401123046875, + "logps/rejected": -220.4185791015625, + "loss": 0.0102, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.05070844292640686, + "rewards/margins": 0.10335638374090195, + "rewards/rejected": -0.1540648192167282, + "step": 6940 + }, + { + "epoch": 0.45, + "learning_rate": 3.3163944017716733e-06, + "logits/chosen": -2.4251351356506348, + "logits/rejected": -2.1257576942443848, + "logps/chosen": -224.58676147460938, + "logps/rejected": -206.57284545898438, + "loss": 0.0135, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.041079506278038025, + "rewards/margins": 0.07541914284229279, + "rewards/rejected": -0.11649864912033081, + "step": 6950 + }, + { + "epoch": 0.46, + "learning_rate": 3.310995399796017e-06, + "logits/chosen": -2.371244192123413, + "logits/rejected": -2.2540652751922607, + "logps/chosen": -283.013427734375, + "logps/rejected": -285.94757080078125, + "loss": 0.0357, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.045136041939258575, + "rewards/margins": 0.04868122190237045, + "rewards/rejected": -0.09381726384162903, + "step": 6960 + }, + { + "epoch": 0.46, + "learning_rate": 3.305592167271085e-06, + "logits/chosen": -2.257551908493042, + "logits/rejected": -2.159736394882202, + "logps/chosen": -202.6905059814453, + "logps/rejected": -210.4559783935547, + "loss": 0.0285, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.027221733704209328, + "rewards/margins": 0.09338773787021637, + "rewards/rejected": -0.12060944736003876, + "step": 6970 + }, + { + "epoch": 0.46, + "learning_rate": 3.3001847323827846e-06, + "logits/chosen": -2.2555832862854004, + "logits/rejected": -2.2027993202209473, + "logps/chosen": -274.7593688964844, + "logps/rejected": -290.1148986816406, + "loss": 0.0213, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.033308275043964386, + "rewards/margins": 0.09387167543172836, + "rewards/rejected": -0.12717995047569275, + "step": 6980 + }, + { + "epoch": 0.46, + "learning_rate": 3.2947731233389447e-06, + "logits/chosen": -2.337197780609131, + "logits/rejected": -1.9657859802246094, + "logps/chosen": -255.54342651367188, + "logps/rejected": -223.072998046875, + "loss": 0.0134, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04018372297286987, + "rewards/margins": 0.11478477716445923, + "rewards/rejected": -0.1549684852361679, + "step": 6990 + }, + { + "epoch": 0.46, + "learning_rate": 3.2893573683691706e-06, + "logits/chosen": -2.159594774246216, + "logits/rejected": -2.1685497760772705, + "logps/chosen": -213.18618774414062, + "logps/rejected": -216.9435272216797, + "loss": 0.0275, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.042463529855012894, + "rewards/margins": 0.10826573520898819, + "rewards/rejected": -0.1507292538881302, + "step": 7000 + }, + { + "epoch": 0.46, + "eval_logits/chosen": -2.2773642539978027, + "eval_logits/rejected": -2.089714765548706, + "eval_logps/chosen": -242.4080810546875, + "eval_logps/rejected": -239.64163208007812, + "eval_loss": 0.025703566148877144, + "eval_rewards/accuracies": 0.6535000205039978, + "eval_rewards/chosen": -0.0520155094563961, + "eval_rewards/margins": 0.08813316375017166, + "eval_rewards/rejected": -0.14014868438243866, + "eval_runtime": 714.8357, + "eval_samples_per_second": 2.798, + "eval_steps_per_second": 1.399, + "step": 7000 + }, + { + "epoch": 0.46, + "learning_rate": 3.2839374957246915e-06, + "logits/chosen": -2.3641977310180664, + "logits/rejected": -2.0795207023620605, + "logps/chosen": -279.41522216796875, + "logps/rejected": -200.2919921875, + "loss": 0.0218, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06662468612194061, + "rewards/margins": 0.062074802815914154, + "rewards/rejected": -0.12869949638843536, + "step": 7010 + }, + { + "epoch": 0.46, + "learning_rate": 3.2785135336782187e-06, + "logits/chosen": -2.214670419692993, + "logits/rejected": -2.041901111602783, + "logps/chosen": -247.4052276611328, + "logps/rejected": -288.4595642089844, + "loss": 0.0143, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06744905561208725, + "rewards/margins": 0.09370996057987213, + "rewards/rejected": -0.16115902364253998, + "step": 7020 + }, + { + "epoch": 0.46, + "learning_rate": 3.2730855105237952e-06, + "logits/chosen": -2.3737800121307373, + "logits/rejected": -2.180330276489258, + "logps/chosen": -229.07577514648438, + "logps/rejected": -290.7456970214844, + "loss": 0.0314, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04399664327502251, + "rewards/margins": 0.08744674921035767, + "rewards/rejected": -0.13144339621067047, + "step": 7030 + }, + { + "epoch": 0.46, + "learning_rate": 3.2676534545766486e-06, + "logits/chosen": -2.1809287071228027, + "logits/rejected": -2.151038885116577, + "logps/chosen": -220.2820281982422, + "logps/rejected": -220.81185913085938, + "loss": 0.0272, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.039062611758708954, + "rewards/margins": 0.05211324617266655, + "rewards/rejected": -0.09117583930492401, + "step": 7040 + }, + { + "epoch": 0.46, + "learning_rate": 3.262217394173043e-06, + "logits/chosen": -2.285374164581299, + "logits/rejected": -2.0053107738494873, + "logps/chosen": -252.326416015625, + "logps/rejected": -265.61517333984375, + "loss": 0.0327, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.05737876147031784, + "rewards/margins": 0.09685908257961273, + "rewards/rejected": -0.15423783659934998, + "step": 7050 + }, + { + "epoch": 0.46, + "learning_rate": 3.2567773576701333e-06, + "logits/chosen": -2.114854574203491, + "logits/rejected": -1.9429212808609009, + "logps/chosen": -265.2227783203125, + "logps/rejected": -261.1485595703125, + "loss": 0.0297, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.047326505184173584, + "rewards/margins": 0.13120698928833008, + "rewards/rejected": -0.17853349447250366, + "step": 7060 + }, + { + "epoch": 0.46, + "learning_rate": 3.2513333734458154e-06, + "logits/chosen": -2.3451738357543945, + "logits/rejected": -2.249483108520508, + "logps/chosen": -216.91574096679688, + "logps/rejected": -211.2926483154297, + "loss": 0.024, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.050372231751680374, + "rewards/margins": 0.05492279678583145, + "rewards/rejected": -0.10529503971338272, + "step": 7070 + }, + { + "epoch": 0.46, + "learning_rate": 3.245885469898576e-06, + "logits/chosen": -2.2400033473968506, + "logits/rejected": -2.0241847038269043, + "logps/chosen": -310.1903381347656, + "logps/rejected": -264.5427551269531, + "loss": 0.0214, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04817565903067589, + "rewards/margins": 0.09020276367664337, + "rewards/rejected": -0.13837842643260956, + "step": 7080 + }, + { + "epoch": 0.46, + "learning_rate": 3.2404336754473497e-06, + "logits/chosen": -2.2381837368011475, + "logits/rejected": -1.9788227081298828, + "logps/chosen": -273.0093688964844, + "logps/rejected": -220.49594116210938, + "loss": 0.0179, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02751203440129757, + "rewards/margins": 0.06128401681780815, + "rewards/rejected": -0.08879604935646057, + "step": 7090 + }, + { + "epoch": 0.46, + "learning_rate": 3.234978018531367e-06, + "logits/chosen": -2.5674209594726562, + "logits/rejected": -2.139941692352295, + "logps/chosen": -263.71978759765625, + "logps/rejected": -214.44461059570312, + "loss": 0.0175, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.02609780803322792, + "rewards/margins": 0.077622190117836, + "rewards/rejected": -0.10372000932693481, + "step": 7100 + }, + { + "epoch": 0.46, + "eval_logits/chosen": -2.293259859085083, + "eval_logits/rejected": -2.1057543754577637, + "eval_logps/chosen": -239.9512939453125, + "eval_logps/rejected": -235.46559143066406, + "eval_loss": 0.025518544018268585, + "eval_rewards/accuracies": 0.652999997138977, + "eval_rewards/chosen": -0.03973172605037689, + "eval_rewards/margins": 0.07953677326440811, + "eval_rewards/rejected": -0.1192684918642044, + "eval_runtime": 712.5716, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.403, + "step": 7100 + }, + { + "epoch": 0.47, + "learning_rate": 3.229518527610006e-06, + "logits/chosen": -2.388373851776123, + "logits/rejected": -2.092414140701294, + "logps/chosen": -299.6824951171875, + "logps/rejected": -266.2288513183594, + "loss": 0.0162, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.034462135285139084, + "rewards/margins": 0.05861176922917366, + "rewards/rejected": -0.09307390451431274, + "step": 7110 + }, + { + "epoch": 0.47, + "learning_rate": 3.2240552311626465e-06, + "logits/chosen": -2.3738510608673096, + "logits/rejected": -2.137483596801758, + "logps/chosen": -248.15280151367188, + "logps/rejected": -244.7117462158203, + "loss": 0.0197, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0346953347325325, + "rewards/margins": 0.057519711554050446, + "rewards/rejected": -0.09221504628658295, + "step": 7120 + }, + { + "epoch": 0.47, + "learning_rate": 3.2185881576885193e-06, + "logits/chosen": -2.3900671005249023, + "logits/rejected": -2.0392982959747314, + "logps/chosen": -225.50747680664062, + "logps/rejected": -200.21090698242188, + "loss": 0.0298, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06892313063144684, + "rewards/margins": 0.06685464829206467, + "rewards/rejected": -0.1357778012752533, + "step": 7130 + }, + { + "epoch": 0.47, + "learning_rate": 3.213117335706557e-06, + "logits/chosen": -2.262974262237549, + "logits/rejected": -2.3756120204925537, + "logps/chosen": -271.0682373046875, + "logps/rejected": -291.7215270996094, + "loss": 0.025, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0751163512468338, + "rewards/margins": 0.05723525211215019, + "rewards/rejected": -0.1323516070842743, + "step": 7140 + }, + { + "epoch": 0.47, + "learning_rate": 3.2076427937552473e-06, + "logits/chosen": -2.3007476329803467, + "logits/rejected": -2.078320264816284, + "logps/chosen": -252.1639862060547, + "logps/rejected": -255.96145629882812, + "loss": 0.0297, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0333712212741375, + "rewards/margins": 0.1146056205034256, + "rewards/rejected": -0.1479768455028534, + "step": 7150 + }, + { + "epoch": 0.47, + "learning_rate": 3.2021645603924827e-06, + "logits/chosen": -2.1458470821380615, + "logits/rejected": -2.0497066974639893, + "logps/chosen": -148.47946166992188, + "logps/rejected": -179.91090393066406, + "loss": 0.0244, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.05012665316462517, + "rewards/margins": 0.11033505201339722, + "rewards/rejected": -0.16046170890331268, + "step": 7160 + }, + { + "epoch": 0.47, + "learning_rate": 3.196682664195412e-06, + "logits/chosen": -2.2713592052459717, + "logits/rejected": -1.9928386211395264, + "logps/chosen": -217.07821655273438, + "logps/rejected": -192.48062133789062, + "loss": 0.0353, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06515000015497208, + "rewards/margins": 0.05146735906600952, + "rewards/rejected": -0.1166173666715622, + "step": 7170 + }, + { + "epoch": 0.47, + "learning_rate": 3.191197133760291e-06, + "logits/chosen": -2.5346858501434326, + "logits/rejected": -2.2119202613830566, + "logps/chosen": -271.4148254394531, + "logps/rejected": -222.3765411376953, + "loss": 0.028, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.020056938752532005, + "rewards/margins": 0.12605223059654236, + "rewards/rejected": -0.14610914885997772, + "step": 7180 + }, + { + "epoch": 0.47, + "learning_rate": 3.185707997702334e-06, + "logits/chosen": -2.19646954536438, + "logits/rejected": -2.0350489616394043, + "logps/chosen": -255.05160522460938, + "logps/rejected": -236.4979248046875, + "loss": 0.0147, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0714336410164833, + "rewards/margins": 0.09648963809013367, + "rewards/rejected": -0.16792325675487518, + "step": 7190 + }, + { + "epoch": 0.47, + "learning_rate": 3.1802152846555624e-06, + "logits/chosen": -2.2263684272766113, + "logits/rejected": -2.203920841217041, + "logps/chosen": -234.8302764892578, + "logps/rejected": -235.4951171875, + "loss": 0.035, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05293622612953186, + "rewards/margins": 0.08092103898525238, + "rewards/rejected": -0.13385728001594543, + "step": 7200 + }, + { + "epoch": 0.47, + "eval_logits/chosen": -2.308262825012207, + "eval_logits/rejected": -2.1193060874938965, + "eval_logps/chosen": -242.8714599609375, + "eval_logps/rejected": -236.956787109375, + "eval_loss": 0.026033619418740273, + "eval_rewards/accuracies": 0.6485000252723694, + "eval_rewards/chosen": -0.054332468658685684, + "eval_rewards/margins": 0.07239188253879547, + "eval_rewards/rejected": -0.12672434747219086, + "eval_runtime": 712.7385, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 7200 + }, + { + "epoch": 0.47, + "learning_rate": 3.174719023272659e-06, + "logits/chosen": -2.383152484893799, + "logits/rejected": -2.4308807849884033, + "logps/chosen": -224.28189086914062, + "logps/rejected": -286.50152587890625, + "loss": 0.0194, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05293875187635422, + "rewards/margins": 0.07935214787721634, + "rewards/rejected": -0.13229091465473175, + "step": 7210 + }, + { + "epoch": 0.47, + "learning_rate": 3.169219242224816e-06, + "logits/chosen": -2.3175277709960938, + "logits/rejected": -2.1515915393829346, + "logps/chosen": -254.6971893310547, + "logps/rejected": -261.6206970214844, + "loss": 0.0118, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.07384978979825974, + "rewards/margins": 0.06509410589933395, + "rewards/rejected": -0.1389438956975937, + "step": 7220 + }, + { + "epoch": 0.47, + "learning_rate": 3.1637159702015837e-06, + "logits/chosen": -2.320132255554199, + "logits/rejected": -2.0065832138061523, + "logps/chosen": -216.2084503173828, + "logps/rejected": -218.912841796875, + "loss": 0.021, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0515814833343029, + "rewards/margins": 0.110136017203331, + "rewards/rejected": -0.1617175042629242, + "step": 7230 + }, + { + "epoch": 0.47, + "learning_rate": 3.1582092359107263e-06, + "logits/chosen": -2.375622272491455, + "logits/rejected": -2.1548819541931152, + "logps/chosen": -289.31683349609375, + "logps/rejected": -261.6122741699219, + "loss": 0.0222, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.055195726454257965, + "rewards/margins": 0.09063062071800232, + "rewards/rejected": -0.14582636952400208, + "step": 7240 + }, + { + "epoch": 0.47, + "learning_rate": 3.152699068078067e-06, + "logits/chosen": -2.1964268684387207, + "logits/rejected": -1.9905602931976318, + "logps/chosen": -301.0905456542969, + "logps/rejected": -305.6982421875, + "loss": 0.0211, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1040010005235672, + "rewards/margins": 0.12574461102485657, + "rewards/rejected": -0.22974559664726257, + "step": 7250 + }, + { + "epoch": 0.48, + "learning_rate": 3.1471854954473415e-06, + "logits/chosen": -2.345653772354126, + "logits/rejected": -2.389192581176758, + "logps/chosen": -258.79254150390625, + "logps/rejected": -274.45782470703125, + "loss": 0.0179, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0401294082403183, + "rewards/margins": 0.09716589748859406, + "rewards/rejected": -0.13729530572891235, + "step": 7260 + }, + { + "epoch": 0.48, + "learning_rate": 3.1416685467800436e-06, + "logits/chosen": -2.1382288932800293, + "logits/rejected": -2.1094374656677246, + "logps/chosen": -202.70059204101562, + "logps/rejected": -200.3344268798828, + "loss": 0.0216, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08558902144432068, + "rewards/margins": 0.10081753879785538, + "rewards/rejected": -0.18640658259391785, + "step": 7270 + }, + { + "epoch": 0.48, + "learning_rate": 3.1361482508552803e-06, + "logits/chosen": -2.3437018394470215, + "logits/rejected": -1.8407390117645264, + "logps/chosen": -257.7562561035156, + "logps/rejected": -227.2680206298828, + "loss": 0.0341, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07902660965919495, + "rewards/margins": 0.06449031084775925, + "rewards/rejected": -0.1435169279575348, + "step": 7280 + }, + { + "epoch": 0.48, + "learning_rate": 3.1306246364696198e-06, + "logits/chosen": -2.467689037322998, + "logits/rejected": -2.2647864818573, + "logps/chosen": -265.7434997558594, + "logps/rejected": -264.20330810546875, + "loss": 0.0182, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06634589284658432, + "rewards/margins": 0.0798247903585434, + "rewards/rejected": -0.1461706906557083, + "step": 7290 + }, + { + "epoch": 0.48, + "learning_rate": 3.1250977324369413e-06, + "logits/chosen": -2.253009080886841, + "logits/rejected": -2.173417091369629, + "logps/chosen": -171.82815551757812, + "logps/rejected": -197.09963989257812, + "loss": 0.015, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06437839567661285, + "rewards/margins": 0.09279756993055344, + "rewards/rejected": -0.15717598795890808, + "step": 7300 + }, + { + "epoch": 0.48, + "eval_logits/chosen": -2.3008673191070557, + "eval_logits/rejected": -2.1122865676879883, + "eval_logps/chosen": -249.43240356445312, + "eval_logps/rejected": -244.0609130859375, + "eval_loss": 0.025661982595920563, + "eval_rewards/accuracies": 0.6389999985694885, + "eval_rewards/chosen": -0.08713724464178085, + "eval_rewards/margins": 0.07510782033205032, + "eval_rewards/rejected": -0.16224505007266998, + "eval_runtime": 713.91, + "eval_samples_per_second": 2.801, + "eval_steps_per_second": 1.401, + "step": 7300 + }, + { + "epoch": 0.48, + "learning_rate": 3.1195675675882825e-06, + "logits/chosen": -2.2031381130218506, + "logits/rejected": -2.096050977706909, + "logps/chosen": -258.1648864746094, + "logps/rejected": -240.55856323242188, + "loss": 0.0283, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.11486943811178207, + "rewards/margins": 0.06499240547418594, + "rewards/rejected": -0.17986184358596802, + "step": 7310 + }, + { + "epoch": 0.48, + "learning_rate": 3.1140341707716926e-06, + "logits/chosen": -2.168344259262085, + "logits/rejected": -1.9166107177734375, + "logps/chosen": -215.80386352539062, + "logps/rejected": -195.4596710205078, + "loss": 0.039, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0999809131026268, + "rewards/margins": 0.11335988342761993, + "rewards/rejected": -0.21334078907966614, + "step": 7320 + }, + { + "epoch": 0.48, + "learning_rate": 3.1084975708520803e-06, + "logits/chosen": -2.4190192222595215, + "logits/rejected": -2.0219929218292236, + "logps/chosen": -277.07574462890625, + "logps/rejected": -224.830322265625, + "loss": 0.0176, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07675959169864655, + "rewards/margins": 0.09250538051128387, + "rewards/rejected": -0.1692649871110916, + "step": 7330 + }, + { + "epoch": 0.48, + "learning_rate": 3.1029577967110625e-06, + "logits/chosen": -2.418910026550293, + "logits/rejected": -2.204145669937134, + "logps/chosen": -233.3201446533203, + "logps/rejected": -192.99319458007812, + "loss": 0.0374, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.08462905138731003, + "rewards/margins": 0.043656978756189346, + "rewards/rejected": -0.12828603386878967, + "step": 7340 + }, + { + "epoch": 0.48, + "learning_rate": 3.097414877246814e-06, + "logits/chosen": -2.2419230937957764, + "logits/rejected": -1.994065523147583, + "logps/chosen": -215.1243133544922, + "logps/rejected": -206.0422821044922, + "loss": 0.0339, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08240430057048798, + "rewards/margins": 0.10215411335229874, + "rewards/rejected": -0.18455840647220612, + "step": 7350 + }, + { + "epoch": 0.48, + "learning_rate": 3.0918688413739197e-06, + "logits/chosen": -2.328660488128662, + "logits/rejected": -2.0175442695617676, + "logps/chosen": -243.36770629882812, + "logps/rejected": -201.67184448242188, + "loss": 0.02, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.053423862904310226, + "rewards/margins": 0.11288408935070038, + "rewards/rejected": -0.1663079559803009, + "step": 7360 + }, + { + "epoch": 0.48, + "learning_rate": 3.0863197180232178e-06, + "logits/chosen": -2.3689608573913574, + "logits/rejected": -1.9991772174835205, + "logps/chosen": -213.11666870117188, + "logps/rejected": -216.60641479492188, + "loss": 0.0205, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06686246395111084, + "rewards/margins": 0.07855210453271866, + "rewards/rejected": -0.1454145610332489, + "step": 7370 + }, + { + "epoch": 0.48, + "learning_rate": 3.0807675361416554e-06, + "logits/chosen": -2.268491744995117, + "logits/rejected": -2.0238747596740723, + "logps/chosen": -196.4187774658203, + "logps/rejected": -134.76646423339844, + "loss": 0.0356, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.037692587822675705, + "rewards/margins": 0.08691040426492691, + "rewards/rejected": -0.12460298836231232, + "step": 7380 + }, + { + "epoch": 0.48, + "learning_rate": 3.0752123246921327e-06, + "logits/chosen": -2.3893237113952637, + "logits/rejected": -2.114490509033203, + "logps/chosen": -290.68157958984375, + "logps/rejected": -240.748779296875, + "loss": 0.0188, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06384162604808807, + "rewards/margins": 0.08499327301979065, + "rewards/rejected": -0.14883491396903992, + "step": 7390 + }, + { + "epoch": 0.48, + "learning_rate": 3.069654112653353e-06, + "logits/chosen": -2.406578540802002, + "logits/rejected": -2.199592351913452, + "logps/chosen": -231.58706665039062, + "logps/rejected": -207.14590454101562, + "loss": 0.0231, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07551421225070953, + "rewards/margins": 0.04055469110608101, + "rewards/rejected": -0.11606889963150024, + "step": 7400 + }, + { + "epoch": 0.48, + "eval_logits/chosen": -2.2913472652435303, + "eval_logits/rejected": -2.103548049926758, + "eval_logps/chosen": -245.1847686767578, + "eval_logps/rejected": -240.86825561523438, + "eval_loss": 0.0254887156188488, + "eval_rewards/accuracies": 0.6489999890327454, + "eval_rewards/chosen": -0.06589899212121964, + "eval_rewards/margins": 0.08038286119699478, + "eval_rewards/rejected": -0.14628185331821442, + "eval_runtime": 710.4192, + "eval_samples_per_second": 2.815, + "eval_steps_per_second": 1.408, + "step": 7400 + }, + { + "epoch": 0.48, + "learning_rate": 3.064092929019673e-06, + "logits/chosen": -2.283900737762451, + "logits/rejected": -2.317167282104492, + "logps/chosen": -269.21429443359375, + "logps/rejected": -300.99951171875, + "loss": 0.0248, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0583542063832283, + "rewards/margins": 0.0595521442592144, + "rewards/rejected": -0.117906354367733, + "step": 7410 + }, + { + "epoch": 0.49, + "learning_rate": 3.058528802800952e-06, + "logits/chosen": -2.321166753768921, + "logits/rejected": -2.080514907836914, + "logps/chosen": -303.0827331542969, + "logps/rejected": -282.8626403808594, + "loss": 0.0204, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05224515125155449, + "rewards/margins": 0.08176366984844208, + "rewards/rejected": -0.13400882482528687, + "step": 7420 + }, + { + "epoch": 0.49, + "learning_rate": 3.052961763022397e-06, + "logits/chosen": -2.4596550464630127, + "logits/rejected": -2.1328208446502686, + "logps/chosen": -197.9812469482422, + "logps/rejected": -182.56109619140625, + "loss": 0.0378, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07550039142370224, + "rewards/margins": 0.1165175586938858, + "rewards/rejected": -0.19201794266700745, + "step": 7430 + }, + { + "epoch": 0.49, + "learning_rate": 3.047391838724415e-06, + "logits/chosen": -2.4146902561187744, + "logits/rejected": -1.951748251914978, + "logps/chosen": -248.65621948242188, + "logps/rejected": -253.9379119873047, + "loss": 0.0308, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06568771600723267, + "rewards/margins": 0.10674123466014862, + "rewards/rejected": -0.1724289357662201, + "step": 7440 + }, + { + "epoch": 0.49, + "learning_rate": 3.0418190589624587e-06, + "logits/chosen": -2.335435152053833, + "logits/rejected": -2.073451519012451, + "logps/chosen": -191.60012817382812, + "logps/rejected": -212.0890655517578, + "loss": 0.0241, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06989014893770218, + "rewards/margins": 0.060682911425828934, + "rewards/rejected": -0.1305730640888214, + "step": 7450 + }, + { + "epoch": 0.49, + "learning_rate": 3.0362434528068784e-06, + "logits/chosen": -2.3186919689178467, + "logits/rejected": -1.8934450149536133, + "logps/chosen": -285.0685119628906, + "logps/rejected": -220.4761962890625, + "loss": 0.0099, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07872681319713593, + "rewards/margins": 0.10450903326272964, + "rewards/rejected": -0.18323580920696259, + "step": 7460 + }, + { + "epoch": 0.49, + "learning_rate": 3.0306650493427657e-06, + "logits/chosen": -2.214409828186035, + "logits/rejected": -2.1102757453918457, + "logps/chosen": -245.85733032226562, + "logps/rejected": -254.14730834960938, + "loss": 0.0301, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07119054347276688, + "rewards/margins": 0.08625958859920502, + "rewards/rejected": -0.1574501246213913, + "step": 7470 + }, + { + "epoch": 0.49, + "learning_rate": 3.0250838776698077e-06, + "logits/chosen": -2.0632340908050537, + "logits/rejected": -2.1093077659606934, + "logps/chosen": -204.76260375976562, + "logps/rejected": -221.22854614257812, + "loss": 0.021, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.10421650111675262, + "rewards/margins": 0.09606151282787323, + "rewards/rejected": -0.20027799904346466, + "step": 7480 + }, + { + "epoch": 0.49, + "learning_rate": 3.0194999669021275e-06, + "logits/chosen": -2.0692830085754395, + "logits/rejected": -1.7427141666412354, + "logps/chosen": -239.0797119140625, + "logps/rejected": -213.71298217773438, + "loss": 0.027, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.061056338250637054, + "rewards/margins": 0.10842617601156235, + "rewards/rejected": -0.1694825291633606, + "step": 7490 + }, + { + "epoch": 0.49, + "learning_rate": 3.0139133461681403e-06, + "logits/chosen": -2.2071690559387207, + "logits/rejected": -2.060537815093994, + "logps/chosen": -275.1077575683594, + "logps/rejected": -238.1566619873047, + "loss": 0.0211, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.05356758087873459, + "rewards/margins": 0.0959450975060463, + "rewards/rejected": -0.14951267838478088, + "step": 7500 + }, + { + "epoch": 0.49, + "eval_logits/chosen": -2.248464345932007, + "eval_logits/rejected": -2.063547372817993, + "eval_logps/chosen": -244.62350463867188, + "eval_logps/rejected": -240.8419647216797, + "eval_loss": 0.02583528310060501, + "eval_rewards/accuracies": 0.6520000100135803, + "eval_rewards/chosen": -0.06309277564287186, + "eval_rewards/margins": 0.08305763453245163, + "eval_rewards/rejected": -0.1461504101753235, + "eval_runtime": 711.8448, + "eval_samples_per_second": 2.81, + "eval_steps_per_second": 1.405, + "step": 7500 + }, + { + "epoch": 0.49, + "learning_rate": 3.0083240446103965e-06, + "logits/chosen": -1.9640617370605469, + "logits/rejected": -1.9273643493652344, + "logps/chosen": -195.1829376220703, + "logps/rejected": -223.74267578125, + "loss": 0.0156, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05574166774749756, + "rewards/margins": 0.1044529527425766, + "rewards/rejected": -0.16019462049007416, + "step": 7510 + }, + { + "epoch": 0.49, + "learning_rate": 3.0027320913854306e-06, + "logits/chosen": -2.451677083969116, + "logits/rejected": -2.165865421295166, + "logps/chosen": -301.517333984375, + "logps/rejected": -258.03253173828125, + "loss": 0.022, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04527861624956131, + "rewards/margins": 0.09612289816141129, + "rewards/rejected": -0.1414014995098114, + "step": 7520 + }, + { + "epoch": 0.49, + "learning_rate": 2.997137515663609e-06, + "logits/chosen": -2.2002179622650146, + "logits/rejected": -2.116541862487793, + "logps/chosen": -230.6831817626953, + "logps/rejected": -211.8942108154297, + "loss": 0.0552, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.026528939604759216, + "rewards/margins": 0.0945509821176529, + "rewards/rejected": -0.1210799366235733, + "step": 7530 + }, + { + "epoch": 0.49, + "learning_rate": 2.991540346628981e-06, + "logits/chosen": -2.3002963066101074, + "logits/rejected": -2.118590831756592, + "logps/chosen": -248.7118682861328, + "logps/rejected": -234.67245483398438, + "loss": 0.0165, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.045557767152786255, + "rewards/margins": 0.060344088822603226, + "rewards/rejected": -0.10590185225009918, + "step": 7540 + }, + { + "epoch": 0.49, + "learning_rate": 2.985940613479121e-06, + "logits/chosen": -2.3952338695526123, + "logits/rejected": -2.2858328819274902, + "logps/chosen": -302.9120788574219, + "logps/rejected": -258.48260498046875, + "loss": 0.0282, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04531232267618179, + "rewards/margins": 0.07471176236867905, + "rewards/rejected": -0.12002407014369965, + "step": 7550 + }, + { + "epoch": 0.49, + "learning_rate": 2.980338345424981e-06, + "logits/chosen": -2.255479574203491, + "logits/rejected": -1.950728416442871, + "logps/chosen": -257.19573974609375, + "logps/rejected": -223.8048858642578, + "loss": 0.0157, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.039879582822322845, + "rewards/margins": 0.07442986220121384, + "rewards/rejected": -0.11430943012237549, + "step": 7560 + }, + { + "epoch": 0.5, + "learning_rate": 2.974733571690735e-06, + "logits/chosen": -2.3409745693206787, + "logits/rejected": -2.0642552375793457, + "logps/chosen": -254.4838409423828, + "logps/rejected": -212.90652465820312, + "loss": 0.0446, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08538492023944855, + "rewards/margins": 0.08857604116201401, + "rewards/rejected": -0.17396095395088196, + "step": 7570 + }, + { + "epoch": 0.5, + "learning_rate": 2.9691263215136274e-06, + "logits/chosen": -2.2871947288513184, + "logits/rejected": -2.2657456398010254, + "logps/chosen": -274.03948974609375, + "logps/rejected": -258.7521057128906, + "loss": 0.0118, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.035195596516132355, + "rewards/margins": 0.08183564990758896, + "rewards/rejected": -0.11703125387430191, + "step": 7580 + }, + { + "epoch": 0.5, + "learning_rate": 2.963516624143823e-06, + "logits/chosen": -2.181278705596924, + "logits/rejected": -2.06855845451355, + "logps/chosen": -231.0608673095703, + "logps/rejected": -211.93508911132812, + "loss": 0.0217, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09247070550918579, + "rewards/margins": 0.0920250415802002, + "rewards/rejected": -0.184495747089386, + "step": 7590 + }, + { + "epoch": 0.5, + "learning_rate": 2.9579045088442504e-06, + "logits/chosen": -2.076108455657959, + "logits/rejected": -2.109138011932373, + "logps/chosen": -205.429931640625, + "logps/rejected": -247.7603759765625, + "loss": 0.0379, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0846247524023056, + "rewards/margins": 0.1129075139760971, + "rewards/rejected": -0.1975322812795639, + "step": 7600 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -2.2404394149780273, + "eval_logits/rejected": -2.05661678314209, + "eval_logps/chosen": -246.95504760742188, + "eval_logps/rejected": -243.5423126220703, + "eval_loss": 0.02591700106859207, + "eval_rewards/accuracies": 0.6474999785423279, + "eval_rewards/chosen": -0.07475046068429947, + "eval_rewards/margins": 0.08490156382322311, + "eval_rewards/rejected": -0.1596520096063614, + "eval_runtime": 711.6852, + "eval_samples_per_second": 2.81, + "eval_steps_per_second": 1.405, + "step": 7600 + }, + { + "epoch": 0.5, + "learning_rate": 2.9522900048904534e-06, + "logits/chosen": -2.151641845703125, + "logits/rejected": -2.059218168258667, + "logps/chosen": -261.9276123046875, + "logps/rejected": -240.96853637695312, + "loss": 0.0265, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11094732582569122, + "rewards/margins": 0.04531213641166687, + "rewards/rejected": -0.1562594622373581, + "step": 7610 + }, + { + "epoch": 0.5, + "learning_rate": 2.9466731415704343e-06, + "logits/chosen": -2.2437233924865723, + "logits/rejected": -2.116011381149292, + "logps/chosen": -239.49179077148438, + "logps/rejected": -253.66470336914062, + "loss": 0.0229, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0785256177186966, + "rewards/margins": 0.09270112216472626, + "rewards/rejected": -0.17122673988342285, + "step": 7620 + }, + { + "epoch": 0.5, + "learning_rate": 2.941053948184503e-06, + "logits/chosen": -2.306546211242676, + "logits/rejected": -2.155078649520874, + "logps/chosen": -293.14190673828125, + "logps/rejected": -267.7697448730469, + "loss": 0.0338, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06601421535015106, + "rewards/margins": 0.04770015925168991, + "rewards/rejected": -0.11371437460184097, + "step": 7630 + }, + { + "epoch": 0.5, + "learning_rate": 2.935432454045125e-06, + "logits/chosen": -2.10274076461792, + "logits/rejected": -2.168013095855713, + "logps/chosen": -249.28128051757812, + "logps/rejected": -234.67724609375, + "loss": 0.0248, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08658457547426224, + "rewards/margins": 0.03301741182804108, + "rewards/rejected": -0.11960198730230331, + "step": 7640 + }, + { + "epoch": 0.5, + "learning_rate": 2.929808688476768e-06, + "logits/chosen": -2.340390682220459, + "logits/rejected": -2.217571496963501, + "logps/chosen": -254.6952667236328, + "logps/rejected": -253.04006958007812, + "loss": 0.0389, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0766642838716507, + "rewards/margins": 0.07973639667034149, + "rewards/rejected": -0.1564006805419922, + "step": 7650 + }, + { + "epoch": 0.5, + "learning_rate": 2.924182680815748e-06, + "logits/chosen": -2.258434534072876, + "logits/rejected": -2.182904005050659, + "logps/chosen": -244.4628448486328, + "logps/rejected": -245.7698211669922, + "loss": 0.0149, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04916124790906906, + "rewards/margins": 0.12007315456867218, + "rewards/rejected": -0.16923440992832184, + "step": 7660 + }, + { + "epoch": 0.5, + "learning_rate": 2.9185544604100765e-06, + "logits/chosen": -2.0396854877471924, + "logits/rejected": -1.946394681930542, + "logps/chosen": -211.8596954345703, + "logps/rejected": -221.6324005126953, + "loss": 0.0246, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.07974790781736374, + "rewards/margins": 0.07014047354459763, + "rewards/rejected": -0.14988838136196136, + "step": 7670 + }, + { + "epoch": 0.5, + "learning_rate": 2.9129240566193083e-06, + "logits/chosen": -2.34912371635437, + "logits/rejected": -2.0322132110595703, + "logps/chosen": -215.6170654296875, + "logps/rejected": -219.56967163085938, + "loss": 0.022, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06768370419740677, + "rewards/margins": 0.08620280772447586, + "rewards/rejected": -0.15388651192188263, + "step": 7680 + }, + { + "epoch": 0.5, + "learning_rate": 2.9072914988143874e-06, + "logits/chosen": -2.108494758605957, + "logits/rejected": -2.0200300216674805, + "logps/chosen": -213.08517456054688, + "logps/rejected": -228.9197998046875, + "loss": 0.0381, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05827696993947029, + "rewards/margins": 0.1215302124619484, + "rewards/rejected": -0.17980718612670898, + "step": 7690 + }, + { + "epoch": 0.5, + "learning_rate": 2.9016568163774956e-06, + "logits/chosen": -2.3322207927703857, + "logits/rejected": -2.09818434715271, + "logps/chosen": -183.9313507080078, + "logps/rejected": -165.3214874267578, + "loss": 0.0117, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.058510202914476395, + "rewards/margins": 0.07721497863531113, + "rewards/rejected": -0.13572517037391663, + "step": 7700 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -2.2502083778381348, + "eval_logits/rejected": -2.0660572052001953, + "eval_logps/chosen": -243.0759735107422, + "eval_logps/rejected": -239.77200317382812, + "eval_loss": 0.02568177692592144, + "eval_rewards/accuracies": 0.6620000004768372, + "eval_rewards/chosen": -0.0553550161421299, + "eval_rewards/margins": 0.08544543385505676, + "eval_rewards/rejected": -0.14080046117305756, + "eval_runtime": 713.4471, + "eval_samples_per_second": 2.803, + "eval_steps_per_second": 1.402, + "step": 7700 + }, + { + "epoch": 0.5, + "learning_rate": 2.8960200387018942e-06, + "logits/chosen": -2.0974814891815186, + "logits/rejected": -2.0618884563446045, + "logps/chosen": -321.8419494628906, + "logps/rejected": -286.4595031738281, + "loss": 0.0235, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07054462283849716, + "rewards/margins": 0.06725998967885971, + "rewards/rejected": -0.13780462741851807, + "step": 7710 + }, + { + "epoch": 0.51, + "learning_rate": 2.8903811951917792e-06, + "logits/chosen": -2.2555909156799316, + "logits/rejected": -2.1047475337982178, + "logps/chosen": -207.61007690429688, + "logps/rejected": -176.0470428466797, + "loss": 0.0266, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04665211960673332, + "rewards/margins": 0.07789582014083862, + "rewards/rejected": -0.12454793602228165, + "step": 7720 + }, + { + "epoch": 0.51, + "learning_rate": 2.88474031526212e-06, + "logits/chosen": -2.212296962738037, + "logits/rejected": -2.180422306060791, + "logps/chosen": -213.6601104736328, + "logps/rejected": -240.7231903076172, + "loss": 0.0163, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05410192161798477, + "rewards/margins": 0.06817255169153214, + "rewards/rejected": -0.1222744733095169, + "step": 7730 + }, + { + "epoch": 0.51, + "learning_rate": 2.879097428338509e-06, + "logits/chosen": -2.208082437515259, + "logits/rejected": -1.9035813808441162, + "logps/chosen": -224.62008666992188, + "logps/rejected": -218.4266815185547, + "loss": 0.0316, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.043625570833683014, + "rewards/margins": 0.08067473769187927, + "rewards/rejected": -0.12430031597614288, + "step": 7740 + }, + { + "epoch": 0.51, + "learning_rate": 2.8734525638570094e-06, + "logits/chosen": -2.2176127433776855, + "logits/rejected": -2.1426639556884766, + "logps/chosen": -241.19674682617188, + "logps/rejected": -242.5107421875, + "loss": 0.0232, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03501252457499504, + "rewards/margins": 0.06843651831150055, + "rewards/rejected": -0.1034490317106247, + "step": 7750 + }, + { + "epoch": 0.51, + "learning_rate": 2.8678057512639982e-06, + "logits/chosen": -2.170894145965576, + "logits/rejected": -2.076629638671875, + "logps/chosen": -288.15228271484375, + "logps/rejected": -292.22430419921875, + "loss": 0.0225, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.01042949128895998, + "rewards/margins": 0.13065668940544128, + "rewards/rejected": -0.1410861760377884, + "step": 7760 + }, + { + "epoch": 0.51, + "learning_rate": 2.8621570200160172e-06, + "logits/chosen": -2.0633962154388428, + "logits/rejected": -1.9723310470581055, + "logps/chosen": -172.1400909423828, + "logps/rejected": -184.14730834960938, + "loss": 0.0184, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.017855005338788033, + "rewards/margins": 0.10092601925134659, + "rewards/rejected": -0.11878103017807007, + "step": 7770 + }, + { + "epoch": 0.51, + "learning_rate": 2.856506399579615e-06, + "logits/chosen": -2.417569637298584, + "logits/rejected": -2.0469810962677, + "logps/chosen": -231.6193389892578, + "logps/rejected": -227.15756225585938, + "loss": 0.0301, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06627919524908066, + "rewards/margins": 0.07513771951198578, + "rewards/rejected": -0.14141690731048584, + "step": 7780 + }, + { + "epoch": 0.51, + "learning_rate": 2.8508539194311964e-06, + "logits/chosen": -2.329987049102783, + "logits/rejected": -2.3192734718322754, + "logps/chosen": -261.53924560546875, + "logps/rejected": -287.02728271484375, + "loss": 0.0106, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.039029061794281006, + "rewards/margins": 0.06329012662172318, + "rewards/rejected": -0.10231919586658478, + "step": 7790 + }, + { + "epoch": 0.51, + "learning_rate": 2.8451996090568656e-06, + "logits/chosen": -2.2257461547851562, + "logits/rejected": -2.104602098464966, + "logps/chosen": -202.16307067871094, + "logps/rejected": -204.09323120117188, + "loss": 0.0197, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08011750876903534, + "rewards/margins": 0.09945876896381378, + "rewards/rejected": -0.17957626283168793, + "step": 7800 + }, + { + "epoch": 0.51, + "eval_logits/chosen": -2.27226185798645, + "eval_logits/rejected": -2.0867011547088623, + "eval_logps/chosen": -245.60133361816406, + "eval_logps/rejected": -242.34844970703125, + "eval_loss": 0.026134636253118515, + "eval_rewards/accuracies": 0.6589999794960022, + "eval_rewards/chosen": -0.06798180192708969, + "eval_rewards/margins": 0.08570097386837006, + "eval_rewards/rejected": -0.15368276834487915, + "eval_runtime": 710.4359, + "eval_samples_per_second": 2.815, + "eval_steps_per_second": 1.408, + "step": 7800 + }, + { + "epoch": 0.51, + "learning_rate": 2.839543497952276e-06, + "logits/chosen": -2.1585605144500732, + "logits/rejected": -2.267885208129883, + "logps/chosen": -199.9320831298828, + "logps/rejected": -208.8585662841797, + "loss": 0.0458, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07660982012748718, + "rewards/margins": 0.08815185725688934, + "rewards/rejected": -0.16476169228553772, + "step": 7810 + }, + { + "epoch": 0.51, + "learning_rate": 2.833885615622474e-06, + "logits/chosen": -2.2131218910217285, + "logits/rejected": -2.064406394958496, + "logps/chosen": -221.01327514648438, + "logps/rejected": -244.9972686767578, + "loss": 0.0318, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.09218723326921463, + "rewards/margins": 0.07254284620285034, + "rewards/rejected": -0.16473010182380676, + "step": 7820 + }, + { + "epoch": 0.51, + "learning_rate": 2.8282259915817454e-06, + "logits/chosen": -1.9040521383285522, + "logits/rejected": -2.1038641929626465, + "logps/chosen": -155.58538818359375, + "logps/rejected": -215.759521484375, + "loss": 0.0184, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.073203444480896, + "rewards/margins": 0.09981255233287811, + "rewards/rejected": -0.17301598191261292, + "step": 7830 + }, + { + "epoch": 0.51, + "learning_rate": 2.8225646553534614e-06, + "logits/chosen": -2.0671770572662354, + "logits/rejected": -1.9584850072860718, + "logps/chosen": -208.79440307617188, + "logps/rejected": -218.4818878173828, + "loss": 0.0309, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.047392092645168304, + "rewards/margins": 0.06489382684230804, + "rewards/rejected": -0.11228591203689575, + "step": 7840 + }, + { + "epoch": 0.51, + "learning_rate": 2.8169016364699255e-06, + "logits/chosen": -2.264528512954712, + "logits/rejected": -1.993786096572876, + "logps/chosen": -228.7519073486328, + "logps/rejected": -244.06967163085938, + "loss": 0.0279, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.08431997150182724, + "rewards/margins": 0.06783817708492279, + "rewards/rejected": -0.15215817093849182, + "step": 7850 + }, + { + "epoch": 0.51, + "learning_rate": 2.811236964472217e-06, + "logits/chosen": -2.3775601387023926, + "logits/rejected": -2.0033047199249268, + "logps/chosen": -323.74560546875, + "logps/rejected": -277.99798583984375, + "loss": 0.0308, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06669513881206512, + "rewards/margins": 0.07186156511306763, + "rewards/rejected": -0.13855668902397156, + "step": 7860 + }, + { + "epoch": 0.51, + "learning_rate": 2.805570668910041e-06, + "logits/chosen": -2.0790164470672607, + "logits/rejected": -2.051755666732788, + "logps/chosen": -191.5697479248047, + "logps/rejected": -271.2172546386719, + "loss": 0.0203, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.10965213924646378, + "rewards/margins": 0.08073590695858002, + "rewards/rejected": -0.1903880536556244, + "step": 7870 + }, + { + "epoch": 0.52, + "learning_rate": 2.7999027793415695e-06, + "logits/chosen": -2.4737279415130615, + "logits/rejected": -2.0029778480529785, + "logps/chosen": -260.46319580078125, + "logps/rejected": -227.0538787841797, + "loss": 0.0147, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07773597538471222, + "rewards/margins": 0.057581812143325806, + "rewards/rejected": -0.13531777262687683, + "step": 7880 + }, + { + "epoch": 0.52, + "learning_rate": 2.794233325333293e-06, + "logits/chosen": -2.160679340362549, + "logits/rejected": -2.0541396141052246, + "logps/chosen": -270.203857421875, + "logps/rejected": -267.7791442871094, + "loss": 0.0209, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05961208418011665, + "rewards/margins": 0.10626313835382462, + "rewards/rejected": -0.16587522625923157, + "step": 7890 + }, + { + "epoch": 0.52, + "learning_rate": 2.7885623364598597e-06, + "logits/chosen": -2.3812639713287354, + "logits/rejected": -2.0741419792175293, + "logps/chosen": -280.6651306152344, + "logps/rejected": -257.989501953125, + "loss": 0.0296, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0848626121878624, + "rewards/margins": 0.09621632099151611, + "rewards/rejected": -0.1810789406299591, + "step": 7900 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -2.2761878967285156, + "eval_logits/rejected": -2.0899689197540283, + "eval_logps/chosen": -245.6046600341797, + "eval_logps/rejected": -241.3648681640625, + "eval_loss": 0.025326939299702644, + "eval_rewards/accuracies": 0.6554999947547913, + "eval_rewards/chosen": -0.06799853593111038, + "eval_rewards/margins": 0.08076643198728561, + "eval_rewards/rejected": -0.148764967918396, + "eval_runtime": 712.7189, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 7900 + }, + { + "epoch": 0.52, + "learning_rate": 2.782889842303926e-06, + "logits/chosen": -2.247314453125, + "logits/rejected": -2.0741233825683594, + "logps/chosen": -183.73057556152344, + "logps/rejected": -185.82192993164062, + "loss": 0.0417, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11750733852386475, + "rewards/margins": 0.03794608265161514, + "rewards/rejected": -0.1554533988237381, + "step": 7910 + }, + { + "epoch": 0.52, + "learning_rate": 2.7772158724559987e-06, + "logits/chosen": -2.0826973915100098, + "logits/rejected": -1.9278663396835327, + "logps/chosen": -228.0181427001953, + "logps/rejected": -298.50860595703125, + "loss": 0.0105, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.05454551428556442, + "rewards/margins": 0.16509851813316345, + "rewards/rejected": -0.21964402496814728, + "step": 7920 + }, + { + "epoch": 0.52, + "learning_rate": 2.7715404565142856e-06, + "logits/chosen": -2.2426955699920654, + "logits/rejected": -2.106792449951172, + "logps/chosen": -215.12130737304688, + "logps/rejected": -220.0441436767578, + "loss": 0.0179, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0757858008146286, + "rewards/margins": 0.05572701245546341, + "rewards/rejected": -0.1315128058195114, + "step": 7930 + }, + { + "epoch": 0.52, + "learning_rate": 2.7658636240845354e-06, + "logits/chosen": -2.390791416168213, + "logits/rejected": -2.2865827083587646, + "logps/chosen": -236.14505004882812, + "logps/rejected": -268.32696533203125, + "loss": 0.0107, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0634835809469223, + "rewards/margins": 0.08781920373439789, + "rewards/rejected": -0.1513027846813202, + "step": 7940 + }, + { + "epoch": 0.52, + "learning_rate": 2.7601854047798872e-06, + "logits/chosen": -2.180553913116455, + "logits/rejected": -2.220141887664795, + "logps/chosen": -237.9814453125, + "logps/rejected": -271.52630615234375, + "loss": 0.0278, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.07029031217098236, + "rewards/margins": 0.07064785063266754, + "rewards/rejected": -0.1409381479024887, + "step": 7950 + }, + { + "epoch": 0.52, + "learning_rate": 2.7545058282207148e-06, + "logits/chosen": -2.315669059753418, + "logits/rejected": -1.9250940084457397, + "logps/chosen": -225.5505828857422, + "logps/rejected": -209.63894653320312, + "loss": 0.0269, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.07004386186599731, + "rewards/margins": 0.07333650439977646, + "rewards/rejected": -0.14338035881519318, + "step": 7960 + }, + { + "epoch": 0.52, + "learning_rate": 2.748824924034471e-06, + "logits/chosen": -2.244135618209839, + "logits/rejected": -2.105903148651123, + "logps/chosen": -240.2060089111328, + "logps/rejected": -238.26852416992188, + "loss": 0.0133, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.10397887229919434, + "rewards/margins": 0.08479610085487366, + "rewards/rejected": -0.188774973154068, + "step": 7970 + }, + { + "epoch": 0.52, + "learning_rate": 2.743142721855536e-06, + "logits/chosen": -2.1047046184539795, + "logits/rejected": -2.0798017978668213, + "logps/chosen": -165.8533477783203, + "logps/rejected": -167.95614624023438, + "loss": 0.0372, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0699661374092102, + "rewards/margins": 0.052256323397159576, + "rewards/rejected": -0.12222246825695038, + "step": 7980 + }, + { + "epoch": 0.52, + "learning_rate": 2.737459251325058e-06, + "logits/chosen": -2.2246992588043213, + "logits/rejected": -2.1735329627990723, + "logps/chosen": -279.4375305175781, + "logps/rejected": -266.1138610839844, + "loss": 0.0129, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04731985926628113, + "rewards/margins": 0.046038512140512466, + "rewards/rejected": -0.0933583602309227, + "step": 7990 + }, + { + "epoch": 0.52, + "learning_rate": 2.731774542090804e-06, + "logits/chosen": -2.1986324787139893, + "logits/rejected": -1.7890323400497437, + "logps/chosen": -204.6566619873047, + "logps/rejected": -194.89529418945312, + "loss": 0.0385, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.05940311402082443, + "rewards/margins": 0.04636824131011963, + "rewards/rejected": -0.10577134042978287, + "step": 8000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -2.2589457035064697, + "eval_logits/rejected": -2.073702573776245, + "eval_logps/chosen": -241.4889373779297, + "eval_logps/rejected": -237.5529022216797, + "eval_loss": 0.025104772299528122, + "eval_rewards/accuracies": 0.6499999761581421, + "eval_rewards/chosen": -0.04741990193724632, + "eval_rewards/margins": 0.08228505402803421, + "eval_rewards/rejected": -0.12970495223999023, + "eval_runtime": 713.1967, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 1.402, + "step": 8000 + }, + { + "epoch": 0.52, + "learning_rate": 2.7260886238070034e-06, + "logits/chosen": -2.26794171333313, + "logits/rejected": -2.1829566955566406, + "logps/chosen": -205.74667358398438, + "logps/rejected": -216.74722290039062, + "loss": 0.0387, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.046383585780858994, + "rewards/margins": 0.07854325324296951, + "rewards/rejected": -0.1249268501996994, + "step": 8010 + }, + { + "epoch": 0.52, + "learning_rate": 2.72040152613419e-06, + "logits/chosen": -2.296430826187134, + "logits/rejected": -1.7960484027862549, + "logps/chosen": -228.84841918945312, + "logps/rejected": -171.97018432617188, + "loss": 0.0344, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.05986651033163071, + "rewards/margins": 0.13173654675483704, + "rewards/rejected": -0.19160306453704834, + "step": 8020 + }, + { + "epoch": 0.53, + "learning_rate": 2.7147132787390516e-06, + "logits/chosen": -2.277390956878662, + "logits/rejected": -1.9836620092391968, + "logps/chosen": -236.23367309570312, + "logps/rejected": -234.6664581298828, + "loss": 0.022, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.043777886778116226, + "rewards/margins": 0.07709158211946487, + "rewards/rejected": -0.1208694726228714, + "step": 8030 + }, + { + "epoch": 0.53, + "learning_rate": 2.709023911294273e-06, + "logits/chosen": -2.3583455085754395, + "logits/rejected": -1.8926982879638672, + "logps/chosen": -247.84768676757812, + "logps/rejected": -246.155517578125, + "loss": 0.0446, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.027644217014312744, + "rewards/margins": 0.14603692293167114, + "rewards/rejected": -0.1736811399459839, + "step": 8040 + }, + { + "epoch": 0.53, + "learning_rate": 2.7033334534783806e-06, + "logits/chosen": -2.251115083694458, + "logits/rejected": -2.343580961227417, + "logps/chosen": -208.17770385742188, + "logps/rejected": -244.0928192138672, + "loss": 0.0245, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.045159198343753815, + "rewards/margins": 0.09858004003763199, + "rewards/rejected": -0.1437392383813858, + "step": 8050 + }, + { + "epoch": 0.53, + "learning_rate": 2.697641934975592e-06, + "logits/chosen": -2.2605040073394775, + "logits/rejected": -2.0341665744781494, + "logps/chosen": -238.95156860351562, + "logps/rejected": -222.9936065673828, + "loss": 0.0342, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.05582579970359802, + "rewards/margins": 0.08297914266586304, + "rewards/rejected": -0.13880494236946106, + "step": 8060 + }, + { + "epoch": 0.53, + "learning_rate": 2.691949385475654e-06, + "logits/chosen": -2.2924370765686035, + "logits/rejected": -2.0420851707458496, + "logps/chosen": -257.25457763671875, + "logps/rejected": -246.86679077148438, + "loss": 0.0376, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06681881099939346, + "rewards/margins": 0.07591725140810013, + "rewards/rejected": -0.1427360475063324, + "step": 8070 + }, + { + "epoch": 0.53, + "learning_rate": 2.6862558346736937e-06, + "logits/chosen": -2.205564260482788, + "logits/rejected": -2.058074951171875, + "logps/chosen": -252.31167602539062, + "logps/rejected": -277.84716796875, + "loss": 0.0192, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06979704648256302, + "rewards/margins": 0.15368643403053284, + "rewards/rejected": -0.22348348796367645, + "step": 8080 + }, + { + "epoch": 0.53, + "learning_rate": 2.6805613122700617e-06, + "logits/chosen": -2.2460989952087402, + "logits/rejected": -1.916083574295044, + "logps/chosen": -242.57296752929688, + "logps/rejected": -261.56463623046875, + "loss": 0.0219, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09500262886285782, + "rewards/margins": 0.09024744480848312, + "rewards/rejected": -0.18525007367134094, + "step": 8090 + }, + { + "epoch": 0.53, + "learning_rate": 2.674865847970176e-06, + "logits/chosen": -2.1714882850646973, + "logits/rejected": -1.9047397375106812, + "logps/chosen": -222.4866485595703, + "logps/rejected": -262.7538146972656, + "loss": 0.0295, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07435137033462524, + "rewards/margins": 0.0890941396355629, + "rewards/rejected": -0.16344550251960754, + "step": 8100 + }, + { + "epoch": 0.53, + "eval_logits/chosen": -2.2293014526367188, + "eval_logits/rejected": -2.0447380542755127, + "eval_logps/chosen": -246.51162719726562, + "eval_logps/rejected": -242.96429443359375, + "eval_loss": 0.02487250603735447, + "eval_rewards/accuracies": 0.6589999794960022, + "eval_rewards/chosen": -0.07253342866897583, + "eval_rewards/margins": 0.08422857522964478, + "eval_rewards/rejected": -0.1567619889974594, + "eval_runtime": 712.8219, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 8100 + }, + { + "epoch": 0.53, + "learning_rate": 2.669169471484368e-06, + "logits/chosen": -1.9838573932647705, + "logits/rejected": -2.03348708152771, + "logps/chosen": -182.14694213867188, + "logps/rejected": -187.5550079345703, + "loss": 0.0315, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.085444375872612, + "rewards/margins": 0.040241360664367676, + "rewards/rejected": -0.12568573653697968, + "step": 8110 + }, + { + "epoch": 0.53, + "learning_rate": 2.6634722125277278e-06, + "logits/chosen": -2.333920955657959, + "logits/rejected": -2.0139307975769043, + "logps/chosen": -250.2372589111328, + "logps/rejected": -274.3821105957031, + "loss": 0.0275, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.09058667719364166, + "rewards/margins": 0.079750195145607, + "rewards/rejected": -0.17033687233924866, + "step": 8120 + }, + { + "epoch": 0.53, + "learning_rate": 2.6577741008199498e-06, + "logits/chosen": -2.2460877895355225, + "logits/rejected": -1.8959665298461914, + "logps/chosen": -270.9378356933594, + "logps/rejected": -243.338134765625, + "loss": 0.025, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07389827072620392, + "rewards/margins": 0.16220808029174805, + "rewards/rejected": -0.23610636591911316, + "step": 8130 + }, + { + "epoch": 0.53, + "learning_rate": 2.652075166085175e-06, + "logits/chosen": -2.146793842315674, + "logits/rejected": -2.1021342277526855, + "logps/chosen": -252.03121948242188, + "logps/rejected": -300.4096984863281, + "loss": 0.0242, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08725959807634354, + "rewards/margins": 0.12416623532772064, + "rewards/rejected": -0.21142585575580597, + "step": 8140 + }, + { + "epoch": 0.53, + "learning_rate": 2.6463754380518395e-06, + "logits/chosen": -2.1202099323272705, + "logits/rejected": -1.9160016775131226, + "logps/chosen": -251.58853149414062, + "logps/rejected": -213.3192901611328, + "loss": 0.0292, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.10315438359975815, + "rewards/margins": 0.08278907835483551, + "rewards/rejected": -0.18594345450401306, + "step": 8150 + }, + { + "epoch": 0.53, + "learning_rate": 2.6406749464525167e-06, + "logits/chosen": -2.2524800300598145, + "logits/rejected": -1.9723221063613892, + "logps/chosen": -242.8863067626953, + "logps/rejected": -214.191650390625, + "loss": 0.0389, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.046635985374450684, + "rewards/margins": 0.09268581122159958, + "rewards/rejected": -0.13932180404663086, + "step": 8160 + }, + { + "epoch": 0.53, + "learning_rate": 2.634973721023762e-06, + "logits/chosen": -2.288912534713745, + "logits/rejected": -2.132657051086426, + "logps/chosen": -273.7260437011719, + "logps/rejected": -234.4434051513672, + "loss": 0.0504, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0963190421462059, + "rewards/margins": 0.05077965185046196, + "rewards/rejected": -0.14709869027137756, + "step": 8170 + }, + { + "epoch": 0.54, + "learning_rate": 2.6292717915059605e-06, + "logits/chosen": -2.334805965423584, + "logits/rejected": -2.0667991638183594, + "logps/chosen": -291.00115966796875, + "logps/rejected": -256.76666259765625, + "loss": 0.0146, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07685528695583344, + "rewards/margins": 0.10979298502206802, + "rewards/rejected": -0.18664827942848206, + "step": 8180 + }, + { + "epoch": 0.54, + "learning_rate": 2.6235691876431706e-06, + "logits/chosen": -2.138881206512451, + "logits/rejected": -2.1886343955993652, + "logps/chosen": -233.529541015625, + "logps/rejected": -253.8592987060547, + "loss": 0.0195, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07456973940134048, + "rewards/margins": 0.07365313172340393, + "rewards/rejected": -0.1482228934764862, + "step": 8190 + }, + { + "epoch": 0.54, + "learning_rate": 2.6178659391829673e-06, + "logits/chosen": -2.356948137283325, + "logits/rejected": -2.0806820392608643, + "logps/chosen": -248.50607299804688, + "logps/rejected": -223.2180633544922, + "loss": 0.0147, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.055140793323516846, + "rewards/margins": 0.07777590304613113, + "rewards/rejected": -0.13291668891906738, + "step": 8200 + }, + { + "epoch": 0.54, + "eval_logits/chosen": -2.2300643920898438, + "eval_logits/rejected": -2.045851945877075, + "eval_logps/chosen": -248.29393005371094, + "eval_logps/rejected": -244.34071350097656, + "eval_loss": 0.025006111711263657, + "eval_rewards/accuracies": 0.6455000042915344, + "eval_rewards/chosen": -0.08144490420818329, + "eval_rewards/margins": 0.08219918608665466, + "eval_rewards/rejected": -0.16364407539367676, + "eval_runtime": 712.7354, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 8200 + }, + { + "epoch": 0.54, + "learning_rate": 2.6121620758762877e-06, + "logits/chosen": -2.213970422744751, + "logits/rejected": -1.9608663320541382, + "logps/chosen": -209.7481231689453, + "logps/rejected": -221.8324432373047, + "loss": 0.0346, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08831901103258133, + "rewards/margins": 0.06441696733236313, + "rewards/rejected": -0.15273597836494446, + "step": 8210 + }, + { + "epoch": 0.54, + "learning_rate": 2.606457627477277e-06, + "logits/chosen": -2.149473190307617, + "logits/rejected": -2.0927867889404297, + "logps/chosen": -189.00047302246094, + "logps/rejected": -210.3500213623047, + "loss": 0.0487, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06685586273670197, + "rewards/margins": 0.0899873748421669, + "rewards/rejected": -0.15684325993061066, + "step": 8220 + }, + { + "epoch": 0.54, + "learning_rate": 2.6007526237431324e-06, + "logits/chosen": -2.3013267517089844, + "logits/rejected": -2.2491354942321777, + "logps/chosen": -196.39431762695312, + "logps/rejected": -228.51321411132812, + "loss": 0.0192, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07182405889034271, + "rewards/margins": 0.09000401198863983, + "rewards/rejected": -0.16182805597782135, + "step": 8230 + }, + { + "epoch": 0.54, + "learning_rate": 2.5950470944339478e-06, + "logits/chosen": -2.0825228691101074, + "logits/rejected": -2.1439049243927, + "logps/chosen": -231.905517578125, + "logps/rejected": -234.68832397460938, + "loss": 0.0335, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.050518035888671875, + "rewards/margins": 0.03139277547597885, + "rewards/rejected": -0.08191081136465073, + "step": 8240 + }, + { + "epoch": 0.54, + "learning_rate": 2.58934106931256e-06, + "logits/chosen": -2.217744827270508, + "logits/rejected": -1.9220752716064453, + "logps/chosen": -237.2313690185547, + "logps/rejected": -231.385986328125, + "loss": 0.0302, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.08229657262563705, + "rewards/margins": 0.06666077673435211, + "rewards/rejected": -0.14895735681056976, + "step": 8250 + }, + { + "epoch": 0.54, + "learning_rate": 2.58363457814439e-06, + "logits/chosen": -2.212259292602539, + "logits/rejected": -1.929527997970581, + "logps/chosen": -230.8060760498047, + "logps/rejected": -234.58523559570312, + "loss": 0.0308, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.10067076981067657, + "rewards/margins": 0.08751292526721954, + "rewards/rejected": -0.18818369507789612, + "step": 8260 + }, + { + "epoch": 0.54, + "learning_rate": 2.5779276506972924e-06, + "logits/chosen": -2.1959948539733887, + "logits/rejected": -2.173485279083252, + "logps/chosen": -247.912841796875, + "logps/rejected": -220.7331085205078, + "loss": 0.0237, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0797998383641243, + "rewards/margins": 0.06101224943995476, + "rewards/rejected": -0.14081206917762756, + "step": 8270 + }, + { + "epoch": 0.54, + "learning_rate": 2.5722203167413945e-06, + "logits/chosen": -2.316793441772461, + "logits/rejected": -1.9904791116714478, + "logps/chosen": -299.3642272949219, + "logps/rejected": -234.1168975830078, + "loss": 0.016, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.08033281564712524, + "rewards/margins": 0.08336742967367172, + "rewards/rejected": -0.16370025277137756, + "step": 8280 + }, + { + "epoch": 0.54, + "learning_rate": 2.5665126060489476e-06, + "logits/chosen": -2.2750446796417236, + "logits/rejected": -2.1247048377990723, + "logps/chosen": -204.67526245117188, + "logps/rejected": -246.42562866210938, + "loss": 0.0132, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.08286619186401367, + "rewards/margins": 0.06451877951622009, + "rewards/rejected": -0.14738497138023376, + "step": 8290 + }, + { + "epoch": 0.54, + "learning_rate": 2.560804548394165e-06, + "logits/chosen": -2.1876912117004395, + "logits/rejected": -1.9274814128875732, + "logps/chosen": -263.56103515625, + "logps/rejected": -235.9149932861328, + "loss": 0.0166, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08140331506729126, + "rewards/margins": 0.08813115209341049, + "rewards/rejected": -0.16953447461128235, + "step": 8300 + }, + { + "epoch": 0.54, + "eval_logits/chosen": -2.2465648651123047, + "eval_logits/rejected": -2.06178617477417, + "eval_logps/chosen": -244.70086669921875, + "eval_logps/rejected": -239.91378784179688, + "eval_loss": 0.025410430505871773, + "eval_rewards/accuracies": 0.6535000205039978, + "eval_rewards/chosen": -0.06347952038049698, + "eval_rewards/margins": 0.07802990823984146, + "eval_rewards/rejected": -0.14150942862033844, + "eval_runtime": 712.3701, + "eval_samples_per_second": 2.808, + "eval_steps_per_second": 1.404, + "step": 8300 + }, + { + "epoch": 0.54, + "learning_rate": 2.5550961735530734e-06, + "logits/chosen": -2.0759482383728027, + "logits/rejected": -2.251865863800049, + "logps/chosen": -172.39732360839844, + "logps/rejected": -215.35757446289062, + "loss": 0.0252, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.046807561069726944, + "rewards/margins": 0.06184772402048111, + "rewards/rejected": -0.10865527391433716, + "step": 8310 + }, + { + "epoch": 0.54, + "learning_rate": 2.549387511303351e-06, + "logits/chosen": -2.231900215148926, + "logits/rejected": -2.271174669265747, + "logps/chosen": -181.90859985351562, + "logps/rejected": -237.9917755126953, + "loss": 0.0127, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.060858823359012604, + "rewards/margins": 0.05901271849870682, + "rewards/rejected": -0.11987153440713882, + "step": 8320 + }, + { + "epoch": 0.55, + "learning_rate": 2.5436785914241774e-06, + "logits/chosen": -2.1687874794006348, + "logits/rejected": -2.1932742595672607, + "logps/chosen": -214.6407012939453, + "logps/rejected": -211.1691131591797, + "loss": 0.0365, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0784073919057846, + "rewards/margins": 0.14249031245708466, + "rewards/rejected": -0.22089770436286926, + "step": 8330 + }, + { + "epoch": 0.55, + "learning_rate": 2.5379694436960746e-06, + "logits/chosen": -2.350149393081665, + "logits/rejected": -2.1551597118377686, + "logps/chosen": -255.09774780273438, + "logps/rejected": -279.1950378417969, + "loss": 0.0251, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.045433927327394485, + "rewards/margins": 0.06766197085380554, + "rewards/rejected": -0.11309590190649033, + "step": 8340 + }, + { + "epoch": 0.55, + "learning_rate": 2.5322600979007533e-06, + "logits/chosen": -2.3590166568756104, + "logits/rejected": -2.1173205375671387, + "logps/chosen": -226.3520050048828, + "logps/rejected": -220.6983642578125, + "loss": 0.0225, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06928284466266632, + "rewards/margins": 0.07522784173488617, + "rewards/rejected": -0.1445106863975525, + "step": 8350 + }, + { + "epoch": 0.55, + "learning_rate": 2.5265505838209592e-06, + "logits/chosen": -2.378201723098755, + "logits/rejected": -2.035088062286377, + "logps/chosen": -273.260986328125, + "logps/rejected": -238.40170288085938, + "loss": 0.0294, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08148683607578278, + "rewards/margins": 0.057769954204559326, + "rewards/rejected": -0.1392567902803421, + "step": 8360 + }, + { + "epoch": 0.55, + "learning_rate": 2.520840931240314e-06, + "logits/chosen": -2.4000751972198486, + "logits/rejected": -1.9181627035140991, + "logps/chosen": -222.73373413085938, + "logps/rejected": -174.30149841308594, + "loss": 0.021, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06462567299604416, + "rewards/margins": 0.07644327729940414, + "rewards/rejected": -0.1410689651966095, + "step": 8370 + }, + { + "epoch": 0.55, + "learning_rate": 2.515131169943162e-06, + "logits/chosen": -1.9641252756118774, + "logits/rejected": -2.0435292720794678, + "logps/chosen": -275.6920471191406, + "logps/rejected": -287.4151611328125, + "loss": 0.0239, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08798758685588837, + "rewards/margins": 0.09347712248563766, + "rewards/rejected": -0.18146470189094543, + "step": 8380 + }, + { + "epoch": 0.55, + "learning_rate": 2.509421329714416e-06, + "logits/chosen": -2.0971388816833496, + "logits/rejected": -2.129549980163574, + "logps/chosen": -218.18551635742188, + "logps/rejected": -247.5358123779297, + "loss": 0.0362, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.06023172661662102, + "rewards/margins": 0.040791187435388565, + "rewards/rejected": -0.10102292150259018, + "step": 8390 + }, + { + "epoch": 0.55, + "learning_rate": 2.5037114403393987e-06, + "logits/chosen": -2.2117886543273926, + "logits/rejected": -1.9651107788085938, + "logps/chosen": -221.71493530273438, + "logps/rejected": -200.9381866455078, + "loss": 0.0177, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05367283895611763, + "rewards/margins": 0.06501881778240204, + "rewards/rejected": -0.11869166791439056, + "step": 8400 + }, + { + "epoch": 0.55, + "eval_logits/chosen": -2.2463998794555664, + "eval_logits/rejected": -2.062314510345459, + "eval_logps/chosen": -243.38662719726562, + "eval_logps/rejected": -236.77584838867188, + "eval_loss": 0.02601229026913643, + "eval_rewards/accuracies": 0.6504999995231628, + "eval_rewards/chosen": -0.05690838024020195, + "eval_rewards/margins": 0.0689113661646843, + "eval_rewards/rejected": -0.12581974267959595, + "eval_runtime": 712.0051, + "eval_samples_per_second": 2.809, + "eval_steps_per_second": 1.404, + "step": 8400 + }, + { + "epoch": 0.55, + "learning_rate": 2.4980015316036908e-06, + "logits/chosen": -2.080289363861084, + "logits/rejected": -2.130309581756592, + "logps/chosen": -185.6266326904297, + "logps/rejected": -229.3311309814453, + "loss": 0.0218, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.049084652215242386, + "rewards/margins": 0.11059342324733734, + "rewards/rejected": -0.15967807173728943, + "step": 8410 + }, + { + "epoch": 0.55, + "learning_rate": 2.4922916332929725e-06, + "logits/chosen": -2.413717269897461, + "logits/rejected": -2.1551430225372314, + "logps/chosen": -245.4679718017578, + "logps/rejected": -211.2089080810547, + "loss": 0.0285, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.05021866410970688, + "rewards/margins": 0.026777099817991257, + "rewards/rejected": -0.07699576765298843, + "step": 8420 + }, + { + "epoch": 0.55, + "learning_rate": 2.4865817751928716e-06, + "logits/chosen": -2.151843786239624, + "logits/rejected": -2.135716199874878, + "logps/chosen": -206.24124145507812, + "logps/rejected": -257.28741455078125, + "loss": 0.0401, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.053762663155794144, + "rewards/margins": 0.12312023341655731, + "rewards/rejected": -0.17688289284706116, + "step": 8430 + }, + { + "epoch": 0.55, + "learning_rate": 2.4808719870888037e-06, + "logits/chosen": -2.012324333190918, + "logits/rejected": -1.9392001628875732, + "logps/chosen": -227.54605102539062, + "logps/rejected": -215.35977172851562, + "loss": 0.0174, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04937288910150528, + "rewards/margins": 0.10543738305568695, + "rewards/rejected": -0.15481027960777283, + "step": 8440 + }, + { + "epoch": 0.55, + "learning_rate": 2.4751622987658206e-06, + "logits/chosen": -2.4310302734375, + "logits/rejected": -2.250427722930908, + "logps/chosen": -246.6014404296875, + "logps/rejected": -248.3928680419922, + "loss": 0.0246, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04325593635439873, + "rewards/margins": 0.06266029924154282, + "rewards/rejected": -0.10591623932123184, + "step": 8450 + }, + { + "epoch": 0.55, + "learning_rate": 2.4694527400084546e-06, + "logits/chosen": -2.2096364498138428, + "logits/rejected": -2.1173431873321533, + "logps/chosen": -234.0587615966797, + "logps/rejected": -243.300048828125, + "loss": 0.0249, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.059070538729429245, + "rewards/margins": 0.0754295364022255, + "rewards/rejected": -0.13450007140636444, + "step": 8460 + }, + { + "epoch": 0.55, + "learning_rate": 2.4637433406005607e-06, + "logits/chosen": -2.410126209259033, + "logits/rejected": -2.3014655113220215, + "logps/chosen": -324.4518127441406, + "logps/rejected": -303.3929748535156, + "loss": 0.023, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07362736016511917, + "rewards/margins": 0.034921690821647644, + "rewards/rejected": -0.10854904353618622, + "step": 8470 + }, + { + "epoch": 0.55, + "learning_rate": 2.4580341303251628e-06, + "logits/chosen": -2.2237625122070312, + "logits/rejected": -1.9655005931854248, + "logps/chosen": -270.9044189453125, + "logps/rejected": -251.58609008789062, + "loss": 0.032, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.054766081273555756, + "rewards/margins": 0.08540228009223938, + "rewards/rejected": -0.14016836881637573, + "step": 8480 + }, + { + "epoch": 0.56, + "learning_rate": 2.4523251389642984e-06, + "logits/chosen": -2.1201000213623047, + "logits/rejected": -1.9871246814727783, + "logps/chosen": -269.6641845703125, + "logps/rejected": -253.3271026611328, + "loss": 0.0413, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0755244717001915, + "rewards/margins": 0.10400693118572235, + "rewards/rejected": -0.17953141033649445, + "step": 8490 + }, + { + "epoch": 0.56, + "learning_rate": 2.4466163962988626e-06, + "logits/chosen": -2.437121629714966, + "logits/rejected": -2.0692362785339355, + "logps/chosen": -293.087646484375, + "logps/rejected": -214.70263671875, + "loss": 0.0323, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.06243325024843216, + "rewards/margins": 0.10167159885168076, + "rewards/rejected": -0.16410483419895172, + "step": 8500 + }, + { + "epoch": 0.56, + "eval_logits/chosen": -2.235217571258545, + "eval_logits/rejected": -2.0510294437408447, + "eval_logps/chosen": -244.1342315673828, + "eval_logps/rejected": -241.17881774902344, + "eval_loss": 0.024714848026633263, + "eval_rewards/accuracies": 0.6589999794960022, + "eval_rewards/chosen": -0.06064639613032341, + "eval_rewards/margins": 0.08718820661306381, + "eval_rewards/rejected": -0.14783459901809692, + "eval_runtime": 714.7546, + "eval_samples_per_second": 2.798, + "eval_steps_per_second": 1.399, + "step": 8500 + }, + { + "epoch": 0.56, + "learning_rate": 2.4409079321084543e-06, + "logits/chosen": -2.1872916221618652, + "logits/rejected": -2.2446069717407227, + "logps/chosen": -222.677490234375, + "logps/rejected": -270.8872985839844, + "loss": 0.0329, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03778903931379318, + "rewards/margins": 0.09135451167821884, + "rewards/rejected": -0.12914356589317322, + "step": 8510 + }, + { + "epoch": 0.56, + "learning_rate": 2.4351997761712184e-06, + "logits/chosen": -2.450037956237793, + "logits/rejected": -1.995489478111267, + "logps/chosen": -255.88497924804688, + "logps/rejected": -208.86666870117188, + "loss": 0.0117, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05248330906033516, + "rewards/margins": 0.09843796491622925, + "rewards/rejected": -0.1509212702512741, + "step": 8520 + }, + { + "epoch": 0.56, + "learning_rate": 2.4294919582636933e-06, + "logits/chosen": -2.240788221359253, + "logits/rejected": -2.096522092819214, + "logps/chosen": -218.12100219726562, + "logps/rejected": -224.65902709960938, + "loss": 0.0286, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.035092391073703766, + "rewards/margins": 0.09188707917928696, + "rewards/rejected": -0.12697947025299072, + "step": 8530 + }, + { + "epoch": 0.56, + "learning_rate": 2.423784508160652e-06, + "logits/chosen": -2.3209753036499023, + "logits/rejected": -2.0700995922088623, + "logps/chosen": -269.3289489746094, + "logps/rejected": -236.85446166992188, + "loss": 0.0144, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.07049113512039185, + "rewards/margins": 0.07681427896022797, + "rewards/rejected": -0.14730539917945862, + "step": 8540 + }, + { + "epoch": 0.56, + "learning_rate": 2.418077455634951e-06, + "logits/chosen": -2.1461005210876465, + "logits/rejected": -2.193398952484131, + "logps/chosen": -231.21365356445312, + "logps/rejected": -268.70916748046875, + "loss": 0.0164, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07321284711360931, + "rewards/margins": 0.049691587686538696, + "rewards/rejected": -0.12290443480014801, + "step": 8550 + }, + { + "epoch": 0.56, + "learning_rate": 2.4123708304573714e-06, + "logits/chosen": -2.339146852493286, + "logits/rejected": -2.191551446914673, + "logps/chosen": -300.6154479980469, + "logps/rejected": -300.11444091796875, + "loss": 0.034, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05424055457115173, + "rewards/margins": 0.08098848909139633, + "rewards/rejected": -0.13522902131080627, + "step": 8560 + }, + { + "epoch": 0.56, + "learning_rate": 2.406664662396465e-06, + "logits/chosen": -2.1146233081817627, + "logits/rejected": -1.964341163635254, + "logps/chosen": -203.61959838867188, + "logps/rejected": -201.922607421875, + "loss": 0.0154, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.09618864208459854, + "rewards/margins": 0.06992589682340622, + "rewards/rejected": -0.16611452400684357, + "step": 8570 + }, + { + "epoch": 0.56, + "learning_rate": 2.4009589812184012e-06, + "logits/chosen": -2.2772903442382812, + "logits/rejected": -1.8964955806732178, + "logps/chosen": -215.86569213867188, + "logps/rejected": -179.8876953125, + "loss": 0.0142, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06322894990444183, + "rewards/margins": 0.08566492795944214, + "rewards/rejected": -0.14889389276504517, + "step": 8580 + }, + { + "epoch": 0.56, + "learning_rate": 2.3952538166868073e-06, + "logits/chosen": -2.0397861003875732, + "logits/rejected": -2.1017701625823975, + "logps/chosen": -232.49710083007812, + "logps/rejected": -240.0894317626953, + "loss": 0.0268, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.07475374639034271, + "rewards/margins": 0.12092749774456024, + "rewards/rejected": -0.19568124413490295, + "step": 8590 + }, + { + "epoch": 0.56, + "learning_rate": 2.389549198562616e-06, + "logits/chosen": -2.240582227706909, + "logits/rejected": -1.814883828163147, + "logps/chosen": -238.9700469970703, + "logps/rejected": -230.742919921875, + "loss": 0.0178, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06444202363491058, + "rewards/margins": 0.10876522958278656, + "rewards/rejected": -0.17320728302001953, + "step": 8600 + }, + { + "epoch": 0.56, + "eval_logits/chosen": -2.245410203933716, + "eval_logits/rejected": -2.0606565475463867, + "eval_logps/chosen": -245.94483947753906, + "eval_logps/rejected": -243.05995178222656, + "eval_loss": 0.024543337523937225, + "eval_rewards/accuracies": 0.6610000133514404, + "eval_rewards/chosen": -0.06969940662384033, + "eval_rewards/margins": 0.08754073828458786, + "eval_rewards/rejected": -0.1572401374578476, + "eval_runtime": 712.5003, + "eval_samples_per_second": 2.807, + "eval_steps_per_second": 1.404, + "step": 8600 + }, + { + "epoch": 0.56, + "learning_rate": 2.3838451566039098e-06, + "logits/chosen": -2.270934581756592, + "logits/rejected": -2.096938133239746, + "logps/chosen": -255.7738037109375, + "logps/rejected": -253.7856903076172, + "loss": 0.0296, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08631912618875504, + "rewards/margins": 0.040729813277721405, + "rewards/rejected": -0.12704893946647644, + "step": 8610 + }, + { + "epoch": 0.56, + "learning_rate": 2.3781417205657662e-06, + "logits/chosen": -2.2661519050598145, + "logits/rejected": -1.9691057205200195, + "logps/chosen": -211.8663330078125, + "logps/rejected": -189.75233459472656, + "loss": 0.048, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.07054928690195084, + "rewards/margins": 0.08046600222587585, + "rewards/rejected": -0.1510152816772461, + "step": 8620 + }, + { + "epoch": 0.56, + "learning_rate": 2.3724389202001006e-06, + "logits/chosen": -2.2923483848571777, + "logits/rejected": -2.0269789695739746, + "logps/chosen": -218.217529296875, + "logps/rejected": -207.7074737548828, + "loss": 0.0202, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07947049289941788, + "rewards/margins": 0.05593295767903328, + "rewards/rejected": -0.13540346920490265, + "step": 8630 + }, + { + "epoch": 0.57, + "learning_rate": 2.366736785255514e-06, + "logits/chosen": -2.1759963035583496, + "logits/rejected": -2.105229139328003, + "logps/chosen": -216.8457489013672, + "logps/rejected": -219.7010498046875, + "loss": 0.0184, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0905846655368805, + "rewards/margins": 0.0703764483332634, + "rewards/rejected": -0.1609611064195633, + "step": 8640 + }, + { + "epoch": 0.57, + "learning_rate": 2.3610353454771355e-06, + "logits/chosen": -2.075446605682373, + "logits/rejected": -2.004032850265503, + "logps/chosen": -204.26815795898438, + "logps/rejected": -202.31051635742188, + "loss": 0.0396, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.07160364091396332, + "rewards/margins": 0.0716845691204071, + "rewards/rejected": -0.14328821003437042, + "step": 8650 + }, + { + "epoch": 0.57, + "learning_rate": 2.355334630606467e-06, + "logits/chosen": -2.4558193683624268, + "logits/rejected": -1.9807491302490234, + "logps/chosen": -254.63613891601562, + "logps/rejected": -205.7122039794922, + "loss": 0.0128, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07172928750514984, + "rewards/margins": 0.07310833036899567, + "rewards/rejected": -0.1448376178741455, + "step": 8660 + }, + { + "epoch": 0.57, + "learning_rate": 2.349634670381231e-06, + "logits/chosen": -2.062732696533203, + "logits/rejected": -2.0113823413848877, + "logps/chosen": -223.168701171875, + "logps/rejected": -247.4897918701172, + "loss": 0.0395, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0798480361700058, + "rewards/margins": 0.07261226326227188, + "rewards/rejected": -0.15246029198169708, + "step": 8670 + }, + { + "epoch": 0.57, + "learning_rate": 2.3439354945352104e-06, + "logits/chosen": -2.3090953826904297, + "logits/rejected": -2.245356559753418, + "logps/chosen": -259.2715148925781, + "logps/rejected": -221.3633575439453, + "loss": 0.066, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.07873961329460144, + "rewards/margins": 0.032447971403598785, + "rewards/rejected": -0.11118757724761963, + "step": 8680 + }, + { + "epoch": 0.57, + "learning_rate": 2.3382371327981e-06, + "logits/chosen": -2.170595645904541, + "logits/rejected": -2.1648993492126465, + "logps/chosen": -243.6365203857422, + "logps/rejected": -248.4460906982422, + "loss": 0.0252, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06080128625035286, + "rewards/margins": 0.0955219715833664, + "rewards/rejected": -0.15632323920726776, + "step": 8690 + }, + { + "epoch": 0.57, + "learning_rate": 2.3325396148953456e-06, + "logits/chosen": -2.034379482269287, + "logits/rejected": -2.143871784210205, + "logps/chosen": -190.2666473388672, + "logps/rejected": -265.8991394042969, + "loss": 0.0473, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.09482596069574356, + "rewards/margins": 0.10751118510961533, + "rewards/rejected": -0.2023371458053589, + "step": 8700 + }, + { + "epoch": 0.57, + "eval_logits/chosen": -2.2517881393432617, + "eval_logits/rejected": -2.0663349628448486, + "eval_logps/chosen": -245.9043426513672, + "eval_logps/rejected": -242.30227661132812, + "eval_loss": 0.024720149114727974, + "eval_rewards/accuracies": 0.656499981880188, + "eval_rewards/chosen": -0.06949705630540848, + "eval_rewards/margins": 0.08395478129386902, + "eval_rewards/rejected": -0.1534518301486969, + "eval_runtime": 712.8179, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 8700 + }, + { + "epoch": 0.57, + "learning_rate": 2.3268429705479915e-06, + "logits/chosen": -2.4342236518859863, + "logits/rejected": -2.078758716583252, + "logps/chosen": -238.2794189453125, + "logps/rejected": -214.2716064453125, + "loss": 0.0192, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.06733395159244537, + "rewards/margins": 0.08655620366334915, + "rewards/rejected": -0.15389014780521393, + "step": 8710 + }, + { + "epoch": 0.57, + "learning_rate": 2.3211472294725248e-06, + "logits/chosen": -2.279585599899292, + "logits/rejected": -2.1437907218933105, + "logps/chosen": -225.87057495117188, + "logps/rejected": -233.3922882080078, + "loss": 0.0289, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04727703332901001, + "rewards/margins": 0.1009441465139389, + "rewards/rejected": -0.1482211798429489, + "step": 8720 + }, + { + "epoch": 0.57, + "learning_rate": 2.315452421380721e-06, + "logits/chosen": -2.160231113433838, + "logits/rejected": -1.7129781246185303, + "logps/chosen": -268.67547607421875, + "logps/rejected": -238.1331024169922, + "loss": 0.0218, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.068655826151371, + "rewards/margins": 0.08725351095199585, + "rewards/rejected": -0.15590932965278625, + "step": 8730 + }, + { + "epoch": 0.57, + "learning_rate": 2.3097585759794886e-06, + "logits/chosen": -2.252720832824707, + "logits/rejected": -1.8722463846206665, + "logps/chosen": -263.8703308105469, + "logps/rejected": -228.2633056640625, + "loss": 0.0213, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04764527827501297, + "rewards/margins": 0.1389247328042984, + "rewards/rejected": -0.18657000362873077, + "step": 8740 + }, + { + "epoch": 0.57, + "learning_rate": 2.3040657229707155e-06, + "logits/chosen": -2.262620210647583, + "logits/rejected": -2.1514811515808105, + "logps/chosen": -183.5465850830078, + "logps/rejected": -213.42861938476562, + "loss": 0.0202, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0574948713183403, + "rewards/margins": 0.09163203835487366, + "rewards/rejected": -0.14912688732147217, + "step": 8750 + }, + { + "epoch": 0.57, + "learning_rate": 2.2983738920511104e-06, + "logits/chosen": -2.420673370361328, + "logits/rejected": -1.949000597000122, + "logps/chosen": -275.28204345703125, + "logps/rejected": -241.54721069335938, + "loss": 0.0235, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.027445774525403976, + "rewards/margins": 0.08283614367246628, + "rewards/rejected": -0.11028194427490234, + "step": 8760 + }, + { + "epoch": 0.57, + "learning_rate": 2.2926831129120523e-06, + "logits/chosen": -2.072657346725464, + "logits/rejected": -2.0243425369262695, + "logps/chosen": -245.273193359375, + "logps/rejected": -228.0929718017578, + "loss": 0.0221, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04740776866674423, + "rewards/margins": 0.055877458304166794, + "rewards/rejected": -0.10328521579504013, + "step": 8770 + }, + { + "epoch": 0.57, + "learning_rate": 2.2869934152394323e-06, + "logits/chosen": -2.268575668334961, + "logits/rejected": -2.009955883026123, + "logps/chosen": -283.28582763671875, + "logps/rejected": -244.081787109375, + "loss": 0.0299, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.07160186767578125, + "rewards/margins": 0.08199025690555573, + "rewards/rejected": -0.15359210968017578, + "step": 8780 + }, + { + "epoch": 0.58, + "learning_rate": 2.281304828713501e-06, + "logits/chosen": -2.165152072906494, + "logits/rejected": -2.090395212173462, + "logps/chosen": -247.3802947998047, + "logps/rejected": -253.34701538085938, + "loss": 0.0257, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0719650387763977, + "rewards/margins": 0.06734617799520493, + "rewards/rejected": -0.13931122422218323, + "step": 8790 + }, + { + "epoch": 0.58, + "learning_rate": 2.275617383008711e-06, + "logits/chosen": -2.2487900257110596, + "logits/rejected": -2.1492092609405518, + "logps/chosen": -248.94287109375, + "logps/rejected": -257.15435791015625, + "loss": 0.0302, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06727709621191025, + "rewards/margins": 0.05439624935388565, + "rewards/rejected": -0.1216733306646347, + "step": 8800 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -2.244839906692505, + "eval_logits/rejected": -2.059295177459717, + "eval_logps/chosen": -241.635009765625, + "eval_logps/rejected": -237.9780731201172, + "eval_loss": 0.024881817400455475, + "eval_rewards/accuracies": 0.6610000133514404, + "eval_rewards/chosen": -0.04815037548542023, + "eval_rewards/margins": 0.08368047326803207, + "eval_rewards/rejected": -0.1318308562040329, + "eval_runtime": 714.7964, + "eval_samples_per_second": 2.798, + "eval_steps_per_second": 1.399, + "step": 8800 + }, + { + "epoch": 0.58, + "learning_rate": 2.269931107793567e-06, + "logits/chosen": -2.1685874462127686, + "logits/rejected": -2.078900098800659, + "logps/chosen": -219.8336944580078, + "logps/rejected": -240.2872772216797, + "loss": 0.0254, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03776669502258301, + "rewards/margins": 0.06180558353662491, + "rewards/rejected": -0.09957227855920792, + "step": 8810 + }, + { + "epoch": 0.58, + "learning_rate": 2.2642460327304655e-06, + "logits/chosen": -2.1021993160247803, + "logits/rejected": -2.151242256164551, + "logps/chosen": -254.8282928466797, + "logps/rejected": -253.4233856201172, + "loss": 0.0213, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.05134277418255806, + "rewards/margins": 0.0722130537033081, + "rewards/rejected": -0.12355582416057587, + "step": 8820 + }, + { + "epoch": 0.58, + "learning_rate": 2.258562187475543e-06, + "logits/chosen": -2.0955007076263428, + "logits/rejected": -2.0419440269470215, + "logps/chosen": -237.06796264648438, + "logps/rejected": -215.91860961914062, + "loss": 0.0155, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05060536786913872, + "rewards/margins": 0.08181245625019073, + "rewards/rejected": -0.13241782784461975, + "step": 8830 + }, + { + "epoch": 0.58, + "learning_rate": 2.2528796016785196e-06, + "logits/chosen": -2.134673595428467, + "logits/rejected": -1.9623810052871704, + "logps/chosen": -200.1417694091797, + "logps/rejected": -232.51611328125, + "loss": 0.0249, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.051119185984134674, + "rewards/margins": 0.11354148387908936, + "rewards/rejected": -0.16466066241264343, + "step": 8840 + }, + { + "epoch": 0.58, + "learning_rate": 2.247198304982548e-06, + "logits/chosen": -2.17331600189209, + "logits/rejected": -1.9789447784423828, + "logps/chosen": -168.8157501220703, + "logps/rejected": -181.06729125976562, + "loss": 0.0251, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.034221209585666656, + "rewards/margins": 0.08100482821464539, + "rewards/rejected": -0.11522604525089264, + "step": 8850 + }, + { + "epoch": 0.58, + "learning_rate": 2.2415183270240533e-06, + "logits/chosen": -2.458310604095459, + "logits/rejected": -2.2174506187438965, + "logps/chosen": -208.83425903320312, + "logps/rejected": -231.552001953125, + "loss": 0.0448, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04752310365438461, + "rewards/margins": 0.0920606404542923, + "rewards/rejected": -0.1395837366580963, + "step": 8860 + }, + { + "epoch": 0.58, + "learning_rate": 2.2358396974325837e-06, + "logits/chosen": -2.2556686401367188, + "logits/rejected": -2.0556740760803223, + "logps/chosen": -250.03115844726562, + "logps/rejected": -243.87911987304688, + "loss": 0.035, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03881791979074478, + "rewards/margins": 0.10057507455348969, + "rewards/rejected": -0.13939300179481506, + "step": 8870 + }, + { + "epoch": 0.58, + "learning_rate": 2.2301624458306525e-06, + "logits/chosen": -2.3523645401000977, + "logits/rejected": -2.0720772743225098, + "logps/chosen": -273.26007080078125, + "logps/rejected": -234.77783203125, + "loss": 0.0221, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06955992430448532, + "rewards/margins": 0.06429468095302582, + "rewards/rejected": -0.13385462760925293, + "step": 8880 + }, + { + "epoch": 0.58, + "learning_rate": 2.2244866018335855e-06, + "logits/chosen": -2.220266819000244, + "logits/rejected": -2.207763671875, + "logps/chosen": -227.5157928466797, + "logps/rejected": -256.3835144042969, + "loss": 0.0308, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.05229061841964722, + "rewards/margins": 0.07230345159769058, + "rewards/rejected": -0.1245940700173378, + "step": 8890 + }, + { + "epoch": 0.58, + "learning_rate": 2.2188121950493648e-06, + "logits/chosen": -2.357114791870117, + "logits/rejected": -1.9857616424560547, + "logps/chosen": -233.6726837158203, + "logps/rejected": -167.13748168945312, + "loss": 0.0391, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07464625686407089, + "rewards/margins": 0.05858578532934189, + "rewards/rejected": -0.13323204219341278, + "step": 8900 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -2.252225399017334, + "eval_logits/rejected": -2.0657691955566406, + "eval_logps/chosen": -244.75286865234375, + "eval_logps/rejected": -242.57672119140625, + "eval_loss": 0.02482656203210354, + "eval_rewards/accuracies": 0.6620000004768372, + "eval_rewards/chosen": -0.06373953819274902, + "eval_rewards/margins": 0.09108465164899826, + "eval_rewards/rejected": -0.15482419729232788, + "eval_runtime": 715.6883, + "eval_samples_per_second": 2.795, + "eval_steps_per_second": 1.397, + "step": 8900 + }, + { + "epoch": 0.58, + "learning_rate": 2.2131392550784766e-06, + "logits/chosen": -2.3761825561523438, + "logits/rejected": -1.8096641302108765, + "logps/chosen": -298.76287841796875, + "logps/rejected": -220.29977416992188, + "loss": 0.0231, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06942112743854523, + "rewards/margins": 0.08550569415092468, + "rewards/rejected": -0.1549268215894699, + "step": 8910 + }, + { + "epoch": 0.58, + "learning_rate": 2.2074678115137533e-06, + "logits/chosen": -2.0481929779052734, + "logits/rejected": -1.9484479427337646, + "logps/chosen": -207.98080444335938, + "logps/rejected": -239.3912353515625, + "loss": 0.0285, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06631931662559509, + "rewards/margins": 0.13246922194957733, + "rewards/rejected": -0.19878853857517242, + "step": 8920 + }, + { + "epoch": 0.58, + "learning_rate": 2.201797893940224e-06, + "logits/chosen": -2.143235921859741, + "logits/rejected": -1.944832444190979, + "logps/chosen": -245.1650390625, + "logps/rejected": -282.40557861328125, + "loss": 0.0125, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.059938348829746246, + "rewards/margins": 0.08444986492395401, + "rewards/rejected": -0.14438821375370026, + "step": 8930 + }, + { + "epoch": 0.58, + "learning_rate": 2.196129531934956e-06, + "logits/chosen": -2.189138889312744, + "logits/rejected": -1.9247684478759766, + "logps/chosen": -246.6875, + "logps/rejected": -246.45431518554688, + "loss": 0.0124, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04386736825108528, + "rewards/margins": 0.08914701640605927, + "rewards/rejected": -0.13301438093185425, + "step": 8940 + }, + { + "epoch": 0.59, + "learning_rate": 2.190462755066902e-06, + "logits/chosen": -2.207003593444824, + "logits/rejected": -1.9766992330551147, + "logps/chosen": -278.06781005859375, + "logps/rejected": -265.1041564941406, + "loss": 0.0133, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07833322882652283, + "rewards/margins": 0.0626354068517685, + "rewards/rejected": -0.14096863567829132, + "step": 8950 + }, + { + "epoch": 0.59, + "learning_rate": 2.184797592896746e-06, + "logits/chosen": -2.334012985229492, + "logits/rejected": -2.29227876663208, + "logps/chosen": -243.9264373779297, + "logps/rejected": -234.7732391357422, + "loss": 0.0115, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04745422303676605, + "rewards/margins": 0.08689974248409271, + "rewards/rejected": -0.13435396552085876, + "step": 8960 + }, + { + "epoch": 0.59, + "learning_rate": 2.17913407497675e-06, + "logits/chosen": -2.2696456909179688, + "logits/rejected": -2.334486961364746, + "logps/chosen": -184.894287109375, + "logps/rejected": -233.4813690185547, + "loss": 0.0531, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.03256151080131531, + "rewards/margins": 0.07317936420440674, + "rewards/rejected": -0.10574086755514145, + "step": 8970 + }, + { + "epoch": 0.59, + "learning_rate": 2.173472230850596e-06, + "logits/chosen": -2.382445812225342, + "logits/rejected": -2.188955307006836, + "logps/chosen": -213.95181274414062, + "logps/rejected": -181.55081176757812, + "loss": 0.0555, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.06844629347324371, + "rewards/margins": 0.04021351411938667, + "rewards/rejected": -0.10865980386734009, + "step": 8980 + }, + { + "epoch": 0.59, + "learning_rate": 2.1678120900532375e-06, + "logits/chosen": -2.349627733230591, + "logits/rejected": -2.056666851043701, + "logps/chosen": -248.0489501953125, + "logps/rejected": -237.8482666015625, + "loss": 0.0268, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06067051738500595, + "rewards/margins": 0.09529812633991241, + "rewards/rejected": -0.15596863627433777, + "step": 8990 + }, + { + "epoch": 0.59, + "learning_rate": 2.1621536821107412e-06, + "logits/chosen": -2.244640827178955, + "logits/rejected": -2.11474609375, + "logps/chosen": -210.2344512939453, + "logps/rejected": -187.39842224121094, + "loss": 0.0377, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03832307457923889, + "rewards/margins": 0.09598144143819809, + "rewards/rejected": -0.13430452346801758, + "step": 9000 + }, + { + "epoch": 0.59, + "eval_logits/chosen": -2.261298418045044, + "eval_logits/rejected": -2.074450731277466, + "eval_logps/chosen": -239.097412109375, + "eval_logps/rejected": -235.3852996826172, + "eval_loss": 0.024586597457528114, + "eval_rewards/accuracies": 0.6575000286102295, + "eval_rewards/chosen": -0.03546227514743805, + "eval_rewards/margins": 0.08340466767549515, + "eval_rewards/rejected": -0.11886695772409439, + "eval_runtime": 710.489, + "eval_samples_per_second": 2.815, + "eval_steps_per_second": 1.407, + "step": 9000 + }, + { + "epoch": 0.59, + "learning_rate": 2.1564970365401346e-06, + "logits/chosen": -2.3016772270202637, + "logits/rejected": -2.036717653274536, + "logps/chosen": -191.85366821289062, + "logps/rejected": -171.2881317138672, + "loss": 0.0267, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.038512926548719406, + "rewards/margins": 0.08721283823251724, + "rewards/rejected": -0.12572576105594635, + "step": 9010 + }, + { + "epoch": 0.59, + "learning_rate": 2.1508421828492527e-06, + "logits/chosen": -2.4234752655029297, + "logits/rejected": -2.0773463249206543, + "logps/chosen": -228.48855590820312, + "logps/rejected": -175.48695373535156, + "loss": 0.0359, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.016479194164276123, + "rewards/margins": 0.07619353383779526, + "rewards/rejected": -0.09267272800207138, + "step": 9020 + }, + { + "epoch": 0.59, + "learning_rate": 2.145189150536582e-06, + "logits/chosen": -2.104311466217041, + "logits/rejected": -2.0074868202209473, + "logps/chosen": -226.5256805419922, + "logps/rejected": -189.82669067382812, + "loss": 0.0397, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0217808298766613, + "rewards/margins": 0.06351622194051743, + "rewards/rejected": -0.08529704809188843, + "step": 9030 + }, + { + "epoch": 0.59, + "learning_rate": 2.139537969091107e-06, + "logits/chosen": -2.132573366165161, + "logits/rejected": -2.097846508026123, + "logps/chosen": -273.43206787109375, + "logps/rejected": -218.9208526611328, + "loss": 0.0275, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03924193233251572, + "rewards/margins": 0.038296617567539215, + "rewards/rejected": -0.07753854990005493, + "step": 9040 + }, + { + "epoch": 0.59, + "learning_rate": 2.1338886679921603e-06, + "logits/chosen": -2.211646795272827, + "logits/rejected": -2.1421194076538086, + "logps/chosen": -247.41549682617188, + "logps/rejected": -239.46170043945312, + "loss": 0.0306, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029362160712480545, + "rewards/margins": 0.06228286027908325, + "rewards/rejected": -0.0916450172662735, + "step": 9050 + }, + { + "epoch": 0.59, + "learning_rate": 2.128241276709263e-06, + "logits/chosen": -2.2989306449890137, + "logits/rejected": -2.2443785667419434, + "logps/chosen": -204.4984588623047, + "logps/rejected": -238.19253540039062, + "loss": 0.0304, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.006183523219078779, + "rewards/margins": 0.068283312022686, + "rewards/rejected": -0.07446683943271637, + "step": 9060 + }, + { + "epoch": 0.59, + "learning_rate": 2.1225958247019746e-06, + "logits/chosen": -2.3331146240234375, + "logits/rejected": -2.462541103363037, + "logps/chosen": -191.1440887451172, + "logps/rejected": -221.80014038085938, + "loss": 0.0138, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02001136541366577, + "rewards/margins": 0.05258213356137276, + "rewards/rejected": -0.07259351015090942, + "step": 9070 + }, + { + "epoch": 0.59, + "learning_rate": 2.1169523414197383e-06, + "logits/chosen": -2.11511492729187, + "logits/rejected": -2.1142194271087646, + "logps/chosen": -203.34683227539062, + "logps/rejected": -236.80703735351562, + "loss": 0.0143, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.022705694660544395, + "rewards/margins": 0.05069264769554138, + "rewards/rejected": -0.07339833676815033, + "step": 9080 + }, + { + "epoch": 0.59, + "learning_rate": 2.1113108563017267e-06, + "logits/chosen": -2.1969635486602783, + "logits/rejected": -1.9869884252548218, + "logps/chosen": -224.9974822998047, + "logps/rejected": -210.8538055419922, + "loss": 0.0251, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.061071284115314484, + "rewards/margins": 0.0824274942278862, + "rewards/rejected": -0.14349877834320068, + "step": 9090 + }, + { + "epoch": 0.6, + "learning_rate": 2.1056713987766905e-06, + "logits/chosen": -2.407088041305542, + "logits/rejected": -2.0729117393493652, + "logps/chosen": -222.61288452148438, + "logps/rejected": -190.90902709960938, + "loss": 0.0296, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03706289082765579, + "rewards/margins": 0.08505786955356598, + "rewards/rejected": -0.12212076038122177, + "step": 9100 + }, + { + "epoch": 0.6, + "eval_logits/chosen": -2.2747437953948975, + "eval_logits/rejected": -2.087127208709717, + "eval_logps/chosen": -239.7537078857422, + "eval_logps/rejected": -234.9412078857422, + "eval_loss": 0.024893444031476974, + "eval_rewards/accuracies": 0.6549999713897705, + "eval_rewards/chosen": -0.03874371945858002, + "eval_rewards/margins": 0.07790277898311615, + "eval_rewards/rejected": -0.11664648354053497, + "eval_runtime": 713.9364, + "eval_samples_per_second": 2.801, + "eval_steps_per_second": 1.401, + "step": 9100 + }, + { + "epoch": 0.6, + "learning_rate": 2.1000339982628022e-06, + "logits/chosen": -2.072683334350586, + "logits/rejected": -2.154127597808838, + "logps/chosen": -262.1419372558594, + "logps/rejected": -240.17245483398438, + "loss": 0.0208, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05434552580118179, + "rewards/margins": 0.06175314635038376, + "rewards/rejected": -0.11609866470098495, + "step": 9110 + }, + { + "epoch": 0.6, + "learning_rate": 2.0943986841675043e-06, + "logits/chosen": -2.290806293487549, + "logits/rejected": -2.060344696044922, + "logps/chosen": -209.92916870117188, + "logps/rejected": -210.94656372070312, + "loss": 0.012, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03359084203839302, + "rewards/margins": 0.1017860397696495, + "rewards/rejected": -0.13537687063217163, + "step": 9120 + }, + { + "epoch": 0.6, + "learning_rate": 2.088765485887356e-06, + "logits/chosen": -2.2724077701568604, + "logits/rejected": -2.0708746910095215, + "logps/chosen": -252.2576141357422, + "logps/rejected": -218.6079559326172, + "loss": 0.0253, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03771350532770157, + "rewards/margins": 0.039878927171230316, + "rewards/rejected": -0.07759243249893188, + "step": 9130 + }, + { + "epoch": 0.6, + "learning_rate": 2.083134432807879e-06, + "logits/chosen": -2.2039096355438232, + "logits/rejected": -2.1305642127990723, + "logps/chosen": -206.875244140625, + "logps/rejected": -245.89584350585938, + "loss": 0.0208, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06301550567150116, + "rewards/margins": 0.08751269429922104, + "rewards/rejected": -0.1505282074213028, + "step": 9140 + }, + { + "epoch": 0.6, + "learning_rate": 2.077505554303404e-06, + "logits/chosen": -2.2766964435577393, + "logits/rejected": -2.2406208515167236, + "logps/chosen": -176.80477905273438, + "logps/rejected": -193.82846069335938, + "loss": 0.0159, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.011263975873589516, + "rewards/margins": 0.06328533589839935, + "rewards/rejected": -0.07454931735992432, + "step": 9150 + }, + { + "epoch": 0.6, + "learning_rate": 2.071878879736918e-06, + "logits/chosen": -2.2843353748321533, + "logits/rejected": -2.0918726921081543, + "logps/chosen": -257.02337646484375, + "logps/rejected": -349.31024169921875, + "loss": 0.0241, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05091879516839981, + "rewards/margins": 0.07001566886901855, + "rewards/rejected": -0.12093446403741837, + "step": 9160 + }, + { + "epoch": 0.6, + "learning_rate": 2.0662544384599136e-06, + "logits/chosen": -2.1885151863098145, + "logits/rejected": -2.1158032417297363, + "logps/chosen": -206.37759399414062, + "logps/rejected": -206.095703125, + "loss": 0.0314, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.021354788914322853, + "rewards/margins": 0.0810171440243721, + "rewards/rejected": -0.1023719310760498, + "step": 9170 + }, + { + "epoch": 0.6, + "learning_rate": 2.0606322598122314e-06, + "logits/chosen": -2.1867973804473877, + "logits/rejected": -2.303119421005249, + "logps/chosen": -194.89300537109375, + "logps/rejected": -221.6018829345703, + "loss": 0.0151, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.041897259652614594, + "rewards/margins": 0.035239990800619125, + "rewards/rejected": -0.07713725417852402, + "step": 9180 + }, + { + "epoch": 0.6, + "learning_rate": 2.0550123731219085e-06, + "logits/chosen": -2.4753687381744385, + "logits/rejected": -2.300013303756714, + "logps/chosen": -258.02777099609375, + "logps/rejected": -230.7709197998047, + "loss": 0.0252, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.024239610880613327, + "rewards/margins": 0.06944013386964798, + "rewards/rejected": -0.09367974102497101, + "step": 9190 + }, + { + "epoch": 0.6, + "learning_rate": 2.0493948077050267e-06, + "logits/chosen": -2.128877639770508, + "logits/rejected": -1.9453260898590088, + "logps/chosen": -199.78512573242188, + "logps/rejected": -198.4648895263672, + "loss": 0.0241, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03264569118618965, + "rewards/margins": 0.08384671807289124, + "rewards/rejected": -0.11649241298437119, + "step": 9200 + }, + { + "epoch": 0.6, + "eval_logits/chosen": -2.2942728996276855, + "eval_logits/rejected": -2.1060142517089844, + "eval_logps/chosen": -239.1661376953125, + "eval_logps/rejected": -233.8347930908203, + "eval_loss": 0.025179192423820496, + "eval_rewards/accuracies": 0.6575000286102295, + "eval_rewards/chosen": -0.03580596297979355, + "eval_rewards/margins": 0.07530846446752548, + "eval_rewards/rejected": -0.11111443489789963, + "eval_runtime": 713.7289, + "eval_samples_per_second": 2.802, + "eval_steps_per_second": 1.401, + "step": 9200 + }, + { + "epoch": 0.6, + "learning_rate": 2.0437795928655596e-06, + "logits/chosen": -2.318690538406372, + "logits/rejected": -2.2952334880828857, + "logps/chosen": -288.6143493652344, + "logps/rejected": -281.8822326660156, + "loss": 0.0261, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03450857847929001, + "rewards/margins": 0.05725806951522827, + "rewards/rejected": -0.09176664799451828, + "step": 9210 + }, + { + "epoch": 0.6, + "learning_rate": 2.0381667578952184e-06, + "logits/chosen": -2.395092010498047, + "logits/rejected": -2.1757171154022217, + "logps/chosen": -219.50790405273438, + "logps/rejected": -246.2615966796875, + "loss": 0.0494, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04317527264356613, + "rewards/margins": 0.10568326711654663, + "rewards/rejected": -0.14885856211185455, + "step": 9220 + }, + { + "epoch": 0.6, + "learning_rate": 2.0325563320732995e-06, + "logits/chosen": -2.4702086448669434, + "logits/rejected": -2.082486629486084, + "logps/chosen": -271.30841064453125, + "logps/rejected": -244.59213256835938, + "loss": 0.0197, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03985026478767395, + "rewards/margins": 0.08650810271501541, + "rewards/rejected": -0.12635836005210876, + "step": 9230 + }, + { + "epoch": 0.6, + "learning_rate": 2.026948344666532e-06, + "logits/chosen": -2.2059807777404785, + "logits/rejected": -2.1636924743652344, + "logps/chosen": -211.00961303710938, + "logps/rejected": -226.2769012451172, + "loss": 0.0311, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.05941852927207947, + "rewards/margins": 0.07855961471796036, + "rewards/rejected": -0.13797815144062042, + "step": 9240 + }, + { + "epoch": 0.61, + "learning_rate": 2.0213428249289257e-06, + "logits/chosen": -2.1782264709472656, + "logits/rejected": -2.0983102321624756, + "logps/chosen": -207.1298828125, + "logps/rejected": -227.71847534179688, + "loss": 0.0269, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.042946770787239075, + "rewards/margins": 0.09390541166067123, + "rewards/rejected": -0.1368521898984909, + "step": 9250 + }, + { + "epoch": 0.61, + "learning_rate": 2.0157398021016175e-06, + "logits/chosen": -2.191451072692871, + "logits/rejected": -2.1529908180236816, + "logps/chosen": -153.64662170410156, + "logps/rejected": -216.68057250976562, + "loss": 0.0254, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.030556291341781616, + "rewards/margins": 0.08414594829082489, + "rewards/rejected": -0.1147022470831871, + "step": 9260 + }, + { + "epoch": 0.61, + "learning_rate": 2.010139305412719e-06, + "logits/chosen": -2.468810558319092, + "logits/rejected": -2.2477877140045166, + "logps/chosen": -286.96978759765625, + "logps/rejected": -258.82525634765625, + "loss": 0.0128, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.051870543509721756, + "rewards/margins": 0.07109765708446503, + "rewards/rejected": -0.12296819686889648, + "step": 9270 + }, + { + "epoch": 0.61, + "learning_rate": 2.0045413640771644e-06, + "logits/chosen": -2.197082042694092, + "logits/rejected": -2.3372139930725098, + "logps/chosen": -265.10992431640625, + "logps/rejected": -287.07794189453125, + "loss": 0.0286, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04972299933433533, + "rewards/margins": 0.09356241673231125, + "rewards/rejected": -0.14328542351722717, + "step": 9280 + }, + { + "epoch": 0.61, + "learning_rate": 1.998946007296558e-06, + "logits/chosen": -2.4369912147521973, + "logits/rejected": -2.100095510482788, + "logps/chosen": -317.42645263671875, + "logps/rejected": -271.50347900390625, + "loss": 0.0171, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04412918910384178, + "rewards/margins": 0.08756975829601288, + "rewards/rejected": -0.13169893622398376, + "step": 9290 + }, + { + "epoch": 0.61, + "learning_rate": 1.9933532642590215e-06, + "logits/chosen": -2.1721298694610596, + "logits/rejected": -1.797800064086914, + "logps/chosen": -192.38026428222656, + "logps/rejected": -159.61685180664062, + "loss": 0.019, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.016109880059957504, + "rewards/margins": 0.08698701858520508, + "rewards/rejected": -0.10309691727161407, + "step": 9300 + }, + { + "epoch": 0.61, + "eval_logits/chosen": -2.2889318466186523, + "eval_logits/rejected": -2.1003968715667725, + "eval_logps/chosen": -242.31744384765625, + "eval_logps/rejected": -239.0792694091797, + "eval_loss": 0.0250336192548275, + "eval_rewards/accuracies": 0.6579999923706055, + "eval_rewards/chosen": -0.051562484353780746, + "eval_rewards/margins": 0.08577432483434677, + "eval_rewards/rejected": -0.13733680546283722, + "eval_runtime": 714.1938, + "eval_samples_per_second": 2.8, + "eval_steps_per_second": 1.4, + "step": 9300 + }, + { + "epoch": 0.61, + "learning_rate": 1.987763164139042e-06, + "logits/chosen": -2.322814464569092, + "logits/rejected": -2.1172091960906982, + "logps/chosen": -214.98190307617188, + "logps/rejected": -233.5045928955078, + "loss": 0.0172, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.048634134232997894, + "rewards/margins": 0.08134166896343231, + "rewards/rejected": -0.1299758106470108, + "step": 9310 + }, + { + "epoch": 0.61, + "learning_rate": 1.982175736097321e-06, + "logits/chosen": -2.013521194458008, + "logits/rejected": -2.0271811485290527, + "logps/chosen": -287.46978759765625, + "logps/rejected": -321.9363708496094, + "loss": 0.0165, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06464368104934692, + "rewards/margins": 0.08723381161689758, + "rewards/rejected": -0.1518774926662445, + "step": 9320 + }, + { + "epoch": 0.61, + "learning_rate": 1.9765910092806196e-06, + "logits/chosen": -2.212627410888672, + "logits/rejected": -2.1153512001037598, + "logps/chosen": -184.89788818359375, + "logps/rejected": -178.43759155273438, + "loss": 0.0383, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.034111760556697845, + "rewards/margins": 0.06598981469869614, + "rewards/rejected": -0.10010156780481339, + "step": 9330 + }, + { + "epoch": 0.61, + "learning_rate": 1.9710090128216083e-06, + "logits/chosen": -2.266669750213623, + "logits/rejected": -2.1711018085479736, + "logps/chosen": -234.8772430419922, + "logps/rejected": -242.46630859375, + "loss": 0.0254, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.06073601171374321, + "rewards/margins": 0.11876089870929718, + "rewards/rejected": -0.1794969141483307, + "step": 9340 + }, + { + "epoch": 0.61, + "learning_rate": 1.9654297758387155e-06, + "logits/chosen": -2.1124844551086426, + "logits/rejected": -2.0897462368011475, + "logps/chosen": -177.40499877929688, + "logps/rejected": -213.63632202148438, + "loss": 0.0304, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.08267485350370407, + "rewards/margins": 0.07752931863069534, + "rewards/rejected": -0.1602041870355606, + "step": 9350 + }, + { + "epoch": 0.61, + "learning_rate": 1.9598533274359736e-06, + "logits/chosen": -2.28397798538208, + "logits/rejected": -2.193634510040283, + "logps/chosen": -256.9334411621094, + "logps/rejected": -270.81170654296875, + "loss": 0.0267, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.08626577258110046, + "rewards/margins": 0.03258334472775459, + "rewards/rejected": -0.11884911358356476, + "step": 9360 + }, + { + "epoch": 0.61, + "learning_rate": 1.9542796967028697e-06, + "logits/chosen": -2.295401096343994, + "logits/rejected": -2.1766135692596436, + "logps/chosen": -232.0633087158203, + "logps/rejected": -225.03836059570312, + "loss": 0.02, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06911368668079376, + "rewards/margins": 0.056089501827955246, + "rewards/rejected": -0.1252032071352005, + "step": 9370 + }, + { + "epoch": 0.61, + "learning_rate": 1.948708912714192e-06, + "logits/chosen": -2.192821979522705, + "logits/rejected": -1.9560225009918213, + "logps/chosen": -269.24444580078125, + "logps/rejected": -250.17758178710938, + "loss": 0.0272, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.11149577796459198, + "rewards/margins": 0.05934471637010574, + "rewards/rejected": -0.17084050178527832, + "step": 9380 + }, + { + "epoch": 0.61, + "learning_rate": 1.9431410045298786e-06, + "logits/chosen": -2.0476009845733643, + "logits/rejected": -1.9805431365966797, + "logps/chosen": -228.1685333251953, + "logps/rejected": -242.92599487304688, + "loss": 0.0206, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05521588400006294, + "rewards/margins": 0.07866604626178741, + "rewards/rejected": -0.13388192653656006, + "step": 9390 + }, + { + "epoch": 0.62, + "learning_rate": 1.9375760011948654e-06, + "logits/chosen": -2.3774194717407227, + "logits/rejected": -2.2332215309143066, + "logps/chosen": -211.9001007080078, + "logps/rejected": -255.4772186279297, + "loss": 0.0247, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.061110854148864746, + "rewards/margins": 0.10628316551446915, + "rewards/rejected": -0.1673940122127533, + "step": 9400 + }, + { + "epoch": 0.62, + "eval_logits/chosen": -2.29257869720459, + "eval_logits/rejected": -2.1041393280029297, + "eval_logps/chosen": -246.23622131347656, + "eval_logps/rejected": -241.68353271484375, + "eval_loss": 0.025083504617214203, + "eval_rewards/accuracies": 0.6545000076293945, + "eval_rewards/chosen": -0.07115628570318222, + "eval_rewards/margins": 0.07920186221599579, + "eval_rewards/rejected": -0.1503581553697586, + "eval_runtime": 715.2109, + "eval_samples_per_second": 2.796, + "eval_steps_per_second": 1.398, + "step": 9400 + }, + { + "epoch": 0.62, + "learning_rate": 1.932013931738937e-06, + "logits/chosen": -2.2685647010803223, + "logits/rejected": -2.040989398956299, + "logps/chosen": -219.1533203125, + "logps/rejected": -258.0494079589844, + "loss": 0.0346, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.07775808870792389, + "rewards/margins": 0.13007166981697083, + "rewards/rejected": -0.20782975852489471, + "step": 9410 + }, + { + "epoch": 0.62, + "learning_rate": 1.9264548251765717e-06, + "logits/chosen": -2.3763561248779297, + "logits/rejected": -2.1895689964294434, + "logps/chosen": -215.4586181640625, + "logps/rejected": -227.1342315673828, + "loss": 0.0165, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.058279722929000854, + "rewards/margins": 0.0739368349313736, + "rewards/rejected": -0.13221655786037445, + "step": 9420 + }, + { + "epoch": 0.62, + "learning_rate": 1.9208987105067924e-06, + "logits/chosen": -2.1715502738952637, + "logits/rejected": -2.031933307647705, + "logps/chosen": -226.62173461914062, + "logps/rejected": -214.856689453125, + "loss": 0.028, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.060323454439640045, + "rewards/margins": 0.060612984001636505, + "rewards/rejected": -0.12093643844127655, + "step": 9430 + }, + { + "epoch": 0.62, + "learning_rate": 1.9153456167130154e-06, + "logits/chosen": -2.281136989593506, + "logits/rejected": -2.2747669219970703, + "logps/chosen": -216.7677001953125, + "logps/rejected": -256.7870178222656, + "loss": 0.0444, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06348533928394318, + "rewards/margins": 0.0691031813621521, + "rewards/rejected": -0.13258852064609528, + "step": 9440 + }, + { + "epoch": 0.62, + "learning_rate": 1.9097955727628975e-06, + "logits/chosen": -2.309915065765381, + "logits/rejected": -2.3080644607543945, + "logps/chosen": -202.22084045410156, + "logps/rejected": -229.6215057373047, + "loss": 0.0363, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.031176622956991196, + "rewards/margins": 0.07732857763767242, + "rewards/rejected": -0.10850518941879272, + "step": 9450 + }, + { + "epoch": 0.62, + "learning_rate": 1.904248607608187e-06, + "logits/chosen": -2.2073731422424316, + "logits/rejected": -2.2413322925567627, + "logps/chosen": -264.6854248046875, + "logps/rejected": -228.81912231445312, + "loss": 0.0209, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03732733801007271, + "rewards/margins": 0.05620206519961357, + "rewards/rejected": -0.09352940320968628, + "step": 9460 + }, + { + "epoch": 0.62, + "learning_rate": 1.8987047501845714e-06, + "logits/chosen": -2.2785770893096924, + "logits/rejected": -2.2424912452697754, + "logps/chosen": -174.671142578125, + "logps/rejected": -187.09288024902344, + "loss": 0.0343, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03873337432742119, + "rewards/margins": 0.09333629906177521, + "rewards/rejected": -0.1320696771144867, + "step": 9470 + }, + { + "epoch": 0.62, + "learning_rate": 1.8931640294115267e-06, + "logits/chosen": -2.0822432041168213, + "logits/rejected": -1.9636636972427368, + "logps/chosen": -201.2562713623047, + "logps/rejected": -207.4918212890625, + "loss": 0.0389, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.03625180199742317, + "rewards/margins": 0.10882870852947235, + "rewards/rejected": -0.14508050680160522, + "step": 9480 + }, + { + "epoch": 0.62, + "learning_rate": 1.8876264741921662e-06, + "logits/chosen": -2.0588245391845703, + "logits/rejected": -2.0789387226104736, + "logps/chosen": -196.6965789794922, + "logps/rejected": -215.16513061523438, + "loss": 0.0208, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03285546973347664, + "rewards/margins": 0.11952035129070282, + "rewards/rejected": -0.15237581729888916, + "step": 9490 + }, + { + "epoch": 0.62, + "learning_rate": 1.8820921134130912e-06, + "logits/chosen": -2.2807486057281494, + "logits/rejected": -1.9099798202514648, + "logps/chosen": -239.4491729736328, + "logps/rejected": -223.02163696289062, + "loss": 0.0161, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.044685035943984985, + "rewards/margins": 0.14240868389606476, + "rewards/rejected": -0.18709370493888855, + "step": 9500 + }, + { + "epoch": 0.62, + "eval_logits/chosen": -2.2826719284057617, + "eval_logits/rejected": -2.094907283782959, + "eval_logps/chosen": -242.37461853027344, + "eval_logps/rejected": -238.37704467773438, + "eval_loss": 0.024934230372309685, + "eval_rewards/accuracies": 0.6485000252723694, + "eval_rewards/chosen": -0.05184837058186531, + "eval_rewards/margins": 0.08197740465402603, + "eval_rewards/rejected": -0.13382577896118164, + "eval_runtime": 711.7958, + "eval_samples_per_second": 2.81, + "eval_steps_per_second": 1.405, + "step": 9500 + }, + { + "epoch": 0.62, + "learning_rate": 1.8765609759442378e-06, + "logits/chosen": -2.140984058380127, + "logits/rejected": -2.035247325897217, + "logps/chosen": -250.96426391601562, + "logps/rejected": -253.80502319335938, + "loss": 0.0223, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04821477085351944, + "rewards/margins": 0.06826470792293549, + "rewards/rejected": -0.11647947877645493, + "step": 9510 + }, + { + "epoch": 0.62, + "learning_rate": 1.8710330906387288e-06, + "logits/chosen": -2.348546266555786, + "logits/rejected": -2.2972640991210938, + "logps/chosen": -249.5334014892578, + "logps/rejected": -290.72796630859375, + "loss": 0.0291, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05522875860333443, + "rewards/margins": 0.08802361786365509, + "rewards/rejected": -0.1432523876428604, + "step": 9520 + }, + { + "epoch": 0.62, + "learning_rate": 1.8655084863327222e-06, + "logits/chosen": -2.2527644634246826, + "logits/rejected": -2.2713284492492676, + "logps/chosen": -190.42855834960938, + "logps/rejected": -210.0547332763672, + "loss": 0.0261, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.03485274314880371, + "rewards/margins": 0.06732039153575897, + "rewards/rejected": -0.10217314958572388, + "step": 9530 + }, + { + "epoch": 0.62, + "learning_rate": 1.8599871918452603e-06, + "logits/chosen": -2.1004343032836914, + "logits/rejected": -2.093557357788086, + "logps/chosen": -229.6087188720703, + "logps/rejected": -261.7853088378906, + "loss": 0.0093, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03258442506194115, + "rewards/margins": 0.09463578462600708, + "rewards/rejected": -0.12722022831439972, + "step": 9540 + }, + { + "epoch": 0.62, + "learning_rate": 1.8544692359781192e-06, + "logits/chosen": -2.2808165550231934, + "logits/rejected": -2.0491530895233154, + "logps/chosen": -191.7283477783203, + "logps/rejected": -175.3466033935547, + "loss": 0.0365, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02071845903992653, + "rewards/margins": 0.07282758504152298, + "rewards/rejected": -0.09354604780673981, + "step": 9550 + }, + { + "epoch": 0.63, + "learning_rate": 1.8489546475156602e-06, + "logits/chosen": -2.4583964347839355, + "logits/rejected": -2.2367022037506104, + "logps/chosen": -223.090576171875, + "logps/rejected": -219.71420288085938, + "loss": 0.0158, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0179886631667614, + "rewards/margins": 0.07395794242620468, + "rewards/rejected": -0.09194660931825638, + "step": 9560 + }, + { + "epoch": 0.63, + "learning_rate": 1.8434434552246778e-06, + "logits/chosen": -2.090735673904419, + "logits/rejected": -2.021360397338867, + "logps/chosen": -215.08010864257812, + "logps/rejected": -218.2127685546875, + "loss": 0.0179, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02724291943013668, + "rewards/margins": 0.07197809219360352, + "rewards/rejected": -0.09922101348638535, + "step": 9570 + }, + { + "epoch": 0.63, + "learning_rate": 1.837935687854251e-06, + "logits/chosen": -2.3042380809783936, + "logits/rejected": -2.060258150100708, + "logps/chosen": -225.2897186279297, + "logps/rejected": -210.74374389648438, + "loss": 0.0322, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.018714962527155876, + "rewards/margins": 0.08627069741487503, + "rewards/rejected": -0.10498566925525665, + "step": 9580 + }, + { + "epoch": 0.63, + "learning_rate": 1.832431374135592e-06, + "logits/chosen": -2.4167397022247314, + "logits/rejected": -2.029613494873047, + "logps/chosen": -256.81829833984375, + "logps/rejected": -262.15032958984375, + "loss": 0.0161, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.041914623230695724, + "rewards/margins": 0.12102575600147247, + "rewards/rejected": -0.1629403680562973, + "step": 9590 + }, + { + "epoch": 0.63, + "learning_rate": 1.8269305427818977e-06, + "logits/chosen": -2.4077861309051514, + "logits/rejected": -2.265671730041504, + "logps/chosen": -214.59957885742188, + "logps/rejected": -200.29623413085938, + "loss": 0.0198, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029916757717728615, + "rewards/margins": 0.06680215895175934, + "rewards/rejected": -0.0967189222574234, + "step": 9600 + }, + { + "epoch": 0.63, + "eval_logits/chosen": -2.2786946296691895, + "eval_logits/rejected": -2.0912680625915527, + "eval_logps/chosen": -237.63516235351562, + "eval_logps/rejected": -234.0897979736328, + "eval_loss": 0.0249630156904459, + "eval_rewards/accuracies": 0.6499999761581421, + "eval_rewards/chosen": -0.02815098501741886, + "eval_rewards/margins": 0.08423858880996704, + "eval_rewards/rejected": -0.11238957196474075, + "eval_runtime": 711.979, + "eval_samples_per_second": 2.809, + "eval_steps_per_second": 1.405, + "step": 9600 + }, + { + "epoch": 0.63, + "learning_rate": 1.821433222488199e-06, + "logits/chosen": -2.272484302520752, + "logits/rejected": -1.9709657430648804, + "logps/chosen": -228.2349395751953, + "logps/rejected": -217.74484252929688, + "loss": 0.0118, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.013100971467792988, + "rewards/margins": 0.0862874835729599, + "rewards/rejected": -0.09938845038414001, + "step": 9610 + }, + { + "epoch": 0.63, + "learning_rate": 1.8159394419312112e-06, + "logits/chosen": -2.3230252265930176, + "logits/rejected": -2.130493640899658, + "logps/chosen": -263.906982421875, + "logps/rejected": -244.85977172851562, + "loss": 0.0284, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.025884822010993958, + "rewards/margins": 0.13745743036270142, + "rewards/rejected": -0.16334223747253418, + "step": 9620 + }, + { + "epoch": 0.63, + "learning_rate": 1.8104492297691845e-06, + "logits/chosen": -2.2625083923339844, + "logits/rejected": -2.043485641479492, + "logps/chosen": -241.1596221923828, + "logps/rejected": -231.29623413085938, + "loss": 0.0347, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.07318250834941864, + "rewards/margins": 0.07124580442905426, + "rewards/rejected": -0.1444282978773117, + "step": 9630 + }, + { + "epoch": 0.63, + "learning_rate": 1.8049626146417562e-06, + "logits/chosen": -2.0654587745666504, + "logits/rejected": -1.9282382726669312, + "logps/chosen": -165.26133728027344, + "logps/rejected": -181.92007446289062, + "loss": 0.0504, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05092727392911911, + "rewards/margins": 0.10185831785202026, + "rewards/rejected": -0.15278561413288116, + "step": 9640 + }, + { + "epoch": 0.63, + "learning_rate": 1.7994796251697983e-06, + "logits/chosen": -2.1830663681030273, + "logits/rejected": -2.0419445037841797, + "logps/chosen": -210.02487182617188, + "logps/rejected": -271.4522705078125, + "loss": 0.0159, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0899299904704094, + "rewards/margins": 0.10931304842233658, + "rewards/rejected": -0.19924303889274597, + "step": 9650 + }, + { + "epoch": 0.63, + "learning_rate": 1.794000289955269e-06, + "logits/chosen": -2.229628801345825, + "logits/rejected": -2.061565399169922, + "logps/chosen": -282.14886474609375, + "logps/rejected": -268.85076904296875, + "loss": 0.0393, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.07544299960136414, + "rewards/margins": 0.09074367582798004, + "rewards/rejected": -0.16618667542934418, + "step": 9660 + }, + { + "epoch": 0.63, + "learning_rate": 1.7885246375810646e-06, + "logits/chosen": -2.2054381370544434, + "logits/rejected": -1.9357036352157593, + "logps/chosen": -233.9512481689453, + "logps/rejected": -245.37796020507812, + "loss": 0.0188, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04838669300079346, + "rewards/margins": 0.07594360411167145, + "rewards/rejected": -0.1243302971124649, + "step": 9670 + }, + { + "epoch": 0.63, + "learning_rate": 1.7830526966108713e-06, + "logits/chosen": -2.0536415576934814, + "logits/rejected": -1.858930230140686, + "logps/chosen": -205.92813110351562, + "logps/rejected": -206.12521362304688, + "loss": 0.0399, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.09727603942155838, + "rewards/margins": 0.14404307305812836, + "rewards/rejected": -0.24131910502910614, + "step": 9680 + }, + { + "epoch": 0.63, + "learning_rate": 1.7775844955890129e-06, + "logits/chosen": -2.209892988204956, + "logits/rejected": -2.049481153488159, + "logps/chosen": -220.50900268554688, + "logps/rejected": -229.58203125, + "loss": 0.0224, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04481334984302521, + "rewards/margins": 0.10324843972921371, + "rewards/rejected": -0.1480618268251419, + "step": 9690 + }, + { + "epoch": 0.63, + "learning_rate": 1.7721200630403046e-06, + "logits/chosen": -2.283764600753784, + "logits/rejected": -2.0978660583496094, + "logps/chosen": -206.02914428710938, + "logps/rejected": -245.3820343017578, + "loss": 0.0368, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.05133398622274399, + "rewards/margins": 0.06903479993343353, + "rewards/rejected": -0.12036879360675812, + "step": 9700 + }, + { + "epoch": 0.63, + "eval_logits/chosen": -2.278688430786133, + "eval_logits/rejected": -2.0913586616516113, + "eval_logps/chosen": -243.37106323242188, + "eval_logps/rejected": -239.7048797607422, + "eval_loss": 0.024813618510961533, + "eval_rewards/accuracies": 0.6585000157356262, + "eval_rewards/chosen": -0.056830476969480515, + "eval_rewards/margins": 0.08363436907529831, + "eval_rewards/rejected": -0.14046484231948853, + "eval_runtime": 713.2782, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 1.402, + "step": 9700 + }, + { + "epoch": 0.64, + "learning_rate": 1.7666594274699037e-06, + "logits/chosen": -2.1993210315704346, + "logits/rejected": -2.048921585083008, + "logps/chosen": -261.8725891113281, + "logps/rejected": -245.17886352539062, + "loss": 0.0156, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04591141268610954, + "rewards/margins": 0.10755202919244766, + "rewards/rejected": -0.1534634530544281, + "step": 9710 + }, + { + "epoch": 0.64, + "learning_rate": 1.76120261736316e-06, + "logits/chosen": -2.2551050186157227, + "logits/rejected": -1.8844871520996094, + "logps/chosen": -239.8577423095703, + "logps/rejected": -237.2745819091797, + "loss": 0.0279, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06571348011493683, + "rewards/margins": 0.12280043214559555, + "rewards/rejected": -0.18851391971111298, + "step": 9720 + }, + { + "epoch": 0.64, + "learning_rate": 1.755749661185468e-06, + "logits/chosen": -2.3268203735351562, + "logits/rejected": -1.8979631662368774, + "logps/chosen": -301.92840576171875, + "logps/rejected": -262.7877197265625, + "loss": 0.0206, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03739435225725174, + "rewards/margins": 0.09846501052379608, + "rewards/rejected": -0.13585934042930603, + "step": 9730 + }, + { + "epoch": 0.64, + "learning_rate": 1.7503005873821183e-06, + "logits/chosen": -2.265444755554199, + "logits/rejected": -2.209855556488037, + "logps/chosen": -169.6016082763672, + "logps/rejected": -213.1735076904297, + "loss": 0.0187, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04586641117930412, + "rewards/margins": 0.08653993904590607, + "rewards/rejected": -0.1324063390493393, + "step": 9740 + }, + { + "epoch": 0.64, + "learning_rate": 1.744855424378148e-06, + "logits/chosen": -2.1263904571533203, + "logits/rejected": -2.1451613903045654, + "logps/chosen": -199.63528442382812, + "logps/rejected": -240.84548950195312, + "loss": 0.019, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05504153296351433, + "rewards/margins": 0.08882343024015427, + "rewards/rejected": -0.1438649594783783, + "step": 9750 + }, + { + "epoch": 0.64, + "learning_rate": 1.7394142005781973e-06, + "logits/chosen": -2.0734992027282715, + "logits/rejected": -2.1553051471710205, + "logps/chosen": -273.6174621582031, + "logps/rejected": -289.822998046875, + "loss": 0.012, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05953211337327957, + "rewards/margins": 0.06317280232906342, + "rewards/rejected": -0.12270492315292358, + "step": 9760 + }, + { + "epoch": 0.64, + "learning_rate": 1.7339769443663528e-06, + "logits/chosen": -2.2552011013031006, + "logits/rejected": -2.1124913692474365, + "logps/chosen": -155.26698303222656, + "logps/rejected": -174.4816131591797, + "loss": 0.0283, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.061623912304639816, + "rewards/margins": 0.09255679696798325, + "rewards/rejected": -0.15418072044849396, + "step": 9770 + }, + { + "epoch": 0.64, + "learning_rate": 1.7285436841060078e-06, + "logits/chosen": -2.4462904930114746, + "logits/rejected": -2.161907196044922, + "logps/chosen": -286.5582275390625, + "logps/rejected": -260.4649353027344, + "loss": 0.0235, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05214942619204521, + "rewards/margins": 0.08430466800928116, + "rewards/rejected": -0.13645410537719727, + "step": 9780 + }, + { + "epoch": 0.64, + "learning_rate": 1.7231144481397083e-06, + "logits/chosen": -2.34212589263916, + "logits/rejected": -2.265117883682251, + "logps/chosen": -231.6746826171875, + "logps/rejected": -217.1168975830078, + "loss": 0.0143, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.05638168007135391, + "rewards/margins": 0.05440463870763779, + "rewards/rejected": -0.1107863038778305, + "step": 9790 + }, + { + "epoch": 0.64, + "learning_rate": 1.7176892647890092e-06, + "logits/chosen": -2.4070024490356445, + "logits/rejected": -2.165473222732544, + "logps/chosen": -246.8561553955078, + "logps/rejected": -219.91323852539062, + "loss": 0.0214, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.055950719863176346, + "rewards/margins": 0.05969247967004776, + "rewards/rejected": -0.1156432032585144, + "step": 9800 + }, + { + "epoch": 0.64, + "eval_logits/chosen": -2.2843563556671143, + "eval_logits/rejected": -2.09706974029541, + "eval_logps/chosen": -243.19447326660156, + "eval_logps/rejected": -239.02984619140625, + "eval_loss": 0.024816662073135376, + "eval_rewards/accuracies": 0.6570000052452087, + "eval_rewards/chosen": -0.05594758316874504, + "eval_rewards/margins": 0.08114214986562729, + "eval_rewards/rejected": -0.13708974421024323, + "eval_runtime": 715.6997, + "eval_samples_per_second": 2.794, + "eval_steps_per_second": 1.397, + "step": 9800 + }, + { + "epoch": 0.64, + "learning_rate": 1.7122681623543239e-06, + "logits/chosen": -2.426173448562622, + "logits/rejected": -2.166084051132202, + "logps/chosen": -255.25790405273438, + "logps/rejected": -265.0026550292969, + "loss": 0.0188, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0413464792072773, + "rewards/margins": 0.1047930121421814, + "rewards/rejected": -0.1461394876241684, + "step": 9810 + }, + { + "epoch": 0.64, + "learning_rate": 1.7068511691147788e-06, + "logits/chosen": -2.168544292449951, + "logits/rejected": -2.208672046661377, + "logps/chosen": -202.48780822753906, + "logps/rejected": -227.08316040039062, + "loss": 0.0091, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.041783712804317474, + "rewards/margins": 0.08112876862287521, + "rewards/rejected": -0.12291248887777328, + "step": 9820 + }, + { + "epoch": 0.64, + "learning_rate": 1.7014383133280636e-06, + "logits/chosen": -2.394834041595459, + "logits/rejected": -2.033219575881958, + "logps/chosen": -264.05914306640625, + "logps/rejected": -227.944580078125, + "loss": 0.0281, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07712364196777344, + "rewards/margins": 0.08639691770076752, + "rewards/rejected": -0.16352055966854095, + "step": 9830 + }, + { + "epoch": 0.64, + "learning_rate": 1.696029623230286e-06, + "logits/chosen": -2.366926670074463, + "logits/rejected": -2.284317970275879, + "logps/chosen": -265.00750732421875, + "logps/rejected": -303.00238037109375, + "loss": 0.0209, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04542611166834831, + "rewards/margins": 0.09607888758182526, + "rewards/rejected": -0.14150500297546387, + "step": 9840 + }, + { + "epoch": 0.64, + "learning_rate": 1.6906251270358229e-06, + "logits/chosen": -2.3569698333740234, + "logits/rejected": -2.2085394859313965, + "logps/chosen": -274.58624267578125, + "logps/rejected": -237.61727905273438, + "loss": 0.0156, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05865948647260666, + "rewards/margins": 0.06386444717645645, + "rewards/rejected": -0.12252392619848251, + "step": 9850 + }, + { + "epoch": 0.65, + "learning_rate": 1.685224852937174e-06, + "logits/chosen": -2.1372501850128174, + "logits/rejected": -2.017017364501953, + "logps/chosen": -203.86383056640625, + "logps/rejected": -277.4493103027344, + "loss": 0.0382, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.02749522402882576, + "rewards/margins": 0.1586325615644455, + "rewards/rejected": -0.18612778186798096, + "step": 9860 + }, + { + "epoch": 0.65, + "learning_rate": 1.6798288291048136e-06, + "logits/chosen": -2.1029751300811768, + "logits/rejected": -2.0041463375091553, + "logps/chosen": -235.9657745361328, + "logps/rejected": -231.2330322265625, + "loss": 0.025, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06362083554267883, + "rewards/margins": 0.11986134946346283, + "rewards/rejected": -0.18348219990730286, + "step": 9870 + }, + { + "epoch": 0.65, + "learning_rate": 1.6744370836870466e-06, + "logits/chosen": -2.5070695877075195, + "logits/rejected": -2.2133679389953613, + "logps/chosen": -342.8887634277344, + "logps/rejected": -282.80853271484375, + "loss": 0.017, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04097612574696541, + "rewards/margins": 0.11718785762786865, + "rewards/rejected": -0.15816399455070496, + "step": 9880 + }, + { + "epoch": 0.65, + "learning_rate": 1.6690496448098576e-06, + "logits/chosen": -2.178298234939575, + "logits/rejected": -1.882962942123413, + "logps/chosen": -241.05026245117188, + "logps/rejected": -235.6141815185547, + "loss": 0.0223, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.060070209205150604, + "rewards/margins": 0.07711504399776459, + "rewards/rejected": -0.1371852457523346, + "step": 9890 + }, + { + "epoch": 0.65, + "learning_rate": 1.6636665405767666e-06, + "logits/chosen": -2.278411865234375, + "logits/rejected": -2.1138100624084473, + "logps/chosen": -241.71383666992188, + "logps/rejected": -236.0366973876953, + "loss": 0.0331, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.01971452124416828, + "rewards/margins": 0.0660644918680191, + "rewards/rejected": -0.08577899634838104, + "step": 9900 + }, + { + "epoch": 0.65, + "eval_logits/chosen": -2.27319598197937, + "eval_logits/rejected": -2.0866928100585938, + "eval_logps/chosen": -240.8263397216797, + "eval_logps/rejected": -238.18746948242188, + "eval_loss": 0.024608775973320007, + "eval_rewards/accuracies": 0.6600000262260437, + "eval_rewards/chosen": -0.044106971472501755, + "eval_rewards/margins": 0.08877087384462357, + "eval_rewards/rejected": -0.13287784159183502, + "eval_runtime": 714.6039, + "eval_samples_per_second": 2.799, + "eval_steps_per_second": 1.399, + "step": 9900 + }, + { + "epoch": 0.65, + "learning_rate": 1.6582877990686827e-06, + "logits/chosen": -2.2707982063293457, + "logits/rejected": -2.2193493843078613, + "logps/chosen": -129.47329711914062, + "logps/rejected": -174.6560516357422, + "loss": 0.0281, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.040894515812397, + "rewards/margins": 0.11425725370645523, + "rewards/rejected": -0.15515175461769104, + "step": 9910 + }, + { + "epoch": 0.65, + "learning_rate": 1.6529134483437562e-06, + "logits/chosen": -2.2776689529418945, + "logits/rejected": -2.1015264987945557, + "logps/chosen": -216.85537719726562, + "logps/rejected": -200.73300170898438, + "loss": 0.0362, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.05640562251210213, + "rewards/margins": 0.11665663868188858, + "rewards/rejected": -0.17306223511695862, + "step": 9920 + }, + { + "epoch": 0.65, + "learning_rate": 1.647543516437233e-06, + "logits/chosen": -2.2426798343658447, + "logits/rejected": -2.1841890811920166, + "logps/chosen": -213.09963989257812, + "logps/rejected": -249.3842010498047, + "loss": 0.0388, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.07030726969242096, + "rewards/margins": 0.07799828052520752, + "rewards/rejected": -0.14830553531646729, + "step": 9930 + }, + { + "epoch": 0.65, + "learning_rate": 1.6421780313613088e-06, + "logits/chosen": -2.3773422241210938, + "logits/rejected": -1.9746917486190796, + "logps/chosen": -214.04379272460938, + "logps/rejected": -201.2644805908203, + "loss": 0.0338, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04651091247797012, + "rewards/margins": 0.10205264389514923, + "rewards/rejected": -0.14856356382369995, + "step": 9940 + }, + { + "epoch": 0.65, + "learning_rate": 1.6368170211049816e-06, + "logits/chosen": -2.294351816177368, + "logits/rejected": -1.810516357421875, + "logps/chosen": -294.4578552246094, + "logps/rejected": -256.480224609375, + "loss": 0.0149, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05328863114118576, + "rewards/margins": 0.11066905409097672, + "rewards/rejected": -0.16395768523216248, + "step": 9950 + }, + { + "epoch": 0.65, + "learning_rate": 1.6314605136339074e-06, + "logits/chosen": -2.2938685417175293, + "logits/rejected": -2.132032871246338, + "logps/chosen": -211.07138061523438, + "logps/rejected": -208.8017120361328, + "loss": 0.0433, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0774233490228653, + "rewards/margins": 0.07814882695674896, + "rewards/rejected": -0.15557217597961426, + "step": 9960 + }, + { + "epoch": 0.65, + "learning_rate": 1.6261085368902526e-06, + "logits/chosen": -2.499953269958496, + "logits/rejected": -2.15836501121521, + "logps/chosen": -275.83831787109375, + "logps/rejected": -247.0941619873047, + "loss": 0.0218, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05448717996478081, + "rewards/margins": 0.07442667335271835, + "rewards/rejected": -0.12891386449337006, + "step": 9970 + }, + { + "epoch": 0.65, + "learning_rate": 1.6207611187925503e-06, + "logits/chosen": -2.1789181232452393, + "logits/rejected": -2.1919662952423096, + "logps/chosen": -219.1823272705078, + "logps/rejected": -284.10052490234375, + "loss": 0.0307, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.051777105778455734, + "rewards/margins": 0.09300215542316437, + "rewards/rejected": -0.1447792649269104, + "step": 9980 + }, + { + "epoch": 0.65, + "learning_rate": 1.6154182872355512e-06, + "logits/chosen": -2.222287654876709, + "logits/rejected": -2.273768186569214, + "logps/chosen": -178.19265747070312, + "logps/rejected": -206.7621307373047, + "loss": 0.052, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07187598198652267, + "rewards/margins": 0.07074934989213943, + "rewards/rejected": -0.1426253318786621, + "step": 9990 + }, + { + "epoch": 0.65, + "learning_rate": 1.610080070090084e-06, + "logits/chosen": -2.2096710205078125, + "logits/rejected": -2.1410071849823, + "logps/chosen": -199.0048370361328, + "logps/rejected": -210.6748046875, + "loss": 0.0316, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.09041772037744522, + "rewards/margins": 0.10349156707525253, + "rewards/rejected": -0.19390928745269775, + "step": 10000 + }, + { + "epoch": 0.65, + "eval_logits/chosen": -2.263366460800171, + "eval_logits/rejected": -2.076998233795166, + "eval_logps/chosen": -243.4642333984375, + "eval_logps/rejected": -241.0921630859375, + "eval_loss": 0.024637000635266304, + "eval_rewards/accuracies": 0.6579999923706055, + "eval_rewards/chosen": -0.05729633569717407, + "eval_rewards/margins": 0.09010498225688934, + "eval_rewards/rejected": -0.14740131795406342, + "eval_runtime": 714.5264, + "eval_samples_per_second": 2.799, + "eval_steps_per_second": 1.4, + "step": 10000 + }, + { + "epoch": 0.65, + "learning_rate": 1.6047464952029034e-06, + "logits/chosen": -2.402103900909424, + "logits/rejected": -2.2708446979522705, + "logps/chosen": -263.43438720703125, + "logps/rejected": -293.1377258300781, + "loss": 0.0134, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.045734953135252, + "rewards/margins": 0.10738639533519745, + "rewards/rejected": -0.15312136709690094, + "step": 10010 + }, + { + "epoch": 0.66, + "learning_rate": 1.5994175903965486e-06, + "logits/chosen": -2.1264870166778564, + "logits/rejected": -2.0037424564361572, + "logps/chosen": -266.4388427734375, + "logps/rejected": -290.78729248046875, + "loss": 0.0353, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08054221421480179, + "rewards/margins": 0.11296994984149933, + "rewards/rejected": -0.19351215660572052, + "step": 10020 + }, + { + "epoch": 0.66, + "learning_rate": 1.5940933834691977e-06, + "logits/chosen": -2.572889804840088, + "logits/rejected": -1.9311988353729248, + "logps/chosen": -317.31982421875, + "logps/rejected": -226.02621459960938, + "loss": 0.0231, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06026948243379593, + "rewards/margins": 0.08301849663257599, + "rewards/rejected": -0.14328798651695251, + "step": 10030 + }, + { + "epoch": 0.66, + "learning_rate": 1.588773902194522e-06, + "logits/chosen": -2.0572285652160645, + "logits/rejected": -1.7748944759368896, + "logps/chosen": -220.39501953125, + "logps/rejected": -247.3108367919922, + "loss": 0.0168, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08157656341791153, + "rewards/margins": 0.1557811051607132, + "rewards/rejected": -0.23735766112804413, + "step": 10040 + }, + { + "epoch": 0.66, + "learning_rate": 1.583459174321541e-06, + "logits/chosen": -2.05017352104187, + "logits/rejected": -1.8998372554779053, + "logps/chosen": -228.8952178955078, + "logps/rejected": -221.0664825439453, + "loss": 0.0329, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0963895320892334, + "rewards/margins": 0.0963265672326088, + "rewards/rejected": -0.1927160918712616, + "step": 10050 + }, + { + "epoch": 0.66, + "learning_rate": 1.5781492275744797e-06, + "logits/chosen": -2.4671225547790527, + "logits/rejected": -2.0863845348358154, + "logps/chosen": -306.43975830078125, + "logps/rejected": -306.3621520996094, + "loss": 0.0237, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.0609763041138649, + "rewards/margins": 0.13070151209831238, + "rewards/rejected": -0.19167783856391907, + "step": 10060 + }, + { + "epoch": 0.66, + "learning_rate": 1.5728440896526215e-06, + "logits/chosen": -2.184709310531616, + "logits/rejected": -2.00661563873291, + "logps/chosen": -288.0798645019531, + "logps/rejected": -258.2320251464844, + "loss": 0.0128, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0733894407749176, + "rewards/margins": 0.08177514374256134, + "rewards/rejected": -0.15516456961631775, + "step": 10070 + }, + { + "epoch": 0.66, + "learning_rate": 1.5675437882301633e-06, + "logits/chosen": -2.263230323791504, + "logits/rejected": -2.0684688091278076, + "logps/chosen": -237.2273406982422, + "logps/rejected": -208.65243530273438, + "loss": 0.039, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.09091582894325256, + "rewards/margins": 0.031104255467653275, + "rewards/rejected": -0.12202008813619614, + "step": 10080 + }, + { + "epoch": 0.66, + "learning_rate": 1.5622483509560748e-06, + "logits/chosen": -2.1492769718170166, + "logits/rejected": -2.160562753677368, + "logps/chosen": -185.2451629638672, + "logps/rejected": -235.5522003173828, + "loss": 0.0333, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07258006930351257, + "rewards/margins": 0.1006237119436264, + "rewards/rejected": -0.17320378124713898, + "step": 10090 + }, + { + "epoch": 0.66, + "learning_rate": 1.5569578054539506e-06, + "logits/chosen": -2.2144834995269775, + "logits/rejected": -1.87839674949646, + "logps/chosen": -292.9901428222656, + "logps/rejected": -237.5506134033203, + "loss": 0.0181, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.06896514445543289, + "rewards/margins": 0.14211763441562653, + "rewards/rejected": -0.21108278632164001, + "step": 10100 + }, + { + "epoch": 0.66, + "eval_logits/chosen": -2.2660727500915527, + "eval_logits/rejected": -2.080104351043701, + "eval_logps/chosen": -247.1387176513672, + "eval_logps/rejected": -243.84608459472656, + "eval_loss": 0.024765770882368088, + "eval_rewards/accuracies": 0.6669999957084656, + "eval_rewards/chosen": -0.07566884905099869, + "eval_rewards/margins": 0.08550204336643219, + "eval_rewards/rejected": -0.16117088496685028, + "eval_runtime": 713.5274, + "eval_samples_per_second": 2.803, + "eval_steps_per_second": 1.401, + "step": 10100 + }, + { + "epoch": 0.66, + "learning_rate": 1.551672179321867e-06, + "logits/chosen": -2.1911749839782715, + "logits/rejected": -2.258227825164795, + "logps/chosen": -229.6168975830078, + "logps/rejected": -224.55361938476562, + "loss": 0.0168, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0687999278306961, + "rewards/margins": 0.08053916692733765, + "rewards/rejected": -0.14933909475803375, + "step": 10110 + }, + { + "epoch": 0.66, + "learning_rate": 1.5463915001322398e-06, + "logits/chosen": -2.22826886177063, + "logits/rejected": -2.0819613933563232, + "logps/chosen": -263.48443603515625, + "logps/rejected": -267.28863525390625, + "loss": 0.0479, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08914048969745636, + "rewards/margins": 0.0908593013882637, + "rewards/rejected": -0.17999979853630066, + "step": 10120 + }, + { + "epoch": 0.66, + "learning_rate": 1.5411157954316784e-06, + "logits/chosen": -2.2317793369293213, + "logits/rejected": -2.131615161895752, + "logps/chosen": -214.57431030273438, + "logps/rejected": -216.745849609375, + "loss": 0.0158, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0634838342666626, + "rewards/margins": 0.07930682599544525, + "rewards/rejected": -0.14279064536094666, + "step": 10130 + }, + { + "epoch": 0.66, + "learning_rate": 1.535845092740843e-06, + "logits/chosen": -2.3938117027282715, + "logits/rejected": -2.235452175140381, + "logps/chosen": -248.4803466796875, + "logps/rejected": -271.53656005859375, + "loss": 0.0297, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05579571798443794, + "rewards/margins": 0.0533018484711647, + "rewards/rejected": -0.10909757763147354, + "step": 10140 + }, + { + "epoch": 0.66, + "learning_rate": 1.5305794195543005e-06, + "logits/chosen": -2.250174045562744, + "logits/rejected": -2.3086235523223877, + "logps/chosen": -221.4529571533203, + "logps/rejected": -223.0041961669922, + "loss": 0.0335, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07136930525302887, + "rewards/margins": 0.0924132838845253, + "rewards/rejected": -0.16378256678581238, + "step": 10150 + }, + { + "epoch": 0.66, + "learning_rate": 1.5253188033403816e-06, + "logits/chosen": -2.2710773944854736, + "logits/rejected": -2.3633055686950684, + "logps/chosen": -183.15744018554688, + "logps/rejected": -208.3464813232422, + "loss": 0.0237, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.06211277097463608, + "rewards/margins": 0.0426534041762352, + "rewards/rejected": -0.10476617515087128, + "step": 10160 + }, + { + "epoch": 0.67, + "learning_rate": 1.520063271541037e-06, + "logits/chosen": -2.240269660949707, + "logits/rejected": -2.1465556621551514, + "logps/chosen": -193.79080200195312, + "logps/rejected": -208.65560913085938, + "loss": 0.0198, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07643021643161774, + "rewards/margins": 0.13083912432193756, + "rewards/rejected": -0.2072693407535553, + "step": 10170 + }, + { + "epoch": 0.67, + "learning_rate": 1.5148128515716954e-06, + "logits/chosen": -2.4676265716552734, + "logits/rejected": -1.8373692035675049, + "logps/chosen": -275.50396728515625, + "logps/rejected": -226.16683959960938, + "loss": 0.0294, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.044873807579278946, + "rewards/margins": 0.11518532037734985, + "rewards/rejected": -0.1600591242313385, + "step": 10180 + }, + { + "epoch": 0.67, + "learning_rate": 1.5095675708211197e-06, + "logits/chosen": -2.286296844482422, + "logits/rejected": -2.276829719543457, + "logps/chosen": -211.50582885742188, + "logps/rejected": -243.75173950195312, + "loss": 0.0499, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.10172464698553085, + "rewards/margins": 0.04451145976781845, + "rewards/rejected": -0.1462361216545105, + "step": 10190 + }, + { + "epoch": 0.67, + "learning_rate": 1.504327456651263e-06, + "logits/chosen": -2.2106895446777344, + "logits/rejected": -2.136573076248169, + "logps/chosen": -275.1043701171875, + "logps/rejected": -267.5289306640625, + "loss": 0.0159, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06850683689117432, + "rewards/margins": 0.09694625437259674, + "rewards/rejected": -0.16545307636260986, + "step": 10200 + }, + { + "epoch": 0.67, + "eval_logits/chosen": -2.2463204860687256, + "eval_logits/rejected": -2.0610575675964355, + "eval_logps/chosen": -244.7626495361328, + "eval_logps/rejected": -242.60560607910156, + "eval_loss": 0.02451149746775627, + "eval_rewards/accuracies": 0.6610000133514404, + "eval_rewards/chosen": -0.06378858536481857, + "eval_rewards/margins": 0.09117982536554337, + "eval_rewards/rejected": -0.15496839582920074, + "eval_runtime": 715.3275, + "eval_samples_per_second": 2.796, + "eval_steps_per_second": 1.398, + "step": 10200 + }, + { + "epoch": 0.67, + "learning_rate": 1.4990925363971284e-06, + "logits/chosen": -2.304161310195923, + "logits/rejected": -1.8916089534759521, + "logps/chosen": -304.38824462890625, + "logps/rejected": -271.1502685546875, + "loss": 0.0352, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07232534885406494, + "rewards/margins": 0.176255464553833, + "rewards/rejected": -0.24858078360557556, + "step": 10210 + }, + { + "epoch": 0.67, + "learning_rate": 1.4938628373666236e-06, + "logits/chosen": -2.175797939300537, + "logits/rejected": -2.2267374992370605, + "logps/chosen": -186.3314666748047, + "logps/rejected": -202.7518768310547, + "loss": 0.0369, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06876092404127121, + "rewards/margins": 0.0727389007806778, + "rewards/rejected": -0.1414998173713684, + "step": 10220 + }, + { + "epoch": 0.67, + "learning_rate": 1.4886383868404203e-06, + "logits/chosen": -2.0863027572631836, + "logits/rejected": -1.9565706253051758, + "logps/chosen": -175.46356201171875, + "logps/rejected": -183.82199096679688, + "loss": 0.0168, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07120905071496964, + "rewards/margins": 0.09194178879261017, + "rewards/rejected": -0.1631508320569992, + "step": 10230 + }, + { + "epoch": 0.67, + "learning_rate": 1.483419212071813e-06, + "logits/chosen": -2.034952163696289, + "logits/rejected": -1.865007996559143, + "logps/chosen": -202.6958770751953, + "logps/rejected": -209.85952758789062, + "loss": 0.035, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07115866988897324, + "rewards/margins": 0.07172514498233795, + "rewards/rejected": -0.1428838074207306, + "step": 10240 + }, + { + "epoch": 0.67, + "learning_rate": 1.478205340286573e-06, + "logits/chosen": -2.1753857135772705, + "logits/rejected": -2.151221990585327, + "logps/chosen": -227.4921417236328, + "logps/rejected": -222.22811889648438, + "loss": 0.0441, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.11941616237163544, + "rewards/margins": 0.0637040063738823, + "rewards/rejected": -0.18312017619609833, + "step": 10250 + }, + { + "epoch": 0.67, + "learning_rate": 1.4729967986828104e-06, + "logits/chosen": -2.3345413208007812, + "logits/rejected": -2.1017842292785645, + "logps/chosen": -332.4571533203125, + "logps/rejected": -302.4991760253906, + "loss": 0.0283, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05236706882715225, + "rewards/margins": 0.09005019068717957, + "rewards/rejected": -0.14241725206375122, + "step": 10260 + }, + { + "epoch": 0.67, + "learning_rate": 1.4677936144308286e-06, + "logits/chosen": -2.3077831268310547, + "logits/rejected": -2.01301646232605, + "logps/chosen": -235.05453491210938, + "logps/rejected": -231.1743621826172, + "loss": 0.0282, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.054049067199230194, + "rewards/margins": 0.12091416120529175, + "rewards/rejected": -0.17496320605278015, + "step": 10270 + }, + { + "epoch": 0.67, + "learning_rate": 1.4625958146729864e-06, + "logits/chosen": -2.3322410583496094, + "logits/rejected": -2.154740571975708, + "logps/chosen": -232.1531982421875, + "logps/rejected": -233.8138427734375, + "loss": 0.021, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.059446774423122406, + "rewards/margins": 0.08804331719875336, + "rewards/rejected": -0.14749008417129517, + "step": 10280 + }, + { + "epoch": 0.67, + "learning_rate": 1.4574034265235523e-06, + "logits/chosen": -2.413435459136963, + "logits/rejected": -1.8607642650604248, + "logps/chosen": -266.5216979980469, + "logps/rejected": -190.62388610839844, + "loss": 0.0369, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0534658320248127, + "rewards/margins": 0.10932525247335434, + "rewards/rejected": -0.16279107332229614, + "step": 10290 + }, + { + "epoch": 0.67, + "learning_rate": 1.452216477068568e-06, + "logits/chosen": -2.2776665687561035, + "logits/rejected": -1.7751963138580322, + "logps/chosen": -241.0727996826172, + "logps/rejected": -170.58828735351562, + "loss": 0.018, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.055139027535915375, + "rewards/margins": 0.11215144395828247, + "rewards/rejected": -0.16729044914245605, + "step": 10300 + }, + { + "epoch": 0.67, + "eval_logits/chosen": -2.255398988723755, + "eval_logits/rejected": -2.0697996616363525, + "eval_logps/chosen": -243.80836486816406, + "eval_logps/rejected": -240.55059814453125, + "eval_loss": 0.024353496730327606, + "eval_rewards/accuracies": 0.6614999771118164, + "eval_rewards/chosen": -0.05901704356074333, + "eval_rewards/margins": 0.08567636460065842, + "eval_rewards/rejected": -0.14469340443611145, + "eval_runtime": 714.1324, + "eval_samples_per_second": 2.801, + "eval_steps_per_second": 1.4, + "step": 10300 + }, + { + "epoch": 0.67, + "learning_rate": 1.4470349933657004e-06, + "logits/chosen": -2.4610915184020996, + "logits/rejected": -2.2262465953826904, + "logps/chosen": -233.5832977294922, + "logps/rejected": -225.6370849609375, + "loss": 0.0275, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06426848471164703, + "rewards/margins": 0.08016308397054672, + "rewards/rejected": -0.14443157613277435, + "step": 10310 + }, + { + "epoch": 0.68, + "learning_rate": 1.4418590024441096e-06, + "logits/chosen": -2.3426105976104736, + "logits/rejected": -1.9475975036621094, + "logps/chosen": -261.2298278808594, + "logps/rejected": -212.2733917236328, + "loss": 0.0216, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0388479009270668, + "rewards/margins": 0.08764450252056122, + "rewards/rejected": -0.12649241089820862, + "step": 10320 + }, + { + "epoch": 0.68, + "learning_rate": 1.436688531304297e-06, + "logits/chosen": -2.340837001800537, + "logits/rejected": -2.0119991302490234, + "logps/chosen": -226.4020538330078, + "logps/rejected": -240.67123413085938, + "loss": 0.026, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.040372543036937714, + "rewards/margins": 0.09879190474748611, + "rewards/rejected": -0.13916444778442383, + "step": 10330 + }, + { + "epoch": 0.68, + "learning_rate": 1.431523606917974e-06, + "logits/chosen": -2.163691282272339, + "logits/rejected": -2.15777850151062, + "logps/chosen": -223.59619140625, + "logps/rejected": -248.87796020507812, + "loss": 0.0276, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09248442202806473, + "rewards/margins": 0.09333528578281403, + "rewards/rejected": -0.18581970036029816, + "step": 10340 + }, + { + "epoch": 0.68, + "learning_rate": 1.4263642562279162e-06, + "logits/chosen": -1.9505666494369507, + "logits/rejected": -1.9221442937850952, + "logps/chosen": -263.3883361816406, + "logps/rejected": -291.3192138671875, + "loss": 0.0182, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06950507313013077, + "rewards/margins": 0.09720407426357269, + "rewards/rejected": -0.16670915484428406, + "step": 10350 + }, + { + "epoch": 0.68, + "learning_rate": 1.4212105061478257e-06, + "logits/chosen": -2.009002447128296, + "logits/rejected": -2.0214123725891113, + "logps/chosen": -245.10855102539062, + "logps/rejected": -272.1604309082031, + "loss": 0.0218, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.07609008252620697, + "rewards/margins": 0.09614210575819016, + "rewards/rejected": -0.17223218083381653, + "step": 10360 + }, + { + "epoch": 0.68, + "learning_rate": 1.4160623835621848e-06, + "logits/chosen": -2.3645236492156982, + "logits/rejected": -2.2268776893615723, + "logps/chosen": -243.3690948486328, + "logps/rejected": -249.6556854248047, + "loss": 0.0173, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03028956614434719, + "rewards/margins": 0.08882386982440948, + "rewards/rejected": -0.11911344528198242, + "step": 10370 + }, + { + "epoch": 0.68, + "learning_rate": 1.4109199153261249e-06, + "logits/chosen": -2.1503076553344727, + "logits/rejected": -2.042171001434326, + "logps/chosen": -286.06243896484375, + "logps/rejected": -275.43499755859375, + "loss": 0.0148, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05033920332789421, + "rewards/margins": 0.09843063354492188, + "rewards/rejected": -0.14876984059810638, + "step": 10380 + }, + { + "epoch": 0.68, + "learning_rate": 1.405783128265278e-06, + "logits/chosen": -2.221115827560425, + "logits/rejected": -2.198319673538208, + "logps/chosen": -217.111328125, + "logps/rejected": -228.8067169189453, + "loss": 0.0177, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06488905847072601, + "rewards/margins": 0.07480791211128235, + "rewards/rejected": -0.13969698548316956, + "step": 10390 + }, + { + "epoch": 0.68, + "learning_rate": 1.4006520491756427e-06, + "logits/chosen": -2.3132808208465576, + "logits/rejected": -2.080735683441162, + "logps/chosen": -205.646484375, + "logps/rejected": -161.129638671875, + "loss": 0.0144, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04711119830608368, + "rewards/margins": 0.10062043368816376, + "rewards/rejected": -0.14773163199424744, + "step": 10400 + }, + { + "epoch": 0.68, + "eval_logits/chosen": -2.2488584518432617, + "eval_logits/rejected": -2.0630223751068115, + "eval_logps/chosen": -239.70639038085938, + "eval_logps/rejected": -236.77072143554688, + "eval_loss": 0.024509282782673836, + "eval_rewards/accuracies": 0.6604999899864197, + "eval_rewards/chosen": -0.03850714862346649, + "eval_rewards/margins": 0.08728697150945663, + "eval_rewards/rejected": -0.12579411268234253, + "eval_runtime": 714.6697, + "eval_samples_per_second": 2.798, + "eval_steps_per_second": 1.399, + "step": 10400 + }, + { + "epoch": 0.68, + "learning_rate": 1.39552670482344e-06, + "logits/chosen": -2.1409950256347656, + "logits/rejected": -2.213547706604004, + "logps/chosen": -184.26515197753906, + "logps/rejected": -197.73403930664062, + "loss": 0.0188, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.048327818512916565, + "rewards/margins": 0.06788711249828339, + "rewards/rejected": -0.11621493101119995, + "step": 10410 + }, + { + "epoch": 0.68, + "learning_rate": 1.3904071219449776e-06, + "logits/chosen": -2.226022243499756, + "logits/rejected": -1.7965023517608643, + "logps/chosen": -202.78274536132812, + "logps/rejected": -136.0773162841797, + "loss": 0.0232, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.014405476860702038, + "rewards/margins": 0.08977358788251877, + "rewards/rejected": -0.10417908430099487, + "step": 10420 + }, + { + "epoch": 0.68, + "learning_rate": 1.3852933272465068e-06, + "logits/chosen": -2.3943543434143066, + "logits/rejected": -2.1661648750305176, + "logps/chosen": -241.92822265625, + "logps/rejected": -220.4378662109375, + "loss": 0.0254, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.020340552553534508, + "rewards/margins": 0.06866742670536041, + "rewards/rejected": -0.08900798857212067, + "step": 10430 + }, + { + "epoch": 0.68, + "learning_rate": 1.3801853474040873e-06, + "logits/chosen": -2.1801600456237793, + "logits/rejected": -2.1125564575195312, + "logps/chosen": -246.2391357421875, + "logps/rejected": -248.90744018554688, + "loss": 0.0248, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03858952969312668, + "rewards/margins": 0.09564275294542313, + "rewards/rejected": -0.1342322826385498, + "step": 10440 + }, + { + "epoch": 0.68, + "learning_rate": 1.3750832090634417e-06, + "logits/chosen": -2.3373451232910156, + "logits/rejected": -2.034323215484619, + "logps/chosen": -191.03738403320312, + "logps/rejected": -193.2030487060547, + "loss": 0.0102, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.024323513731360435, + "rewards/margins": 0.08408524841070175, + "rewards/rejected": -0.10840876400470734, + "step": 10450 + }, + { + "epoch": 0.68, + "learning_rate": 1.3699869388398245e-06, + "logits/chosen": -2.199007034301758, + "logits/rejected": -2.0577285289764404, + "logps/chosen": -220.73062133789062, + "logps/rejected": -220.3302459716797, + "loss": 0.0181, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03854639083147049, + "rewards/margins": 0.09026548266410828, + "rewards/rejected": -0.12881188094615936, + "step": 10460 + }, + { + "epoch": 0.69, + "learning_rate": 1.3648965633178772e-06, + "logits/chosen": -2.2158031463623047, + "logits/rejected": -2.126948118209839, + "logps/chosen": -209.24462890625, + "logps/rejected": -239.54849243164062, + "loss": 0.0313, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04256223887205124, + "rewards/margins": 0.09446152299642563, + "rewards/rejected": -0.13702376186847687, + "step": 10470 + }, + { + "epoch": 0.69, + "learning_rate": 1.3598121090514938e-06, + "logits/chosen": -2.203119993209839, + "logits/rejected": -2.0843684673309326, + "logps/chosen": -190.83518981933594, + "logps/rejected": -183.73692321777344, + "loss": 0.0274, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03913657367229462, + "rewards/margins": 0.08512511104345322, + "rewards/rejected": -0.12426167726516724, + "step": 10480 + }, + { + "epoch": 0.69, + "learning_rate": 1.3547336025636753e-06, + "logits/chosen": -2.131678342819214, + "logits/rejected": -1.9140796661376953, + "logps/chosen": -285.03436279296875, + "logps/rejected": -258.30731201171875, + "loss": 0.0166, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04868927597999573, + "rewards/margins": 0.07400977611541748, + "rewards/rejected": -0.1226990595459938, + "step": 10490 + }, + { + "epoch": 0.69, + "learning_rate": 1.3496610703464022e-06, + "logits/chosen": -2.2886691093444824, + "logits/rejected": -2.0513675212860107, + "logps/chosen": -238.70556640625, + "logps/rejected": -210.661376953125, + "loss": 0.0273, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.047455064952373505, + "rewards/margins": 0.08738575875759125, + "rewards/rejected": -0.13484081625938416, + "step": 10500 + }, + { + "epoch": 0.69, + "eval_logits/chosen": -2.2537121772766113, + "eval_logits/rejected": -2.067767858505249, + "eval_logps/chosen": -240.62744140625, + "eval_logps/rejected": -237.07449340820312, + "eval_loss": 0.024398881942033768, + "eval_rewards/accuracies": 0.656499981880188, + "eval_rewards/chosen": -0.043112535029649734, + "eval_rewards/margins": 0.084200419485569, + "eval_rewards/rejected": -0.12731294333934784, + "eval_runtime": 713.601, + "eval_samples_per_second": 2.803, + "eval_steps_per_second": 1.401, + "step": 10500 + }, + { + "epoch": 0.69, + "learning_rate": 1.3445945388604848e-06, + "logits/chosen": -2.1545567512512207, + "logits/rejected": -1.904229760169983, + "logps/chosen": -255.2687530517578, + "logps/rejected": -235.4828338623047, + "loss": 0.0265, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.08522634953260422, + "rewards/margins": 0.11687376350164413, + "rewards/rejected": -0.20210011303424835, + "step": 10510 + }, + { + "epoch": 0.69, + "learning_rate": 1.3395340345354358e-06, + "logits/chosen": -2.1851556301116943, + "logits/rejected": -2.310978412628174, + "logps/chosen": -235.08981323242188, + "logps/rejected": -270.3477478027344, + "loss": 0.0253, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05695996806025505, + "rewards/margins": 0.09005177766084671, + "rewards/rejected": -0.14701174199581146, + "step": 10520 + }, + { + "epoch": 0.69, + "learning_rate": 1.334479583769322e-06, + "logits/chosen": -2.4132397174835205, + "logits/rejected": -2.111905336380005, + "logps/chosen": -263.5672912597656, + "logps/rejected": -223.38528442382812, + "loss": 0.0273, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04364245384931564, + "rewards/margins": 0.048938460648059845, + "rewards/rejected": -0.09258091449737549, + "step": 10530 + }, + { + "epoch": 0.69, + "learning_rate": 1.3294312129286366e-06, + "logits/chosen": -2.2596263885498047, + "logits/rejected": -2.141056537628174, + "logps/chosen": -272.45751953125, + "logps/rejected": -269.8136291503906, + "loss": 0.0186, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.028644507750868797, + "rewards/margins": 0.05513488128781319, + "rewards/rejected": -0.08377937972545624, + "step": 10540 + }, + { + "epoch": 0.69, + "learning_rate": 1.324388948348153e-06, + "logits/chosen": -2.411508560180664, + "logits/rejected": -2.021793842315674, + "logps/chosen": -291.7307434082031, + "logps/rejected": -228.37838745117188, + "loss": 0.025, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03246372565627098, + "rewards/margins": 0.09007063508033752, + "rewards/rejected": -0.1225343719124794, + "step": 10550 + }, + { + "epoch": 0.69, + "learning_rate": 1.319352816330796e-06, + "logits/chosen": -2.5005040168762207, + "logits/rejected": -1.985607385635376, + "logps/chosen": -291.51397705078125, + "logps/rejected": -204.1352081298828, + "loss": 0.0218, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04158360883593559, + "rewards/margins": 0.0953831896185875, + "rewards/rejected": -0.13696682453155518, + "step": 10560 + }, + { + "epoch": 0.69, + "learning_rate": 1.314322843147494e-06, + "logits/chosen": -2.103975296020508, + "logits/rejected": -2.1999995708465576, + "logps/chosen": -186.9462432861328, + "logps/rejected": -256.3164978027344, + "loss": 0.0222, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.08310200273990631, + "rewards/margins": 0.07251256704330444, + "rewards/rejected": -0.15561453998088837, + "step": 10570 + }, + { + "epoch": 0.69, + "learning_rate": 1.3092990550370526e-06, + "logits/chosen": -2.3676490783691406, + "logits/rejected": -2.032197952270508, + "logps/chosen": -357.0168151855469, + "logps/rejected": -287.89605712890625, + "loss": 0.0162, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05629213899374008, + "rewards/margins": 0.08115239441394806, + "rewards/rejected": -0.13744454085826874, + "step": 10580 + }, + { + "epoch": 0.69, + "learning_rate": 1.3042814782060131e-06, + "logits/chosen": -2.2966275215148926, + "logits/rejected": -2.005599021911621, + "logps/chosen": -182.35781860351562, + "logps/rejected": -181.06105041503906, + "loss": 0.0119, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.004274829290807247, + "rewards/margins": 0.11309751123189926, + "rewards/rejected": -0.11737234890460968, + "step": 10590 + }, + { + "epoch": 0.69, + "learning_rate": 1.2992701388285112e-06, + "logits/chosen": -2.3099424839019775, + "logits/rejected": -2.059408664703369, + "logps/chosen": -274.67364501953125, + "logps/rejected": -245.584228515625, + "loss": 0.0194, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.026622802019119263, + "rewards/margins": 0.0769127830862999, + "rewards/rejected": -0.10353559255599976, + "step": 10600 + }, + { + "epoch": 0.69, + "eval_logits/chosen": -2.2543232440948486, + "eval_logits/rejected": -2.0683767795562744, + "eval_logps/chosen": -240.60284423828125, + "eval_logps/rejected": -237.0673370361328, + "eval_loss": 0.024293892085552216, + "eval_rewards/accuracies": 0.6635000109672546, + "eval_rewards/chosen": -0.04298943653702736, + "eval_rewards/margins": 0.08428782224655151, + "eval_rewards/rejected": -0.12727726995944977, + "eval_runtime": 714.2376, + "eval_samples_per_second": 2.8, + "eval_steps_per_second": 1.4, + "step": 10600 + }, + { + "epoch": 0.69, + "learning_rate": 1.29426506304615e-06, + "logits/chosen": -2.1544384956359863, + "logits/rejected": -2.087916374206543, + "logps/chosen": -239.1622314453125, + "logps/rejected": -227.13082885742188, + "loss": 0.0438, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08219398558139801, + "rewards/margins": 0.03723324462771416, + "rewards/rejected": -0.11942724138498306, + "step": 10610 + }, + { + "epoch": 0.69, + "learning_rate": 1.289266276967855e-06, + "logits/chosen": -2.313107967376709, + "logits/rejected": -2.1951651573181152, + "logps/chosen": -346.13580322265625, + "logps/rejected": -270.1081848144531, + "loss": 0.0207, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.037769950926303864, + "rewards/margins": 0.07065282016992569, + "rewards/rejected": -0.10842277109622955, + "step": 10620 + }, + { + "epoch": 0.7, + "learning_rate": 1.284273806669745e-06, + "logits/chosen": -2.264178514480591, + "logits/rejected": -2.0449130535125732, + "logps/chosen": -255.4210968017578, + "logps/rejected": -291.5980224609375, + "loss": 0.0198, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0804806649684906, + "rewards/margins": 0.09318158775568008, + "rewards/rejected": -0.17366227507591248, + "step": 10630 + }, + { + "epoch": 0.7, + "learning_rate": 1.2792876781949884e-06, + "logits/chosen": -2.0060455799102783, + "logits/rejected": -1.7587082386016846, + "logps/chosen": -216.16738891601562, + "logps/rejected": -214.92654418945312, + "loss": 0.0319, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.050473641604185104, + "rewards/margins": 0.09038424491882324, + "rewards/rejected": -0.14085790514945984, + "step": 10640 + }, + { + "epoch": 0.7, + "learning_rate": 1.274307917553676e-06, + "logits/chosen": -2.256579637527466, + "logits/rejected": -2.1760025024414062, + "logps/chosen": -207.838623046875, + "logps/rejected": -258.51043701171875, + "loss": 0.0265, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.06089919060468674, + "rewards/margins": 0.13036146759986877, + "rewards/rejected": -0.1912606656551361, + "step": 10650 + }, + { + "epoch": 0.7, + "learning_rate": 1.2693345507226767e-06, + "logits/chosen": -2.0505454540252686, + "logits/rejected": -2.1315135955810547, + "logps/chosen": -245.26171875, + "logps/rejected": -271.7505798339844, + "loss": 0.0171, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.07635726779699326, + "rewards/margins": 0.12187687307596207, + "rewards/rejected": -0.19823415577411652, + "step": 10660 + }, + { + "epoch": 0.7, + "learning_rate": 1.2643676036455099e-06, + "logits/chosen": -2.3000295162200928, + "logits/rejected": -2.206305742263794, + "logps/chosen": -299.7503967285156, + "logps/rejected": -259.10772705078125, + "loss": 0.0212, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.059489428997039795, + "rewards/margins": 0.05049192160367966, + "rewards/rejected": -0.10998135805130005, + "step": 10670 + }, + { + "epoch": 0.7, + "learning_rate": 1.259407102232203e-06, + "logits/chosen": -2.3993871212005615, + "logits/rejected": -2.0211315155029297, + "logps/chosen": -288.9398498535156, + "logps/rejected": -235.0491180419922, + "loss": 0.0156, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.05652741715312004, + "rewards/margins": 0.09526638686656952, + "rewards/rejected": -0.15179380774497986, + "step": 10680 + }, + { + "epoch": 0.7, + "learning_rate": 1.254453072359163e-06, + "logits/chosen": -2.3295562267303467, + "logits/rejected": -2.1101839542388916, + "logps/chosen": -242.13516235351562, + "logps/rejected": -235.0244903564453, + "loss": 0.0192, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04057559370994568, + "rewards/margins": 0.07377767562866211, + "rewards/rejected": -0.11435327678918839, + "step": 10690 + }, + { + "epoch": 0.7, + "learning_rate": 1.2495055398690337e-06, + "logits/chosen": -2.418180465698242, + "logits/rejected": -2.1741397380828857, + "logps/chosen": -237.8803253173828, + "logps/rejected": -240.59848022460938, + "loss": 0.0199, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.04012968763709068, + "rewards/margins": 0.06000664830207825, + "rewards/rejected": -0.10013633966445923, + "step": 10700 + }, + { + "epoch": 0.7, + "eval_logits/chosen": -2.2555792331695557, + "eval_logits/rejected": -2.069556713104248, + "eval_logps/chosen": -240.78065490722656, + "eval_logps/rejected": -236.7907257080078, + "eval_loss": 0.02439829520881176, + "eval_rewards/accuracies": 0.659500002861023, + "eval_rewards/chosen": -0.04387851804494858, + "eval_rewards/margins": 0.08201548457145691, + "eval_rewards/rejected": -0.1258939951658249, + "eval_runtime": 716.3713, + "eval_samples_per_second": 2.792, + "eval_steps_per_second": 1.396, + "step": 10700 + }, + { + "epoch": 0.7, + "learning_rate": 1.2445645305705718e-06, + "logits/chosen": -2.4278831481933594, + "logits/rejected": -2.1002821922302246, + "logps/chosen": -218.99618530273438, + "logps/rejected": -210.4351806640625, + "loss": 0.0272, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06337618827819824, + "rewards/margins": 0.06040935590863228, + "rewards/rejected": -0.12378554046154022, + "step": 10710 + }, + { + "epoch": 0.7, + "learning_rate": 1.2396300702384995e-06, + "logits/chosen": -2.4178760051727295, + "logits/rejected": -2.1771225929260254, + "logps/chosen": -267.5671081542969, + "logps/rejected": -238.9056396484375, + "loss": 0.0135, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04879522696137428, + "rewards/margins": 0.045019179582595825, + "rewards/rejected": -0.09381439536809921, + "step": 10720 + }, + { + "epoch": 0.7, + "learning_rate": 1.234702184613381e-06, + "logits/chosen": -2.0416836738586426, + "logits/rejected": -2.115546226501465, + "logps/chosen": -224.7384490966797, + "logps/rejected": -239.72744750976562, + "loss": 0.0159, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.046280622482299805, + "rewards/margins": 0.07662250101566315, + "rewards/rejected": -0.12290313094854355, + "step": 10730 + }, + { + "epoch": 0.7, + "learning_rate": 1.2297808994014793e-06, + "logits/chosen": -2.3566012382507324, + "logits/rejected": -2.110050678253174, + "logps/chosen": -292.0399475097656, + "logps/rejected": -270.0953369140625, + "loss": 0.0192, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.030525337904691696, + "rewards/margins": 0.0647398829460144, + "rewards/rejected": -0.0952652245759964, + "step": 10740 + }, + { + "epoch": 0.7, + "learning_rate": 1.2248662402746314e-06, + "logits/chosen": -2.190117359161377, + "logits/rejected": -2.0638766288757324, + "logps/chosen": -206.7563934326172, + "logps/rejected": -214.2916259765625, + "loss": 0.0326, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08419866859912872, + "rewards/margins": 0.06825531274080276, + "rewards/rejected": -0.1524539738893509, + "step": 10750 + }, + { + "epoch": 0.7, + "learning_rate": 1.2199582328701045e-06, + "logits/chosen": -2.356628179550171, + "logits/rejected": -1.8749473094940186, + "logps/chosen": -301.6092529296875, + "logps/rejected": -279.6309509277344, + "loss": 0.0247, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05487877130508423, + "rewards/margins": 0.09156282991170883, + "rewards/rejected": -0.14644160866737366, + "step": 10760 + }, + { + "epoch": 0.7, + "learning_rate": 1.2150569027904712e-06, + "logits/chosen": -2.2430715560913086, + "logits/rejected": -2.1345303058624268, + "logps/chosen": -255.2360076904297, + "logps/rejected": -265.4273681640625, + "loss": 0.0386, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.046262722462415695, + "rewards/margins": 0.08247127383947372, + "rewards/rejected": -0.12873399257659912, + "step": 10770 + }, + { + "epoch": 0.71, + "learning_rate": 1.2101622756034688e-06, + "logits/chosen": -2.2396297454833984, + "logits/rejected": -2.224565029144287, + "logps/chosen": -230.2205810546875, + "logps/rejected": -216.84671020507812, + "loss": 0.031, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03733275085687637, + "rewards/margins": 0.08495677262544632, + "rewards/rejected": -0.1222895160317421, + "step": 10780 + }, + { + "epoch": 0.71, + "learning_rate": 1.2052743768418715e-06, + "logits/chosen": -2.295220136642456, + "logits/rejected": -2.0792322158813477, + "logps/chosen": -252.6884002685547, + "logps/rejected": -236.13229370117188, + "loss": 0.0098, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.024396592751145363, + "rewards/margins": 0.08725408464670181, + "rewards/rejected": -0.11165066808462143, + "step": 10790 + }, + { + "epoch": 0.71, + "learning_rate": 1.2003932320033523e-06, + "logits/chosen": -2.3979673385620117, + "logits/rejected": -2.0977725982666016, + "logps/chosen": -231.81423950195312, + "logps/rejected": -254.93875122070312, + "loss": 0.0349, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02655757963657379, + "rewards/margins": 0.11246329545974731, + "rewards/rejected": -0.1390208601951599, + "step": 10800 + }, + { + "epoch": 0.71, + "eval_logits/chosen": -2.2533140182495117, + "eval_logits/rejected": -2.0673305988311768, + "eval_logps/chosen": -239.88389587402344, + "eval_logps/rejected": -236.12091064453125, + "eval_loss": 0.02448507584631443, + "eval_rewards/accuracies": 0.6585000157356262, + "eval_rewards/chosen": -0.03939465060830116, + "eval_rewards/margins": 0.08315033465623856, + "eval_rewards/rejected": -0.12254498898983002, + "eval_runtime": 714.098, + "eval_samples_per_second": 2.801, + "eval_steps_per_second": 1.4, + "step": 10800 + }, + { + "epoch": 0.71, + "learning_rate": 1.1955188665503553e-06, + "logits/chosen": -2.064089059829712, + "logits/rejected": -2.1178905963897705, + "logps/chosen": -217.57968139648438, + "logps/rejected": -211.15603637695312, + "loss": 0.0306, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06227996200323105, + "rewards/margins": 0.07183237373828888, + "rewards/rejected": -0.13411234319210052, + "step": 10810 + }, + { + "epoch": 0.71, + "learning_rate": 1.1906513059099566e-06, + "logits/chosen": -2.2473361492156982, + "logits/rejected": -1.9340236186981201, + "logps/chosen": -237.6453857421875, + "logps/rejected": -251.6685028076172, + "loss": 0.0182, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.030781585723161697, + "rewards/margins": 0.11381824314594269, + "rewards/rejected": -0.1445998251438141, + "step": 10820 + }, + { + "epoch": 0.71, + "learning_rate": 1.185790575473738e-06, + "logits/chosen": -2.1922922134399414, + "logits/rejected": -1.9812322854995728, + "logps/chosen": -237.2852325439453, + "logps/rejected": -210.5954132080078, + "loss": 0.0278, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.040759555995464325, + "rewards/margins": 0.1029219776391983, + "rewards/rejected": -0.14368152618408203, + "step": 10830 + }, + { + "epoch": 0.71, + "learning_rate": 1.1809367005976516e-06, + "logits/chosen": -2.258755683898926, + "logits/rejected": -2.0178275108337402, + "logps/chosen": -290.83380126953125, + "logps/rejected": -230.97933959960938, + "loss": 0.0348, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.041695039719343185, + "rewards/margins": 0.05596474930644035, + "rewards/rejected": -0.09765978157520294, + "step": 10840 + }, + { + "epoch": 0.71, + "learning_rate": 1.1760897066018842e-06, + "logits/chosen": -2.1914420127868652, + "logits/rejected": -2.0563783645629883, + "logps/chosen": -229.926513671875, + "logps/rejected": -243.2219696044922, + "loss": 0.0136, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03442706912755966, + "rewards/margins": 0.09646196663379669, + "rewards/rejected": -0.13088904321193695, + "step": 10850 + }, + { + "epoch": 0.71, + "learning_rate": 1.1712496187707327e-06, + "logits/chosen": -2.219277858734131, + "logits/rejected": -1.9796173572540283, + "logps/chosen": -254.57894897460938, + "logps/rejected": -267.26800537109375, + "loss": 0.0609, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.07137357443571091, + "rewards/margins": 0.1337740272283554, + "rewards/rejected": -0.20514757931232452, + "step": 10860 + }, + { + "epoch": 0.71, + "learning_rate": 1.1664164623524646e-06, + "logits/chosen": -2.1844263076782227, + "logits/rejected": -2.0241284370422363, + "logps/chosen": -224.78298950195312, + "logps/rejected": -212.51327514648438, + "loss": 0.0336, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03203979507088661, + "rewards/margins": 0.07993793487548828, + "rewards/rejected": -0.11197773367166519, + "step": 10870 + }, + { + "epoch": 0.71, + "learning_rate": 1.1615902625591926e-06, + "logits/chosen": -2.1608195304870605, + "logits/rejected": -2.0687127113342285, + "logps/chosen": -240.38119506835938, + "logps/rejected": -244.77163696289062, + "loss": 0.0303, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07579996436834335, + "rewards/margins": 0.06902037560939789, + "rewards/rejected": -0.14482033252716064, + "step": 10880 + }, + { + "epoch": 0.71, + "learning_rate": 1.156771044566738e-06, + "logits/chosen": -2.2513537406921387, + "logits/rejected": -2.177926540374756, + "logps/chosen": -276.24200439453125, + "logps/rejected": -247.43820190429688, + "loss": 0.0151, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05813673138618469, + "rewards/margins": 0.0720682367682457, + "rewards/rejected": -0.1302049607038498, + "step": 10890 + }, + { + "epoch": 0.71, + "learning_rate": 1.1519588335145037e-06, + "logits/chosen": -2.1998701095581055, + "logits/rejected": -2.3404908180236816, + "logps/chosen": -214.55245971679688, + "logps/rejected": -241.935302734375, + "loss": 0.0294, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03486829251050949, + "rewards/margins": 0.04341721534729004, + "rewards/rejected": -0.07828550785779953, + "step": 10900 + }, + { + "epoch": 0.71, + "eval_logits/chosen": -2.255387783050537, + "eval_logits/rejected": -2.0696256160736084, + "eval_logps/chosen": -241.1903839111328, + "eval_logps/rejected": -236.889892578125, + "eval_loss": 0.02456137165427208, + "eval_rewards/accuracies": 0.6614999771118164, + "eval_rewards/chosen": -0.04592716321349144, + "eval_rewards/margins": 0.0804627314209938, + "eval_rewards/rejected": -0.12638989090919495, + "eval_runtime": 712.0708, + "eval_samples_per_second": 2.809, + "eval_steps_per_second": 1.404, + "step": 10900 + }, + { + "epoch": 0.71, + "learning_rate": 1.1471536545053382e-06, + "logits/chosen": -2.291710138320923, + "logits/rejected": -2.275481700897217, + "logps/chosen": -214.15664672851562, + "logps/rejected": -240.5742950439453, + "loss": 0.0439, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.025608647614717484, + "rewards/margins": 0.094327412545681, + "rewards/rejected": -0.11993608623743057, + "step": 10910 + }, + { + "epoch": 0.71, + "learning_rate": 1.1423555326054112e-06, + "logits/chosen": -2.1900248527526855, + "logits/rejected": -1.9168975353240967, + "logps/chosen": -289.61859130859375, + "logps/rejected": -248.3450927734375, + "loss": 0.018, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.03392020985484123, + "rewards/margins": 0.1608126163482666, + "rewards/rejected": -0.19473282992839813, + "step": 10920 + }, + { + "epoch": 0.72, + "learning_rate": 1.1375644928440743e-06, + "logits/chosen": -2.3279836177825928, + "logits/rejected": -1.9061031341552734, + "logps/chosen": -244.189208984375, + "logps/rejected": -197.33164978027344, + "loss": 0.0143, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.037529680877923965, + "rewards/margins": 0.10323164612054825, + "rewards/rejected": -0.14076131582260132, + "step": 10930 + }, + { + "epoch": 0.72, + "learning_rate": 1.1327805602137396e-06, + "logits/chosen": -2.2742772102355957, + "logits/rejected": -2.094223976135254, + "logps/chosen": -272.142333984375, + "logps/rejected": -235.36026000976562, + "loss": 0.0187, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0669911801815033, + "rewards/margins": 0.0809796154499054, + "rewards/rejected": -0.1479707956314087, + "step": 10940 + }, + { + "epoch": 0.72, + "learning_rate": 1.1280037596697426e-06, + "logits/chosen": -2.121037006378174, + "logits/rejected": -2.0819196701049805, + "logps/chosen": -233.88143920898438, + "logps/rejected": -297.5550231933594, + "loss": 0.0321, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06499499827623367, + "rewards/margins": 0.1337338089942932, + "rewards/rejected": -0.1987287998199463, + "step": 10950 + }, + { + "epoch": 0.72, + "learning_rate": 1.123234116130216e-06, + "logits/chosen": -2.1778290271759033, + "logits/rejected": -2.141162872314453, + "logps/chosen": -196.35324096679688, + "logps/rejected": -225.81494140625, + "loss": 0.0302, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05232428386807442, + "rewards/margins": 0.11703711748123169, + "rewards/rejected": -0.16936138272285461, + "step": 10960 + }, + { + "epoch": 0.72, + "learning_rate": 1.1184716544759553e-06, + "logits/chosen": -2.033923625946045, + "logits/rejected": -2.009354591369629, + "logps/chosen": -174.05067443847656, + "logps/rejected": -196.1552734375, + "loss": 0.0388, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.043432433158159256, + "rewards/margins": 0.04845009371638298, + "rewards/rejected": -0.09188252687454224, + "step": 10970 + }, + { + "epoch": 0.72, + "learning_rate": 1.1137163995502948e-06, + "logits/chosen": -2.4753432273864746, + "logits/rejected": -2.243244171142578, + "logps/chosen": -228.891357421875, + "logps/rejected": -211.28121948242188, + "loss": 0.0177, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0367816723883152, + "rewards/margins": 0.07545877248048782, + "rewards/rejected": -0.11224043369293213, + "step": 10980 + }, + { + "epoch": 0.72, + "learning_rate": 1.1089683761589717e-06, + "logits/chosen": -2.096439838409424, + "logits/rejected": -1.9385840892791748, + "logps/chosen": -244.000732421875, + "logps/rejected": -249.7571258544922, + "loss": 0.0134, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.01897195540368557, + "rewards/margins": 0.12335314601659775, + "rewards/rejected": -0.14232513308525085, + "step": 10990 + }, + { + "epoch": 0.72, + "learning_rate": 1.1042276090700044e-06, + "logits/chosen": -2.251804828643799, + "logits/rejected": -2.199323892593384, + "logps/chosen": -226.4453582763672, + "logps/rejected": -269.26055908203125, + "loss": 0.0493, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.08071614801883698, + "rewards/margins": 0.06322924047708511, + "rewards/rejected": -0.1439453810453415, + "step": 11000 + }, + { + "epoch": 0.72, + "eval_logits/chosen": -2.257098436355591, + "eval_logits/rejected": -2.0713884830474854, + "eval_logps/chosen": -240.1348876953125, + "eval_logps/rejected": -235.5289306640625, + "eval_loss": 0.02467404119670391, + "eval_rewards/accuracies": 0.6554999947547913, + "eval_rewards/chosen": -0.04064975306391716, + "eval_rewards/margins": 0.07893543690443039, + "eval_rewards/rejected": -0.11958518624305725, + "eval_runtime": 712.198, + "eval_samples_per_second": 2.808, + "eval_steps_per_second": 1.404, + "step": 11000 + }, + { + "epoch": 0.72, + "learning_rate": 1.0994941230135536e-06, + "logits/chosen": -2.217406749725342, + "logits/rejected": -1.925784707069397, + "logps/chosen": -237.1549835205078, + "logps/rejected": -228.53414916992188, + "loss": 0.0112, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.02152491733431816, + "rewards/margins": 0.1371946781873703, + "rewards/rejected": -0.15871959924697876, + "step": 11010 + }, + { + "epoch": 0.72, + "learning_rate": 1.094767942681804e-06, + "logits/chosen": -2.495387554168701, + "logits/rejected": -2.0764713287353516, + "logps/chosen": -265.5306701660156, + "logps/rejected": -244.12820434570312, + "loss": 0.0315, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08830820024013519, + "rewards/margins": 0.08899056166410446, + "rewards/rejected": -0.17729876935482025, + "step": 11020 + }, + { + "epoch": 0.72, + "learning_rate": 1.0900490927288248e-06, + "logits/chosen": -2.0295703411102295, + "logits/rejected": -1.984675645828247, + "logps/chosen": -272.8847351074219, + "logps/rejected": -235.7385711669922, + "loss": 0.0242, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06208237260580063, + "rewards/margins": 0.06165067106485367, + "rewards/rejected": -0.1237330436706543, + "step": 11030 + }, + { + "epoch": 0.72, + "learning_rate": 1.0853375977704511e-06, + "logits/chosen": -2.2647392749786377, + "logits/rejected": -2.0776801109313965, + "logps/chosen": -249.8405303955078, + "logps/rejected": -199.8957061767578, + "loss": 0.0273, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.047079846262931824, + "rewards/margins": 0.07032942026853561, + "rewards/rejected": -0.11740926653146744, + "step": 11040 + }, + { + "epoch": 0.72, + "learning_rate": 1.0806334823841466e-06, + "logits/chosen": -2.110968589782715, + "logits/rejected": -2.217334747314453, + "logps/chosen": -254.81399536132812, + "logps/rejected": -295.74554443359375, + "loss": 0.0395, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.07599040865898132, + "rewards/margins": 0.05764209106564522, + "rewards/rejected": -0.13363249599933624, + "step": 11050 + }, + { + "epoch": 0.72, + "learning_rate": 1.0759367711088825e-06, + "logits/chosen": -2.1156725883483887, + "logits/rejected": -2.253272294998169, + "logps/chosen": -192.5499725341797, + "logps/rejected": -236.18276977539062, + "loss": 0.0224, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.043587371706962585, + "rewards/margins": 0.05077790096402168, + "rewards/rejected": -0.09436526894569397, + "step": 11060 + }, + { + "epoch": 0.72, + "learning_rate": 1.0712474884450056e-06, + "logits/chosen": -2.2297728061676025, + "logits/rejected": -2.032270908355713, + "logps/chosen": -209.87161254882812, + "logps/rejected": -199.3758544921875, + "loss": 0.0533, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.035191889852285385, + "rewards/margins": 0.09145514667034149, + "rewards/rejected": -0.12664702534675598, + "step": 11070 + }, + { + "epoch": 0.72, + "learning_rate": 1.066565658854112e-06, + "logits/chosen": -2.174771547317505, + "logits/rejected": -2.1367924213409424, + "logps/chosen": -139.6210479736328, + "logps/rejected": -152.4537811279297, + "loss": 0.0249, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.058945734053850174, + "rewards/margins": 0.0803261548280716, + "rewards/rejected": -0.13927188515663147, + "step": 11080 + }, + { + "epoch": 0.73, + "learning_rate": 1.0618913067589165e-06, + "logits/chosen": -2.283958673477173, + "logits/rejected": -2.0666098594665527, + "logps/chosen": -222.7408905029297, + "logps/rejected": -204.1822509765625, + "loss": 0.0378, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.034630995243787766, + "rewards/margins": 0.09450625628232956, + "rewards/rejected": -0.12913724780082703, + "step": 11090 + }, + { + "epoch": 0.73, + "learning_rate": 1.0572244565431313e-06, + "logits/chosen": -2.13905668258667, + "logits/rejected": -2.020878553390503, + "logps/chosen": -157.81869506835938, + "logps/rejected": -179.3304901123047, + "loss": 0.0186, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06200949102640152, + "rewards/margins": 0.08175288140773773, + "rewards/rejected": -0.14376236498355865, + "step": 11100 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -2.2600183486938477, + "eval_logits/rejected": -2.0741000175476074, + "eval_logps/chosen": -239.24647521972656, + "eval_logps/rejected": -235.19859313964844, + "eval_loss": 0.02463115192949772, + "eval_rewards/accuracies": 0.6604999899864197, + "eval_rewards/chosen": -0.036207519471645355, + "eval_rewards/margins": 0.08172591030597687, + "eval_rewards/rejected": -0.11793343722820282, + "eval_runtime": 712.2662, + "eval_samples_per_second": 2.808, + "eval_steps_per_second": 1.404, + "step": 11100 + }, + { + "epoch": 0.73, + "learning_rate": 1.0525651325513317e-06, + "logits/chosen": -2.2700681686401367, + "logits/rejected": -2.2490804195404053, + "logps/chosen": -337.8574523925781, + "logps/rejected": -326.71588134765625, + "loss": 0.0254, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03293894976377487, + "rewards/margins": 0.05006919056177139, + "rewards/rejected": -0.08300813287496567, + "step": 11110 + }, + { + "epoch": 0.73, + "learning_rate": 1.0479133590888351e-06, + "logits/chosen": -2.3007349967956543, + "logits/rejected": -2.011467695236206, + "logps/chosen": -262.3280334472656, + "logps/rejected": -254.04037475585938, + "loss": 0.0181, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03436540812253952, + "rewards/margins": 0.09342274814844131, + "rewards/rejected": -0.12778815627098083, + "step": 11120 + }, + { + "epoch": 0.73, + "learning_rate": 1.0432691604215695e-06, + "logits/chosen": -2.1729531288146973, + "logits/rejected": -2.0845744609832764, + "logps/chosen": -236.9325714111328, + "logps/rejected": -216.92691040039062, + "loss": 0.0292, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.00942798238247633, + "rewards/margins": 0.058537401258945465, + "rewards/rejected": -0.06796539574861526, + "step": 11130 + }, + { + "epoch": 0.73, + "learning_rate": 1.0386325607759515e-06, + "logits/chosen": -2.188417911529541, + "logits/rejected": -2.1244304180145264, + "logps/chosen": -193.04457092285156, + "logps/rejected": -190.75962829589844, + "loss": 0.0263, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008272209204733372, + "rewards/margins": 0.08892752230167389, + "rewards/rejected": -0.09719973802566528, + "step": 11140 + }, + { + "epoch": 0.73, + "learning_rate": 1.0340035843387544e-06, + "logits/chosen": -2.2836594581604004, + "logits/rejected": -1.9597225189208984, + "logps/chosen": -187.1991729736328, + "logps/rejected": -174.34954833984375, + "loss": 0.0164, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.037318091839551926, + "rewards/margins": 0.07366606593132019, + "rewards/rejected": -0.11098414659500122, + "step": 11150 + }, + { + "epoch": 0.73, + "learning_rate": 1.0293822552569887e-06, + "logits/chosen": -2.401132106781006, + "logits/rejected": -2.1063313484191895, + "logps/chosen": -257.7044372558594, + "logps/rejected": -222.4474639892578, + "loss": 0.0185, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02910296991467476, + "rewards/margins": 0.10286752879619598, + "rewards/rejected": -0.13197049498558044, + "step": 11160 + }, + { + "epoch": 0.73, + "learning_rate": 1.0247685976377688e-06, + "logits/chosen": -2.1574559211730957, + "logits/rejected": -1.9853312969207764, + "logps/chosen": -191.28790283203125, + "logps/rejected": -168.7615966796875, + "loss": 0.0258, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04801440238952637, + "rewards/margins": 0.08152531087398529, + "rewards/rejected": -0.12953971326351166, + "step": 11170 + }, + { + "epoch": 0.73, + "learning_rate": 1.0201626355481939e-06, + "logits/chosen": -2.3552398681640625, + "logits/rejected": -2.0675008296966553, + "logps/chosen": -225.4108428955078, + "logps/rejected": -189.56910705566406, + "loss": 0.0137, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04645168036222458, + "rewards/margins": 0.08859079331159592, + "rewards/rejected": -0.1350424736738205, + "step": 11180 + }, + { + "epoch": 0.73, + "learning_rate": 1.0155643930152192e-06, + "logits/chosen": -2.370053768157959, + "logits/rejected": -2.282047986984253, + "logps/chosen": -281.9950866699219, + "logps/rejected": -237.9798126220703, + "loss": 0.0163, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04693884029984474, + "rewards/margins": 0.05315285921096802, + "rewards/rejected": -0.10009171068668365, + "step": 11190 + }, + { + "epoch": 0.73, + "learning_rate": 1.0109738940255286e-06, + "logits/chosen": -2.197000503540039, + "logits/rejected": -1.9323844909667969, + "logps/chosen": -222.72396850585938, + "logps/rejected": -200.8384246826172, + "loss": 0.0233, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.028707217425107956, + "rewards/margins": 0.07697827368974686, + "rewards/rejected": -0.10568549484014511, + "step": 11200 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -2.260996103286743, + "eval_logits/rejected": -2.0750019550323486, + "eval_logps/chosen": -237.5009002685547, + "eval_logps/rejected": -233.3054962158203, + "eval_loss": 0.02466612309217453, + "eval_rewards/accuracies": 0.6585000157356262, + "eval_rewards/chosen": -0.027479784563183784, + "eval_rewards/margins": 0.08098819851875305, + "eval_rewards/rejected": -0.10846797376871109, + "eval_runtime": 713.1565, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 1.402, + "step": 11200 + }, + { + "epoch": 0.73, + "learning_rate": 1.0063911625254155e-06, + "logits/chosen": -2.264378070831299, + "logits/rejected": -2.126863718032837, + "logps/chosen": -234.0228729248047, + "logps/rejected": -250.18081665039062, + "loss": 0.0234, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.009498958475887775, + "rewards/margins": 0.07764261215925217, + "rewards/rejected": -0.06814365088939667, + "step": 11210 + }, + { + "epoch": 0.73, + "learning_rate": 1.0018162224206502e-06, + "logits/chosen": -2.174161434173584, + "logits/rejected": -2.084477663040161, + "logps/chosen": -183.6377410888672, + "logps/rejected": -209.85507202148438, + "loss": 0.0199, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.045814670622348785, + "rewards/margins": 0.11253632605075836, + "rewards/rejected": -0.15835098922252655, + "step": 11220 + }, + { + "epoch": 0.73, + "learning_rate": 9.97249097576363e-07, + "logits/chosen": -2.424260377883911, + "logits/rejected": -2.1379024982452393, + "logps/chosen": -238.74502563476562, + "logps/rejected": -215.4193878173828, + "loss": 0.0288, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.028442194685339928, + "rewards/margins": 0.10083095729351044, + "rewards/rejected": -0.12927314639091492, + "step": 11230 + }, + { + "epoch": 0.74, + "learning_rate": 9.92689811816913e-07, + "logits/chosen": -2.290501594543457, + "logits/rejected": -2.0555148124694824, + "logps/chosen": -225.33627319335938, + "logps/rejected": -193.66452026367188, + "loss": 0.0369, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06407545506954193, + "rewards/margins": 0.07865617424249649, + "rewards/rejected": -0.14273162186145782, + "step": 11240 + }, + { + "epoch": 0.74, + "learning_rate": 9.881383889257691e-07, + "logits/chosen": -2.2328104972839355, + "logits/rejected": -2.2730355262756348, + "logps/chosen": -174.77767944335938, + "logps/rejected": -251.2799530029297, + "loss": 0.0145, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.027403950691223145, + "rewards/margins": 0.06211910769343376, + "rewards/rejected": -0.0895230621099472, + "step": 11250 + }, + { + "epoch": 0.74, + "learning_rate": 9.835948526453817e-07, + "logits/chosen": -2.0998029708862305, + "logits/rejected": -2.2713968753814697, + "logps/chosen": -190.5878448486328, + "logps/rejected": -244.7119598388672, + "loss": 0.0368, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05079245567321777, + "rewards/margins": 0.052261579781770706, + "rewards/rejected": -0.10305403172969818, + "step": 11260 + }, + { + "epoch": 0.74, + "learning_rate": 9.790592266770633e-07, + "logits/chosen": -2.4601409435272217, + "logits/rejected": -2.1775612831115723, + "logps/chosen": -276.0139465332031, + "logps/rejected": -262.95062255859375, + "loss": 0.0285, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03334607556462288, + "rewards/margins": 0.07587692886590958, + "rewards/rejected": -0.10922299325466156, + "step": 11270 + }, + { + "epoch": 0.74, + "learning_rate": 9.745315346808584e-07, + "logits/chosen": -2.1334099769592285, + "logits/rejected": -1.9849653244018555, + "logps/chosen": -223.7794952392578, + "logps/rejected": -214.8672637939453, + "loss": 0.0326, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.02352016046643257, + "rewards/margins": 0.0673544630408287, + "rewards/rejected": -0.09087462723255157, + "step": 11280 + }, + { + "epoch": 0.74, + "learning_rate": 9.70011800275428e-07, + "logits/chosen": -2.1849100589752197, + "logits/rejected": -2.1144683361053467, + "logps/chosen": -245.4765625, + "logps/rejected": -275.33837890625, + "loss": 0.022, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03975829482078552, + "rewards/margins": 0.09207911789417267, + "rewards/rejected": -0.1318374127149582, + "step": 11290 + }, + { + "epoch": 0.74, + "learning_rate": 9.655000470379206e-07, + "logits/chosen": -2.104613780975342, + "logits/rejected": -2.0167109966278076, + "logps/chosen": -219.29104614257812, + "logps/rejected": -230.323486328125, + "loss": 0.0218, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04471781104803085, + "rewards/margins": 0.10857494175434113, + "rewards/rejected": -0.15329274535179138, + "step": 11300 + }, + { + "epoch": 0.74, + "eval_logits/chosen": -2.262881278991699, + "eval_logits/rejected": -2.0764498710632324, + "eval_logps/chosen": -239.4001007080078, + "eval_logps/rejected": -235.6196746826172, + "eval_loss": 0.02440127171576023, + "eval_rewards/accuracies": 0.6575000286102295, + "eval_rewards/chosen": -0.036975663155317307, + "eval_rewards/margins": 0.08306314796209335, + "eval_rewards/rejected": -0.12003880739212036, + "eval_runtime": 713.3374, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 1.402, + "step": 11300 + }, + { + "epoch": 0.74, + "learning_rate": 9.609962985038517e-07, + "logits/chosen": -2.3760733604431152, + "logits/rejected": -1.9836667776107788, + "logps/chosen": -223.32772827148438, + "logps/rejected": -223.2099609375, + "loss": 0.0285, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03983340784907341, + "rewards/margins": 0.1250358521938324, + "rewards/rejected": -0.1648692637681961, + "step": 11310 + }, + { + "epoch": 0.74, + "learning_rate": 9.565005781669786e-07, + "logits/chosen": -2.4255800247192383, + "logits/rejected": -2.0649266242980957, + "logps/chosen": -265.18695068359375, + "logps/rejected": -232.25265502929688, + "loss": 0.0244, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.022981833666563034, + "rewards/margins": 0.10189278423786163, + "rewards/rejected": -0.12487462908029556, + "step": 11320 + }, + { + "epoch": 0.74, + "learning_rate": 9.520129094791822e-07, + "logits/chosen": -2.2258224487304688, + "logits/rejected": -2.0585074424743652, + "logps/chosen": -191.6188507080078, + "logps/rejected": -201.65322875976562, + "loss": 0.0376, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.06061561033129692, + "rewards/margins": 0.10118802636861801, + "rewards/rejected": -0.16180363297462463, + "step": 11330 + }, + { + "epoch": 0.74, + "learning_rate": 9.475333158503389e-07, + "logits/chosen": -2.216360569000244, + "logits/rejected": -1.9243495464324951, + "logps/chosen": -221.27688598632812, + "logps/rejected": -187.52163696289062, + "loss": 0.0248, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.017273029312491417, + "rewards/margins": 0.057016678154468536, + "rewards/rejected": -0.0742897093296051, + "step": 11340 + }, + { + "epoch": 0.74, + "learning_rate": 9.430618206482053e-07, + "logits/chosen": -2.187056064605713, + "logits/rejected": -2.1244726181030273, + "logps/chosen": -144.8751220703125, + "logps/rejected": -155.0688018798828, + "loss": 0.0175, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.022261008620262146, + "rewards/margins": 0.047990910708904266, + "rewards/rejected": -0.07025192677974701, + "step": 11350 + }, + { + "epoch": 0.74, + "learning_rate": 9.385984471982892e-07, + "logits/chosen": -2.168158531188965, + "logits/rejected": -1.8052467107772827, + "logps/chosen": -222.80551147460938, + "logps/rejected": -199.8382568359375, + "loss": 0.0185, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03989826887845993, + "rewards/margins": 0.1388367861509323, + "rewards/rejected": -0.17873504757881165, + "step": 11360 + }, + { + "epoch": 0.74, + "learning_rate": 9.341432187837343e-07, + "logits/chosen": -2.2368786334991455, + "logits/rejected": -2.144381046295166, + "logps/chosen": -196.49130249023438, + "logps/rejected": -232.60446166992188, + "loss": 0.0365, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.030937695875763893, + "rewards/margins": 0.1062239557504654, + "rewards/rejected": -0.13716165721416473, + "step": 11370 + }, + { + "epoch": 0.74, + "learning_rate": 9.29696158645193e-07, + "logits/chosen": -2.1705503463745117, + "logits/rejected": -2.2710890769958496, + "logps/chosen": -225.888427734375, + "logps/rejected": -282.481201171875, + "loss": 0.0152, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.023930717259645462, + "rewards/margins": 0.13801470398902893, + "rewards/rejected": -0.1619454324245453, + "step": 11380 + }, + { + "epoch": 0.75, + "learning_rate": 9.252572899807111e-07, + "logits/chosen": -2.2330169677734375, + "logits/rejected": -2.245069742202759, + "logps/chosen": -278.10125732421875, + "logps/rejected": -275.082763671875, + "loss": 0.0087, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.03750836104154587, + "rewards/margins": 0.11584211885929108, + "rewards/rejected": -0.15335044264793396, + "step": 11390 + }, + { + "epoch": 0.75, + "learning_rate": 9.208266359456003e-07, + "logits/chosen": -2.317948818206787, + "logits/rejected": -2.0604913234710693, + "logps/chosen": -209.72067260742188, + "logps/rejected": -233.62484741210938, + "loss": 0.0365, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.01957458071410656, + "rewards/margins": 0.07166720926761627, + "rewards/rejected": -0.09124179929494858, + "step": 11400 + }, + { + "epoch": 0.75, + "eval_logits/chosen": -2.2583706378936768, + "eval_logits/rejected": -2.0719101428985596, + "eval_logps/chosen": -239.11155700683594, + "eval_logps/rejected": -236.07205200195312, + "eval_loss": 0.024521106854081154, + "eval_rewards/accuracies": 0.6579999923706055, + "eval_rewards/chosen": -0.03553308546543121, + "eval_rewards/margins": 0.08676765114068985, + "eval_rewards/rejected": -0.12230074405670166, + "eval_runtime": 715.1436, + "eval_samples_per_second": 2.797, + "eval_steps_per_second": 1.398, + "step": 11400 + }, + { + "epoch": 0.75, + "learning_rate": 9.164042196523229e-07, + "logits/chosen": -2.4343605041503906, + "logits/rejected": -2.129931688308716, + "logps/chosen": -198.1632843017578, + "logps/rejected": -204.4161834716797, + "loss": 0.0197, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.043811190873384476, + "rewards/margins": 0.09885282069444656, + "rewards/rejected": -0.14266401529312134, + "step": 11410 + }, + { + "epoch": 0.75, + "learning_rate": 9.119900641703696e-07, + "logits/chosen": -2.3990368843078613, + "logits/rejected": -2.1311213970184326, + "logps/chosen": -223.1060028076172, + "logps/rejected": -191.94911193847656, + "loss": 0.0274, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04829835519194603, + "rewards/margins": 0.07589010894298553, + "rewards/rejected": -0.12418848276138306, + "step": 11420 + }, + { + "epoch": 0.75, + "learning_rate": 9.075841925261364e-07, + "logits/chosen": -2.5026679039001465, + "logits/rejected": -2.263340711593628, + "logps/chosen": -242.7382049560547, + "logps/rejected": -243.7560577392578, + "loss": 0.047, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.02409534528851509, + "rewards/margins": 0.07723324745893478, + "rewards/rejected": -0.10132858902215958, + "step": 11430 + }, + { + "epoch": 0.75, + "learning_rate": 9.031866277028093e-07, + "logits/chosen": -2.1907057762145996, + "logits/rejected": -2.2167811393737793, + "logps/chosen": -199.39405822753906, + "logps/rejected": -242.85537719726562, + "loss": 0.0152, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.041596420109272, + "rewards/margins": 0.083258256316185, + "rewards/rejected": -0.1248546615242958, + "step": 11440 + }, + { + "epoch": 0.75, + "learning_rate": 8.987973926402391e-07, + "logits/chosen": -2.1543126106262207, + "logits/rejected": -2.1936657428741455, + "logps/chosen": -219.5368194580078, + "logps/rejected": -235.53662109375, + "loss": 0.0358, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.035986222326755524, + "rewards/margins": 0.08293595165014267, + "rewards/rejected": -0.1189221516251564, + "step": 11450 + }, + { + "epoch": 0.75, + "learning_rate": 8.944165102348273e-07, + "logits/chosen": -2.370535373687744, + "logits/rejected": -2.220362424850464, + "logps/chosen": -157.20303344726562, + "logps/rejected": -187.8885498046875, + "loss": 0.0548, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0012795614311471581, + "rewards/margins": 0.11714081466197968, + "rewards/rejected": -0.11842037737369537, + "step": 11460 + }, + { + "epoch": 0.75, + "learning_rate": 8.900440033394018e-07, + "logits/chosen": -2.189187526702881, + "logits/rejected": -2.238858461380005, + "logps/chosen": -187.5149383544922, + "logps/rejected": -192.39752197265625, + "loss": 0.0235, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.024993527680635452, + "rewards/margins": 0.07084139436483383, + "rewards/rejected": -0.09583492577075958, + "step": 11470 + }, + { + "epoch": 0.75, + "learning_rate": 8.856798947631009e-07, + "logits/chosen": -2.2251715660095215, + "logits/rejected": -2.2451891899108887, + "logps/chosen": -195.27099609375, + "logps/rejected": -234.95126342773438, + "loss": 0.0211, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.021976452320814133, + "rewards/margins": 0.1136414036154747, + "rewards/rejected": -0.13561786711215973, + "step": 11480 + }, + { + "epoch": 0.75, + "learning_rate": 8.813242072712519e-07, + "logits/chosen": -1.963451623916626, + "logits/rejected": -1.8482892513275146, + "logps/chosen": -173.51779174804688, + "logps/rejected": -187.3588409423828, + "loss": 0.0343, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.05827740579843521, + "rewards/margins": 0.06352510303258896, + "rewards/rejected": -0.12180250883102417, + "step": 11490 + }, + { + "epoch": 0.75, + "learning_rate": 8.769769635852557e-07, + "logits/chosen": -2.191650867462158, + "logits/rejected": -2.2480854988098145, + "logps/chosen": -219.7031707763672, + "logps/rejected": -196.14064025878906, + "loss": 0.0199, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.021976929157972336, + "rewards/margins": 0.06597265601158142, + "rewards/rejected": -0.08794957399368286, + "step": 11500 + }, + { + "epoch": 0.75, + "eval_logits/chosen": -2.26953387260437, + "eval_logits/rejected": -2.0827267169952393, + "eval_logps/chosen": -238.35740661621094, + "eval_logps/rejected": -233.97021484375, + "eval_loss": 0.02457558736205101, + "eval_rewards/accuracies": 0.6589999794960022, + "eval_rewards/chosen": -0.03176227957010269, + "eval_rewards/margins": 0.08002925664186478, + "eval_rewards/rejected": -0.11179153621196747, + "eval_runtime": 714.8856, + "eval_samples_per_second": 2.798, + "eval_steps_per_second": 1.399, + "step": 11500 + }, + { + "epoch": 0.75, + "learning_rate": 8.726381863824635e-07, + "logits/chosen": -2.4292683601379395, + "logits/rejected": -2.1044631004333496, + "logps/chosen": -289.0771789550781, + "logps/rejected": -236.4252166748047, + "loss": 0.0179, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.020000828430056572, + "rewards/margins": 0.07377125322818756, + "rewards/rejected": -0.09377209842205048, + "step": 11510 + }, + { + "epoch": 0.75, + "learning_rate": 8.683078982960638e-07, + "logits/chosen": -2.181670665740967, + "logits/rejected": -1.8756259679794312, + "logps/chosen": -236.1549072265625, + "logps/rejected": -207.24911499023438, + "loss": 0.0372, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05734957009553909, + "rewards/margins": 0.10592573881149292, + "rewards/rejected": -0.1632753163576126, + "step": 11520 + }, + { + "epoch": 0.75, + "learning_rate": 8.639861219149584e-07, + "logits/chosen": -2.033862352371216, + "logits/rejected": -2.08524751663208, + "logps/chosen": -274.317626953125, + "logps/rejected": -250.8245849609375, + "loss": 0.0265, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05167793110013008, + "rewards/margins": 0.09715747833251953, + "rewards/rejected": -0.14883539080619812, + "step": 11530 + }, + { + "epoch": 0.76, + "learning_rate": 8.596728797836532e-07, + "logits/chosen": -2.1555728912353516, + "logits/rejected": -2.002525806427002, + "logps/chosen": -220.53543090820312, + "logps/rejected": -275.8230285644531, + "loss": 0.0266, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.026133406907320023, + "rewards/margins": 0.1284731775522232, + "rewards/rejected": -0.15460659563541412, + "step": 11540 + }, + { + "epoch": 0.76, + "learning_rate": 8.553681944021294e-07, + "logits/chosen": -2.2165935039520264, + "logits/rejected": -2.242164134979248, + "logps/chosen": -244.7801513671875, + "logps/rejected": -243.2750701904297, + "loss": 0.0177, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.028480708599090576, + "rewards/margins": 0.08921106159687042, + "rewards/rejected": -0.117691770195961, + "step": 11550 + }, + { + "epoch": 0.76, + "learning_rate": 8.510720882257365e-07, + "logits/chosen": -1.984035849571228, + "logits/rejected": -2.1254215240478516, + "logps/chosen": -166.98123168945312, + "logps/rejected": -233.5638885498047, + "loss": 0.0191, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.028403252363204956, + "rewards/margins": 0.12675362825393677, + "rewards/rejected": -0.15515688061714172, + "step": 11560 + }, + { + "epoch": 0.76, + "learning_rate": 8.467845836650667e-07, + "logits/chosen": -1.8751825094223022, + "logits/rejected": -1.9234832525253296, + "logps/chosen": -218.1288299560547, + "logps/rejected": -242.38735961914062, + "loss": 0.0294, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04931117966771126, + "rewards/margins": 0.08851714432239532, + "rewards/rejected": -0.13782832026481628, + "step": 11570 + }, + { + "epoch": 0.76, + "learning_rate": 8.425057030858461e-07, + "logits/chosen": -2.068479537963867, + "logits/rejected": -1.9056812524795532, + "logps/chosen": -162.3206329345703, + "logps/rejected": -210.03085327148438, + "loss": 0.0166, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03132368624210358, + "rewards/margins": 0.09343663603067398, + "rewards/rejected": -0.12476031482219696, + "step": 11580 + }, + { + "epoch": 0.76, + "learning_rate": 8.382354688088098e-07, + "logits/chosen": -2.2166271209716797, + "logits/rejected": -2.080953359603882, + "logps/chosen": -167.2935791015625, + "logps/rejected": -197.07273864746094, + "loss": 0.0335, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.04569109529256821, + "rewards/margins": 0.07178109884262085, + "rewards/rejected": -0.11747218668460846, + "step": 11590 + }, + { + "epoch": 0.76, + "learning_rate": 8.33973903109594e-07, + "logits/chosen": -2.3416852951049805, + "logits/rejected": -2.0910236835479736, + "logps/chosen": -227.5818328857422, + "logps/rejected": -217.0053253173828, + "loss": 0.0296, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05492377281188965, + "rewards/margins": 0.10492531955242157, + "rewards/rejected": -0.15984909236431122, + "step": 11600 + }, + { + "epoch": 0.76, + "eval_logits/chosen": -2.2632830142974854, + "eval_logits/rejected": -2.0765445232391357, + "eval_logps/chosen": -240.4170684814453, + "eval_logps/rejected": -237.5938262939453, + "eval_loss": 0.024378182366490364, + "eval_rewards/accuracies": 0.6664999723434448, + "eval_rewards/chosen": -0.042060501873493195, + "eval_rewards/margins": 0.08784911781549454, + "eval_rewards/rejected": -0.12990963459014893, + "eval_runtime": 712.6642, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 11600 + }, + { + "epoch": 0.76, + "learning_rate": 8.297210282186102e-07, + "logits/chosen": -2.1106295585632324, + "logits/rejected": -2.084667682647705, + "logps/chosen": -246.1740264892578, + "logps/rejected": -282.34100341796875, + "loss": 0.0178, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.08078263700008392, + "rewards/margins": 0.07400176674127579, + "rewards/rejected": -0.1547844111919403, + "step": 11610 + }, + { + "epoch": 0.76, + "learning_rate": 8.254768663209397e-07, + "logits/chosen": -2.20991849899292, + "logits/rejected": -2.009641170501709, + "logps/chosen": -286.65081787109375, + "logps/rejected": -235.0749969482422, + "loss": 0.0329, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.040355004370212555, + "rewards/margins": 0.05447888374328613, + "rewards/rejected": -0.09483388066291809, + "step": 11620 + }, + { + "epoch": 0.76, + "learning_rate": 8.212414395562079e-07, + "logits/chosen": -2.0545597076416016, + "logits/rejected": -2.1798338890075684, + "logps/chosen": -242.8502655029297, + "logps/rejected": -279.1524353027344, + "loss": 0.0339, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06318067014217377, + "rewards/margins": 0.06190643832087517, + "rewards/rejected": -0.12508711218833923, + "step": 11630 + }, + { + "epoch": 0.76, + "learning_rate": 8.170147700184775e-07, + "logits/chosen": -2.2550766468048096, + "logits/rejected": -2.138669490814209, + "logps/chosen": -262.5296325683594, + "logps/rejected": -272.13238525390625, + "loss": 0.0237, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.030391475185751915, + "rewards/margins": 0.09267780184745789, + "rewards/rejected": -0.12306930124759674, + "step": 11640 + }, + { + "epoch": 0.76, + "learning_rate": 8.127968797561242e-07, + "logits/chosen": -2.3019351959228516, + "logits/rejected": -2.041527271270752, + "logps/chosen": -235.93344116210938, + "logps/rejected": -237.5359344482422, + "loss": 0.0236, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.05981367081403732, + "rewards/margins": 0.12012721598148346, + "rewards/rejected": -0.17994089424610138, + "step": 11650 + }, + { + "epoch": 0.76, + "learning_rate": 8.085877907717338e-07, + "logits/chosen": -2.1951727867126465, + "logits/rejected": -2.1553750038146973, + "logps/chosen": -228.0797576904297, + "logps/rejected": -237.4129638671875, + "loss": 0.0148, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03584109619259834, + "rewards/margins": 0.10506229102611542, + "rewards/rejected": -0.14090339839458466, + "step": 11660 + }, + { + "epoch": 0.76, + "learning_rate": 8.043875250219732e-07, + "logits/chosen": -2.1565158367156982, + "logits/rejected": -2.0909366607666016, + "logps/chosen": -242.97103881835938, + "logps/rejected": -230.5181427001953, + "loss": 0.039, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0678316205739975, + "rewards/margins": 0.04307982325553894, + "rewards/rejected": -0.11091144382953644, + "step": 11670 + }, + { + "epoch": 0.76, + "learning_rate": 8.001961044174881e-07, + "logits/chosen": -2.3242721557617188, + "logits/rejected": -2.117072343826294, + "logps/chosen": -240.4780731201172, + "logps/rejected": -192.73492431640625, + "loss": 0.0309, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.06006825715303421, + "rewards/margins": 0.047224875539541245, + "rewards/rejected": -0.10729314386844635, + "step": 11680 + }, + { + "epoch": 0.76, + "learning_rate": 7.960135508227795e-07, + "logits/chosen": -2.3332180976867676, + "logits/rejected": -1.9855926036834717, + "logps/chosen": -301.40838623046875, + "logps/rejected": -253.2394256591797, + "loss": 0.0265, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03769667446613312, + "rewards/margins": 0.08073518425226212, + "rewards/rejected": -0.11843186616897583, + "step": 11690 + }, + { + "epoch": 0.77, + "learning_rate": 7.91839886056098e-07, + "logits/chosen": -2.3475003242492676, + "logits/rejected": -2.0893478393554688, + "logps/chosen": -295.53472900390625, + "logps/rejected": -281.19122314453125, + "loss": 0.015, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.058862458914518356, + "rewards/margins": 0.08113612979650497, + "rewards/rejected": -0.13999858498573303, + "step": 11700 + }, + { + "epoch": 0.77, + "eval_logits/chosen": -2.2643589973449707, + "eval_logits/rejected": -2.0778892040252686, + "eval_logps/chosen": -241.74781799316406, + "eval_logps/rejected": -237.93856811523438, + "eval_loss": 0.02438133768737316, + "eval_rewards/accuracies": 0.6600000262260437, + "eval_rewards/chosen": -0.048714205622673035, + "eval_rewards/margins": 0.0829191654920578, + "eval_rewards/rejected": -0.13163337111473083, + "eval_runtime": 713.7867, + "eval_samples_per_second": 2.802, + "eval_steps_per_second": 1.401, + "step": 11700 + }, + { + "epoch": 0.77, + "learning_rate": 7.876751318893217e-07, + "logits/chosen": -2.1693975925445557, + "logits/rejected": -1.9223251342773438, + "logps/chosen": -247.9962615966797, + "logps/rejected": -241.5628662109375, + "loss": 0.0229, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05065304785966873, + "rewards/margins": 0.07577352225780487, + "rewards/rejected": -0.1264265775680542, + "step": 11710 + }, + { + "epoch": 0.77, + "learning_rate": 7.8351931004785e-07, + "logits/chosen": -2.1445133686065674, + "logits/rejected": -1.7985641956329346, + "logps/chosen": -218.8597412109375, + "logps/rejected": -211.9443817138672, + "loss": 0.0214, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.051157396286726, + "rewards/margins": 0.08983974158763885, + "rewards/rejected": -0.14099714159965515, + "step": 11720 + }, + { + "epoch": 0.77, + "learning_rate": 7.793724422104834e-07, + "logits/chosen": -2.0111899375915527, + "logits/rejected": -2.158306121826172, + "logps/chosen": -218.9979705810547, + "logps/rejected": -309.1783447265625, + "loss": 0.0314, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0508684441447258, + "rewards/margins": 0.10351568460464478, + "rewards/rejected": -0.15438412129878998, + "step": 11730 + }, + { + "epoch": 0.77, + "learning_rate": 7.752345500093184e-07, + "logits/chosen": -2.3120059967041016, + "logits/rejected": -2.278259038925171, + "logps/chosen": -227.6975555419922, + "logps/rejected": -209.49868774414062, + "loss": 0.046, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.08771263062953949, + "rewards/margins": 0.057827867567539215, + "rewards/rejected": -0.14554047584533691, + "step": 11740 + }, + { + "epoch": 0.77, + "learning_rate": 7.711056550296253e-07, + "logits/chosen": -2.363168239593506, + "logits/rejected": -2.191441535949707, + "logps/chosen": -253.12173461914062, + "logps/rejected": -236.3367156982422, + "loss": 0.0425, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.03163955360651016, + "rewards/margins": 0.10120024532079697, + "rewards/rejected": -0.13283979892730713, + "step": 11750 + }, + { + "epoch": 0.77, + "learning_rate": 7.669857788097445e-07, + "logits/chosen": -2.0963737964630127, + "logits/rejected": -1.8543342351913452, + "logps/chosen": -176.55654907226562, + "logps/rejected": -223.7884063720703, + "loss": 0.0171, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07495290786027908, + "rewards/margins": 0.10303473472595215, + "rewards/rejected": -0.17798765003681183, + "step": 11760 + }, + { + "epoch": 0.77, + "learning_rate": 7.628749428409676e-07, + "logits/chosen": -2.360349178314209, + "logits/rejected": -1.9492822885513306, + "logps/chosen": -251.5398406982422, + "logps/rejected": -211.39425659179688, + "loss": 0.0438, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.07326027005910873, + "rewards/margins": 0.09302968531847, + "rewards/rejected": -0.16628995537757874, + "step": 11770 + }, + { + "epoch": 0.77, + "learning_rate": 7.587731685674288e-07, + "logits/chosen": -2.24495792388916, + "logits/rejected": -2.2814254760742188, + "logps/chosen": -283.14324951171875, + "logps/rejected": -308.06915283203125, + "loss": 0.0081, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04777499660849571, + "rewards/margins": 0.08544452488422394, + "rewards/rejected": -0.13321951031684875, + "step": 11780 + }, + { + "epoch": 0.77, + "learning_rate": 7.546804773859931e-07, + "logits/chosen": -2.340010166168213, + "logits/rejected": -2.1141135692596436, + "logps/chosen": -238.205322265625, + "logps/rejected": -242.54702758789062, + "loss": 0.0164, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05582636594772339, + "rewards/margins": 0.12138841301202774, + "rewards/rejected": -0.17721477150917053, + "step": 11790 + }, + { + "epoch": 0.77, + "learning_rate": 7.505968906461409e-07, + "logits/chosen": -2.241865634918213, + "logits/rejected": -2.102313280105591, + "logps/chosen": -255.6861114501953, + "logps/rejected": -248.14163208007812, + "loss": 0.0127, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.07373024523258209, + "rewards/margins": 0.08300880342721939, + "rewards/rejected": -0.1567390412092209, + "step": 11800 + }, + { + "epoch": 0.77, + "eval_logits/chosen": -2.2653133869171143, + "eval_logits/rejected": -2.0786798000335693, + "eval_logps/chosen": -243.9700927734375, + "eval_logps/rejected": -240.0970916748047, + "eval_loss": 0.024359513074159622, + "eval_rewards/accuracies": 0.6579999923706055, + "eval_rewards/chosen": -0.05982571840286255, + "eval_rewards/margins": 0.08260022103786469, + "eval_rewards/rejected": -0.14242593944072723, + "eval_runtime": 711.8556, + "eval_samples_per_second": 2.81, + "eval_steps_per_second": 1.405, + "step": 11800 + }, + { + "epoch": 0.77, + "learning_rate": 7.465224296498627e-07, + "logits/chosen": -2.3098983764648438, + "logits/rejected": -1.9284107685089111, + "logps/chosen": -245.07510375976562, + "logps/rejected": -218.8394317626953, + "loss": 0.0285, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.060580063611269, + "rewards/margins": 0.07446667551994324, + "rewards/rejected": -0.13504675030708313, + "step": 11810 + }, + { + "epoch": 0.77, + "learning_rate": 7.424571156515412e-07, + "logits/chosen": -2.1790266036987305, + "logits/rejected": -2.16428542137146, + "logps/chosen": -188.22915649414062, + "logps/rejected": -232.13473510742188, + "loss": 0.0285, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.052619803696870804, + "rewards/margins": 0.10614506900310516, + "rewards/rejected": -0.15876488387584686, + "step": 11820 + }, + { + "epoch": 0.77, + "learning_rate": 7.38400969857847e-07, + "logits/chosen": -2.130056858062744, + "logits/rejected": -1.9207099676132202, + "logps/chosen": -205.91537475585938, + "logps/rejected": -237.07766723632812, + "loss": 0.0335, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.11386485397815704, + "rewards/margins": 0.14005056023597717, + "rewards/rejected": -0.253915399312973, + "step": 11830 + }, + { + "epoch": 0.77, + "learning_rate": 7.343540134276225e-07, + "logits/chosen": -2.262645721435547, + "logits/rejected": -2.1876039505004883, + "logps/chosen": -178.30035400390625, + "logps/rejected": -195.77682495117188, + "loss": 0.0253, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03794369101524353, + "rewards/margins": 0.07790088653564453, + "rewards/rejected": -0.11584459245204926, + "step": 11840 + }, + { + "epoch": 0.78, + "learning_rate": 7.303162674717762e-07, + "logits/chosen": -2.230045795440674, + "logits/rejected": -1.846605658531189, + "logps/chosen": -228.3832550048828, + "logps/rejected": -187.96006774902344, + "loss": 0.0383, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.09077741205692291, + "rewards/margins": 0.07706119865179062, + "rewards/rejected": -0.16783861815929413, + "step": 11850 + }, + { + "epoch": 0.78, + "learning_rate": 7.26287753053167e-07, + "logits/chosen": -2.1979362964630127, + "logits/rejected": -2.1252353191375732, + "logps/chosen": -278.9451904296875, + "logps/rejected": -292.9638671875, + "loss": 0.0257, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.06984533369541168, + "rewards/margins": 0.07643438875675201, + "rewards/rejected": -0.1462797075510025, + "step": 11860 + }, + { + "epoch": 0.78, + "learning_rate": 7.222684911865013e-07, + "logits/chosen": -2.305039644241333, + "logits/rejected": -2.314790964126587, + "logps/chosen": -218.77285766601562, + "logps/rejected": -262.61029052734375, + "loss": 0.0322, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05802080035209656, + "rewards/margins": 0.1107601672410965, + "rewards/rejected": -0.16878096759319305, + "step": 11870 + }, + { + "epoch": 0.78, + "learning_rate": 7.182585028382166e-07, + "logits/chosen": -2.3521711826324463, + "logits/rejected": -2.054400682449341, + "logps/chosen": -286.86614990234375, + "logps/rejected": -272.48370361328125, + "loss": 0.0308, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.053572773933410645, + "rewards/margins": 0.09033741056919098, + "rewards/rejected": -0.14391018450260162, + "step": 11880 + }, + { + "epoch": 0.78, + "learning_rate": 7.142578089263769e-07, + "logits/chosen": -2.3823986053466797, + "logits/rejected": -2.061488389968872, + "logps/chosen": -330.8357849121094, + "logps/rejected": -289.43865966796875, + "loss": 0.028, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.06203612685203552, + "rewards/margins": 0.08701418340206146, + "rewards/rejected": -0.14905031025409698, + "step": 11890 + }, + { + "epoch": 0.78, + "learning_rate": 7.102664303205611e-07, + "logits/chosen": -2.2598845958709717, + "logits/rejected": -2.024601697921753, + "logps/chosen": -233.4888458251953, + "logps/rejected": -232.8012237548828, + "loss": 0.0199, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06463171541690826, + "rewards/margins": 0.08594464510679245, + "rewards/rejected": -0.1505763679742813, + "step": 11900 + }, + { + "epoch": 0.78, + "eval_logits/chosen": -2.2626161575317383, + "eval_logits/rejected": -2.075838565826416, + "eval_logps/chosen": -243.8325653076172, + "eval_logps/rejected": -240.6167755126953, + "eval_loss": 0.02434389479458332, + "eval_rewards/accuracies": 0.6604999899864197, + "eval_rewards/chosen": -0.05913807079195976, + "eval_rewards/margins": 0.08588622510433197, + "eval_rewards/rejected": -0.14502428472042084, + "eval_runtime": 716.2778, + "eval_samples_per_second": 2.792, + "eval_steps_per_second": 1.396, + "step": 11900 + }, + { + "epoch": 0.78, + "learning_rate": 7.062843878417566e-07, + "logits/chosen": -2.3879013061523438, + "logits/rejected": -2.250087261199951, + "logps/chosen": -231.5170440673828, + "logps/rejected": -212.97030639648438, + "loss": 0.0212, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.040422223508358, + "rewards/margins": 0.07248730212450027, + "rewards/rejected": -0.11290953308343887, + "step": 11910 + }, + { + "epoch": 0.78, + "learning_rate": 7.023117022622458e-07, + "logits/chosen": -2.3014559745788574, + "logits/rejected": -1.9482473134994507, + "logps/chosen": -256.7848815917969, + "logps/rejected": -249.9860382080078, + "loss": 0.0267, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09134788066148758, + "rewards/margins": 0.08234737813472748, + "rewards/rejected": -0.17369526624679565, + "step": 11920 + }, + { + "epoch": 0.78, + "learning_rate": 6.983483943055042e-07, + "logits/chosen": -2.205634593963623, + "logits/rejected": -2.0224173069000244, + "logps/chosen": -293.17437744140625, + "logps/rejected": -252.63803100585938, + "loss": 0.0272, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06353868544101715, + "rewards/margins": 0.0811837837100029, + "rewards/rejected": -0.14472243189811707, + "step": 11930 + }, + { + "epoch": 0.78, + "learning_rate": 6.943944846460859e-07, + "logits/chosen": -2.2417654991149902, + "logits/rejected": -2.1794562339782715, + "logps/chosen": -228.4346923828125, + "logps/rejected": -194.443603515625, + "loss": 0.028, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.048401739448308945, + "rewards/margins": 0.06375519186258316, + "rewards/rejected": -0.112156942486763, + "step": 11940 + }, + { + "epoch": 0.78, + "learning_rate": 6.904499939095225e-07, + "logits/chosen": -2.2047362327575684, + "logits/rejected": -2.151808977127075, + "logps/chosen": -233.1191864013672, + "logps/rejected": -239.6602020263672, + "loss": 0.0145, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.053860194981098175, + "rewards/margins": 0.10934630781412125, + "rewards/rejected": -0.16320650279521942, + "step": 11950 + }, + { + "epoch": 0.78, + "learning_rate": 6.865149426722079e-07, + "logits/chosen": -2.176809310913086, + "logits/rejected": -2.12095308303833, + "logps/chosen": -288.0912170410156, + "logps/rejected": -276.6849670410156, + "loss": 0.0138, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08453039824962616, + "rewards/margins": 0.09643884003162384, + "rewards/rejected": -0.18096923828125, + "step": 11960 + }, + { + "epoch": 0.78, + "learning_rate": 6.825893514612985e-07, + "logits/chosen": -2.0197577476501465, + "logits/rejected": -2.115384578704834, + "logps/chosen": -246.4369659423828, + "logps/rejected": -265.3404235839844, + "loss": 0.0381, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04935074225068092, + "rewards/margins": 0.1054350957274437, + "rewards/rejected": -0.15478582680225372, + "step": 11970 + }, + { + "epoch": 0.78, + "learning_rate": 6.786732407546001e-07, + "logits/chosen": -2.0275187492370605, + "logits/rejected": -1.975950837135315, + "logps/chosen": -213.6422882080078, + "logps/rejected": -191.46072387695312, + "loss": 0.0284, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06380544602870941, + "rewards/margins": 0.0876699760556221, + "rewards/rejected": -0.1514754295349121, + "step": 11980 + }, + { + "epoch": 0.78, + "learning_rate": 6.747666309804654e-07, + "logits/chosen": -2.4276633262634277, + "logits/rejected": -2.1066055297851562, + "logps/chosen": -299.38726806640625, + "logps/rejected": -234.7521514892578, + "loss": 0.0215, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.05855029821395874, + "rewards/margins": 0.07450170814990997, + "rewards/rejected": -0.1330520212650299, + "step": 11990 + }, + { + "epoch": 0.79, + "learning_rate": 6.708695425176831e-07, + "logits/chosen": -2.0516610145568848, + "logits/rejected": -2.04587984085083, + "logps/chosen": -183.84193420410156, + "logps/rejected": -225.9083251953125, + "loss": 0.0313, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0740157812833786, + "rewards/margins": 0.10655520856380463, + "rewards/rejected": -0.18057098984718323, + "step": 12000 + }, + { + "epoch": 0.79, + "eval_logits/chosen": -2.2668938636779785, + "eval_logits/rejected": -2.079688549041748, + "eval_logps/chosen": -243.17733764648438, + "eval_logps/rejected": -240.09140014648438, + "eval_loss": 0.024385279044508934, + "eval_rewards/accuracies": 0.6604999899864197, + "eval_rewards/chosen": -0.0558619387447834, + "eval_rewards/margins": 0.08653547614812851, + "eval_rewards/rejected": -0.14239740371704102, + "eval_runtime": 714.7901, + "eval_samples_per_second": 2.798, + "eval_steps_per_second": 1.399, + "step": 12000 + }, + { + "epoch": 0.79, + "learning_rate": 6.669819956953768e-07, + "logits/chosen": -2.0975677967071533, + "logits/rejected": -2.0333313941955566, + "logps/chosen": -180.8198699951172, + "logps/rejected": -205.4145965576172, + "loss": 0.0106, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05244321748614311, + "rewards/margins": 0.08027370274066925, + "rewards/rejected": -0.13271690905094147, + "step": 12010 + }, + { + "epoch": 0.79, + "learning_rate": 6.631040107928957e-07, + "logits/chosen": -2.4497742652893066, + "logits/rejected": -2.1184353828430176, + "logps/chosen": -280.5262451171875, + "logps/rejected": -200.26541137695312, + "loss": 0.0333, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06170976907014847, + "rewards/margins": 0.08018840849399567, + "rewards/rejected": -0.14189818501472473, + "step": 12020 + }, + { + "epoch": 0.79, + "learning_rate": 6.592356080397072e-07, + "logits/chosen": -2.3427226543426514, + "logits/rejected": -1.7892353534698486, + "logps/chosen": -238.2996826171875, + "logps/rejected": -200.78785705566406, + "loss": 0.0259, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05005241557955742, + "rewards/margins": 0.09475790709257126, + "rewards/rejected": -0.14481033384799957, + "step": 12030 + }, + { + "epoch": 0.79, + "learning_rate": 6.553768076152963e-07, + "logits/chosen": -2.22847843170166, + "logits/rejected": -2.327822208404541, + "logps/chosen": -169.56468200683594, + "logps/rejected": -220.78173828125, + "loss": 0.0524, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.057335417717695236, + "rewards/margins": 0.1205652505159378, + "rewards/rejected": -0.17790067195892334, + "step": 12040 + }, + { + "epoch": 0.79, + "learning_rate": 6.51527629649055e-07, + "logits/chosen": -2.374026298522949, + "logits/rejected": -2.2255825996398926, + "logps/chosen": -269.08404541015625, + "logps/rejected": -251.5164031982422, + "loss": 0.0149, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.07750485837459564, + "rewards/margins": 0.057897042483091354, + "rewards/rejected": -0.1354019045829773, + "step": 12050 + }, + { + "epoch": 0.79, + "learning_rate": 6.476880942201824e-07, + "logits/chosen": -2.507349967956543, + "logits/rejected": -2.1045010089874268, + "logps/chosen": -246.28945922851562, + "logps/rejected": -207.04495239257812, + "loss": 0.011, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.024758975952863693, + "rewards/margins": 0.09689836204051971, + "rewards/rejected": -0.1216573491692543, + "step": 12060 + }, + { + "epoch": 0.79, + "learning_rate": 6.438582213575748e-07, + "logits/chosen": -2.197597026824951, + "logits/rejected": -2.168886184692383, + "logps/chosen": -239.6921844482422, + "logps/rejected": -266.37591552734375, + "loss": 0.0355, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.048878345638513565, + "rewards/margins": 0.07769103348255157, + "rewards/rejected": -0.12656937539577484, + "step": 12070 + }, + { + "epoch": 0.79, + "learning_rate": 6.400380310397267e-07, + "logits/chosen": -2.1425106525421143, + "logits/rejected": -2.1414434909820557, + "logps/chosen": -243.58932495117188, + "logps/rejected": -286.09259033203125, + "loss": 0.015, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.057394880801439285, + "rewards/margins": 0.055457305163145065, + "rewards/rejected": -0.11285219341516495, + "step": 12080 + }, + { + "epoch": 0.79, + "learning_rate": 6.362275431946202e-07, + "logits/chosen": -2.117353916168213, + "logits/rejected": -2.1524546146392822, + "logps/chosen": -244.9851837158203, + "logps/rejected": -263.6101379394531, + "loss": 0.0416, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04315786436200142, + "rewards/margins": 0.06288377195596695, + "rewards/rejected": -0.10604163259267807, + "step": 12090 + }, + { + "epoch": 0.79, + "learning_rate": 6.324267776996285e-07, + "logits/chosen": -2.3388924598693848, + "logits/rejected": -1.9531021118164062, + "logps/chosen": -386.77874755859375, + "logps/rejected": -292.1388244628906, + "loss": 0.0102, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06099815294146538, + "rewards/margins": 0.1367029845714569, + "rewards/rejected": -0.19770114123821259, + "step": 12100 + }, + { + "epoch": 0.79, + "eval_logits/chosen": -2.2704570293426514, + "eval_logits/rejected": -2.082979440689087, + "eval_logps/chosen": -242.26406860351562, + "eval_logps/rejected": -238.70458984375, + "eval_loss": 0.02435409463942051, + "eval_rewards/accuracies": 0.656000018119812, + "eval_rewards/chosen": -0.051295530050992966, + "eval_rewards/margins": 0.08416783064603806, + "eval_rewards/rejected": -0.13546337187290192, + "eval_runtime": 713.3114, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 1.402, + "step": 12100 + }, + { + "epoch": 0.79, + "learning_rate": 6.286357543814045e-07, + "logits/chosen": -2.193878650665283, + "logits/rejected": -2.1076154708862305, + "logps/chosen": -203.20327758789062, + "logps/rejected": -295.43634033203125, + "loss": 0.0416, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04836731776595116, + "rewards/margins": 0.12110915035009384, + "rewards/rejected": -0.1694764643907547, + "step": 12110 + }, + { + "epoch": 0.79, + "learning_rate": 6.248544930157838e-07, + "logits/chosen": -2.3076460361480713, + "logits/rejected": -2.06266188621521, + "logps/chosen": -194.84645080566406, + "logps/rejected": -201.7028350830078, + "loss": 0.0299, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05380439758300781, + "rewards/margins": 0.12318424880504608, + "rewards/rejected": -0.17698867619037628, + "step": 12120 + }, + { + "epoch": 0.79, + "learning_rate": 6.21083013327678e-07, + "logits/chosen": -2.2582828998565674, + "logits/rejected": -2.157015323638916, + "logps/chosen": -309.3568115234375, + "logps/rejected": -267.8647155761719, + "loss": 0.0184, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.025580257177352905, + "rewards/margins": 0.07216853648424149, + "rewards/rejected": -0.09774880111217499, + "step": 12130 + }, + { + "epoch": 0.79, + "learning_rate": 6.17321334990973e-07, + "logits/chosen": -2.2270803451538086, + "logits/rejected": -2.139096260070801, + "logps/chosen": -213.1139373779297, + "logps/rejected": -190.55184936523438, + "loss": 0.0136, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.061547160148620605, + "rewards/margins": 0.06337620317935944, + "rewards/rejected": -0.12492338567972183, + "step": 12140 + }, + { + "epoch": 0.79, + "learning_rate": 6.135694776284243e-07, + "logits/chosen": -2.384162425994873, + "logits/rejected": -2.108898401260376, + "logps/chosen": -279.59844970703125, + "logps/rejected": -237.87149047851562, + "loss": 0.0252, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04403727129101753, + "rewards/margins": 0.11522600799798965, + "rewards/rejected": -0.15926328301429749, + "step": 12150 + }, + { + "epoch": 0.8, + "learning_rate": 6.098274608115595e-07, + "logits/chosen": -2.1649386882781982, + "logits/rejected": -2.027029037475586, + "logps/chosen": -215.2659149169922, + "logps/rejected": -193.32310485839844, + "loss": 0.0672, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.04807636886835098, + "rewards/margins": 0.03344811871647835, + "rewards/rejected": -0.08152447640895844, + "step": 12160 + }, + { + "epoch": 0.8, + "learning_rate": 6.060953040605697e-07, + "logits/chosen": -2.397775650024414, + "logits/rejected": -1.8723652362823486, + "logps/chosen": -342.2147216796875, + "logps/rejected": -294.21893310546875, + "loss": 0.009, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.015425214543938637, + "rewards/margins": 0.10949543863534927, + "rewards/rejected": -0.12492065131664276, + "step": 12170 + }, + { + "epoch": 0.8, + "learning_rate": 6.023730268442144e-07, + "logits/chosen": -2.186662197113037, + "logits/rejected": -2.0057554244995117, + "logps/chosen": -212.6865692138672, + "logps/rejected": -209.54025268554688, + "loss": 0.0094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03932885453104973, + "rewards/margins": 0.12649241089820862, + "rewards/rejected": -0.16582126915454865, + "step": 12180 + }, + { + "epoch": 0.8, + "learning_rate": 5.986606485797131e-07, + "logits/chosen": -2.182776689529419, + "logits/rejected": -1.9772993326187134, + "logps/chosen": -209.5937957763672, + "logps/rejected": -228.5242462158203, + "loss": 0.0295, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04851619526743889, + "rewards/margins": 0.06932185590267181, + "rewards/rejected": -0.1178380697965622, + "step": 12190 + }, + { + "epoch": 0.8, + "learning_rate": 5.949581886326511e-07, + "logits/chosen": -2.303760051727295, + "logits/rejected": -2.287932872772217, + "logps/chosen": -303.95550537109375, + "logps/rejected": -286.0728759765625, + "loss": 0.0325, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03081701323390007, + "rewards/margins": 0.05515850707888603, + "rewards/rejected": -0.0859755203127861, + "step": 12200 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.2709171772003174, + "eval_logits/rejected": -2.0834968090057373, + "eval_logps/chosen": -241.12908935546875, + "eval_logps/rejected": -237.43380737304688, + "eval_loss": 0.02426832541823387, + "eval_rewards/accuracies": 0.6600000262260437, + "eval_rewards/chosen": -0.04562075808644295, + "eval_rewards/margins": 0.08348869532346725, + "eval_rewards/rejected": -0.1291094571352005, + "eval_runtime": 713.0091, + "eval_samples_per_second": 2.805, + "eval_steps_per_second": 1.403, + "step": 12200 + }, + { + "epoch": 0.8, + "learning_rate": 5.912656663168717e-07, + "logits/chosen": -2.3682117462158203, + "logits/rejected": -2.309563636779785, + "logps/chosen": -230.9191131591797, + "logps/rejected": -236.99209594726562, + "loss": 0.0195, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03454895317554474, + "rewards/margins": 0.062227051705121994, + "rewards/rejected": -0.09677600860595703, + "step": 12210 + }, + { + "epoch": 0.8, + "learning_rate": 5.875831008943817e-07, + "logits/chosen": -2.103549003601074, + "logits/rejected": -2.0876801013946533, + "logps/chosen": -187.82420349121094, + "logps/rejected": -176.9785614013672, + "loss": 0.0261, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05090073496103287, + "rewards/margins": 0.05877278000116348, + "rewards/rejected": -0.10967351496219635, + "step": 12220 + }, + { + "epoch": 0.8, + "learning_rate": 5.839105115752442e-07, + "logits/chosen": -2.2246978282928467, + "logits/rejected": -2.0403006076812744, + "logps/chosen": -238.2069854736328, + "logps/rejected": -216.3524932861328, + "loss": 0.0205, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07614725083112717, + "rewards/margins": 0.09076641499996185, + "rewards/rejected": -0.16691365838050842, + "step": 12230 + }, + { + "epoch": 0.8, + "learning_rate": 5.802479175174855e-07, + "logits/chosen": -2.2168078422546387, + "logits/rejected": -2.068084716796875, + "logps/chosen": -174.10366821289062, + "logps/rejected": -198.13929748535156, + "loss": 0.0125, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04054706171154976, + "rewards/margins": 0.09054260700941086, + "rewards/rejected": -0.13108967244625092, + "step": 12240 + }, + { + "epoch": 0.8, + "learning_rate": 5.765953378269901e-07, + "logits/chosen": -2.135885715484619, + "logits/rejected": -2.073364734649658, + "logps/chosen": -218.061767578125, + "logps/rejected": -272.62738037109375, + "loss": 0.0271, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05534181743860245, + "rewards/margins": 0.11760924011468887, + "rewards/rejected": -0.17295105755329132, + "step": 12250 + }, + { + "epoch": 0.8, + "learning_rate": 5.729527915574037e-07, + "logits/chosen": -2.2846944332122803, + "logits/rejected": -2.1489596366882324, + "logps/chosen": -230.0850372314453, + "logps/rejected": -247.8024139404297, + "loss": 0.0273, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04833231121301651, + "rewards/margins": 0.08689139783382416, + "rewards/rejected": -0.13522370159626007, + "step": 12260 + }, + { + "epoch": 0.8, + "learning_rate": 5.693202977100304e-07, + "logits/chosen": -2.291938304901123, + "logits/rejected": -2.0095741748809814, + "logps/chosen": -179.6736602783203, + "logps/rejected": -192.34451293945312, + "loss": 0.0234, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04764264076948166, + "rewards/margins": 0.08003589510917664, + "rewards/rejected": -0.1276785284280777, + "step": 12270 + }, + { + "epoch": 0.8, + "learning_rate": 5.656978752337389e-07, + "logits/chosen": -2.310103178024292, + "logits/rejected": -2.1109793186187744, + "logps/chosen": -213.84716796875, + "logps/rejected": -236.92471313476562, + "loss": 0.0331, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07096020877361298, + "rewards/margins": 0.10838906466960907, + "rewards/rejected": -0.17934927344322205, + "step": 12280 + }, + { + "epoch": 0.8, + "learning_rate": 5.620855430248581e-07, + "logits/chosen": -2.210860013961792, + "logits/rejected": -2.074171781539917, + "logps/chosen": -166.60595703125, + "logps/rejected": -186.58425903320312, + "loss": 0.0289, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0230544600635767, + "rewards/margins": 0.1107834130525589, + "rewards/rejected": -0.13383787870407104, + "step": 12290 + }, + { + "epoch": 0.8, + "learning_rate": 5.584833199270837e-07, + "logits/chosen": -2.2861437797546387, + "logits/rejected": -2.133598804473877, + "logps/chosen": -240.87899780273438, + "logps/rejected": -249.25509643554688, + "loss": 0.028, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0612337663769722, + "rewards/margins": 0.08034642040729523, + "rewards/rejected": -0.14158019423484802, + "step": 12300 + }, + { + "epoch": 0.8, + "eval_logits/chosen": -2.2695388793945312, + "eval_logits/rejected": -2.0820517539978027, + "eval_logps/chosen": -241.85560607910156, + "eval_logps/rejected": -238.8947296142578, + "eval_loss": 0.024259360507130623, + "eval_rewards/accuracies": 0.6585000157356262, + "eval_rewards/chosen": -0.049253277480602264, + "eval_rewards/margins": 0.08716095238924026, + "eval_rewards/rejected": -0.13641421496868134, + "eval_runtime": 713.9946, + "eval_samples_per_second": 2.801, + "eval_steps_per_second": 1.401, + "step": 12300 + }, + { + "epoch": 0.81, + "learning_rate": 5.548912247313742e-07, + "logits/chosen": -2.500735282897949, + "logits/rejected": -2.072622299194336, + "logps/chosen": -298.7041320800781, + "logps/rejected": -262.83197021484375, + "loss": 0.0205, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07452087104320526, + "rewards/margins": 0.05851732939481735, + "rewards/rejected": -0.13303819298744202, + "step": 12310 + }, + { + "epoch": 0.81, + "learning_rate": 5.513092761758596e-07, + "logits/chosen": -2.3174662590026855, + "logits/rejected": -2.1213202476501465, + "logps/chosen": -284.2948913574219, + "logps/rejected": -226.7404022216797, + "loss": 0.0245, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06542503833770752, + "rewards/margins": 0.04713209718465805, + "rewards/rejected": -0.11255712807178497, + "step": 12320 + }, + { + "epoch": 0.81, + "learning_rate": 5.477374929457363e-07, + "logits/chosen": -2.235321521759033, + "logits/rejected": -2.2302331924438477, + "logps/chosen": -215.40139770507812, + "logps/rejected": -209.98681640625, + "loss": 0.0146, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06670118123292923, + "rewards/margins": 0.06878657639026642, + "rewards/rejected": -0.13548775017261505, + "step": 12330 + }, + { + "epoch": 0.81, + "learning_rate": 5.441758936731772e-07, + "logits/chosen": -2.25117826461792, + "logits/rejected": -2.1371283531188965, + "logps/chosen": -245.52734375, + "logps/rejected": -244.70254516601562, + "loss": 0.0211, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0509103462100029, + "rewards/margins": 0.08482729643583298, + "rewards/rejected": -0.13573762774467468, + "step": 12340 + }, + { + "epoch": 0.81, + "learning_rate": 5.406244969372273e-07, + "logits/chosen": -2.17441987991333, + "logits/rejected": -2.010354518890381, + "logps/chosen": -211.4480438232422, + "logps/rejected": -247.693603515625, + "loss": 0.0226, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05862605571746826, + "rewards/margins": 0.14871786534786224, + "rewards/rejected": -0.2073439359664917, + "step": 12350 + }, + { + "epoch": 0.81, + "learning_rate": 5.370833212637122e-07, + "logits/chosen": -2.2330093383789062, + "logits/rejected": -1.9421203136444092, + "logps/chosen": -229.41799926757812, + "logps/rejected": -234.67514038085938, + "loss": 0.0232, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.05453511327505112, + "rewards/margins": 0.09521164000034332, + "rewards/rejected": -0.14974676072597504, + "step": 12360 + }, + { + "epoch": 0.81, + "learning_rate": 5.335523851251392e-07, + "logits/chosen": -2.1823971271514893, + "logits/rejected": -2.107938528060913, + "logps/chosen": -219.23770141601562, + "logps/rejected": -219.7248077392578, + "loss": 0.0329, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.06551903486251831, + "rewards/margins": 0.1064266562461853, + "rewards/rejected": -0.1719456911087036, + "step": 12370 + }, + { + "epoch": 0.81, + "learning_rate": 5.300317069406003e-07, + "logits/chosen": -2.1719276905059814, + "logits/rejected": -2.142944812774658, + "logps/chosen": -168.9110565185547, + "logps/rejected": -194.3233642578125, + "loss": 0.0116, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04082585498690605, + "rewards/margins": 0.1082276701927185, + "rewards/rejected": -0.14905352890491486, + "step": 12380 + }, + { + "epoch": 0.81, + "learning_rate": 5.265213050756782e-07, + "logits/chosen": -2.406087636947632, + "logits/rejected": -2.2311203479766846, + "logps/chosen": -239.39688110351562, + "logps/rejected": -255.44509887695312, + "loss": 0.0249, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03672807663679123, + "rewards/margins": 0.0908966213464737, + "rewards/rejected": -0.12762470543384552, + "step": 12390 + }, + { + "epoch": 0.81, + "learning_rate": 5.230211978423477e-07, + "logits/chosen": -2.3066985607147217, + "logits/rejected": -2.1874680519104004, + "logps/chosen": -232.8973846435547, + "logps/rejected": -230.87380981445312, + "loss": 0.0278, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07048575580120087, + "rewards/margins": 0.06572605669498444, + "rewards/rejected": -0.1362117975950241, + "step": 12400 + }, + { + "epoch": 0.81, + "eval_logits/chosen": -2.2793402671813965, + "eval_logits/rejected": -2.091266393661499, + "eval_logps/chosen": -242.20643615722656, + "eval_logps/rejected": -238.47528076171875, + "eval_loss": 0.02414710633456707, + "eval_rewards/accuracies": 0.6600000262260437, + "eval_rewards/chosen": -0.051007479429244995, + "eval_rewards/margins": 0.08330940455198288, + "eval_rewards/rejected": -0.13431687653064728, + "eval_runtime": 713.7293, + "eval_samples_per_second": 2.802, + "eval_steps_per_second": 1.401, + "step": 12400 + }, + { + "epoch": 0.81, + "learning_rate": 5.195314034988835e-07, + "logits/chosen": -2.437074661254883, + "logits/rejected": -2.1730642318725586, + "logps/chosen": -226.95419311523438, + "logps/rejected": -178.89450073242188, + "loss": 0.0388, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.038335494697093964, + "rewards/margins": 0.09489767253398895, + "rewards/rejected": -0.1332331895828247, + "step": 12410 + }, + { + "epoch": 0.81, + "learning_rate": 5.160519402497616e-07, + "logits/chosen": -2.3210339546203613, + "logits/rejected": -2.1709866523742676, + "logps/chosen": -243.803466796875, + "logps/rejected": -257.8756103515625, + "loss": 0.0414, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06874729692935944, + "rewards/margins": 0.09066031873226166, + "rewards/rejected": -0.1594076305627823, + "step": 12420 + }, + { + "epoch": 0.81, + "learning_rate": 5.125828262455679e-07, + "logits/chosen": -2.212090492248535, + "logits/rejected": -2.011735200881958, + "logps/chosen": -266.16326904296875, + "logps/rejected": -254.65283203125, + "loss": 0.0203, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05225072428584099, + "rewards/margins": 0.09414578974246979, + "rewards/rejected": -0.14639653265476227, + "step": 12430 + }, + { + "epoch": 0.81, + "learning_rate": 5.091240795828992e-07, + "logits/chosen": -1.9795331954956055, + "logits/rejected": -2.1605002880096436, + "logps/chosen": -212.42431640625, + "logps/rejected": -253.1348114013672, + "loss": 0.0483, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04966943711042404, + "rewards/margins": 0.10439908504486084, + "rewards/rejected": -0.15406851470470428, + "step": 12440 + }, + { + "epoch": 0.81, + "learning_rate": 5.056757183042732e-07, + "logits/chosen": -2.2055981159210205, + "logits/rejected": -2.1182687282562256, + "logps/chosen": -246.42037963867188, + "logps/rejected": -246.04562377929688, + "loss": 0.0121, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06432102620601654, + "rewards/margins": 0.09751377999782562, + "rewards/rejected": -0.16183480620384216, + "step": 12450 + }, + { + "epoch": 0.82, + "learning_rate": 5.022377603980308e-07, + "logits/chosen": -2.368230104446411, + "logits/rejected": -2.0332324504852295, + "logps/chosen": -262.3285217285156, + "logps/rejected": -214.49362182617188, + "loss": 0.0213, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.06521899998188019, + "rewards/margins": 0.09318248927593231, + "rewards/rejected": -0.1584015190601349, + "step": 12460 + }, + { + "epoch": 0.82, + "learning_rate": 4.988102237982454e-07, + "logits/chosen": -2.319570779800415, + "logits/rejected": -2.243185520172119, + "logps/chosen": -237.8197479248047, + "logps/rejected": -209.4499969482422, + "loss": 0.0186, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.07819648832082748, + "rewards/margins": 0.06102749705314636, + "rewards/rejected": -0.13922397792339325, + "step": 12470 + }, + { + "epoch": 0.82, + "learning_rate": 4.953931263846251e-07, + "logits/chosen": -2.303527355194092, + "logits/rejected": -2.0281221866607666, + "logps/chosen": -276.46331787109375, + "logps/rejected": -253.54183959960938, + "loss": 0.0356, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07377218455076218, + "rewards/margins": 0.10519599914550781, + "rewards/rejected": -0.1789681762456894, + "step": 12480 + }, + { + "epoch": 0.82, + "learning_rate": 4.919864859824266e-07, + "logits/chosen": -2.2470602989196777, + "logits/rejected": -2.104475736618042, + "logps/chosen": -254.1764678955078, + "logps/rejected": -225.07070922851562, + "loss": 0.0339, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0837114155292511, + "rewards/margins": 0.0796470046043396, + "rewards/rejected": -0.1633584201335907, + "step": 12490 + }, + { + "epoch": 0.82, + "learning_rate": 4.885903203623532e-07, + "logits/chosen": -2.4405932426452637, + "logits/rejected": -2.0412323474884033, + "logps/chosen": -292.0880126953125, + "logps/rejected": -247.72152709960938, + "loss": 0.0142, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03537056967616081, + "rewards/margins": 0.09703455865383148, + "rewards/rejected": -0.1324051171541214, + "step": 12500 + }, + { + "epoch": 0.82, + "eval_logits/chosen": -2.2793068885803223, + "eval_logits/rejected": -2.0913002490997314, + "eval_logps/chosen": -242.81405639648438, + "eval_logps/rejected": -239.04124450683594, + "eval_loss": 0.02413610927760601, + "eval_rewards/accuracies": 0.6570000052452087, + "eval_rewards/chosen": -0.05404556915163994, + "eval_rewards/margins": 0.08310119062662125, + "eval_rewards/rejected": -0.1371467560529709, + "eval_runtime": 713.8604, + "eval_samples_per_second": 2.802, + "eval_steps_per_second": 1.401, + "step": 12500 + }, + { + "epoch": 0.82, + "learning_rate": 4.852046472404695e-07, + "logits/chosen": -2.4357948303222656, + "logits/rejected": -1.7031605243682861, + "logps/chosen": -301.8102722167969, + "logps/rejected": -193.1060333251953, + "loss": 0.0331, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.019607767462730408, + "rewards/margins": 0.08484580367803574, + "rewards/rejected": -0.10445356369018555, + "step": 12510 + }, + { + "epoch": 0.82, + "learning_rate": 4.818294842781035e-07, + "logits/chosen": -2.3425495624542236, + "logits/rejected": -2.1587905883789062, + "logps/chosen": -232.7345428466797, + "logps/rejected": -205.90792846679688, + "loss": 0.0275, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03263060376048088, + "rewards/margins": 0.12765631079673767, + "rewards/rejected": -0.16028691828250885, + "step": 12520 + }, + { + "epoch": 0.82, + "learning_rate": 4.784648490817601e-07, + "logits/chosen": -2.3454043865203857, + "logits/rejected": -2.053494930267334, + "logps/chosen": -229.67822265625, + "logps/rejected": -200.66259765625, + "loss": 0.0315, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.047857291996479034, + "rewards/margins": 0.07121441513299942, + "rewards/rejected": -0.11907169967889786, + "step": 12530 + }, + { + "epoch": 0.82, + "learning_rate": 4.751107592030235e-07, + "logits/chosen": -2.360136032104492, + "logits/rejected": -2.0764052867889404, + "logps/chosen": -178.2238006591797, + "logps/rejected": -184.0672149658203, + "loss": 0.0251, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04186008870601654, + "rewards/margins": 0.11471670866012573, + "rewards/rejected": -0.15657678246498108, + "step": 12540 + }, + { + "epoch": 0.82, + "learning_rate": 4.717672321384703e-07, + "logits/chosen": -2.2550435066223145, + "logits/rejected": -2.0041213035583496, + "logps/chosen": -229.6531524658203, + "logps/rejected": -207.37255859375, + "loss": 0.0208, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03203721344470978, + "rewards/margins": 0.09957583248615265, + "rewards/rejected": -0.13161304593086243, + "step": 12550 + }, + { + "epoch": 0.82, + "learning_rate": 4.684342853295748e-07, + "logits/chosen": -2.2040514945983887, + "logits/rejected": -2.081937313079834, + "logps/chosen": -196.20738220214844, + "logps/rejected": -212.50997924804688, + "loss": 0.0245, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04255301505327225, + "rewards/margins": 0.1011330857872963, + "rewards/rejected": -0.14368610084056854, + "step": 12560 + }, + { + "epoch": 0.82, + "learning_rate": 4.651119361626213e-07, + "logits/chosen": -2.504356861114502, + "logits/rejected": -2.1664860248565674, + "logps/chosen": -247.20285034179688, + "logps/rejected": -214.8421630859375, + "loss": 0.0218, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.030421242117881775, + "rewards/margins": 0.06934089213609695, + "rewards/rejected": -0.09976212680339813, + "step": 12570 + }, + { + "epoch": 0.82, + "learning_rate": 4.618002019686091e-07, + "logits/chosen": -2.263784885406494, + "logits/rejected": -2.087101697921753, + "logps/chosen": -290.5827941894531, + "logps/rejected": -250.25985717773438, + "loss": 0.0219, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.05685209110379219, + "rewards/margins": 0.08156587183475494, + "rewards/rejected": -0.13841792941093445, + "step": 12580 + }, + { + "epoch": 0.82, + "learning_rate": 4.5849910002316757e-07, + "logits/chosen": -2.327174425125122, + "logits/rejected": -1.988965630531311, + "logps/chosen": -197.8284149169922, + "logps/rejected": -183.67318725585938, + "loss": 0.0349, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0865812748670578, + "rewards/margins": 0.09554515033960342, + "rewards/rejected": -0.18212643265724182, + "step": 12590 + }, + { + "epoch": 0.82, + "learning_rate": 4.5520864754645984e-07, + "logits/chosen": -2.384525775909424, + "logits/rejected": -2.2449145317077637, + "logps/chosen": -291.3929138183594, + "logps/rejected": -260.1684875488281, + "loss": 0.0177, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.05021563917398453, + "rewards/margins": 0.06300269067287445, + "rewards/rejected": -0.11321830749511719, + "step": 12600 + }, + { + "epoch": 0.82, + "eval_logits/chosen": -2.2796547412872314, + "eval_logits/rejected": -2.0916662216186523, + "eval_logps/chosen": -243.12290954589844, + "eval_logps/rejected": -239.190185546875, + "eval_loss": 0.024224113672971725, + "eval_rewards/accuracies": 0.6579999923706055, + "eval_rewards/chosen": -0.055589765310287476, + "eval_rewards/margins": 0.08230166882276535, + "eval_rewards/rejected": -0.13789144158363342, + "eval_runtime": 713.353, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 1.402, + "step": 12600 + }, + { + "epoch": 0.83, + "learning_rate": 4.5192886170309896e-07, + "logits/chosen": -2.2043235301971436, + "logits/rejected": -2.105273962020874, + "logps/chosen": -202.4031982421875, + "logps/rejected": -213.1467742919922, + "loss": 0.0194, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05817372351884842, + "rewards/margins": 0.05279749631881714, + "rewards/rejected": -0.11097122728824615, + "step": 12610 + }, + { + "epoch": 0.83, + "learning_rate": 4.486597596020548e-07, + "logits/chosen": -2.2953429222106934, + "logits/rejected": -2.0294137001037598, + "logps/chosen": -233.8562774658203, + "logps/rejected": -210.57388305664062, + "loss": 0.0192, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.07137481123209, + "rewards/margins": 0.07519672065973282, + "rewards/rejected": -0.14657153189182281, + "step": 12620 + }, + { + "epoch": 0.83, + "learning_rate": 4.454013582965644e-07, + "logits/chosen": -2.227466106414795, + "logits/rejected": -1.818380355834961, + "logps/chosen": -275.13970947265625, + "logps/rejected": -237.52182006835938, + "loss": 0.0191, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.058415018022060394, + "rewards/margins": 0.06412716954946518, + "rewards/rejected": -0.12254220247268677, + "step": 12630 + }, + { + "epoch": 0.83, + "learning_rate": 4.4215367478404605e-07, + "logits/chosen": -2.0815138816833496, + "logits/rejected": -2.054581642150879, + "logps/chosen": -285.30023193359375, + "logps/rejected": -324.38555908203125, + "loss": 0.0431, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07270147651433945, + "rewards/margins": 0.07356799393892288, + "rewards/rejected": -0.14626947045326233, + "step": 12640 + }, + { + "epoch": 0.83, + "learning_rate": 4.389167260060068e-07, + "logits/chosen": -2.3553760051727295, + "logits/rejected": -2.1189093589782715, + "logps/chosen": -214.73532104492188, + "logps/rejected": -200.194580078125, + "loss": 0.0171, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03022538684308529, + "rewards/margins": 0.11816896498203278, + "rewards/rejected": -0.14839434623718262, + "step": 12650 + }, + { + "epoch": 0.83, + "learning_rate": 4.356905288479579e-07, + "logits/chosen": -2.223787784576416, + "logits/rejected": -1.9934899806976318, + "logps/chosen": -238.44442749023438, + "logps/rejected": -238.2301483154297, + "loss": 0.023, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.059618107974529266, + "rewards/margins": 0.14274819195270538, + "rewards/rejected": -0.20236630737781525, + "step": 12660 + }, + { + "epoch": 0.83, + "learning_rate": 4.3247510013932377e-07, + "logits/chosen": -2.165524482727051, + "logits/rejected": -2.005889654159546, + "logps/chosen": -264.05902099609375, + "logps/rejected": -284.2886047363281, + "loss": 0.038, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05935274809598923, + "rewards/margins": 0.10204823315143585, + "rewards/rejected": -0.16140100359916687, + "step": 12670 + }, + { + "epoch": 0.83, + "learning_rate": 4.2927045665335594e-07, + "logits/chosen": -1.8666915893554688, + "logits/rejected": -1.7911564111709595, + "logps/chosen": -181.5066375732422, + "logps/rejected": -196.82656860351562, + "loss": 0.0176, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.08314456045627594, + "rewards/margins": 0.09437574446201324, + "rewards/rejected": -0.17752030491828918, + "step": 12680 + }, + { + "epoch": 0.83, + "learning_rate": 4.260766151070439e-07, + "logits/chosen": -2.090467929840088, + "logits/rejected": -2.1346335411071777, + "logps/chosen": -235.2054901123047, + "logps/rejected": -242.34439086914062, + "loss": 0.0274, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05385993793606758, + "rewards/margins": 0.09199769794940948, + "rewards/rejected": -0.14585763216018677, + "step": 12690 + }, + { + "epoch": 0.83, + "learning_rate": 4.228935921610308e-07, + "logits/chosen": -2.323847770690918, + "logits/rejected": -1.9647403955459595, + "logps/chosen": -271.04962158203125, + "logps/rejected": -221.23440551757812, + "loss": 0.0133, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03665490821003914, + "rewards/margins": 0.07118140161037445, + "rewards/rejected": -0.10783632099628448, + "step": 12700 + }, + { + "epoch": 0.83, + "eval_logits/chosen": -2.281400680541992, + "eval_logits/rejected": -2.09328293800354, + "eval_logps/chosen": -241.91531372070312, + "eval_logps/rejected": -237.8956298828125, + "eval_loss": 0.024218622595071793, + "eval_rewards/accuracies": 0.6575000286102295, + "eval_rewards/chosen": -0.0495518334209919, + "eval_rewards/margins": 0.0818667784333229, + "eval_rewards/rejected": -0.1314186155796051, + "eval_runtime": 713.0872, + "eval_samples_per_second": 2.805, + "eval_steps_per_second": 1.402, + "step": 12700 + }, + { + "epoch": 0.83, + "learning_rate": 4.1972140441952246e-07, + "logits/chosen": -2.1343767642974854, + "logits/rejected": -2.1033663749694824, + "logps/chosen": -246.02841186523438, + "logps/rejected": -262.2856140136719, + "loss": 0.0525, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.032225728034973145, + "rewards/margins": 0.07124023139476776, + "rewards/rejected": -0.10346595197916031, + "step": 12710 + }, + { + "epoch": 0.83, + "learning_rate": 4.165600684302046e-07, + "logits/chosen": -2.245701789855957, + "logits/rejected": -2.2973880767822266, + "logps/chosen": -182.59765625, + "logps/rejected": -210.4281463623047, + "loss": 0.0206, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.034068286418914795, + "rewards/margins": 0.08936282247304916, + "rewards/rejected": -0.12343110144138336, + "step": 12720 + }, + { + "epoch": 0.83, + "learning_rate": 4.13409600684154e-07, + "logits/chosen": -2.3500359058380127, + "logits/rejected": -2.065136671066284, + "logps/chosen": -224.13363647460938, + "logps/rejected": -213.9673309326172, + "loss": 0.0515, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.045570097863674164, + "rewards/margins": 0.0950850322842598, + "rewards/rejected": -0.14065513014793396, + "step": 12730 + }, + { + "epoch": 0.83, + "learning_rate": 4.102700176157548e-07, + "logits/chosen": -2.415928602218628, + "logits/rejected": -2.047894239425659, + "logps/chosen": -336.18682861328125, + "logps/rejected": -254.1941375732422, + "loss": 0.0254, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05715783312916756, + "rewards/margins": 0.07896944135427475, + "rewards/rejected": -0.1361272633075714, + "step": 12740 + }, + { + "epoch": 0.83, + "learning_rate": 4.0714133560260884e-07, + "logits/chosen": -2.3049137592315674, + "logits/rejected": -2.1440796852111816, + "logps/chosen": -270.9903259277344, + "logps/rejected": -225.3958282470703, + "loss": 0.0283, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.057286281138658524, + "rewards/margins": 0.06318188458681107, + "rewards/rejected": -0.12046817690134048, + "step": 12750 + }, + { + "epoch": 0.83, + "learning_rate": 4.0402357096545527e-07, + "logits/chosen": -2.180220365524292, + "logits/rejected": -2.14140248298645, + "logps/chosen": -260.32342529296875, + "logps/rejected": -265.9286193847656, + "loss": 0.0184, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04280921444296837, + "rewards/margins": 0.07888636738061905, + "rewards/rejected": -0.12169557809829712, + "step": 12760 + }, + { + "epoch": 0.84, + "learning_rate": 4.0091673996808025e-07, + "logits/chosen": -2.389176845550537, + "logits/rejected": -2.1885383129119873, + "logps/chosen": -211.50894165039062, + "logps/rejected": -203.1997833251953, + "loss": 0.0251, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07224427908658981, + "rewards/margins": 0.08159011602401733, + "rewards/rejected": -0.15383440256118774, + "step": 12770 + }, + { + "epoch": 0.84, + "learning_rate": 3.9782085881723776e-07, + "logits/chosen": -2.207456588745117, + "logits/rejected": -2.061331272125244, + "logps/chosen": -176.98324584960938, + "logps/rejected": -208.9066619873047, + "loss": 0.0298, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.05692816898226738, + "rewards/margins": 0.09978387504816055, + "rewards/rejected": -0.15671204030513763, + "step": 12780 + }, + { + "epoch": 0.84, + "learning_rate": 3.947359436625592e-07, + "logits/chosen": -2.215158462524414, + "logits/rejected": -2.1081624031066895, + "logps/chosen": -241.7562713623047, + "logps/rejected": -225.2100830078125, + "loss": 0.0107, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03546663373708725, + "rewards/margins": 0.10508982837200165, + "rewards/rejected": -0.1405564546585083, + "step": 12790 + }, + { + "epoch": 0.84, + "learning_rate": 3.9166201059647386e-07, + "logits/chosen": -2.3521649837493896, + "logits/rejected": -2.221381664276123, + "logps/chosen": -268.37359619140625, + "logps/rejected": -241.0518341064453, + "loss": 0.0186, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03261677920818329, + "rewards/margins": 0.04804609343409538, + "rewards/rejected": -0.08066286146640778, + "step": 12800 + }, + { + "epoch": 0.84, + "eval_logits/chosen": -2.2818355560302734, + "eval_logits/rejected": -2.0935537815093994, + "eval_logps/chosen": -241.017578125, + "eval_logps/rejected": -237.06175231933594, + "eval_loss": 0.02415802702307701, + "eval_rewards/accuracies": 0.656499981880188, + "eval_rewards/chosen": -0.04506318271160126, + "eval_rewards/margins": 0.08218610286712646, + "eval_rewards/rejected": -0.12724927067756653, + "eval_runtime": 712.6346, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 12800 + }, + { + "epoch": 0.84, + "learning_rate": 3.8859907565412194e-07, + "logits/chosen": -2.1575679779052734, + "logits/rejected": -2.242316722869873, + "logps/chosen": -191.49639892578125, + "logps/rejected": -207.98928833007812, + "loss": 0.0485, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05923575907945633, + "rewards/margins": 0.08535636961460114, + "rewards/rejected": -0.14459213614463806, + "step": 12810 + }, + { + "epoch": 0.84, + "learning_rate": 3.8554715481327303e-07, + "logits/chosen": -2.3164939880371094, + "logits/rejected": -1.9328874349594116, + "logps/chosen": -246.2336883544922, + "logps/rejected": -230.98385620117188, + "loss": 0.0374, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0648733526468277, + "rewards/margins": 0.10073963552713394, + "rewards/rejected": -0.16561298072338104, + "step": 12820 + }, + { + "epoch": 0.84, + "learning_rate": 3.8250626399424007e-07, + "logits/chosen": -2.3338048458099365, + "logits/rejected": -2.086073875427246, + "logps/chosen": -261.89605712890625, + "logps/rejected": -259.2681579589844, + "loss": 0.0293, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05208975076675415, + "rewards/margins": 0.08111140131950378, + "rewards/rejected": -0.13320115208625793, + "step": 12830 + }, + { + "epoch": 0.84, + "learning_rate": 3.7947641905980104e-07, + "logits/chosen": -2.155761957168579, + "logits/rejected": -2.1673035621643066, + "logps/chosen": -214.59146118164062, + "logps/rejected": -195.30322265625, + "loss": 0.0337, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03636503964662552, + "rewards/margins": 0.07696239650249481, + "rewards/rejected": -0.11332742869853973, + "step": 12840 + }, + { + "epoch": 0.84, + "learning_rate": 3.764576358151098e-07, + "logits/chosen": -2.19984769821167, + "logits/rejected": -2.1929872035980225, + "logps/chosen": -182.03663635253906, + "logps/rejected": -184.73660278320312, + "loss": 0.0103, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.036097969859838486, + "rewards/margins": 0.07272133976221085, + "rewards/rejected": -0.10881930589675903, + "step": 12850 + }, + { + "epoch": 0.84, + "learning_rate": 3.7344993000761944e-07, + "logits/chosen": -2.3004448413848877, + "logits/rejected": -2.1955597400665283, + "logps/chosen": -191.7588348388672, + "logps/rejected": -252.80300903320312, + "loss": 0.0137, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07221857458353043, + "rewards/margins": 0.09036187827587128, + "rewards/rejected": -0.1625804454088211, + "step": 12860 + }, + { + "epoch": 0.84, + "learning_rate": 3.7045331732699585e-07, + "logits/chosen": -2.3142223358154297, + "logits/rejected": -2.1267404556274414, + "logps/chosen": -213.4777374267578, + "logps/rejected": -203.80746459960938, + "loss": 0.0338, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.041398413479328156, + "rewards/margins": 0.13014577329158783, + "rewards/rejected": -0.17154419422149658, + "step": 12870 + }, + { + "epoch": 0.84, + "learning_rate": 3.6746781340503993e-07, + "logits/chosen": -2.1561598777770996, + "logits/rejected": -2.0527725219726562, + "logps/chosen": -239.81692504882812, + "logps/rejected": -250.43899536132812, + "loss": 0.0233, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.027005767449736595, + "rewards/margins": 0.09324527531862259, + "rewards/rejected": -0.12025104463100433, + "step": 12880 + }, + { + "epoch": 0.84, + "learning_rate": 3.6449343381560116e-07, + "logits/chosen": -2.251657247543335, + "logits/rejected": -2.0255634784698486, + "logps/chosen": -246.94400024414062, + "logps/rejected": -256.09893798828125, + "loss": 0.034, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07759173959493637, + "rewards/margins": 0.11272220313549042, + "rewards/rejected": -0.1903139352798462, + "step": 12890 + }, + { + "epoch": 0.84, + "learning_rate": 3.615301940745017e-07, + "logits/chosen": -2.516042947769165, + "logits/rejected": -1.9202125072479248, + "logps/chosen": -325.527099609375, + "logps/rejected": -234.82717895507812, + "loss": 0.0117, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.041659772396087646, + "rewards/margins": 0.07238514721393585, + "rewards/rejected": -0.11404494196176529, + "step": 12900 + }, + { + "epoch": 0.84, + "eval_logits/chosen": -2.2790322303771973, + "eval_logits/rejected": -2.0908236503601074, + "eval_logps/chosen": -239.9394989013672, + "eval_logps/rejected": -236.24349975585938, + "eval_loss": 0.02411588840186596, + "eval_rewards/accuracies": 0.6579999923706055, + "eval_rewards/chosen": -0.03967278078198433, + "eval_rewards/margins": 0.0834852010011673, + "eval_rewards/rejected": -0.12315797060728073, + "eval_runtime": 710.5468, + "eval_samples_per_second": 2.815, + "eval_steps_per_second": 1.407, + "step": 12900 + }, + { + "epoch": 0.84, + "learning_rate": 3.5857810963945084e-07, + "logits/chosen": -2.1284141540527344, + "logits/rejected": -1.9132626056671143, + "logps/chosen": -225.8367462158203, + "logps/rejected": -226.9214630126953, + "loss": 0.0443, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.056638143956661224, + "rewards/margins": 0.07953803241252899, + "rewards/rejected": -0.1361761838197708, + "step": 12910 + }, + { + "epoch": 0.85, + "learning_rate": 3.556371959099678e-07, + "logits/chosen": -2.348480463027954, + "logits/rejected": -2.103860855102539, + "logps/chosen": -302.9031066894531, + "logps/rejected": -277.6312255859375, + "loss": 0.0125, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.028643876314163208, + "rewards/margins": 0.07349316030740738, + "rewards/rejected": -0.10213702917098999, + "step": 12920 + }, + { + "epoch": 0.85, + "learning_rate": 3.5270746822729797e-07, + "logits/chosen": -2.2369742393493652, + "logits/rejected": -2.157525062561035, + "logps/chosen": -255.8109588623047, + "logps/rejected": -288.16217041015625, + "loss": 0.0357, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04164459556341171, + "rewards/margins": 0.09887997806072235, + "rewards/rejected": -0.14052458107471466, + "step": 12930 + }, + { + "epoch": 0.85, + "learning_rate": 3.4978894187433746e-07, + "logits/chosen": -2.316485643386841, + "logits/rejected": -2.1782500743865967, + "logps/chosen": -163.74954223632812, + "logps/rejected": -163.9324493408203, + "loss": 0.0443, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.0633717030286789, + "rewards/margins": 0.05545664578676224, + "rewards/rejected": -0.11882835626602173, + "step": 12940 + }, + { + "epoch": 0.85, + "learning_rate": 3.468816320755486e-07, + "logits/chosen": -2.104240894317627, + "logits/rejected": -1.9286854267120361, + "logps/chosen": -227.17626953125, + "logps/rejected": -201.1591033935547, + "loss": 0.0121, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.016242554411292076, + "rewards/margins": 0.06796287000179291, + "rewards/rejected": -0.08420543372631073, + "step": 12950 + }, + { + "epoch": 0.85, + "learning_rate": 3.4398555399688336e-07, + "logits/chosen": -2.3788199424743652, + "logits/rejected": -2.011913537979126, + "logps/chosen": -224.4126739501953, + "logps/rejected": -210.57296752929688, + "loss": 0.0248, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06531089544296265, + "rewards/margins": 0.03412250801920891, + "rewards/rejected": -0.09943340718746185, + "step": 12960 + }, + { + "epoch": 0.85, + "learning_rate": 3.411007227457047e-07, + "logits/chosen": -2.2853686809539795, + "logits/rejected": -2.221477508544922, + "logps/chosen": -254.6948699951172, + "logps/rejected": -243.5530548095703, + "loss": 0.0206, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.034668006002902985, + "rewards/margins": 0.10566465556621552, + "rewards/rejected": -0.1403326839208603, + "step": 12970 + }, + { + "epoch": 0.85, + "learning_rate": 3.382271533707043e-07, + "logits/chosen": -2.1889026165008545, + "logits/rejected": -2.1711583137512207, + "logps/chosen": -198.1396026611328, + "logps/rejected": -187.36752319335938, + "loss": 0.0256, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03267192095518112, + "rewards/margins": 0.06055806949734688, + "rewards/rejected": -0.0932299867272377, + "step": 12980 + }, + { + "epoch": 0.85, + "learning_rate": 3.353648608618287e-07, + "logits/chosen": -2.2680234909057617, + "logits/rejected": -2.0193073749542236, + "logps/chosen": -176.73532104492188, + "logps/rejected": -182.29981994628906, + "loss": 0.0289, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04432029277086258, + "rewards/margins": 0.07767603546380997, + "rewards/rejected": -0.12199632823467255, + "step": 12990 + }, + { + "epoch": 0.85, + "learning_rate": 3.3251386015019676e-07, + "logits/chosen": -2.3049163818359375, + "logits/rejected": -2.0893776416778564, + "logps/chosen": -213.1085968017578, + "logps/rejected": -196.01004028320312, + "loss": 0.0116, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.051745809614658356, + "rewards/margins": 0.08322995156049728, + "rewards/rejected": -0.13497576117515564, + "step": 13000 + }, + { + "epoch": 0.85, + "eval_logits/chosen": -2.2780840396881104, + "eval_logits/rejected": -2.089902639389038, + "eval_logps/chosen": -240.3864288330078, + "eval_logps/rejected": -237.061279296875, + "eval_loss": 0.0241058599203825, + "eval_rewards/accuracies": 0.6579999923706055, + "eval_rewards/chosen": -0.04190727323293686, + "eval_rewards/margins": 0.0853395164012909, + "eval_rewards/rejected": -0.12724678218364716, + "eval_runtime": 712.6776, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 13000 + }, + { + "epoch": 0.85, + "learning_rate": 3.296741661080255e-07, + "logits/chosen": -2.229576349258423, + "logits/rejected": -2.135002613067627, + "logps/chosen": -243.9384002685547, + "logps/rejected": -258.056884765625, + "loss": 0.0153, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.054000191390514374, + "rewards/margins": 0.10329830646514893, + "rewards/rejected": -0.1572985053062439, + "step": 13010 + }, + { + "epoch": 0.85, + "learning_rate": 3.2684579354854974e-07, + "logits/chosen": -2.3377318382263184, + "logits/rejected": -2.229548454284668, + "logps/chosen": -306.2275390625, + "logps/rejected": -335.21124267578125, + "loss": 0.0297, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06884465366601944, + "rewards/margins": 0.09362609684467316, + "rewards/rejected": -0.1624707579612732, + "step": 13020 + }, + { + "epoch": 0.85, + "learning_rate": 3.2402875722594653e-07, + "logits/chosen": -2.3580403327941895, + "logits/rejected": -2.0886740684509277, + "logps/chosen": -174.5337677001953, + "logps/rejected": -198.30404663085938, + "loss": 0.0152, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.027441659942269325, + "rewards/margins": 0.08809362351894379, + "rewards/rejected": -0.11553528159856796, + "step": 13030 + }, + { + "epoch": 0.85, + "learning_rate": 3.212230718352566e-07, + "logits/chosen": -2.2204318046569824, + "logits/rejected": -2.235112190246582, + "logps/chosen": -235.35092163085938, + "logps/rejected": -175.29856872558594, + "loss": 0.0313, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.05150535702705383, + "rewards/margins": 0.018223298713564873, + "rewards/rejected": -0.06972865760326385, + "step": 13040 + }, + { + "epoch": 0.85, + "learning_rate": 3.1842875201231025e-07, + "logits/chosen": -2.278357744216919, + "logits/rejected": -1.9919675588607788, + "logps/chosen": -228.9661102294922, + "logps/rejected": -218.33358764648438, + "loss": 0.0262, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.036130357533693314, + "rewards/margins": 0.07571340352296829, + "rewards/rejected": -0.1118437647819519, + "step": 13050 + }, + { + "epoch": 0.85, + "learning_rate": 3.156458123336478e-07, + "logits/chosen": -2.1235363483428955, + "logits/rejected": -1.9482349157333374, + "logps/chosen": -164.78628540039062, + "logps/rejected": -178.83383178710938, + "loss": 0.0237, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03137214109301567, + "rewards/margins": 0.12098171561956406, + "rewards/rejected": -0.15235385298728943, + "step": 13060 + }, + { + "epoch": 0.86, + "learning_rate": 3.128742673164459e-07, + "logits/chosen": -2.3608107566833496, + "logits/rejected": -1.9896495342254639, + "logps/chosen": -290.57684326171875, + "logps/rejected": -264.00274658203125, + "loss": 0.0079, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.038247980177402496, + "rewards/margins": 0.08970911800861359, + "rewards/rejected": -0.12795709073543549, + "step": 13070 + }, + { + "epoch": 0.86, + "learning_rate": 3.101141314184414e-07, + "logits/chosen": -2.4731571674346924, + "logits/rejected": -2.226579189300537, + "logps/chosen": -212.8175506591797, + "logps/rejected": -215.1665802001953, + "loss": 0.0264, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.03716491162776947, + "rewards/margins": 0.05780158191919327, + "rewards/rejected": -0.09496650099754333, + "step": 13080 + }, + { + "epoch": 0.86, + "learning_rate": 3.0736541903785526e-07, + "logits/chosen": -2.121049165725708, + "logits/rejected": -2.087754487991333, + "logps/chosen": -217.7719268798828, + "logps/rejected": -284.319091796875, + "loss": 0.0207, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04378505051136017, + "rewards/margins": 0.0854378491640091, + "rewards/rejected": -0.12922288477420807, + "step": 13090 + }, + { + "epoch": 0.86, + "learning_rate": 3.0462814451331704e-07, + "logits/chosen": -2.177192211151123, + "logits/rejected": -2.042564630508423, + "logps/chosen": -241.98666381835938, + "logps/rejected": -251.2749481201172, + "loss": 0.0338, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.056551218032836914, + "rewards/margins": 0.04691624268889427, + "rewards/rejected": -0.10346746444702148, + "step": 13100 + }, + { + "epoch": 0.86, + "eval_logits/chosen": -2.282399892807007, + "eval_logits/rejected": -2.094139337539673, + "eval_logps/chosen": -240.08843994140625, + "eval_logps/rejected": -236.25448608398438, + "eval_loss": 0.02407999336719513, + "eval_rewards/accuracies": 0.656499981880188, + "eval_rewards/chosen": -0.04041757434606552, + "eval_rewards/margins": 0.08279527723789215, + "eval_rewards/rejected": -0.12321285903453827, + "eval_runtime": 712.8135, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 13100 + }, + { + "epoch": 0.86, + "learning_rate": 3.019023221237927e-07, + "logits/chosen": -2.239502429962158, + "logits/rejected": -2.0651755332946777, + "logps/chosen": -246.2601318359375, + "logps/rejected": -205.74697875976562, + "loss": 0.0222, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.04763896390795708, + "rewards/margins": 0.08527128398418427, + "rewards/rejected": -0.13291025161743164, + "step": 13110 + }, + { + "epoch": 0.86, + "learning_rate": 2.991879660885058e-07, + "logits/chosen": -2.4021129608154297, + "logits/rejected": -2.1448426246643066, + "logps/chosen": -270.3570861816406, + "logps/rejected": -270.1576843261719, + "loss": 0.0313, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03405073657631874, + "rewards/margins": 0.08080430328845978, + "rewards/rejected": -0.11485502868890762, + "step": 13120 + }, + { + "epoch": 0.86, + "learning_rate": 2.9648509056686786e-07, + "logits/chosen": -2.305415630340576, + "logits/rejected": -2.1397616863250732, + "logps/chosen": -183.93411254882812, + "logps/rejected": -175.04684448242188, + "loss": 0.0292, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.03496173024177551, + "rewards/margins": 0.08344221115112305, + "rewards/rejected": -0.11840394884347916, + "step": 13130 + }, + { + "epoch": 0.86, + "learning_rate": 2.937937096584012e-07, + "logits/chosen": -2.29146146774292, + "logits/rejected": -2.0643632411956787, + "logps/chosen": -297.84332275390625, + "logps/rejected": -251.4933624267578, + "loss": 0.0271, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.037211813032627106, + "rewards/margins": 0.0706702172756195, + "rewards/rejected": -0.10788202285766602, + "step": 13140 + }, + { + "epoch": 0.86, + "learning_rate": 2.9111383740266756e-07, + "logits/chosen": -2.090592622756958, + "logits/rejected": -1.9605131149291992, + "logps/chosen": -244.744140625, + "logps/rejected": -249.68655395507812, + "loss": 0.0227, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04479987546801567, + "rewards/margins": 0.053652308881282806, + "rewards/rejected": -0.09845219552516937, + "step": 13150 + }, + { + "epoch": 0.86, + "learning_rate": 2.8844548777919255e-07, + "logits/chosen": -2.3207592964172363, + "logits/rejected": -2.0424532890319824, + "logps/chosen": -210.184814453125, + "logps/rejected": -202.9254150390625, + "loss": 0.0247, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03150486573576927, + "rewards/margins": 0.07510420680046082, + "rewards/rejected": -0.10660906881093979, + "step": 13160 + }, + { + "epoch": 0.86, + "learning_rate": 2.8578867470739594e-07, + "logits/chosen": -2.156432628631592, + "logits/rejected": -2.0161538124084473, + "logps/chosen": -197.3763427734375, + "logps/rejected": -185.967041015625, + "loss": 0.0517, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06443998962640762, + "rewards/margins": 0.10006703436374664, + "rewards/rejected": -0.16450701653957367, + "step": 13170 + }, + { + "epoch": 0.86, + "learning_rate": 2.8314341204651484e-07, + "logits/chosen": -2.3970794677734375, + "logits/rejected": -2.125443458557129, + "logps/chosen": -281.5930480957031, + "logps/rejected": -231.6305694580078, + "loss": 0.0152, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.027769574895501137, + "rewards/margins": 0.11410045623779297, + "rewards/rejected": -0.14187002182006836, + "step": 13180 + }, + { + "epoch": 0.86, + "learning_rate": 2.805097135955362e-07, + "logits/chosen": -2.310490131378174, + "logits/rejected": -2.1002745628356934, + "logps/chosen": -219.2947235107422, + "logps/rejected": -205.0678253173828, + "loss": 0.0379, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03385355696082115, + "rewards/margins": 0.10349483788013458, + "rewards/rejected": -0.13734838366508484, + "step": 13190 + }, + { + "epoch": 0.86, + "learning_rate": 2.778875930931213e-07, + "logits/chosen": -2.306821346282959, + "logits/rejected": -1.9826276302337646, + "logps/chosen": -242.89950561523438, + "logps/rejected": -245.67117309570312, + "loss": 0.0206, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03890315443277359, + "rewards/margins": 0.10066906362771988, + "rewards/rejected": -0.13957220315933228, + "step": 13200 + }, + { + "epoch": 0.86, + "eval_logits/chosen": -2.2772462368011475, + "eval_logits/rejected": -2.089167833328247, + "eval_logps/chosen": -240.58746337890625, + "eval_logps/rejected": -237.21768188476562, + "eval_loss": 0.024047939106822014, + "eval_rewards/accuracies": 0.6589999794960022, + "eval_rewards/chosen": -0.042912621051073074, + "eval_rewards/margins": 0.08511631935834885, + "eval_rewards/rejected": -0.12802892923355103, + "eval_runtime": 714.205, + "eval_samples_per_second": 2.8, + "eval_steps_per_second": 1.4, + "step": 13200 + }, + { + "epoch": 0.86, + "learning_rate": 2.7527706421753426e-07, + "logits/chosen": -2.278181791305542, + "logits/rejected": -2.2129273414611816, + "logps/chosen": -208.23129272460938, + "logps/rejected": -224.854736328125, + "loss": 0.0276, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.04915057495236397, + "rewards/margins": 0.060673534870147705, + "rewards/rejected": -0.10982410609722137, + "step": 13210 + }, + { + "epoch": 0.86, + "learning_rate": 2.726781405865736e-07, + "logits/chosen": -2.3695473670959473, + "logits/rejected": -1.8260514736175537, + "logps/chosen": -309.7107849121094, + "logps/rejected": -211.94125366210938, + "loss": 0.0176, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03522497043013573, + "rewards/margins": 0.08999510854482651, + "rewards/rejected": -0.12522009015083313, + "step": 13220 + }, + { + "epoch": 0.87, + "learning_rate": 2.7009083575749687e-07, + "logits/chosen": -2.2610929012298584, + "logits/rejected": -2.1646924018859863, + "logps/chosen": -252.8301544189453, + "logps/rejected": -263.22113037109375, + "loss": 0.0156, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.044849202036857605, + "rewards/margins": 0.06013251468539238, + "rewards/rejected": -0.10498170554637909, + "step": 13230 + }, + { + "epoch": 0.87, + "learning_rate": 2.6751516322695457e-07, + "logits/chosen": -2.3331539630889893, + "logits/rejected": -2.2672178745269775, + "logps/chosen": -198.41842651367188, + "logps/rejected": -202.80697631835938, + "loss": 0.0251, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04087433964014053, + "rewards/margins": 0.05214228481054306, + "rewards/rejected": -0.09301663935184479, + "step": 13240 + }, + { + "epoch": 0.87, + "learning_rate": 2.649511364309154e-07, + "logits/chosen": -2.27077317237854, + "logits/rejected": -2.239917278289795, + "logps/chosen": -210.52523803710938, + "logps/rejected": -208.5736083984375, + "loss": 0.0057, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04117783159017563, + "rewards/margins": 0.09464763104915619, + "rewards/rejected": -0.13582547008991241, + "step": 13250 + }, + { + "epoch": 0.87, + "learning_rate": 2.6239876874460003e-07, + "logits/chosen": -2.3841896057128906, + "logits/rejected": -2.257284641265869, + "logps/chosen": -290.65576171875, + "logps/rejected": -285.47900390625, + "loss": 0.0196, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.031202878803014755, + "rewards/margins": 0.12689706683158875, + "rewards/rejected": -0.158099964261055, + "step": 13260 + }, + { + "epoch": 0.87, + "learning_rate": 2.5985807348240744e-07, + "logits/chosen": -2.424506187438965, + "logits/rejected": -1.9588611125946045, + "logps/chosen": -238.301025390625, + "logps/rejected": -216.34521484375, + "loss": 0.0144, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02978738211095333, + "rewards/margins": 0.12995637953281403, + "rewards/rejected": -0.1597437560558319, + "step": 13270 + }, + { + "epoch": 0.87, + "learning_rate": 2.5732906389785014e-07, + "logits/chosen": -2.312986373901367, + "logits/rejected": -2.1459386348724365, + "logps/chosen": -282.345703125, + "logps/rejected": -268.94635009765625, + "loss": 0.0139, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.023198014125227928, + "rewards/margins": 0.12373118102550507, + "rewards/rejected": -0.14692921936511993, + "step": 13280 + }, + { + "epoch": 0.87, + "learning_rate": 2.5481175318347956e-07, + "logits/chosen": -2.174574375152588, + "logits/rejected": -2.2210919857025146, + "logps/chosen": -233.5167999267578, + "logps/rejected": -270.05572509765625, + "loss": 0.0204, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.030095994472503662, + "rewards/margins": 0.09321445971727371, + "rewards/rejected": -0.12331046164035797, + "step": 13290 + }, + { + "epoch": 0.87, + "learning_rate": 2.5230615447082246e-07, + "logits/chosen": -2.273481845855713, + "logits/rejected": -1.9318599700927734, + "logps/chosen": -260.14398193359375, + "logps/rejected": -260.80560302734375, + "loss": 0.018, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.04413662105798721, + "rewards/margins": 0.07723399251699448, + "rewards/rejected": -0.1213705986738205, + "step": 13300 + }, + { + "epoch": 0.87, + "eval_logits/chosen": -2.277225971221924, + "eval_logits/rejected": -2.0891315937042236, + "eval_logps/chosen": -240.14222717285156, + "eval_logps/rejected": -236.75955200195312, + "eval_loss": 0.02404077909886837, + "eval_rewards/accuracies": 0.6600000262260437, + "eval_rewards/chosen": -0.04068637639284134, + "eval_rewards/margins": 0.08505190163850784, + "eval_rewards/rejected": -0.12573827803134918, + "eval_runtime": 714.5476, + "eval_samples_per_second": 2.799, + "eval_steps_per_second": 1.399, + "step": 13300 + }, + { + "epoch": 0.87, + "learning_rate": 2.49812280830308e-07, + "logits/chosen": -2.300873279571533, + "logits/rejected": -1.8354320526123047, + "logps/chosen": -232.4656219482422, + "logps/rejected": -232.950927734375, + "loss": 0.0207, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.03714224696159363, + "rewards/margins": 0.17670652270317078, + "rewards/rejected": -0.2138487547636032, + "step": 13310 + }, + { + "epoch": 0.87, + "learning_rate": 2.4733014527120457e-07, + "logits/chosen": -2.1679153442382812, + "logits/rejected": -1.9986671209335327, + "logps/chosen": -221.49765014648438, + "logps/rejected": -219.9002685546875, + "loss": 0.0274, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0954136997461319, + "rewards/margins": 0.10647950321435928, + "rewards/rejected": -0.20189321041107178, + "step": 13320 + }, + { + "epoch": 0.87, + "learning_rate": 2.4485976074154565e-07, + "logits/chosen": -2.2446699142456055, + "logits/rejected": -2.301520824432373, + "logps/chosen": -219.3531036376953, + "logps/rejected": -253.1564483642578, + "loss": 0.0207, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.05177539587020874, + "rewards/margins": 0.018800964578986168, + "rewards/rejected": -0.07057635486125946, + "step": 13330 + }, + { + "epoch": 0.87, + "learning_rate": 2.4240114012806763e-07, + "logits/chosen": -2.2601559162139893, + "logits/rejected": -2.2380449771881104, + "logps/chosen": -212.53067016601562, + "logps/rejected": -209.5550994873047, + "loss": 0.0197, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.023662324994802475, + "rewards/margins": 0.07047148048877716, + "rewards/rejected": -0.09413380920886993, + "step": 13340 + }, + { + "epoch": 0.87, + "learning_rate": 2.399542962561399e-07, + "logits/chosen": -2.1661453247070312, + "logits/rejected": -2.0010826587677, + "logps/chosen": -230.67544555664062, + "logps/rejected": -205.7180938720703, + "loss": 0.0274, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02609751746058464, + "rewards/margins": 0.11225984990596771, + "rewards/rejected": -0.13835737109184265, + "step": 13350 + }, + { + "epoch": 0.87, + "learning_rate": 2.3751924188969876e-07, + "logits/chosen": -2.2163939476013184, + "logits/rejected": -2.0882577896118164, + "logps/chosen": -256.12921142578125, + "logps/rejected": -258.23223876953125, + "loss": 0.0171, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.021953338757157326, + "rewards/margins": 0.10968828201293945, + "rewards/rejected": -0.13164159655570984, + "step": 13360 + }, + { + "epoch": 0.87, + "learning_rate": 2.3509598973118024e-07, + "logits/chosen": -2.409304141998291, + "logits/rejected": -2.2119812965393066, + "logps/chosen": -228.95950317382812, + "logps/rejected": -176.398193359375, + "loss": 0.015, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.039770763367414474, + "rewards/margins": 0.05113250017166138, + "rewards/rejected": -0.09090326726436615, + "step": 13370 + }, + { + "epoch": 0.88, + "learning_rate": 2.326845524214555e-07, + "logits/chosen": -2.0703043937683105, + "logits/rejected": -2.148268938064575, + "logps/chosen": -245.6635284423828, + "logps/rejected": -221.56747436523438, + "loss": 0.034, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.053046077489852905, + "rewards/margins": 0.010143243707716465, + "rewards/rejected": -0.0631893128156662, + "step": 13380 + }, + { + "epoch": 0.88, + "learning_rate": 2.3028494253976158e-07, + "logits/chosen": -2.3399901390075684, + "logits/rejected": -2.1317601203918457, + "logps/chosen": -353.18402099609375, + "logps/rejected": -301.57366943359375, + "loss": 0.0209, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.059592921286821365, + "rewards/margins": 0.06485487520694733, + "rewards/rejected": -0.12444780021905899, + "step": 13390 + }, + { + "epoch": 0.88, + "learning_rate": 2.2789717260364026e-07, + "logits/chosen": -2.3331074714660645, + "logits/rejected": -2.1523356437683105, + "logps/chosen": -172.91854858398438, + "logps/rejected": -164.48684692382812, + "loss": 0.0275, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03906229883432388, + "rewards/margins": 0.0664801225066185, + "rewards/rejected": -0.10554243624210358, + "step": 13400 + }, + { + "epoch": 0.88, + "eval_logits/chosen": -2.278562545776367, + "eval_logits/rejected": -2.0904037952423096, + "eval_logps/chosen": -239.84494018554688, + "eval_logps/rejected": -236.29257202148438, + "eval_loss": 0.02403143234550953, + "eval_rewards/accuracies": 0.6585000157356262, + "eval_rewards/chosen": -0.039199963212013245, + "eval_rewards/margins": 0.08420341461896896, + "eval_rewards/rejected": -0.12340336292982101, + "eval_runtime": 713.6647, + "eval_samples_per_second": 2.802, + "eval_steps_per_second": 1.401, + "step": 13400 + }, + { + "epoch": 0.88, + "learning_rate": 2.255212550688682e-07, + "logits/chosen": -2.233186721801758, + "logits/rejected": -2.340898036956787, + "logps/chosen": -227.12271118164062, + "logps/rejected": -309.49151611328125, + "loss": 0.0252, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0458383783698082, + "rewards/margins": 0.09091077744960785, + "rewards/rejected": -0.13674917817115784, + "step": 13410 + }, + { + "epoch": 0.88, + "learning_rate": 2.2315720232939598e-07, + "logits/chosen": -2.600893497467041, + "logits/rejected": -2.138629913330078, + "logps/chosen": -265.4658203125, + "logps/rejected": -198.0146026611328, + "loss": 0.0157, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.016760537400841713, + "rewards/margins": 0.11075560748577118, + "rewards/rejected": -0.12751615047454834, + "step": 13420 + }, + { + "epoch": 0.88, + "learning_rate": 2.2080502671727956e-07, + "logits/chosen": -2.3849570751190186, + "logits/rejected": -2.043117046356201, + "logps/chosen": -224.8113555908203, + "logps/rejected": -219.0714874267578, + "loss": 0.0282, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02364751696586609, + "rewards/margins": 0.08043906092643738, + "rewards/rejected": -0.10408657789230347, + "step": 13430 + }, + { + "epoch": 0.88, + "learning_rate": 2.1846474050262078e-07, + "logits/chosen": -2.350308656692505, + "logits/rejected": -2.2130496501922607, + "logps/chosen": -252.57907104492188, + "logps/rejected": -198.87066650390625, + "loss": 0.0115, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.030091604217886925, + "rewards/margins": 0.06044364720582962, + "rewards/rejected": -0.09053526073694229, + "step": 13440 + }, + { + "epoch": 0.88, + "learning_rate": 2.1613635589349756e-07, + "logits/chosen": -1.924780249595642, + "logits/rejected": -2.009662628173828, + "logps/chosen": -202.04293823242188, + "logps/rejected": -251.4788055419922, + "loss": 0.0402, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.024398323148489, + "rewards/margins": 0.10957573354244232, + "rewards/rejected": -0.13397404551506042, + "step": 13450 + }, + { + "epoch": 0.88, + "learning_rate": 2.1381988503590578e-07, + "logits/chosen": -2.011870861053467, + "logits/rejected": -2.077077627182007, + "logps/chosen": -223.63961791992188, + "logps/rejected": -238.5012664794922, + "loss": 0.0164, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03883880376815796, + "rewards/margins": 0.10505743324756622, + "rewards/rejected": -0.14389625191688538, + "step": 13460 + }, + { + "epoch": 0.88, + "learning_rate": 2.11515340013691e-07, + "logits/chosen": -2.3660213947296143, + "logits/rejected": -2.3374831676483154, + "logps/chosen": -238.0471954345703, + "logps/rejected": -241.2670440673828, + "loss": 0.0171, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03437312692403793, + "rewards/margins": 0.12400054931640625, + "rewards/rejected": -0.1583736687898636, + "step": 13470 + }, + { + "epoch": 0.88, + "learning_rate": 2.092227328484897e-07, + "logits/chosen": -2.1448683738708496, + "logits/rejected": -2.1051604747772217, + "logps/chosen": -211.29458618164062, + "logps/rejected": -262.7167663574219, + "loss": 0.0152, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03642933443188667, + "rewards/margins": 0.1038234680891037, + "rewards/rejected": -0.14025278389453888, + "step": 13480 + }, + { + "epoch": 0.88, + "learning_rate": 2.0694207549966345e-07, + "logits/chosen": -2.1631648540496826, + "logits/rejected": -2.0374045372009277, + "logps/chosen": -221.1369171142578, + "logps/rejected": -214.2423095703125, + "loss": 0.0332, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.061142198741436005, + "rewards/margins": 0.04987434297800064, + "rewards/rejected": -0.11101654917001724, + "step": 13490 + }, + { + "epoch": 0.88, + "learning_rate": 2.0467337986423864e-07, + "logits/chosen": -2.4196977615356445, + "logits/rejected": -2.1247169971466064, + "logps/chosen": -310.6087341308594, + "logps/rejected": -289.87506103515625, + "loss": 0.0177, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.0244259275496006, + "rewards/margins": 0.062395643442869186, + "rewards/rejected": -0.08682157099246979, + "step": 13500 + }, + { + "epoch": 0.88, + "eval_logits/chosen": -2.2791824340820312, + "eval_logits/rejected": -2.0911219120025635, + "eval_logps/chosen": -239.38253784179688, + "eval_logps/rejected": -235.622314453125, + "eval_loss": 0.024019325152039528, + "eval_rewards/accuracies": 0.6579999923706055, + "eval_rewards/chosen": -0.03688788786530495, + "eval_rewards/margins": 0.08316419273614883, + "eval_rewards/rejected": -0.12005206942558289, + "eval_runtime": 713.7035, + "eval_samples_per_second": 2.802, + "eval_steps_per_second": 1.401, + "step": 13500 + }, + { + "epoch": 0.88, + "learning_rate": 2.0241665777684272e-07, + "logits/chosen": -2.3379337787628174, + "logits/rejected": -2.2434744834899902, + "logps/chosen": -273.8184509277344, + "logps/rejected": -255.4578094482422, + "loss": 0.0209, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.027627144008874893, + "rewards/margins": 0.12627311050891876, + "rewards/rejected": -0.15390026569366455, + "step": 13510 + }, + { + "epoch": 0.88, + "learning_rate": 2.0017192100964366e-07, + "logits/chosen": -1.9843534231185913, + "logits/rejected": -2.065035820007324, + "logps/chosen": -211.2195587158203, + "logps/rejected": -229.91024780273438, + "loss": 0.0204, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05631772428750992, + "rewards/margins": 0.08286778628826141, + "rewards/rejected": -0.13918551802635193, + "step": 13520 + }, + { + "epoch": 0.89, + "learning_rate": 1.9793918127228777e-07, + "logits/chosen": -2.366363048553467, + "logits/rejected": -1.9975332021713257, + "logps/chosen": -326.14263916015625, + "logps/rejected": -289.92840576171875, + "loss": 0.0234, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04921586066484451, + "rewards/margins": 0.08933909237384796, + "rewards/rejected": -0.13855496048927307, + "step": 13530 + }, + { + "epoch": 0.89, + "learning_rate": 1.9571845021184005e-07, + "logits/chosen": -2.1216652393341064, + "logits/rejected": -2.031754732131958, + "logps/chosen": -243.87771606445312, + "logps/rejected": -265.47723388671875, + "loss": 0.026, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06445915251970291, + "rewards/margins": 0.08740756660699844, + "rewards/rejected": -0.15186671912670135, + "step": 13540 + }, + { + "epoch": 0.89, + "learning_rate": 1.9350973941272027e-07, + "logits/chosen": -2.26531720161438, + "logits/rejected": -2.232865810394287, + "logps/chosen": -214.000732421875, + "logps/rejected": -210.4711456298828, + "loss": 0.033, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05112849920988083, + "rewards/margins": 0.08485585451126099, + "rewards/rejected": -0.1359843611717224, + "step": 13550 + }, + { + "epoch": 0.89, + "learning_rate": 1.9131306039664676e-07, + "logits/chosen": -2.1346435546875, + "logits/rejected": -2.0782511234283447, + "logps/chosen": -207.88265991210938, + "logps/rejected": -251.60629272460938, + "loss": 0.0484, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.039892952889204025, + "rewards/margins": 0.09683619439601898, + "rewards/rejected": -0.1367291510105133, + "step": 13560 + }, + { + "epoch": 0.89, + "learning_rate": 1.8912842462257358e-07, + "logits/chosen": -2.170205593109131, + "logits/rejected": -2.098627805709839, + "logps/chosen": -228.48764038085938, + "logps/rejected": -234.42172241210938, + "loss": 0.0362, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04159203916788101, + "rewards/margins": 0.1146048903465271, + "rewards/rejected": -0.1561969369649887, + "step": 13570 + }, + { + "epoch": 0.89, + "learning_rate": 1.869558434866303e-07, + "logits/chosen": -2.241560459136963, + "logits/rejected": -2.308928966522217, + "logps/chosen": -191.30880737304688, + "logps/rejected": -229.52975463867188, + "loss": 0.0319, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05109493061900139, + "rewards/margins": 0.10095541179180145, + "rewards/rejected": -0.15205034613609314, + "step": 13580 + }, + { + "epoch": 0.89, + "learning_rate": 1.847953283220652e-07, + "logits/chosen": -2.41326642036438, + "logits/rejected": -2.0905160903930664, + "logps/chosen": -262.574951171875, + "logps/rejected": -211.3037567138672, + "loss": 0.0165, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.032326146960258484, + "rewards/margins": 0.13682501018047333, + "rewards/rejected": -0.1691511571407318, + "step": 13590 + }, + { + "epoch": 0.89, + "learning_rate": 1.8264689039918265e-07, + "logits/chosen": -2.3743739128112793, + "logits/rejected": -2.0387189388275146, + "logps/chosen": -269.67041015625, + "logps/rejected": -256.69189453125, + "loss": 0.0225, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04920024052262306, + "rewards/margins": 0.07223823666572571, + "rewards/rejected": -0.12143848091363907, + "step": 13600 + }, + { + "epoch": 0.89, + "eval_logits/chosen": -2.2793779373168945, + "eval_logits/rejected": -2.091268539428711, + "eval_logps/chosen": -240.11476135253906, + "eval_logps/rejected": -236.71998596191406, + "eval_loss": 0.0239734910428524, + "eval_rewards/accuracies": 0.6579999923706055, + "eval_rewards/chosen": -0.04054900258779526, + "eval_rewards/margins": 0.08499140292406082, + "eval_rewards/rejected": -0.12554040551185608, + "eval_runtime": 714.3747, + "eval_samples_per_second": 2.8, + "eval_steps_per_second": 1.4, + "step": 13600 + }, + { + "epoch": 0.89, + "learning_rate": 1.8051054092528857e-07, + "logits/chosen": -2.308885097503662, + "logits/rejected": -2.1212146282196045, + "logps/chosen": -265.5684509277344, + "logps/rejected": -278.5885925292969, + "loss": 0.021, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.022335032001137733, + "rewards/margins": 0.1139020100235939, + "rewards/rejected": -0.13623705506324768, + "step": 13610 + }, + { + "epoch": 0.89, + "learning_rate": 1.783862910446271e-07, + "logits/chosen": -1.9275972843170166, + "logits/rejected": -2.0645546913146973, + "logps/chosen": -182.4210205078125, + "logps/rejected": -196.45352172851562, + "loss": 0.0311, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04783863574266434, + "rewards/margins": 0.11673679202795029, + "rewards/rejected": -0.16457542777061462, + "step": 13620 + }, + { + "epoch": 0.89, + "learning_rate": 1.762741518383271e-07, + "logits/chosen": -2.326813220977783, + "logits/rejected": -2.162519693374634, + "logps/chosen": -230.1579132080078, + "logps/rejected": -220.95681762695312, + "loss": 0.0211, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.041627801954746246, + "rewards/margins": 0.09231483936309814, + "rewards/rejected": -0.133942648768425, + "step": 13630 + }, + { + "epoch": 0.89, + "learning_rate": 1.7417413432434082e-07, + "logits/chosen": -2.386035680770874, + "logits/rejected": -1.9975239038467407, + "logps/chosen": -263.8116149902344, + "logps/rejected": -224.295166015625, + "loss": 0.0433, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05575961619615555, + "rewards/margins": 0.06625467538833618, + "rewards/rejected": -0.12201429903507233, + "step": 13640 + }, + { + "epoch": 0.89, + "learning_rate": 1.7208624945738855e-07, + "logits/chosen": -2.3942575454711914, + "logits/rejected": -2.2442939281463623, + "logps/chosen": -226.2176513671875, + "logps/rejected": -243.75204467773438, + "loss": 0.0193, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0406666025519371, + "rewards/margins": 0.05048510432243347, + "rewards/rejected": -0.09115169942378998, + "step": 13650 + }, + { + "epoch": 0.89, + "learning_rate": 1.7001050812889995e-07, + "logits/chosen": -2.3620524406433105, + "logits/rejected": -2.045550584793091, + "logps/chosen": -268.92852783203125, + "logps/rejected": -246.66116333007812, + "loss": 0.018, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0669785588979721, + "rewards/margins": 0.08991777151823044, + "rewards/rejected": -0.15689633786678314, + "step": 13660 + }, + { + "epoch": 0.89, + "learning_rate": 1.679469211669596e-07, + "logits/chosen": -2.283552646636963, + "logits/rejected": -2.1251068115234375, + "logps/chosen": -234.9169158935547, + "logps/rejected": -205.94332885742188, + "loss": 0.02, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04985884949564934, + "rewards/margins": 0.11315397918224335, + "rewards/rejected": -0.16301283240318298, + "step": 13670 + }, + { + "epoch": 0.9, + "learning_rate": 1.6589549933624715e-07, + "logits/chosen": -2.273036479949951, + "logits/rejected": -2.088663101196289, + "logps/chosen": -244.7873077392578, + "logps/rejected": -223.2783203125, + "loss": 0.0129, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.022901328280568123, + "rewards/margins": 0.14988216757774353, + "rewards/rejected": -0.1727834939956665, + "step": 13680 + }, + { + "epoch": 0.9, + "learning_rate": 1.638562533379845e-07, + "logits/chosen": -2.278625726699829, + "logits/rejected": -2.108764886856079, + "logps/chosen": -261.9084167480469, + "logps/rejected": -212.7818603515625, + "loss": 0.0259, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.031029045581817627, + "rewards/margins": 0.06570714712142944, + "rewards/rejected": -0.09673619270324707, + "step": 13690 + }, + { + "epoch": 0.9, + "learning_rate": 1.6182919380987676e-07, + "logits/chosen": -2.323634386062622, + "logits/rejected": -2.239675521850586, + "logps/chosen": -234.4893798828125, + "logps/rejected": -231.24038696289062, + "loss": 0.0223, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.043856892734766006, + "rewards/margins": 0.060640860348939896, + "rewards/rejected": -0.1044977456331253, + "step": 13700 + }, + { + "epoch": 0.9, + "eval_logits/chosen": -2.280345916748047, + "eval_logits/rejected": -2.0922813415527344, + "eval_logps/chosen": -240.45132446289062, + "eval_logps/rejected": -236.97463989257812, + "eval_loss": 0.02395368367433548, + "eval_rewards/accuracies": 0.659500002861023, + "eval_rewards/chosen": -0.042231932282447815, + "eval_rewards/margins": 0.08458175510168076, + "eval_rewards/rejected": -0.12681369483470917, + "eval_runtime": 713.0169, + "eval_samples_per_second": 2.805, + "eval_steps_per_second": 1.402, + "step": 13700 + }, + { + "epoch": 0.9, + "learning_rate": 1.598143313260603e-07, + "logits/chosen": -2.233161687850952, + "logits/rejected": -2.139432907104492, + "logps/chosen": -195.6060333251953, + "logps/rejected": -198.994873046875, + "loss": 0.0342, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03776133805513382, + "rewards/margins": 0.0770169347524643, + "rewards/rejected": -0.11477828025817871, + "step": 13710 + }, + { + "epoch": 0.9, + "learning_rate": 1.5781167639704415e-07, + "logits/chosen": -2.469926357269287, + "logits/rejected": -1.927030324935913, + "logps/chosen": -338.1827697753906, + "logps/rejected": -227.13156127929688, + "loss": 0.0243, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.030080029740929604, + "rewards/margins": 0.06596101820468903, + "rewards/rejected": -0.09604103863239288, + "step": 13720 + }, + { + "epoch": 0.9, + "learning_rate": 1.5582123946965787e-07, + "logits/chosen": -2.1400372982025146, + "logits/rejected": -2.0245399475097656, + "logps/chosen": -241.9009552001953, + "logps/rejected": -268.08343505859375, + "loss": 0.035, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.037529777735471725, + "rewards/margins": 0.08017268031835556, + "rewards/rejected": -0.11770244687795639, + "step": 13730 + }, + { + "epoch": 0.9, + "learning_rate": 1.5384303092699504e-07, + "logits/chosen": -2.3461155891418457, + "logits/rejected": -2.112164258956909, + "logps/chosen": -294.508056640625, + "logps/rejected": -317.8929138183594, + "loss": 0.0107, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.037617024034261703, + "rewards/margins": 0.12533004581928253, + "rewards/rejected": -0.16294705867767334, + "step": 13740 + }, + { + "epoch": 0.9, + "learning_rate": 1.518770610883613e-07, + "logits/chosen": -2.215355396270752, + "logits/rejected": -1.9471629858016968, + "logps/chosen": -228.7562255859375, + "logps/rejected": -224.67568969726562, + "loss": 0.0184, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07108782231807709, + "rewards/margins": 0.12666651606559753, + "rewards/rejected": -0.19775435328483582, + "step": 13750 + }, + { + "epoch": 0.9, + "learning_rate": 1.4992334020921735e-07, + "logits/chosen": -2.228881359100342, + "logits/rejected": -2.126504898071289, + "logps/chosen": -177.85769653320312, + "logps/rejected": -172.0975341796875, + "loss": 0.022, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.02442639134824276, + "rewards/margins": 0.11926829814910889, + "rewards/rejected": -0.1436946839094162, + "step": 13760 + }, + { + "epoch": 0.9, + "learning_rate": 1.4798187848112905e-07, + "logits/chosen": -2.1605820655822754, + "logits/rejected": -2.1533656120300293, + "logps/chosen": -240.3361358642578, + "logps/rejected": -220.5252227783203, + "loss": 0.0203, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08682699501514435, + "rewards/margins": 0.10523271560668945, + "rewards/rejected": -0.1920597106218338, + "step": 13770 + }, + { + "epoch": 0.9, + "learning_rate": 1.460526860317113e-07, + "logits/chosen": -2.348702907562256, + "logits/rejected": -2.2805120944976807, + "logps/chosen": -182.96865844726562, + "logps/rejected": -242.5914306640625, + "loss": 0.0231, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.048420049250125885, + "rewards/margins": 0.1393798291683197, + "rewards/rejected": -0.187799870967865, + "step": 13780 + }, + { + "epoch": 0.9, + "learning_rate": 1.441357729245771e-07, + "logits/chosen": -2.472022533416748, + "logits/rejected": -1.9027433395385742, + "logps/chosen": -261.91046142578125, + "logps/rejected": -224.39181518554688, + "loss": 0.0166, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06959304213523865, + "rewards/margins": 0.09234293550252914, + "rewards/rejected": -0.1619359850883484, + "step": 13790 + }, + { + "epoch": 0.9, + "learning_rate": 1.4223114915928482e-07, + "logits/chosen": -2.1173558235168457, + "logits/rejected": -1.8709170818328857, + "logps/chosen": -236.102294921875, + "logps/rejected": -256.97296142578125, + "loss": 0.0302, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05089854449033737, + "rewards/margins": 0.07051734626293182, + "rewards/rejected": -0.1214158907532692, + "step": 13800 + }, + { + "epoch": 0.9, + "eval_logits/chosen": -2.2779958248138428, + "eval_logits/rejected": -2.0899648666381836, + "eval_logps/chosen": -240.32012939453125, + "eval_logps/rejected": -237.0576629638672, + "eval_loss": 0.023951876908540726, + "eval_rewards/accuracies": 0.6575000286102295, + "eval_rewards/chosen": -0.04157585650682449, + "eval_rewards/margins": 0.08565285801887512, + "eval_rewards/rejected": -0.12722869217395782, + "eval_runtime": 713.4223, + "eval_samples_per_second": 2.803, + "eval_steps_per_second": 1.402, + "step": 13800 + }, + { + "epoch": 0.9, + "learning_rate": 1.403388246712842e-07, + "logits/chosen": -2.1812937259674072, + "logits/rejected": -1.9446184635162354, + "logps/chosen": -176.65940856933594, + "logps/rejected": -180.36642456054688, + "loss": 0.0253, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.062041789293289185, + "rewards/margins": 0.054293982684612274, + "rewards/rejected": -0.11633577197790146, + "step": 13810 + }, + { + "epoch": 0.9, + "learning_rate": 1.3845880933186757e-07, + "logits/chosen": -2.4402499198913574, + "logits/rejected": -2.179344654083252, + "logps/chosen": -248.98678588867188, + "logps/rejected": -221.80526733398438, + "loss": 0.0273, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.05124008655548096, + "rewards/margins": 0.03527190536260605, + "rewards/rejected": -0.08651198446750641, + "step": 13820 + }, + { + "epoch": 0.9, + "learning_rate": 1.3659111294811457e-07, + "logits/chosen": -2.282447099685669, + "logits/rejected": -2.141117572784424, + "logps/chosen": -205.77249145507812, + "logps/rejected": -201.51483154296875, + "loss": 0.0285, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06964652985334396, + "rewards/margins": 0.06819276511669159, + "rewards/rejected": -0.13783928751945496, + "step": 13830 + }, + { + "epoch": 0.91, + "learning_rate": 1.347357452628459e-07, + "logits/chosen": -2.437429666519165, + "logits/rejected": -2.303067207336426, + "logps/chosen": -251.4400634765625, + "logps/rejected": -259.0691223144531, + "loss": 0.0325, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.030712375417351723, + "rewards/margins": 0.07565386593341827, + "rewards/rejected": -0.10636623948812485, + "step": 13840 + }, + { + "epoch": 0.91, + "learning_rate": 1.3289271595456732e-07, + "logits/chosen": -2.1983304023742676, + "logits/rejected": -2.010070323944092, + "logps/chosen": -218.0074005126953, + "logps/rejected": -214.27383422851562, + "loss": 0.0156, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.06839106976985931, + "rewards/margins": 0.11030055582523346, + "rewards/rejected": -0.17869162559509277, + "step": 13850 + }, + { + "epoch": 0.91, + "learning_rate": 1.310620346374228e-07, + "logits/chosen": -2.110769271850586, + "logits/rejected": -1.9821794033050537, + "logps/chosen": -241.5878143310547, + "logps/rejected": -227.92190551757812, + "loss": 0.0163, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.046829283237457275, + "rewards/margins": 0.11568622291088104, + "rewards/rejected": -0.1625155210494995, + "step": 13860 + }, + { + "epoch": 0.91, + "learning_rate": 1.2924371086114274e-07, + "logits/chosen": -2.171700954437256, + "logits/rejected": -1.9256553649902344, + "logps/chosen": -246.22705078125, + "logps/rejected": -256.12603759765625, + "loss": 0.0128, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.050827883183956146, + "rewards/margins": 0.07962900400161743, + "rewards/rejected": -0.13045687973499298, + "step": 13870 + }, + { + "epoch": 0.91, + "learning_rate": 1.274377541109953e-07, + "logits/chosen": -2.12762451171875, + "logits/rejected": -2.201578140258789, + "logps/chosen": -174.74855041503906, + "logps/rejected": -267.5423889160156, + "loss": 0.0203, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05199012905359268, + "rewards/margins": 0.07435055077075958, + "rewards/rejected": -0.12634067237377167, + "step": 13880 + }, + { + "epoch": 0.91, + "learning_rate": 1.2564417380773435e-07, + "logits/chosen": -2.055483102798462, + "logits/rejected": -1.917754888534546, + "logps/chosen": -188.5877227783203, + "logps/rejected": -231.0465545654297, + "loss": 0.0292, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.050453364849090576, + "rewards/margins": 0.09988425672054291, + "rewards/rejected": -0.1503376066684723, + "step": 13890 + }, + { + "epoch": 0.91, + "learning_rate": 1.2386297930755436e-07, + "logits/chosen": -2.2837576866149902, + "logits/rejected": -2.2889015674591064, + "logps/chosen": -264.87750244140625, + "logps/rejected": -278.1776123046875, + "loss": 0.0213, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.08495865762233734, + "rewards/margins": 0.09063265472650528, + "rewards/rejected": -0.17559130489826202, + "step": 13900 + }, + { + "epoch": 0.91, + "eval_logits/chosen": -2.276700258255005, + "eval_logits/rejected": -2.0887749195098877, + "eval_logps/chosen": -240.15423583984375, + "eval_logps/rejected": -236.94261169433594, + "eval_loss": 0.02391652762889862, + "eval_rewards/accuracies": 0.6604999899864197, + "eval_rewards/chosen": -0.040746383368968964, + "eval_rewards/margins": 0.08590715378522873, + "eval_rewards/rejected": -0.1266535371541977, + "eval_runtime": 712.968, + "eval_samples_per_second": 2.805, + "eval_steps_per_second": 1.403, + "step": 13900 + }, + { + "epoch": 0.91, + "learning_rate": 1.220941799020378e-07, + "logits/chosen": -2.0645687580108643, + "logits/rejected": -1.9768037796020508, + "logps/chosen": -226.5297393798828, + "logps/rejected": -222.92886352539062, + "loss": 0.0194, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03724323958158493, + "rewards/margins": 0.09375981986522675, + "rewards/rejected": -0.13100305199623108, + "step": 13910 + }, + { + "epoch": 0.91, + "learning_rate": 1.2033778481810975e-07, + "logits/chosen": -2.334246873855591, + "logits/rejected": -2.0699946880340576, + "logps/chosen": -225.9147186279297, + "logps/rejected": -209.2530975341797, + "loss": 0.0348, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02960597537457943, + "rewards/margins": 0.10342290252447128, + "rewards/rejected": -0.13302887976169586, + "step": 13920 + }, + { + "epoch": 0.91, + "learning_rate": 1.1859380321798591e-07, + "logits/chosen": -2.2709996700286865, + "logits/rejected": -2.342074394226074, + "logps/chosen": -212.2975616455078, + "logps/rejected": -241.0948486328125, + "loss": 0.0208, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.048681434243917465, + "rewards/margins": 0.07026176899671555, + "rewards/rejected": -0.11894319206476212, + "step": 13930 + }, + { + "epoch": 0.91, + "learning_rate": 1.1686224419912989e-07, + "logits/chosen": -2.1778564453125, + "logits/rejected": -1.9656540155410767, + "logps/chosen": -263.8016662597656, + "logps/rejected": -259.34735107421875, + "loss": 0.0151, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.05464515835046768, + "rewards/margins": 0.12418242543935776, + "rewards/rejected": -0.17882758378982544, + "step": 13940 + }, + { + "epoch": 0.91, + "learning_rate": 1.1514311679420104e-07, + "logits/chosen": -1.9906097650527954, + "logits/rejected": -2.0624940395355225, + "logps/chosen": -166.02975463867188, + "logps/rejected": -243.86508178710938, + "loss": 0.0254, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0544060580432415, + "rewards/margins": 0.11007817834615707, + "rewards/rejected": -0.16448423266410828, + "step": 13950 + }, + { + "epoch": 0.91, + "learning_rate": 1.1343642997101029e-07, + "logits/chosen": -2.269257068634033, + "logits/rejected": -2.173360824584961, + "logps/chosen": -209.3166961669922, + "logps/rejected": -215.50198364257812, + "loss": 0.0273, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0348169282078743, + "rewards/margins": 0.09703563153743744, + "rewards/rejected": -0.13185256719589233, + "step": 13960 + }, + { + "epoch": 0.91, + "learning_rate": 1.1174219263247188e-07, + "logits/chosen": -2.010667324066162, + "logits/rejected": -1.900843620300293, + "logps/chosen": -209.64718627929688, + "logps/rejected": -211.0970001220703, + "loss": 0.0188, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.054148148745298386, + "rewards/margins": 0.094904825091362, + "rewards/rejected": -0.1490529626607895, + "step": 13970 + }, + { + "epoch": 0.91, + "learning_rate": 1.1006041361655839e-07, + "logits/chosen": -2.4445457458496094, + "logits/rejected": -1.978326439857483, + "logps/chosen": -219.8248748779297, + "logps/rejected": -193.75340270996094, + "loss": 0.0262, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.05693792551755905, + "rewards/margins": 0.06933034956455231, + "rewards/rejected": -0.12626829743385315, + "step": 13980 + }, + { + "epoch": 0.92, + "learning_rate": 1.0839110169625189e-07, + "logits/chosen": -2.005495548248291, + "logits/rejected": -2.297020435333252, + "logps/chosen": -217.3983917236328, + "logps/rejected": -237.1815643310547, + "loss": 0.0231, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.052780430763959885, + "rewards/margins": 0.1356925070285797, + "rewards/rejected": -0.1884729266166687, + "step": 13990 + }, + { + "epoch": 0.92, + "learning_rate": 1.06734265579502e-07, + "logits/chosen": -2.2928407192230225, + "logits/rejected": -1.987210988998413, + "logps/chosen": -273.75787353515625, + "logps/rejected": -230.35916137695312, + "loss": 0.0221, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.06120569631457329, + "rewards/margins": 0.11553256213665009, + "rewards/rejected": -0.17673827707767487, + "step": 14000 + }, + { + "epoch": 0.92, + "eval_logits/chosen": -2.277212381362915, + "eval_logits/rejected": -2.089181900024414, + "eval_logps/chosen": -240.49688720703125, + "eval_logps/rejected": -237.35060119628906, + "eval_loss": 0.023917585611343384, + "eval_rewards/accuracies": 0.659500002861023, + "eval_rewards/chosen": -0.042459722608327866, + "eval_rewards/margins": 0.08623373508453369, + "eval_rewards/rejected": -0.12869346141815186, + "eval_runtime": 714.107, + "eval_samples_per_second": 2.801, + "eval_steps_per_second": 1.4, + "step": 14000 + }, + { + "epoch": 0.92, + "learning_rate": 1.050899139091771e-07, + "logits/chosen": -2.370375394821167, + "logits/rejected": -2.018911838531494, + "logps/chosen": -289.21685791015625, + "logps/rejected": -260.20703125, + "loss": 0.0266, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05294713377952576, + "rewards/margins": 0.08754151314496994, + "rewards/rejected": -0.1404886543750763, + "step": 14010 + }, + { + "epoch": 0.92, + "learning_rate": 1.0345805526302072e-07, + "logits/chosen": -2.194415807723999, + "logits/rejected": -2.2927064895629883, + "logps/chosen": -210.08584594726562, + "logps/rejected": -221.80447387695312, + "loss": 0.0167, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.044617511332035065, + "rewards/margins": 0.09478165954351425, + "rewards/rejected": -0.1393991857767105, + "step": 14020 + }, + { + "epoch": 0.92, + "learning_rate": 1.0183869815360764e-07, + "logits/chosen": -2.188875913619995, + "logits/rejected": -2.306763172149658, + "logps/chosen": -199.89414978027344, + "logps/rejected": -245.0305938720703, + "loss": 0.0175, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.039903342723846436, + "rewards/margins": 0.06248168274760246, + "rewards/rejected": -0.1023850217461586, + "step": 14030 + }, + { + "epoch": 0.92, + "learning_rate": 1.0023185102829763e-07, + "logits/chosen": -1.9965900182724, + "logits/rejected": -2.2122974395751953, + "logps/chosen": -239.11306762695312, + "logps/rejected": -262.6435546875, + "loss": 0.0188, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0414542555809021, + "rewards/margins": 0.0967288389801979, + "rewards/rejected": -0.1381830871105194, + "step": 14040 + }, + { + "epoch": 0.92, + "learning_rate": 9.863752226919182e-08, + "logits/chosen": -2.1965866088867188, + "logits/rejected": -1.7160733938217163, + "logps/chosen": -244.0505828857422, + "logps/rejected": -205.0543212890625, + "loss": 0.0302, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03159191086888313, + "rewards/margins": 0.13291195034980774, + "rewards/rejected": -0.16450384259223938, + "step": 14050 + }, + { + "epoch": 0.92, + "learning_rate": 9.705572019309107e-08, + "logits/chosen": -2.124981164932251, + "logits/rejected": -2.1576197147369385, + "logps/chosen": -277.95843505859375, + "logps/rejected": -269.6822509765625, + "loss": 0.0196, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.049655042588710785, + "rewards/margins": 0.11603889614343643, + "rewards/rejected": -0.16569393873214722, + "step": 14060 + }, + { + "epoch": 0.92, + "learning_rate": 9.548645305144849e-08, + "logits/chosen": -2.3364295959472656, + "logits/rejected": -2.1991865634918213, + "logps/chosen": -180.5296173095703, + "logps/rejected": -202.05340576171875, + "loss": 0.0254, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.03517225757241249, + "rewards/margins": 0.08775045722723007, + "rewards/rejected": -0.12292270362377167, + "step": 14070 + }, + { + "epoch": 0.92, + "learning_rate": 9.392972903033149e-08, + "logits/chosen": -2.2558159828186035, + "logits/rejected": -2.1181063652038574, + "logps/chosen": -231.4455108642578, + "logps/rejected": -232.4448699951172, + "loss": 0.0158, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04157125949859619, + "rewards/margins": 0.04271433874964714, + "rewards/rejected": -0.08428559452295303, + "step": 14080 + }, + { + "epoch": 0.92, + "learning_rate": 9.238555625037449e-08, + "logits/chosen": -2.2797014713287354, + "logits/rejected": -2.06451153755188, + "logps/chosen": -200.78018188476562, + "logps/rejected": -184.2545928955078, + "loss": 0.0155, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.05035996437072754, + "rewards/margins": 0.06578092277050018, + "rewards/rejected": -0.11614088714122772, + "step": 14090 + }, + { + "epoch": 0.92, + "learning_rate": 9.085394276673903e-08, + "logits/chosen": -2.2840874195098877, + "logits/rejected": -1.9881312847137451, + "logps/chosen": -278.541015625, + "logps/rejected": -282.0232849121094, + "loss": 0.0259, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05187432840466499, + "rewards/margins": 0.08965910971164703, + "rewards/rejected": -0.14153344929218292, + "step": 14100 + }, + { + "epoch": 0.92, + "eval_logits/chosen": -2.2771763801574707, + "eval_logits/rejected": -2.0892348289489746, + "eval_logps/chosen": -240.22537231445312, + "eval_logps/rejected": -236.93740844726562, + "eval_loss": 0.023912038654088974, + "eval_rewards/accuracies": 0.6585000157356262, + "eval_rewards/chosen": -0.04110207408666611, + "eval_rewards/margins": 0.08552539348602295, + "eval_rewards/rejected": -0.12662747502326965, + "eval_runtime": 710.9583, + "eval_samples_per_second": 2.813, + "eval_steps_per_second": 1.407, + "step": 14100 + }, + { + "epoch": 0.92, + "learning_rate": 8.933489656907157e-08, + "logits/chosen": -2.263106107711792, + "logits/rejected": -2.1606152057647705, + "logps/chosen": -227.7388153076172, + "logps/rejected": -264.6552429199219, + "loss": 0.027, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05201687291264534, + "rewards/margins": 0.05988026782870293, + "rewards/rejected": -0.11189714819192886, + "step": 14110 + }, + { + "epoch": 0.92, + "learning_rate": 8.782842558146127e-08, + "logits/chosen": -2.3026959896087646, + "logits/rejected": -2.231685161590576, + "logps/chosen": -166.2057647705078, + "logps/rejected": -186.4104766845703, + "loss": 0.0333, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0216091126203537, + "rewards/margins": 0.10955610126256943, + "rewards/rejected": -0.13116520643234253, + "step": 14120 + }, + { + "epoch": 0.92, + "learning_rate": 8.633453766239836e-08, + "logits/chosen": -2.3711798191070557, + "logits/rejected": -2.1423401832580566, + "logps/chosen": -238.1058349609375, + "logps/rejected": -218.2222442626953, + "loss": 0.0117, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.014923456124961376, + "rewards/margins": 0.05985826998949051, + "rewards/rejected": -0.07478172332048416, + "step": 14130 + }, + { + "epoch": 0.93, + "learning_rate": 8.485324060473448e-08, + "logits/chosen": -2.2005109786987305, + "logits/rejected": -2.0876071453094482, + "logps/chosen": -246.9003448486328, + "logps/rejected": -247.547119140625, + "loss": 0.0137, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03309129923582077, + "rewards/margins": 0.06518785655498505, + "rewards/rejected": -0.09827915579080582, + "step": 14140 + }, + { + "epoch": 0.93, + "learning_rate": 8.338454213564052e-08, + "logits/chosen": -2.246474266052246, + "logits/rejected": -1.9718784093856812, + "logps/chosen": -241.40493774414062, + "logps/rejected": -243.92190551757812, + "loss": 0.0272, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.05253978818655014, + "rewards/margins": 0.09855607897043228, + "rewards/rejected": -0.15109586715698242, + "step": 14150 + }, + { + "epoch": 0.93, + "learning_rate": 8.192844991656679e-08, + "logits/chosen": -2.235311985015869, + "logits/rejected": -1.9973386526107788, + "logps/chosen": -249.0086669921875, + "logps/rejected": -228.0577392578125, + "loss": 0.0238, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0514599084854126, + "rewards/margins": 0.07728545367717743, + "rewards/rejected": -0.12874536216259003, + "step": 14160 + }, + { + "epoch": 0.93, + "learning_rate": 8.048497154320434e-08, + "logits/chosen": -2.2661285400390625, + "logits/rejected": -2.312032699584961, + "logps/chosen": -142.10989379882812, + "logps/rejected": -165.83572387695312, + "loss": 0.0229, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.06280183792114258, + "rewards/margins": 0.07314186543226242, + "rewards/rejected": -0.1359437108039856, + "step": 14170 + }, + { + "epoch": 0.93, + "learning_rate": 7.905411454544265e-08, + "logits/chosen": -2.281566619873047, + "logits/rejected": -2.133256196975708, + "logps/chosen": -247.7585906982422, + "logps/rejected": -265.89141845703125, + "loss": 0.0337, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05627186968922615, + "rewards/margins": 0.06695263832807541, + "rewards/rejected": -0.12322449684143066, + "step": 14180 + }, + { + "epoch": 0.93, + "learning_rate": 7.763588638733332e-08, + "logits/chosen": -2.294487714767456, + "logits/rejected": -2.249011516571045, + "logps/chosen": -269.75885009765625, + "logps/rejected": -266.6964416503906, + "loss": 0.0229, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03669353947043419, + "rewards/margins": 0.08380848169326782, + "rewards/rejected": -0.12050201743841171, + "step": 14190 + }, + { + "epoch": 0.93, + "learning_rate": 7.623029446704899e-08, + "logits/chosen": -2.154623508453369, + "logits/rejected": -2.328273296356201, + "logps/chosen": -317.51373291015625, + "logps/rejected": -303.58099365234375, + "loss": 0.0156, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.03036128357052803, + "rewards/margins": 0.114809550344944, + "rewards/rejected": -0.14517082273960114, + "step": 14200 + }, + { + "epoch": 0.93, + "eval_logits/chosen": -2.2770776748657227, + "eval_logits/rejected": -2.089095115661621, + "eval_logps/chosen": -240.37933349609375, + "eval_logps/rejected": -237.17074584960938, + "eval_loss": 0.023934001103043556, + "eval_rewards/accuracies": 0.6614999771118164, + "eval_rewards/chosen": -0.04187189042568207, + "eval_rewards/margins": 0.08592244237661362, + "eval_rewards/rejected": -0.1277943253517151, + "eval_runtime": 715.6797, + "eval_samples_per_second": 2.795, + "eval_steps_per_second": 1.397, + "step": 14200 + }, + { + "epoch": 0.93, + "learning_rate": 7.483734611684557e-08, + "logits/chosen": -2.0783333778381348, + "logits/rejected": -1.9120200872421265, + "logps/chosen": -264.9306640625, + "logps/rejected": -227.62576293945312, + "loss": 0.0429, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04022422060370445, + "rewards/margins": 0.08428023010492325, + "rewards/rejected": -0.12450442463159561, + "step": 14210 + }, + { + "epoch": 0.93, + "learning_rate": 7.345704860302366e-08, + "logits/chosen": -2.3603944778442383, + "logits/rejected": -2.3407979011535645, + "logps/chosen": -257.5492858886719, + "logps/rejected": -275.9205322265625, + "loss": 0.0135, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.04648297280073166, + "rewards/margins": 0.09524835646152496, + "rewards/rejected": -0.14173133671283722, + "step": 14220 + }, + { + "epoch": 0.93, + "learning_rate": 7.208940912589224e-08, + "logits/chosen": -2.283950090408325, + "logits/rejected": -1.9505077600479126, + "logps/chosen": -223.5438995361328, + "logps/rejected": -212.1008758544922, + "loss": 0.0269, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.06978797167539597, + "rewards/margins": 0.12967699766159058, + "rewards/rejected": -0.19946496188640594, + "step": 14230 + }, + { + "epoch": 0.93, + "learning_rate": 7.073443481972753e-08, + "logits/chosen": -2.0981688499450684, + "logits/rejected": -2.0736212730407715, + "logps/chosen": -194.62655639648438, + "logps/rejected": -231.03488159179688, + "loss": 0.01, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.052353955805301666, + "rewards/margins": 0.08453324437141418, + "rewards/rejected": -0.13688719272613525, + "step": 14240 + }, + { + "epoch": 0.93, + "learning_rate": 6.939213275274027e-08, + "logits/chosen": -2.2323665618896484, + "logits/rejected": -2.1960151195526123, + "logps/chosen": -250.4298858642578, + "logps/rejected": -243.5245819091797, + "loss": 0.0232, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05588046833872795, + "rewards/margins": 0.06348704546689987, + "rewards/rejected": -0.11936751753091812, + "step": 14250 + }, + { + "epoch": 0.93, + "learning_rate": 6.806250992703461e-08, + "logits/chosen": -2.259444236755371, + "logits/rejected": -2.1075503826141357, + "logps/chosen": -224.3917694091797, + "logps/rejected": -212.67379760742188, + "loss": 0.026, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0384858064353466, + "rewards/margins": 0.07246101647615433, + "rewards/rejected": -0.11094681918621063, + "step": 14260 + }, + { + "epoch": 0.93, + "learning_rate": 6.674557327857572e-08, + "logits/chosen": -2.253323793411255, + "logits/rejected": -2.262890100479126, + "logps/chosen": -261.03839111328125, + "logps/rejected": -276.0207214355469, + "loss": 0.0235, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.033838655799627304, + "rewards/margins": 0.1203516274690628, + "rewards/rejected": -0.1541902720928192, + "step": 14270 + }, + { + "epoch": 0.93, + "learning_rate": 6.544132967714917e-08, + "logits/chosen": -2.0000040531158447, + "logits/rejected": -2.0072243213653564, + "logps/chosen": -259.1932678222656, + "logps/rejected": -270.4208068847656, + "loss": 0.0185, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.061666447669267654, + "rewards/margins": 0.12469998747110367, + "rewards/rejected": -0.18636643886566162, + "step": 14280 + }, + { + "epoch": 0.93, + "learning_rate": 6.414978592632932e-08, + "logits/chosen": -2.3282947540283203, + "logits/rejected": -1.893397331237793, + "logps/chosen": -271.5886535644531, + "logps/rejected": -242.21109008789062, + "loss": 0.0127, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.052446603775024414, + "rewards/margins": 0.08538929373025894, + "rewards/rejected": -0.13783589005470276, + "step": 14290 + }, + { + "epoch": 0.94, + "learning_rate": 6.287094876344046e-08, + "logits/chosen": -2.2830162048339844, + "logits/rejected": -2.3124501705169678, + "logps/chosen": -175.96054077148438, + "logps/rejected": -198.17027282714844, + "loss": 0.0158, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.017701296135783195, + "rewards/margins": 0.07785354554653168, + "rewards/rejected": -0.09555485099554062, + "step": 14300 + }, + { + "epoch": 0.94, + "eval_logits/chosen": -2.2765443325042725, + "eval_logits/rejected": -2.0886929035186768, + "eval_logps/chosen": -240.28904724121094, + "eval_logps/rejected": -237.00119018554688, + "eval_loss": 0.023930862545967102, + "eval_rewards/accuracies": 0.6600000262260437, + "eval_rewards/chosen": -0.04142040014266968, + "eval_rewards/margins": 0.0855260118842125, + "eval_rewards/rejected": -0.12694638967514038, + "eval_runtime": 713.3157, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 1.402, + "step": 14300 + }, + { + "epoch": 0.94, + "learning_rate": 6.160482485952413e-08, + "logits/chosen": -2.423222303390503, + "logits/rejected": -2.1630914211273193, + "logps/chosen": -248.218994140625, + "logps/rejected": -230.79067993164062, + "loss": 0.025, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05884624272584915, + "rewards/margins": 0.07183768600225449, + "rewards/rejected": -0.13068392872810364, + "step": 14310 + }, + { + "epoch": 0.94, + "learning_rate": 6.035142081930234e-08, + "logits/chosen": -2.288935661315918, + "logits/rejected": -1.9192240238189697, + "logps/chosen": -274.0880432128906, + "logps/rejected": -213.8658447265625, + "loss": 0.0178, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05645836517214775, + "rewards/margins": 0.08397369086742401, + "rewards/rejected": -0.14043205976486206, + "step": 14320 + }, + { + "epoch": 0.94, + "learning_rate": 5.911074318114496e-08, + "logits/chosen": -2.117149591445923, + "logits/rejected": -2.226193428039551, + "logps/chosen": -212.69583129882812, + "logps/rejected": -271.21246337890625, + "loss": 0.0138, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04553840309381485, + "rewards/margins": 0.0819590762257576, + "rewards/rejected": -0.12749749422073364, + "step": 14330 + }, + { + "epoch": 0.94, + "learning_rate": 5.788279841703381e-08, + "logits/chosen": -2.3232810497283936, + "logits/rejected": -2.0692296028137207, + "logps/chosen": -191.612060546875, + "logps/rejected": -203.80642700195312, + "loss": 0.024, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03851230815052986, + "rewards/margins": 0.09930738061666489, + "rewards/rejected": -0.13781967759132385, + "step": 14340 + }, + { + "epoch": 0.94, + "learning_rate": 5.66675929325311e-08, + "logits/chosen": -2.321681499481201, + "logits/rejected": -2.07851505279541, + "logps/chosen": -229.58871459960938, + "logps/rejected": -227.32705688476562, + "loss": 0.0155, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04462137073278427, + "rewards/margins": 0.04543551802635193, + "rewards/rejected": -0.0900568813085556, + "step": 14350 + }, + { + "epoch": 0.94, + "learning_rate": 5.546513306674301e-08, + "logits/chosen": -2.2515957355499268, + "logits/rejected": -1.8618234395980835, + "logps/chosen": -287.59844970703125, + "logps/rejected": -232.39102172851562, + "loss": 0.0239, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.045409008860588074, + "rewards/margins": 0.09937725961208344, + "rewards/rejected": -0.1447862684726715, + "step": 14360 + }, + { + "epoch": 0.94, + "learning_rate": 5.4275425092290004e-08, + "logits/chosen": -2.382500410079956, + "logits/rejected": -2.3284032344818115, + "logps/chosen": -260.4015197753906, + "logps/rejected": -260.8076171875, + "loss": 0.0243, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.02802688255906105, + "rewards/margins": 0.09424374997615814, + "rewards/rejected": -0.1222706288099289, + "step": 14370 + }, + { + "epoch": 0.94, + "learning_rate": 5.309847521527078e-08, + "logits/chosen": -2.2202110290527344, + "logits/rejected": -1.856729507446289, + "logps/chosen": -293.5809631347656, + "logps/rejected": -266.0841369628906, + "loss": 0.0229, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04503798484802246, + "rewards/margins": 0.07715752720832825, + "rewards/rejected": -0.12219550460577011, + "step": 14380 + }, + { + "epoch": 0.94, + "learning_rate": 5.1934289575233385e-08, + "logits/chosen": -2.094172954559326, + "logits/rejected": -1.7569259405136108, + "logps/chosen": -250.232421875, + "logps/rejected": -235.7849578857422, + "loss": 0.0272, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05351518467068672, + "rewards/margins": 0.09959940612316132, + "rewards/rejected": -0.15311458706855774, + "step": 14390 + }, + { + "epoch": 0.94, + "learning_rate": 5.078287424513994e-08, + "logits/chosen": -2.34496808052063, + "logits/rejected": -2.232339382171631, + "logps/chosen": -280.80596923828125, + "logps/rejected": -227.66921997070312, + "loss": 0.0216, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05379108712077141, + "rewards/margins": 0.10846780240535736, + "rewards/rejected": -0.16225889325141907, + "step": 14400 + }, + { + "epoch": 0.94, + "eval_logits/chosen": -2.2774136066436768, + "eval_logits/rejected": -2.0895285606384277, + "eval_logps/chosen": -240.2555694580078, + "eval_logps/rejected": -236.98165893554688, + "eval_loss": 0.02393474243581295, + "eval_rewards/accuracies": 0.6620000004768372, + "eval_rewards/chosen": -0.04125319421291351, + "eval_rewards/margins": 0.08559557795524597, + "eval_rewards/rejected": -0.12684877216815948, + "eval_runtime": 713.2387, + "eval_samples_per_second": 2.804, + "eval_steps_per_second": 1.402, + "step": 14400 + }, + { + "epoch": 0.94, + "learning_rate": 4.964423523133671e-08, + "logits/chosen": -2.353637456893921, + "logits/rejected": -2.108694314956665, + "logps/chosen": -223.6641082763672, + "logps/rejected": -203.7133026123047, + "loss": 0.0262, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.02966316044330597, + "rewards/margins": 0.07199189066886902, + "rewards/rejected": -0.10165505111217499, + "step": 14410 + }, + { + "epoch": 0.94, + "learning_rate": 4.8518378473522976e-08, + "logits/chosen": -2.210888624191284, + "logits/rejected": -2.0540719032287598, + "logps/chosen": -260.373291015625, + "logps/rejected": -270.9510192871094, + "loss": 0.0279, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.049317214637994766, + "rewards/margins": 0.08652471750974655, + "rewards/rejected": -0.13584193587303162, + "step": 14420 + }, + { + "epoch": 0.94, + "learning_rate": 4.7405309844718584e-08, + "logits/chosen": -2.1370275020599365, + "logits/rejected": -2.032277822494507, + "logps/chosen": -204.3740692138672, + "logps/rejected": -238.6953582763672, + "loss": 0.0243, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.059375204145908356, + "rewards/margins": 0.12562525272369385, + "rewards/rejected": -0.1850004643201828, + "step": 14430 + }, + { + "epoch": 0.94, + "learning_rate": 4.630503515123508e-08, + "logits/chosen": -2.3820743560791016, + "logits/rejected": -2.08414888381958, + "logps/chosen": -211.52529907226562, + "logps/rejected": -179.07192993164062, + "loss": 0.0283, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0532965362071991, + "rewards/margins": 0.08706490695476532, + "rewards/rejected": -0.14036144316196442, + "step": 14440 + }, + { + "epoch": 0.95, + "learning_rate": 4.5217560132644056e-08, + "logits/chosen": -2.2008249759674072, + "logits/rejected": -2.1269989013671875, + "logps/chosen": -159.01589965820312, + "logps/rejected": -192.65785217285156, + "loss": 0.0564, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.041981421411037445, + "rewards/margins": 0.07620520889759064, + "rewards/rejected": -0.11818663775920868, + "step": 14450 + }, + { + "epoch": 0.95, + "learning_rate": 4.41428904617483e-08, + "logits/chosen": -2.236302614212036, + "logits/rejected": -2.231376886367798, + "logps/chosen": -187.8321533203125, + "logps/rejected": -207.3106689453125, + "loss": 0.0367, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.05361725017428398, + "rewards/margins": 0.07409012317657471, + "rewards/rejected": -0.12770739197731018, + "step": 14460 + }, + { + "epoch": 0.95, + "learning_rate": 4.3081031744550696e-08, + "logits/chosen": -2.3412978649139404, + "logits/rejected": -2.2665255069732666, + "logps/chosen": -258.1219482421875, + "logps/rejected": -254.7415313720703, + "loss": 0.0316, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.024883154779672623, + "rewards/margins": 0.09473639726638794, + "rewards/rejected": -0.11961954832077026, + "step": 14470 + }, + { + "epoch": 0.95, + "learning_rate": 4.2031989520227025e-08, + "logits/chosen": -2.316408634185791, + "logits/rejected": -2.113938808441162, + "logps/chosen": -235.3325653076172, + "logps/rejected": -227.225830078125, + "loss": 0.0082, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03509070724248886, + "rewards/margins": 0.0689038410782814, + "rewards/rejected": -0.10399456322193146, + "step": 14480 + }, + { + "epoch": 0.95, + "learning_rate": 4.099576926109461e-08, + "logits/chosen": -2.3996028900146484, + "logits/rejected": -1.8967199325561523, + "logps/chosen": -246.0100555419922, + "logps/rejected": -184.26046752929688, + "loss": 0.0251, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03114110231399536, + "rewards/margins": 0.0870242789387703, + "rewards/rejected": -0.11816537380218506, + "step": 14490 + }, + { + "epoch": 0.95, + "learning_rate": 3.997237637258705e-08, + "logits/chosen": -2.230151891708374, + "logits/rejected": -2.2772998809814453, + "logps/chosen": -321.3664855957031, + "logps/rejected": -295.66204833984375, + "loss": 0.0126, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.020565873011946678, + "rewards/margins": 0.08819369971752167, + "rewards/rejected": -0.10875958204269409, + "step": 14500 + }, + { + "epoch": 0.95, + "eval_logits/chosen": -2.277437925338745, + "eval_logits/rejected": -2.0894877910614014, + "eval_logps/chosen": -240.2699432373047, + "eval_logps/rejected": -237.0005340576172, + "eval_loss": 0.023945190012454987, + "eval_rewards/accuracies": 0.6604999899864197, + "eval_rewards/chosen": -0.04132496565580368, + "eval_rewards/margins": 0.08561818301677704, + "eval_rewards/rejected": -0.12694314122200012, + "eval_runtime": 712.8613, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 14500 + }, + { + "epoch": 0.95, + "learning_rate": 3.8961816193222035e-08, + "logits/chosen": -2.370445966720581, + "logits/rejected": -2.1521902084350586, + "logps/chosen": -251.1434326171875, + "logps/rejected": -203.28396606445312, + "loss": 0.0278, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.07646572589874268, + "rewards/margins": 0.04736895114183426, + "rewards/rejected": -0.12383468449115753, + "step": 14510 + }, + { + "epoch": 0.95, + "learning_rate": 3.79640939945769e-08, + "logits/chosen": -2.332946300506592, + "logits/rejected": -2.2011606693267822, + "logps/chosen": -288.9212341308594, + "logps/rejected": -211.8104248046875, + "loss": 0.0074, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.013501740992069244, + "rewards/margins": 0.04576627165079117, + "rewards/rejected": -0.05926801636815071, + "step": 14520 + }, + { + "epoch": 0.95, + "learning_rate": 3.697921498125895e-08, + "logits/chosen": -2.0662643909454346, + "logits/rejected": -2.1631338596343994, + "logps/chosen": -224.1737060546875, + "logps/rejected": -247.8341522216797, + "loss": 0.0171, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.07110340148210526, + "rewards/margins": 0.08890338987112045, + "rewards/rejected": -0.1600067913532257, + "step": 14530 + }, + { + "epoch": 0.95, + "learning_rate": 3.6007184290880456e-08, + "logits/chosen": -2.2737791538238525, + "logits/rejected": -2.1759960651397705, + "logps/chosen": -224.28488159179688, + "logps/rejected": -222.465576171875, + "loss": 0.0443, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.061559468507766724, + "rewards/margins": 0.07422170042991638, + "rewards/rejected": -0.1357811540365219, + "step": 14540 + }, + { + "epoch": 0.95, + "learning_rate": 3.504800699402872e-08, + "logits/chosen": -2.5146634578704834, + "logits/rejected": -2.2180233001708984, + "logps/chosen": -351.6441345214844, + "logps/rejected": -287.64080810546875, + "loss": 0.0109, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029949486255645752, + "rewards/margins": 0.04681790992617607, + "rewards/rejected": -0.07676739990711212, + "step": 14550 + }, + { + "epoch": 0.95, + "learning_rate": 3.4101688094242967e-08, + "logits/chosen": -2.218144178390503, + "logits/rejected": -2.109200954437256, + "logps/chosen": -285.6780090332031, + "logps/rejected": -283.11676025390625, + "loss": 0.0429, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.08058507740497589, + "rewards/margins": 0.11307475715875626, + "rewards/rejected": -0.19365984201431274, + "step": 14560 + }, + { + "epoch": 0.95, + "learning_rate": 3.3168232527985564e-08, + "logits/chosen": -2.1541683673858643, + "logits/rejected": -1.8619887828826904, + "logps/chosen": -252.0712432861328, + "logps/rejected": -214.01773071289062, + "loss": 0.0285, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05226341634988785, + "rewards/margins": 0.08348184078931808, + "rewards/rejected": -0.13574525713920593, + "step": 14570 + }, + { + "epoch": 0.95, + "learning_rate": 3.224764516461892e-08, + "logits/chosen": -2.2693839073181152, + "logits/rejected": -2.088949680328369, + "logps/chosen": -259.0687561035156, + "logps/rejected": -251.11636352539062, + "loss": 0.0169, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.02034943364560604, + "rewards/margins": 0.11398156732320786, + "rewards/rejected": -0.13433101773262024, + "step": 14580 + }, + { + "epoch": 0.95, + "learning_rate": 3.133993080637665e-08, + "logits/chosen": -2.2507710456848145, + "logits/rejected": -2.0576610565185547, + "logps/chosen": -212.4413299560547, + "logps/rejected": -220.51016235351562, + "loss": 0.022, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.047802168875932693, + "rewards/margins": 0.10239864885807037, + "rewards/rejected": -0.15020082890987396, + "step": 14590 + }, + { + "epoch": 0.96, + "learning_rate": 3.0445094188342186e-08, + "logits/chosen": -2.0587873458862305, + "logits/rejected": -1.7677072286605835, + "logps/chosen": -262.41070556640625, + "logps/rejected": -200.29734802246094, + "loss": 0.0346, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04467066004872322, + "rewards/margins": 0.09075422585010529, + "rewards/rejected": -0.1354248821735382, + "step": 14600 + }, + { + "epoch": 0.96, + "eval_logits/chosen": -2.278109312057495, + "eval_logits/rejected": -2.0901198387145996, + "eval_logps/chosen": -240.32408142089844, + "eval_logps/rejected": -236.98968505859375, + "eval_loss": 0.02390436641871929, + "eval_rewards/accuracies": 0.6589999794960022, + "eval_rewards/chosen": -0.04159563407301903, + "eval_rewards/margins": 0.08529327809810638, + "eval_rewards/rejected": -0.1268889158964157, + "eval_runtime": 711.7023, + "eval_samples_per_second": 2.81, + "eval_steps_per_second": 1.405, + "step": 14600 + }, + { + "epoch": 0.96, + "learning_rate": 2.9563139978421028e-08, + "logits/chosen": -2.1933608055114746, + "logits/rejected": -2.2148003578186035, + "logps/chosen": -232.57373046875, + "logps/rejected": -236.6164093017578, + "loss": 0.0102, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.033509157598018646, + "rewards/margins": 0.059135984629392624, + "rewards/rejected": -0.09264513850212097, + "step": 14610 + }, + { + "epoch": 0.96, + "learning_rate": 2.869407277731939e-08, + "logits/chosen": -2.1482250690460205, + "logits/rejected": -2.0788064002990723, + "logps/chosen": -192.88778686523438, + "logps/rejected": -185.89356994628906, + "loss": 0.0167, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.038153309375047684, + "rewards/margins": 0.08087687939405441, + "rewards/rejected": -0.11903019994497299, + "step": 14620 + }, + { + "epoch": 0.96, + "learning_rate": 2.783789711851642e-08, + "logits/chosen": -2.28196120262146, + "logits/rejected": -2.0531458854675293, + "logps/chosen": -167.2568359375, + "logps/rejected": -168.91690063476562, + "loss": 0.0185, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.027029190212488174, + "rewards/margins": 0.12046368420124054, + "rewards/rejected": -0.14749285578727722, + "step": 14630 + }, + { + "epoch": 0.96, + "learning_rate": 2.6994617468244778e-08, + "logits/chosen": -2.3830745220184326, + "logits/rejected": -1.9102243185043335, + "logps/chosen": -220.4351348876953, + "logps/rejected": -178.19236755371094, + "loss": 0.0223, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03707236796617508, + "rewards/margins": 0.10837974399328232, + "rewards/rejected": -0.1454521119594574, + "step": 14640 + }, + { + "epoch": 0.96, + "learning_rate": 2.6164238225463155e-08, + "logits/chosen": -2.2158422470092773, + "logits/rejected": -1.8621547222137451, + "logps/chosen": -291.20098876953125, + "logps/rejected": -232.3519744873047, + "loss": 0.0387, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04386935755610466, + "rewards/margins": 0.09829467535018921, + "rewards/rejected": -0.14216403663158417, + "step": 14650 + }, + { + "epoch": 0.96, + "learning_rate": 2.534676372183742e-08, + "logits/chosen": -2.2373509407043457, + "logits/rejected": -2.0849971771240234, + "logps/chosen": -293.45819091796875, + "logps/rejected": -254.5859832763672, + "loss": 0.0315, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.047519437968730927, + "rewards/margins": 0.06727181375026703, + "rewards/rejected": -0.11479125171899796, + "step": 14660 + }, + { + "epoch": 0.96, + "learning_rate": 2.4542198221714218e-08, + "logits/chosen": -2.1163833141326904, + "logits/rejected": -1.8461744785308838, + "logps/chosen": -146.51657104492188, + "logps/rejected": -166.13450622558594, + "loss": 0.025, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.042048048228025436, + "rewards/margins": 0.09695516526699066, + "rewards/rejected": -0.1390032321214676, + "step": 14670 + }, + { + "epoch": 0.96, + "learning_rate": 2.3750545922101854e-08, + "logits/chosen": -2.5506093502044678, + "logits/rejected": -2.1625025272369385, + "logps/chosen": -318.39263916015625, + "logps/rejected": -267.0853576660156, + "loss": 0.0337, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.045007385313510895, + "rewards/margins": 0.08543379604816437, + "rewards/rejected": -0.13044118881225586, + "step": 14680 + }, + { + "epoch": 0.96, + "learning_rate": 2.2971810952646112e-08, + "logits/chosen": -2.264488458633423, + "logits/rejected": -2.1718125343322754, + "logps/chosen": -268.1809997558594, + "logps/rejected": -230.80789184570312, + "loss": 0.0309, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.04909127205610275, + "rewards/margins": 0.051663417369127274, + "rewards/rejected": -0.10075469315052032, + "step": 14690 + }, + { + "epoch": 0.96, + "learning_rate": 2.2205997375610576e-08, + "logits/chosen": -2.094313383102417, + "logits/rejected": -2.0372653007507324, + "logps/chosen": -189.8638458251953, + "logps/rejected": -215.89511108398438, + "loss": 0.0225, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.021789545193314552, + "rewards/margins": 0.1043560728430748, + "rewards/rejected": -0.126145601272583, + "step": 14700 + }, + { + "epoch": 0.96, + "eval_logits/chosen": -2.2773990631103516, + "eval_logits/rejected": -2.089484453201294, + "eval_logps/chosen": -240.30162048339844, + "eval_logps/rejected": -236.9473114013672, + "eval_loss": 0.023914767429232597, + "eval_rewards/accuracies": 0.6604999899864197, + "eval_rewards/chosen": -0.041483353823423386, + "eval_rewards/margins": 0.08519367128610611, + "eval_rewards/rejected": -0.126677006483078, + "eval_runtime": 714.1783, + "eval_samples_per_second": 2.8, + "eval_steps_per_second": 1.4, + "step": 14700 + }, + { + "epoch": 0.96, + "learning_rate": 2.1453109185853304e-08, + "logits/chosen": -2.3149261474609375, + "logits/rejected": -2.2594079971313477, + "logps/chosen": -207.83242797851562, + "logps/rejected": -227.0337371826172, + "loss": 0.0423, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.02876967191696167, + "rewards/margins": 0.08727029711008072, + "rewards/rejected": -0.11603996902704239, + "step": 14710 + }, + { + "epoch": 0.96, + "learning_rate": 2.0713150310808784e-08, + "logits/chosen": -2.0834505558013916, + "logits/rejected": -2.402132272720337, + "logps/chosen": -238.4170684814453, + "logps/rejected": -250.8238067626953, + "loss": 0.0205, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06464622169733047, + "rewards/margins": 0.0332169234752655, + "rewards/rejected": -0.09786313772201538, + "step": 14720 + }, + { + "epoch": 0.96, + "learning_rate": 1.9986124610464064e-08, + "logits/chosen": -2.1681363582611084, + "logits/rejected": -1.8642688989639282, + "logps/chosen": -296.47052001953125, + "logps/rejected": -253.28335571289062, + "loss": 0.0169, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.046201933175325394, + "rewards/margins": 0.13312575221061707, + "rewards/rejected": -0.17932769656181335, + "step": 14730 + }, + { + "epoch": 0.96, + "learning_rate": 1.927203587734211e-08, + "logits/chosen": -2.1540091037750244, + "logits/rejected": -1.766482949256897, + "logps/chosen": -259.76141357421875, + "logps/rejected": -228.9178924560547, + "loss": 0.0347, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03384867683053017, + "rewards/margins": 0.08841460943222046, + "rewards/rejected": -0.12226328998804092, + "step": 14740 + }, + { + "epoch": 0.97, + "learning_rate": 1.8570887836479034e-08, + "logits/chosen": -2.269257068634033, + "logits/rejected": -2.0660524368286133, + "logps/chosen": -206.35147094726562, + "logps/rejected": -273.08447265625, + "loss": 0.0287, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06364154815673828, + "rewards/margins": 0.06958835572004318, + "rewards/rejected": -0.13322989642620087, + "step": 14750 + }, + { + "epoch": 0.97, + "learning_rate": 1.7882684145406616e-08, + "logits/chosen": -2.339402198791504, + "logits/rejected": -2.27685284614563, + "logps/chosen": -301.135009765625, + "logps/rejected": -314.385986328125, + "loss": 0.0238, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.023571869358420372, + "rewards/margins": 0.07569929957389832, + "rewards/rejected": -0.09927116334438324, + "step": 14760 + }, + { + "epoch": 0.97, + "learning_rate": 1.7207428394132865e-08, + "logits/chosen": -2.4952762126922607, + "logits/rejected": -2.0110652446746826, + "logps/chosen": -277.0386962890625, + "logps/rejected": -247.00454711914062, + "loss": 0.0123, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03629208356142044, + "rewards/margins": 0.12326530367136002, + "rewards/rejected": -0.15955738723278046, + "step": 14770 + }, + { + "epoch": 0.97, + "learning_rate": 1.654512410512177e-08, + "logits/chosen": -2.2198612689971924, + "logits/rejected": -1.9765468835830688, + "logps/chosen": -262.82794189453125, + "logps/rejected": -211.95394897460938, + "loss": 0.0392, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04625507816672325, + "rewards/margins": 0.05757225677371025, + "rewards/rejected": -0.1038273423910141, + "step": 14780 + }, + { + "epoch": 0.97, + "learning_rate": 1.5895774733277468e-08, + "logits/chosen": -2.2779715061187744, + "logits/rejected": -1.9392287731170654, + "logps/chosen": -285.97137451171875, + "logps/rejected": -254.766845703125, + "loss": 0.0127, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.030772119760513306, + "rewards/margins": 0.09515996277332306, + "rewards/rejected": -0.12593206763267517, + "step": 14790 + }, + { + "epoch": 0.97, + "learning_rate": 1.5259383665924e-08, + "logits/chosen": -2.547119140625, + "logits/rejected": -2.155965805053711, + "logps/chosen": -345.37567138671875, + "logps/rejected": -267.6927490234375, + "loss": 0.0099, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.016418835148215294, + "rewards/margins": 0.08162996172904968, + "rewards/rejected": -0.09804878383874893, + "step": 14800 + }, + { + "epoch": 0.97, + "eval_logits/chosen": -2.2770659923553467, + "eval_logits/rejected": -2.089148759841919, + "eval_logps/chosen": -240.30921936035156, + "eval_logps/rejected": -236.97500610351562, + "eval_loss": 0.023923002183437347, + "eval_rewards/accuracies": 0.659500002861023, + "eval_rewards/chosen": -0.041521333158016205, + "eval_rewards/margins": 0.0852942168712616, + "eval_rewards/rejected": -0.1268155574798584, + "eval_runtime": 712.7523, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 14800 + }, + { + "epoch": 0.97, + "learning_rate": 1.4635954222789461e-08, + "logits/chosen": -2.228029251098633, + "logits/rejected": -2.14042592048645, + "logps/chosen": -222.356201171875, + "logps/rejected": -244.2379608154297, + "loss": 0.0156, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.026243016123771667, + "rewards/margins": 0.0838693231344223, + "rewards/rejected": -0.11011233180761337, + "step": 14810 + }, + { + "epoch": 0.97, + "learning_rate": 1.402548965598688e-08, + "logits/chosen": -2.1659653186798096, + "logits/rejected": -2.252767562866211, + "logps/chosen": -214.77322387695312, + "logps/rejected": -222.8150634765625, + "loss": 0.0124, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05072982236742973, + "rewards/margins": 0.056291453540325165, + "rewards/rejected": -0.1070212721824646, + "step": 14820 + }, + { + "epoch": 0.97, + "learning_rate": 1.3427993149998375e-08, + "logits/chosen": -2.4160420894622803, + "logits/rejected": -2.1670732498168945, + "logps/chosen": -248.33438110351562, + "logps/rejected": -213.2601776123047, + "loss": 0.0298, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.031486038118600845, + "rewards/margins": 0.10147968679666519, + "rewards/rejected": -0.13296571373939514, + "step": 14830 + }, + { + "epoch": 0.97, + "learning_rate": 1.2843467821658518e-08, + "logits/chosen": -2.3670287132263184, + "logits/rejected": -2.2997143268585205, + "logps/chosen": -229.97329711914062, + "logps/rejected": -248.75820922851562, + "loss": 0.017, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.018752047792077065, + "rewards/margins": 0.09576685726642609, + "rewards/rejected": -0.114518903195858, + "step": 14840 + }, + { + "epoch": 0.97, + "learning_rate": 1.2271916720137666e-08, + "logits/chosen": -2.486077308654785, + "logits/rejected": -2.167175531387329, + "logps/chosen": -296.70526123046875, + "logps/rejected": -261.83966064453125, + "loss": 0.0365, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.054678551852703094, + "rewards/margins": 0.05634712427854538, + "rewards/rejected": -0.11102566868066788, + "step": 14850 + }, + { + "epoch": 0.97, + "learning_rate": 1.171334282692671e-08, + "logits/chosen": -2.3231124877929688, + "logits/rejected": -2.24255108833313, + "logps/chosen": -291.93927001953125, + "logps/rejected": -279.7203674316406, + "loss": 0.0141, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04670420661568642, + "rewards/margins": 0.09206493943929672, + "rewards/rejected": -0.13876914978027344, + "step": 14860 + }, + { + "epoch": 0.97, + "learning_rate": 1.116774905582041e-08, + "logits/chosen": -2.3719723224639893, + "logits/rejected": -2.0442380905151367, + "logps/chosen": -192.50331115722656, + "logps/rejected": -192.2175750732422, + "loss": 0.0125, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.03842391073703766, + "rewards/margins": 0.06208761781454086, + "rewards/rejected": -0.10051152855157852, + "step": 14870 + }, + { + "epoch": 0.97, + "learning_rate": 1.0635138252902966e-08, + "logits/chosen": -2.4592127799987793, + "logits/rejected": -2.2050700187683105, + "logps/chosen": -238.54061889648438, + "logps/rejected": -235.62460327148438, + "loss": 0.0263, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.040633201599121094, + "rewards/margins": 0.09606233984231949, + "rewards/rejected": -0.13669554889202118, + "step": 14880 + }, + { + "epoch": 0.97, + "learning_rate": 1.0115513196533589e-08, + "logits/chosen": -2.2937681674957275, + "logits/rejected": -2.1239376068115234, + "logps/chosen": -268.3608093261719, + "logps/rejected": -265.96392822265625, + "loss": 0.0121, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02875324711203575, + "rewards/margins": 0.07455660402774811, + "rewards/rejected": -0.10330984741449356, + "step": 14890 + }, + { + "epoch": 0.97, + "learning_rate": 9.608876597330952e-09, + "logits/chosen": -2.3046488761901855, + "logits/rejected": -1.9809150695800781, + "logps/chosen": -301.057861328125, + "logps/rejected": -309.0120849609375, + "loss": 0.0235, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.07217559963464737, + "rewards/margins": 0.09819747507572174, + "rewards/rejected": -0.1703730821609497, + "step": 14900 + }, + { + "epoch": 0.97, + "eval_logits/chosen": -2.277737855911255, + "eval_logits/rejected": -2.089785575866699, + "eval_logps/chosen": -240.29910278320312, + "eval_logps/rejected": -236.97601318359375, + "eval_loss": 0.023935021832585335, + "eval_rewards/accuracies": 0.6585000157356262, + "eval_rewards/chosen": -0.041470736265182495, + "eval_rewards/margins": 0.08534979820251465, + "eval_rewards/rejected": -0.12682053446769714, + "eval_runtime": 710.892, + "eval_samples_per_second": 2.813, + "eval_steps_per_second": 1.407, + "step": 14900 + }, + { + "epoch": 0.98, + "learning_rate": 9.115231098159594e-09, + "logits/chosen": -2.3617255687713623, + "logits/rejected": -2.250319004058838, + "logps/chosen": -262.0185852050781, + "logps/rejected": -255.55947875976562, + "loss": 0.0242, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.04352415353059769, + "rewards/margins": 0.06921429187059402, + "rewards/rejected": -0.11273844540119171, + "step": 14910 + }, + { + "epoch": 0.98, + "learning_rate": 8.634579274116317e-09, + "logits/chosen": -2.211763858795166, + "logits/rejected": -2.14729642868042, + "logps/chosen": -196.95347595214844, + "logps/rejected": -237.3124237060547, + "loss": 0.0317, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03447120264172554, + "rewards/margins": 0.09874703735113144, + "rewards/rejected": -0.13321822881698608, + "step": 14920 + }, + { + "epoch": 0.98, + "learning_rate": 8.166923632516865e-09, + "logits/chosen": -2.392099618911743, + "logits/rejected": -2.104140281677246, + "logps/chosen": -235.0437774658203, + "logps/rejected": -304.4234619140625, + "loss": 0.0161, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.0365629717707634, + "rewards/margins": 0.1587628573179245, + "rewards/rejected": -0.1953258067369461, + "step": 14930 + }, + { + "epoch": 0.98, + "learning_rate": 7.712266612881492e-09, + "logits/chosen": -2.130908250808716, + "logits/rejected": -2.0000076293945312, + "logps/chosen": -189.84274291992188, + "logps/rejected": -202.4415740966797, + "loss": 0.0312, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.023030346259474754, + "rewards/margins": 0.08737320452928543, + "rewards/rejected": -0.11040355265140533, + "step": 14940 + }, + { + "epoch": 0.98, + "learning_rate": 7.270610586924687e-09, + "logits/chosen": -2.409262180328369, + "logits/rejected": -2.1712772846221924, + "logps/chosen": -268.30853271484375, + "logps/rejected": -239.8023223876953, + "loss": 0.009, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.016716131940484047, + "rewards/margins": 0.07787027209997177, + "rewards/rejected": -0.09458640217781067, + "step": 14950 + }, + { + "epoch": 0.98, + "learning_rate": 6.841957858539916e-09, + "logits/chosen": -2.2161965370178223, + "logits/rejected": -2.0737485885620117, + "logps/chosen": -181.96450805664062, + "logps/rejected": -205.32528686523438, + "loss": 0.0355, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.07654479146003723, + "rewards/margins": 0.06255006790161133, + "rewards/rejected": -0.13909485936164856, + "step": 14960 + }, + { + "epoch": 0.98, + "learning_rate": 6.426310663790181e-09, + "logits/chosen": -2.1482691764831543, + "logits/rejected": -2.04679536819458, + "logps/chosen": -246.98974609375, + "logps/rejected": -229.2218017578125, + "loss": 0.0138, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.050069890916347504, + "rewards/margins": 0.06850225478410721, + "rewards/rejected": -0.11857215315103531, + "step": 14970 + }, + { + "epoch": 0.98, + "learning_rate": 6.023671170894696e-09, + "logits/chosen": -2.4767868518829346, + "logits/rejected": -1.8887195587158203, + "logps/chosen": -295.3521728515625, + "logps/rejected": -234.2387237548828, + "loss": 0.017, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.015118017792701721, + "rewards/margins": 0.128330796957016, + "rewards/rejected": -0.14344879984855652, + "step": 14980 + }, + { + "epoch": 0.98, + "learning_rate": 5.634041480218344e-09, + "logits/chosen": -2.4271140098571777, + "logits/rejected": -2.0972952842712402, + "logps/chosen": -259.35394287109375, + "logps/rejected": -271.25103759765625, + "loss": 0.0117, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.031401630491018295, + "rewards/margins": 0.08168746531009674, + "rewards/rejected": -0.11308909952640533, + "step": 14990 + }, + { + "epoch": 0.98, + "learning_rate": 5.257423624260849e-09, + "logits/chosen": -2.4722533226013184, + "logits/rejected": -2.0414023399353027, + "logps/chosen": -270.02227783203125, + "logps/rejected": -240.05807495117188, + "loss": 0.019, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03349440172314644, + "rewards/margins": 0.06639524549245834, + "rewards/rejected": -0.09988965094089508, + "step": 15000 + }, + { + "epoch": 0.98, + "eval_logits/chosen": -2.2778244018554688, + "eval_logits/rejected": -2.089890718460083, + "eval_logps/chosen": -240.30604553222656, + "eval_logps/rejected": -236.9526824951172, + "eval_loss": 0.023917993530631065, + "eval_rewards/accuracies": 0.6610000133514404, + "eval_rewards/chosen": -0.04150532931089401, + "eval_rewards/margins": 0.08519868552684784, + "eval_rewards/rejected": -0.12670400738716125, + "eval_runtime": 712.9633, + "eval_samples_per_second": 2.805, + "eval_steps_per_second": 1.403, + "step": 15000 + }, + { + "epoch": 0.98, + "learning_rate": 4.893819567644564e-09, + "logits/chosen": -2.157071352005005, + "logits/rejected": -2.131338596343994, + "logps/chosen": -197.52670288085938, + "logps/rejected": -218.3302001953125, + "loss": 0.0298, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0476498007774353, + "rewards/margins": 0.0531373992562294, + "rewards/rejected": -0.1007872000336647, + "step": 15010 + }, + { + "epoch": 0.98, + "learning_rate": 4.543231207107257e-09, + "logits/chosen": -2.2290079593658447, + "logits/rejected": -2.018141746520996, + "logps/chosen": -269.5322265625, + "logps/rejected": -258.697998046875, + "loss": 0.0416, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05855320021510124, + "rewards/margins": 0.06387249380350113, + "rewards/rejected": -0.12242569029331207, + "step": 15020 + }, + { + "epoch": 0.98, + "learning_rate": 4.205660371488785e-09, + "logits/chosen": -2.532921075820923, + "logits/rejected": -2.17484712600708, + "logps/chosen": -292.44708251953125, + "logps/rejected": -261.38031005859375, + "loss": 0.0184, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.041468605399131775, + "rewards/margins": 0.053616128861904144, + "rewards/rejected": -0.09508474171161652, + "step": 15030 + }, + { + "epoch": 0.98, + "learning_rate": 3.88110882172471e-09, + "logits/chosen": -2.1984848976135254, + "logits/rejected": -2.1399903297424316, + "logps/chosen": -232.79849243164062, + "logps/rejected": -237.8025665283203, + "loss": 0.0276, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06050034239888191, + "rewards/margins": 0.04820340871810913, + "rewards/rejected": -0.10870374739170074, + "step": 15040 + }, + { + "epoch": 0.98, + "learning_rate": 3.569578250834371e-09, + "logits/chosen": -2.2971742153167725, + "logits/rejected": -2.0346813201904297, + "logps/chosen": -307.75616455078125, + "logps/rejected": -291.34820556640625, + "loss": 0.0228, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.028891805559396744, + "rewards/margins": 0.10677383095026016, + "rewards/rejected": -0.135665625333786, + "step": 15050 + }, + { + "epoch": 0.99, + "learning_rate": 3.2710702839139353e-09, + "logits/chosen": -2.3282997608184814, + "logits/rejected": -2.1847825050354004, + "logps/chosen": -211.9817352294922, + "logps/rejected": -229.8556671142578, + "loss": 0.0211, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03691229224205017, + "rewards/margins": 0.043003737926483154, + "rewards/rejected": -0.07991602271795273, + "step": 15060 + }, + { + "epoch": 0.99, + "learning_rate": 2.9855864781272448e-09, + "logits/chosen": -2.294987440109253, + "logits/rejected": -2.332965612411499, + "logps/chosen": -219.7119140625, + "logps/rejected": -267.57525634765625, + "loss": 0.0142, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04019111022353172, + "rewards/margins": 0.06534648686647415, + "rewards/rejected": -0.10553759336471558, + "step": 15070 + }, + { + "epoch": 0.99, + "learning_rate": 2.7131283226977665e-09, + "logits/chosen": -2.299408197402954, + "logits/rejected": -2.3635406494140625, + "logps/chosen": -227.406982421875, + "logps/rejected": -257.7666931152344, + "loss": 0.0119, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.039793360978364944, + "rewards/margins": 0.08254800736904144, + "rewards/rejected": -0.12234137207269669, + "step": 15080 + }, + { + "epoch": 0.99, + "learning_rate": 2.4536972389008205e-09, + "logits/chosen": -2.236517906188965, + "logits/rejected": -2.0118248462677, + "logps/chosen": -242.9285430908203, + "logps/rejected": -228.8876190185547, + "loss": 0.0266, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.028967713937163353, + "rewards/margins": 0.118250273168087, + "rewards/rejected": -0.1472179889678955, + "step": 15090 + }, + { + "epoch": 0.99, + "learning_rate": 2.20729458005553e-09, + "logits/chosen": -2.152559757232666, + "logits/rejected": -1.9877811670303345, + "logps/chosen": -200.26695251464844, + "logps/rejected": -202.02871704101562, + "loss": 0.0368, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.029288524761795998, + "rewards/margins": 0.11981719732284546, + "rewards/rejected": -0.1491057127714157, + "step": 15100 + }, + { + "epoch": 0.99, + "eval_logits/chosen": -2.2784409523010254, + "eval_logits/rejected": -2.0904457569122314, + "eval_logps/chosen": -240.2960662841797, + "eval_logps/rejected": -236.9457550048828, + "eval_loss": 0.023891154676675797, + "eval_rewards/accuracies": 0.6604999899864197, + "eval_rewards/chosen": -0.04145561903715134, + "eval_rewards/margins": 0.08521371334791183, + "eval_rewards/rejected": -0.12666933238506317, + "eval_runtime": 716.4143, + "eval_samples_per_second": 2.792, + "eval_steps_per_second": 1.396, + "step": 15100 + }, + { + "epoch": 0.99, + "learning_rate": 1.9739216315192712e-09, + "logits/chosen": -2.263357162475586, + "logits/rejected": -2.052135944366455, + "logps/chosen": -236.76602172851562, + "logps/rejected": -223.97885131835938, + "loss": 0.0216, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.037286076694726944, + "rewards/margins": 0.054551173001527786, + "rewards/rejected": -0.09183724969625473, + "step": 15110 + }, + { + "epoch": 0.99, + "learning_rate": 1.7535796106796231e-09, + "logits/chosen": -2.345848321914673, + "logits/rejected": -2.0252323150634766, + "logps/chosen": -286.8771057128906, + "logps/rejected": -220.3518524169922, + "loss": 0.0156, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.056703440845012665, + "rewards/margins": 0.06902850419282913, + "rewards/rejected": -0.1257319450378418, + "step": 15120 + }, + { + "epoch": 0.99, + "learning_rate": 1.5462696669482636e-09, + "logits/chosen": -2.315427303314209, + "logits/rejected": -2.1944830417633057, + "logps/chosen": -226.43545532226562, + "logps/rejected": -250.79995727539062, + "loss": 0.009, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.025966573506593704, + "rewards/margins": 0.08360234647989273, + "rewards/rejected": -0.10956891626119614, + "step": 15130 + }, + { + "epoch": 0.99, + "learning_rate": 1.3519928817556927e-09, + "logits/chosen": -2.167985677719116, + "logits/rejected": -2.1198575496673584, + "logps/chosen": -179.43984985351562, + "logps/rejected": -195.4428253173828, + "loss": 0.0315, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.030231673270463943, + "rewards/margins": 0.07728022336959839, + "rewards/rejected": -0.10751190036535263, + "step": 15140 + }, + { + "epoch": 0.99, + "learning_rate": 1.1707502685448512e-09, + "logits/chosen": -2.3988311290740967, + "logits/rejected": -2.062964677810669, + "logps/chosen": -233.6556396484375, + "logps/rejected": -200.08004760742188, + "loss": 0.0161, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0569281280040741, + "rewards/margins": 0.12306445837020874, + "rewards/rejected": -0.17999258637428284, + "step": 15150 + }, + { + "epoch": 0.99, + "learning_rate": 1.002542772765569e-09, + "logits/chosen": -2.228262186050415, + "logits/rejected": -1.9481480121612549, + "logps/chosen": -196.65823364257812, + "logps/rejected": -177.3425750732422, + "loss": 0.0274, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04027201980352402, + "rewards/margins": 0.09495989978313446, + "rewards/rejected": -0.13523191213607788, + "step": 15160 + }, + { + "epoch": 0.99, + "learning_rate": 8.473712718709559e-10, + "logits/chosen": -2.0948386192321777, + "logits/rejected": -2.1079623699188232, + "logps/chosen": -205.24417114257812, + "logps/rejected": -202.10992431640625, + "loss": 0.0276, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05749630928039551, + "rewards/margins": 0.038219258189201355, + "rewards/rejected": -0.09571556746959686, + "step": 15170 + }, + { + "epoch": 0.99, + "learning_rate": 7.052365753112966e-10, + "logits/chosen": -2.1221511363983154, + "logits/rejected": -1.8346284627914429, + "logps/chosen": -246.028564453125, + "logps/rejected": -247.57980346679688, + "loss": 0.0285, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05356248468160629, + "rewards/margins": 0.10163445770740509, + "rewards/rejected": -0.15519694983959198, + "step": 15180 + }, + { + "epoch": 0.99, + "learning_rate": 5.761394245307195e-10, + "logits/chosen": -2.105289936065674, + "logits/rejected": -2.1355576515197754, + "logps/chosen": -244.42221069335938, + "logps/rejected": -252.21102905273438, + "loss": 0.0072, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03670011833310127, + "rewards/margins": 0.0450015589594841, + "rewards/rejected": -0.08170167356729507, + "step": 15190 + }, + { + "epoch": 0.99, + "learning_rate": 4.6008049296358826e-10, + "logits/chosen": -2.1782045364379883, + "logits/rejected": -2.0728297233581543, + "logps/chosen": -195.4044952392578, + "logps/rejected": -183.70651245117188, + "loss": 0.0267, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04690961167216301, + "rewards/margins": 0.08272843062877655, + "rewards/rejected": -0.12963804602622986, + "step": 15200 + }, + { + "epoch": 0.99, + "eval_logits/chosen": -2.2778265476226807, + "eval_logits/rejected": -2.0898988246917725, + "eval_logps/chosen": -240.2912139892578, + "eval_logps/rejected": -236.9213409423828, + "eval_loss": 0.02391921915113926, + "eval_rewards/accuracies": 0.6579999923706055, + "eval_rewards/chosen": -0.041431326419115067, + "eval_rewards/margins": 0.08511585742235184, + "eval_rewards/rejected": -0.1265471875667572, + "eval_runtime": 712.7473, + "eval_samples_per_second": 2.806, + "eval_steps_per_second": 1.403, + "step": 15200 + }, + { + "epoch": 1.0, + "learning_rate": 3.5706038603006146e-10, + "logits/chosen": -2.3718504905700684, + "logits/rejected": -2.3244967460632324, + "logps/chosen": -290.2176818847656, + "logps/rejected": -294.3794860839844, + "loss": 0.02, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0030365826096385717, + "rewards/margins": 0.09202475100755692, + "rewards/rejected": -0.09506133943796158, + "step": 15210 + }, + { + "epoch": 1.0, + "learning_rate": 2.670796411333165e-10, + "logits/chosen": -2.547684907913208, + "logits/rejected": -2.255117893218994, + "logps/chosen": -230.6437530517578, + "logps/rejected": -235.52615356445312, + "loss": 0.0224, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05416148155927658, + "rewards/margins": 0.08594264090061188, + "rewards/rejected": -0.14010414481163025, + "step": 15220 + }, + { + "epoch": 1.0, + "learning_rate": 1.9013872765677455e-10, + "logits/chosen": -2.2683866024017334, + "logits/rejected": -2.062041759490967, + "logps/chosen": -222.8066864013672, + "logps/rejected": -219.8751220703125, + "loss": 0.0143, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.01646781899034977, + "rewards/margins": 0.04801513999700546, + "rewards/rejected": -0.06448295712471008, + "step": 15230 + }, + { + "epoch": 1.0, + "learning_rate": 1.262380469624347e-10, + "logits/chosen": -2.24546480178833, + "logits/rejected": -2.0749285221099854, + "logps/chosen": -211.616455078125, + "logps/rejected": -201.745361328125, + "loss": 0.0339, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.038047708570957184, + "rewards/margins": 0.0651930570602417, + "rewards/rejected": -0.10324076563119888, + "step": 15240 + }, + { + "epoch": 1.0, + "learning_rate": 7.53779323872661e-11, + "logits/chosen": -2.1421215534210205, + "logits/rejected": -2.240142345428467, + "logps/chosen": -200.442138671875, + "logps/rejected": -227.1268310546875, + "loss": 0.0214, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.042893700301647186, + "rewards/margins": 0.10320397466421127, + "rewards/rejected": -0.14609768986701965, + "step": 15250 + }, + { + "epoch": 1.0, + "learning_rate": 3.7558649242652734e-11, + "logits/chosen": -2.455829381942749, + "logits/rejected": -2.216055393218994, + "logps/chosen": -404.4881591796875, + "logps/rejected": -340.13348388671875, + "loss": 0.0209, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.039021652191877365, + "rewards/margins": 0.07608093321323395, + "rewards/rejected": -0.11510257422924042, + "step": 15260 + }, + { + "epoch": 1.0, + "learning_rate": 1.2780394812450526e-11, + "logits/chosen": -2.086707592010498, + "logits/rejected": -2.040337562561035, + "logps/chosen": -244.7919921875, + "logps/rejected": -264.2740478515625, + "loss": 0.0164, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0632411390542984, + "rewards/margins": 0.09782315790653229, + "rewards/rejected": -0.1610642969608307, + "step": 15270 + }, + { + "epoch": 1.0, + "learning_rate": 1.0432983521546646e-12, + "logits/chosen": -2.1208715438842773, + "logits/rejected": -2.0309219360351562, + "logps/chosen": -194.59828186035156, + "logps/rejected": -233.06478881835938, + "loss": 0.0202, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05566044896841049, + "rewards/margins": 0.09810435026884079, + "rewards/rejected": -0.15376481413841248, + "step": 15280 + }, + { + "epoch": 1.0, + "step": 15284, + "total_flos": 0.0, + "train_loss": 0.028345070170466266, + "train_runtime": 172193.3354, + "train_samples_per_second": 0.355, + "train_steps_per_second": 0.089 + } + ], + "logging_steps": 10, + "max_steps": 15284, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}