{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998972954467648, "eval_steps": 100, "global_step": 6570, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.5662100456621e-10, "logits/chosen": -2.5806336402893066, "logits/rejected": -2.507266044616699, "logps/chosen": -74.40542602539062, "logps/rejected": -80.08415222167969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 4.5662100456621e-09, "logits/chosen": -2.6602303981781006, "logits/rejected": -2.3218560218811035, "logps/chosen": -85.13052368164062, "logps/rejected": -65.45736694335938, "loss": 0.6935, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": -0.0020011265296489, "rewards/margins": 0.0015958944568410516, "rewards/rejected": -0.0035970211029052734, "step": 10 }, { "epoch": 0.01, "learning_rate": 9.1324200913242e-09, "logits/chosen": -2.676975727081299, "logits/rejected": -2.353113889694214, "logps/chosen": -85.28013610839844, "logps/rejected": -66.55751037597656, "loss": 0.7026, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.01160498894751072, "rewards/margins": -0.03348593786358833, "rewards/rejected": 0.021880948916077614, "step": 20 }, { "epoch": 0.01, "learning_rate": 1.36986301369863e-08, "logits/chosen": -2.6892871856689453, "logits/rejected": -2.300590991973877, "logps/chosen": -83.56986236572266, "logps/rejected": -67.03092956542969, "loss": 0.6961, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0031071901321411133, "rewards/margins": 0.010077809914946556, "rewards/rejected": -0.01318500004708767, "step": 30 }, { "epoch": 0.02, "learning_rate": 1.82648401826484e-08, "logits/chosen": -2.669743299484253, "logits/rejected": -2.3138179779052734, "logps/chosen": -89.42594909667969, "logps/rejected": -67.19525909423828, "loss": 0.6847, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.009614706039428711, "rewards/margins": 0.03450586646795273, "rewards/rejected": -0.024891162291169167, "step": 40 }, { "epoch": 0.02, "learning_rate": 2.28310502283105e-08, "logits/chosen": -2.6460647583007812, "logits/rejected": -2.3767526149749756, "logps/chosen": -76.3002700805664, "logps/rejected": -61.87116241455078, "loss": 0.6789, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.00959167443215847, "rewards/margins": 0.05547298118472099, "rewards/rejected": -0.04588130861520767, "step": 50 }, { "epoch": 0.03, "learning_rate": 2.73972602739726e-08, "logits/chosen": -2.6644747257232666, "logits/rejected": -2.332427740097046, "logps/chosen": -84.58428192138672, "logps/rejected": -65.63447570800781, "loss": 0.6508, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.010953652672469616, "rewards/margins": 0.07386454194784164, "rewards/rejected": -0.06291089206933975, "step": 60 }, { "epoch": 0.03, "learning_rate": 3.19634703196347e-08, "logits/chosen": -2.674008846282959, "logits/rejected": -2.3772647380828857, "logps/chosen": -86.68485260009766, "logps/rejected": -67.3741683959961, "loss": 0.6355, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.03106447495520115, "rewards/margins": 0.12061159312725067, "rewards/rejected": -0.08954712003469467, "step": 70 }, { "epoch": 0.04, "learning_rate": 3.65296803652968e-08, "logits/chosen": -2.69270658493042, "logits/rejected": -2.407546043395996, "logps/chosen": -80.9357681274414, "logps/rejected": -62.59514617919922, "loss": 0.5861, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.06220342963933945, "rewards/margins": 0.19547274708747864, "rewards/rejected": -0.1332693099975586, "step": 80 }, { "epoch": 0.04, "learning_rate": 4.10958904109589e-08, "logits/chosen": -2.678983211517334, "logits/rejected": -2.4065537452697754, "logps/chosen": -77.17921447753906, "logps/rejected": -65.30864715576172, "loss": 0.5411, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.09732481092214584, "rewards/margins": 0.32089582085609436, "rewards/rejected": -0.2235710173845291, "step": 90 }, { "epoch": 0.05, "learning_rate": 4.5662100456621e-08, "logits/chosen": -2.671326160430908, "logits/rejected": -2.328735828399658, "logps/chosen": -83.13504791259766, "logps/rejected": -66.26860046386719, "loss": 0.5085, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.152253657579422, "rewards/margins": 0.45365238189697266, "rewards/rejected": -0.30139869451522827, "step": 100 }, { "epoch": 0.05, "eval_logits/chosen": -2.3585970401763916, "eval_logits/rejected": -2.0870280265808105, "eval_logps/chosen": -81.93758392333984, "eval_logps/rejected": -63.1281852722168, "eval_loss": 0.49781882762908936, "eval_rewards/accuracies": 0.9525139927864075, "eval_rewards/chosen": 0.12407515943050385, "eval_rewards/margins": 0.4575193524360657, "eval_rewards/rejected": -0.333444207906723, "eval_runtime": 279.4658, "eval_samples_per_second": 10.241, "eval_steps_per_second": 0.641, "step": 100 }, { "epoch": 0.05, "learning_rate": 5.02283105022831e-08, "logits/chosen": -2.6873135566711426, "logits/rejected": -2.3353161811828613, "logps/chosen": -84.43807983398438, "logps/rejected": -66.07930755615234, "loss": 0.482, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1872214376926422, "rewards/margins": 0.5431746244430542, "rewards/rejected": -0.355953186750412, "step": 110 }, { "epoch": 0.05, "learning_rate": 5.47945205479452e-08, "logits/chosen": -2.6962456703186035, "logits/rejected": -2.308206081390381, "logps/chosen": -86.16998291015625, "logps/rejected": -64.47346496582031, "loss": 0.4414, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2650398015975952, "rewards/margins": 0.7689052820205688, "rewards/rejected": -0.5038654804229736, "step": 120 }, { "epoch": 0.06, "learning_rate": 5.93607305936073e-08, "logits/chosen": -2.7041585445404053, "logits/rejected": -2.344529867172241, "logps/chosen": -88.99284362792969, "logps/rejected": -66.15284729003906, "loss": 0.3585, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2921959459781647, "rewards/margins": 0.9323012232780457, "rewards/rejected": -0.6401051878929138, "step": 130 }, { "epoch": 0.06, "learning_rate": 6.39269406392694e-08, "logits/chosen": -2.6795685291290283, "logits/rejected": -2.355473041534424, "logps/chosen": -87.90394592285156, "logps/rejected": -68.25851440429688, "loss": 0.3027, "rewards/accuracies": 1.0, "rewards/chosen": 0.4262579083442688, "rewards/margins": 1.2455202341079712, "rewards/rejected": -0.8192623257637024, "step": 140 }, { "epoch": 0.07, "learning_rate": 6.84931506849315e-08, "logits/chosen": -2.6943793296813965, "logits/rejected": -2.3250906467437744, "logps/chosen": -83.41421508789062, "logps/rejected": -61.89928436279297, "loss": 0.2721, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5733026266098022, "rewards/margins": 1.454756498336792, "rewards/rejected": -0.8814538717269897, "step": 150 }, { "epoch": 0.07, "learning_rate": 7.30593607305936e-08, "logits/chosen": -2.651337146759033, "logits/rejected": -2.376753330230713, "logps/chosen": -82.60858154296875, "logps/rejected": -69.9248046875, "loss": 0.2605, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2730570137500763, "rewards/margins": 1.371543526649475, "rewards/rejected": -1.098486304283142, "step": 160 }, { "epoch": 0.08, "learning_rate": 7.76255707762557e-08, "logits/chosen": -2.6764073371887207, "logits/rejected": -2.3649675846099854, "logps/chosen": -85.9468994140625, "logps/rejected": -66.21961975097656, "loss": 0.2672, "rewards/accuracies": 1.0, "rewards/chosen": 0.40627917647361755, "rewards/margins": 1.4158440828323364, "rewards/rejected": -1.009564757347107, "step": 170 }, { "epoch": 0.08, "learning_rate": 8.21917808219178e-08, "logits/chosen": -2.6904137134552, "logits/rejected": -2.3015971183776855, "logps/chosen": -87.99807739257812, "logps/rejected": -64.9723129272461, "loss": 0.2438, "rewards/accuracies": 1.0, "rewards/chosen": 0.5075963735580444, "rewards/margins": 1.5488159656524658, "rewards/rejected": -1.0412195920944214, "step": 180 }, { "epoch": 0.09, "learning_rate": 8.67579908675799e-08, "logits/chosen": -2.6737618446350098, "logits/rejected": -2.2952191829681396, "logps/chosen": -81.42160034179688, "logps/rejected": -63.59276580810547, "loss": 0.2196, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5729875564575195, "rewards/margins": 1.8921420574188232, "rewards/rejected": -1.3191546201705933, "step": 190 }, { "epoch": 0.09, "learning_rate": 9.1324200913242e-08, "logits/chosen": -2.683471441268921, "logits/rejected": -2.3723092079162598, "logps/chosen": -75.11089324951172, "logps/rejected": -63.15696334838867, "loss": 0.1966, "rewards/accuracies": 1.0, "rewards/chosen": 0.45912885665893555, "rewards/margins": 1.8716201782226562, "rewards/rejected": -1.4124913215637207, "step": 200 }, { "epoch": 0.09, "eval_logits/chosen": -2.3650243282318115, "eval_logits/rejected": -2.09184193611145, "eval_logps/chosen": -81.18123626708984, "eval_logps/rejected": -65.20204162597656, "eval_loss": 0.2003355473279953, "eval_rewards/accuracies": 0.9804469347000122, "eval_rewards/chosen": 0.5022494196891785, "eval_rewards/margins": 1.8726203441619873, "eval_rewards/rejected": -1.3703711032867432, "eval_runtime": 346.9988, "eval_samples_per_second": 8.248, "eval_steps_per_second": 0.516, "step": 200 }, { "epoch": 0.1, "learning_rate": 9.58904109589041e-08, "logits/chosen": -2.66351056098938, "logits/rejected": -2.3506274223327637, "logps/chosen": -81.50225067138672, "logps/rejected": -68.81380462646484, "loss": 0.1863, "rewards/accuracies": 1.0, "rewards/chosen": 0.5385276079177856, "rewards/margins": 2.1100916862487793, "rewards/rejected": -1.5715640783309937, "step": 210 }, { "epoch": 0.1, "learning_rate": 1.004566210045662e-07, "logits/chosen": -2.6930766105651855, "logits/rejected": -2.4052610397338867, "logps/chosen": -75.74411010742188, "logps/rejected": -66.29212951660156, "loss": 0.1686, "rewards/accuracies": 1.0, "rewards/chosen": 0.5017625093460083, "rewards/margins": 2.1454098224639893, "rewards/rejected": -1.64364755153656, "step": 220 }, { "epoch": 0.1, "learning_rate": 1.050228310502283e-07, "logits/chosen": -2.7195515632629395, "logits/rejected": -2.350597620010376, "logps/chosen": -88.55684661865234, "logps/rejected": -70.75065612792969, "loss": 0.136, "rewards/accuracies": 1.0, "rewards/chosen": 0.6530634164810181, "rewards/margins": 2.5016236305236816, "rewards/rejected": -1.8485599756240845, "step": 230 }, { "epoch": 0.11, "learning_rate": 1.095890410958904e-07, "logits/chosen": -2.6515941619873047, "logits/rejected": -2.3529882431030273, "logps/chosen": -82.51238250732422, "logps/rejected": -68.0801010131836, "loss": 0.1178, "rewards/accuracies": 1.0, "rewards/chosen": 0.6933033466339111, "rewards/margins": 2.715801239013672, "rewards/rejected": -2.0224976539611816, "step": 240 }, { "epoch": 0.11, "learning_rate": 1.141552511415525e-07, "logits/chosen": -2.6761295795440674, "logits/rejected": -2.312302350997925, "logps/chosen": -84.59378814697266, "logps/rejected": -70.56218719482422, "loss": 0.103, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.608777642250061, "rewards/margins": 2.9450581073760986, "rewards/rejected": -2.336280584335327, "step": 250 }, { "epoch": 0.12, "learning_rate": 1.187214611872146e-07, "logits/chosen": -2.690821409225464, "logits/rejected": -2.3399136066436768, "logps/chosen": -85.71834564208984, "logps/rejected": -68.01689147949219, "loss": 0.088, "rewards/accuracies": 1.0, "rewards/chosen": 0.8771006464958191, "rewards/margins": 3.4516937732696533, "rewards/rejected": -2.5745925903320312, "step": 260 }, { "epoch": 0.12, "learning_rate": 1.232876712328767e-07, "logits/chosen": -2.7160754203796387, "logits/rejected": -2.2973968982696533, "logps/chosen": -86.98188781738281, "logps/rejected": -70.23755645751953, "loss": 0.0865, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0049078464508057, "rewards/margins": 3.883263111114502, "rewards/rejected": -2.8783552646636963, "step": 270 }, { "epoch": 0.13, "learning_rate": 1.278538812785388e-07, "logits/chosen": -2.6905083656311035, "logits/rejected": -2.3996729850769043, "logps/chosen": -80.7530517578125, "logps/rejected": -73.58966827392578, "loss": 0.072, "rewards/accuracies": 1.0, "rewards/chosen": 0.8110305666923523, "rewards/margins": 3.6407856941223145, "rewards/rejected": -2.8297553062438965, "step": 280 }, { "epoch": 0.13, "learning_rate": 1.324200913242009e-07, "logits/chosen": -2.673684597015381, "logits/rejected": -2.3589425086975098, "logps/chosen": -83.12544250488281, "logps/rejected": -71.22173309326172, "loss": 0.0669, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9726211428642273, "rewards/margins": 4.033176422119141, "rewards/rejected": -3.0605549812316895, "step": 290 }, { "epoch": 0.14, "learning_rate": 1.36986301369863e-07, "logits/chosen": -2.7056469917297363, "logits/rejected": -2.375981092453003, "logps/chosen": -83.30494689941406, "logps/rejected": -68.67406463623047, "loss": 0.0612, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6668909788131714, "rewards/margins": 3.8882434368133545, "rewards/rejected": -3.2213528156280518, "step": 300 }, { "epoch": 0.14, "eval_logits/chosen": -2.3740596771240234, "eval_logits/rejected": -2.08868670463562, "eval_logps/chosen": -80.38629150390625, "eval_logps/rejected": -69.1242904663086, "eval_loss": 0.0655858963727951, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.8997250199317932, "eval_rewards/margins": 4.231220245361328, "eval_rewards/rejected": -3.331495523452759, "eval_runtime": 198.891, "eval_samples_per_second": 14.39, "eval_steps_per_second": 0.9, "step": 300 }, { "epoch": 0.14, "learning_rate": 1.415525114155251e-07, "logits/chosen": -2.6941418647766113, "logits/rejected": -2.345851182937622, "logps/chosen": -77.2409896850586, "logps/rejected": -68.9501953125, "loss": 0.0644, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0634305477142334, "rewards/margins": 4.496082305908203, "rewards/rejected": -3.432652235031128, "step": 310 }, { "epoch": 0.15, "learning_rate": 1.461187214611872e-07, "logits/chosen": -2.6939618587493896, "logits/rejected": -2.3623878955841064, "logps/chosen": -86.4214859008789, "logps/rejected": -74.5653305053711, "loss": 0.0539, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.1121734380722046, "rewards/margins": 4.8083696365356445, "rewards/rejected": -3.6961960792541504, "step": 320 }, { "epoch": 0.15, "learning_rate": 1.506849315068493e-07, "logits/chosen": -2.6660733222961426, "logits/rejected": -2.351500988006592, "logps/chosen": -78.15691375732422, "logps/rejected": -71.72764587402344, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": 1.1262184381484985, "rewards/margins": 4.566827297210693, "rewards/rejected": -3.4406089782714844, "step": 330 }, { "epoch": 0.16, "learning_rate": 1.552511415525114e-07, "logits/chosen": -2.681811571121216, "logits/rejected": -2.33913516998291, "logps/chosen": -78.15579223632812, "logps/rejected": -67.17436981201172, "loss": 0.0524, "rewards/accuracies": 1.0, "rewards/chosen": 1.2985680103302002, "rewards/margins": 5.002068042755127, "rewards/rejected": -3.7035000324249268, "step": 340 }, { "epoch": 0.16, "learning_rate": 1.598173515981735e-07, "logits/chosen": -2.6956095695495605, "logits/rejected": -2.3682236671447754, "logps/chosen": -78.71223449707031, "logps/rejected": -75.34117889404297, "loss": 0.0537, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7907094955444336, "rewards/margins": 4.904671669006348, "rewards/rejected": -4.113962173461914, "step": 350 }, { "epoch": 0.16, "learning_rate": 1.643835616438356e-07, "logits/chosen": -2.6930785179138184, "logits/rejected": -2.3822484016418457, "logps/chosen": -81.62545013427734, "logps/rejected": -72.93721008300781, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": 0.9396408200263977, "rewards/margins": 4.865753650665283, "rewards/rejected": -3.926112413406372, "step": 360 }, { "epoch": 0.17, "learning_rate": 1.689497716894977e-07, "logits/chosen": -2.6681289672851562, "logits/rejected": -2.3643383979797363, "logps/chosen": -82.47836303710938, "logps/rejected": -74.23176574707031, "loss": 0.0444, "rewards/accuracies": 1.0, "rewards/chosen": 0.8323831558227539, "rewards/margins": 4.88200569152832, "rewards/rejected": -4.049622535705566, "step": 370 }, { "epoch": 0.17, "learning_rate": 1.735159817351598e-07, "logits/chosen": -2.707343578338623, "logits/rejected": -2.2949228286743164, "logps/chosen": -82.89555358886719, "logps/rejected": -67.7196044921875, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": 1.1780091524124146, "rewards/margins": 5.6963019371032715, "rewards/rejected": -4.518293380737305, "step": 380 }, { "epoch": 0.18, "learning_rate": 1.780821917808219e-07, "logits/chosen": -2.6802215576171875, "logits/rejected": -2.3144307136535645, "logps/chosen": -82.10844421386719, "logps/rejected": -71.76032257080078, "loss": 0.0345, "rewards/accuracies": 1.0, "rewards/chosen": 1.0805195569992065, "rewards/margins": 6.083002090454102, "rewards/rejected": -5.0024824142456055, "step": 390 }, { "epoch": 0.18, "learning_rate": 1.82648401826484e-07, "logits/chosen": -2.6744701862335205, "logits/rejected": -2.401648759841919, "logps/chosen": -76.14572143554688, "logps/rejected": -74.10814666748047, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": 0.8850818872451782, "rewards/margins": 5.394505500793457, "rewards/rejected": -4.50942325592041, "step": 400 }, { "epoch": 0.18, "eval_logits/chosen": -2.38041353225708, "eval_logits/rejected": -2.0905208587646484, "eval_logps/chosen": -80.27851867675781, "eval_logps/rejected": -72.58271026611328, "eval_loss": 0.0355822928249836, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": 0.953613817691803, "eval_rewards/margins": 6.014315605163574, "eval_rewards/rejected": -5.060702323913574, "eval_runtime": 225.3287, "eval_samples_per_second": 12.701, "eval_steps_per_second": 0.794, "step": 400 }, { "epoch": 0.19, "learning_rate": 1.872146118721461e-07, "logits/chosen": -2.73512864112854, "logits/rejected": -2.272688865661621, "logps/chosen": -92.17513275146484, "logps/rejected": -73.54744720458984, "loss": 0.0279, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4529603719711304, "rewards/margins": 6.544045925140381, "rewards/rejected": -5.091085433959961, "step": 410 }, { "epoch": 0.19, "learning_rate": 1.917808219178082e-07, "logits/chosen": -2.711390256881714, "logits/rejected": -2.407224178314209, "logps/chosen": -77.8403549194336, "logps/rejected": -77.1642837524414, "loss": 0.0314, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8899542093276978, "rewards/margins": 6.297305107116699, "rewards/rejected": -5.407351493835449, "step": 420 }, { "epoch": 0.2, "learning_rate": 1.963470319634703e-07, "logits/chosen": -2.6724658012390137, "logits/rejected": -2.3597426414489746, "logps/chosen": -78.14308166503906, "logps/rejected": -73.49642181396484, "loss": 0.0293, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8707484006881714, "rewards/margins": 6.481536865234375, "rewards/rejected": -5.610787868499756, "step": 430 }, { "epoch": 0.2, "learning_rate": 2.009132420091324e-07, "logits/chosen": -2.6923375129699707, "logits/rejected": -2.372655153274536, "logps/chosen": -78.17349243164062, "logps/rejected": -75.12623596191406, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 0.9264253377914429, "rewards/margins": 6.722792148590088, "rewards/rejected": -5.7963666915893555, "step": 440 }, { "epoch": 0.21, "learning_rate": 2.054794520547945e-07, "logits/chosen": -2.7049460411071777, "logits/rejected": -2.348062038421631, "logps/chosen": -83.39576721191406, "logps/rejected": -74.19837951660156, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": 1.0997241735458374, "rewards/margins": 7.522848606109619, "rewards/rejected": -6.42312479019165, "step": 450 }, { "epoch": 0.21, "learning_rate": 2.100456621004566e-07, "logits/chosen": -2.706141710281372, "logits/rejected": -2.32466721534729, "logps/chosen": -86.99141693115234, "logps/rejected": -78.12458038330078, "loss": 0.0273, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5795246362686157, "rewards/margins": 7.050000190734863, "rewards/rejected": -6.470475196838379, "step": 460 }, { "epoch": 0.21, "learning_rate": 2.146118721461187e-07, "logits/chosen": -2.713736057281494, "logits/rejected": -2.3422598838806152, "logps/chosen": -86.9109115600586, "logps/rejected": -78.92709350585938, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": 1.1036381721496582, "rewards/margins": 7.163818359375, "rewards/rejected": -6.060181140899658, "step": 470 }, { "epoch": 0.22, "learning_rate": 2.191780821917808e-07, "logits/chosen": -2.7073612213134766, "logits/rejected": -2.3887152671813965, "logps/chosen": -78.95133972167969, "logps/rejected": -75.35789489746094, "loss": 0.0183, "rewards/accuracies": 1.0, "rewards/chosen": 0.9059654474258423, "rewards/margins": 7.699288845062256, "rewards/rejected": -6.793322563171387, "step": 480 }, { "epoch": 0.22, "learning_rate": 2.237442922374429e-07, "logits/chosen": -2.698927402496338, "logits/rejected": -2.365719795227051, "logps/chosen": -84.30168151855469, "logps/rejected": -79.71058654785156, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": 1.4775876998901367, "rewards/margins": 8.191506385803223, "rewards/rejected": -6.713918209075928, "step": 490 }, { "epoch": 0.23, "learning_rate": 2.28310502283105e-07, "logits/chosen": -2.7010083198547363, "logits/rejected": -2.3258798122406006, "logps/chosen": -82.32675170898438, "logps/rejected": -77.3822250366211, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": 0.7332242131233215, "rewards/margins": 8.125503540039062, "rewards/rejected": -7.392279148101807, "step": 500 }, { "epoch": 0.23, "eval_logits/chosen": -2.3914828300476074, "eval_logits/rejected": -2.0974159240722656, "eval_logps/chosen": -80.3698501586914, "eval_logps/rejected": -77.4731216430664, "eval_loss": 0.02008504420518875, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.9079460501670837, "eval_rewards/margins": 8.41385555267334, "eval_rewards/rejected": -7.5059099197387695, "eval_runtime": 202.3572, "eval_samples_per_second": 14.143, "eval_steps_per_second": 0.885, "step": 500 }, { "epoch": 0.23, "learning_rate": 2.328767123287671e-07, "logits/chosen": -2.718134880065918, "logits/rejected": -2.3481252193450928, "logps/chosen": -77.1845932006836, "logps/rejected": -76.77481842041016, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": 1.150839924812317, "rewards/margins": 8.828267097473145, "rewards/rejected": -7.677426338195801, "step": 510 }, { "epoch": 0.24, "learning_rate": 2.374429223744292e-07, "logits/chosen": -2.7372817993164062, "logits/rejected": -2.3306713104248047, "logps/chosen": -86.04837799072266, "logps/rejected": -82.73287963867188, "loss": 0.0139, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1335670948028564, "rewards/margins": 9.133977890014648, "rewards/rejected": -8.000411033630371, "step": 520 }, { "epoch": 0.24, "learning_rate": 2.420091324200913e-07, "logits/chosen": -2.7076432704925537, "logits/rejected": -2.3714871406555176, "logps/chosen": -82.55339813232422, "logps/rejected": -83.71029663085938, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": 0.8402870297431946, "rewards/margins": 9.453120231628418, "rewards/rejected": -8.612833976745605, "step": 530 }, { "epoch": 0.25, "learning_rate": 2.465753424657534e-07, "logits/chosen": -2.727102518081665, "logits/rejected": -2.3391330242156982, "logps/chosen": -85.00145721435547, "logps/rejected": -86.95020294189453, "loss": 0.0137, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7179019451141357, "rewards/margins": 10.270326614379883, "rewards/rejected": -8.5524263381958, "step": 540 }, { "epoch": 0.25, "learning_rate": 2.511415525114155e-07, "logits/chosen": -2.7112972736358643, "logits/rejected": -2.359590530395508, "logps/chosen": -87.84523010253906, "logps/rejected": -84.37208557128906, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 0.21119078993797302, "rewards/margins": 9.421560287475586, "rewards/rejected": -9.210370063781738, "step": 550 }, { "epoch": 0.26, "learning_rate": 2.557077625570776e-07, "logits/chosen": -2.7153267860412598, "logits/rejected": -2.389270067214966, "logps/chosen": -81.59357452392578, "logps/rejected": -81.81436157226562, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": 0.7386399507522583, "rewards/margins": 9.357259750366211, "rewards/rejected": -8.618619918823242, "step": 560 }, { "epoch": 0.26, "learning_rate": 2.602739726027397e-07, "logits/chosen": -2.723977565765381, "logits/rejected": -2.3546102046966553, "logps/chosen": -82.7842788696289, "logps/rejected": -81.7882080078125, "loss": 0.018, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3646135330200195, "rewards/margins": 10.645269393920898, "rewards/rejected": -9.280656814575195, "step": 570 }, { "epoch": 0.26, "learning_rate": 2.648401826484018e-07, "logits/chosen": -2.7005605697631836, "logits/rejected": -2.3339247703552246, "logps/chosen": -80.11715698242188, "logps/rejected": -80.21146392822266, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 0.8744019269943237, "rewards/margins": 10.899045944213867, "rewards/rejected": -10.02464485168457, "step": 580 }, { "epoch": 0.27, "learning_rate": 2.694063926940639e-07, "logits/chosen": -2.6734836101531982, "logits/rejected": -2.4014201164245605, "logps/chosen": -82.53780364990234, "logps/rejected": -84.220458984375, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 0.7647464871406555, "rewards/margins": 10.639299392700195, "rewards/rejected": -9.874550819396973, "step": 590 }, { "epoch": 0.27, "learning_rate": 2.73972602739726e-07, "logits/chosen": -2.726975917816162, "logits/rejected": -2.3305318355560303, "logps/chosen": -80.99850463867188, "logps/rejected": -84.60621643066406, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": 0.9542244076728821, "rewards/margins": 11.250876426696777, "rewards/rejected": -10.296652793884277, "step": 600 }, { "epoch": 0.27, "eval_logits/chosen": -2.395996570587158, "eval_logits/rejected": -2.098736524581909, "eval_logps/chosen": -80.74805450439453, "eval_logps/rejected": -83.36124420166016, "eval_loss": 0.012996630743145943, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.7188423275947571, "eval_rewards/margins": 11.168814659118652, "eval_rewards/rejected": -10.449972152709961, "eval_runtime": 200.4622, "eval_samples_per_second": 14.277, "eval_steps_per_second": 0.893, "step": 600 }, { "epoch": 0.28, "learning_rate": 2.785388127853881e-07, "logits/chosen": -2.707491159439087, "logits/rejected": -2.340517520904541, "logps/chosen": -83.68267059326172, "logps/rejected": -84.78146362304688, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": 1.0178213119506836, "rewards/margins": 11.22564697265625, "rewards/rejected": -10.207826614379883, "step": 610 }, { "epoch": 0.28, "learning_rate": 2.831050228310502e-07, "logits/chosen": -2.7114086151123047, "logits/rejected": -2.3884994983673096, "logps/chosen": -80.62281799316406, "logps/rejected": -88.79174041748047, "loss": 0.0132, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.033941447734832764, "rewards/margins": 11.68907356262207, "rewards/rejected": -11.655131340026855, "step": 620 }, { "epoch": 0.29, "learning_rate": 2.876712328767123e-07, "logits/chosen": -2.7296934127807617, "logits/rejected": -2.3777823448181152, "logps/chosen": -81.03443908691406, "logps/rejected": -89.4770736694336, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": 1.3098781108856201, "rewards/margins": 13.027020454406738, "rewards/rejected": -11.717142105102539, "step": 630 }, { "epoch": 0.29, "learning_rate": 2.922374429223744e-07, "logits/chosen": -2.7400169372558594, "logits/rejected": -2.3298070430755615, "logps/chosen": -84.39897155761719, "logps/rejected": -85.83755493164062, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 0.713331401348114, "rewards/margins": 11.40768051147461, "rewards/rejected": -10.694350242614746, "step": 640 }, { "epoch": 0.3, "learning_rate": 2.968036529680365e-07, "logits/chosen": -2.7301645278930664, "logits/rejected": -2.363854169845581, "logps/chosen": -84.8055191040039, "logps/rejected": -86.34642028808594, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 1.004478096961975, "rewards/margins": 12.842742919921875, "rewards/rejected": -11.838266372680664, "step": 650 }, { "epoch": 0.3, "learning_rate": 2.998477929984779e-07, "logits/chosen": -2.729630708694458, "logits/rejected": -2.3081212043762207, "logps/chosen": -82.32767486572266, "logps/rejected": -89.8006362915039, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 0.7983686327934265, "rewards/margins": 13.078669548034668, "rewards/rejected": -12.28030014038086, "step": 660 }, { "epoch": 0.31, "learning_rate": 2.993404363267377e-07, "logits/chosen": -2.7496962547302246, "logits/rejected": -2.3616135120391846, "logps/chosen": -82.91675567626953, "logps/rejected": -87.04971313476562, "loss": 0.0086, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6455497741699219, "rewards/margins": 12.818426132202148, "rewards/rejected": -12.172876358032227, "step": 670 }, { "epoch": 0.31, "learning_rate": 2.9883307965499743e-07, "logits/chosen": -2.6971893310546875, "logits/rejected": -2.342924118041992, "logps/chosen": -82.18447875976562, "logps/rejected": -93.10762786865234, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 0.9928043484687805, "rewards/margins": 14.523343086242676, "rewards/rejected": -13.530538558959961, "step": 680 }, { "epoch": 0.31, "learning_rate": 2.983257229832572e-07, "logits/chosen": -2.727041244506836, "logits/rejected": -2.4135303497314453, "logps/chosen": -81.05046081542969, "logps/rejected": -86.6777572631836, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": 0.9642049670219421, "rewards/margins": 13.336194038391113, "rewards/rejected": -12.371989250183105, "step": 690 }, { "epoch": 0.32, "learning_rate": 2.9781836631151696e-07, "logits/chosen": -2.7585201263427734, "logits/rejected": -2.3671116828918457, "logps/chosen": -87.6626968383789, "logps/rejected": -91.76471710205078, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 0.9259325265884399, "rewards/margins": 13.375378608703613, "rewards/rejected": -12.449445724487305, "step": 700 }, { "epoch": 0.32, "eval_logits/chosen": -2.4103851318359375, "eval_logits/rejected": -2.1111161708831787, "eval_logps/chosen": -80.8578872680664, "eval_logps/rejected": -88.73031616210938, "eval_loss": 0.01022921223193407, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.6639266610145569, "eval_rewards/margins": 13.798431396484375, "eval_rewards/rejected": -13.134505271911621, "eval_runtime": 228.5232, "eval_samples_per_second": 12.524, "eval_steps_per_second": 0.783, "step": 700 }, { "epoch": 0.32, "learning_rate": 2.9731100963977676e-07, "logits/chosen": -2.7267959117889404, "logits/rejected": -2.393183469772339, "logps/chosen": -83.08018493652344, "logps/rejected": -94.58126831054688, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 0.9232366681098938, "rewards/margins": 14.582369804382324, "rewards/rejected": -13.65913200378418, "step": 710 }, { "epoch": 0.33, "learning_rate": 2.968036529680365e-07, "logits/chosen": -2.730743885040283, "logits/rejected": -2.473689079284668, "logps/chosen": -77.15296173095703, "logps/rejected": -94.04051208496094, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 0.6984473466873169, "rewards/margins": 14.17845630645752, "rewards/rejected": -13.480009078979492, "step": 720 }, { "epoch": 0.33, "learning_rate": 2.962962962962963e-07, "logits/chosen": -2.715235471725464, "logits/rejected": -2.3469748497009277, "logps/chosen": -81.80482482910156, "logps/rejected": -92.98884582519531, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 0.6785728335380554, "rewards/margins": 14.93310260772705, "rewards/rejected": -14.254528999328613, "step": 730 }, { "epoch": 0.34, "learning_rate": 2.9578893962455603e-07, "logits/chosen": -2.7326717376708984, "logits/rejected": -2.3795382976531982, "logps/chosen": -80.54466247558594, "logps/rejected": -91.09703063964844, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 0.7052832841873169, "rewards/margins": 14.463628768920898, "rewards/rejected": -13.758343696594238, "step": 740 }, { "epoch": 0.34, "learning_rate": 2.952815829528158e-07, "logits/chosen": -2.7219204902648926, "logits/rejected": -2.3723695278167725, "logps/chosen": -83.64436340332031, "logps/rejected": -94.90473175048828, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": 0.9612115621566772, "rewards/margins": 15.423006057739258, "rewards/rejected": -14.461793899536133, "step": 750 }, { "epoch": 0.35, "learning_rate": 2.9477422628107556e-07, "logits/chosen": -2.7592785358428955, "logits/rejected": -2.336015224456787, "logps/chosen": -85.76075744628906, "logps/rejected": -92.52715301513672, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 1.1976195573806763, "rewards/margins": 14.608482360839844, "rewards/rejected": -13.410862922668457, "step": 760 }, { "epoch": 0.35, "learning_rate": 2.9426686960933536e-07, "logits/chosen": -2.730905055999756, "logits/rejected": -2.374079465866089, "logps/chosen": -82.19593811035156, "logps/rejected": -92.55106353759766, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": 0.6844054460525513, "rewards/margins": 14.929788589477539, "rewards/rejected": -14.245382308959961, "step": 770 }, { "epoch": 0.36, "learning_rate": 2.937595129375951e-07, "logits/chosen": -2.7502939701080322, "logits/rejected": -2.3946685791015625, "logps/chosen": -84.39253234863281, "logps/rejected": -93.54348754882812, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 0.9502304196357727, "rewards/margins": 15.26049518585205, "rewards/rejected": -14.310264587402344, "step": 780 }, { "epoch": 0.36, "learning_rate": 2.932521562658549e-07, "logits/chosen": -2.7556135654449463, "logits/rejected": -2.4346609115600586, "logps/chosen": -80.71319580078125, "logps/rejected": -94.5219955444336, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 1.0875753164291382, "rewards/margins": 16.16122055053711, "rewards/rejected": -15.073646545410156, "step": 790 }, { "epoch": 0.37, "learning_rate": 2.9274479959411463e-07, "logits/chosen": -2.7179512977600098, "logits/rejected": -2.3952744007110596, "logps/chosen": -83.53395080566406, "logps/rejected": -94.33482360839844, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8440467119216919, "rewards/margins": 14.868667602539062, "rewards/rejected": -14.024620056152344, "step": 800 }, { "epoch": 0.37, "eval_logits/chosen": -2.4030351638793945, "eval_logits/rejected": -2.1030843257904053, "eval_logps/chosen": -80.360107421875, "eval_logps/rejected": -88.85675048828125, "eval_loss": 0.009848731569945812, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.9128196239471436, "eval_rewards/margins": 14.11054515838623, "eval_rewards/rejected": -13.197725296020508, "eval_runtime": 191.5037, "eval_samples_per_second": 14.945, "eval_steps_per_second": 0.935, "step": 800 }, { "epoch": 0.37, "learning_rate": 2.922374429223744e-07, "logits/chosen": -2.736855983734131, "logits/rejected": -2.385502815246582, "logps/chosen": -76.4046630859375, "logps/rejected": -91.60218811035156, "loss": 0.0139, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9161388278007507, "rewards/margins": 14.970947265625, "rewards/rejected": -14.0548095703125, "step": 810 }, { "epoch": 0.37, "learning_rate": 2.9173008625063416e-07, "logits/chosen": -2.7486324310302734, "logits/rejected": -2.4126791954040527, "logps/chosen": -76.73153686523438, "logps/rejected": -88.30491638183594, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": 1.6000778675079346, "rewards/margins": 15.742793083190918, "rewards/rejected": -14.142717361450195, "step": 820 }, { "epoch": 0.38, "learning_rate": 2.9122272957889396e-07, "logits/chosen": -2.7615063190460205, "logits/rejected": -2.3541018962860107, "logps/chosen": -83.30259704589844, "logps/rejected": -90.85735321044922, "loss": 0.0083, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9555395245552063, "rewards/margins": 14.290064811706543, "rewards/rejected": -13.334524154663086, "step": 830 }, { "epoch": 0.38, "learning_rate": 2.907153729071537e-07, "logits/chosen": -2.7420332431793213, "logits/rejected": -2.406613826751709, "logps/chosen": -79.8556137084961, "logps/rejected": -92.02437591552734, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 1.5855063199996948, "rewards/margins": 15.713933944702148, "rewards/rejected": -14.128427505493164, "step": 840 }, { "epoch": 0.39, "learning_rate": 2.902080162354135e-07, "logits/chosen": -2.7244153022766113, "logits/rejected": -2.3926236629486084, "logps/chosen": -80.45246124267578, "logps/rejected": -92.79832458496094, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 0.8010252118110657, "rewards/margins": 15.269224166870117, "rewards/rejected": -14.468198776245117, "step": 850 }, { "epoch": 0.39, "learning_rate": 2.8970065956367323e-07, "logits/chosen": -2.7244489192962646, "logits/rejected": -2.349091053009033, "logps/chosen": -85.38154602050781, "logps/rejected": -94.31491088867188, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 0.786288857460022, "rewards/margins": 15.459881782531738, "rewards/rejected": -14.673593521118164, "step": 860 }, { "epoch": 0.4, "learning_rate": 2.89193302891933e-07, "logits/chosen": -2.727729558944702, "logits/rejected": -2.4048168659210205, "logps/chosen": -84.41932678222656, "logps/rejected": -96.66879272460938, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": 0.9753098487854004, "rewards/margins": 15.289289474487305, "rewards/rejected": -14.31397819519043, "step": 870 }, { "epoch": 0.4, "learning_rate": 2.8868594622019276e-07, "logits/chosen": -2.780242919921875, "logits/rejected": -2.3804755210876465, "logps/chosen": -85.80220794677734, "logps/rejected": -94.20738220214844, "loss": 0.007, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2555419206619263, "rewards/margins": 16.453609466552734, "rewards/rejected": -15.19806957244873, "step": 880 }, { "epoch": 0.41, "learning_rate": 2.8817858954845256e-07, "logits/chosen": -2.7227251529693604, "logits/rejected": -2.45554256439209, "logps/chosen": -78.00861358642578, "logps/rejected": -95.69950866699219, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 0.6950634121894836, "rewards/margins": 16.06792449951172, "rewards/rejected": -15.372861862182617, "step": 890 }, { "epoch": 0.41, "learning_rate": 2.876712328767123e-07, "logits/chosen": -2.7370660305023193, "logits/rejected": -2.420109748840332, "logps/chosen": -77.81713104248047, "logps/rejected": -91.51441192626953, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 1.133911371231079, "rewards/margins": 15.646966934204102, "rewards/rejected": -14.513055801391602, "step": 900 }, { "epoch": 0.41, "eval_logits/chosen": -2.4143810272216797, "eval_logits/rejected": -2.1158018112182617, "eval_logps/chosen": -80.96397399902344, "eval_logps/rejected": -93.7409439086914, "eval_loss": 0.009246980771422386, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.610884964466095, "eval_rewards/margins": 16.25071144104004, "eval_rewards/rejected": -15.639823913574219, "eval_runtime": 214.2753, "eval_samples_per_second": 13.357, "eval_steps_per_second": 0.835, "step": 900 }, { "epoch": 0.42, "learning_rate": 2.871638762049721e-07, "logits/chosen": -2.7090625762939453, "logits/rejected": -2.3218986988067627, "logps/chosen": -82.98603057861328, "logps/rejected": -96.97901916503906, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 1.351296067237854, "rewards/margins": 16.706687927246094, "rewards/rejected": -15.355390548706055, "step": 910 }, { "epoch": 0.42, "learning_rate": 2.8665651953323183e-07, "logits/chosen": -2.7467129230499268, "logits/rejected": -2.347712516784668, "logps/chosen": -82.70410919189453, "logps/rejected": -92.56896209716797, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 1.697571039199829, "rewards/margins": 16.65634536743164, "rewards/rejected": -14.958773612976074, "step": 920 }, { "epoch": 0.42, "learning_rate": 2.861491628614916e-07, "logits/chosen": -2.7520878314971924, "logits/rejected": -2.3678460121154785, "logps/chosen": -83.40738677978516, "logps/rejected": -97.37800598144531, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": 0.38049930334091187, "rewards/margins": 16.591970443725586, "rewards/rejected": -16.211471557617188, "step": 930 }, { "epoch": 0.43, "learning_rate": 2.8564180618975136e-07, "logits/chosen": -2.7488136291503906, "logits/rejected": -2.4220871925354004, "logps/chosen": -89.0190200805664, "logps/rejected": -101.41310119628906, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 0.6862933039665222, "rewards/margins": 17.18902015686035, "rewards/rejected": -16.50272560119629, "step": 940 }, { "epoch": 0.43, "learning_rate": 2.8513444951801116e-07, "logits/chosen": -2.7336249351501465, "logits/rejected": -2.399651050567627, "logps/chosen": -80.05248260498047, "logps/rejected": -97.5555648803711, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 1.1636197566986084, "rewards/margins": 18.248010635375977, "rewards/rejected": -17.084386825561523, "step": 950 }, { "epoch": 0.44, "learning_rate": 2.846270928462709e-07, "logits/chosen": -2.74477219581604, "logits/rejected": -2.413490056991577, "logps/chosen": -83.87934875488281, "logps/rejected": -102.07732391357422, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.24278709292411804, "rewards/margins": 16.87543487548828, "rewards/rejected": -16.632646560668945, "step": 960 }, { "epoch": 0.44, "learning_rate": 2.841197361745307e-07, "logits/chosen": -2.7593932151794434, "logits/rejected": -2.4102084636688232, "logps/chosen": -78.9996337890625, "logps/rejected": -97.92736053466797, "loss": 0.0075, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2139979600906372, "rewards/margins": 18.505090713500977, "rewards/rejected": -17.291091918945312, "step": 970 }, { "epoch": 0.45, "learning_rate": 2.8361237950279043e-07, "logits/chosen": -2.738161563873291, "logits/rejected": -2.3084099292755127, "logps/chosen": -90.83815002441406, "logps/rejected": -102.61600494384766, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": 0.9110714793205261, "rewards/margins": 18.789249420166016, "rewards/rejected": -17.878177642822266, "step": 980 }, { "epoch": 0.45, "learning_rate": 2.831050228310502e-07, "logits/chosen": -2.7366559505462646, "logits/rejected": -2.37532114982605, "logps/chosen": -81.99267578125, "logps/rejected": -94.72532653808594, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 1.1337707042694092, "rewards/margins": 16.640193939208984, "rewards/rejected": -15.506423950195312, "step": 990 }, { "epoch": 0.46, "learning_rate": 2.8259766615930996e-07, "logits/chosen": -2.738548517227173, "logits/rejected": -2.358704090118408, "logps/chosen": -83.34911346435547, "logps/rejected": -92.97615051269531, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 1.2462575435638428, "rewards/margins": 17.345651626586914, "rewards/rejected": -16.09939193725586, "step": 1000 }, { "epoch": 0.46, "eval_logits/chosen": -2.394568920135498, "eval_logits/rejected": -2.098773241043091, "eval_logps/chosen": -80.1893310546875, "eval_logps/rejected": -94.47550201416016, "eval_loss": 0.009426701813936234, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.9982085227966309, "eval_rewards/margins": 17.005308151245117, "eval_rewards/rejected": -16.00710105895996, "eval_runtime": 206.2414, "eval_samples_per_second": 13.877, "eval_steps_per_second": 0.868, "step": 1000 }, { "epoch": 0.46, "learning_rate": 2.8209030948756976e-07, "logits/chosen": -2.7161478996276855, "logits/rejected": -2.375811815261841, "logps/chosen": -82.14879608154297, "logps/rejected": -100.61568450927734, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 1.595202088356018, "rewards/margins": 17.73694610595703, "rewards/rejected": -16.141742706298828, "step": 1010 }, { "epoch": 0.47, "learning_rate": 2.815829528158295e-07, "logits/chosen": -2.6856274604797363, "logits/rejected": -2.388237714767456, "logps/chosen": -74.16461181640625, "logps/rejected": -92.93270874023438, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 1.0667402744293213, "rewards/margins": 17.18326187133789, "rewards/rejected": -16.116519927978516, "step": 1020 }, { "epoch": 0.47, "learning_rate": 2.810755961440893e-07, "logits/chosen": -2.7035679817199707, "logits/rejected": -2.396179676055908, "logps/chosen": -77.85939025878906, "logps/rejected": -101.44688415527344, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 0.7818120121955872, "rewards/margins": 17.59056282043457, "rewards/rejected": -16.80875015258789, "step": 1030 }, { "epoch": 0.47, "learning_rate": 2.8056823947234903e-07, "logits/chosen": -2.7187979221343994, "logits/rejected": -2.3575632572174072, "logps/chosen": -82.31752014160156, "logps/rejected": -94.10159301757812, "loss": 0.0078, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8782583475112915, "rewards/margins": 17.394351959228516, "rewards/rejected": -15.516092300415039, "step": 1040 }, { "epoch": 0.48, "learning_rate": 2.800608828006088e-07, "logits/chosen": -2.7172255516052246, "logits/rejected": -2.438119411468506, "logps/chosen": -77.85316467285156, "logps/rejected": -97.91680908203125, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 0.2746346592903137, "rewards/margins": 16.604778289794922, "rewards/rejected": -16.330142974853516, "step": 1050 }, { "epoch": 0.48, "learning_rate": 2.7955352612886856e-07, "logits/chosen": -2.6851072311401367, "logits/rejected": -2.3960647583007812, "logps/chosen": -81.01815795898438, "logps/rejected": -100.95433044433594, "loss": 0.0093, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.07826070487499237, "rewards/margins": 17.925365447998047, "rewards/rejected": -18.00362777709961, "step": 1060 }, { "epoch": 0.49, "learning_rate": 2.7904616945712836e-07, "logits/chosen": -2.7135329246520996, "logits/rejected": -2.365593910217285, "logps/chosen": -80.24275207519531, "logps/rejected": -98.96147155761719, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 0.9913237690925598, "rewards/margins": 19.149242401123047, "rewards/rejected": -18.157917022705078, "step": 1070 }, { "epoch": 0.49, "learning_rate": 2.785388127853881e-07, "logits/chosen": -2.7224042415618896, "logits/rejected": -2.35522198677063, "logps/chosen": -81.04705047607422, "logps/rejected": -92.48741149902344, "loss": 0.0084, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9514676332473755, "rewards/margins": 17.620031356811523, "rewards/rejected": -16.66856575012207, "step": 1080 }, { "epoch": 0.5, "learning_rate": 2.780314561136479e-07, "logits/chosen": -2.7103464603424072, "logits/rejected": -2.391573429107666, "logps/chosen": -79.70823669433594, "logps/rejected": -101.65653991699219, "loss": 0.0031, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6482914686203003, "rewards/margins": 19.12604522705078, "rewards/rejected": -18.477754592895508, "step": 1090 }, { "epoch": 0.5, "learning_rate": 2.7752409944190763e-07, "logits/chosen": -2.721518039703369, "logits/rejected": -2.3715062141418457, "logps/chosen": -82.43116760253906, "logps/rejected": -103.2406005859375, "loss": 0.0061, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3527074456214905, "rewards/margins": 19.13258171081543, "rewards/rejected": -18.77987289428711, "step": 1100 }, { "epoch": 0.5, "eval_logits/chosen": -2.395505428314209, "eval_logits/rejected": -2.0990757942199707, "eval_logps/chosen": -81.0848617553711, "eval_logps/rejected": -98.48635864257812, "eval_loss": 0.008930802345275879, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.5504406690597534, "eval_rewards/margins": 18.56296730041504, "eval_rewards/rejected": -18.012527465820312, "eval_runtime": 187.7489, "eval_samples_per_second": 15.244, "eval_steps_per_second": 0.953, "step": 1100 }, { "epoch": 0.51, "learning_rate": 2.770167427701674e-07, "logits/chosen": -2.7353763580322266, "logits/rejected": -2.3498454093933105, "logps/chosen": -85.130126953125, "logps/rejected": -104.15633392333984, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 0.9609131813049316, "rewards/margins": 20.31955337524414, "rewards/rejected": -19.358638763427734, "step": 1110 }, { "epoch": 0.51, "learning_rate": 2.7650938609842716e-07, "logits/chosen": -2.7142040729522705, "logits/rejected": -2.3519530296325684, "logps/chosen": -81.28278350830078, "logps/rejected": -95.15019226074219, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.451286792755127, "rewards/margins": 18.769222259521484, "rewards/rejected": -17.317935943603516, "step": 1120 }, { "epoch": 0.52, "learning_rate": 2.7600202942668696e-07, "logits/chosen": -2.7112655639648438, "logits/rejected": -2.333254098892212, "logps/chosen": -85.1860122680664, "logps/rejected": -100.27143859863281, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1594610214233398, "rewards/margins": 19.093997955322266, "rewards/rejected": -17.934534072875977, "step": 1130 }, { "epoch": 0.52, "learning_rate": 2.754946727549467e-07, "logits/chosen": -2.690727949142456, "logits/rejected": -2.3567252159118652, "logps/chosen": -84.9158706665039, "logps/rejected": -105.35355377197266, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 0.020466184243559837, "rewards/margins": 18.791444778442383, "rewards/rejected": -18.770977020263672, "step": 1140 }, { "epoch": 0.52, "learning_rate": 2.749873160832065e-07, "logits/chosen": -2.713503837585449, "logits/rejected": -2.3460965156555176, "logps/chosen": -81.88911437988281, "logps/rejected": -99.89048767089844, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.1322228908538818, "rewards/margins": 20.08858871459961, "rewards/rejected": -18.956363677978516, "step": 1150 }, { "epoch": 0.53, "learning_rate": 2.7447995941146623e-07, "logits/chosen": -2.713376998901367, "logits/rejected": -2.3725056648254395, "logps/chosen": -87.28949737548828, "logps/rejected": -104.84793853759766, "loss": 0.0193, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.5760872960090637, "rewards/margins": 19.00182342529297, "rewards/rejected": -19.57790756225586, "step": 1160 }, { "epoch": 0.53, "learning_rate": 2.73972602739726e-07, "logits/chosen": -2.7053189277648926, "logits/rejected": -2.42372727394104, "logps/chosen": -82.02760314941406, "logps/rejected": -105.42372131347656, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 0.3321594297885895, "rewards/margins": 19.859477996826172, "rewards/rejected": -19.52731704711914, "step": 1170 }, { "epoch": 0.54, "learning_rate": 2.7346524606798576e-07, "logits/chosen": -2.688375949859619, "logits/rejected": -2.371124267578125, "logps/chosen": -85.1495590209961, "logps/rejected": -107.7877197265625, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -0.15405194461345673, "rewards/margins": 20.586816787719727, "rewards/rejected": -20.740869522094727, "step": 1180 }, { "epoch": 0.54, "learning_rate": 2.7295788939624556e-07, "logits/chosen": -2.677525281906128, "logits/rejected": -2.2992100715637207, "logps/chosen": -85.57402801513672, "logps/rejected": -100.76956176757812, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 0.30255943536758423, "rewards/margins": 18.784530639648438, "rewards/rejected": -18.48197364807129, "step": 1190 }, { "epoch": 0.55, "learning_rate": 2.724505327245053e-07, "logits/chosen": -2.7017364501953125, "logits/rejected": -2.349390745162964, "logps/chosen": -84.68556213378906, "logps/rejected": -104.7709732055664, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": 0.42778903245925903, "rewards/margins": 19.887773513793945, "rewards/rejected": -19.459985733032227, "step": 1200 }, { "epoch": 0.55, "eval_logits/chosen": -2.363281011581421, "eval_logits/rejected": -2.074836254119873, "eval_logps/chosen": -81.21025085449219, "eval_logps/rejected": -95.7979736328125, "eval_loss": 0.00877679605036974, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.48774254322052, "eval_rewards/margins": 17.156078338623047, "eval_rewards/rejected": -16.6683349609375, "eval_runtime": 181.3409, "eval_samples_per_second": 15.782, "eval_steps_per_second": 0.987, "step": 1200 }, { "epoch": 0.55, "learning_rate": 2.719431760527651e-07, "logits/chosen": -2.6893982887268066, "logits/rejected": -2.37581729888916, "logps/chosen": -78.99661254882812, "logps/rejected": -94.20263671875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 0.6030126214027405, "rewards/margins": 16.686172485351562, "rewards/rejected": -16.083162307739258, "step": 1210 }, { "epoch": 0.56, "learning_rate": 2.7143581938102483e-07, "logits/chosen": -2.6815974712371826, "logits/rejected": -2.3048057556152344, "logps/chosen": -86.3680648803711, "logps/rejected": -98.19645690917969, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 0.6901730298995972, "rewards/margins": 18.03385353088379, "rewards/rejected": -17.343679428100586, "step": 1220 }, { "epoch": 0.56, "learning_rate": 2.709284627092846e-07, "logits/chosen": -2.640092372894287, "logits/rejected": -2.314068555831909, "logps/chosen": -82.85084533691406, "logps/rejected": -100.60610961914062, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 1.2558422088623047, "rewards/margins": 18.57790184020996, "rewards/rejected": -17.32206153869629, "step": 1230 }, { "epoch": 0.57, "learning_rate": 2.7042110603754436e-07, "logits/chosen": -2.6827239990234375, "logits/rejected": -2.3317859172821045, "logps/chosen": -89.39351654052734, "logps/rejected": -101.48863220214844, "loss": 0.0063, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6145270466804504, "rewards/margins": 18.153644561767578, "rewards/rejected": -17.539119720458984, "step": 1240 }, { "epoch": 0.57, "learning_rate": 2.6991374936580416e-07, "logits/chosen": -2.6788437366485596, "logits/rejected": -2.354506731033325, "logps/chosen": -86.7813491821289, "logps/rejected": -105.3145980834961, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.027428722009062767, "rewards/margins": 18.560232162475586, "rewards/rejected": -18.587661743164062, "step": 1250 }, { "epoch": 0.58, "learning_rate": 2.694063926940639e-07, "logits/chosen": -2.668999433517456, "logits/rejected": -2.299983024597168, "logps/chosen": -89.893798828125, "logps/rejected": -98.39214324951172, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 0.5113745927810669, "rewards/margins": 18.134389877319336, "rewards/rejected": -17.62301254272461, "step": 1260 }, { "epoch": 0.58, "learning_rate": 2.688990360223237e-07, "logits/chosen": -2.679840564727783, "logits/rejected": -2.3709557056427, "logps/chosen": -81.5782699584961, "logps/rejected": -102.47004699707031, "loss": 0.0066, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2847243845462799, "rewards/margins": 18.76095199584961, "rewards/rejected": -19.045677185058594, "step": 1270 }, { "epoch": 0.58, "learning_rate": 2.6839167935058343e-07, "logits/chosen": -2.6793510913848877, "logits/rejected": -2.393840789794922, "logps/chosen": -79.99113464355469, "logps/rejected": -103.0796127319336, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -0.06277158111333847, "rewards/margins": 17.794504165649414, "rewards/rejected": -17.85727310180664, "step": 1280 }, { "epoch": 0.59, "learning_rate": 2.678843226788432e-07, "logits/chosen": -2.6817729473114014, "logits/rejected": -2.35050630569458, "logps/chosen": -83.2406997680664, "logps/rejected": -104.15470123291016, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 0.27611058950424194, "rewards/margins": 19.040605545043945, "rewards/rejected": -18.764493942260742, "step": 1290 }, { "epoch": 0.59, "learning_rate": 2.6737696600710296e-07, "logits/chosen": -2.688805103302002, "logits/rejected": -2.319943428039551, "logps/chosen": -85.67652130126953, "logps/rejected": -102.46150970458984, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 0.4964020252227783, "rewards/margins": 19.496139526367188, "rewards/rejected": -18.999738693237305, "step": 1300 }, { "epoch": 0.59, "eval_logits/chosen": -2.3622865676879883, "eval_logits/rejected": -2.0745723247528076, "eval_logps/chosen": -81.43468475341797, "eval_logps/rejected": -99.47993469238281, "eval_loss": 0.008666100911796093, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.37553170323371887, "eval_rewards/margins": 18.88484764099121, "eval_rewards/rejected": -18.509315490722656, "eval_runtime": 199.9196, "eval_samples_per_second": 14.316, "eval_steps_per_second": 0.895, "step": 1300 }, { "epoch": 0.6, "learning_rate": 2.6686960933536276e-07, "logits/chosen": -2.6695351600646973, "logits/rejected": -2.3839993476867676, "logps/chosen": -80.71315002441406, "logps/rejected": -106.0511245727539, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 0.6074424982070923, "rewards/margins": 19.361095428466797, "rewards/rejected": -18.753652572631836, "step": 1310 }, { "epoch": 0.6, "learning_rate": 2.663622526636225e-07, "logits/chosen": -2.6592440605163574, "logits/rejected": -2.398927688598633, "logps/chosen": -78.87480163574219, "logps/rejected": -102.81874084472656, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5340367555618286, "rewards/margins": 18.07616424560547, "rewards/rejected": -18.610200881958008, "step": 1320 }, { "epoch": 0.61, "learning_rate": 2.658548959918823e-07, "logits/chosen": -2.66070818901062, "logits/rejected": -2.316807746887207, "logps/chosen": -89.73442840576172, "logps/rejected": -103.236572265625, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.4701521396636963, "rewards/margins": 19.30183982849121, "rewards/rejected": -18.83168601989746, "step": 1330 }, { "epoch": 0.61, "learning_rate": 2.6534753932014203e-07, "logits/chosen": -2.6665468215942383, "logits/rejected": -2.348203182220459, "logps/chosen": -80.92488098144531, "logps/rejected": -100.31684875488281, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 0.43299055099487305, "rewards/margins": 18.23300552368164, "rewards/rejected": -17.80001449584961, "step": 1340 }, { "epoch": 0.62, "learning_rate": 2.648401826484018e-07, "logits/chosen": -2.7024118900299072, "logits/rejected": -2.271944284439087, "logps/chosen": -86.07228088378906, "logps/rejected": -103.69766998291016, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 0.7772833108901978, "rewards/margins": 19.974210739135742, "rewards/rejected": -19.196928024291992, "step": 1350 }, { "epoch": 0.62, "learning_rate": 2.6433282597666156e-07, "logits/chosen": -2.709890842437744, "logits/rejected": -2.3020975589752197, "logps/chosen": -86.97416687011719, "logps/rejected": -102.81546783447266, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 0.6950734257698059, "rewards/margins": 20.670360565185547, "rewards/rejected": -19.97528648376465, "step": 1360 }, { "epoch": 0.63, "learning_rate": 2.6382546930492135e-07, "logits/chosen": -2.684504747390747, "logits/rejected": -2.363217830657959, "logps/chosen": -81.0881576538086, "logps/rejected": -104.67891693115234, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 0.2711435854434967, "rewards/margins": 19.803112030029297, "rewards/rejected": -19.53196907043457, "step": 1370 }, { "epoch": 0.63, "learning_rate": 2.633181126331811e-07, "logits/chosen": -2.6914682388305664, "logits/rejected": -2.4040451049804688, "logps/chosen": -78.05017852783203, "logps/rejected": -100.56519317626953, "loss": 0.0051, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.06084303930401802, "rewards/margins": 18.42987632751465, "rewards/rejected": -18.490718841552734, "step": 1380 }, { "epoch": 0.63, "learning_rate": 2.628107559614409e-07, "logits/chosen": -2.6868820190429688, "logits/rejected": -2.340902805328369, "logps/chosen": -85.12031555175781, "logps/rejected": -102.28285217285156, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 0.799413800239563, "rewards/margins": 20.09952163696289, "rewards/rejected": -19.300106048583984, "step": 1390 }, { "epoch": 0.64, "learning_rate": 2.6230339928970063e-07, "logits/chosen": -2.6838936805725098, "logits/rejected": -2.403555393218994, "logps/chosen": -79.46581268310547, "logps/rejected": -104.35472106933594, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 0.13009747862815857, "rewards/margins": 21.277833938598633, "rewards/rejected": -21.14773941040039, "step": 1400 }, { "epoch": 0.64, "eval_logits/chosen": -2.373791456222534, "eval_logits/rejected": -2.081942558288574, "eval_logps/chosen": -81.95059967041016, "eval_logps/rejected": -103.5729751586914, "eval_loss": 0.008585774339735508, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.11757203936576843, "eval_rewards/margins": 20.67340850830078, "eval_rewards/rejected": -20.55583953857422, "eval_runtime": 187.555, "eval_samples_per_second": 15.26, "eval_steps_per_second": 0.954, "step": 1400 }, { "epoch": 0.64, "learning_rate": 2.617960426179604e-07, "logits/chosen": -2.6914727687835693, "logits/rejected": -2.357975959777832, "logps/chosen": -86.02412414550781, "logps/rejected": -107.50175476074219, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 1.1850025653839111, "rewards/margins": 21.703245162963867, "rewards/rejected": -20.51824378967285, "step": 1410 }, { "epoch": 0.65, "learning_rate": 2.6128868594622016e-07, "logits/chosen": -2.6898040771484375, "logits/rejected": -2.402202844619751, "logps/chosen": -82.82209777832031, "logps/rejected": -102.64483642578125, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.36362993717193604, "rewards/margins": 18.62929916381836, "rewards/rejected": -18.992929458618164, "step": 1420 }, { "epoch": 0.65, "learning_rate": 2.6078132927447995e-07, "logits/chosen": -2.7144572734832764, "logits/rejected": -2.388233184814453, "logps/chosen": -82.55753326416016, "logps/rejected": -104.770263671875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.6140985488891602, "rewards/margins": 20.62877082824707, "rewards/rejected": -21.242870330810547, "step": 1430 }, { "epoch": 0.66, "learning_rate": 2.602739726027397e-07, "logits/chosen": -2.6879756450653076, "logits/rejected": -2.298889636993408, "logps/chosen": -84.464111328125, "logps/rejected": -109.27188873291016, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.6033824682235718, "rewards/margins": 21.29957389831543, "rewards/rejected": -21.902957916259766, "step": 1440 }, { "epoch": 0.66, "learning_rate": 2.597666159309995e-07, "logits/chosen": -2.6867659091949463, "logits/rejected": -2.35931396484375, "logps/chosen": -83.69949340820312, "logps/rejected": -104.06939697265625, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -0.11751063913106918, "rewards/margins": 19.827911376953125, "rewards/rejected": -19.945423126220703, "step": 1450 }, { "epoch": 0.67, "learning_rate": 2.5925925925925923e-07, "logits/chosen": -2.6745073795318604, "logits/rejected": -2.336707592010498, "logps/chosen": -82.30137634277344, "logps/rejected": -101.75772857666016, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -0.11277500540018082, "rewards/margins": 18.864107131958008, "rewards/rejected": -18.976879119873047, "step": 1460 }, { "epoch": 0.67, "learning_rate": 2.58751902587519e-07, "logits/chosen": -2.700143814086914, "logits/rejected": -2.3049635887145996, "logps/chosen": -86.68960571289062, "logps/rejected": -101.96650695800781, "loss": 0.0065, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.750586986541748, "rewards/margins": 19.692794799804688, "rewards/rejected": -18.94220733642578, "step": 1470 }, { "epoch": 0.68, "learning_rate": 2.5824454591577876e-07, "logits/chosen": -2.6848690509796143, "logits/rejected": -2.341787576675415, "logps/chosen": -81.72432708740234, "logps/rejected": -100.2541732788086, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 0.5769522190093994, "rewards/margins": 19.891284942626953, "rewards/rejected": -19.3143310546875, "step": 1480 }, { "epoch": 0.68, "learning_rate": 2.5773718924403855e-07, "logits/chosen": -2.6646339893341064, "logits/rejected": -2.3341898918151855, "logps/chosen": -82.93865203857422, "logps/rejected": -104.23873138427734, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.15357767045497894, "rewards/margins": 19.931514739990234, "rewards/rejected": -20.085092544555664, "step": 1490 }, { "epoch": 0.68, "learning_rate": 2.572298325722983e-07, "logits/chosen": -2.6633503437042236, "logits/rejected": -2.3143298625946045, "logps/chosen": -87.14234161376953, "logps/rejected": -105.46986389160156, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.3873421847820282, "rewards/margins": 20.57369041442871, "rewards/rejected": -20.186349868774414, "step": 1500 }, { "epoch": 0.68, "eval_logits/chosen": -2.366745710372925, "eval_logits/rejected": -2.081284761428833, "eval_logps/chosen": -81.87535095214844, "eval_logps/rejected": -102.60922241210938, "eval_loss": 0.008870109915733337, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.15519432723522186, "eval_rewards/margins": 20.229154586791992, "eval_rewards/rejected": -20.07396125793457, "eval_runtime": 205.1014, "eval_samples_per_second": 13.954, "eval_steps_per_second": 0.873, "step": 1500 }, { "epoch": 0.69, "learning_rate": 2.567224759005581e-07, "logits/chosen": -2.6701927185058594, "logits/rejected": -2.2988247871398926, "logps/chosen": -84.62726593017578, "logps/rejected": -99.13783264160156, "loss": 0.0035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.1557789146900177, "rewards/margins": 17.74861717224121, "rewards/rejected": -17.904396057128906, "step": 1510 }, { "epoch": 0.69, "learning_rate": 2.5621511922881783e-07, "logits/chosen": -2.7037365436553955, "logits/rejected": -2.320270538330078, "logps/chosen": -79.28900146484375, "logps/rejected": -102.04307556152344, "loss": 0.0042, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1656215190887451, "rewards/margins": 20.12354850769043, "rewards/rejected": -18.957927703857422, "step": 1520 }, { "epoch": 0.7, "learning_rate": 2.557077625570776e-07, "logits/chosen": -2.71802020072937, "logits/rejected": -2.435358762741089, "logps/chosen": -79.87864685058594, "logps/rejected": -103.96000671386719, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -0.09602653980255127, "rewards/margins": 18.438899993896484, "rewards/rejected": -18.53492546081543, "step": 1530 }, { "epoch": 0.7, "learning_rate": 2.5520040588533736e-07, "logits/chosen": -2.716503143310547, "logits/rejected": -2.3265323638916016, "logps/chosen": -84.27273559570312, "logps/rejected": -101.59284973144531, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 0.6025292873382568, "rewards/margins": 21.231273651123047, "rewards/rejected": -20.62874412536621, "step": 1540 }, { "epoch": 0.71, "learning_rate": 2.5469304921359715e-07, "logits/chosen": -2.715407371520996, "logits/rejected": -2.398707628250122, "logps/chosen": -81.1008071899414, "logps/rejected": -102.95610046386719, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 0.041560351848602295, "rewards/margins": 18.6967716217041, "rewards/rejected": -18.65521240234375, "step": 1550 }, { "epoch": 0.71, "learning_rate": 2.541856925418569e-07, "logits/chosen": -2.6776270866394043, "logits/rejected": -2.375828266143799, "logps/chosen": -82.5616455078125, "logps/rejected": -103.9339370727539, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -0.5894964933395386, "rewards/margins": 18.874555587768555, "rewards/rejected": -19.464054107666016, "step": 1560 }, { "epoch": 0.72, "learning_rate": 2.536783358701167e-07, "logits/chosen": -2.7209715843200684, "logits/rejected": -2.3514397144317627, "logps/chosen": -85.16648864746094, "logps/rejected": -103.68109130859375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 0.0002448558807373047, "rewards/margins": 19.584938049316406, "rewards/rejected": -19.584693908691406, "step": 1570 }, { "epoch": 0.72, "learning_rate": 2.5317097919837643e-07, "logits/chosen": -2.6864490509033203, "logits/rejected": -2.4027414321899414, "logps/chosen": -83.934326171875, "logps/rejected": -110.62479400634766, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.7959393262863159, "rewards/margins": 19.69265365600586, "rewards/rejected": -20.48859405517578, "step": 1580 }, { "epoch": 0.73, "learning_rate": 2.526636225266362e-07, "logits/chosen": -2.7191176414489746, "logits/rejected": -2.3462727069854736, "logps/chosen": -83.23155212402344, "logps/rejected": -110.19583892822266, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.2033298909664154, "rewards/margins": 20.42452049255371, "rewards/rejected": -20.627853393554688, "step": 1590 }, { "epoch": 0.73, "learning_rate": 2.5215626585489596e-07, "logits/chosen": -2.7117838859558105, "logits/rejected": -2.3902933597564697, "logps/chosen": -79.0739517211914, "logps/rejected": -100.95878601074219, "loss": 0.0027, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7403717637062073, "rewards/margins": 19.751811981201172, "rewards/rejected": -20.492183685302734, "step": 1600 }, { "epoch": 0.73, "eval_logits/chosen": -2.40783429145813, "eval_logits/rejected": -2.1178596019744873, "eval_logps/chosen": -83.19082641601562, "eval_logps/rejected": -104.05694580078125, "eval_loss": 0.00894769188016653, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": -0.50254225730896, "eval_rewards/margins": 20.295278549194336, "eval_rewards/rejected": -20.797822952270508, "eval_runtime": 228.9664, "eval_samples_per_second": 12.5, "eval_steps_per_second": 0.782, "step": 1600 }, { "epoch": 0.73, "learning_rate": 2.5164890918315575e-07, "logits/chosen": -2.7188661098480225, "logits/rejected": -2.397878646850586, "logps/chosen": -85.95389556884766, "logps/rejected": -105.6358413696289, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.500200629234314, "rewards/margins": 19.843097686767578, "rewards/rejected": -21.34329605102539, "step": 1610 }, { "epoch": 0.74, "learning_rate": 2.511415525114155e-07, "logits/chosen": -2.7342753410339355, "logits/rejected": -2.4542341232299805, "logps/chosen": -81.80928802490234, "logps/rejected": -102.57430267333984, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -0.5822039842605591, "rewards/margins": 19.97235870361328, "rewards/rejected": -20.554563522338867, "step": 1620 }, { "epoch": 0.74, "learning_rate": 2.506341958396753e-07, "logits/chosen": -2.693725347518921, "logits/rejected": -2.342667579650879, "logps/chosen": -89.35189056396484, "logps/rejected": -106.10188293457031, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -1.5038349628448486, "rewards/margins": 19.663232803344727, "rewards/rejected": -21.167070388793945, "step": 1630 }, { "epoch": 0.75, "learning_rate": 2.5012683916793503e-07, "logits/chosen": -2.717991590499878, "logits/rejected": -2.3450677394866943, "logps/chosen": -88.69110107421875, "logps/rejected": -108.9749984741211, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.31400182843208313, "rewards/margins": 21.922719955444336, "rewards/rejected": -21.60871696472168, "step": 1640 }, { "epoch": 0.75, "learning_rate": 2.496194824961948e-07, "logits/chosen": -2.6893081665039062, "logits/rejected": -2.387824296951294, "logps/chosen": -84.20822143554688, "logps/rejected": -108.50709533691406, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.155508354306221, "rewards/margins": 21.575735092163086, "rewards/rejected": -21.731243133544922, "step": 1650 }, { "epoch": 0.76, "learning_rate": 2.4911212582445456e-07, "logits/chosen": -2.7032294273376465, "logits/rejected": -2.3929686546325684, "logps/chosen": -82.45536804199219, "logps/rejected": -108.42997741699219, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.2141615152359009, "rewards/margins": 20.1081485748291, "rewards/rejected": -21.322309494018555, "step": 1660 }, { "epoch": 0.76, "learning_rate": 2.4860476915271435e-07, "logits/chosen": -2.7182183265686035, "logits/rejected": -2.3619496822357178, "logps/chosen": -81.94224548339844, "logps/rejected": -108.93473815917969, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -0.3714309334754944, "rewards/margins": 20.847238540649414, "rewards/rejected": -21.218669891357422, "step": 1670 }, { "epoch": 0.77, "learning_rate": 2.480974124809741e-07, "logits/chosen": -2.7049779891967773, "logits/rejected": -2.273693323135376, "logps/chosen": -89.01725769042969, "logps/rejected": -108.63340759277344, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 0.33846113085746765, "rewards/margins": 20.653535842895508, "rewards/rejected": -20.31507110595703, "step": 1680 }, { "epoch": 0.77, "learning_rate": 2.475900558092339e-07, "logits/chosen": -2.6862072944641113, "logits/rejected": -2.337902069091797, "logps/chosen": -84.24800872802734, "logps/rejected": -101.89933013916016, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 0.006035065744072199, "rewards/margins": 20.554426193237305, "rewards/rejected": -20.54839324951172, "step": 1690 }, { "epoch": 0.78, "learning_rate": 2.4708269913749363e-07, "logits/chosen": -2.6960091590881348, "logits/rejected": -2.3470184803009033, "logps/chosen": -85.79381561279297, "logps/rejected": -106.5449447631836, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.2503994107246399, "rewards/margins": 20.360069274902344, "rewards/rejected": -20.6104679107666, "step": 1700 }, { "epoch": 0.78, "eval_logits/chosen": -2.377317428588867, "eval_logits/rejected": -2.0914950370788574, "eval_logps/chosen": -83.44847869873047, "eval_logps/rejected": -104.5597152709961, "eval_loss": 0.008542221039533615, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.6313691139221191, "eval_rewards/margins": 20.4178409576416, "eval_rewards/rejected": -21.049213409423828, "eval_runtime": 222.4422, "eval_samples_per_second": 12.866, "eval_steps_per_second": 0.805, "step": 1700 }, { "epoch": 0.78, "learning_rate": 2.465753424657534e-07, "logits/chosen": -2.689178705215454, "logits/rejected": -2.357572078704834, "logps/chosen": -83.33004760742188, "logps/rejected": -106.80476379394531, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.26176801323890686, "rewards/margins": 20.559886932373047, "rewards/rejected": -20.8216552734375, "step": 1710 }, { "epoch": 0.79, "learning_rate": 2.4606798579401316e-07, "logits/chosen": -2.67765474319458, "logits/rejected": -2.3367466926574707, "logps/chosen": -89.78765869140625, "logps/rejected": -109.24089050292969, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.24427656829357147, "rewards/margins": 20.040443420410156, "rewards/rejected": -20.284719467163086, "step": 1720 }, { "epoch": 0.79, "learning_rate": 2.4556062912227295e-07, "logits/chosen": -2.6984336376190186, "logits/rejected": -2.3199656009674072, "logps/chosen": -86.07073974609375, "logps/rejected": -103.2815170288086, "loss": 0.0065, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.18969468772411346, "rewards/margins": 19.52256202697754, "rewards/rejected": -19.712255477905273, "step": 1730 }, { "epoch": 0.79, "learning_rate": 2.450532724505327e-07, "logits/chosen": -2.68689227104187, "logits/rejected": -2.3085198402404785, "logps/chosen": -87.46333312988281, "logps/rejected": -103.7766342163086, "loss": 0.0046, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2703358829021454, "rewards/margins": 19.89851188659668, "rewards/rejected": -19.628173828125, "step": 1740 }, { "epoch": 0.8, "learning_rate": 2.445459157787925e-07, "logits/chosen": -2.695403575897217, "logits/rejected": -2.3278603553771973, "logps/chosen": -82.51519775390625, "logps/rejected": -100.03485107421875, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 0.46579962968826294, "rewards/margins": 19.668228149414062, "rewards/rejected": -19.20242691040039, "step": 1750 }, { "epoch": 0.8, "learning_rate": 2.4403855910705223e-07, "logits/chosen": -2.6673686504364014, "logits/rejected": -2.3828070163726807, "logps/chosen": -83.86640167236328, "logps/rejected": -103.9953842163086, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.607195258140564, "rewards/margins": 18.59889793395996, "rewards/rejected": -19.206092834472656, "step": 1760 }, { "epoch": 0.81, "learning_rate": 2.43531202435312e-07, "logits/chosen": -2.6898818016052246, "logits/rejected": -2.2769339084625244, "logps/chosen": -89.10076904296875, "logps/rejected": -102.7822494506836, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.7627326846122742, "rewards/margins": 19.39081573486328, "rewards/rejected": -20.153549194335938, "step": 1770 }, { "epoch": 0.81, "learning_rate": 2.4302384576357176e-07, "logits/chosen": -2.6774814128875732, "logits/rejected": -2.406913995742798, "logps/chosen": -88.16549682617188, "logps/rejected": -111.31996154785156, "loss": 0.0014, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6507390737533569, "rewards/margins": 20.33917999267578, "rewards/rejected": -20.989917755126953, "step": 1780 }, { "epoch": 0.82, "learning_rate": 2.4251648909183155e-07, "logits/chosen": -2.660534381866455, "logits/rejected": -2.352851390838623, "logps/chosen": -87.28363037109375, "logps/rejected": -108.48933410644531, "loss": 0.007, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7637156844139099, "rewards/margins": 21.422103881835938, "rewards/rejected": -20.65838623046875, "step": 1790 }, { "epoch": 0.82, "learning_rate": 2.420091324200913e-07, "logits/chosen": -2.6425793170928955, "logits/rejected": -2.3781325817108154, "logps/chosen": -89.24564361572266, "logps/rejected": -110.02349853515625, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -1.4377422332763672, "rewards/margins": 19.52930450439453, "rewards/rejected": -20.9670467376709, "step": 1800 }, { "epoch": 0.82, "eval_logits/chosen": -2.350355863571167, "eval_logits/rejected": -2.0670278072357178, "eval_logps/chosen": -83.74292755126953, "eval_logps/rejected": -105.12784576416016, "eval_loss": 0.008470112457871437, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.7785915732383728, "eval_rewards/margins": 20.554677963256836, "eval_rewards/rejected": -21.333271026611328, "eval_runtime": 211.0705, "eval_samples_per_second": 13.559, "eval_steps_per_second": 0.848, "step": 1800 }, { "epoch": 0.83, "learning_rate": 2.415017757483511e-07, "logits/chosen": -2.681694269180298, "logits/rejected": -2.356346607208252, "logps/chosen": -86.7227783203125, "logps/rejected": -113.36888122558594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.21751625835895538, "rewards/margins": 22.28219985961914, "rewards/rejected": -22.499713897705078, "step": 1810 }, { "epoch": 0.83, "learning_rate": 2.409944190766108e-07, "logits/chosen": -2.6711621284484863, "logits/rejected": -2.3341751098632812, "logps/chosen": -87.3926010131836, "logps/rejected": -110.13679504394531, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -1.5693079233169556, "rewards/margins": 20.17491340637207, "rewards/rejected": -21.744220733642578, "step": 1820 }, { "epoch": 0.84, "learning_rate": 2.404870624048706e-07, "logits/chosen": -2.6786088943481445, "logits/rejected": -2.3714494705200195, "logps/chosen": -84.32047271728516, "logps/rejected": -103.80018615722656, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 0.3425736427307129, "rewards/margins": 20.33951187133789, "rewards/rejected": -19.996936798095703, "step": 1830 }, { "epoch": 0.84, "learning_rate": 2.3997970573313036e-07, "logits/chosen": -2.6681151390075684, "logits/rejected": -2.340208053588867, "logps/chosen": -84.2584457397461, "logps/rejected": -102.01902770996094, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.46443456411361694, "rewards/margins": 19.576797485351562, "rewards/rejected": -20.04123306274414, "step": 1840 }, { "epoch": 0.84, "learning_rate": 2.3947234906139015e-07, "logits/chosen": -2.6787774562835693, "logits/rejected": -2.334022283554077, "logps/chosen": -83.72932434082031, "logps/rejected": -104.73689270019531, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 0.27790117263793945, "rewards/margins": 21.282752990722656, "rewards/rejected": -21.004854202270508, "step": 1850 }, { "epoch": 0.85, "learning_rate": 2.389649923896499e-07, "logits/chosen": -2.6566855907440186, "logits/rejected": -2.348459482192993, "logps/chosen": -82.66558074951172, "logps/rejected": -106.11601257324219, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.7203642725944519, "rewards/margins": 19.92540740966797, "rewards/rejected": -20.645771026611328, "step": 1860 }, { "epoch": 0.85, "learning_rate": 2.384576357179097e-07, "logits/chosen": -2.6382763385772705, "logits/rejected": -2.320930004119873, "logps/chosen": -86.148681640625, "logps/rejected": -108.33302307128906, "loss": 0.0039, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7944689989089966, "rewards/margins": 20.57015609741211, "rewards/rejected": -21.364625930786133, "step": 1870 }, { "epoch": 0.86, "learning_rate": 2.3795027904616943e-07, "logits/chosen": -2.6758155822753906, "logits/rejected": -2.3265469074249268, "logps/chosen": -86.53829193115234, "logps/rejected": -109.7621078491211, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -1.7175718545913696, "rewards/margins": 20.12839126586914, "rewards/rejected": -21.845962524414062, "step": 1880 }, { "epoch": 0.86, "learning_rate": 2.374429223744292e-07, "logits/chosen": -2.6609199047088623, "logits/rejected": -2.3674826622009277, "logps/chosen": -84.27062225341797, "logps/rejected": -107.2642593383789, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -0.20487456023693085, "rewards/margins": 21.165468215942383, "rewards/rejected": -21.37034034729004, "step": 1890 }, { "epoch": 0.87, "learning_rate": 2.3693556570268896e-07, "logits/chosen": -2.671532154083252, "logits/rejected": -2.37229323387146, "logps/chosen": -83.75556945800781, "logps/rejected": -110.1830825805664, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.5575953722000122, "rewards/margins": 20.386972427368164, "rewards/rejected": -20.944564819335938, "step": 1900 }, { "epoch": 0.87, "eval_logits/chosen": -2.3546838760375977, "eval_logits/rejected": -2.0729401111602783, "eval_logps/chosen": -83.68504333496094, "eval_logps/rejected": -107.1366958618164, "eval_loss": 0.008397039957344532, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": -0.7496498823165894, "eval_rewards/margins": 21.588048934936523, "eval_rewards/rejected": -22.337696075439453, "eval_runtime": 180.8618, "eval_samples_per_second": 15.824, "eval_steps_per_second": 0.99, "step": 1900 }, { "epoch": 0.87, "learning_rate": 2.3642820903094873e-07, "logits/chosen": -2.659634590148926, "logits/rejected": -2.2706897258758545, "logps/chosen": -87.94964599609375, "logps/rejected": -112.7286148071289, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 0.09852766990661621, "rewards/margins": 23.662548065185547, "rewards/rejected": -23.564022064208984, "step": 1910 }, { "epoch": 0.88, "learning_rate": 2.359208523592085e-07, "logits/chosen": -2.6884970664978027, "logits/rejected": -2.360994338989258, "logps/chosen": -85.63497161865234, "logps/rejected": -107.09065246582031, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -1.2782135009765625, "rewards/margins": 20.852567672729492, "rewards/rejected": -22.130781173706055, "step": 1920 }, { "epoch": 0.88, "learning_rate": 2.3541349568746826e-07, "logits/chosen": -2.6634912490844727, "logits/rejected": -2.295370101928711, "logps/chosen": -86.70159149169922, "logps/rejected": -107.43186950683594, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -1.8782844543457031, "rewards/margins": 20.963626861572266, "rewards/rejected": -22.84191131591797, "step": 1930 }, { "epoch": 0.89, "learning_rate": 2.3490613901572803e-07, "logits/chosen": -2.6447227001190186, "logits/rejected": -2.385751247406006, "logps/chosen": -79.87579345703125, "logps/rejected": -109.44964599609375, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -1.060900092124939, "rewards/margins": 21.15647315979004, "rewards/rejected": -22.217370986938477, "step": 1940 }, { "epoch": 0.89, "learning_rate": 2.343987823439878e-07, "logits/chosen": -2.6590116024017334, "logits/rejected": -2.309845209121704, "logps/chosen": -84.08464813232422, "logps/rejected": -109.25309753417969, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.7186194658279419, "rewards/margins": 22.74148178100586, "rewards/rejected": -23.460102081298828, "step": 1950 }, { "epoch": 0.89, "learning_rate": 2.3389142567224756e-07, "logits/chosen": -2.6623053550720215, "logits/rejected": -2.397179126739502, "logps/chosen": -87.073974609375, "logps/rejected": -117.36470794677734, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.057529091835022, "rewards/margins": 22.958511352539062, "rewards/rejected": -24.016040802001953, "step": 1960 }, { "epoch": 0.9, "learning_rate": 2.3338406900050733e-07, "logits/chosen": -2.6633222103118896, "logits/rejected": -2.3102493286132812, "logps/chosen": -87.67819213867188, "logps/rejected": -113.84678649902344, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -0.0529528371989727, "rewards/margins": 24.872621536254883, "rewards/rejected": -24.925573348999023, "step": 1970 }, { "epoch": 0.9, "learning_rate": 2.328767123287671e-07, "logits/chosen": -2.6640639305114746, "logits/rejected": -2.3144893646240234, "logps/chosen": -89.18020629882812, "logps/rejected": -114.86326599121094, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.8851861953735352, "rewards/margins": 23.453914642333984, "rewards/rejected": -24.339101791381836, "step": 1980 }, { "epoch": 0.91, "learning_rate": 2.3236935565702686e-07, "logits/chosen": -2.687596321105957, "logits/rejected": -2.3537380695343018, "logps/chosen": -86.08324432373047, "logps/rejected": -113.84188079833984, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.9546561241149902, "rewards/margins": 22.700944900512695, "rewards/rejected": -23.65559959411621, "step": 1990 }, { "epoch": 0.91, "learning_rate": 2.3186199898528663e-07, "logits/chosen": -2.6602392196655273, "logits/rejected": -2.353370428085327, "logps/chosen": -83.919677734375, "logps/rejected": -110.4391098022461, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -0.7964860200881958, "rewards/margins": 21.585779190063477, "rewards/rejected": -22.382266998291016, "step": 2000 }, { "epoch": 0.91, "eval_logits/chosen": -2.3471853733062744, "eval_logits/rejected": -2.065129518508911, "eval_logps/chosen": -83.81088256835938, "eval_logps/rejected": -108.06615447998047, "eval_loss": 0.008573910221457481, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.8125662207603455, "eval_rewards/margins": 21.98985481262207, "eval_rewards/rejected": -22.802419662475586, "eval_runtime": 202.0176, "eval_samples_per_second": 14.167, "eval_steps_per_second": 0.886, "step": 2000 }, { "epoch": 0.92, "learning_rate": 2.313546423135464e-07, "logits/chosen": -2.6627299785614014, "logits/rejected": -2.3212547302246094, "logps/chosen": -89.50450897216797, "logps/rejected": -105.97727966308594, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -0.550868570804596, "rewards/margins": 21.324947357177734, "rewards/rejected": -21.87581443786621, "step": 2010 }, { "epoch": 0.92, "learning_rate": 2.3084728564180616e-07, "logits/chosen": -2.6087136268615723, "logits/rejected": -2.3505330085754395, "logps/chosen": -81.36959075927734, "logps/rejected": -105.24412536621094, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.033040571957826614, "rewards/margins": 19.920692443847656, "rewards/rejected": -19.953731536865234, "step": 2020 }, { "epoch": 0.93, "learning_rate": 2.3033992897006593e-07, "logits/chosen": -2.6397414207458496, "logits/rejected": -2.268911838531494, "logps/chosen": -90.52021789550781, "logps/rejected": -105.94636535644531, "loss": 0.0058, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3317680060863495, "rewards/margins": 19.993911743164062, "rewards/rejected": -20.325679779052734, "step": 2030 }, { "epoch": 0.93, "learning_rate": 2.298325722983257e-07, "logits/chosen": -2.6304543018341064, "logits/rejected": -2.2495150566101074, "logps/chosen": -88.17591857910156, "logps/rejected": -107.3923110961914, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -0.1817486584186554, "rewards/margins": 20.980478286743164, "rewards/rejected": -21.162227630615234, "step": 2040 }, { "epoch": 0.94, "learning_rate": 2.2932521562658546e-07, "logits/chosen": -2.6193137168884277, "logits/rejected": -2.255574941635132, "logps/chosen": -84.45220184326172, "logps/rejected": -104.55680084228516, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -0.18396982550621033, "rewards/margins": 20.578176498413086, "rewards/rejected": -20.762147903442383, "step": 2050 }, { "epoch": 0.94, "learning_rate": 2.2881785895484523e-07, "logits/chosen": -2.609607458114624, "logits/rejected": -2.27685284614563, "logps/chosen": -87.35172271728516, "logps/rejected": -102.37759399414062, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -0.004270422272384167, "rewards/margins": 20.220355987548828, "rewards/rejected": -20.224626541137695, "step": 2060 }, { "epoch": 0.94, "learning_rate": 2.28310502283105e-07, "logits/chosen": -2.601140022277832, "logits/rejected": -2.3076891899108887, "logps/chosen": -78.9570083618164, "logps/rejected": -102.34449768066406, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 0.08370795100927353, "rewards/margins": 20.209997177124023, "rewards/rejected": -20.12628746032715, "step": 2070 }, { "epoch": 0.95, "learning_rate": 2.2780314561136476e-07, "logits/chosen": -2.611623764038086, "logits/rejected": -2.2876951694488525, "logps/chosen": -84.95990753173828, "logps/rejected": -102.3360824584961, "loss": 0.0029, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.44320592284202576, "rewards/margins": 19.913740158081055, "rewards/rejected": -20.356945037841797, "step": 2080 }, { "epoch": 0.95, "learning_rate": 2.2729578893962453e-07, "logits/chosen": -2.6101326942443848, "logits/rejected": -2.326184034347534, "logps/chosen": -84.60515594482422, "logps/rejected": -105.9177474975586, "loss": 0.0023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.24188712239265442, "rewards/margins": 21.254962921142578, "rewards/rejected": -21.49685287475586, "step": 2090 }, { "epoch": 0.96, "learning_rate": 2.267884322678843e-07, "logits/chosen": -2.6193861961364746, "logits/rejected": -2.2853081226348877, "logps/chosen": -81.70816040039062, "logps/rejected": -101.28323364257812, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -0.4797388017177582, "rewards/margins": 20.623106002807617, "rewards/rejected": -21.102848052978516, "step": 2100 }, { "epoch": 0.96, "eval_logits/chosen": -2.313713550567627, "eval_logits/rejected": -2.0362889766693115, "eval_logps/chosen": -83.76630401611328, "eval_logps/rejected": -106.13714599609375, "eval_loss": 0.008187840692698956, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": -0.7902824282646179, "eval_rewards/margins": 21.047639846801758, "eval_rewards/rejected": -21.837926864624023, "eval_runtime": 180.8956, "eval_samples_per_second": 15.821, "eval_steps_per_second": 0.99, "step": 2100 }, { "epoch": 0.96, "learning_rate": 2.2628107559614406e-07, "logits/chosen": -2.6243438720703125, "logits/rejected": -2.371274471282959, "logps/chosen": -85.10074615478516, "logps/rejected": -111.37959289550781, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -1.5252538919448853, "rewards/margins": 20.681808471679688, "rewards/rejected": -22.20706558227539, "step": 2110 }, { "epoch": 0.97, "learning_rate": 2.2577371892440383e-07, "logits/chosen": -2.6191418170928955, "logits/rejected": -2.3203933238983154, "logps/chosen": -79.67707824707031, "logps/rejected": -107.4069595336914, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 0.16583606600761414, "rewards/margins": 22.663766860961914, "rewards/rejected": -22.497928619384766, "step": 2120 }, { "epoch": 0.97, "learning_rate": 2.252663622526636e-07, "logits/chosen": -2.637124538421631, "logits/rejected": -2.3179469108581543, "logps/chosen": -86.39234924316406, "logps/rejected": -110.28694915771484, "loss": 0.0047, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5163416862487793, "rewards/margins": 21.511493682861328, "rewards/rejected": -22.027835845947266, "step": 2130 }, { "epoch": 0.98, "learning_rate": 2.2475900558092336e-07, "logits/chosen": -2.6215131282806396, "logits/rejected": -2.3205182552337646, "logps/chosen": -82.3526840209961, "logps/rejected": -107.95674133300781, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.4713154435157776, "rewards/margins": 21.480653762817383, "rewards/rejected": -21.951969146728516, "step": 2140 }, { "epoch": 0.98, "learning_rate": 2.2425164890918313e-07, "logits/chosen": -2.64332914352417, "logits/rejected": -2.268400192260742, "logps/chosen": -92.27891540527344, "logps/rejected": -109.79569244384766, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 0.3803238570690155, "rewards/margins": 22.256778717041016, "rewards/rejected": -21.87645149230957, "step": 2150 }, { "epoch": 0.99, "learning_rate": 2.237442922374429e-07, "logits/chosen": -2.623831033706665, "logits/rejected": -2.303844690322876, "logps/chosen": -81.09654235839844, "logps/rejected": -104.95540618896484, "loss": 0.0027, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.731011688709259, "rewards/margins": 20.41283416748047, "rewards/rejected": -21.143844604492188, "step": 2160 }, { "epoch": 0.99, "learning_rate": 2.2323693556570266e-07, "logits/chosen": -2.611250638961792, "logits/rejected": -2.2356784343719482, "logps/chosen": -88.5833969116211, "logps/rejected": -112.34712219238281, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 0.04851169511675835, "rewards/margins": 21.59535789489746, "rewards/rejected": -21.546844482421875, "step": 2170 }, { "epoch": 1.0, "learning_rate": 2.2272957889396242e-07, "logits/chosen": -2.617006778717041, "logits/rejected": -2.2967631816864014, "logps/chosen": -82.383544921875, "logps/rejected": -107.3962173461914, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -0.15143811702728271, "rewards/margins": 20.753551483154297, "rewards/rejected": -20.904991149902344, "step": 2180 }, { "epoch": 1.0, "learning_rate": 2.222222222222222e-07, "logits/chosen": -2.618882656097412, "logits/rejected": -2.305601119995117, "logps/chosen": -85.83949279785156, "logps/rejected": -104.82088470458984, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.9692584872245789, "rewards/margins": 20.660816192626953, "rewards/rejected": -21.63007354736328, "step": 2190 }, { "epoch": 1.0, "learning_rate": 2.2171486555048196e-07, "logits/chosen": -2.626828670501709, "logits/rejected": -2.2737765312194824, "logps/chosen": -89.91502380371094, "logps/rejected": -110.86567687988281, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 0.6803728938102722, "rewards/margins": 21.999923706054688, "rewards/rejected": -21.31955337524414, "step": 2200 }, { "epoch": 1.0, "eval_logits/chosen": -2.307363271713257, "eval_logits/rejected": -2.0302772521972656, "eval_logps/chosen": -83.08345794677734, "eval_logps/rejected": -105.35157012939453, "eval_loss": 0.007900677621364594, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.4488579034805298, "eval_rewards/margins": 20.996278762817383, "eval_rewards/rejected": -21.44513511657715, "eval_runtime": 193.0055, "eval_samples_per_second": 14.829, "eval_steps_per_second": 0.927, "step": 2200 }, { "epoch": 1.01, "learning_rate": 2.2120750887874172e-07, "logits/chosen": -2.609365940093994, "logits/rejected": -2.2971606254577637, "logps/chosen": -88.7730484008789, "logps/rejected": -110.3308334350586, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -0.2792726457118988, "rewards/margins": 21.683462142944336, "rewards/rejected": -21.962736129760742, "step": 2210 }, { "epoch": 1.01, "learning_rate": 2.207001522070015e-07, "logits/chosen": -2.605121374130249, "logits/rejected": -2.2930240631103516, "logps/chosen": -86.51280212402344, "logps/rejected": -111.41705322265625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.36012354493141174, "rewards/margins": 21.384815216064453, "rewards/rejected": -21.74493980407715, "step": 2220 }, { "epoch": 1.02, "learning_rate": 2.2019279553526126e-07, "logits/chosen": -2.610063076019287, "logits/rejected": -2.233963966369629, "logps/chosen": -85.30977630615234, "logps/rejected": -102.55244445800781, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.3474958539009094, "rewards/margins": 21.142913818359375, "rewards/rejected": -21.49040985107422, "step": 2230 }, { "epoch": 1.02, "learning_rate": 2.1968543886352102e-07, "logits/chosen": -2.601090908050537, "logits/rejected": -2.3093574047088623, "logps/chosen": -80.83834838867188, "logps/rejected": -109.14097595214844, "loss": 0.0036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.176016926765442, "rewards/margins": 21.685558319091797, "rewards/rejected": -22.861574172973633, "step": 2240 }, { "epoch": 1.03, "learning_rate": 2.191780821917808e-07, "logits/chosen": -2.6159205436706543, "logits/rejected": -2.342970371246338, "logps/chosen": -80.5876693725586, "logps/rejected": -107.30357360839844, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.8884702920913696, "rewards/margins": 21.684438705444336, "rewards/rejected": -22.572908401489258, "step": 2250 }, { "epoch": 1.03, "learning_rate": 2.1867072552004056e-07, "logits/chosen": -2.618044376373291, "logits/rejected": -2.3107223510742188, "logps/chosen": -85.27728271484375, "logps/rejected": -109.5432357788086, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.7985124588012695, "rewards/margins": 22.301162719726562, "rewards/rejected": -23.099674224853516, "step": 2260 }, { "epoch": 1.04, "learning_rate": 2.1816336884830032e-07, "logits/chosen": -2.607043743133545, "logits/rejected": -2.3084537982940674, "logps/chosen": -80.77493286132812, "logps/rejected": -107.9861068725586, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.3011903464794159, "rewards/margins": 22.64898109436035, "rewards/rejected": -22.950170516967773, "step": 2270 }, { "epoch": 1.04, "learning_rate": 2.176560121765601e-07, "logits/chosen": -2.632981777191162, "logits/rejected": -2.319608211517334, "logps/chosen": -87.87213134765625, "logps/rejected": -110.3426513671875, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.10271458327770233, "rewards/margins": 22.240734100341797, "rewards/rejected": -22.138019561767578, "step": 2280 }, { "epoch": 1.05, "learning_rate": 2.1714865550481986e-07, "logits/chosen": -2.6590113639831543, "logits/rejected": -2.2972750663757324, "logps/chosen": -86.08259582519531, "logps/rejected": -109.5566635131836, "loss": 0.0052, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3704858124256134, "rewards/margins": 22.825572967529297, "rewards/rejected": -23.196060180664062, "step": 2290 }, { "epoch": 1.05, "learning_rate": 2.1664129883307962e-07, "logits/chosen": -2.6190247535705566, "logits/rejected": -2.3475418090820312, "logps/chosen": -83.31871032714844, "logps/rejected": -116.7635269165039, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.075631856918335, "rewards/margins": 22.547637939453125, "rewards/rejected": -23.62327003479004, "step": 2300 }, { "epoch": 1.05, "eval_logits/chosen": -2.3260116577148438, "eval_logits/rejected": -2.047701597213745, "eval_logps/chosen": -84.43914794921875, "eval_logps/rejected": -107.98521423339844, "eval_loss": 0.008207444101572037, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": -1.1267040967941284, "eval_rewards/margins": 21.635251998901367, "eval_rewards/rejected": -22.76195526123047, "eval_runtime": 223.8202, "eval_samples_per_second": 12.787, "eval_steps_per_second": 0.8, "step": 2300 }, { "epoch": 1.05, "learning_rate": 2.161339421613394e-07, "logits/chosen": -2.6420464515686035, "logits/rejected": -2.3314411640167236, "logps/chosen": -83.9236831665039, "logps/rejected": -115.30757904052734, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -1.8800777196884155, "rewards/margins": 21.744577407836914, "rewards/rejected": -23.624652862548828, "step": 2310 }, { "epoch": 1.06, "learning_rate": 2.1562658548959916e-07, "logits/chosen": -2.658714532852173, "logits/rejected": -2.285980463027954, "logps/chosen": -89.01048278808594, "logps/rejected": -112.3675308227539, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -1.0313222408294678, "rewards/margins": 23.517837524414062, "rewards/rejected": -24.549158096313477, "step": 2320 }, { "epoch": 1.06, "learning_rate": 2.1511922881785892e-07, "logits/chosen": -2.6318235397338867, "logits/rejected": -2.347407817840576, "logps/chosen": -82.6306381225586, "logps/rejected": -110.7253189086914, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.6225992441177368, "rewards/margins": 20.411014556884766, "rewards/rejected": -22.033615112304688, "step": 2330 }, { "epoch": 1.07, "learning_rate": 2.146118721461187e-07, "logits/chosen": -2.635178804397583, "logits/rejected": -2.309039354324341, "logps/chosen": -88.56755828857422, "logps/rejected": -111.56428527832031, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.674777626991272, "rewards/margins": 21.995031356811523, "rewards/rejected": -22.66980743408203, "step": 2340 }, { "epoch": 1.07, "learning_rate": 2.1410451547437846e-07, "logits/chosen": -2.621439218521118, "logits/rejected": -2.3563952445983887, "logps/chosen": -83.00621032714844, "logps/rejected": -115.13410949707031, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -0.8916094899177551, "rewards/margins": 22.60810661315918, "rewards/rejected": -23.499717712402344, "step": 2350 }, { "epoch": 1.08, "learning_rate": 2.1359715880263822e-07, "logits/chosen": -2.6506218910217285, "logits/rejected": -2.3272719383239746, "logps/chosen": -87.21891784667969, "logps/rejected": -110.15861511230469, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -1.3753619194030762, "rewards/margins": 22.202041625976562, "rewards/rejected": -23.577402114868164, "step": 2360 }, { "epoch": 1.08, "learning_rate": 2.13089802130898e-07, "logits/chosen": -2.652022123336792, "logits/rejected": -2.3361423015594482, "logps/chosen": -83.00395202636719, "logps/rejected": -113.31111907958984, "loss": 0.0032, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.703661561012268, "rewards/margins": 23.145084381103516, "rewards/rejected": -24.848745346069336, "step": 2370 }, { "epoch": 1.09, "learning_rate": 2.1258244545915776e-07, "logits/chosen": -2.633781909942627, "logits/rejected": -2.3275904655456543, "logps/chosen": -86.6109619140625, "logps/rejected": -112.17244720458984, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -1.1088950634002686, "rewards/margins": 21.805938720703125, "rewards/rejected": -22.914833068847656, "step": 2380 }, { "epoch": 1.09, "learning_rate": 2.1207508878741752e-07, "logits/chosen": -2.64607572555542, "logits/rejected": -2.353994846343994, "logps/chosen": -82.63935852050781, "logps/rejected": -106.2022705078125, "loss": 0.0046, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.921892523765564, "rewards/margins": 22.059276580810547, "rewards/rejected": -23.98116683959961, "step": 2390 }, { "epoch": 1.1, "learning_rate": 2.115677321156773e-07, "logits/chosen": -2.6646969318389893, "logits/rejected": -2.332428455352783, "logps/chosen": -90.0765380859375, "logps/rejected": -118.28413391113281, "loss": 0.0055, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6554048657417297, "rewards/margins": 23.688457489013672, "rewards/rejected": -24.343860626220703, "step": 2400 }, { "epoch": 1.1, "eval_logits/chosen": -2.338355779647827, "eval_logits/rejected": -2.0616254806518555, "eval_logps/chosen": -85.17961883544922, "eval_logps/rejected": -110.57490539550781, "eval_loss": 0.008537186309695244, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": -1.4969366788864136, "eval_rewards/margins": 22.559864044189453, "eval_rewards/rejected": -24.056804656982422, "eval_runtime": 254.3496, "eval_samples_per_second": 11.252, "eval_steps_per_second": 0.704, "step": 2400 }, { "epoch": 1.1, "learning_rate": 2.1106037544393706e-07, "logits/chosen": -2.648667812347412, "logits/rejected": -2.351290225982666, "logps/chosen": -83.7481918334961, "logps/rejected": -115.79612731933594, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -0.47197288274765015, "rewards/margins": 23.920135498046875, "rewards/rejected": -24.392108917236328, "step": 2410 }, { "epoch": 1.1, "learning_rate": 2.1055301877219682e-07, "logits/chosen": -2.6811461448669434, "logits/rejected": -2.2750840187072754, "logps/chosen": -91.8724594116211, "logps/rejected": -113.91780853271484, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.8787704706192017, "rewards/margins": 24.01218032836914, "rewards/rejected": -24.890949249267578, "step": 2420 }, { "epoch": 1.11, "learning_rate": 2.100456621004566e-07, "logits/chosen": -2.643972873687744, "logits/rejected": -2.3852624893188477, "logps/chosen": -79.09517669677734, "logps/rejected": -113.2056655883789, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -1.7013204097747803, "rewards/margins": 23.163427352905273, "rewards/rejected": -24.864749908447266, "step": 2430 }, { "epoch": 1.11, "learning_rate": 2.0953830542871636e-07, "logits/chosen": -2.6480491161346436, "logits/rejected": -2.334690570831299, "logps/chosen": -88.85383605957031, "logps/rejected": -111.4326400756836, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.3242875337600708, "rewards/margins": 22.901653289794922, "rewards/rejected": -23.225940704345703, "step": 2440 }, { "epoch": 1.12, "learning_rate": 2.0903094875697612e-07, "logits/chosen": -2.668109893798828, "logits/rejected": -2.285407781600952, "logps/chosen": -92.72268676757812, "logps/rejected": -114.85479736328125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.3550542891025543, "rewards/margins": 23.459930419921875, "rewards/rejected": -23.814983367919922, "step": 2450 }, { "epoch": 1.12, "learning_rate": 2.085235920852359e-07, "logits/chosen": -2.6293513774871826, "logits/rejected": -2.327310085296631, "logps/chosen": -86.370849609375, "logps/rejected": -116.29862213134766, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -1.3266971111297607, "rewards/margins": 23.85145378112793, "rewards/rejected": -25.178152084350586, "step": 2460 }, { "epoch": 1.13, "learning_rate": 2.0801623541349566e-07, "logits/chosen": -2.675238847732544, "logits/rejected": -2.3478376865386963, "logps/chosen": -81.60664367675781, "logps/rejected": -109.88370513916016, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.17477786540985107, "rewards/margins": 23.485958099365234, "rewards/rejected": -23.31117820739746, "step": 2470 }, { "epoch": 1.13, "learning_rate": 2.0750887874175542e-07, "logits/chosen": -2.6532633304595947, "logits/rejected": -2.3442189693450928, "logps/chosen": -83.88862609863281, "logps/rejected": -117.04725646972656, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 0.07490722835063934, "rewards/margins": 23.64150047302246, "rewards/rejected": -23.566593170166016, "step": 2480 }, { "epoch": 1.14, "learning_rate": 2.070015220700152e-07, "logits/chosen": -2.6502089500427246, "logits/rejected": -2.289119005203247, "logps/chosen": -84.96060943603516, "logps/rejected": -112.07164001464844, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.2735442519187927, "rewards/margins": 24.202978134155273, "rewards/rejected": -23.929431915283203, "step": 2490 }, { "epoch": 1.14, "learning_rate": 2.0649416539827496e-07, "logits/chosen": -2.6330459117889404, "logits/rejected": -2.305373430252075, "logps/chosen": -84.61180114746094, "logps/rejected": -105.67828369140625, "loss": 0.0139, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7034128904342651, "rewards/margins": 21.006656646728516, "rewards/rejected": -20.30324363708496, "step": 2500 }, { "epoch": 1.14, "eval_logits/chosen": -2.320574998855591, "eval_logits/rejected": -2.045271396636963, "eval_logps/chosen": -81.27302551269531, "eval_logps/rejected": -103.23329162597656, "eval_loss": 0.00769708352163434, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.45635905861854553, "eval_rewards/margins": 20.84235382080078, "eval_rewards/rejected": -20.385997772216797, "eval_runtime": 177.8282, "eval_samples_per_second": 16.094, "eval_steps_per_second": 1.007, "step": 2500 }, { "epoch": 1.15, "learning_rate": 2.0598680872653472e-07, "logits/chosen": -2.6420750617980957, "logits/rejected": -2.2920174598693848, "logps/chosen": -86.39070129394531, "logps/rejected": -107.4378433227539, "loss": 0.0037, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5158565640449524, "rewards/margins": 21.056888580322266, "rewards/rejected": -20.54102897644043, "step": 2510 }, { "epoch": 1.15, "learning_rate": 2.054794520547945e-07, "logits/chosen": -2.6286368370056152, "logits/rejected": -2.294121265411377, "logps/chosen": -82.60850524902344, "logps/rejected": -104.18631744384766, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.881120502948761, "rewards/margins": 21.782379150390625, "rewards/rejected": -20.901260375976562, "step": 2520 }, { "epoch": 1.15, "learning_rate": 2.0497209538305426e-07, "logits/chosen": -2.606501579284668, "logits/rejected": -2.2681689262390137, "logps/chosen": -82.8125228881836, "logps/rejected": -107.20194244384766, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 0.8536422848701477, "rewards/margins": 22.990398406982422, "rewards/rejected": -22.136754989624023, "step": 2530 }, { "epoch": 1.16, "learning_rate": 2.0446473871131402e-07, "logits/chosen": -2.6260435581207275, "logits/rejected": -2.2896149158477783, "logps/chosen": -90.06600952148438, "logps/rejected": -114.93510437011719, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 0.7896150350570679, "rewards/margins": 22.024280548095703, "rewards/rejected": -21.234663009643555, "step": 2540 }, { "epoch": 1.16, "learning_rate": 2.039573820395738e-07, "logits/chosen": -2.629570484161377, "logits/rejected": -2.26188063621521, "logps/chosen": -84.52861022949219, "logps/rejected": -106.76336669921875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 2.3467342853546143, "rewards/margins": 23.59172248840332, "rewards/rejected": -21.24498748779297, "step": 2550 }, { "epoch": 1.17, "learning_rate": 2.0345002536783356e-07, "logits/chosen": -2.63362979888916, "logits/rejected": -2.3082945346832275, "logps/chosen": -79.91629028320312, "logps/rejected": -102.13844299316406, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.9620257616043091, "rewards/margins": 21.463024139404297, "rewards/rejected": -20.50099754333496, "step": 2560 }, { "epoch": 1.17, "learning_rate": 2.0294266869609332e-07, "logits/chosen": -2.6548991203308105, "logits/rejected": -2.3387608528137207, "logps/chosen": -82.40703582763672, "logps/rejected": -103.59181213378906, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 0.2572837471961975, "rewards/margins": 20.96331214904785, "rewards/rejected": -20.70602798461914, "step": 2570 }, { "epoch": 1.18, "learning_rate": 2.024353120243531e-07, "logits/chosen": -2.6292335987091064, "logits/rejected": -2.3412420749664307, "logps/chosen": -78.17959594726562, "logps/rejected": -109.92939758300781, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.21062612533569336, "rewards/margins": 21.128894805908203, "rewards/rejected": -20.918270111083984, "step": 2580 }, { "epoch": 1.18, "learning_rate": 2.0192795535261286e-07, "logits/chosen": -2.6379122734069824, "logits/rejected": -2.2955474853515625, "logps/chosen": -85.09695434570312, "logps/rejected": -112.4044189453125, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 0.9956000447273254, "rewards/margins": 23.14710235595703, "rewards/rejected": -22.151500701904297, "step": 2590 }, { "epoch": 1.19, "learning_rate": 2.0142059868087262e-07, "logits/chosen": -2.6259665489196777, "logits/rejected": -2.328274726867676, "logps/chosen": -82.6556625366211, "logps/rejected": -108.2914810180664, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.8803138732910156, "rewards/margins": 22.103456497192383, "rewards/rejected": -21.223142623901367, "step": 2600 }, { "epoch": 1.19, "eval_logits/chosen": -2.3272674083709717, "eval_logits/rejected": -2.0528404712677, "eval_logps/chosen": -82.01408386230469, "eval_logps/rejected": -106.38927459716797, "eval_loss": 0.008109867572784424, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.08582764863967896, "eval_rewards/margins": 22.04981231689453, "eval_rewards/rejected": -21.9639835357666, "eval_runtime": 256.7045, "eval_samples_per_second": 11.149, "eval_steps_per_second": 0.697, "step": 2600 }, { "epoch": 1.19, "learning_rate": 2.009132420091324e-07, "logits/chosen": -2.6282129287719727, "logits/rejected": -2.354618787765503, "logps/chosen": -79.5919418334961, "logps/rejected": -108.01798248291016, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 0.8283111453056335, "rewards/margins": 22.698415756225586, "rewards/rejected": -21.87010383605957, "step": 2610 }, { "epoch": 1.2, "learning_rate": 2.0040588533739216e-07, "logits/chosen": -2.6521239280700684, "logits/rejected": -2.3077118396759033, "logps/chosen": -87.87489318847656, "logps/rejected": -112.98710632324219, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 0.12028863281011581, "rewards/margins": 23.168392181396484, "rewards/rejected": -23.048105239868164, "step": 2620 }, { "epoch": 1.2, "learning_rate": 1.9989852866565192e-07, "logits/chosen": -2.641965627670288, "logits/rejected": -2.3265223503112793, "logps/chosen": -82.09970092773438, "logps/rejected": -108.6490249633789, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.501336932182312, "rewards/margins": 22.71762466430664, "rewards/rejected": -22.216285705566406, "step": 2630 }, { "epoch": 1.21, "learning_rate": 1.993911719939117e-07, "logits/chosen": -2.6664814949035645, "logits/rejected": -2.3428611755371094, "logps/chosen": -82.57010650634766, "logps/rejected": -103.94281005859375, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 0.4482641816139221, "rewards/margins": 21.410070419311523, "rewards/rejected": -20.961807250976562, "step": 2640 }, { "epoch": 1.21, "learning_rate": 1.9888381532217146e-07, "logits/chosen": -2.6294732093811035, "logits/rejected": -2.353087902069092, "logps/chosen": -77.64653015136719, "logps/rejected": -110.2952651977539, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.11264465004205704, "rewards/margins": 22.453195571899414, "rewards/rejected": -22.565839767456055, "step": 2650 }, { "epoch": 1.21, "learning_rate": 1.9837645865043122e-07, "logits/chosen": -2.6417112350463867, "logits/rejected": -2.3186283111572266, "logps/chosen": -79.8727035522461, "logps/rejected": -108.9670639038086, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.6170593500137329, "rewards/margins": 23.300952911376953, "rewards/rejected": -22.683889389038086, "step": 2660 }, { "epoch": 1.22, "learning_rate": 1.97869101978691e-07, "logits/chosen": -2.6213388442993164, "logits/rejected": -2.3433380126953125, "logps/chosen": -82.99671936035156, "logps/rejected": -109.78025817871094, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.12517967820167542, "rewards/margins": 22.674070358276367, "rewards/rejected": -22.79924964904785, "step": 2670 }, { "epoch": 1.22, "learning_rate": 1.9736174530695076e-07, "logits/chosen": -2.648165464401245, "logits/rejected": -2.3396267890930176, "logps/chosen": -81.04335021972656, "logps/rejected": -106.6722640991211, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 0.8094123601913452, "rewards/margins": 23.4619197845459, "rewards/rejected": -22.652507781982422, "step": 2680 }, { "epoch": 1.23, "learning_rate": 1.9685438863521052e-07, "logits/chosen": -2.6579599380493164, "logits/rejected": -2.347898483276367, "logps/chosen": -80.73441314697266, "logps/rejected": -110.6244125366211, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.32758834958076477, "rewards/margins": 23.45597267150879, "rewards/rejected": -23.12838363647461, "step": 2690 }, { "epoch": 1.23, "learning_rate": 1.963470319634703e-07, "logits/chosen": -2.6598477363586426, "logits/rejected": -2.3553056716918945, "logps/chosen": -85.83523559570312, "logps/rejected": -110.26567077636719, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -0.38448014855384827, "rewards/margins": 23.080158233642578, "rewards/rejected": -23.46463966369629, "step": 2700 }, { "epoch": 1.23, "eval_logits/chosen": -2.345696210861206, "eval_logits/rejected": -2.066790819168091, "eval_logps/chosen": -82.69432830810547, "eval_logps/rejected": -109.26456451416016, "eval_loss": 0.00831608846783638, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.2542888820171356, "eval_rewards/margins": 23.1473445892334, "eval_rewards/rejected": -23.40163230895996, "eval_runtime": 191.8104, "eval_samples_per_second": 14.921, "eval_steps_per_second": 0.933, "step": 2700 }, { "epoch": 1.24, "learning_rate": 1.9583967529173006e-07, "logits/chosen": -2.678093433380127, "logits/rejected": -2.3364322185516357, "logps/chosen": -85.9931640625, "logps/rejected": -111.09895324707031, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 0.0020569325424730778, "rewards/margins": 23.53390884399414, "rewards/rejected": -23.531850814819336, "step": 2710 }, { "epoch": 1.24, "learning_rate": 1.9533231861998982e-07, "logits/chosen": -2.6772427558898926, "logits/rejected": -2.3105010986328125, "logps/chosen": -86.41141510009766, "logps/rejected": -116.20054626464844, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.34058305621147156, "rewards/margins": 24.712804794311523, "rewards/rejected": -24.372222900390625, "step": 2720 }, { "epoch": 1.25, "learning_rate": 1.948249619482496e-07, "logits/chosen": -2.6590237617492676, "logits/rejected": -2.350254774093628, "logps/chosen": -84.52534484863281, "logps/rejected": -109.49930572509766, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -0.562452495098114, "rewards/margins": 22.374103546142578, "rewards/rejected": -22.936555862426758, "step": 2730 }, { "epoch": 1.25, "learning_rate": 1.9431760527650936e-07, "logits/chosen": -2.6717960834503174, "logits/rejected": -2.2468342781066895, "logps/chosen": -88.84379577636719, "logps/rejected": -112.70384216308594, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.1577484905719757, "rewards/margins": 23.783527374267578, "rewards/rejected": -23.62578010559082, "step": 2740 }, { "epoch": 1.26, "learning_rate": 1.9381024860476912e-07, "logits/chosen": -2.6599864959716797, "logits/rejected": -2.283949375152588, "logps/chosen": -89.84847259521484, "logps/rejected": -112.76377868652344, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 0.6849825382232666, "rewards/margins": 25.06772232055664, "rewards/rejected": -24.382740020751953, "step": 2750 }, { "epoch": 1.26, "learning_rate": 1.933028919330289e-07, "logits/chosen": -2.666045665740967, "logits/rejected": -2.349733829498291, "logps/chosen": -85.85981750488281, "logps/rejected": -112.88619232177734, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 0.15094280242919922, "rewards/margins": 24.22039222717285, "rewards/rejected": -24.06945037841797, "step": 2760 }, { "epoch": 1.26, "learning_rate": 1.9279553526128866e-07, "logits/chosen": -2.6730546951293945, "logits/rejected": -2.324289560317993, "logps/chosen": -84.74443054199219, "logps/rejected": -114.93988037109375, "loss": 0.0036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5609356760978699, "rewards/margins": 25.96152114868164, "rewards/rejected": -25.400585174560547, "step": 2770 }, { "epoch": 1.27, "learning_rate": 1.9228817858954842e-07, "logits/chosen": -2.6866507530212402, "logits/rejected": -2.3709559440612793, "logps/chosen": -80.03260803222656, "logps/rejected": -111.25434875488281, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.6718248724937439, "rewards/margins": 24.150434494018555, "rewards/rejected": -23.47861099243164, "step": 2780 }, { "epoch": 1.27, "learning_rate": 1.917808219178082e-07, "logits/chosen": -2.6763479709625244, "logits/rejected": -2.4036178588867188, "logps/chosen": -76.93629455566406, "logps/rejected": -111.5823974609375, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.598991870880127, "rewards/margins": 22.445356369018555, "rewards/rejected": -23.044349670410156, "step": 2790 }, { "epoch": 1.28, "learning_rate": 1.9127346524606796e-07, "logits/chosen": -2.6934406757354736, "logits/rejected": -2.358464002609253, "logps/chosen": -84.55718994140625, "logps/rejected": -113.28120422363281, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.0083122253417969, "rewards/margins": 24.021825790405273, "rewards/rejected": -23.01351547241211, "step": 2800 }, { "epoch": 1.28, "eval_logits/chosen": -2.3649747371673584, "eval_logits/rejected": -2.0883922576904297, "eval_logps/chosen": -82.84912872314453, "eval_logps/rejected": -110.03560638427734, "eval_loss": 0.008349267765879631, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.33169642090797424, "eval_rewards/margins": 23.455453872680664, "eval_rewards/rejected": -23.787153244018555, "eval_runtime": 170.6088, "eval_samples_per_second": 16.775, "eval_steps_per_second": 1.049, "step": 2800 }, { "epoch": 1.28, "learning_rate": 1.9076610857432772e-07, "logits/chosen": -2.6867544651031494, "logits/rejected": -2.3584742546081543, "logps/chosen": -82.95903778076172, "logps/rejected": -115.54469299316406, "loss": 0.0012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.052515339106321335, "rewards/margins": 24.212982177734375, "rewards/rejected": -24.265499114990234, "step": 2810 }, { "epoch": 1.29, "learning_rate": 1.902587519025875e-07, "logits/chosen": -2.695505380630493, "logits/rejected": -2.3168997764587402, "logps/chosen": -89.97647094726562, "logps/rejected": -115.47993469238281, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.03120584413409233, "rewards/margins": 24.901416778564453, "rewards/rejected": -24.9326229095459, "step": 2820 }, { "epoch": 1.29, "learning_rate": 1.8975139523084726e-07, "logits/chosen": -2.666916608810425, "logits/rejected": -2.3753468990325928, "logps/chosen": -87.85090637207031, "logps/rejected": -116.8847427368164, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.8360681533813477, "rewards/margins": 23.404916763305664, "rewards/rejected": -24.240983963012695, "step": 2830 }, { "epoch": 1.3, "learning_rate": 1.8924403855910702e-07, "logits/chosen": -2.6697330474853516, "logits/rejected": -2.3515477180480957, "logps/chosen": -82.55045318603516, "logps/rejected": -110.74652099609375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.5056793093681335, "rewards/margins": 23.97299575805664, "rewards/rejected": -24.47867774963379, "step": 2840 }, { "epoch": 1.3, "learning_rate": 1.887366818873668e-07, "logits/chosen": -2.705828905105591, "logits/rejected": -2.376357078552246, "logps/chosen": -82.81072235107422, "logps/rejected": -113.68083190917969, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.2072942703962326, "rewards/margins": 24.321693420410156, "rewards/rejected": -24.114398956298828, "step": 2850 }, { "epoch": 1.31, "learning_rate": 1.8822932521562656e-07, "logits/chosen": -2.718200445175171, "logits/rejected": -2.399697780609131, "logps/chosen": -81.552978515625, "logps/rejected": -114.05682373046875, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": 0.9049330949783325, "rewards/margins": 25.248626708984375, "rewards/rejected": -24.343692779541016, "step": 2860 }, { "epoch": 1.31, "learning_rate": 1.8772196854388632e-07, "logits/chosen": -2.6862926483154297, "logits/rejected": -2.37678861618042, "logps/chosen": -84.85992431640625, "logps/rejected": -113.00279235839844, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -0.42080897092819214, "rewards/margins": 22.70480728149414, "rewards/rejected": -23.125614166259766, "step": 2870 }, { "epoch": 1.31, "learning_rate": 1.872146118721461e-07, "logits/chosen": -2.7302603721618652, "logits/rejected": -2.320916175842285, "logps/chosen": -89.06233978271484, "logps/rejected": -112.29793548583984, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 0.9952018857002258, "rewards/margins": 24.969585418701172, "rewards/rejected": -23.974384307861328, "step": 2880 }, { "epoch": 1.32, "learning_rate": 1.8670725520040586e-07, "logits/chosen": -2.7124643325805664, "logits/rejected": -2.3599114418029785, "logps/chosen": -85.04414367675781, "logps/rejected": -113.2823486328125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.1878468990325928, "rewards/margins": 25.414531707763672, "rewards/rejected": -24.226682662963867, "step": 2890 }, { "epoch": 1.32, "learning_rate": 1.8619989852866562e-07, "logits/chosen": -2.7142932415008545, "logits/rejected": -2.4082870483398438, "logps/chosen": -82.4513931274414, "logps/rejected": -116.231689453125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.4858674108982086, "rewards/margins": 24.812488555908203, "rewards/rejected": -25.298355102539062, "step": 2900 }, { "epoch": 1.32, "eval_logits/chosen": -2.387925863265991, "eval_logits/rejected": -2.1054039001464844, "eval_logps/chosen": -82.73624420166016, "eval_logps/rejected": -111.19764709472656, "eval_loss": 0.00841273833066225, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.2752546966075897, "eval_rewards/margins": 24.092918395996094, "eval_rewards/rejected": -24.3681697845459, "eval_runtime": 203.0821, "eval_samples_per_second": 14.093, "eval_steps_per_second": 0.881, "step": 2900 }, { "epoch": 1.33, "learning_rate": 1.856925418569254e-07, "logits/chosen": -2.67891263961792, "logits/rejected": -2.3304860591888428, "logps/chosen": -87.03717803955078, "logps/rejected": -114.64599609375, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 1.9134318828582764, "rewards/margins": 26.436954498291016, "rewards/rejected": -24.52351951599121, "step": 2910 }, { "epoch": 1.33, "learning_rate": 1.8518518518518516e-07, "logits/chosen": -2.698725461959839, "logits/rejected": -2.398883104324341, "logps/chosen": -81.1679916381836, "logps/rejected": -113.8624267578125, "loss": 0.0047, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2939888834953308, "rewards/margins": 25.168251037597656, "rewards/rejected": -24.874263763427734, "step": 2920 }, { "epoch": 1.34, "learning_rate": 1.8467782851344492e-07, "logits/chosen": -2.7024359703063965, "logits/rejected": -2.4063286781311035, "logps/chosen": -83.2025146484375, "logps/rejected": -112.1222915649414, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.2178274691104889, "rewards/margins": 25.137231826782227, "rewards/rejected": -24.919404983520508, "step": 2930 }, { "epoch": 1.34, "learning_rate": 1.841704718417047e-07, "logits/chosen": -2.687952995300293, "logits/rejected": -2.329446315765381, "logps/chosen": -83.30066680908203, "logps/rejected": -114.17464447021484, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 0.6062806248664856, "rewards/margins": 25.18267822265625, "rewards/rejected": -24.576396942138672, "step": 2940 }, { "epoch": 1.35, "learning_rate": 1.8366311516996446e-07, "logits/chosen": -2.7220399379730225, "logits/rejected": -2.394536256790161, "logps/chosen": -79.72315979003906, "logps/rejected": -108.08235168457031, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.6456438899040222, "rewards/margins": 23.10774803161621, "rewards/rejected": -23.75339126586914, "step": 2950 }, { "epoch": 1.35, "learning_rate": 1.8315575849822422e-07, "logits/chosen": -2.7134175300598145, "logits/rejected": -2.4105725288391113, "logps/chosen": -86.59757232666016, "logps/rejected": -109.53243255615234, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.06646320968866348, "rewards/margins": 21.174022674560547, "rewards/rejected": -21.107561111450195, "step": 2960 }, { "epoch": 1.36, "learning_rate": 1.82648401826484e-07, "logits/chosen": -2.6932759284973145, "logits/rejected": -2.363738775253296, "logps/chosen": -82.44561004638672, "logps/rejected": -106.46821594238281, "loss": 0.0053, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4256173074245453, "rewards/margins": 22.258800506591797, "rewards/rejected": -21.833179473876953, "step": 2970 }, { "epoch": 1.36, "learning_rate": 1.8214104515474375e-07, "logits/chosen": -2.666003704071045, "logits/rejected": -2.3289661407470703, "logps/chosen": -86.4139633178711, "logps/rejected": -114.21708679199219, "loss": 0.0026, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.032378338277339935, "rewards/margins": 23.808643341064453, "rewards/rejected": -23.841022491455078, "step": 2980 }, { "epoch": 1.36, "learning_rate": 1.8163368848300352e-07, "logits/chosen": -2.6774415969848633, "logits/rejected": -2.396759033203125, "logps/chosen": -81.80799102783203, "logps/rejected": -114.46085357666016, "loss": 0.012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2402961254119873, "rewards/margins": 24.016206741333008, "rewards/rejected": -23.775909423828125, "step": 2990 }, { "epoch": 1.37, "learning_rate": 1.811263318112633e-07, "logits/chosen": -2.671124219894409, "logits/rejected": -2.3801181316375732, "logps/chosen": -79.99850463867188, "logps/rejected": -110.9075927734375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.988418698310852, "rewards/margins": 24.072200775146484, "rewards/rejected": -23.08378028869629, "step": 3000 }, { "epoch": 1.37, "eval_logits/chosen": -2.3564696311950684, "eval_logits/rejected": -2.081695795059204, "eval_logps/chosen": -81.32012939453125, "eval_logps/rejected": -109.09380340576172, "eval_loss": 0.00814717449247837, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.4328048825263977, "eval_rewards/margins": 23.749055862426758, "eval_rewards/rejected": -23.31624984741211, "eval_runtime": 181.5965, "eval_samples_per_second": 15.76, "eval_steps_per_second": 0.986, "step": 3000 }, { "epoch": 1.37, "learning_rate": 1.8061897513952305e-07, "logits/chosen": -2.6872005462646484, "logits/rejected": -2.323394298553467, "logps/chosen": -86.51441955566406, "logps/rejected": -111.68022155761719, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.1383650302886963, "rewards/margins": 24.89560890197754, "rewards/rejected": -23.757244110107422, "step": 3010 }, { "epoch": 1.38, "learning_rate": 1.8011161846778282e-07, "logits/chosen": -2.6841628551483154, "logits/rejected": -2.3249566555023193, "logps/chosen": -84.3753433227539, "logps/rejected": -109.21502685546875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 1.039917230606079, "rewards/margins": 22.78086280822754, "rewards/rejected": -21.740943908691406, "step": 3020 }, { "epoch": 1.38, "learning_rate": 1.796042617960426e-07, "logits/chosen": -2.669074535369873, "logits/rejected": -2.310793161392212, "logps/chosen": -88.86004638671875, "logps/rejected": -109.41426849365234, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.1169506311416626, "rewards/margins": 23.175771713256836, "rewards/rejected": -23.058818817138672, "step": 3030 }, { "epoch": 1.39, "learning_rate": 1.7909690512430235e-07, "logits/chosen": -2.6745293140411377, "logits/rejected": -2.3425421714782715, "logps/chosen": -83.24311828613281, "logps/rejected": -112.27220153808594, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.7184518575668335, "rewards/margins": 25.374147415161133, "rewards/rejected": -24.65569496154785, "step": 3040 }, { "epoch": 1.39, "learning_rate": 1.7858954845256212e-07, "logits/chosen": -2.659925699234009, "logits/rejected": -2.4067463874816895, "logps/chosen": -83.95664978027344, "logps/rejected": -114.77005767822266, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 1.0322695970535278, "rewards/margins": 24.820148468017578, "rewards/rejected": -23.78788185119629, "step": 3050 }, { "epoch": 1.4, "learning_rate": 1.780821917808219e-07, "logits/chosen": -2.670229434967041, "logits/rejected": -2.381035327911377, "logps/chosen": -81.76079559326172, "logps/rejected": -114.39422607421875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.4897182881832123, "rewards/margins": 22.658618927001953, "rewards/rejected": -22.168901443481445, "step": 3060 }, { "epoch": 1.4, "learning_rate": 1.7757483510908165e-07, "logits/chosen": -2.666069746017456, "logits/rejected": -2.3733909130096436, "logps/chosen": -78.46485900878906, "logps/rejected": -115.03691101074219, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.5286703109741211, "rewards/margins": 25.251476287841797, "rewards/rejected": -24.72280502319336, "step": 3070 }, { "epoch": 1.41, "learning_rate": 1.7706747843734142e-07, "logits/chosen": -2.6807875633239746, "logits/rejected": -2.302530288696289, "logps/chosen": -80.33370208740234, "logps/rejected": -108.50965881347656, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 1.0222965478897095, "rewards/margins": 24.445348739624023, "rewards/rejected": -23.423053741455078, "step": 3080 }, { "epoch": 1.41, "learning_rate": 1.765601217656012e-07, "logits/chosen": -2.649898052215576, "logits/rejected": -2.3058419227600098, "logps/chosen": -84.1609878540039, "logps/rejected": -110.86158752441406, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 1.1862804889678955, "rewards/margins": 24.446117401123047, "rewards/rejected": -23.25983428955078, "step": 3090 }, { "epoch": 1.42, "learning_rate": 1.7605276509386095e-07, "logits/chosen": -2.652932643890381, "logits/rejected": -2.3606390953063965, "logps/chosen": -81.58973693847656, "logps/rejected": -112.62255859375, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.5852065086364746, "rewards/margins": 23.893949508666992, "rewards/rejected": -23.308744430541992, "step": 3100 }, { "epoch": 1.42, "eval_logits/chosen": -2.3446712493896484, "eval_logits/rejected": -2.0706257820129395, "eval_logps/chosen": -82.23661041259766, "eval_logps/rejected": -109.9393539428711, "eval_loss": 0.00822613388299942, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": -0.025433748960494995, "eval_rewards/margins": 23.713592529296875, "eval_rewards/rejected": -23.739023208618164, "eval_runtime": 199.8912, "eval_samples_per_second": 14.318, "eval_steps_per_second": 0.895, "step": 3100 }, { "epoch": 1.42, "learning_rate": 1.7554540842212072e-07, "logits/chosen": -2.6777281761169434, "logits/rejected": -2.3256850242614746, "logps/chosen": -84.544921875, "logps/rejected": -110.39747619628906, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 0.7409707903862, "rewards/margins": 23.78182601928711, "rewards/rejected": -23.040857315063477, "step": 3110 }, { "epoch": 1.42, "learning_rate": 1.750380517503805e-07, "logits/chosen": -2.6797995567321777, "logits/rejected": -2.3483428955078125, "logps/chosen": -85.49281311035156, "logps/rejected": -113.40995025634766, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 0.11576847732067108, "rewards/margins": 24.14320182800293, "rewards/rejected": -24.027433395385742, "step": 3120 }, { "epoch": 1.43, "learning_rate": 1.7453069507864025e-07, "logits/chosen": -2.663177490234375, "logits/rejected": -2.345109224319458, "logps/chosen": -82.77714538574219, "logps/rejected": -105.75787353515625, "loss": 0.0077, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.2896865904331207, "rewards/margins": 23.47965431213379, "rewards/rejected": -23.769338607788086, "step": 3130 }, { "epoch": 1.43, "learning_rate": 1.7402333840690002e-07, "logits/chosen": -2.6542868614196777, "logits/rejected": -2.3269848823547363, "logps/chosen": -83.46544647216797, "logps/rejected": -110.5511703491211, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.44541388750076294, "rewards/margins": 23.193172454833984, "rewards/rejected": -23.638586044311523, "step": 3140 }, { "epoch": 1.44, "learning_rate": 1.735159817351598e-07, "logits/chosen": -2.6640706062316895, "logits/rejected": -2.377800703048706, "logps/chosen": -81.88560485839844, "logps/rejected": -117.19815826416016, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.39661675691604614, "rewards/margins": 24.547039031982422, "rewards/rejected": -24.150419235229492, "step": 3150 }, { "epoch": 1.44, "learning_rate": 1.7300862506341955e-07, "logits/chosen": -2.6611101627349854, "logits/rejected": -2.325866460800171, "logps/chosen": -93.03730773925781, "logps/rejected": -115.31880187988281, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -0.6037623286247253, "rewards/margins": 24.217891693115234, "rewards/rejected": -24.821657180786133, "step": 3160 }, { "epoch": 1.45, "learning_rate": 1.7250126839167932e-07, "logits/chosen": -2.6688554286956787, "logits/rejected": -2.356708526611328, "logps/chosen": -87.67771911621094, "logps/rejected": -114.6445541381836, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -0.5825105905532837, "rewards/margins": 24.024593353271484, "rewards/rejected": -24.607101440429688, "step": 3170 }, { "epoch": 1.45, "learning_rate": 1.719939117199391e-07, "logits/chosen": -2.6575815677642822, "logits/rejected": -2.3448948860168457, "logps/chosen": -81.12501525878906, "logps/rejected": -114.0226821899414, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -0.43694692850112915, "rewards/margins": 23.854961395263672, "rewards/rejected": -24.29191017150879, "step": 3180 }, { "epoch": 1.46, "learning_rate": 1.7148655504819885e-07, "logits/chosen": -2.6633763313293457, "logits/rejected": -2.3289904594421387, "logps/chosen": -85.6478042602539, "logps/rejected": -113.5853042602539, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.35943108797073364, "rewards/margins": 23.344356536865234, "rewards/rejected": -23.70378875732422, "step": 3190 }, { "epoch": 1.46, "learning_rate": 1.7097919837645862e-07, "logits/chosen": -2.675062417984009, "logits/rejected": -2.361314296722412, "logps/chosen": -88.24284362792969, "logps/rejected": -117.39481353759766, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.5497006177902222, "rewards/margins": 24.16522216796875, "rewards/rejected": -25.714920043945312, "step": 3200 }, { "epoch": 1.46, "eval_logits/chosen": -2.3542585372924805, "eval_logits/rejected": -2.0759880542755127, "eval_logps/chosen": -83.72178649902344, "eval_logps/rejected": -110.55169677734375, "eval_loss": 0.008555963635444641, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.7680222392082214, "eval_rewards/margins": 23.277179718017578, "eval_rewards/rejected": -24.045202255249023, "eval_runtime": 283.5638, "eval_samples_per_second": 10.093, "eval_steps_per_second": 0.631, "step": 3200 }, { "epoch": 1.47, "learning_rate": 1.704718417047184e-07, "logits/chosen": -2.6321349143981934, "logits/rejected": -2.3700881004333496, "logps/chosen": -81.64762878417969, "logps/rejected": -107.8651123046875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -1.6523443460464478, "rewards/margins": 21.662458419799805, "rewards/rejected": -23.314800262451172, "step": 3210 }, { "epoch": 1.47, "learning_rate": 1.6996448503297815e-07, "logits/chosen": -2.6559271812438965, "logits/rejected": -2.36987042427063, "logps/chosen": -80.37175750732422, "logps/rejected": -114.6617431640625, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -1.1735812425613403, "rewards/margins": 23.08481216430664, "rewards/rejected": -24.258392333984375, "step": 3220 }, { "epoch": 1.47, "learning_rate": 1.6945712836123792e-07, "logits/chosen": -2.6714494228363037, "logits/rejected": -2.3544609546661377, "logps/chosen": -79.87291717529297, "logps/rejected": -110.99893951416016, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -0.1398446261882782, "rewards/margins": 24.156417846679688, "rewards/rejected": -24.296260833740234, "step": 3230 }, { "epoch": 1.48, "learning_rate": 1.689497716894977e-07, "logits/chosen": -2.63079571723938, "logits/rejected": -2.328859806060791, "logps/chosen": -87.00918579101562, "logps/rejected": -118.7082748413086, "loss": 0.0042, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0113550424575806, "rewards/margins": 23.81175994873047, "rewards/rejected": -24.8231143951416, "step": 3240 }, { "epoch": 1.48, "learning_rate": 1.6844241501775745e-07, "logits/chosen": -2.6517558097839355, "logits/rejected": -2.346189260482788, "logps/chosen": -85.27503967285156, "logps/rejected": -112.4189682006836, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.23823706805706024, "rewards/margins": 23.88079833984375, "rewards/rejected": -24.119035720825195, "step": 3250 }, { "epoch": 1.49, "learning_rate": 1.6793505834601722e-07, "logits/chosen": -2.6607556343078613, "logits/rejected": -2.3729300498962402, "logps/chosen": -78.48094177246094, "logps/rejected": -114.2974624633789, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.4324473440647125, "rewards/margins": 23.81759262084961, "rewards/rejected": -24.250041961669922, "step": 3260 }, { "epoch": 1.49, "learning_rate": 1.67427701674277e-07, "logits/chosen": -2.662973165512085, "logits/rejected": -2.332409381866455, "logps/chosen": -80.18125915527344, "logps/rejected": -111.27408599853516, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.2091609239578247, "rewards/margins": 24.044448852539062, "rewards/rejected": -23.835289001464844, "step": 3270 }, { "epoch": 1.5, "learning_rate": 1.6692034500253675e-07, "logits/chosen": -2.6626083850860596, "logits/rejected": -2.3439598083496094, "logps/chosen": -80.57767486572266, "logps/rejected": -114.80329895019531, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.2342810183763504, "rewards/margins": 24.559240341186523, "rewards/rejected": -24.79352378845215, "step": 3280 }, { "epoch": 1.5, "learning_rate": 1.6641298833079652e-07, "logits/chosen": -2.6679940223693848, "logits/rejected": -2.326353073120117, "logps/chosen": -83.03609466552734, "logps/rejected": -111.9715576171875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.4402456283569336, "rewards/margins": 23.81828498840332, "rewards/rejected": -24.258529663085938, "step": 3290 }, { "epoch": 1.51, "learning_rate": 1.659056316590563e-07, "logits/chosen": -2.66428542137146, "logits/rejected": -2.3843436241149902, "logps/chosen": -78.55763244628906, "logps/rejected": -106.07743072509766, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.6965314149856567, "rewards/margins": 23.174253463745117, "rewards/rejected": -23.870784759521484, "step": 3300 }, { "epoch": 1.51, "eval_logits/chosen": -2.3654584884643555, "eval_logits/rejected": -2.088066816329956, "eval_logps/chosen": -82.18890380859375, "eval_logps/rejected": -109.4934310913086, "eval_loss": 0.008571669459342957, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": -0.0015816076193004847, "eval_rewards/margins": 23.514490127563477, "eval_rewards/rejected": -23.516069412231445, "eval_runtime": 210.7828, "eval_samples_per_second": 13.578, "eval_steps_per_second": 0.849, "step": 3300 }, { "epoch": 1.51, "learning_rate": 1.6539827498731605e-07, "logits/chosen": -2.696434259414673, "logits/rejected": -2.360797166824341, "logps/chosen": -85.07469177246094, "logps/rejected": -113.14781188964844, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.2798609137535095, "rewards/margins": 24.332569122314453, "rewards/rejected": -24.612430572509766, "step": 3310 }, { "epoch": 1.52, "learning_rate": 1.6489091831557582e-07, "logits/chosen": -2.699249505996704, "logits/rejected": -2.4046883583068848, "logps/chosen": -86.44249725341797, "logps/rejected": -112.90473937988281, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.09435518085956573, "rewards/margins": 23.588451385498047, "rewards/rejected": -23.68280601501465, "step": 3320 }, { "epoch": 1.52, "learning_rate": 1.643835616438356e-07, "logits/chosen": -2.6865737438201904, "logits/rejected": -2.414750814437866, "logps/chosen": -81.11417388916016, "logps/rejected": -111.50926208496094, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.3432890772819519, "rewards/margins": 23.046846389770508, "rewards/rejected": -23.39013671875, "step": 3330 }, { "epoch": 1.52, "learning_rate": 1.6387620497209535e-07, "logits/chosen": -2.6747775077819824, "logits/rejected": -2.3654675483703613, "logps/chosen": -80.5301742553711, "logps/rejected": -114.2152328491211, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -0.24930819869041443, "rewards/margins": 23.585683822631836, "rewards/rejected": -23.834993362426758, "step": 3340 }, { "epoch": 1.53, "learning_rate": 1.6336884830035512e-07, "logits/chosen": -2.687844753265381, "logits/rejected": -2.387223720550537, "logps/chosen": -83.20491790771484, "logps/rejected": -113.59019470214844, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.7579129934310913, "rewards/margins": 24.23940086364746, "rewards/rejected": -24.997310638427734, "step": 3350 }, { "epoch": 1.53, "learning_rate": 1.6286149162861489e-07, "logits/chosen": -2.682297945022583, "logits/rejected": -2.341749906539917, "logps/chosen": -84.87847900390625, "logps/rejected": -115.72343444824219, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.2171802520751953, "rewards/margins": 24.794300079345703, "rewards/rejected": -25.0114803314209, "step": 3360 }, { "epoch": 1.54, "learning_rate": 1.6235413495687465e-07, "logits/chosen": -2.6745264530181885, "logits/rejected": -2.3458900451660156, "logps/chosen": -79.84796142578125, "logps/rejected": -108.13448333740234, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 0.2517082095146179, "rewards/margins": 23.9562931060791, "rewards/rejected": -23.7045841217041, "step": 3370 }, { "epoch": 1.54, "learning_rate": 1.6184677828513442e-07, "logits/chosen": -2.7040045261383057, "logits/rejected": -2.3453478813171387, "logps/chosen": -84.21946716308594, "logps/rejected": -111.1295166015625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.5374824404716492, "rewards/margins": 24.102069854736328, "rewards/rejected": -23.564584732055664, "step": 3380 }, { "epoch": 1.55, "learning_rate": 1.613394216133942e-07, "logits/chosen": -2.6855995655059814, "logits/rejected": -2.35943603515625, "logps/chosen": -83.55799102783203, "logps/rejected": -111.28907775878906, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.45095500349998474, "rewards/margins": 25.45101547241211, "rewards/rejected": -25.00006103515625, "step": 3390 }, { "epoch": 1.55, "learning_rate": 1.6083206494165398e-07, "logits/chosen": -2.664564609527588, "logits/rejected": -2.35447359085083, "logps/chosen": -85.25090026855469, "logps/rejected": -110.45634460449219, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.25344622135162354, "rewards/margins": 24.170278549194336, "rewards/rejected": -24.423725128173828, "step": 3400 }, { "epoch": 1.55, "eval_logits/chosen": -2.366727113723755, "eval_logits/rejected": -2.0878124237060547, "eval_logps/chosen": -82.14669036865234, "eval_logps/rejected": -110.9883804321289, "eval_loss": 0.00838457141071558, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": 0.01952451653778553, "eval_rewards/margins": 24.283069610595703, "eval_rewards/rejected": -24.2635440826416, "eval_runtime": 177.0537, "eval_samples_per_second": 16.165, "eval_steps_per_second": 1.011, "step": 3400 }, { "epoch": 1.56, "learning_rate": 1.6032470826991375e-07, "logits/chosen": -2.664123773574829, "logits/rejected": -2.366610527038574, "logps/chosen": -78.2968978881836, "logps/rejected": -110.25614929199219, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.19316871464252472, "rewards/margins": 23.323383331298828, "rewards/rejected": -23.51655387878418, "step": 3410 }, { "epoch": 1.56, "learning_rate": 1.598173515981735e-07, "logits/chosen": -2.671496629714966, "logits/rejected": -2.3138790130615234, "logps/chosen": -82.02621459960938, "logps/rejected": -111.89013671875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 0.6785224676132202, "rewards/margins": 24.889408111572266, "rewards/rejected": -24.210886001586914, "step": 3420 }, { "epoch": 1.57, "learning_rate": 1.5930999492643328e-07, "logits/chosen": -2.679556131362915, "logits/rejected": -2.361149311065674, "logps/chosen": -83.46086120605469, "logps/rejected": -113.05169677734375, "loss": 0.0027, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8767393231391907, "rewards/margins": 24.61939239501953, "rewards/rejected": -23.742652893066406, "step": 3430 }, { "epoch": 1.57, "learning_rate": 1.5880263825469305e-07, "logits/chosen": -2.6824820041656494, "logits/rejected": -2.361029624938965, "logps/chosen": -82.28681945800781, "logps/rejected": -112.43705749511719, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 0.4831086993217468, "rewards/margins": 25.14454460144043, "rewards/rejected": -24.66143226623535, "step": 3440 }, { "epoch": 1.57, "learning_rate": 1.582952815829528e-07, "logits/chosen": -2.7084250450134277, "logits/rejected": -2.344395160675049, "logps/chosen": -85.29447937011719, "logps/rejected": -112.29981994628906, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.6575378179550171, "rewards/margins": 24.2863826751709, "rewards/rejected": -23.62884521484375, "step": 3450 }, { "epoch": 1.58, "learning_rate": 1.5778792491121258e-07, "logits/chosen": -2.6704368591308594, "logits/rejected": -2.3395304679870605, "logps/chosen": -84.42998504638672, "logps/rejected": -111.6834945678711, "loss": 0.0032, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7750858068466187, "rewards/margins": 25.46335220336914, "rewards/rejected": -24.68826675415039, "step": 3460 }, { "epoch": 1.58, "learning_rate": 1.5728056823947235e-07, "logits/chosen": -2.703482151031494, "logits/rejected": -2.38075590133667, "logps/chosen": -78.3385009765625, "logps/rejected": -113.4375, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 0.25260406732559204, "rewards/margins": 24.576128005981445, "rewards/rejected": -24.323524475097656, "step": 3470 }, { "epoch": 1.59, "learning_rate": 1.567732115677321e-07, "logits/chosen": -2.6634645462036133, "logits/rejected": -2.3652682304382324, "logps/chosen": -82.06767272949219, "logps/rejected": -118.97065734863281, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 0.02199583128094673, "rewards/margins": 24.979028701782227, "rewards/rejected": -24.95703125, "step": 3480 }, { "epoch": 1.59, "learning_rate": 1.5626585489599188e-07, "logits/chosen": -2.675232410430908, "logits/rejected": -2.3662333488464355, "logps/chosen": -84.04664611816406, "logps/rejected": -118.52950286865234, "loss": 0.0024, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4750373363494873, "rewards/margins": 25.74753761291504, "rewards/rejected": -25.272499084472656, "step": 3490 }, { "epoch": 1.6, "learning_rate": 1.5575849822425165e-07, "logits/chosen": -2.67126727104187, "logits/rejected": -2.373340129852295, "logps/chosen": -80.7338638305664, "logps/rejected": -111.84458923339844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.932759165763855, "rewards/margins": 24.73333740234375, "rewards/rejected": -23.800579071044922, "step": 3500 }, { "epoch": 1.6, "eval_logits/chosen": -2.369769811630249, "eval_logits/rejected": -2.088953733444214, "eval_logps/chosen": -82.10145568847656, "eval_logps/rejected": -112.1225357055664, "eval_loss": 0.008706530556082726, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.042142294347286224, "eval_rewards/margins": 24.872756958007812, "eval_rewards/rejected": -24.830615997314453, "eval_runtime": 186.766, "eval_samples_per_second": 15.324, "eval_steps_per_second": 0.958, "step": 3500 }, { "epoch": 1.6, "learning_rate": 1.552511415525114e-07, "logits/chosen": -2.684600591659546, "logits/rejected": -2.360689878463745, "logps/chosen": -80.85955047607422, "logps/rejected": -114.0708236694336, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 0.2006719410419464, "rewards/margins": 25.335285186767578, "rewards/rejected": -25.134613037109375, "step": 3510 }, { "epoch": 1.61, "learning_rate": 1.5474378488077118e-07, "logits/chosen": -2.6973929405212402, "logits/rejected": -2.3962080478668213, "logps/chosen": -86.28897857666016, "logps/rejected": -119.48008728027344, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.2793109118938446, "rewards/margins": 25.529159545898438, "rewards/rejected": -25.8084716796875, "step": 3520 }, { "epoch": 1.61, "learning_rate": 1.5423642820903095e-07, "logits/chosen": -2.700697422027588, "logits/rejected": -2.395371675491333, "logps/chosen": -84.37051391601562, "logps/rejected": -114.43376159667969, "loss": 0.0041, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.15875127911567688, "rewards/margins": 24.054805755615234, "rewards/rejected": -23.896055221557617, "step": 3530 }, { "epoch": 1.62, "learning_rate": 1.537290715372907e-07, "logits/chosen": -2.6851584911346436, "logits/rejected": -2.376160144805908, "logps/chosen": -81.4332046508789, "logps/rejected": -113.0853271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.14719653129577637, "rewards/margins": 24.920785903930664, "rewards/rejected": -25.067981719970703, "step": 3540 }, { "epoch": 1.62, "learning_rate": 1.5322171486555048e-07, "logits/chosen": -2.7332279682159424, "logits/rejected": -2.317025661468506, "logps/chosen": -91.64088439941406, "logps/rejected": -119.6474609375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.5554498434066772, "rewards/margins": 27.676158905029297, "rewards/rejected": -26.120708465576172, "step": 3550 }, { "epoch": 1.63, "learning_rate": 1.5271435819381025e-07, "logits/chosen": -2.725576162338257, "logits/rejected": -2.388730525970459, "logps/chosen": -89.76695251464844, "logps/rejected": -122.2450180053711, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 0.1292732208967209, "rewards/margins": 26.242355346679688, "rewards/rejected": -26.113082885742188, "step": 3560 }, { "epoch": 1.63, "learning_rate": 1.5220700152207e-07, "logits/chosen": -2.708667278289795, "logits/rejected": -2.314269542694092, "logps/chosen": -88.09994506835938, "logps/rejected": -117.45094299316406, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.09227565675973892, "rewards/margins": 26.516393661499023, "rewards/rejected": -26.608673095703125, "step": 3570 }, { "epoch": 1.63, "learning_rate": 1.5169964485032978e-07, "logits/chosen": -2.7017271518707275, "logits/rejected": -2.4141716957092285, "logps/chosen": -80.44622039794922, "logps/rejected": -116.50738525390625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.590995728969574, "rewards/margins": 25.494272232055664, "rewards/rejected": -26.085269927978516, "step": 3580 }, { "epoch": 1.64, "learning_rate": 1.5119228817858955e-07, "logits/chosen": -2.7166478633880615, "logits/rejected": -2.362241268157959, "logps/chosen": -88.38592529296875, "logps/rejected": -120.0035400390625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.18774038553237915, "rewards/margins": 26.786956787109375, "rewards/rejected": -26.59921646118164, "step": 3590 }, { "epoch": 1.64, "learning_rate": 1.506849315068493e-07, "logits/chosen": -2.6894726753234863, "logits/rejected": -2.3669166564941406, "logps/chosen": -88.1442642211914, "logps/rejected": -113.01334381103516, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.03411858156323433, "rewards/margins": 24.467044830322266, "rewards/rejected": -24.501163482666016, "step": 3600 }, { "epoch": 1.64, "eval_logits/chosen": -2.3851230144500732, "eval_logits/rejected": -2.1030263900756836, "eval_logps/chosen": -82.73149871826172, "eval_logps/rejected": -114.08246612548828, "eval_loss": 0.00856616161763668, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.2728763520717621, "eval_rewards/margins": 25.537704467773438, "eval_rewards/rejected": -25.810579299926758, "eval_runtime": 178.4136, "eval_samples_per_second": 16.041, "eval_steps_per_second": 1.003, "step": 3600 }, { "epoch": 1.65, "learning_rate": 1.5017757483510908e-07, "logits/chosen": -2.6973395347595215, "logits/rejected": -2.3798534870147705, "logps/chosen": -84.19405364990234, "logps/rejected": -121.4945068359375, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.8526423573493958, "rewards/margins": 27.6561336517334, "rewards/rejected": -26.803491592407227, "step": 3610 }, { "epoch": 1.65, "learning_rate": 1.4967021816336885e-07, "logits/chosen": -2.70271897315979, "logits/rejected": -2.3619680404663086, "logps/chosen": -80.52777862548828, "logps/rejected": -113.07865905761719, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.24278995394706726, "rewards/margins": 26.41168212890625, "rewards/rejected": -26.168893814086914, "step": 3620 }, { "epoch": 1.66, "learning_rate": 1.491628614916286e-07, "logits/chosen": -2.6959216594696045, "logits/rejected": -2.3828177452087402, "logps/chosen": -87.79856872558594, "logps/rejected": -125.32470703125, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -0.5888292789459229, "rewards/margins": 25.83760643005371, "rewards/rejected": -26.426437377929688, "step": 3630 }, { "epoch": 1.66, "learning_rate": 1.4865550481988838e-07, "logits/chosen": -2.714643955230713, "logits/rejected": -2.4147539138793945, "logps/chosen": -82.59846496582031, "logps/rejected": -119.65738677978516, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -0.4641922116279602, "rewards/margins": 27.502628326416016, "rewards/rejected": -27.966815948486328, "step": 3640 }, { "epoch": 1.67, "learning_rate": 1.4814814814814815e-07, "logits/chosen": -2.704658269882202, "logits/rejected": -2.360656499862671, "logps/chosen": -88.01731872558594, "logps/rejected": -116.77565002441406, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.2823442816734314, "rewards/margins": 26.041271209716797, "rewards/rejected": -26.3236141204834, "step": 3650 }, { "epoch": 1.67, "learning_rate": 1.476407914764079e-07, "logits/chosen": -2.717862367630005, "logits/rejected": -2.3780062198638916, "logps/chosen": -87.5998764038086, "logps/rejected": -118.67042541503906, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.32420802116394043, "rewards/margins": 25.86725425720215, "rewards/rejected": -26.191463470458984, "step": 3660 }, { "epoch": 1.68, "learning_rate": 1.4713343480466768e-07, "logits/chosen": -2.694912910461426, "logits/rejected": -2.3801636695861816, "logps/chosen": -82.05245971679688, "logps/rejected": -119.8744888305664, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.5713659524917603, "rewards/margins": 28.268667221069336, "rewards/rejected": -27.69729995727539, "step": 3670 }, { "epoch": 1.68, "learning_rate": 1.4662607813292745e-07, "logits/chosen": -2.709246873855591, "logits/rejected": -2.343571662902832, "logps/chosen": -85.5533676147461, "logps/rejected": -113.63890075683594, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 0.5408666133880615, "rewards/margins": 25.63242530822754, "rewards/rejected": -25.091556549072266, "step": 3680 }, { "epoch": 1.68, "learning_rate": 1.461187214611872e-07, "logits/chosen": -2.7242205142974854, "logits/rejected": -2.376615047454834, "logps/chosen": -82.33588409423828, "logps/rejected": -112.33255767822266, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 0.4563576579093933, "rewards/margins": 24.6265869140625, "rewards/rejected": -24.170225143432617, "step": 3690 }, { "epoch": 1.69, "learning_rate": 1.4561136478944698e-07, "logits/chosen": -2.73919677734375, "logits/rejected": -2.4225566387176514, "logps/chosen": -83.43978118896484, "logps/rejected": -117.8458023071289, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 0.3199536204338074, "rewards/margins": 25.448444366455078, "rewards/rejected": -25.128490447998047, "step": 3700 }, { "epoch": 1.69, "eval_logits/chosen": -2.4147017002105713, "eval_logits/rejected": -2.1300089359283447, "eval_logps/chosen": -82.11788940429688, "eval_logps/rejected": -112.50545501708984, "eval_loss": 0.008617809973657131, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.03392898291349411, "eval_rewards/margins": 25.056001663208008, "eval_rewards/rejected": -25.02207374572754, "eval_runtime": 186.2686, "eval_samples_per_second": 15.365, "eval_steps_per_second": 0.961, "step": 3700 }, { "epoch": 1.69, "learning_rate": 1.4510400811770675e-07, "logits/chosen": -2.7499756813049316, "logits/rejected": -2.3218953609466553, "logps/chosen": -94.61308288574219, "logps/rejected": -115.5009536743164, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.36248910427093506, "rewards/margins": 25.429086685180664, "rewards/rejected": -25.06659698486328, "step": 3710 }, { "epoch": 1.7, "learning_rate": 1.445966514459665e-07, "logits/chosen": -2.732138156890869, "logits/rejected": -2.4120376110076904, "logps/chosen": -80.81584930419922, "logps/rejected": -113.46417236328125, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 0.12681210041046143, "rewards/margins": 24.83046531677246, "rewards/rejected": -24.70365333557129, "step": 3720 }, { "epoch": 1.7, "learning_rate": 1.4408929477422628e-07, "logits/chosen": -2.7351951599121094, "logits/rejected": -2.429306745529175, "logps/chosen": -86.49324798583984, "logps/rejected": -115.26017761230469, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 0.4942714273929596, "rewards/margins": 25.894994735717773, "rewards/rejected": -25.400720596313477, "step": 3730 }, { "epoch": 1.71, "learning_rate": 1.4358193810248604e-07, "logits/chosen": -2.6957874298095703, "logits/rejected": -2.3793578147888184, "logps/chosen": -81.92807006835938, "logps/rejected": -114.77325439453125, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 1.3060009479522705, "rewards/margins": 26.99686050415039, "rewards/rejected": -25.690860748291016, "step": 3740 }, { "epoch": 1.71, "learning_rate": 1.430745814307458e-07, "logits/chosen": -2.7312753200531006, "logits/rejected": -2.3709731101989746, "logps/chosen": -85.63764190673828, "logps/rejected": -115.82099914550781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.5038666725158691, "rewards/margins": 24.66942596435547, "rewards/rejected": -25.173294067382812, "step": 3750 }, { "epoch": 1.72, "learning_rate": 1.4256722475900558e-07, "logits/chosen": -2.738646984100342, "logits/rejected": -2.3933870792388916, "logps/chosen": -82.91041564941406, "logps/rejected": -115.7519760131836, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.2839534878730774, "rewards/margins": 25.090396881103516, "rewards/rejected": -25.37434959411621, "step": 3760 }, { "epoch": 1.72, "learning_rate": 1.4205986808726534e-07, "logits/chosen": -2.7764086723327637, "logits/rejected": -2.4644694328308105, "logps/chosen": -82.0972671508789, "logps/rejected": -121.90316009521484, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 0.04370160028338432, "rewards/margins": 26.28451156616211, "rewards/rejected": -26.240808486938477, "step": 3770 }, { "epoch": 1.73, "learning_rate": 1.415525114155251e-07, "logits/chosen": -2.7368228435516357, "logits/rejected": -2.3885326385498047, "logps/chosen": -90.67134094238281, "logps/rejected": -119.10804748535156, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.47617894411087036, "rewards/margins": 25.85808753967285, "rewards/rejected": -26.334264755249023, "step": 3780 }, { "epoch": 1.73, "learning_rate": 1.4104515474378488e-07, "logits/chosen": -2.72708797454834, "logits/rejected": -2.397061824798584, "logps/chosen": -90.2813949584961, "logps/rejected": -113.62602233886719, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -0.7463950514793396, "rewards/margins": 24.796037673950195, "rewards/rejected": -25.54243278503418, "step": 3790 }, { "epoch": 1.73, "learning_rate": 1.4053779807204464e-07, "logits/chosen": -2.716977596282959, "logits/rejected": -2.3923897743225098, "logps/chosen": -84.97366333007812, "logps/rejected": -108.60880279541016, "loss": 0.0056, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.24345584213733673, "rewards/margins": 23.06013298034668, "rewards/rejected": -23.303586959838867, "step": 3800 }, { "epoch": 1.73, "eval_logits/chosen": -2.398029088973999, "eval_logits/rejected": -2.1139631271362305, "eval_logps/chosen": -81.82569885253906, "eval_logps/rejected": -109.69595336914062, "eval_loss": 0.008213773369789124, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.18002404272556305, "eval_rewards/margins": 23.797351837158203, "eval_rewards/rejected": -23.617328643798828, "eval_runtime": 180.803, "eval_samples_per_second": 15.829, "eval_steps_per_second": 0.99, "step": 3800 }, { "epoch": 1.74, "learning_rate": 1.400304414003044e-07, "logits/chosen": -2.7162327766418457, "logits/rejected": -2.4210762977600098, "logps/chosen": -82.35417175292969, "logps/rejected": -113.5506820678711, "loss": 0.0011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.34166446328163147, "rewards/margins": 23.69597625732422, "rewards/rejected": -24.037639617919922, "step": 3810 }, { "epoch": 1.74, "learning_rate": 1.3952308472856418e-07, "logits/chosen": -2.723470449447632, "logits/rejected": -2.395782947540283, "logps/chosen": -81.69819641113281, "logps/rejected": -107.86186218261719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.37113767862319946, "rewards/margins": 24.45778465270996, "rewards/rejected": -24.08664894104004, "step": 3820 }, { "epoch": 1.75, "learning_rate": 1.3901572805682394e-07, "logits/chosen": -2.7099733352661133, "logits/rejected": -2.358590841293335, "logps/chosen": -86.41353607177734, "logps/rejected": -113.5186767578125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 0.8023043870925903, "rewards/margins": 24.44386863708496, "rewards/rejected": -23.641565322875977, "step": 3830 }, { "epoch": 1.75, "learning_rate": 1.385083713850837e-07, "logits/chosen": -2.721942901611328, "logits/rejected": -2.34991455078125, "logps/chosen": -85.73603057861328, "logps/rejected": -111.8265151977539, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 0.6682132482528687, "rewards/margins": 25.99078369140625, "rewards/rejected": -25.322568893432617, "step": 3840 }, { "epoch": 1.76, "learning_rate": 1.3800101471334348e-07, "logits/chosen": -2.740370273590088, "logits/rejected": -2.3777966499328613, "logps/chosen": -86.99876403808594, "logps/rejected": -106.84332275390625, "loss": 0.0057, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.9587116241455078, "rewards/margins": 22.486181259155273, "rewards/rejected": -23.444894790649414, "step": 3850 }, { "epoch": 1.76, "learning_rate": 1.3749365804160324e-07, "logits/chosen": -2.7081170082092285, "logits/rejected": -2.363821506500244, "logps/chosen": -87.59700775146484, "logps/rejected": -111.73420715332031, "loss": 0.0031, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.16735783219337463, "rewards/margins": 23.411279678344727, "rewards/rejected": -23.243919372558594, "step": 3860 }, { "epoch": 1.77, "learning_rate": 1.36986301369863e-07, "logits/chosen": -2.71311616897583, "logits/rejected": -2.404430866241455, "logps/chosen": -84.49826049804688, "logps/rejected": -113.91236877441406, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.657957911491394, "rewards/margins": 25.05248260498047, "rewards/rejected": -24.3945255279541, "step": 3870 }, { "epoch": 1.77, "learning_rate": 1.3647894469812278e-07, "logits/chosen": -2.710131883621216, "logits/rejected": -2.355940341949463, "logps/chosen": -84.90182495117188, "logps/rejected": -112.59244537353516, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.5880377292633057, "rewards/margins": 25.815826416015625, "rewards/rejected": -24.227792739868164, "step": 3880 }, { "epoch": 1.78, "learning_rate": 1.3597158802638254e-07, "logits/chosen": -2.7137367725372314, "logits/rejected": -2.386852741241455, "logps/chosen": -82.02251434326172, "logps/rejected": -115.30233001708984, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.25805777311325073, "rewards/margins": 24.0673770904541, "rewards/rejected": -24.3254337310791, "step": 3890 }, { "epoch": 1.78, "learning_rate": 1.354642313546423e-07, "logits/chosen": -2.704136848449707, "logits/rejected": -2.379242420196533, "logps/chosen": -82.89125061035156, "logps/rejected": -110.4068374633789, "loss": 0.0026, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.22567126154899597, "rewards/margins": 24.661211013793945, "rewards/rejected": -24.43553924560547, "step": 3900 }, { "epoch": 1.78, "eval_logits/chosen": -2.396462917327881, "eval_logits/rejected": -2.1140124797821045, "eval_logps/chosen": -82.25261688232422, "eval_logps/rejected": -111.67327880859375, "eval_loss": 0.008316286839544773, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": -0.03344082832336426, "eval_rewards/margins": 24.57254981994629, "eval_rewards/rejected": -24.605993270874023, "eval_runtime": 175.3912, "eval_samples_per_second": 16.318, "eval_steps_per_second": 1.021, "step": 3900 }, { "epoch": 1.78, "learning_rate": 1.3495687468290208e-07, "logits/chosen": -2.7211337089538574, "logits/rejected": -2.3387482166290283, "logps/chosen": -85.39901733398438, "logps/rejected": -111.7140121459961, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 0.26162949204444885, "rewards/margins": 25.03203582763672, "rewards/rejected": -24.770404815673828, "step": 3910 }, { "epoch": 1.79, "learning_rate": 1.3444951801116184e-07, "logits/chosen": -2.73215913772583, "logits/rejected": -2.3686413764953613, "logps/chosen": -84.76148223876953, "logps/rejected": -112.22846984863281, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 0.05043959617614746, "rewards/margins": 25.310312271118164, "rewards/rejected": -25.259872436523438, "step": 3920 }, { "epoch": 1.79, "learning_rate": 1.339421613394216e-07, "logits/chosen": -2.7401113510131836, "logits/rejected": -2.408447265625, "logps/chosen": -86.01676940917969, "logps/rejected": -120.08952331542969, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.7376105189323425, "rewards/margins": 25.01788330078125, "rewards/rejected": -25.7554931640625, "step": 3930 }, { "epoch": 1.8, "learning_rate": 1.3343480466768138e-07, "logits/chosen": -2.715308666229248, "logits/rejected": -2.3758597373962402, "logps/chosen": -81.56092834472656, "logps/rejected": -111.95904541015625, "loss": 0.0064, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2127654254436493, "rewards/margins": 24.776548385620117, "rewards/rejected": -24.98931312561035, "step": 3940 }, { "epoch": 1.8, "learning_rate": 1.3292744799594114e-07, "logits/chosen": -2.7253241539001465, "logits/rejected": -2.405219316482544, "logps/chosen": -81.47698974609375, "logps/rejected": -114.91780853271484, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.4850035607814789, "rewards/margins": 25.682281494140625, "rewards/rejected": -25.197280883789062, "step": 3950 }, { "epoch": 1.81, "learning_rate": 1.324200913242009e-07, "logits/chosen": -2.7008957862854004, "logits/rejected": -2.3714041709899902, "logps/chosen": -81.98283386230469, "logps/rejected": -116.29072570800781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.33132046461105347, "rewards/margins": 25.64438819885254, "rewards/rejected": -25.9757080078125, "step": 3960 }, { "epoch": 1.81, "learning_rate": 1.3191273465246068e-07, "logits/chosen": -2.707296848297119, "logits/rejected": -2.4201998710632324, "logps/chosen": -79.35313415527344, "logps/rejected": -116.59090423583984, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -0.5824056267738342, "rewards/margins": 24.454814910888672, "rewards/rejected": -25.037221908569336, "step": 3970 }, { "epoch": 1.82, "learning_rate": 1.3140537798072044e-07, "logits/chosen": -2.729668140411377, "logits/rejected": -2.398587703704834, "logps/chosen": -81.1919937133789, "logps/rejected": -104.11589050292969, "loss": 0.0047, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20951545238494873, "rewards/margins": 22.610660552978516, "rewards/rejected": -22.401142120361328, "step": 3980 }, { "epoch": 1.82, "learning_rate": 1.308980213089802e-07, "logits/chosen": -2.7305374145507812, "logits/rejected": -2.395145893096924, "logps/chosen": -81.86991119384766, "logps/rejected": -109.388427734375, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.5873571038246155, "rewards/margins": 23.17819595336914, "rewards/rejected": -22.590839385986328, "step": 3990 }, { "epoch": 1.83, "learning_rate": 1.3039066463723998e-07, "logits/chosen": -2.7487363815307617, "logits/rejected": -2.361471652984619, "logps/chosen": -85.89412689208984, "logps/rejected": -110.9847183227539, "loss": 0.0036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.1719265878200531, "rewards/margins": 23.728368759155273, "rewards/rejected": -23.90029525756836, "step": 4000 }, { "epoch": 1.83, "eval_logits/chosen": -2.4166691303253174, "eval_logits/rejected": -2.134845733642578, "eval_logps/chosen": -82.68785095214844, "eval_logps/rejected": -108.54793548583984, "eval_loss": 0.008006688207387924, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.2510567009449005, "eval_rewards/margins": 22.79225730895996, "eval_rewards/rejected": -23.043312072753906, "eval_runtime": 193.8805, "eval_samples_per_second": 14.762, "eval_steps_per_second": 0.923, "step": 4000 }, { "epoch": 1.83, "learning_rate": 1.2988330796549974e-07, "logits/chosen": -2.7544474601745605, "logits/rejected": -2.412229299545288, "logps/chosen": -83.78072357177734, "logps/rejected": -109.46234130859375, "loss": 0.0065, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.264910340309143, "rewards/margins": 22.25986099243164, "rewards/rejected": -23.52477264404297, "step": 4010 }, { "epoch": 1.83, "learning_rate": 1.293759512937595e-07, "logits/chosen": -2.728762149810791, "logits/rejected": -2.433173418045044, "logps/chosen": -85.98408508300781, "logps/rejected": -112.25279235839844, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.8340870141983032, "rewards/margins": 22.182353973388672, "rewards/rejected": -23.01643943786621, "step": 4020 }, { "epoch": 1.84, "learning_rate": 1.2886859462201928e-07, "logits/chosen": -2.7292661666870117, "logits/rejected": -2.426002025604248, "logps/chosen": -83.33248138427734, "logps/rejected": -112.56217956542969, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.4581160545349121, "rewards/margins": 22.68105697631836, "rewards/rejected": -23.139171600341797, "step": 4030 }, { "epoch": 1.84, "learning_rate": 1.2836123795027904e-07, "logits/chosen": -2.7262182235717773, "logits/rejected": -2.4424657821655273, "logps/chosen": -79.58126831054688, "logps/rejected": -109.45216369628906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.3296700119972229, "rewards/margins": 22.773860931396484, "rewards/rejected": -23.10352897644043, "step": 4040 }, { "epoch": 1.85, "learning_rate": 1.278538812785388e-07, "logits/chosen": -2.739739179611206, "logits/rejected": -2.3942644596099854, "logps/chosen": -90.85140228271484, "logps/rejected": -115.15174865722656, "loss": 0.0049, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3278205990791321, "rewards/margins": 22.987062454223633, "rewards/rejected": -23.31488037109375, "step": 4050 }, { "epoch": 1.85, "learning_rate": 1.2734652460679858e-07, "logits/chosen": -2.7618906497955322, "logits/rejected": -2.429367780685425, "logps/chosen": -90.33531951904297, "logps/rejected": -113.79902648925781, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.39940521121025085, "rewards/margins": 22.951370239257812, "rewards/rejected": -23.35077476501465, "step": 4060 }, { "epoch": 1.86, "learning_rate": 1.2683916793505834e-07, "logits/chosen": -2.7132468223571777, "logits/rejected": -2.434654712677002, "logps/chosen": -80.01641082763672, "logps/rejected": -113.34393310546875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.4199061393737793, "rewards/margins": 23.317398071289062, "rewards/rejected": -23.737302780151367, "step": 4070 }, { "epoch": 1.86, "learning_rate": 1.263318112633181e-07, "logits/chosen": -2.7661635875701904, "logits/rejected": -2.4105448722839355, "logps/chosen": -87.9837646484375, "logps/rejected": -107.83976745605469, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3711378574371338, "rewards/margins": 21.41225242614746, "rewards/rejected": -21.783390045166016, "step": 4080 }, { "epoch": 1.87, "learning_rate": 1.2582445459157788e-07, "logits/chosen": -2.731945514678955, "logits/rejected": -2.368818759918213, "logps/chosen": -83.7499771118164, "logps/rejected": -113.18292236328125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.0069966791197657585, "rewards/margins": 23.775638580322266, "rewards/rejected": -23.76864242553711, "step": 4090 }, { "epoch": 1.87, "learning_rate": 1.2531709791983764e-07, "logits/chosen": -2.740063428878784, "logits/rejected": -2.3751094341278076, "logps/chosen": -85.87584686279297, "logps/rejected": -115.41978454589844, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19453079998493195, "rewards/margins": 24.491985321044922, "rewards/rejected": -24.297454833984375, "step": 4100 }, { "epoch": 1.87, "eval_logits/chosen": -2.4159913063049316, "eval_logits/rejected": -2.1314282417297363, "eval_logps/chosen": -83.03763580322266, "eval_logps/rejected": -110.02344512939453, "eval_loss": 0.00836887490004301, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.42594853043556213, "eval_rewards/margins": 23.355125427246094, "eval_rewards/rejected": -23.781070709228516, "eval_runtime": 183.6698, "eval_samples_per_second": 15.582, "eval_steps_per_second": 0.975, "step": 4100 }, { "epoch": 1.88, "learning_rate": 1.248097412480974e-07, "logits/chosen": -2.746269702911377, "logits/rejected": -2.370673894882202, "logps/chosen": -90.10121154785156, "logps/rejected": -113.69964599609375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.15433010458946228, "rewards/margins": 23.741132736206055, "rewards/rejected": -23.895463943481445, "step": 4110 }, { "epoch": 1.88, "learning_rate": 1.2430238457635718e-07, "logits/chosen": -2.7529842853546143, "logits/rejected": -2.433593273162842, "logps/chosen": -83.6185073852539, "logps/rejected": -114.53228759765625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.2069069147109985, "rewards/margins": 23.649782180786133, "rewards/rejected": -24.856691360473633, "step": 4120 }, { "epoch": 1.89, "learning_rate": 1.2379502790461694e-07, "logits/chosen": -2.7312519550323486, "logits/rejected": -2.3650736808776855, "logps/chosen": -82.35449981689453, "logps/rejected": -109.23161315917969, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -1.0486968755722046, "rewards/margins": 23.71874237060547, "rewards/rejected": -24.767438888549805, "step": 4130 }, { "epoch": 1.89, "learning_rate": 1.232876712328767e-07, "logits/chosen": -2.757113218307495, "logits/rejected": -2.3804173469543457, "logps/chosen": -86.58125305175781, "logps/rejected": -113.01507568359375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.756896436214447, "rewards/margins": 23.720684051513672, "rewards/rejected": -24.47757911682129, "step": 4140 }, { "epoch": 1.89, "learning_rate": 1.2278031456113648e-07, "logits/chosen": -2.759451389312744, "logits/rejected": -2.3394014835357666, "logps/chosen": -93.83252716064453, "logps/rejected": -112.074462890625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.07218992710113525, "rewards/margins": 24.163928985595703, "rewards/rejected": -24.23611831665039, "step": 4150 }, { "epoch": 1.9, "learning_rate": 1.2227295788939624e-07, "logits/chosen": -2.751781940460205, "logits/rejected": -2.3958847522735596, "logps/chosen": -88.75358581542969, "logps/rejected": -116.1768798828125, "loss": 0.0024, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7892154455184937, "rewards/margins": 24.37813949584961, "rewards/rejected": -25.167354583740234, "step": 4160 }, { "epoch": 1.9, "learning_rate": 1.21765601217656e-07, "logits/chosen": -2.7410354614257812, "logits/rejected": -2.456022024154663, "logps/chosen": -82.11006164550781, "logps/rejected": -111.1407470703125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.7517865300178528, "rewards/margins": 23.780141830444336, "rewards/rejected": -24.53192901611328, "step": 4170 }, { "epoch": 1.91, "learning_rate": 1.2125824454591578e-07, "logits/chosen": -2.747929096221924, "logits/rejected": -2.4334909915924072, "logps/chosen": -87.79632568359375, "logps/rejected": -117.29313659667969, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -0.556801438331604, "rewards/margins": 24.36429214477539, "rewards/rejected": -24.921092987060547, "step": 4180 }, { "epoch": 1.91, "learning_rate": 1.2075088787417554e-07, "logits/chosen": -2.7340826988220215, "logits/rejected": -2.413975954055786, "logps/chosen": -81.87889099121094, "logps/rejected": -109.2918701171875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.025557613000273705, "rewards/margins": 23.915842056274414, "rewards/rejected": -23.941396713256836, "step": 4190 }, { "epoch": 1.92, "learning_rate": 1.202435312024353e-07, "logits/chosen": -2.7550766468048096, "logits/rejected": -2.456955909729004, "logps/chosen": -88.68054962158203, "logps/rejected": -111.10501861572266, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.5104751586914062, "rewards/margins": 23.033428192138672, "rewards/rejected": -24.543903350830078, "step": 4200 }, { "epoch": 1.92, "eval_logits/chosen": -2.4196181297302246, "eval_logits/rejected": -2.1368820667266846, "eval_logps/chosen": -83.32768249511719, "eval_logps/rejected": -108.93321990966797, "eval_loss": 0.008334940299391747, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": -0.5709689259529114, "eval_rewards/margins": 22.664993286132812, "eval_rewards/rejected": -23.2359619140625, "eval_runtime": 226.3807, "eval_samples_per_second": 12.642, "eval_steps_per_second": 0.791, "step": 4200 }, { "epoch": 1.92, "learning_rate": 1.1973617453069508e-07, "logits/chosen": -2.7501883506774902, "logits/rejected": -2.384361982345581, "logps/chosen": -83.97435760498047, "logps/rejected": -110.06040954589844, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 0.2817421853542328, "rewards/margins": 24.425376892089844, "rewards/rejected": -24.143634796142578, "step": 4210 }, { "epoch": 1.93, "learning_rate": 1.1922881785895484e-07, "logits/chosen": -2.7327611446380615, "logits/rejected": -2.4343745708465576, "logps/chosen": -85.28137969970703, "logps/rejected": -114.49162292480469, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.8153437376022339, "rewards/margins": 22.84836196899414, "rewards/rejected": -23.663707733154297, "step": 4220 }, { "epoch": 1.93, "learning_rate": 1.187214611872146e-07, "logits/chosen": -2.7376768589019775, "logits/rejected": -2.424269914627075, "logps/chosen": -87.45048522949219, "logps/rejected": -117.5290298461914, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.4922616481781006, "rewards/margins": 24.17519187927246, "rewards/rejected": -24.667457580566406, "step": 4230 }, { "epoch": 1.94, "learning_rate": 1.1821410451547436e-07, "logits/chosen": -2.7516493797302246, "logits/rejected": -2.460716724395752, "logps/chosen": -78.94384765625, "logps/rejected": -112.85262298583984, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 0.15330730378627777, "rewards/margins": 23.479642868041992, "rewards/rejected": -23.326335906982422, "step": 4240 }, { "epoch": 1.94, "learning_rate": 1.1770674784373413e-07, "logits/chosen": -2.7155966758728027, "logits/rejected": -2.4863228797912598, "logps/chosen": -80.0714340209961, "logps/rejected": -113.4439926147461, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.572272539138794, "rewards/margins": 22.423776626586914, "rewards/rejected": -23.99604606628418, "step": 4250 }, { "epoch": 1.94, "learning_rate": 1.171993911719939e-07, "logits/chosen": -2.7354233264923096, "logits/rejected": -2.4438700675964355, "logps/chosen": -81.00538635253906, "logps/rejected": -116.08367919921875, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -1.1464935541152954, "rewards/margins": 24.15064239501953, "rewards/rejected": -25.297134399414062, "step": 4260 }, { "epoch": 1.95, "learning_rate": 1.1669203450025366e-07, "logits/chosen": -2.7367005348205566, "logits/rejected": -2.4384920597076416, "logps/chosen": -82.76194763183594, "logps/rejected": -112.2519760131836, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -0.09042773395776749, "rewards/margins": 24.201007843017578, "rewards/rejected": -24.29143714904785, "step": 4270 }, { "epoch": 1.95, "learning_rate": 1.1618467782851343e-07, "logits/chosen": -2.720911741256714, "logits/rejected": -2.392549514770508, "logps/chosen": -86.5931625366211, "logps/rejected": -115.57108306884766, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.4980667233467102, "rewards/margins": 24.22681427001953, "rewards/rejected": -24.72488021850586, "step": 4280 }, { "epoch": 1.96, "learning_rate": 1.156773211567732e-07, "logits/chosen": -2.734757423400879, "logits/rejected": -2.3672633171081543, "logps/chosen": -82.19950103759766, "logps/rejected": -109.08793640136719, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.6968777775764465, "rewards/margins": 23.608652114868164, "rewards/rejected": -24.305530548095703, "step": 4290 }, { "epoch": 1.96, "learning_rate": 1.1516996448503296e-07, "logits/chosen": -2.724691390991211, "logits/rejected": -2.3827404975891113, "logps/chosen": -84.1601791381836, "logps/rejected": -118.51438903808594, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.06279841810464859, "rewards/margins": 25.268766403198242, "rewards/rejected": -25.331567764282227, "step": 4300 }, { "epoch": 1.96, "eval_logits/chosen": -2.410888195037842, "eval_logits/rejected": -2.130680799484253, "eval_logps/chosen": -83.45832061767578, "eval_logps/rejected": -111.75601196289062, "eval_loss": 0.008501616306602955, "eval_rewards/accuracies": 0.9972066879272461, "eval_rewards/chosen": -0.6362941265106201, "eval_rewards/margins": 24.011058807373047, "eval_rewards/rejected": -24.647354125976562, "eval_runtime": 202.3406, "eval_samples_per_second": 14.144, "eval_steps_per_second": 0.885, "step": 4300 }, { "epoch": 1.97, "learning_rate": 1.1466260781329273e-07, "logits/chosen": -2.7313926219940186, "logits/rejected": -2.417076826095581, "logps/chosen": -83.44046020507812, "logps/rejected": -115.34992980957031, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9900857210159302, "rewards/margins": 23.526927947998047, "rewards/rejected": -24.517013549804688, "step": 4310 }, { "epoch": 1.97, "learning_rate": 1.141552511415525e-07, "logits/chosen": -2.7361061573028564, "logits/rejected": -2.3558712005615234, "logps/chosen": -86.88752746582031, "logps/rejected": -111.52519226074219, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 0.09841351211071014, "rewards/margins": 24.383689880371094, "rewards/rejected": -24.285276412963867, "step": 4320 }, { "epoch": 1.98, "learning_rate": 1.1364789446981226e-07, "logits/chosen": -2.730456590652466, "logits/rejected": -2.4067349433898926, "logps/chosen": -90.84781646728516, "logps/rejected": -115.67729187011719, "loss": 0.0045, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2065244913101196, "rewards/margins": 24.192974090576172, "rewards/rejected": -25.399499893188477, "step": 4330 }, { "epoch": 1.98, "learning_rate": 1.1314053779807203e-07, "logits/chosen": -2.7302258014678955, "logits/rejected": -2.410287380218506, "logps/chosen": -79.2750244140625, "logps/rejected": -108.2707748413086, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.48253995180130005, "rewards/margins": 22.766185760498047, "rewards/rejected": -23.24872589111328, "step": 4340 }, { "epoch": 1.99, "learning_rate": 1.126331811263318e-07, "logits/chosen": -2.7434325218200684, "logits/rejected": -2.3884055614471436, "logps/chosen": -89.85121154785156, "logps/rejected": -113.74176025390625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.6823492646217346, "rewards/margins": 23.525726318359375, "rewards/rejected": -24.20807456970215, "step": 4350 }, { "epoch": 1.99, "learning_rate": 1.1212582445459156e-07, "logits/chosen": -2.7460639476776123, "logits/rejected": -2.3658998012542725, "logps/chosen": -90.60530853271484, "logps/rejected": -115.68277740478516, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.7538153529167175, "rewards/margins": 24.32625389099121, "rewards/rejected": -25.080068588256836, "step": 4360 }, { "epoch": 1.99, "learning_rate": 1.1161846778285133e-07, "logits/chosen": -2.735424518585205, "logits/rejected": -2.4141480922698975, "logps/chosen": -81.69535064697266, "logps/rejected": -115.5906982421875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 0.10660628974437714, "rewards/margins": 24.344066619873047, "rewards/rejected": -24.23746109008789, "step": 4370 }, { "epoch": 2.0, "learning_rate": 1.111111111111111e-07, "logits/chosen": -2.735525131225586, "logits/rejected": -2.3597075939178467, "logps/chosen": -85.69715881347656, "logps/rejected": -114.16136169433594, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.14449606835842133, "rewards/margins": 25.308826446533203, "rewards/rejected": -25.16432762145996, "step": 4380 }, { "epoch": 2.0, "learning_rate": 1.1060375443937086e-07, "logits/chosen": -2.7272896766662598, "logits/rejected": -2.364004611968994, "logps/chosen": -85.98393249511719, "logps/rejected": -111.15299987792969, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.5045382976531982, "rewards/margins": 23.96527671813965, "rewards/rejected": -24.469816207885742, "step": 4390 }, { "epoch": 2.01, "learning_rate": 1.1009639776763063e-07, "logits/chosen": -2.7307615280151367, "logits/rejected": -2.455038070678711, "logps/chosen": -83.47547912597656, "logps/rejected": -116.3271484375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.026294182986021042, "rewards/margins": 24.239322662353516, "rewards/rejected": -24.2656192779541, "step": 4400 }, { "epoch": 2.01, "eval_logits/chosen": -2.413396120071411, "eval_logits/rejected": -2.13217830657959, "eval_logps/chosen": -83.41236114501953, "eval_logps/rejected": -112.35968017578125, "eval_loss": 0.008528295904397964, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.6133103370666504, "eval_rewards/margins": 24.335878372192383, "eval_rewards/rejected": -24.949186325073242, "eval_runtime": 170.3415, "eval_samples_per_second": 16.802, "eval_steps_per_second": 1.051, "step": 4400 }, { "epoch": 2.01, "learning_rate": 1.095890410958904e-07, "logits/chosen": -2.7199060916900635, "logits/rejected": -2.4221317768096924, "logps/chosen": -84.6057357788086, "logps/rejected": -120.41743469238281, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0734550952911377, "rewards/margins": 23.737314224243164, "rewards/rejected": -24.81077003479004, "step": 4410 }, { "epoch": 2.02, "learning_rate": 1.0908168442415016e-07, "logits/chosen": -2.741459608078003, "logits/rejected": -2.4125568866729736, "logps/chosen": -82.20728302001953, "logps/rejected": -114.6263427734375, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -0.7107439041137695, "rewards/margins": 24.820287704467773, "rewards/rejected": -25.531028747558594, "step": 4420 }, { "epoch": 2.02, "learning_rate": 1.0857432775240993e-07, "logits/chosen": -2.7324230670928955, "logits/rejected": -2.44114089012146, "logps/chosen": -83.89222717285156, "logps/rejected": -117.7229232788086, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.0631663799285889, "rewards/margins": 24.403308868408203, "rewards/rejected": -25.466474533081055, "step": 4430 }, { "epoch": 2.03, "learning_rate": 1.080669710806697e-07, "logits/chosen": -2.727234363555908, "logits/rejected": -2.3752944469451904, "logps/chosen": -82.16426849365234, "logps/rejected": -113.43917083740234, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.02631220780313015, "rewards/margins": 25.634429931640625, "rewards/rejected": -25.660741806030273, "step": 4440 }, { "epoch": 2.03, "learning_rate": 1.0755961440892946e-07, "logits/chosen": -2.735023260116577, "logits/rejected": -2.4273390769958496, "logps/chosen": -82.823486328125, "logps/rejected": -117.38553619384766, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -1.0146194696426392, "rewards/margins": 25.19106674194336, "rewards/rejected": -26.205684661865234, "step": 4450 }, { "epoch": 2.04, "learning_rate": 1.0705225773718923e-07, "logits/chosen": -2.7084691524505615, "logits/rejected": -2.4082369804382324, "logps/chosen": -84.9083023071289, "logps/rejected": -115.2505111694336, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.2864501476287842, "rewards/margins": 23.769474029541016, "rewards/rejected": -25.055923461914062, "step": 4460 }, { "epoch": 2.04, "learning_rate": 1.06544901065449e-07, "logits/chosen": -2.7284061908721924, "logits/rejected": -2.4069182872772217, "logps/chosen": -89.87139129638672, "logps/rejected": -113.24871826171875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.03114454820752144, "rewards/margins": 24.267412185668945, "rewards/rejected": -24.29855728149414, "step": 4470 }, { "epoch": 2.04, "learning_rate": 1.0603754439370876e-07, "logits/chosen": -2.7637999057769775, "logits/rejected": -2.3503127098083496, "logps/chosen": -87.11588287353516, "logps/rejected": -115.85011291503906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.3462512195110321, "rewards/margins": 25.407480239868164, "rewards/rejected": -25.061227798461914, "step": 4480 }, { "epoch": 2.05, "learning_rate": 1.0553018772196853e-07, "logits/chosen": -2.7481796741485596, "logits/rejected": -2.4183032512664795, "logps/chosen": -83.23652648925781, "logps/rejected": -114.95933532714844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.1508525162935257, "rewards/margins": 24.560148239135742, "rewards/rejected": -24.71099853515625, "step": 4490 }, { "epoch": 2.05, "learning_rate": 1.050228310502283e-07, "logits/chosen": -2.7153160572052, "logits/rejected": -2.425497531890869, "logps/chosen": -82.66905975341797, "logps/rejected": -114.1065444946289, "loss": 0.0033, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0852515697479248, "rewards/margins": 23.62014389038086, "rewards/rejected": -24.705394744873047, "step": 4500 }, { "epoch": 2.05, "eval_logits/chosen": -2.4142351150512695, "eval_logits/rejected": -2.1326496601104736, "eval_logps/chosen": -83.60591125488281, "eval_logps/rejected": -113.47210693359375, "eval_loss": 0.008534280583262444, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.7100857496261597, "eval_rewards/margins": 24.795320510864258, "eval_rewards/rejected": -25.505407333374023, "eval_runtime": 249.6733, "eval_samples_per_second": 11.463, "eval_steps_per_second": 0.717, "step": 4500 }, { "epoch": 2.06, "learning_rate": 1.0451547437848806e-07, "logits/chosen": -2.753589153289795, "logits/rejected": -2.38529634475708, "logps/chosen": -87.801025390625, "logps/rejected": -114.15721130371094, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.4556310176849365, "rewards/margins": 25.235393524169922, "rewards/rejected": -25.691024780273438, "step": 4510 }, { "epoch": 2.06, "learning_rate": 1.0400811770674783e-07, "logits/chosen": -2.733555793762207, "logits/rejected": -2.4012415409088135, "logps/chosen": -77.68000793457031, "logps/rejected": -114.9632339477539, "loss": 0.0065, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7152536511421204, "rewards/margins": 25.809295654296875, "rewards/rejected": -26.524547576904297, "step": 4520 }, { "epoch": 2.07, "learning_rate": 1.035007610350076e-07, "logits/chosen": -2.742079257965088, "logits/rejected": -2.423189878463745, "logps/chosen": -86.51893615722656, "logps/rejected": -115.84825134277344, "loss": 0.0043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.43903386592865, "rewards/margins": 24.648290634155273, "rewards/rejected": -26.087322235107422, "step": 4530 }, { "epoch": 2.07, "learning_rate": 1.0299340436326736e-07, "logits/chosen": -2.7405965328216553, "logits/rejected": -2.505030870437622, "logps/chosen": -80.73456573486328, "logps/rejected": -110.7133560180664, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 0.08044181019067764, "rewards/margins": 24.27283477783203, "rewards/rejected": -24.192392349243164, "step": 4540 }, { "epoch": 2.08, "learning_rate": 1.0248604769152713e-07, "logits/chosen": -2.719449520111084, "logits/rejected": -2.443070411682129, "logps/chosen": -82.0495376586914, "logps/rejected": -117.40000915527344, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -1.1909353733062744, "rewards/margins": 24.419300079345703, "rewards/rejected": -25.6102352142334, "step": 4550 }, { "epoch": 2.08, "learning_rate": 1.019786910197869e-07, "logits/chosen": -2.7304117679595947, "logits/rejected": -2.436112403869629, "logps/chosen": -85.12984466552734, "logps/rejected": -112.81459045410156, "loss": 0.0061, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.6323047876358032, "rewards/margins": 23.803577423095703, "rewards/rejected": -25.435882568359375, "step": 4560 }, { "epoch": 2.09, "learning_rate": 1.0147133434804666e-07, "logits/chosen": -2.7492613792419434, "logits/rejected": -2.3745036125183105, "logps/chosen": -87.1934585571289, "logps/rejected": -114.2850570678711, "loss": 0.0054, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.26164981722831726, "rewards/margins": 25.54134178161621, "rewards/rejected": -25.802989959716797, "step": 4570 }, { "epoch": 2.09, "learning_rate": 1.0096397767630643e-07, "logits/chosen": -2.7224347591400146, "logits/rejected": -2.3461365699768066, "logps/chosen": -87.7784652709961, "logps/rejected": -116.38435363769531, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.5094483494758606, "rewards/margins": 24.91290283203125, "rewards/rejected": -25.422351837158203, "step": 4580 }, { "epoch": 2.1, "learning_rate": 1.004566210045662e-07, "logits/chosen": -2.7191128730773926, "logits/rejected": -2.417572498321533, "logps/chosen": -82.06591033935547, "logps/rejected": -111.7408447265625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.2713128328323364, "rewards/margins": 24.137386322021484, "rewards/rejected": -23.866071701049805, "step": 4590 }, { "epoch": 2.1, "learning_rate": 9.994926433282596e-08, "logits/chosen": -2.7330124378204346, "logits/rejected": -2.4072132110595703, "logps/chosen": -85.74193572998047, "logps/rejected": -119.9201889038086, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.7810177206993103, "rewards/margins": 25.37158203125, "rewards/rejected": -26.152599334716797, "step": 4600 }, { "epoch": 2.1, "eval_logits/chosen": -2.4152045249938965, "eval_logits/rejected": -2.133347988128662, "eval_logps/chosen": -83.75669860839844, "eval_logps/rejected": -114.56344604492188, "eval_loss": 0.008692936971783638, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.7854794859886169, "eval_rewards/margins": 25.265592575073242, "eval_rewards/rejected": -26.051071166992188, "eval_runtime": 202.7973, "eval_samples_per_second": 14.113, "eval_steps_per_second": 0.883, "step": 4600 }, { "epoch": 2.1, "learning_rate": 9.944190766108573e-08, "logits/chosen": -2.7630772590637207, "logits/rejected": -2.426917791366577, "logps/chosen": -86.51116943359375, "logps/rejected": -113.0669937133789, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.4053497314453125, "rewards/margins": 24.80515480041504, "rewards/rejected": -25.21050453186035, "step": 4610 }, { "epoch": 2.11, "learning_rate": 9.89345509893455e-08, "logits/chosen": -2.7054784297943115, "logits/rejected": -2.440279245376587, "logps/chosen": -81.97158813476562, "logps/rejected": -118.75299072265625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.9931540489196777, "rewards/margins": 26.19051742553711, "rewards/rejected": -27.183673858642578, "step": 4620 }, { "epoch": 2.11, "learning_rate": 9.842719431760526e-08, "logits/chosen": -2.7215018272399902, "logits/rejected": -2.3590283393859863, "logps/chosen": -86.5494384765625, "logps/rejected": -119.49763488769531, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.04281427711248398, "rewards/margins": 27.71389389038086, "rewards/rejected": -27.7567081451416, "step": 4630 }, { "epoch": 2.12, "learning_rate": 9.791983764586503e-08, "logits/chosen": -2.735898494720459, "logits/rejected": -2.3761093616485596, "logps/chosen": -83.83515930175781, "logps/rejected": -110.11314392089844, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.921487033367157, "rewards/margins": 25.208492279052734, "rewards/rejected": -26.12997817993164, "step": 4640 }, { "epoch": 2.12, "learning_rate": 9.74124809741248e-08, "logits/chosen": -2.7466511726379395, "logits/rejected": -2.41642165184021, "logps/chosen": -84.6783676147461, "logps/rejected": -119.63688659667969, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -0.04230177402496338, "rewards/margins": 27.000308990478516, "rewards/rejected": -27.0426082611084, "step": 4650 }, { "epoch": 2.13, "learning_rate": 9.690512430238456e-08, "logits/chosen": -2.739722728729248, "logits/rejected": -2.3790388107299805, "logps/chosen": -81.67439270019531, "logps/rejected": -112.6825942993164, "loss": 0.0045, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.0495341531932354, "rewards/margins": 25.877033233642578, "rewards/rejected": -25.926565170288086, "step": 4660 }, { "epoch": 2.13, "learning_rate": 9.639776763064433e-08, "logits/chosen": -2.750140428543091, "logits/rejected": -2.4367566108703613, "logps/chosen": -85.68074035644531, "logps/rejected": -119.1629867553711, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.6781964302062988, "rewards/margins": 25.859704971313477, "rewards/rejected": -26.53790283203125, "step": 4670 }, { "epoch": 2.14, "learning_rate": 9.58904109589041e-08, "logits/chosen": -2.7557902336120605, "logits/rejected": -2.364943265914917, "logps/chosen": -88.01564025878906, "logps/rejected": -118.62255859375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.7701088190078735, "rewards/margins": 26.2861385345459, "rewards/rejected": -28.056243896484375, "step": 4680 }, { "epoch": 2.14, "learning_rate": 9.538305428716386e-08, "logits/chosen": -2.729992389678955, "logits/rejected": -2.419080972671509, "logps/chosen": -89.33726501464844, "logps/rejected": -116.74159240722656, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.078500509262085, "rewards/margins": 25.338275909423828, "rewards/rejected": -26.41677474975586, "step": 4690 }, { "epoch": 2.15, "learning_rate": 9.487569761542363e-08, "logits/chosen": -2.7589967250823975, "logits/rejected": -2.393528938293457, "logps/chosen": -86.66984558105469, "logps/rejected": -117.08720397949219, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.47808927297592163, "rewards/margins": 26.81856346130371, "rewards/rejected": -27.29665184020996, "step": 4700 }, { "epoch": 2.15, "eval_logits/chosen": -2.4198496341705322, "eval_logits/rejected": -2.136857509613037, "eval_logps/chosen": -83.9869613647461, "eval_logps/rejected": -115.63025665283203, "eval_loss": 0.008820492774248123, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": -0.9006128311157227, "eval_rewards/margins": 25.683860778808594, "eval_rewards/rejected": -26.584474563598633, "eval_runtime": 195.4866, "eval_samples_per_second": 14.64, "eval_steps_per_second": 0.916, "step": 4700 }, { "epoch": 2.15, "learning_rate": 9.43683409436834e-08, "logits/chosen": -2.7482564449310303, "logits/rejected": -2.455512523651123, "logps/chosen": -83.74434661865234, "logps/rejected": -122.92010498046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.45079880952835083, "rewards/margins": 27.006671905517578, "rewards/rejected": -27.457469940185547, "step": 4710 }, { "epoch": 2.15, "learning_rate": 9.386098427194316e-08, "logits/chosen": -2.7407925128936768, "logits/rejected": -2.3859658241271973, "logps/chosen": -87.83242797851562, "logps/rejected": -117.82684326171875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -1.8445281982421875, "rewards/margins": 25.436491012573242, "rewards/rejected": -27.281017303466797, "step": 4720 }, { "epoch": 2.16, "learning_rate": 9.335362760020293e-08, "logits/chosen": -2.736440658569336, "logits/rejected": -2.4071812629699707, "logps/chosen": -79.68396759033203, "logps/rejected": -114.01423645019531, "loss": 0.0056, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.3683773279190063, "rewards/margins": 25.245344161987305, "rewards/rejected": -26.613719940185547, "step": 4730 }, { "epoch": 2.16, "learning_rate": 9.28462709284627e-08, "logits/chosen": -2.737265110015869, "logits/rejected": -2.3971991539001465, "logps/chosen": -85.11085510253906, "logps/rejected": -117.37776184082031, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.14145371317863464, "rewards/margins": 27.50563621520996, "rewards/rejected": -27.647090911865234, "step": 4740 }, { "epoch": 2.17, "learning_rate": 9.233891425672246e-08, "logits/chosen": -2.7653756141662598, "logits/rejected": -2.418442487716675, "logps/chosen": -83.51417541503906, "logps/rejected": -114.92437744140625, "loss": 0.0055, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19078734517097473, "rewards/margins": 26.294750213623047, "rewards/rejected": -26.103967666625977, "step": 4750 }, { "epoch": 2.17, "learning_rate": 9.183155758498223e-08, "logits/chosen": -2.719844341278076, "logits/rejected": -2.402873992919922, "logps/chosen": -88.03221130371094, "logps/rejected": -122.08265686035156, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.19254076480865479, "rewards/margins": 27.305049896240234, "rewards/rejected": -27.49759292602539, "step": 4760 }, { "epoch": 2.18, "learning_rate": 9.1324200913242e-08, "logits/chosen": -2.7646842002868652, "logits/rejected": -2.454407215118408, "logps/chosen": -87.63697814941406, "logps/rejected": -121.50807189941406, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.0095808506011963, "rewards/margins": 25.738910675048828, "rewards/rejected": -26.748489379882812, "step": 4770 }, { "epoch": 2.18, "learning_rate": 9.081684424150176e-08, "logits/chosen": -2.7403461933135986, "logits/rejected": -2.412534236907959, "logps/chosen": -86.78933715820312, "logps/rejected": -119.14067077636719, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -0.4612332284450531, "rewards/margins": 26.460323333740234, "rewards/rejected": -26.921554565429688, "step": 4780 }, { "epoch": 2.19, "learning_rate": 9.030948756976153e-08, "logits/chosen": -2.7297046184539795, "logits/rejected": -2.333649158477783, "logps/chosen": -86.60482025146484, "logps/rejected": -117.9629898071289, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 0.09385283291339874, "rewards/margins": 27.59212875366211, "rewards/rejected": -27.498279571533203, "step": 4790 }, { "epoch": 2.19, "learning_rate": 8.98021308980213e-08, "logits/chosen": -2.753516674041748, "logits/rejected": -2.424811601638794, "logps/chosen": -85.31449127197266, "logps/rejected": -122.04701232910156, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 0.10934486240148544, "rewards/margins": 27.865203857421875, "rewards/rejected": -27.755863189697266, "step": 4800 }, { "epoch": 2.19, "eval_logits/chosen": -2.419823169708252, "eval_logits/rejected": -2.1393251419067383, "eval_logps/chosen": -83.69972229003906, "eval_logps/rejected": -116.2533187866211, "eval_loss": 0.008754607290029526, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.7569895386695862, "eval_rewards/margins": 26.139020919799805, "eval_rewards/rejected": -26.89600944519043, "eval_runtime": 186.63, "eval_samples_per_second": 15.335, "eval_steps_per_second": 0.959, "step": 4800 }, { "epoch": 2.2, "learning_rate": 8.929477422628106e-08, "logits/chosen": -2.7604269981384277, "logits/rejected": -2.3702645301818848, "logps/chosen": -92.0900650024414, "logps/rejected": -121.00146484375, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -1.0225459337234497, "rewards/margins": 25.919504165649414, "rewards/rejected": -26.942047119140625, "step": 4810 }, { "epoch": 2.2, "learning_rate": 8.878741755454083e-08, "logits/chosen": -2.741004467010498, "logits/rejected": -2.388043165206909, "logps/chosen": -90.02754211425781, "logps/rejected": -121.30998229980469, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.2571947574615479, "rewards/margins": 26.245468139648438, "rewards/rejected": -27.50266456604004, "step": 4820 }, { "epoch": 2.2, "learning_rate": 8.82800608828006e-08, "logits/chosen": -2.7629637718200684, "logits/rejected": -2.4480700492858887, "logps/chosen": -88.1716079711914, "logps/rejected": -118.70650482177734, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.8290748596191406, "rewards/margins": 27.12428855895996, "rewards/rejected": -27.9533634185791, "step": 4830 }, { "epoch": 2.21, "learning_rate": 8.777270421106036e-08, "logits/chosen": -2.7500314712524414, "logits/rejected": -2.4319820404052734, "logps/chosen": -85.62918853759766, "logps/rejected": -122.43016052246094, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.6603271961212158, "rewards/margins": 27.1922664642334, "rewards/rejected": -28.85259437561035, "step": 4840 }, { "epoch": 2.21, "learning_rate": 8.726534753932013e-08, "logits/chosen": -2.7457435131073, "logits/rejected": -2.4542102813720703, "logps/chosen": -82.3641357421875, "logps/rejected": -118.55645751953125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.4643232822418213, "rewards/margins": 25.572376251220703, "rewards/rejected": -27.036697387695312, "step": 4850 }, { "epoch": 2.22, "learning_rate": 8.67579908675799e-08, "logits/chosen": -2.733107089996338, "logits/rejected": -2.4463107585906982, "logps/chosen": -80.9885025024414, "logps/rejected": -119.35713195800781, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.0983593463897705, "rewards/margins": 26.480026245117188, "rewards/rejected": -27.578388214111328, "step": 4860 }, { "epoch": 2.22, "learning_rate": 8.625063419583966e-08, "logits/chosen": -2.7471723556518555, "logits/rejected": -2.4317822456359863, "logps/chosen": -81.436767578125, "logps/rejected": -116.1656265258789, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.5986993908882141, "rewards/margins": 26.00970458984375, "rewards/rejected": -26.6084041595459, "step": 4870 }, { "epoch": 2.23, "learning_rate": 8.574327752409943e-08, "logits/chosen": -2.730562210083008, "logits/rejected": -2.430459976196289, "logps/chosen": -80.17289733886719, "logps/rejected": -119.05906677246094, "loss": 0.0023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9607702493667603, "rewards/margins": 27.269878387451172, "rewards/rejected": -28.23065185546875, "step": 4880 }, { "epoch": 2.23, "learning_rate": 8.52359208523592e-08, "logits/chosen": -2.7618296146392822, "logits/rejected": -2.3698832988739014, "logps/chosen": -84.16023254394531, "logps/rejected": -121.373779296875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.04714477062225342, "rewards/margins": 28.0283145904541, "rewards/rejected": -27.98116683959961, "step": 4890 }, { "epoch": 2.24, "learning_rate": 8.472856418061896e-08, "logits/chosen": -2.7569549083709717, "logits/rejected": -2.381782054901123, "logps/chosen": -89.47846984863281, "logps/rejected": -120.94297790527344, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.319193720817566, "rewards/margins": 26.88057518005371, "rewards/rejected": -28.19976806640625, "step": 4900 }, { "epoch": 2.24, "eval_logits/chosen": -2.4244959354400635, "eval_logits/rejected": -2.142841100692749, "eval_logps/chosen": -84.10192108154297, "eval_logps/rejected": -118.3475341796875, "eval_loss": 0.009065668098628521, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.9580895900726318, "eval_rewards/margins": 26.985023498535156, "eval_rewards/rejected": -27.943113327026367, "eval_runtime": 246.6978, "eval_samples_per_second": 11.601, "eval_steps_per_second": 0.726, "step": 4900 }, { "epoch": 2.24, "learning_rate": 8.422120750887873e-08, "logits/chosen": -2.744419574737549, "logits/rejected": -2.380239963531494, "logps/chosen": -88.51338958740234, "logps/rejected": -120.62641906738281, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.2450464516878128, "rewards/margins": 27.529590606689453, "rewards/rejected": -27.77463722229004, "step": 4910 }, { "epoch": 2.25, "learning_rate": 8.37138508371385e-08, "logits/chosen": -2.7320945262908936, "logits/rejected": -2.3801326751708984, "logps/chosen": -88.2393798828125, "logps/rejected": -123.69564056396484, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.8916546106338501, "rewards/margins": 26.60930824279785, "rewards/rejected": -27.500957489013672, "step": 4920 }, { "epoch": 2.25, "learning_rate": 8.320649416539826e-08, "logits/chosen": -2.7349050045013428, "logits/rejected": -2.4097728729248047, "logps/chosen": -86.44974517822266, "logps/rejected": -120.07059478759766, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.1891833543777466, "rewards/margins": 26.973369598388672, "rewards/rejected": -28.162551879882812, "step": 4930 }, { "epoch": 2.25, "learning_rate": 8.269913749365803e-08, "logits/chosen": -2.7491402626037598, "logits/rejected": -2.4169535636901855, "logps/chosen": -87.71852111816406, "logps/rejected": -121.096923828125, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.12718172371387482, "rewards/margins": 27.187097549438477, "rewards/rejected": -27.31427574157715, "step": 4940 }, { "epoch": 2.26, "learning_rate": 8.21917808219178e-08, "logits/chosen": -2.738935708999634, "logits/rejected": -2.4141650199890137, "logps/chosen": -87.27973937988281, "logps/rejected": -125.20686340332031, "loss": 0.0056, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0808908939361572, "rewards/margins": 28.100439071655273, "rewards/rejected": -29.181325912475586, "step": 4950 }, { "epoch": 2.26, "learning_rate": 8.168442415017756e-08, "logits/chosen": -2.7511541843414307, "logits/rejected": -2.4077138900756836, "logps/chosen": -89.78852081298828, "logps/rejected": -121.01255798339844, "loss": 0.0043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.9819309711456299, "rewards/margins": 26.551565170288086, "rewards/rejected": -28.533496856689453, "step": 4960 }, { "epoch": 2.27, "learning_rate": 8.117706747843733e-08, "logits/chosen": -2.7466483116149902, "logits/rejected": -2.4387660026550293, "logps/chosen": -80.66881561279297, "logps/rejected": -118.0799331665039, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.502818763256073, "rewards/margins": 27.108835220336914, "rewards/rejected": -27.61165428161621, "step": 4970 }, { "epoch": 2.27, "learning_rate": 8.06697108066971e-08, "logits/chosen": -2.7467684745788574, "logits/rejected": -2.3875367641448975, "logps/chosen": -84.77754974365234, "logps/rejected": -122.67195892333984, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.5546636581420898, "rewards/margins": 28.829843521118164, "rewards/rejected": -29.384502410888672, "step": 4980 }, { "epoch": 2.28, "learning_rate": 8.016235413495687e-08, "logits/chosen": -2.753793954849243, "logits/rejected": -2.4937329292297363, "logps/chosen": -80.5523910522461, "logps/rejected": -120.7317123413086, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.7898101806640625, "rewards/margins": 27.617298126220703, "rewards/rejected": -28.4071044921875, "step": 4990 }, { "epoch": 2.28, "learning_rate": 7.965499746321664e-08, "logits/chosen": -2.74638032913208, "logits/rejected": -2.385280132293701, "logps/chosen": -87.71144104003906, "logps/rejected": -125.1296157836914, "loss": 0.0026, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.024894308298826218, "rewards/margins": 28.520004272460938, "rewards/rejected": -28.544897079467773, "step": 5000 }, { "epoch": 2.28, "eval_logits/chosen": -2.4286837577819824, "eval_logits/rejected": -2.1479344367980957, "eval_logps/chosen": -84.69007873535156, "eval_logps/rejected": -120.12315368652344, "eval_loss": 0.009109003469347954, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": -1.252166986465454, "eval_rewards/margins": 27.578763961791992, "eval_rewards/rejected": -28.830928802490234, "eval_runtime": 178.594, "eval_samples_per_second": 16.025, "eval_steps_per_second": 1.002, "step": 5000 }, { "epoch": 2.29, "learning_rate": 7.91476407914764e-08, "logits/chosen": -2.7525131702423096, "logits/rejected": -2.429238796234131, "logps/chosen": -84.10145568847656, "logps/rejected": -118.63456726074219, "loss": 0.0021, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.631492018699646, "rewards/margins": 27.067092895507812, "rewards/rejected": -28.698583602905273, "step": 5010 }, { "epoch": 2.29, "learning_rate": 7.864028411973617e-08, "logits/chosen": -2.7419450283050537, "logits/rejected": -2.4353508949279785, "logps/chosen": -87.23265075683594, "logps/rejected": -119.93537902832031, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -1.198089361190796, "rewards/margins": 27.85365867614746, "rewards/rejected": -29.051748275756836, "step": 5020 }, { "epoch": 2.3, "learning_rate": 7.813292744799594e-08, "logits/chosen": -2.7567243576049805, "logits/rejected": -2.434091091156006, "logps/chosen": -88.50325775146484, "logps/rejected": -123.66788482666016, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.608832597732544, "rewards/margins": 27.53262710571289, "rewards/rejected": -29.14145851135254, "step": 5030 }, { "epoch": 2.3, "learning_rate": 7.76255707762557e-08, "logits/chosen": -2.7514610290527344, "logits/rejected": -2.4645791053771973, "logps/chosen": -87.3482894897461, "logps/rejected": -124.16609954833984, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.6217960119247437, "rewards/margins": 27.709707260131836, "rewards/rejected": -29.33150291442871, "step": 5040 }, { "epoch": 2.31, "learning_rate": 7.711821410451547e-08, "logits/chosen": -2.753720998764038, "logits/rejected": -2.426527261734009, "logps/chosen": -87.53666687011719, "logps/rejected": -121.10200500488281, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.7266362309455872, "rewards/margins": 27.7170467376709, "rewards/rejected": -28.44368553161621, "step": 5050 }, { "epoch": 2.31, "learning_rate": 7.661085743277524e-08, "logits/chosen": -2.727189540863037, "logits/rejected": -2.4044411182403564, "logps/chosen": -91.38700866699219, "logps/rejected": -123.95140075683594, "loss": 0.0014, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4474596977233887, "rewards/margins": 27.62904930114746, "rewards/rejected": -29.07651138305664, "step": 5060 }, { "epoch": 2.31, "learning_rate": 7.6103500761035e-08, "logits/chosen": -2.748976230621338, "logits/rejected": -2.4300005435943604, "logps/chosen": -83.37203216552734, "logps/rejected": -121.5499267578125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.9153432846069336, "rewards/margins": 28.415149688720703, "rewards/rejected": -29.330493927001953, "step": 5070 }, { "epoch": 2.32, "learning_rate": 7.559614408929477e-08, "logits/chosen": -2.762366533279419, "logits/rejected": -2.387699604034424, "logps/chosen": -85.62800598144531, "logps/rejected": -120.8462142944336, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.7187870740890503, "rewards/margins": 29.0457763671875, "rewards/rejected": -28.326990127563477, "step": 5080 }, { "epoch": 2.32, "learning_rate": 7.508878741755454e-08, "logits/chosen": -2.7581846714019775, "logits/rejected": -2.394716262817383, "logps/chosen": -85.32283020019531, "logps/rejected": -123.6475830078125, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -0.7468321919441223, "rewards/margins": 28.43435287475586, "rewards/rejected": -29.18118667602539, "step": 5090 }, { "epoch": 2.33, "learning_rate": 7.45814307458143e-08, "logits/chosen": -2.7617030143737793, "logits/rejected": -2.4536755084991455, "logps/chosen": -89.8367919921875, "logps/rejected": -124.69386291503906, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -2.090759038925171, "rewards/margins": 26.8618221282959, "rewards/rejected": -28.95258140563965, "step": 5100 }, { "epoch": 2.33, "eval_logits/chosen": -2.432805061340332, "eval_logits/rejected": -2.1521759033203125, "eval_logps/chosen": -83.90615844726562, "eval_logps/rejected": -119.9259262084961, "eval_loss": 0.008918280713260174, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.8602104783058167, "eval_rewards/margins": 27.872102737426758, "eval_rewards/rejected": -28.732315063476562, "eval_runtime": 178.8174, "eval_samples_per_second": 16.005, "eval_steps_per_second": 1.001, "step": 5100 }, { "epoch": 2.33, "learning_rate": 7.407407407407407e-08, "logits/chosen": -2.735382080078125, "logits/rejected": -2.4374847412109375, "logps/chosen": -84.13839721679688, "logps/rejected": -120.77783203125, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7216458916664124, "rewards/margins": 27.35125732421875, "rewards/rejected": -28.072900772094727, "step": 5110 }, { "epoch": 2.34, "learning_rate": 7.356671740233384e-08, "logits/chosen": -2.7483973503112793, "logits/rejected": -2.3707454204559326, "logps/chosen": -88.2922134399414, "logps/rejected": -119.92042541503906, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -1.73274827003479, "rewards/margins": 27.419116973876953, "rewards/rejected": -29.1518611907959, "step": 5120 }, { "epoch": 2.34, "learning_rate": 7.30593607305936e-08, "logits/chosen": -2.758070945739746, "logits/rejected": -2.4467294216156006, "logps/chosen": -87.00746154785156, "logps/rejected": -123.29046630859375, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5180299282073975, "rewards/margins": 27.019861221313477, "rewards/rejected": -28.537887573242188, "step": 5130 }, { "epoch": 2.35, "learning_rate": 7.255200405885337e-08, "logits/chosen": -2.7713027000427246, "logits/rejected": -2.4392294883728027, "logps/chosen": -87.25083923339844, "logps/rejected": -125.13394927978516, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.0268337726593018, "rewards/margins": 28.515838623046875, "rewards/rejected": -29.542673110961914, "step": 5140 }, { "epoch": 2.35, "learning_rate": 7.204464738711314e-08, "logits/chosen": -2.740143299102783, "logits/rejected": -2.390916585922241, "logps/chosen": -85.54835510253906, "logps/rejected": -124.1504135131836, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.22981826961040497, "rewards/margins": 29.020532608032227, "rewards/rejected": -29.250350952148438, "step": 5150 }, { "epoch": 2.36, "learning_rate": 7.15372907153729e-08, "logits/chosen": -2.7730255126953125, "logits/rejected": -2.4213600158691406, "logps/chosen": -85.68617248535156, "logps/rejected": -124.814208984375, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -1.642124891281128, "rewards/margins": 28.095458984375, "rewards/rejected": -29.737585067749023, "step": 5160 }, { "epoch": 2.36, "learning_rate": 7.102993404363267e-08, "logits/chosen": -2.7789113521575928, "logits/rejected": -2.4307496547698975, "logps/chosen": -86.83216857910156, "logps/rejected": -121.3434829711914, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -1.2657625675201416, "rewards/margins": 27.959598541259766, "rewards/rejected": -29.225360870361328, "step": 5170 }, { "epoch": 2.36, "learning_rate": 7.052257737189244e-08, "logits/chosen": -2.7474265098571777, "logits/rejected": -2.4472198486328125, "logps/chosen": -85.38337707519531, "logps/rejected": -120.9761734008789, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -2.248941659927368, "rewards/margins": 27.54292869567871, "rewards/rejected": -29.791866302490234, "step": 5180 }, { "epoch": 2.37, "learning_rate": 7.00152207001522e-08, "logits/chosen": -2.744936466217041, "logits/rejected": -2.427401065826416, "logps/chosen": -86.35491180419922, "logps/rejected": -126.22785949707031, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.505506157875061, "rewards/margins": 29.2794189453125, "rewards/rejected": -29.784921646118164, "step": 5190 }, { "epoch": 2.37, "learning_rate": 6.950786402841197e-08, "logits/chosen": -2.7449307441711426, "logits/rejected": -2.4244794845581055, "logps/chosen": -88.2833480834961, "logps/rejected": -123.68865966796875, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -1.5235570669174194, "rewards/margins": 27.651763916015625, "rewards/rejected": -29.175323486328125, "step": 5200 }, { "epoch": 2.37, "eval_logits/chosen": -2.434272527694702, "eval_logits/rejected": -2.1536428928375244, "eval_logps/chosen": -84.26679229736328, "eval_logps/rejected": -121.03350830078125, "eval_loss": 0.009124072268605232, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -1.0405298471450806, "eval_rewards/margins": 28.245573043823242, "eval_rewards/rejected": -29.28610610961914, "eval_runtime": 217.3859, "eval_samples_per_second": 13.166, "eval_steps_per_second": 0.823, "step": 5200 }, { "epoch": 2.38, "learning_rate": 6.900050735667174e-08, "logits/chosen": -2.7636780738830566, "logits/rejected": -2.4439876079559326, "logps/chosen": -83.2416763305664, "logps/rejected": -126.52783203125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -1.2189033031463623, "rewards/margins": 28.027973175048828, "rewards/rejected": -29.246877670288086, "step": 5210 }, { "epoch": 2.38, "learning_rate": 6.84931506849315e-08, "logits/chosen": -2.7646894454956055, "logits/rejected": -2.49737286567688, "logps/chosen": -82.13372039794922, "logps/rejected": -126.6552505493164, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.6217214465141296, "rewards/margins": 28.312519073486328, "rewards/rejected": -28.934240341186523, "step": 5220 }, { "epoch": 2.39, "learning_rate": 6.798579401319127e-08, "logits/chosen": -2.7640790939331055, "logits/rejected": -2.459313154220581, "logps/chosen": -84.48454284667969, "logps/rejected": -127.57563781738281, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.8562952876091003, "rewards/margins": 28.880661010742188, "rewards/rejected": -29.73695945739746, "step": 5230 }, { "epoch": 2.39, "learning_rate": 6.747843734145104e-08, "logits/chosen": -2.741337776184082, "logits/rejected": -2.435396432876587, "logps/chosen": -81.76171875, "logps/rejected": -120.62077331542969, "loss": 0.0035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.19452862441539764, "rewards/margins": 27.625417709350586, "rewards/rejected": -27.8199462890625, "step": 5240 }, { "epoch": 2.4, "learning_rate": 6.69710806697108e-08, "logits/chosen": -2.7427334785461426, "logits/rejected": -2.406214475631714, "logps/chosen": -85.02540588378906, "logps/rejected": -122.47391510009766, "loss": 0.0043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.10450949519872665, "rewards/margins": 29.903783798217773, "rewards/rejected": -29.79927635192871, "step": 5250 }, { "epoch": 2.4, "learning_rate": 6.646372399797057e-08, "logits/chosen": -2.7573702335357666, "logits/rejected": -2.4254257678985596, "logps/chosen": -81.25457763671875, "logps/rejected": -115.71319580078125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.2134050130844116, "rewards/margins": 26.369384765625, "rewards/rejected": -27.58279037475586, "step": 5260 }, { "epoch": 2.41, "learning_rate": 6.595636732623034e-08, "logits/chosen": -2.7415711879730225, "logits/rejected": -2.4441635608673096, "logps/chosen": -84.73302459716797, "logps/rejected": -124.75992584228516, "loss": 0.0012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1790273189544678, "rewards/margins": 27.980754852294922, "rewards/rejected": -29.1597843170166, "step": 5270 }, { "epoch": 2.41, "learning_rate": 6.54490106544901e-08, "logits/chosen": -2.7682554721832275, "logits/rejected": -2.4446873664855957, "logps/chosen": -83.69309997558594, "logps/rejected": -124.48780822753906, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.45241624116897583, "rewards/margins": 31.140522003173828, "rewards/rejected": -30.688106536865234, "step": 5280 }, { "epoch": 2.41, "learning_rate": 6.494165398274987e-08, "logits/chosen": -2.7499213218688965, "logits/rejected": -2.403151273727417, "logps/chosen": -90.95135498046875, "logps/rejected": -124.18879699707031, "loss": 0.0019, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.9265022277832031, "rewards/margins": 27.68059730529785, "rewards/rejected": -29.607101440429688, "step": 5290 }, { "epoch": 2.42, "learning_rate": 6.443429731100964e-08, "logits/chosen": -2.745110273361206, "logits/rejected": -2.418513298034668, "logps/chosen": -87.60101318359375, "logps/rejected": -125.9690933227539, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.9313419461250305, "rewards/margins": 29.444713592529297, "rewards/rejected": -30.376056671142578, "step": 5300 }, { "epoch": 2.42, "eval_logits/chosen": -2.433647871017456, "eval_logits/rejected": -2.152880907058716, "eval_logps/chosen": -84.45043182373047, "eval_logps/rejected": -121.50931549072266, "eval_loss": 0.009305678308010101, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -1.1323456764221191, "eval_rewards/margins": 28.39165687561035, "eval_rewards/rejected": -29.52400016784668, "eval_runtime": 172.9171, "eval_samples_per_second": 16.551, "eval_steps_per_second": 1.035, "step": 5300 }, { "epoch": 2.42, "learning_rate": 6.39269406392694e-08, "logits/chosen": -2.755413055419922, "logits/rejected": -2.4468331336975098, "logps/chosen": -83.88670349121094, "logps/rejected": -126.59808349609375, "loss": 0.0046, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6614924669265747, "rewards/margins": 28.857173919677734, "rewards/rejected": -29.518667221069336, "step": 5310 }, { "epoch": 2.43, "learning_rate": 6.341958396752917e-08, "logits/chosen": -2.7668747901916504, "logits/rejected": -2.398794651031494, "logps/chosen": -87.24461364746094, "logps/rejected": -122.89497375488281, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.8741731643676758, "rewards/margins": 28.587453842163086, "rewards/rejected": -29.461627960205078, "step": 5320 }, { "epoch": 2.43, "learning_rate": 6.291222729578894e-08, "logits/chosen": -2.7630562782287598, "logits/rejected": -2.447037935256958, "logps/chosen": -91.0066909790039, "logps/rejected": -123.65763854980469, "loss": 0.0052, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.823206663131714, "rewards/margins": 26.815990447998047, "rewards/rejected": -29.63919448852539, "step": 5330 }, { "epoch": 2.44, "learning_rate": 6.24048706240487e-08, "logits/chosen": -2.782585620880127, "logits/rejected": -2.4105465412139893, "logps/chosen": -92.41482543945312, "logps/rejected": -123.64078521728516, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -2.0140185356140137, "rewards/margins": 27.505090713500977, "rewards/rejected": -29.51910400390625, "step": 5340 }, { "epoch": 2.44, "learning_rate": 6.189751395230847e-08, "logits/chosen": -2.766284227371216, "logits/rejected": -2.4156837463378906, "logps/chosen": -89.42349243164062, "logps/rejected": -122.7345199584961, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.4651731848716736, "rewards/margins": 28.895532608032227, "rewards/rejected": -29.360708236694336, "step": 5350 }, { "epoch": 2.45, "learning_rate": 6.139015728056824e-08, "logits/chosen": -2.7686755657196045, "logits/rejected": -2.4416086673736572, "logps/chosen": -84.95355987548828, "logps/rejected": -121.6220932006836, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.9390074014663696, "rewards/margins": 28.085041046142578, "rewards/rejected": -29.0240478515625, "step": 5360 }, { "epoch": 2.45, "learning_rate": 6.0882800608828e-08, "logits/chosen": -2.745166301727295, "logits/rejected": -2.462104320526123, "logps/chosen": -84.87953186035156, "logps/rejected": -121.75547790527344, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.0288751125335693, "rewards/margins": 28.0614013671875, "rewards/rejected": -29.09027671813965, "step": 5370 }, { "epoch": 2.46, "learning_rate": 6.037544393708777e-08, "logits/chosen": -2.769814968109131, "logits/rejected": -2.4504354000091553, "logps/chosen": -88.84867858886719, "logps/rejected": -127.912841796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.2884315252304077, "rewards/margins": 28.920536041259766, "rewards/rejected": -30.208969116210938, "step": 5380 }, { "epoch": 2.46, "learning_rate": 5.986808726534754e-08, "logits/chosen": -2.7714927196502686, "logits/rejected": -2.4107742309570312, "logps/chosen": -88.07279205322266, "logps/rejected": -121.81535339355469, "loss": 0.0059, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.9670640826225281, "rewards/margins": 28.364221572875977, "rewards/rejected": -29.331289291381836, "step": 5390 }, { "epoch": 2.46, "learning_rate": 5.93607305936073e-08, "logits/chosen": -2.754364490509033, "logits/rejected": -2.4336113929748535, "logps/chosen": -83.56143951416016, "logps/rejected": -121.99029541015625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.716526985168457, "rewards/margins": 29.066598892211914, "rewards/rejected": -29.783126831054688, "step": 5400 }, { "epoch": 2.46, "eval_logits/chosen": -2.441641330718994, "eval_logits/rejected": -2.159546375274658, "eval_logps/chosen": -84.62613677978516, "eval_logps/rejected": -120.88663482666016, "eval_loss": 0.009226926602423191, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -1.2201969623565674, "eval_rewards/margins": 27.992460250854492, "eval_rewards/rejected": -29.212661743164062, "eval_runtime": 181.3698, "eval_samples_per_second": 15.78, "eval_steps_per_second": 0.987, "step": 5400 }, { "epoch": 2.47, "learning_rate": 5.8853373921867065e-08, "logits/chosen": -2.752310037612915, "logits/rejected": -2.4851667881011963, "logps/chosen": -87.39705657958984, "logps/rejected": -123.9158935546875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -1.41219961643219, "rewards/margins": 26.550390243530273, "rewards/rejected": -27.96259117126465, "step": 5410 }, { "epoch": 2.47, "learning_rate": 5.834601725012683e-08, "logits/chosen": -2.7847986221313477, "logits/rejected": -2.443748712539673, "logps/chosen": -85.70228576660156, "logps/rejected": -123.20450592041016, "loss": 0.0011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.018771767616272, "rewards/margins": 28.040563583374023, "rewards/rejected": -29.059337615966797, "step": 5420 }, { "epoch": 2.48, "learning_rate": 5.78386605783866e-08, "logits/chosen": -2.758910894393921, "logits/rejected": -2.4003539085388184, "logps/chosen": -86.43096160888672, "logps/rejected": -122.36539459228516, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.773932158946991, "rewards/margins": 28.860431671142578, "rewards/rejected": -29.634363174438477, "step": 5430 }, { "epoch": 2.48, "learning_rate": 5.7331303906646365e-08, "logits/chosen": -2.751533031463623, "logits/rejected": -2.478721857070923, "logps/chosen": -85.23453521728516, "logps/rejected": -125.13542175292969, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.3752094507217407, "rewards/margins": 27.906848907470703, "rewards/rejected": -29.282052993774414, "step": 5440 }, { "epoch": 2.49, "learning_rate": 5.682394723490613e-08, "logits/chosen": -2.771869659423828, "logits/rejected": -2.400510311126709, "logps/chosen": -89.07925415039062, "logps/rejected": -124.29158020019531, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.0254909992218018, "rewards/margins": 28.481983184814453, "rewards/rejected": -29.507471084594727, "step": 5450 }, { "epoch": 2.49, "learning_rate": 5.63165905631659e-08, "logits/chosen": -2.7659332752227783, "logits/rejected": -2.4291632175445557, "logps/chosen": -92.83513641357422, "logps/rejected": -130.30252075195312, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -1.4746873378753662, "rewards/margins": 28.83097267150879, "rewards/rejected": -30.3056640625, "step": 5460 }, { "epoch": 2.5, "learning_rate": 5.5809233891425665e-08, "logits/chosen": -2.770104169845581, "logits/rejected": -2.42267107963562, "logps/chosen": -85.7560043334961, "logps/rejected": -118.6550064086914, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.2139437198638916, "rewards/margins": 26.978551864624023, "rewards/rejected": -29.192495346069336, "step": 5470 }, { "epoch": 2.5, "learning_rate": 5.530187721968543e-08, "logits/chosen": -2.7796452045440674, "logits/rejected": -2.4254415035247803, "logps/chosen": -90.0015640258789, "logps/rejected": -124.4817123413086, "loss": 0.0021, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2050654888153076, "rewards/margins": 28.514633178710938, "rewards/rejected": -29.71969985961914, "step": 5480 }, { "epoch": 2.51, "learning_rate": 5.47945205479452e-08, "logits/chosen": -2.767035961151123, "logits/rejected": -2.4230144023895264, "logps/chosen": -86.62277221679688, "logps/rejected": -126.09996032714844, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -1.1140720844268799, "rewards/margins": 29.11148452758789, "rewards/rejected": -30.225555419921875, "step": 5490 }, { "epoch": 2.51, "learning_rate": 5.4287163876204964e-08, "logits/chosen": -2.757408380508423, "logits/rejected": -2.4415459632873535, "logps/chosen": -87.70851135253906, "logps/rejected": -125.58280944824219, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.6237621307373047, "rewards/margins": 27.80632972717285, "rewards/rejected": -29.430089950561523, "step": 5500 }, { "epoch": 2.51, "eval_logits/chosen": -2.4404361248016357, "eval_logits/rejected": -2.160865545272827, "eval_logps/chosen": -85.05990600585938, "eval_logps/rejected": -121.87393188476562, "eval_loss": 0.009266870096325874, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -1.4370806217193604, "eval_rewards/margins": 28.26922607421875, "eval_rewards/rejected": -29.706308364868164, "eval_runtime": 230.8453, "eval_samples_per_second": 12.398, "eval_steps_per_second": 0.775, "step": 5500 }, { "epoch": 2.52, "learning_rate": 5.377980720446473e-08, "logits/chosen": -2.765672445297241, "logits/rejected": -2.473078489303589, "logps/chosen": -85.22737884521484, "logps/rejected": -119.9512939453125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.3123337030410767, "rewards/margins": 28.8724422454834, "rewards/rejected": -30.18478012084961, "step": 5510 }, { "epoch": 2.52, "learning_rate": 5.32724505327245e-08, "logits/chosen": -2.785693407058716, "logits/rejected": -2.443232774734497, "logps/chosen": -88.04589080810547, "logps/rejected": -122.4439468383789, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.2144062519073486, "rewards/margins": 27.08591079711914, "rewards/rejected": -28.300317764282227, "step": 5520 }, { "epoch": 2.52, "learning_rate": 5.2765093860984264e-08, "logits/chosen": -2.764965772628784, "logits/rejected": -2.477296829223633, "logps/chosen": -86.52013397216797, "logps/rejected": -127.77616882324219, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.973849892616272, "rewards/margins": 28.264923095703125, "rewards/rejected": -30.23876953125, "step": 5530 }, { "epoch": 2.53, "learning_rate": 5.225773718924403e-08, "logits/chosen": -2.7691471576690674, "logits/rejected": -2.4197044372558594, "logps/chosen": -83.17195129394531, "logps/rejected": -119.6854019165039, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.8871095776557922, "rewards/margins": 28.597660064697266, "rewards/rejected": -29.48476791381836, "step": 5540 }, { "epoch": 2.53, "learning_rate": 5.17503805175038e-08, "logits/chosen": -2.762392282485962, "logits/rejected": -2.4805731773376465, "logps/chosen": -84.5125503540039, "logps/rejected": -126.39170837402344, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -1.1881184577941895, "rewards/margins": 29.901758193969727, "rewards/rejected": -31.089874267578125, "step": 5550 }, { "epoch": 2.54, "learning_rate": 5.1243023845763564e-08, "logits/chosen": -2.760188579559326, "logits/rejected": -2.4107179641723633, "logps/chosen": -87.40550231933594, "logps/rejected": -124.51380920410156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.8094155192375183, "rewards/margins": 30.0570011138916, "rewards/rejected": -30.866418838500977, "step": 5560 }, { "epoch": 2.54, "learning_rate": 5.073566717402333e-08, "logits/chosen": -2.7619729042053223, "logits/rejected": -2.4799978733062744, "logps/chosen": -83.0888671875, "logps/rejected": -128.18246459960938, "loss": 0.0076, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2581623792648315, "rewards/margins": 29.84149169921875, "rewards/rejected": -31.099658966064453, "step": 5570 }, { "epoch": 2.55, "learning_rate": 5.02283105022831e-08, "logits/chosen": -2.771467685699463, "logits/rejected": -2.4118340015411377, "logps/chosen": -88.58828735351562, "logps/rejected": -119.8714599609375, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -1.0372138023376465, "rewards/margins": 28.661800384521484, "rewards/rejected": -29.699016571044922, "step": 5580 }, { "epoch": 2.55, "learning_rate": 4.9720953830542864e-08, "logits/chosen": -2.745488405227661, "logits/rejected": -2.488839626312256, "logps/chosen": -85.66227722167969, "logps/rejected": -126.21382141113281, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.4922962188720703, "rewards/margins": 28.159622192382812, "rewards/rejected": -29.65191650390625, "step": 5590 }, { "epoch": 2.56, "learning_rate": 4.921359715880263e-08, "logits/chosen": -2.7824316024780273, "logits/rejected": -2.4289815425872803, "logps/chosen": -83.65042877197266, "logps/rejected": -121.76817321777344, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.2598940134048462, "rewards/margins": 28.383874893188477, "rewards/rejected": -29.64376449584961, "step": 5600 }, { "epoch": 2.56, "eval_logits/chosen": -2.438318967819214, "eval_logits/rejected": -2.158374309539795, "eval_logps/chosen": -85.06520080566406, "eval_logps/rejected": -122.50160217285156, "eval_loss": 0.00952182337641716, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": -1.439728856086731, "eval_rewards/margins": 28.580427169799805, "eval_rewards/rejected": -30.020156860351562, "eval_runtime": 193.8134, "eval_samples_per_second": 14.767, "eval_steps_per_second": 0.924, "step": 5600 }, { "epoch": 2.56, "learning_rate": 4.87062404870624e-08, "logits/chosen": -2.768933057785034, "logits/rejected": -2.4125797748565674, "logps/chosen": -84.9870834350586, "logps/rejected": -125.88037109375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.4587297439575195, "rewards/margins": 29.48636245727539, "rewards/rejected": -30.94508934020996, "step": 5610 }, { "epoch": 2.57, "learning_rate": 4.8198883815322164e-08, "logits/chosen": -2.7527966499328613, "logits/rejected": -2.4678454399108887, "logps/chosen": -89.00822448730469, "logps/rejected": -130.18411254882812, "loss": 0.0065, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.472886800765991, "rewards/margins": 28.331567764282227, "rewards/rejected": -30.804454803466797, "step": 5620 }, { "epoch": 2.57, "learning_rate": 4.769152714358193e-08, "logits/chosen": -2.755215883255005, "logits/rejected": -2.447148561477661, "logps/chosen": -82.71800231933594, "logps/rejected": -123.46649169921875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.2614152431488037, "rewards/margins": 28.540180206298828, "rewards/rejected": -29.801599502563477, "step": 5630 }, { "epoch": 2.57, "learning_rate": 4.71841704718417e-08, "logits/chosen": -2.7677557468414307, "logits/rejected": -2.4381189346313477, "logps/chosen": -82.1149673461914, "logps/rejected": -120.96192932128906, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.8489066958427429, "rewards/margins": 28.56429672241211, "rewards/rejected": -29.4132080078125, "step": 5640 }, { "epoch": 2.58, "learning_rate": 4.6676813800101464e-08, "logits/chosen": -2.7652928829193115, "logits/rejected": -2.4197866916656494, "logps/chosen": -85.02003479003906, "logps/rejected": -119.82582092285156, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.39028388261795044, "rewards/margins": 28.446529388427734, "rewards/rejected": -28.836816787719727, "step": 5650 }, { "epoch": 2.58, "learning_rate": 4.616945712836123e-08, "logits/chosen": -2.7609305381774902, "logits/rejected": -2.4317970275878906, "logps/chosen": -89.29905700683594, "logps/rejected": -122.3862075805664, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.5792943239212036, "rewards/margins": 28.483394622802734, "rewards/rejected": -30.062692642211914, "step": 5660 }, { "epoch": 2.59, "learning_rate": 4.5662100456621e-08, "logits/chosen": -2.769318103790283, "logits/rejected": -2.421332597732544, "logps/chosen": -85.064208984375, "logps/rejected": -125.6937484741211, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -2.0754342079162598, "rewards/margins": 27.670013427734375, "rewards/rejected": -29.745447158813477, "step": 5670 }, { "epoch": 2.59, "learning_rate": 4.5154743784880764e-08, "logits/chosen": -2.7575669288635254, "logits/rejected": -2.5146377086639404, "logps/chosen": -82.90955352783203, "logps/rejected": -128.13299560546875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.8003252148628235, "rewards/margins": 29.151077270507812, "rewards/rejected": -29.951406478881836, "step": 5680 }, { "epoch": 2.6, "learning_rate": 4.464738711314053e-08, "logits/chosen": -2.7499289512634277, "logits/rejected": -2.427319049835205, "logps/chosen": -88.74046325683594, "logps/rejected": -124.1895980834961, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.3100099563598633, "rewards/margins": 27.920557022094727, "rewards/rejected": -30.23056983947754, "step": 5690 }, { "epoch": 2.6, "learning_rate": 4.41400304414003e-08, "logits/chosen": -2.7523961067199707, "logits/rejected": -2.389225959777832, "logps/chosen": -89.8683853149414, "logps/rejected": -125.72187805175781, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.0094496011734009, "rewards/margins": 30.02132797241211, "rewards/rejected": -31.030776977539062, "step": 5700 }, { "epoch": 2.6, "eval_logits/chosen": -2.4395346641540527, "eval_logits/rejected": -2.160064935684204, "eval_logps/chosen": -85.41078186035156, "eval_logps/rejected": -122.650390625, "eval_loss": 0.009572750888764858, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -1.6125154495239258, "eval_rewards/margins": 28.482030868530273, "eval_rewards/rejected": -30.094547271728516, "eval_runtime": 221.4129, "eval_samples_per_second": 12.926, "eval_steps_per_second": 0.808, "step": 5700 }, { "epoch": 2.61, "learning_rate": 4.3632673769660064e-08, "logits/chosen": -2.762427568435669, "logits/rejected": -2.4140915870666504, "logps/chosen": -89.68592834472656, "logps/rejected": -129.76156616210938, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -2.40108060836792, "rewards/margins": 27.84518814086914, "rewards/rejected": -30.246265411376953, "step": 5710 }, { "epoch": 2.61, "learning_rate": 4.312531709791983e-08, "logits/chosen": -2.7708516120910645, "logits/rejected": -2.496769428253174, "logps/chosen": -88.31327819824219, "logps/rejected": -129.05381774902344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.5792248249053955, "rewards/margins": 29.627304077148438, "rewards/rejected": -31.206527709960938, "step": 5720 }, { "epoch": 2.62, "learning_rate": 4.26179604261796e-08, "logits/chosen": -2.7759032249450684, "logits/rejected": -2.440566301345825, "logps/chosen": -79.54498291015625, "logps/rejected": -120.24839782714844, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.591633141040802, "rewards/margins": 30.0907039642334, "rewards/rejected": -30.682336807250977, "step": 5730 }, { "epoch": 2.62, "learning_rate": 4.2110603754439363e-08, "logits/chosen": -2.7649545669555664, "logits/rejected": -2.4441158771514893, "logps/chosen": -84.98017883300781, "logps/rejected": -126.4588851928711, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.398614764213562, "rewards/margins": 29.087610244750977, "rewards/rejected": -30.486225128173828, "step": 5740 }, { "epoch": 2.62, "learning_rate": 4.160324708269913e-08, "logits/chosen": -2.7674126625061035, "logits/rejected": -2.44022536277771, "logps/chosen": -86.49254608154297, "logps/rejected": -128.5341796875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.9185127019882202, "rewards/margins": 30.377389907836914, "rewards/rejected": -31.2959041595459, "step": 5750 }, { "epoch": 2.63, "learning_rate": 4.10958904109589e-08, "logits/chosen": -2.7756500244140625, "logits/rejected": -2.479367256164551, "logps/chosen": -80.21389770507812, "logps/rejected": -118.49610900878906, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.7814503908157349, "rewards/margins": 27.6958065032959, "rewards/rejected": -28.477258682250977, "step": 5760 }, { "epoch": 2.63, "learning_rate": 4.0588533739218663e-08, "logits/chosen": -2.747576951980591, "logits/rejected": -2.3897807598114014, "logps/chosen": -81.59397888183594, "logps/rejected": -120.45591735839844, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.9153077006340027, "rewards/margins": 29.088626861572266, "rewards/rejected": -30.00393295288086, "step": 5770 }, { "epoch": 2.64, "learning_rate": 4.0081177067478437e-08, "logits/chosen": -2.768725872039795, "logits/rejected": -2.4182541370391846, "logps/chosen": -89.11521911621094, "logps/rejected": -122.61747741699219, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.5854370594024658, "rewards/margins": 28.07851791381836, "rewards/rejected": -29.663955688476562, "step": 5780 }, { "epoch": 2.64, "learning_rate": 3.95738203957382e-08, "logits/chosen": -2.7484359741210938, "logits/rejected": -2.5369114875793457, "logps/chosen": -82.28369140625, "logps/rejected": -124.33564758300781, "loss": 0.0055, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7363954782485962, "rewards/margins": 27.852054595947266, "rewards/rejected": -29.588449478149414, "step": 5790 }, { "epoch": 2.65, "learning_rate": 3.906646372399797e-08, "logits/chosen": -2.7648987770080566, "logits/rejected": -2.4015555381774902, "logps/chosen": -91.66053771972656, "logps/rejected": -129.81210327148438, "loss": 0.0053, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1680419445037842, "rewards/margins": 29.864002227783203, "rewards/rejected": -31.032039642333984, "step": 5800 }, { "epoch": 2.65, "eval_logits/chosen": -2.4397711753845215, "eval_logits/rejected": -2.161456823348999, "eval_logps/chosen": -85.31334686279297, "eval_logps/rejected": -122.46633911132812, "eval_loss": 0.009494620375335217, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": -1.5638020038604736, "eval_rewards/margins": 28.438709259033203, "eval_rewards/rejected": -30.002511978149414, "eval_runtime": 192.6451, "eval_samples_per_second": 14.856, "eval_steps_per_second": 0.929, "step": 5800 }, { "epoch": 2.65, "learning_rate": 3.8559107052257736e-08, "logits/chosen": -2.757185459136963, "logits/rejected": -2.482255220413208, "logps/chosen": -83.15132141113281, "logps/rejected": -123.84806060791016, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.6451876163482666, "rewards/margins": 28.239919662475586, "rewards/rejected": -29.885112762451172, "step": 5810 }, { "epoch": 2.66, "learning_rate": 3.80517503805175e-08, "logits/chosen": -2.7674450874328613, "logits/rejected": -2.429764986038208, "logps/chosen": -93.63639068603516, "logps/rejected": -126.49434661865234, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -1.7822329998016357, "rewards/margins": 27.3085880279541, "rewards/rejected": -29.0908203125, "step": 5820 }, { "epoch": 2.66, "learning_rate": 3.754439370877727e-08, "logits/chosen": -2.7462615966796875, "logits/rejected": -2.40531325340271, "logps/chosen": -86.35981750488281, "logps/rejected": -123.5717544555664, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.55569589138031, "rewards/margins": 28.12066078186035, "rewards/rejected": -29.67635726928711, "step": 5830 }, { "epoch": 2.67, "learning_rate": 3.7037037037037036e-08, "logits/chosen": -2.7728779315948486, "logits/rejected": -2.4625866413116455, "logps/chosen": -81.15315246582031, "logps/rejected": -125.115478515625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.5160882472991943, "rewards/margins": 28.699787139892578, "rewards/rejected": -30.21587562561035, "step": 5840 }, { "epoch": 2.67, "learning_rate": 3.65296803652968e-08, "logits/chosen": -2.766444683074951, "logits/rejected": -2.4712018966674805, "logps/chosen": -83.96014404296875, "logps/rejected": -116.53572082519531, "loss": 0.0057, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2907227277755737, "rewards/margins": 26.944568634033203, "rewards/rejected": -28.23529052734375, "step": 5850 }, { "epoch": 2.67, "learning_rate": 3.602232369355657e-08, "logits/chosen": -2.76363468170166, "logits/rejected": -2.457428455352783, "logps/chosen": -84.14752960205078, "logps/rejected": -124.1346206665039, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -1.1772854328155518, "rewards/margins": 29.106714248657227, "rewards/rejected": -30.284000396728516, "step": 5860 }, { "epoch": 2.68, "learning_rate": 3.5514967021816336e-08, "logits/chosen": -2.7900850772857666, "logits/rejected": -2.440145492553711, "logps/chosen": -87.7717056274414, "logps/rejected": -125.55278015136719, "loss": 0.0032, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5750354528427124, "rewards/margins": 29.271316528320312, "rewards/rejected": -29.846349716186523, "step": 5870 }, { "epoch": 2.68, "learning_rate": 3.50076103500761e-08, "logits/chosen": -2.7597193717956543, "logits/rejected": -2.436508893966675, "logps/chosen": -86.91271209716797, "logps/rejected": -126.75114440917969, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.8723502159118652, "rewards/margins": 28.568279266357422, "rewards/rejected": -29.440631866455078, "step": 5880 }, { "epoch": 2.69, "learning_rate": 3.450025367833587e-08, "logits/chosen": -2.7778916358947754, "logits/rejected": -2.4695546627044678, "logps/chosen": -89.9264907836914, "logps/rejected": -126.90245056152344, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -2.252490997314453, "rewards/margins": 29.089618682861328, "rewards/rejected": -31.34210777282715, "step": 5890 }, { "epoch": 2.69, "learning_rate": 3.3992897006595636e-08, "logits/chosen": -2.746237277984619, "logits/rejected": -2.440685272216797, "logps/chosen": -89.57696533203125, "logps/rejected": -127.55577087402344, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.9720476865768433, "rewards/margins": 29.311176300048828, "rewards/rejected": -30.28322410583496, "step": 5900 }, { "epoch": 2.69, "eval_logits/chosen": -2.4405784606933594, "eval_logits/rejected": -2.1606497764587402, "eval_logps/chosen": -85.3665542602539, "eval_logps/rejected": -122.85722351074219, "eval_loss": 0.009489791467785835, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -1.5904054641723633, "eval_rewards/margins": 28.607553482055664, "eval_rewards/rejected": -30.197961807250977, "eval_runtime": 174.7423, "eval_samples_per_second": 16.378, "eval_steps_per_second": 1.024, "step": 5900 }, { "epoch": 2.7, "learning_rate": 3.34855403348554e-08, "logits/chosen": -2.7697291374206543, "logits/rejected": -2.40328311920166, "logps/chosen": -93.9384765625, "logps/rejected": -125.51119232177734, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.8093090057373047, "rewards/margins": 28.628559112548828, "rewards/rejected": -30.437870025634766, "step": 5910 }, { "epoch": 2.7, "learning_rate": 3.297818366311517e-08, "logits/chosen": -2.75812029838562, "logits/rejected": -2.482701539993286, "logps/chosen": -87.49197387695312, "logps/rejected": -126.22723388671875, "loss": 0.0042, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.52978515625, "rewards/margins": 28.522212982177734, "rewards/rejected": -31.0519962310791, "step": 5920 }, { "epoch": 2.71, "learning_rate": 3.2470826991374936e-08, "logits/chosen": -2.7732131481170654, "logits/rejected": -2.4267020225524902, "logps/chosen": -90.380126953125, "logps/rejected": -125.74269104003906, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.7779344320297241, "rewards/margins": 29.363544464111328, "rewards/rejected": -31.141475677490234, "step": 5930 }, { "epoch": 2.71, "learning_rate": 3.19634703196347e-08, "logits/chosen": -2.741158962249756, "logits/rejected": -2.434408664703369, "logps/chosen": -89.10270690917969, "logps/rejected": -128.07345581054688, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.6214473247528076, "rewards/margins": 28.6042537689209, "rewards/rejected": -30.2257022857666, "step": 5940 }, { "epoch": 2.72, "learning_rate": 3.145611364789447e-08, "logits/chosen": -2.780167818069458, "logits/rejected": -2.466845989227295, "logps/chosen": -83.76596069335938, "logps/rejected": -126.184326171875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.7433135509490967, "rewards/margins": 29.92877197265625, "rewards/rejected": -30.672088623046875, "step": 5950 }, { "epoch": 2.72, "learning_rate": 3.0948756976154236e-08, "logits/chosen": -2.7710635662078857, "logits/rejected": -2.428365468978882, "logps/chosen": -84.9946517944336, "logps/rejected": -121.47776794433594, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.5806612968444824, "rewards/margins": 28.433761596679688, "rewards/rejected": -29.014423370361328, "step": 5960 }, { "epoch": 2.73, "learning_rate": 3.0441400304414e-08, "logits/chosen": -2.7689902782440186, "logits/rejected": -2.466844081878662, "logps/chosen": -86.32017517089844, "logps/rejected": -125.04664611816406, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5896341800689697, "rewards/margins": 27.400121688842773, "rewards/rejected": -28.989757537841797, "step": 5970 }, { "epoch": 2.73, "learning_rate": 2.993404363267377e-08, "logits/chosen": -2.741663694381714, "logits/rejected": -2.477158308029175, "logps/chosen": -79.96379089355469, "logps/rejected": -127.88536071777344, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.2503182888031006, "rewards/margins": 28.697193145751953, "rewards/rejected": -30.947513580322266, "step": 5980 }, { "epoch": 2.73, "learning_rate": 2.9426686960933532e-08, "logits/chosen": -2.790767192840576, "logits/rejected": -2.4255661964416504, "logps/chosen": -93.36601257324219, "logps/rejected": -130.09959411621094, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -1.3622338771820068, "rewards/margins": 29.60115623474121, "rewards/rejected": -30.963390350341797, "step": 5990 }, { "epoch": 2.74, "learning_rate": 2.89193302891933e-08, "logits/chosen": -2.7811505794525146, "logits/rejected": -2.491635799407959, "logps/chosen": -89.34869384765625, "logps/rejected": -129.8238525390625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.3768314123153687, "rewards/margins": 29.57308006286621, "rewards/rejected": -30.94991111755371, "step": 6000 }, { "epoch": 2.74, "eval_logits/chosen": -2.4402503967285156, "eval_logits/rejected": -2.16145396232605, "eval_logps/chosen": -85.24288940429688, "eval_logps/rejected": -122.63772583007812, "eval_loss": 0.009449008852243423, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": -1.5285696983337402, "eval_rewards/margins": 28.55963897705078, "eval_rewards/rejected": -30.088207244873047, "eval_runtime": 196.0444, "eval_samples_per_second": 14.599, "eval_steps_per_second": 0.913, "step": 6000 }, { "epoch": 2.74, "learning_rate": 2.8411973617453066e-08, "logits/chosen": -2.7585527896881104, "logits/rejected": -2.4668471813201904, "logps/chosen": -85.78216552734375, "logps/rejected": -125.34798431396484, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.589486837387085, "rewards/margins": 28.5289306640625, "rewards/rejected": -30.1184139251709, "step": 6010 }, { "epoch": 2.75, "learning_rate": 2.7904616945712832e-08, "logits/chosen": -2.7979981899261475, "logits/rejected": -2.3822169303894043, "logps/chosen": -91.48565673828125, "logps/rejected": -122.6725082397461, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -2.0604963302612305, "rewards/margins": 27.94063377380371, "rewards/rejected": -30.00113296508789, "step": 6020 }, { "epoch": 2.75, "learning_rate": 2.73972602739726e-08, "logits/chosen": -2.7574362754821777, "logits/rejected": -2.4233498573303223, "logps/chosen": -90.86666107177734, "logps/rejected": -120.50953674316406, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -1.9903202056884766, "rewards/margins": 27.44866371154785, "rewards/rejected": -29.438980102539062, "step": 6030 }, { "epoch": 2.76, "learning_rate": 2.6889903602232366e-08, "logits/chosen": -2.759230375289917, "logits/rejected": -2.449897289276123, "logps/chosen": -88.5560531616211, "logps/rejected": -127.04255676269531, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -1.2785322666168213, "rewards/margins": 30.07562255859375, "rewards/rejected": -31.354156494140625, "step": 6040 }, { "epoch": 2.76, "learning_rate": 2.6382546930492132e-08, "logits/chosen": -2.7630362510681152, "logits/rejected": -2.3909080028533936, "logps/chosen": -87.61473846435547, "logps/rejected": -123.6181640625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.1408017873764038, "rewards/margins": 28.492183685302734, "rewards/rejected": -29.632986068725586, "step": 6050 }, { "epoch": 2.77, "learning_rate": 2.58751902587519e-08, "logits/chosen": -2.772871971130371, "logits/rejected": -2.40971302986145, "logps/chosen": -84.69682312011719, "logps/rejected": -120.79264068603516, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.0478880405426025, "rewards/margins": 27.895471572875977, "rewards/rejected": -28.943359375, "step": 6060 }, { "epoch": 2.77, "learning_rate": 2.5367833587011665e-08, "logits/chosen": -2.7631382942199707, "logits/rejected": -2.407094955444336, "logps/chosen": -88.01454162597656, "logps/rejected": -123.83309173583984, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.46538084745407104, "rewards/margins": 28.641265869140625, "rewards/rejected": -29.106647491455078, "step": 6070 }, { "epoch": 2.78, "learning_rate": 2.4860476915271432e-08, "logits/chosen": -2.766541004180908, "logits/rejected": -2.351665496826172, "logps/chosen": -90.7790756225586, "logps/rejected": -120.49629211425781, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.5625904202461243, "rewards/margins": 29.730051040649414, "rewards/rejected": -29.16745948791504, "step": 6080 }, { "epoch": 2.78, "learning_rate": 2.43531202435312e-08, "logits/chosen": -2.7696120738983154, "logits/rejected": -2.484177350997925, "logps/chosen": -77.89936065673828, "logps/rejected": -120.05888366699219, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.6103872060775757, "rewards/margins": 27.681161880493164, "rewards/rejected": -29.291549682617188, "step": 6090 }, { "epoch": 2.78, "learning_rate": 2.3845763571790965e-08, "logits/chosen": -2.763655662536621, "logits/rejected": -2.445939302444458, "logps/chosen": -86.303955078125, "logps/rejected": -121.3963623046875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.8497899174690247, "rewards/margins": 28.22234535217285, "rewards/rejected": -29.072134017944336, "step": 6100 }, { "epoch": 2.78, "eval_logits/chosen": -2.439971685409546, "eval_logits/rejected": -2.1614773273468018, "eval_logps/chosen": -85.06666564941406, "eval_logps/rejected": -122.4960708618164, "eval_loss": 0.009482893161475658, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -1.4404624700546265, "eval_rewards/margins": 28.576919555664062, "eval_rewards/rejected": -30.017383575439453, "eval_runtime": 193.3402, "eval_samples_per_second": 14.803, "eval_steps_per_second": 0.926, "step": 6100 }, { "epoch": 2.79, "learning_rate": 2.3338406900050732e-08, "logits/chosen": -2.7781970500946045, "logits/rejected": -2.4191970825195312, "logps/chosen": -85.55188751220703, "logps/rejected": -116.68485260009766, "loss": 0.0043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.892316460609436, "rewards/margins": 27.853878021240234, "rewards/rejected": -28.746196746826172, "step": 6110 }, { "epoch": 2.79, "learning_rate": 2.28310502283105e-08, "logits/chosen": -2.7508749961853027, "logits/rejected": -2.4440765380859375, "logps/chosen": -87.50543212890625, "logps/rejected": -126.74629211425781, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0790069103240967, "rewards/margins": 29.00905990600586, "rewards/rejected": -30.08806800842285, "step": 6120 }, { "epoch": 2.8, "learning_rate": 2.2323693556570265e-08, "logits/chosen": -2.753638744354248, "logits/rejected": -2.47627854347229, "logps/chosen": -80.49906158447266, "logps/rejected": -118.0978775024414, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -1.4512609243392944, "rewards/margins": 28.40765380859375, "rewards/rejected": -29.85891342163086, "step": 6130 }, { "epoch": 2.8, "learning_rate": 2.1816336884830032e-08, "logits/chosen": -2.74639892578125, "logits/rejected": -2.4390718936920166, "logps/chosen": -85.38618469238281, "logps/rejected": -119.59771728515625, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9391024708747864, "rewards/margins": 27.43313217163086, "rewards/rejected": -28.372234344482422, "step": 6140 }, { "epoch": 2.81, "learning_rate": 2.13089802130898e-08, "logits/chosen": -2.7636475563049316, "logits/rejected": -2.3613009452819824, "logps/chosen": -90.11565399169922, "logps/rejected": -122.06331634521484, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -1.489884614944458, "rewards/margins": 28.738910675048828, "rewards/rejected": -30.228796005249023, "step": 6150 }, { "epoch": 2.81, "learning_rate": 2.0801623541349565e-08, "logits/chosen": -2.7498292922973633, "logits/rejected": -2.452125072479248, "logps/chosen": -86.10844421386719, "logps/rejected": -128.1103057861328, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.9979543685913086, "rewards/margins": 28.91608238220215, "rewards/rejected": -29.914037704467773, "step": 6160 }, { "epoch": 2.82, "learning_rate": 2.0294266869609332e-08, "logits/chosen": -2.774944543838501, "logits/rejected": -2.444215774536133, "logps/chosen": -89.1854019165039, "logps/rejected": -125.04008483886719, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.2090940475463867, "rewards/margins": 28.304052352905273, "rewards/rejected": -30.513147354125977, "step": 6170 }, { "epoch": 2.82, "learning_rate": 1.97869101978691e-08, "logits/chosen": -2.7468392848968506, "logits/rejected": -2.417240619659424, "logps/chosen": -89.89546203613281, "logps/rejected": -124.21256256103516, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.9482452273368835, "rewards/margins": 29.16946029663086, "rewards/rejected": -30.117706298828125, "step": 6180 }, { "epoch": 2.83, "learning_rate": 1.9279553526128868e-08, "logits/chosen": -2.7608232498168945, "logits/rejected": -2.3906033039093018, "logps/chosen": -86.5302505493164, "logps/rejected": -128.83523559570312, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.6396046876907349, "rewards/margins": 30.30716323852539, "rewards/rejected": -30.946765899658203, "step": 6190 }, { "epoch": 2.83, "learning_rate": 1.8772196854388635e-08, "logits/chosen": -2.758728265762329, "logits/rejected": -2.434004545211792, "logps/chosen": -84.31966400146484, "logps/rejected": -122.57624816894531, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -2.041393518447876, "rewards/margins": 27.10711669921875, "rewards/rejected": -29.148509979248047, "step": 6200 }, { "epoch": 2.83, "eval_logits/chosen": -2.439547300338745, "eval_logits/rejected": -2.15985369682312, "eval_logps/chosen": -84.8874282836914, "eval_logps/rejected": -122.32459259033203, "eval_loss": 0.009298160672187805, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -1.350844383239746, "eval_rewards/margins": 28.580806732177734, "eval_rewards/rejected": -29.931652069091797, "eval_runtime": 185.1509, "eval_samples_per_second": 15.458, "eval_steps_per_second": 0.967, "step": 6200 }, { "epoch": 2.83, "learning_rate": 1.82648401826484e-08, "logits/chosen": -2.742156982421875, "logits/rejected": -2.394625186920166, "logps/chosen": -86.7176513671875, "logps/rejected": -124.78646087646484, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -1.2381303310394287, "rewards/margins": 29.134098052978516, "rewards/rejected": -30.372222900390625, "step": 6210 }, { "epoch": 2.84, "learning_rate": 1.7757483510908168e-08, "logits/chosen": -2.7632789611816406, "logits/rejected": -2.4544222354888916, "logps/chosen": -83.70695495605469, "logps/rejected": -128.5135498046875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.2514070272445679, "rewards/margins": 29.840045928955078, "rewards/rejected": -31.09145164489746, "step": 6220 }, { "epoch": 2.84, "learning_rate": 1.7250126839167935e-08, "logits/chosen": -2.751649856567383, "logits/rejected": -2.5184075832366943, "logps/chosen": -81.00160217285156, "logps/rejected": -119.5700454711914, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.2341742515563965, "rewards/margins": 26.60648536682129, "rewards/rejected": -28.84066390991211, "step": 6230 }, { "epoch": 2.85, "learning_rate": 1.67427701674277e-08, "logits/chosen": -2.7553322315216064, "logits/rejected": -2.4200634956359863, "logps/chosen": -84.46053314208984, "logps/rejected": -124.75267028808594, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.6028271913528442, "rewards/margins": 30.28225326538086, "rewards/rejected": -30.885080337524414, "step": 6240 }, { "epoch": 2.85, "learning_rate": 1.6235413495687468e-08, "logits/chosen": -2.7583327293395996, "logits/rejected": -2.453467845916748, "logps/chosen": -84.0030517578125, "logps/rejected": -122.71601867675781, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.0951178073883057, "rewards/margins": 27.6275577545166, "rewards/rejected": -28.72267723083496, "step": 6250 }, { "epoch": 2.86, "learning_rate": 1.5728056823947235e-08, "logits/chosen": -2.7519564628601074, "logits/rejected": -2.435011386871338, "logps/chosen": -83.85610961914062, "logps/rejected": -126.39498138427734, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.6248266100883484, "rewards/margins": 29.30548095703125, "rewards/rejected": -29.930307388305664, "step": 6260 }, { "epoch": 2.86, "learning_rate": 1.5220700152207e-08, "logits/chosen": -2.7590296268463135, "logits/rejected": -2.469085931777954, "logps/chosen": -82.1221694946289, "logps/rejected": -126.53608703613281, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.6596978902816772, "rewards/margins": 29.254098892211914, "rewards/rejected": -30.913793563842773, "step": 6270 }, { "epoch": 2.87, "learning_rate": 1.4713343480466766e-08, "logits/chosen": -2.766113758087158, "logits/rejected": -2.453413248062134, "logps/chosen": -91.06944274902344, "logps/rejected": -124.0029296875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.389604091644287, "rewards/margins": 26.59371566772461, "rewards/rejected": -28.983322143554688, "step": 6280 }, { "epoch": 2.87, "learning_rate": 1.4205986808726533e-08, "logits/chosen": -2.774259090423584, "logits/rejected": -2.399066209793091, "logps/chosen": -90.34872436523438, "logps/rejected": -120.81922912597656, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.225060224533081, "rewards/margins": 28.232065200805664, "rewards/rejected": -29.45712661743164, "step": 6290 }, { "epoch": 2.88, "learning_rate": 1.36986301369863e-08, "logits/chosen": -2.7689788341522217, "logits/rejected": -2.4589011669158936, "logps/chosen": -83.26945495605469, "logps/rejected": -124.42732238769531, "loss": 0.0019, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8648887872695923, "rewards/margins": 28.850017547607422, "rewards/rejected": -30.71491050720215, "step": 6300 }, { "epoch": 2.88, "eval_logits/chosen": -2.441458225250244, "eval_logits/rejected": -2.162033796310425, "eval_logps/chosen": -84.66895294189453, "eval_logps/rejected": -121.7662582397461, "eval_loss": 0.009261946193873882, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -1.2416049242019653, "eval_rewards/margins": 28.410871505737305, "eval_rewards/rejected": -29.652481079101562, "eval_runtime": 189.127, "eval_samples_per_second": 15.133, "eval_steps_per_second": 0.946, "step": 6300 }, { "epoch": 2.88, "learning_rate": 1.3191273465246066e-08, "logits/chosen": -2.7684543132781982, "logits/rejected": -2.421797037124634, "logps/chosen": -89.03050231933594, "logps/rejected": -121.92149353027344, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.0949914455413818, "rewards/margins": 27.468612670898438, "rewards/rejected": -28.5635986328125, "step": 6310 }, { "epoch": 2.88, "learning_rate": 1.2683916793505833e-08, "logits/chosen": -2.7620081901550293, "logits/rejected": -2.4276468753814697, "logps/chosen": -90.59452819824219, "logps/rejected": -130.93716430664062, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -1.0884459018707275, "rewards/margins": 30.041828155517578, "rewards/rejected": -31.13027572631836, "step": 6320 }, { "epoch": 2.89, "learning_rate": 1.21765601217656e-08, "logits/chosen": -2.7471041679382324, "logits/rejected": -2.428265333175659, "logps/chosen": -81.28025817871094, "logps/rejected": -123.2228775024414, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -1.2295732498168945, "rewards/margins": 28.947052001953125, "rewards/rejected": -30.176626205444336, "step": 6330 }, { "epoch": 2.89, "learning_rate": 1.1669203450025366e-08, "logits/chosen": -2.7810678482055664, "logits/rejected": -2.48518443107605, "logps/chosen": -86.17743682861328, "logps/rejected": -126.41093444824219, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -1.22101628780365, "rewards/margins": 29.040735244750977, "rewards/rejected": -30.261749267578125, "step": 6340 }, { "epoch": 2.9, "learning_rate": 1.1161846778285133e-08, "logits/chosen": -2.7663416862487793, "logits/rejected": -2.4516355991363525, "logps/chosen": -84.50499725341797, "logps/rejected": -120.95661926269531, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.261110544204712, "rewards/margins": 27.491687774658203, "rewards/rejected": -28.752796173095703, "step": 6350 }, { "epoch": 2.9, "learning_rate": 1.06544901065449e-08, "logits/chosen": -2.7554430961608887, "logits/rejected": -2.422839403152466, "logps/chosen": -85.96926879882812, "logps/rejected": -123.37564849853516, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.3166905343532562, "rewards/margins": 29.696842193603516, "rewards/rejected": -30.013530731201172, "step": 6360 }, { "epoch": 2.91, "learning_rate": 1.0147133434804666e-08, "logits/chosen": -2.7821693420410156, "logits/rejected": -2.4158754348754883, "logps/chosen": -88.51810455322266, "logps/rejected": -124.8204345703125, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -1.295811414718628, "rewards/margins": 27.8975830078125, "rewards/rejected": -29.19339942932129, "step": 6370 }, { "epoch": 2.91, "learning_rate": 9.639776763064434e-09, "logits/chosen": -2.7672648429870605, "logits/rejected": -2.415203809738159, "logps/chosen": -83.65962219238281, "logps/rejected": -121.69950866699219, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -0.8128542900085449, "rewards/margins": 29.466506958007812, "rewards/rejected": -30.27936363220215, "step": 6380 }, { "epoch": 2.92, "learning_rate": 9.1324200913242e-09, "logits/chosen": -2.7522847652435303, "logits/rejected": -2.44370174407959, "logps/chosen": -84.73089599609375, "logps/rejected": -120.78410339355469, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.9191125631332397, "rewards/margins": 26.731603622436523, "rewards/rejected": -28.65071678161621, "step": 6390 }, { "epoch": 2.92, "learning_rate": 8.625063419583967e-09, "logits/chosen": -2.7593767642974854, "logits/rejected": -2.4367599487304688, "logps/chosen": -85.0198745727539, "logps/rejected": -123.82391357421875, "loss": 0.0034, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4312102794647217, "rewards/margins": 28.096294403076172, "rewards/rejected": -29.527502059936523, "step": 6400 }, { "epoch": 2.92, "eval_logits/chosen": -2.441237211227417, "eval_logits/rejected": -2.1616415977478027, "eval_logps/chosen": -84.78482818603516, "eval_logps/rejected": -122.0467758178711, "eval_loss": 0.009316547773778439, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -1.2995460033416748, "eval_rewards/margins": 28.493188858032227, "eval_rewards/rejected": -29.792734146118164, "eval_runtime": 177.0866, "eval_samples_per_second": 16.162, "eval_steps_per_second": 1.011, "step": 6400 }, { "epoch": 2.93, "learning_rate": 8.117706747843734e-09, "logits/chosen": -2.751600742340088, "logits/rejected": -2.431941270828247, "logps/chosen": -88.1320571899414, "logps/rejected": -125.81938171386719, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.2640628814697266, "rewards/margins": 28.95229148864746, "rewards/rejected": -30.216354370117188, "step": 6410 }, { "epoch": 2.93, "learning_rate": 7.6103500761035e-09, "logits/chosen": -2.778306245803833, "logits/rejected": -2.4374775886535645, "logps/chosen": -84.7765884399414, "logps/rejected": -123.86201477050781, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -0.431521475315094, "rewards/margins": 30.0347900390625, "rewards/rejected": -30.466312408447266, "step": 6420 }, { "epoch": 2.94, "learning_rate": 7.1029934043632664e-09, "logits/chosen": -2.768923282623291, "logits/rejected": -2.4140679836273193, "logps/chosen": -87.08946990966797, "logps/rejected": -121.84139251708984, "loss": 0.0034, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2873014807701111, "rewards/margins": 29.34170150756836, "rewards/rejected": -29.629003524780273, "step": 6430 }, { "epoch": 2.94, "learning_rate": 6.595636732623033e-09, "logits/chosen": -2.7443387508392334, "logits/rejected": -2.459656238555908, "logps/chosen": -79.73686218261719, "logps/rejected": -125.28855895996094, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.07170786708593369, "rewards/margins": 29.741830825805664, "rewards/rejected": -29.813541412353516, "step": 6440 }, { "epoch": 2.94, "learning_rate": 6.0882800608828e-09, "logits/chosen": -2.7630724906921387, "logits/rejected": -2.4927096366882324, "logps/chosen": -86.48126983642578, "logps/rejected": -127.231201171875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.4614259004592896, "rewards/margins": 28.3530330657959, "rewards/rejected": -29.814456939697266, "step": 6450 }, { "epoch": 2.95, "learning_rate": 5.580923389142566e-09, "logits/chosen": -2.760925531387329, "logits/rejected": -2.4292423725128174, "logps/chosen": -83.03225708007812, "logps/rejected": -123.68485260009766, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 0.11567910015583038, "rewards/margins": 30.139841079711914, "rewards/rejected": -30.0241641998291, "step": 6460 }, { "epoch": 2.95, "learning_rate": 5.073566717402333e-09, "logits/chosen": -2.7655482292175293, "logits/rejected": -2.4329991340637207, "logps/chosen": -90.55628967285156, "logps/rejected": -123.97721099853516, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.7155205011367798, "rewards/margins": 28.31768798828125, "rewards/rejected": -30.0332088470459, "step": 6470 }, { "epoch": 2.96, "learning_rate": 4.5662100456621e-09, "logits/chosen": -2.7627334594726562, "logits/rejected": -2.4173669815063477, "logps/chosen": -84.91883850097656, "logps/rejected": -124.317626953125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -0.24260666966438293, "rewards/margins": 29.781635284423828, "rewards/rejected": -30.024242401123047, "step": 6480 }, { "epoch": 2.96, "learning_rate": 4.058853373921867e-09, "logits/chosen": -2.778211832046509, "logits/rejected": -2.395915985107422, "logps/chosen": -91.13358306884766, "logps/rejected": -124.83601379394531, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.8876069188117981, "rewards/margins": 29.623035430908203, "rewards/rejected": -30.510639190673828, "step": 6490 }, { "epoch": 2.97, "learning_rate": 3.5514967021816332e-09, "logits/chosen": -2.778526544570923, "logits/rejected": -2.495408535003662, "logps/chosen": -83.84934997558594, "logps/rejected": -120.8696060180664, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.9658511877059937, "rewards/margins": 28.089733123779297, "rewards/rejected": -29.055583953857422, "step": 6500 }, { "epoch": 2.97, "eval_logits/chosen": -2.440753221511841, "eval_logits/rejected": -2.1595053672790527, "eval_logps/chosen": -84.7005844116211, "eval_logps/rejected": -121.90135192871094, "eval_loss": 0.009238426573574543, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -1.2574175596237183, "eval_rewards/margins": 28.46260643005371, "eval_rewards/rejected": -29.72002410888672, "eval_runtime": 192.9355, "eval_samples_per_second": 14.834, "eval_steps_per_second": 0.928, "step": 6500 }, { "epoch": 2.97, "learning_rate": 3.0441400304414e-09, "logits/chosen": -2.752532482147217, "logits/rejected": -2.41793155670166, "logps/chosen": -84.92472076416016, "logps/rejected": -124.00596618652344, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.6789621710777283, "rewards/margins": 29.16929054260254, "rewards/rejected": -29.848255157470703, "step": 6510 }, { "epoch": 2.98, "learning_rate": 2.5367833587011665e-09, "logits/chosen": -2.7816948890686035, "logits/rejected": -2.4174342155456543, "logps/chosen": -90.63253021240234, "logps/rejected": -123.43448638916016, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.9251478314399719, "rewards/margins": 29.31369400024414, "rewards/rejected": -30.238842010498047, "step": 6520 }, { "epoch": 2.98, "learning_rate": 2.0294266869609335e-09, "logits/chosen": -2.7638049125671387, "logits/rejected": -2.5021424293518066, "logps/chosen": -82.45162200927734, "logps/rejected": -122.8984146118164, "loss": 0.0046, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8968737125396729, "rewards/margins": 26.78082275390625, "rewards/rejected": -28.677698135375977, "step": 6530 }, { "epoch": 2.99, "learning_rate": 1.5220700152207e-09, "logits/chosen": -2.7564539909362793, "logits/rejected": -2.4106497764587402, "logps/chosen": -80.45997619628906, "logps/rejected": -119.024658203125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.9197688102722168, "rewards/margins": 27.552997589111328, "rewards/rejected": -28.472766876220703, "step": 6540 }, { "epoch": 2.99, "learning_rate": 1.0147133434804667e-09, "logits/chosen": -2.7763352394104004, "logits/rejected": -2.363201141357422, "logps/chosen": -90.98936462402344, "logps/rejected": -119.37083435058594, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.0826857089996338, "rewards/margins": 27.587560653686523, "rewards/rejected": -28.670246124267578, "step": 6550 }, { "epoch": 2.99, "learning_rate": 5.073566717402334e-10, "logits/chosen": -2.7704036235809326, "logits/rejected": -2.4516172409057617, "logps/chosen": -88.80804443359375, "logps/rejected": -122.97371673583984, "loss": 0.0011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1695826053619385, "rewards/margins": 28.09049415588379, "rewards/rejected": -30.26007652282715, "step": 6560 }, { "epoch": 3.0, "learning_rate": 0.0, "logits/chosen": -2.7697036266326904, "logits/rejected": -2.4491117000579834, "logps/chosen": -82.1366958618164, "logps/rejected": -123.7041015625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.5781421661376953, "rewards/margins": 28.360544204711914, "rewards/rejected": -29.938684463500977, "step": 6570 }, { "epoch": 3.0, "step": 6570, "total_flos": 0.0, "train_loss": 0.020571088252061828, "train_runtime": 85394.2588, "train_samples_per_second": 4.925, "train_steps_per_second": 0.077 } ], "logging_steps": 10, "max_steps": 6570, "num_train_epochs": 3, "save_steps": 100, "total_flos": 0.0, "trial_name": null, "trial_params": null }