{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.3054830287206268e-08, "logits/chosen": 0.9550814628601074, "logits/rejected": 1.0664727687835693, "logps/chosen": -190.47879028320312, "logps/rejected": -177.6958770751953, "loss": 0.1031, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 1.3054830287206266e-07, "logits/chosen": 1.021599531173706, "logits/rejected": 1.0737736225128174, "logps/chosen": -277.8912048339844, "logps/rejected": -268.34259033203125, "loss": 0.0514, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": 2.9820108466083184e-05, "rewards/margins": 0.000656133983284235, "rewards/rejected": -0.0006263138493523002, "step": 10 }, { "epoch": 0.01, "learning_rate": 2.610966057441253e-07, "logits/chosen": 1.0539672374725342, "logits/rejected": 1.035296082496643, "logps/chosen": -258.02105712890625, "logps/rejected": -219.51577758789062, "loss": 0.0679, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.00037693610647693276, "rewards/margins": -0.0003669637371785939, "rewards/rejected": -9.972270163416397e-06, "step": 20 }, { "epoch": 0.01, "learning_rate": 3.9164490861618804e-07, "logits/chosen": 0.9785920977592468, "logits/rejected": 0.9956333041191101, "logps/chosen": -234.4257354736328, "logps/rejected": -216.3408660888672, "loss": 0.0522, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.00037702807458117604, "rewards/margins": 0.0003918584552593529, "rewards/rejected": -0.0007688865880481899, "step": 30 }, { "epoch": 0.01, "learning_rate": 5.221932114882506e-07, "logits/chosen": 1.0598526000976562, "logits/rejected": 1.0610239505767822, "logps/chosen": -269.3299865722656, "logps/rejected": -236.5482635498047, "loss": 0.0646, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.0001463999942643568, "rewards/margins": 0.0003663330862764269, "rewards/rejected": -0.0005127330077812076, "step": 40 }, { "epoch": 0.01, "learning_rate": 6.527415143603135e-07, "logits/chosen": 1.0115251541137695, "logits/rejected": 1.0492277145385742, "logps/chosen": -245.1737518310547, "logps/rejected": -241.9782257080078, "loss": 0.053, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.0008561966242268682, "rewards/margins": 0.00045777196646668017, "rewards/rejected": -0.0013139685615897179, "step": 50 }, { "epoch": 0.02, "learning_rate": 7.832898172323761e-07, "logits/chosen": 0.9759989976882935, "logits/rejected": 1.09335196018219, "logps/chosen": -283.7034912109375, "logps/rejected": -234.171142578125, "loss": 0.0508, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0002724650257732719, "rewards/margins": 0.0006013559177517891, "rewards/rejected": -0.0003288908628746867, "step": 60 }, { "epoch": 0.02, "learning_rate": 9.138381201044387e-07, "logits/chosen": 1.0061399936676025, "logits/rejected": 1.0819300413131714, "logps/chosen": -272.0354919433594, "logps/rejected": -231.0594482421875, "loss": 0.0533, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.0003669198777060956, "rewards/margins": -0.0001511875307187438, "rewards/rejected": -0.00021573244885075837, "step": 70 }, { "epoch": 0.02, "learning_rate": 1.0443864229765013e-06, "logits/chosen": 1.0220763683319092, "logits/rejected": 1.0622212886810303, "logps/chosen": -283.91650390625, "logps/rejected": -261.65411376953125, "loss": 0.0441, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.00052777084056288, "rewards/margins": -0.0005939611000940204, "rewards/rejected": 6.619028135901317e-05, "step": 80 }, { "epoch": 0.02, "learning_rate": 1.1749347258485642e-06, "logits/chosen": 1.0424718856811523, "logits/rejected": 1.092550277709961, "logps/chosen": -278.462890625, "logps/rejected": -235.7613983154297, "loss": 0.0596, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -0.0010484650265425444, "rewards/margins": -0.0007164698326960206, "rewards/rejected": -0.0003319952520541847, "step": 90 }, { "epoch": 0.03, "learning_rate": 1.305483028720627e-06, "logits/chosen": 0.9916040301322937, "logits/rejected": 1.066935420036316, "logps/chosen": -237.2812957763672, "logps/rejected": -218.4796905517578, "loss": 0.0659, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -4.606993752531707e-05, "rewards/margins": 0.0002796413318719715, "rewards/rejected": -0.000325711298501119, "step": 100 }, { "epoch": 0.03, "eval_logits/chosen": 0.9710860252380371, "eval_logits/rejected": 1.0635499954223633, "eval_logps/chosen": -277.5683288574219, "eval_logps/rejected": -243.89227294921875, "eval_loss": 0.053576212376356125, "eval_rewards/accuracies": 0.47450000047683716, "eval_rewards/chosen": -0.00021531998936552554, "eval_rewards/margins": 0.0005480629042722285, "eval_rewards/rejected": -0.0007633829372934997, "eval_runtime": 539.1486, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.927, "step": 100 }, { "epoch": 0.03, "learning_rate": 1.4360313315926894e-06, "logits/chosen": 0.9959138035774231, "logits/rejected": 1.0810822248458862, "logps/chosen": -283.58575439453125, "logps/rejected": -250.1833038330078, "loss": 0.0529, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0003468608483672142, "rewards/margins": 0.0019003556808456779, "rewards/rejected": -0.0015534948324784636, "step": 110 }, { "epoch": 0.03, "learning_rate": 1.5665796344647521e-06, "logits/chosen": 1.0288623571395874, "logits/rejected": 1.0744774341583252, "logps/chosen": -227.82470703125, "logps/rejected": -234.0697479248047, "loss": 0.0857, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.0001596544898347929, "rewards/margins": -0.0005016528302803636, "rewards/rejected": 0.0003419983549974859, "step": 120 }, { "epoch": 0.03, "learning_rate": 1.6971279373368146e-06, "logits/chosen": 1.04789137840271, "logits/rejected": 1.0942102670669556, "logps/chosen": -282.67510986328125, "logps/rejected": -239.3311309814453, "loss": 0.0449, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00027361814863979816, "rewards/margins": 0.002355109201744199, "rewards/rejected": -0.002081490820273757, "step": 130 }, { "epoch": 0.04, "learning_rate": 1.8276762402088774e-06, "logits/chosen": 1.0264707803726196, "logits/rejected": 1.02583646774292, "logps/chosen": -264.01715087890625, "logps/rejected": -237.10549926757812, "loss": 0.0474, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0003395401581656188, "rewards/margins": 0.001898492919281125, "rewards/rejected": -0.0015589528484269977, "step": 140 }, { "epoch": 0.04, "learning_rate": 1.9582245430809403e-06, "logits/chosen": 1.0069233179092407, "logits/rejected": 1.0264513492584229, "logps/chosen": -262.6693420410156, "logps/rejected": -235.0095977783203, "loss": 0.0641, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.00045030430192127824, "rewards/margins": 0.0019102304941043258, "rewards/rejected": -0.0014599261339753866, "step": 150 }, { "epoch": 0.04, "learning_rate": 2.0887728459530026e-06, "logits/chosen": 0.9561678171157837, "logits/rejected": 1.085860252380371, "logps/chosen": -258.2762451171875, "logps/rejected": -240.168701171875, "loss": 0.0571, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0003362145507708192, "rewards/margins": 0.002913826610893011, "rewards/rejected": -0.0025776117108762264, "step": 160 }, { "epoch": 0.04, "learning_rate": 2.2193211488250653e-06, "logits/chosen": 0.996711254119873, "logits/rejected": 1.0722663402557373, "logps/chosen": -268.49578857421875, "logps/rejected": -218.3070831298828, "loss": 0.0502, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0006658965139649808, "rewards/margins": 0.0029082505498081446, "rewards/rejected": -0.0022423542104661465, "step": 170 }, { "epoch": 0.05, "learning_rate": 2.3498694516971284e-06, "logits/chosen": 0.9853906631469727, "logits/rejected": 1.033320665359497, "logps/chosen": -272.53961181640625, "logps/rejected": -237.8509979248047, "loss": 0.0638, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0014408377464860678, "rewards/margins": 0.003102297894656658, "rewards/rejected": -0.0016614599153399467, "step": 180 }, { "epoch": 0.05, "learning_rate": 2.4804177545691907e-06, "logits/chosen": 0.9930588006973267, "logits/rejected": 1.0107576847076416, "logps/chosen": -269.4462890625, "logps/rejected": -235.57852172851562, "loss": 0.0604, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0017847366398200393, "rewards/margins": 0.004315118305385113, "rewards/rejected": -0.0025303815491497517, "step": 190 }, { "epoch": 0.05, "learning_rate": 2.610966057441254e-06, "logits/chosen": 1.0219703912734985, "logits/rejected": 1.1328296661376953, "logps/chosen": -278.0802917480469, "logps/rejected": -249.68466186523438, "loss": 0.0597, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.004124884493649006, "rewards/margins": 0.005005924496799707, "rewards/rejected": -0.000881039712112397, "step": 200 }, { "epoch": 0.05, "eval_logits/chosen": 0.9688093662261963, "eval_logits/rejected": 1.0617414712905884, "eval_logps/chosen": -277.1978759765625, "eval_logps/rejected": -243.9651336669922, "eval_loss": 0.05182640627026558, "eval_rewards/accuracies": 0.5879999995231628, "eval_rewards/chosen": 0.0034893574193120003, "eval_rewards/margins": 0.00498173339292407, "eval_rewards/rejected": -0.0014923758571967483, "eval_runtime": 539.156, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.927, "step": 200 }, { "epoch": 0.05, "learning_rate": 2.741514360313316e-06, "logits/chosen": 1.0123722553253174, "logits/rejected": 1.0923728942871094, "logps/chosen": -260.80499267578125, "logps/rejected": -233.2253875732422, "loss": 0.0394, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.002722758101299405, "rewards/margins": 0.004253658466041088, "rewards/rejected": -0.0015309008304029703, "step": 210 }, { "epoch": 0.06, "learning_rate": 2.872062663185379e-06, "logits/chosen": 1.040575385093689, "logits/rejected": 1.1116924285888672, "logps/chosen": -277.50433349609375, "logps/rejected": -243.0937042236328, "loss": 0.0443, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.004540332593023777, "rewards/margins": 0.005531441420316696, "rewards/rejected": -0.0009911099914461374, "step": 220 }, { "epoch": 0.06, "learning_rate": 3.0026109660574416e-06, "logits/chosen": 1.029170036315918, "logits/rejected": 1.0374505519866943, "logps/chosen": -268.8113708496094, "logps/rejected": -275.160400390625, "loss": 0.0402, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.005796975456178188, "rewards/margins": 0.006743866018950939, "rewards/rejected": -0.0009468902135267854, "step": 230 }, { "epoch": 0.06, "learning_rate": 3.1331592689295043e-06, "logits/chosen": 1.027822732925415, "logits/rejected": 1.0454634428024292, "logps/chosen": -271.84796142578125, "logps/rejected": -231.60018920898438, "loss": 0.0469, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.009604343213140965, "rewards/margins": 0.010616883635520935, "rewards/rejected": -0.001012541470117867, "step": 240 }, { "epoch": 0.07, "learning_rate": 3.263707571801567e-06, "logits/chosen": 0.9246234893798828, "logits/rejected": 1.081726312637329, "logps/chosen": -262.35052490234375, "logps/rejected": -207.2003631591797, "loss": 0.0461, "rewards/accuracies": 0.5625, "rewards/chosen": 0.005290716886520386, "rewards/margins": 0.007529892958700657, "rewards/rejected": -0.002239175606518984, "step": 250 }, { "epoch": 0.07, "learning_rate": 3.3942558746736293e-06, "logits/chosen": 1.0157970190048218, "logits/rejected": 1.0061004161834717, "logps/chosen": -255.8148956298828, "logps/rejected": -249.33810424804688, "loss": 0.0541, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.005560068879276514, "rewards/margins": 0.00910879485309124, "rewards/rejected": -0.003548725973814726, "step": 260 }, { "epoch": 0.07, "learning_rate": 3.524804177545692e-06, "logits/chosen": 0.9183789491653442, "logits/rejected": 1.0651085376739502, "logps/chosen": -250.3922119140625, "logps/rejected": -225.31845092773438, "loss": 0.0573, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.004733686335384846, "rewards/margins": 0.010133610107004642, "rewards/rejected": -0.005399924702942371, "step": 270 }, { "epoch": 0.07, "learning_rate": 3.6553524804177547e-06, "logits/chosen": 0.9533087015151978, "logits/rejected": 0.9936316609382629, "logps/chosen": -262.45989990234375, "logps/rejected": -245.0988006591797, "loss": 0.0652, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.009047111496329308, "rewards/margins": 0.01569160632789135, "rewards/rejected": -0.006644496228545904, "step": 280 }, { "epoch": 0.08, "learning_rate": 3.7859007832898174e-06, "logits/chosen": 0.9586070775985718, "logits/rejected": 1.0487323999404907, "logps/chosen": -258.58087158203125, "logps/rejected": -229.2060546875, "loss": 0.0527, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.009910664521157742, "rewards/margins": 0.01812123879790306, "rewards/rejected": -0.008210571482777596, "step": 290 }, { "epoch": 0.08, "learning_rate": 3.9164490861618806e-06, "logits/chosen": 0.9238823056221008, "logits/rejected": 1.0459530353546143, "logps/chosen": -257.22900390625, "logps/rejected": -227.42770385742188, "loss": 0.0564, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.008343839086592197, "rewards/margins": 0.01660408265888691, "rewards/rejected": -0.008260244503617287, "step": 300 }, { "epoch": 0.08, "eval_logits/chosen": 0.9498724341392517, "eval_logits/rejected": 1.0439953804016113, "eval_logps/chosen": -276.5095520019531, "eval_logps/rejected": -244.6271514892578, "eval_loss": 0.047470785677433014, "eval_rewards/accuracies": 0.6175000071525574, "eval_rewards/chosen": 0.010372455231845379, "eval_rewards/margins": 0.018484672531485558, "eval_rewards/rejected": -0.008112218230962753, "eval_runtime": 539.0567, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.928, "step": 300 }, { "epoch": 0.08, "learning_rate": 4.046997389033943e-06, "logits/chosen": 0.9125510454177856, "logits/rejected": 1.0743194818496704, "logps/chosen": -256.52203369140625, "logps/rejected": -227.2289581298828, "loss": 0.0457, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.013696533627808094, "rewards/margins": 0.022891724482178688, "rewards/rejected": -0.009195187129080296, "step": 310 }, { "epoch": 0.08, "learning_rate": 4.177545691906005e-06, "logits/chosen": 0.8873203992843628, "logits/rejected": 1.0165441036224365, "logps/chosen": -282.78265380859375, "logps/rejected": -257.1055603027344, "loss": 0.0416, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.009485239163041115, "rewards/margins": 0.01556326448917389, "rewards/rejected": -0.006078026257455349, "step": 320 }, { "epoch": 0.09, "learning_rate": 4.308093994778068e-06, "logits/chosen": 0.967635989189148, "logits/rejected": 1.0755988359451294, "logps/chosen": -278.8580017089844, "logps/rejected": -244.1697235107422, "loss": 0.0547, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0037938461173325777, "rewards/margins": 0.02032056823372841, "rewards/rejected": -0.01652671955525875, "step": 330 }, { "epoch": 0.09, "learning_rate": 4.4386422976501306e-06, "logits/chosen": 0.9581148028373718, "logits/rejected": 0.9901423454284668, "logps/chosen": -274.140625, "logps/rejected": -268.93115234375, "loss": 0.0454, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.004858436528593302, "rewards/margins": 0.02044074237346649, "rewards/rejected": -0.015582305379211903, "step": 340 }, { "epoch": 0.09, "learning_rate": 4.569190600522193e-06, "logits/chosen": 0.974514365196228, "logits/rejected": 0.9625232815742493, "logps/chosen": -284.48089599609375, "logps/rejected": -250.8555908203125, "loss": 0.0553, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.008898411877453327, "rewards/margins": 0.02419520542025566, "rewards/rejected": -0.015296794474124908, "step": 350 }, { "epoch": 0.09, "learning_rate": 4.699738903394257e-06, "logits/chosen": 0.9241229295730591, "logits/rejected": 1.0142980813980103, "logps/chosen": -301.9035949707031, "logps/rejected": -258.56298828125, "loss": 0.0361, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.00739449355751276, "rewards/margins": 0.02596624568104744, "rewards/rejected": -0.018571753054857254, "step": 360 }, { "epoch": 0.1, "learning_rate": 4.8302872062663196e-06, "logits/chosen": 0.9622675180435181, "logits/rejected": 0.9503853917121887, "logps/chosen": -305.0982971191406, "logps/rejected": -260.784423828125, "loss": 0.0439, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.008727970533072948, "rewards/margins": 0.026681995019316673, "rewards/rejected": -0.01795402355492115, "step": 370 }, { "epoch": 0.1, "learning_rate": 4.9608355091383814e-06, "logits/chosen": 0.9401235580444336, "logits/rejected": 1.042701005935669, "logps/chosen": -255.1713409423828, "logps/rejected": -223.62197875976562, "loss": 0.0635, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.010734304785728455, "rewards/margins": 0.0330788716673851, "rewards/rejected": -0.022344566881656647, "step": 380 }, { "epoch": 0.1, "learning_rate": 4.9999488562447675e-06, "logits/chosen": 0.9535024762153625, "logits/rejected": 0.9772897958755493, "logps/chosen": -298.8131103515625, "logps/rejected": -256.53302001953125, "loss": 0.0383, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.01018481608480215, "rewards/margins": 0.03526074439287186, "rewards/rejected": -0.02507592737674713, "step": 390 }, { "epoch": 0.1, "learning_rate": 4.999698361256577e-06, "logits/chosen": 0.9641995429992676, "logits/rejected": 0.9660250544548035, "logps/chosen": -278.9350891113281, "logps/rejected": -263.6481628417969, "loss": 0.0402, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0038296219427138567, "rewards/margins": 0.0380658321082592, "rewards/rejected": -0.03423621878027916, "step": 400 }, { "epoch": 0.1, "eval_logits/chosen": 0.8994618058204651, "eval_logits/rejected": 0.9931817650794983, "eval_logps/chosen": -277.37713623046875, "eval_logps/rejected": -246.910888671875, "eval_loss": 0.04383732005953789, "eval_rewards/accuracies": 0.6324999928474426, "eval_rewards/chosen": 0.0016969649586826563, "eval_rewards/margins": 0.03264675661921501, "eval_rewards/rejected": -0.03094978630542755, "eval_runtime": 539.0327, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.928, "step": 400 }, { "epoch": 0.11, "learning_rate": 4.999239142174581e-06, "logits/chosen": 0.8831006288528442, "logits/rejected": 0.8935713768005371, "logps/chosen": -293.3919982910156, "logps/rejected": -245.70181274414062, "loss": 0.0399, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0013237474486231804, "rewards/margins": 0.03327140584588051, "rewards/rejected": -0.03194766119122505, "step": 410 }, { "epoch": 0.11, "learning_rate": 4.99857123734344e-06, "logits/chosen": 0.9195354580879211, "logits/rejected": 1.0214719772338867, "logps/chosen": -278.9452209472656, "logps/rejected": -246.7372589111328, "loss": 0.0479, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0041291858069598675, "rewards/margins": 0.03341008350253105, "rewards/rejected": -0.03753926604986191, "step": 420 }, { "epoch": 0.11, "learning_rate": 4.997694702533016e-06, "logits/chosen": 0.8224805593490601, "logits/rejected": 0.9571186900138855, "logps/chosen": -265.44757080078125, "logps/rejected": -260.35748291015625, "loss": 0.0417, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.01658559963107109, "rewards/margins": 0.031039753928780556, "rewards/rejected": -0.0476253516972065, "step": 430 }, { "epoch": 0.12, "learning_rate": 4.996609610933713e-06, "logits/chosen": 0.8995935320854187, "logits/rejected": 0.8752401471138, "logps/chosen": -283.92718505859375, "logps/rejected": -262.1937561035156, "loss": 0.0446, "rewards/accuracies": 0.59375, "rewards/chosen": -0.02005813643336296, "rewards/margins": 0.027436578646302223, "rewards/rejected": -0.04749471694231033, "step": 440 }, { "epoch": 0.12, "learning_rate": 4.995316053150366e-06, "logits/chosen": 0.8733296394348145, "logits/rejected": 0.9702059626579285, "logps/chosen": -263.5385437011719, "logps/rejected": -237.3717803955078, "loss": 0.035, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02067594602704048, "rewards/margins": 0.025421470403671265, "rewards/rejected": -0.046097420156002045, "step": 450 }, { "epoch": 0.12, "learning_rate": 4.9938141371946815e-06, "logits/chosen": 0.9152857661247253, "logits/rejected": 1.0352412462234497, "logps/chosen": -252.8705596923828, "logps/rejected": -253.1604766845703, "loss": 0.0578, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.00973983108997345, "rewards/margins": 0.04021871089935303, "rewards/rejected": -0.049958545714616776, "step": 460 }, { "epoch": 0.12, "learning_rate": 4.992103988476206e-06, "logits/chosen": 0.9032732844352722, "logits/rejected": 0.9913337826728821, "logps/chosen": -296.672119140625, "logps/rejected": -250.1068878173828, "loss": 0.0611, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.013237145729362965, "rewards/margins": 0.0339890792965889, "rewards/rejected": -0.047226227819919586, "step": 470 }, { "epoch": 0.13, "learning_rate": 4.990185749791866e-06, "logits/chosen": 0.852449893951416, "logits/rejected": 0.9530878067016602, "logps/chosen": -244.0757293701172, "logps/rejected": -213.8367919921875, "loss": 0.041, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.016842521727085114, "rewards/margins": 0.039869144558906555, "rewards/rejected": -0.05671166256070137, "step": 480 }, { "epoch": 0.13, "learning_rate": 4.9880595813140395e-06, "logits/chosen": 0.8507224321365356, "logits/rejected": 0.9859424829483032, "logps/chosen": -276.9876403808594, "logps/rejected": -239.509521484375, "loss": 0.0438, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.022139808163046837, "rewards/margins": 0.034604597836732864, "rewards/rejected": -0.05674440786242485, "step": 490 }, { "epoch": 0.13, "learning_rate": 4.985725660577184e-06, "logits/chosen": 0.9554396867752075, "logits/rejected": 0.9389545321464539, "logps/chosen": -246.587158203125, "logps/rejected": -236.51171875, "loss": 0.0421, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.031007111072540283, "rewards/margins": 0.036111582070589066, "rewards/rejected": -0.06711869686841965, "step": 500 }, { "epoch": 0.13, "eval_logits/chosen": 0.8361961841583252, "eval_logits/rejected": 0.9295023679733276, "eval_logps/chosen": -281.6956481933594, "eval_logps/rejected": -251.91390991210938, "eval_loss": 0.041099708527326584, "eval_rewards/accuracies": 0.6194999814033508, "eval_rewards/chosen": -0.0414884127676487, "eval_rewards/margins": 0.03949163854122162, "eval_rewards/rejected": -0.08098004758358002, "eval_runtime": 539.1317, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.927, "step": 500 }, { "epoch": 0.13, "learning_rate": 4.983184182463009e-06, "logits/chosen": 0.8190703392028809, "logits/rejected": 0.9820553064346313, "logps/chosen": -286.83819580078125, "logps/rejected": -225.30502319335938, "loss": 0.0481, "rewards/accuracies": 0.625, "rewards/chosen": -0.034673698246479034, "rewards/margins": 0.04182344675064087, "rewards/rejected": -0.0764971375465393, "step": 510 }, { "epoch": 0.14, "learning_rate": 4.980435359184203e-06, "logits/chosen": 0.8918254971504211, "logits/rejected": 0.9686266779899597, "logps/chosen": -291.6520080566406, "logps/rejected": -257.6617126464844, "loss": 0.0324, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.034312546253204346, "rewards/margins": 0.04724326729774475, "rewards/rejected": -0.0815558210015297, "step": 520 }, { "epoch": 0.14, "learning_rate": 4.9774794202667236e-06, "logits/chosen": 0.8753170967102051, "logits/rejected": 0.9559276700019836, "logps/chosen": -268.8739929199219, "logps/rejected": -239.81484985351562, "loss": 0.0294, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.028610479086637497, "rewards/margins": 0.04722968488931656, "rewards/rejected": -0.07584016025066376, "step": 530 }, { "epoch": 0.14, "learning_rate": 4.974316612530615e-06, "logits/chosen": 0.8446584939956665, "logits/rejected": 0.9035196304321289, "logps/chosen": -245.0946502685547, "logps/rejected": -227.1122589111328, "loss": 0.0412, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.03522849082946777, "rewards/margins": 0.045352503657341, "rewards/rejected": -0.08058099448680878, "step": 540 }, { "epoch": 0.14, "learning_rate": 4.970947200069416e-06, "logits/chosen": 0.8838707208633423, "logits/rejected": 0.9225630760192871, "logps/chosen": -267.9327087402344, "logps/rejected": -232.60745239257812, "loss": 0.0575, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.05179372429847717, "rewards/margins": 0.05676066875457764, "rewards/rejected": -0.10855438560247421, "step": 550 }, { "epoch": 0.15, "learning_rate": 4.967371464228096e-06, "logits/chosen": 0.9162457585334778, "logits/rejected": 0.9888601303100586, "logps/chosen": -288.94476318359375, "logps/rejected": -267.3609313964844, "loss": 0.0424, "rewards/accuracies": 0.625, "rewards/chosen": -0.05239032953977585, "rewards/margins": 0.040877897292375565, "rewards/rejected": -0.09326823055744171, "step": 560 }, { "epoch": 0.15, "learning_rate": 4.963589703579569e-06, "logits/chosen": 0.8896541595458984, "logits/rejected": 1.071001410484314, "logps/chosen": -271.9025573730469, "logps/rejected": -227.18283081054688, "loss": 0.0489, "rewards/accuracies": 0.6875, "rewards/chosen": -0.039633095264434814, "rewards/margins": 0.06890513002872467, "rewards/rejected": -0.10853822529315948, "step": 570 }, { "epoch": 0.15, "learning_rate": 4.9596022338997615e-06, "logits/chosen": 0.9138982892036438, "logits/rejected": 0.8892068862915039, "logps/chosen": -261.0631408691406, "logps/rejected": -233.52206420898438, "loss": 0.0463, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04562286287546158, "rewards/margins": 0.03961200267076492, "rewards/rejected": -0.0852348655462265, "step": 580 }, { "epoch": 0.15, "learning_rate": 4.955409388141243e-06, "logits/chosen": 0.9704087972640991, "logits/rejected": 0.9119867086410522, "logps/chosen": -272.70599365234375, "logps/rejected": -236.0823211669922, "loss": 0.0383, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.04884126037359238, "rewards/margins": 0.03933250904083252, "rewards/rejected": -0.0881737768650055, "step": 590 }, { "epoch": 0.16, "learning_rate": 4.951011516405429e-06, "logits/chosen": 0.9375241994857788, "logits/rejected": 0.9329082369804382, "logps/chosen": -243.24472045898438, "logps/rejected": -266.67962646484375, "loss": 0.0439, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06609703600406647, "rewards/margins": 0.03910304233431816, "rewards/rejected": -0.10520007461309433, "step": 600 }, { "epoch": 0.16, "eval_logits/chosen": 0.8607339262962341, "eval_logits/rejected": 0.952020525932312, "eval_logps/chosen": -284.5547180175781, "eval_logps/rejected": -255.50050354003906, "eval_loss": 0.03948886692523956, "eval_rewards/accuracies": 0.6175000071525574, "eval_rewards/chosen": -0.07007911801338196, "eval_rewards/margins": 0.046766627579927444, "eval_rewards/rejected": -0.11684573441743851, "eval_runtime": 539.068, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.928, "step": 600 }, { "epoch": 0.16, "learning_rate": 4.946408985913344e-06, "logits/chosen": 0.9299923777580261, "logits/rejected": 0.949097752571106, "logps/chosen": -251.2418212890625, "logps/rejected": -229.9620819091797, "loss": 0.0418, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08045734465122223, "rewards/margins": 0.039681874215602875, "rewards/rejected": -0.1201392188668251, "step": 610 }, { "epoch": 0.16, "learning_rate": 4.941602180974958e-06, "logits/chosen": 0.9066370129585266, "logits/rejected": 0.9455870389938354, "logps/chosen": -293.0451354980469, "logps/rejected": -237.9638214111328, "loss": 0.0459, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.07144733518362045, "rewards/margins": 0.0446074940264225, "rewards/rejected": -0.11605483293533325, "step": 620 }, { "epoch": 0.16, "learning_rate": 4.936591502957101e-06, "logits/chosen": 0.8010427355766296, "logits/rejected": 0.9337188005447388, "logps/chosen": -301.9491882324219, "logps/rejected": -257.299560546875, "loss": 0.0313, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.049509234726428986, "rewards/margins": 0.03574910759925842, "rewards/rejected": -0.085258349776268, "step": 630 }, { "epoch": 0.17, "learning_rate": 4.931377370249946e-06, "logits/chosen": 0.9087220430374146, "logits/rejected": 0.9815553426742554, "logps/chosen": -279.0362548828125, "logps/rejected": -235.7110137939453, "loss": 0.0311, "rewards/accuracies": 0.625, "rewards/chosen": -0.0523492693901062, "rewards/margins": 0.03988610580563545, "rewards/rejected": -0.09223536401987076, "step": 640 }, { "epoch": 0.17, "learning_rate": 4.925960218232073e-06, "logits/chosen": 0.9217544794082642, "logits/rejected": 1.0427885055541992, "logps/chosen": -308.48004150390625, "logps/rejected": -286.24566650390625, "loss": 0.0343, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.05535319447517395, "rewards/margins": 0.054420508444309235, "rewards/rejected": -0.10977371037006378, "step": 650 }, { "epoch": 0.17, "learning_rate": 4.920340499234116e-06, "logits/chosen": 0.8749852180480957, "logits/rejected": 0.9889238476753235, "logps/chosen": -288.8839416503906, "logps/rejected": -217.01760864257812, "loss": 0.0459, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.051978230476379395, "rewards/margins": 0.03869297355413437, "rewards/rejected": -0.09067119657993317, "step": 660 }, { "epoch": 0.18, "learning_rate": 4.914518682500995e-06, "logits/chosen": 0.955339252948761, "logits/rejected": 0.939558207988739, "logps/chosen": -282.4548034667969, "logps/rejected": -278.93646240234375, "loss": 0.0427, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.05497046187520027, "rewards/margins": 0.046811606734991074, "rewards/rejected": -0.10178206861019135, "step": 670 }, { "epoch": 0.18, "learning_rate": 4.9084952541527315e-06, "logits/chosen": 0.9087344408035278, "logits/rejected": 0.9323067665100098, "logps/chosen": -258.89776611328125, "logps/rejected": -246.7029266357422, "loss": 0.0366, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03954412043094635, "rewards/margins": 0.0509122833609581, "rewards/rejected": -0.09045641124248505, "step": 680 }, { "epoch": 0.18, "learning_rate": 4.902270717143858e-06, "logits/chosen": 0.8921301960945129, "logits/rejected": 0.9605242013931274, "logps/chosen": -288.9732666015625, "logps/rejected": -252.2064971923828, "loss": 0.0353, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03391667455434799, "rewards/margins": 0.05217113345861435, "rewards/rejected": -0.08608780801296234, "step": 690 }, { "epoch": 0.18, "learning_rate": 4.895845591221427e-06, "logits/chosen": 0.8141648173332214, "logits/rejected": 0.9764218330383301, "logps/chosen": -251.57406616210938, "logps/rejected": -232.8114776611328, "loss": 0.0363, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.04507957026362419, "rewards/margins": 0.047078561037778854, "rewards/rejected": -0.09215812385082245, "step": 700 }, { "epoch": 0.18, "eval_logits/chosen": 0.894931435585022, "eval_logits/rejected": 0.9895482063293457, "eval_logps/chosen": -281.16192626953125, "eval_logps/rejected": -251.89256286621094, "eval_loss": 0.03899623081088066, "eval_rewards/accuracies": 0.6309999823570251, "eval_rewards/chosen": -0.036151450127363205, "eval_rewards/margins": 0.04461483657360077, "eval_rewards/rejected": -0.08076628297567368, "eval_runtime": 539.1732, "eval_samples_per_second": 3.709, "eval_steps_per_second": 0.927, "step": 700 }, { "epoch": 0.19, "learning_rate": 4.8892204128816e-06, "logits/chosen": 0.9831956028938293, "logits/rejected": 1.0133693218231201, "logps/chosen": -286.310546875, "logps/rejected": -283.8514099121094, "loss": 0.035, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.034512270241975784, "rewards/margins": 0.047017090022563934, "rewards/rejected": -0.08152935653924942, "step": 710 }, { "epoch": 0.19, "learning_rate": 4.882395735324864e-06, "logits/chosen": 0.8945713043212891, "logits/rejected": 0.8778280019760132, "logps/chosen": -281.03704833984375, "logps/rejected": -255.6659698486328, "loss": 0.0347, "rewards/accuracies": 0.625, "rewards/chosen": -0.027182284742593765, "rewards/margins": 0.04411619156599045, "rewards/rejected": -0.07129846513271332, "step": 720 }, { "epoch": 0.19, "learning_rate": 4.87537212840983e-06, "logits/chosen": 0.9152518510818481, "logits/rejected": 0.9284723997116089, "logps/chosen": -302.0511169433594, "logps/rejected": -256.92071533203125, "loss": 0.0389, "rewards/accuracies": 0.625, "rewards/chosen": -0.02550722099840641, "rewards/margins": 0.049370888620615005, "rewards/rejected": -0.07487811148166656, "step": 730 }, { "epoch": 0.19, "learning_rate": 4.8681501786056545e-06, "logits/chosen": 0.8889036178588867, "logits/rejected": 0.9711803197860718, "logps/chosen": -273.5214538574219, "logps/rejected": -221.85977172851562, "loss": 0.0501, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.03598209470510483, "rewards/margins": 0.03736092895269394, "rewards/rejected": -0.07334302365779877, "step": 740 }, { "epoch": 0.2, "learning_rate": 4.860730488943068e-06, "logits/chosen": 0.9261956214904785, "logits/rejected": 0.9333757162094116, "logps/chosen": -279.0644226074219, "logps/rejected": -245.6189422607422, "loss": 0.0386, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.016734056174755096, "rewards/margins": 0.05964844301342964, "rewards/rejected": -0.07638250291347504, "step": 750 }, { "epoch": 0.2, "learning_rate": 4.853113678964022e-06, "logits/chosen": 0.9745391607284546, "logits/rejected": 0.9919463396072388, "logps/chosen": -249.7964630126953, "logps/rejected": -235.2704315185547, "loss": 0.0336, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.029250269755721092, "rewards/margins": 0.05691809579730034, "rewards/rejected": -0.08616836369037628, "step": 760 }, { "epoch": 0.2, "learning_rate": 4.845300384669958e-06, "logits/chosen": 0.9550157785415649, "logits/rejected": 0.9655323028564453, "logps/chosen": -266.6517028808594, "logps/rejected": -224.41366577148438, "loss": 0.0503, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04020417481660843, "rewards/margins": 0.03571712225675583, "rewards/rejected": -0.07592129707336426, "step": 770 }, { "epoch": 0.2, "learning_rate": 4.837291258468701e-06, "logits/chosen": 0.8910456895828247, "logits/rejected": 0.9127016067504883, "logps/chosen": -280.16632080078125, "logps/rejected": -249.5512237548828, "loss": 0.0404, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.04826400801539421, "rewards/margins": 0.05860195308923721, "rewards/rejected": -0.10686596482992172, "step": 780 }, { "epoch": 0.21, "learning_rate": 4.829086969119984e-06, "logits/chosen": 0.8841217756271362, "logits/rejected": 0.9553950428962708, "logps/chosen": -249.34146118164062, "logps/rejected": -241.98623657226562, "loss": 0.0306, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.04528197646141052, "rewards/margins": 0.04499911516904831, "rewards/rejected": -0.09028108417987823, "step": 790 }, { "epoch": 0.21, "learning_rate": 4.820688201679605e-06, "logits/chosen": 0.9039901494979858, "logits/rejected": 0.9560089111328125, "logps/chosen": -262.4331359863281, "logps/rejected": -234.8318328857422, "loss": 0.0402, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.04591182619333267, "rewards/margins": 0.0473395399749279, "rewards/rejected": -0.09325136244297028, "step": 800 }, { "epoch": 0.21, "eval_logits/chosen": 0.9000641107559204, "eval_logits/rejected": 0.9937340021133423, "eval_logps/chosen": -282.6900939941406, "eval_logps/rejected": -253.87200927734375, "eval_loss": 0.03816115856170654, "eval_rewards/accuracies": 0.621999979019165, "eval_rewards/chosen": -0.05143279209733009, "eval_rewards/margins": 0.049128152430057526, "eval_rewards/rejected": -0.10056094080209732, "eval_runtime": 538.996, "eval_samples_per_second": 3.711, "eval_steps_per_second": 0.928, "step": 800 }, { "epoch": 0.21, "learning_rate": 4.8120956574422315e-06, "logits/chosen": 0.985218346118927, "logits/rejected": 0.9626695513725281, "logps/chosen": -285.3841247558594, "logps/rejected": -283.31024169921875, "loss": 0.0464, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.05067628622055054, "rewards/margins": 0.054511237889528275, "rewards/rejected": -0.10518752038478851, "step": 810 }, { "epoch": 0.21, "learning_rate": 4.803310053882831e-06, "logits/chosen": 0.916561484336853, "logits/rejected": 0.9501992464065552, "logps/chosen": -215.0513458251953, "logps/rejected": -205.7407989501953, "loss": 0.0444, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05079100281000137, "rewards/margins": 0.042504359036684036, "rewards/rejected": -0.09329536557197571, "step": 820 }, { "epoch": 0.22, "learning_rate": 4.794332124596775e-06, "logits/chosen": 0.9133389592170715, "logits/rejected": 0.9860326647758484, "logps/chosen": -280.44476318359375, "logps/rejected": -256.5655212402344, "loss": 0.0377, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.06920581310987473, "rewards/margins": 0.037345677614212036, "rewards/rejected": -0.10655149072408676, "step": 830 }, { "epoch": 0.22, "learning_rate": 4.785162619238575e-06, "logits/chosen": 0.9965925216674805, "logits/rejected": 1.0270875692367554, "logps/chosen": -269.73480224609375, "logps/rejected": -253.419677734375, "loss": 0.0389, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0890035331249237, "rewards/margins": 0.05010632425546646, "rewards/rejected": -0.13910984992980957, "step": 840 }, { "epoch": 0.22, "learning_rate": 4.775802303459288e-06, "logits/chosen": 0.9339388012886047, "logits/rejected": 1.0584567785263062, "logps/chosen": -296.1967468261719, "logps/rejected": -267.1576232910156, "loss": 0.0395, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09242481738328934, "rewards/margins": 0.05164768174290657, "rewards/rejected": -0.1440725028514862, "step": 850 }, { "epoch": 0.23, "learning_rate": 4.766251958842589e-06, "logits/chosen": 0.9728446006774902, "logits/rejected": 1.0179331302642822, "logps/chosen": -308.83551025390625, "logps/rejected": -272.84796142578125, "loss": 0.0325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08283834159374237, "rewards/margins": 0.047348491847515106, "rewards/rejected": -0.13018682599067688, "step": 860 }, { "epoch": 0.23, "learning_rate": 4.7565123828395066e-06, "logits/chosen": 0.9140795469284058, "logits/rejected": 1.0925973653793335, "logps/chosen": -319.13250732421875, "logps/rejected": -295.2183837890625, "loss": 0.0217, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.05658285692334175, "rewards/margins": 0.04898856207728386, "rewards/rejected": -0.10557142645120621, "step": 870 }, { "epoch": 0.23, "learning_rate": 4.746584388701831e-06, "logits/chosen": 1.016638994216919, "logits/rejected": 1.144810438156128, "logps/chosen": -261.43853759765625, "logps/rejected": -228.3360595703125, "loss": 0.0479, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05251890420913696, "rewards/margins": 0.07163821160793304, "rewards/rejected": -0.12415711581707001, "step": 880 }, { "epoch": 0.23, "learning_rate": 4.736468805414218e-06, "logits/chosen": 1.0147383213043213, "logits/rejected": 1.1735047101974487, "logps/chosen": -297.06878662109375, "logps/rejected": -270.4248046875, "loss": 0.0401, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.045632537454366684, "rewards/margins": 0.08101598918437958, "rewards/rejected": -0.12664853036403656, "step": 890 }, { "epoch": 0.24, "learning_rate": 4.7261664776249595e-06, "logits/chosen": 1.0279176235198975, "logits/rejected": 1.0230156183242798, "logps/chosen": -273.8691101074219, "logps/rejected": -256.41912841796875, "loss": 0.0381, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.053291238844394684, "rewards/margins": 0.059307873249053955, "rewards/rejected": -0.11259911209344864, "step": 900 }, { "epoch": 0.24, "eval_logits/chosen": 0.9533628225326538, "eval_logits/rejected": 1.0464704036712646, "eval_logps/chosen": -283.0850830078125, "eval_logps/rejected": -254.8046875, "eval_loss": 0.03756963834166527, "eval_rewards/accuracies": 0.6315000057220459, "eval_rewards/chosen": -0.05538267269730568, "eval_rewards/margins": 0.054504893720149994, "eval_rewards/rejected": -0.10988757014274597, "eval_runtime": 538.9934, "eval_samples_per_second": 3.711, "eval_steps_per_second": 0.928, "step": 900 }, { "epoch": 0.24, "learning_rate": 4.715678265575463e-06, "logits/chosen": 1.033189058303833, "logits/rejected": 1.1023738384246826, "logps/chosen": -313.0371398925781, "logps/rejected": -296.1219482421875, "loss": 0.0362, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.05556187033653259, "rewards/margins": 0.036861807107925415, "rewards/rejected": -0.09242367744445801, "step": 910 }, { "epoch": 0.24, "learning_rate": 4.705005045028415e-06, "logits/chosen": 1.0147944688796997, "logits/rejected": 1.0735704898834229, "logps/chosen": -336.9757995605469, "logps/rejected": -290.46820068359375, "loss": 0.0291, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.05297808721661568, "rewards/margins": 0.06880663335323334, "rewards/rejected": -0.12178472429513931, "step": 920 }, { "epoch": 0.24, "learning_rate": 4.694147707194659e-06, "logits/chosen": 1.0121935606002808, "logits/rejected": 1.0971285104751587, "logps/chosen": -291.9516296386719, "logps/rejected": -246.9907684326172, "loss": 0.0394, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0664924904704094, "rewards/margins": 0.05605294555425644, "rewards/rejected": -0.12254543602466583, "step": 930 }, { "epoch": 0.25, "learning_rate": 4.683107158658782e-06, "logits/chosen": 0.976874053478241, "logits/rejected": 0.9745148420333862, "logps/chosen": -267.6925354003906, "logps/rejected": -239.68063354492188, "loss": 0.0418, "rewards/accuracies": 0.625, "rewards/chosen": -0.06668306887149811, "rewards/margins": 0.06082264333963394, "rewards/rejected": -0.12750570476055145, "step": 940 }, { "epoch": 0.25, "learning_rate": 4.671884321303407e-06, "logits/chosen": 0.9618045687675476, "logits/rejected": 0.9925098419189453, "logps/chosen": -293.56109619140625, "logps/rejected": -280.6075439453125, "loss": 0.036, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.08693472295999527, "rewards/margins": 0.055363357067108154, "rewards/rejected": -0.14229807257652283, "step": 950 }, { "epoch": 0.25, "learning_rate": 4.660480132232224e-06, "logits/chosen": 0.9112693071365356, "logits/rejected": 0.9766784906387329, "logps/chosen": -284.80621337890625, "logps/rejected": -257.4582824707031, "loss": 0.0321, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08965489268302917, "rewards/margins": 0.05001254007220268, "rewards/rejected": -0.13966743648052216, "step": 960 }, { "epoch": 0.25, "learning_rate": 4.6488955436917414e-06, "logits/chosen": 0.9918138384819031, "logits/rejected": 1.0566661357879639, "logps/chosen": -285.3071594238281, "logps/rejected": -222.27645874023438, "loss": 0.0308, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07797913253307343, "rewards/margins": 0.035942643880844116, "rewards/rejected": -0.11392178386449814, "step": 970 }, { "epoch": 0.26, "learning_rate": 4.6371315229917644e-06, "logits/chosen": 0.9261223077774048, "logits/rejected": 0.991136908531189, "logps/chosen": -268.17742919921875, "logps/rejected": -251.2354278564453, "loss": 0.0314, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.055728018283843994, "rewards/margins": 0.05425562709569931, "rewards/rejected": -0.1099836453795433, "step": 980 }, { "epoch": 0.26, "learning_rate": 4.625189052424638e-06, "logits/chosen": 0.9730453491210938, "logits/rejected": 1.032597303390503, "logps/chosen": -249.26864624023438, "logps/rejected": -216.4525909423828, "loss": 0.0417, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.04507770389318466, "rewards/margins": 0.03995997831225395, "rewards/rejected": -0.08503767102956772, "step": 990 }, { "epoch": 0.26, "learning_rate": 4.613069129183218e-06, "logits/chosen": 0.9484704732894897, "logits/rejected": 0.9633451700210571, "logps/chosen": -264.92633056640625, "logps/rejected": -257.33941650390625, "loss": 0.0421, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.04708124324679375, "rewards/margins": 0.04397277534008026, "rewards/rejected": -0.09105401486158371, "step": 1000 }, { "epoch": 0.26, "eval_logits/chosen": 0.9447739124298096, "eval_logits/rejected": 1.0398797988891602, "eval_logps/chosen": -281.62677001953125, "eval_logps/rejected": -253.11135864257812, "eval_loss": 0.0373673252761364, "eval_rewards/accuracies": 0.6269999742507935, "eval_rewards/chosen": -0.040799498558044434, "eval_rewards/margins": 0.05215470865368843, "eval_rewards/rejected": -0.09295421838760376, "eval_runtime": 539.0882, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.927, "step": 1000 }, { "epoch": 0.26, "learning_rate": 4.600772765277607e-06, "logits/chosen": 0.9384505152702332, "logits/rejected": 1.036522388458252, "logps/chosen": -300.46435546875, "logps/rejected": -254.4575958251953, "loss": 0.0411, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.04996770992875099, "rewards/margins": 0.05875014141201973, "rewards/rejected": -0.10871784389019012, "step": 1010 }, { "epoch": 0.27, "learning_rate": 4.588300987450652e-06, "logits/chosen": 1.0055780410766602, "logits/rejected": 1.0831704139709473, "logps/chosen": -283.211181640625, "logps/rejected": -268.2267150878906, "loss": 0.0358, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05758129805326462, "rewards/margins": 0.062214724719524384, "rewards/rejected": -0.1197960153222084, "step": 1020 }, { "epoch": 0.27, "learning_rate": 4.5756548370922136e-06, "logits/chosen": 0.991938591003418, "logits/rejected": 1.0386155843734741, "logps/chosen": -283.02081298828125, "logps/rejected": -256.51434326171875, "loss": 0.0317, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.052321650087833405, "rewards/margins": 0.03847939521074295, "rewards/rejected": -0.09080104529857635, "step": 1030 }, { "epoch": 0.27, "learning_rate": 4.562835370152206e-06, "logits/chosen": 0.9276207089424133, "logits/rejected": 1.045037865638733, "logps/chosen": -254.05892944335938, "logps/rejected": -240.1755828857422, "loss": 0.0383, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.04577355459332466, "rewards/margins": 0.036837171763181686, "rewards/rejected": -0.08261072635650635, "step": 1040 }, { "epoch": 0.27, "learning_rate": 4.54984365705243e-06, "logits/chosen": 1.0454961061477661, "logits/rejected": 1.0517162084579468, "logps/chosen": -255.4774627685547, "logps/rejected": -232.04190063476562, "loss": 0.0469, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.03626035898923874, "rewards/margins": 0.043095506727695465, "rewards/rejected": -0.07935585826635361, "step": 1050 }, { "epoch": 0.28, "learning_rate": 4.536680782597191e-06, "logits/chosen": 0.9645137786865234, "logits/rejected": 1.0626866817474365, "logps/chosen": -298.6683349609375, "logps/rejected": -269.98724365234375, "loss": 0.0301, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02404281124472618, "rewards/margins": 0.05357781797647476, "rewards/rejected": -0.07762061804533005, "step": 1060 }, { "epoch": 0.28, "learning_rate": 4.523347845882718e-06, "logits/chosen": 1.0478742122650146, "logits/rejected": 1.112269639968872, "logps/chosen": -258.76458740234375, "logps/rejected": -208.38107299804688, "loss": 0.0384, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03418079391121864, "rewards/margins": 0.05562075227499008, "rewards/rejected": -0.08980154246091843, "step": 1070 }, { "epoch": 0.28, "learning_rate": 4.50984596020539e-06, "logits/chosen": 0.9761837720870972, "logits/rejected": 1.0493123531341553, "logps/chosen": -300.64105224609375, "logps/rejected": -265.405517578125, "loss": 0.0404, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03310775384306908, "rewards/margins": 0.04876155033707619, "rewards/rejected": -0.08186930418014526, "step": 1080 }, { "epoch": 0.29, "learning_rate": 4.4961762529687745e-06, "logits/chosen": 0.9403045773506165, "logits/rejected": 1.0544614791870117, "logps/chosen": -305.84027099609375, "logps/rejected": -234.25341796875, "loss": 0.0407, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.04744723066687584, "rewards/margins": 0.04316466301679611, "rewards/rejected": -0.09061190485954285, "step": 1090 }, { "epoch": 0.29, "learning_rate": 4.482339865589492e-06, "logits/chosen": 0.8826369047164917, "logits/rejected": 1.0571672916412354, "logps/chosen": -266.90252685546875, "logps/rejected": -235.89877319335938, "loss": 0.0393, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05000295490026474, "rewards/margins": 0.058589059859514236, "rewards/rejected": -0.10859201848506927, "step": 1100 }, { "epoch": 0.29, "eval_logits/chosen": 0.9608851075172424, "eval_logits/rejected": 1.0557035207748413, "eval_logps/chosen": -283.3030700683594, "eval_logps/rejected": -254.34910583496094, "eval_loss": 0.03702974691987038, "eval_rewards/accuracies": 0.6284999847412109, "eval_rewards/chosen": -0.05756256729364395, "eval_rewards/margins": 0.04776925593614578, "eval_rewards/rejected": -0.10533181577920914, "eval_runtime": 539.1778, "eval_samples_per_second": 3.709, "eval_steps_per_second": 0.927, "step": 1100 }, { "epoch": 0.29, "learning_rate": 4.468337953401909e-06, "logits/chosen": 1.0197701454162598, "logits/rejected": 1.0784578323364258, "logps/chosen": -299.9832458496094, "logps/rejected": -247.8249969482422, "loss": 0.039, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04413250833749771, "rewards/margins": 0.051008790731430054, "rewards/rejected": -0.09514130651950836, "step": 1110 }, { "epoch": 0.29, "learning_rate": 4.45417168556166e-06, "logits/chosen": 0.9736183285713196, "logits/rejected": 1.0494579076766968, "logps/chosen": -261.1857604980469, "logps/rejected": -262.29241943359375, "loss": 0.0333, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.03721706196665764, "rewards/margins": 0.04604244977235794, "rewards/rejected": -0.08325951546430588, "step": 1120 }, { "epoch": 0.3, "learning_rate": 4.439842244948036e-06, "logits/chosen": 0.9458913803100586, "logits/rejected": 1.0195437669754028, "logps/chosen": -261.69512939453125, "logps/rejected": -244.91513061523438, "loss": 0.0403, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03936644643545151, "rewards/margins": 0.04326556995511055, "rewards/rejected": -0.08263202011585236, "step": 1130 }, { "epoch": 0.3, "learning_rate": 4.425350828065204e-06, "logits/chosen": 1.0116994380950928, "logits/rejected": 1.0178403854370117, "logps/chosen": -282.7467956542969, "logps/rejected": -266.43963623046875, "loss": 0.0363, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03308098763227463, "rewards/margins": 0.048868577927351, "rewards/rejected": -0.08194957673549652, "step": 1140 }, { "epoch": 0.3, "learning_rate": 4.410698644942303e-06, "logits/chosen": 1.0054022073745728, "logits/rejected": 1.0613911151885986, "logps/chosen": -281.3987731933594, "logps/rejected": -285.315673828125, "loss": 0.0349, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.031625766307115555, "rewards/margins": 0.03699468821287155, "rewards/rejected": -0.0686204582452774, "step": 1150 }, { "epoch": 0.3, "learning_rate": 4.395886919032406e-06, "logits/chosen": 1.0045572519302368, "logits/rejected": 1.0691404342651367, "logps/chosen": -274.28741455078125, "logps/rejected": -232.10574340820312, "loss": 0.0388, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03036217950284481, "rewards/margins": 0.04830170422792435, "rewards/rejected": -0.0786639004945755, "step": 1160 }, { "epoch": 0.31, "learning_rate": 4.380916887110366e-06, "logits/chosen": 0.9450492858886719, "logits/rejected": 1.0655186176300049, "logps/chosen": -251.56259155273438, "logps/rejected": -223.6671600341797, "loss": 0.0337, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03250129520893097, "rewards/margins": 0.06061319261789322, "rewards/rejected": -0.09311448037624359, "step": 1170 }, { "epoch": 0.31, "learning_rate": 4.365789799169539e-06, "logits/chosen": 1.004997968673706, "logits/rejected": 0.9879255294799805, "logps/chosen": -313.8050842285156, "logps/rejected": -263.55718994140625, "loss": 0.0399, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0565299317240715, "rewards/margins": 0.057772088795900345, "rewards/rejected": -0.11430201679468155, "step": 1180 }, { "epoch": 0.31, "learning_rate": 4.350506918317416e-06, "logits/chosen": 0.9306057095527649, "logits/rejected": 0.8915265798568726, "logps/chosen": -307.5498962402344, "logps/rejected": -276.47088623046875, "loss": 0.0349, "rewards/accuracies": 0.53125, "rewards/chosen": -0.047587279230356216, "rewards/margins": 0.03898516297340393, "rewards/rejected": -0.08657244592905045, "step": 1190 }, { "epoch": 0.31, "learning_rate": 4.335069520670149e-06, "logits/chosen": 0.9720792770385742, "logits/rejected": 1.0783460140228271, "logps/chosen": -281.57232666015625, "logps/rejected": -217.4803009033203, "loss": 0.0533, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.052179861813783646, "rewards/margins": 0.06668353080749512, "rewards/rejected": -0.11886338889598846, "step": 1200 }, { "epoch": 0.31, "eval_logits/chosen": 0.9417441487312317, "eval_logits/rejected": 1.0367752313613892, "eval_logps/chosen": -283.6021728515625, "eval_logps/rejected": -255.3543701171875, "eval_loss": 0.0369240865111351, "eval_rewards/accuracies": 0.6209999918937683, "eval_rewards/chosen": -0.0605538934469223, "eval_rewards/margins": 0.05483054369688034, "eval_rewards/rejected": -0.11538443714380264, "eval_runtime": 538.9866, "eval_samples_per_second": 3.711, "eval_steps_per_second": 0.928, "step": 1200 }, { "epoch": 0.32, "learning_rate": 4.319478895246e-06, "logits/chosen": 0.9820619821548462, "logits/rejected": 0.9384095072746277, "logps/chosen": -279.18499755859375, "logps/rejected": -242.414794921875, "loss": 0.0357, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.05051257088780403, "rewards/margins": 0.04647805169224739, "rewards/rejected": -0.09699061512947083, "step": 1210 }, { "epoch": 0.32, "learning_rate": 4.303736343857704e-06, "logits/chosen": 1.0831791162490845, "logits/rejected": 1.034220814704895, "logps/chosen": -269.3099365234375, "logps/rejected": -252.9770965576172, "loss": 0.0464, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.052128423005342484, "rewards/margins": 0.06647459417581558, "rewards/rejected": -0.11860301345586777, "step": 1220 }, { "epoch": 0.32, "learning_rate": 4.287843181003772e-06, "logits/chosen": 0.9971106648445129, "logits/rejected": 1.1152498722076416, "logps/chosen": -281.20208740234375, "logps/rejected": -239.54006958007812, "loss": 0.0394, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03529990836977959, "rewards/margins": 0.0698903501033783, "rewards/rejected": -0.10519025474786758, "step": 1230 }, { "epoch": 0.32, "learning_rate": 4.27180073375873e-06, "logits/chosen": 0.9784267544746399, "logits/rejected": 1.0021319389343262, "logps/chosen": -281.3310241699219, "logps/rejected": -254.4761962890625, "loss": 0.038, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03572789579629898, "rewards/margins": 0.06249629333615303, "rewards/rejected": -0.09822418540716171, "step": 1240 }, { "epoch": 0.33, "learning_rate": 4.255610341662304e-06, "logits/chosen": 0.9751097559928894, "logits/rejected": 1.0435435771942139, "logps/chosen": -306.3908386230469, "logps/rejected": -268.491455078125, "loss": 0.0425, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01923917606472969, "rewards/margins": 0.06259562820196152, "rewards/rejected": -0.0818348079919815, "step": 1250 }, { "epoch": 0.33, "learning_rate": 4.2392733566075764e-06, "logits/chosen": 0.9983538389205933, "logits/rejected": 1.0708853006362915, "logps/chosen": -243.330322265625, "logps/rejected": -224.99075317382812, "loss": 0.0371, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0249100960791111, "rewards/margins": 0.06250262260437012, "rewards/rejected": -0.08741272240877151, "step": 1260 }, { "epoch": 0.33, "learning_rate": 4.2227911427280975e-06, "logits/chosen": 0.9960931539535522, "logits/rejected": 1.0461426973342896, "logps/chosen": -286.45440673828125, "logps/rejected": -271.23980712890625, "loss": 0.0393, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.02666524052619934, "rewards/margins": 0.048846714198589325, "rewards/rejected": -0.07551195472478867, "step": 1270 }, { "epoch": 0.33, "learning_rate": 4.206165076283983e-06, "logits/chosen": 0.9975628852844238, "logits/rejected": 1.0400108098983765, "logps/chosen": -238.9987030029297, "logps/rejected": -228.86312866210938, "loss": 0.0446, "rewards/accuracies": 0.625, "rewards/chosen": -0.023251879960298538, "rewards/margins": 0.04423128813505173, "rewards/rejected": -0.06748317182064056, "step": 1280 }, { "epoch": 0.34, "learning_rate": 4.189396545546995e-06, "logits/chosen": 1.0215797424316406, "logits/rejected": 1.1224019527435303, "logps/chosen": -289.75323486328125, "logps/rejected": -280.547607421875, "loss": 0.034, "rewards/accuracies": 0.59375, "rewards/chosen": -0.019200313836336136, "rewards/margins": 0.03457511216402054, "rewards/rejected": -0.053775422275066376, "step": 1290 }, { "epoch": 0.34, "learning_rate": 4.172486950684627e-06, "logits/chosen": 1.0255308151245117, "logits/rejected": 1.0572230815887451, "logps/chosen": -279.2245178222656, "logps/rejected": -261.9948425292969, "loss": 0.0392, "rewards/accuracies": 0.625, "rewards/chosen": -0.02723405882716179, "rewards/margins": 0.047797515988349915, "rewards/rejected": -0.075031578540802, "step": 1300 }, { "epoch": 0.34, "eval_logits/chosen": 0.9659793972969055, "eval_logits/rejected": 1.0633821487426758, "eval_logps/chosen": -279.6128845214844, "eval_logps/rejected": -250.95762634277344, "eval_loss": 0.0366741381585598, "eval_rewards/accuracies": 0.6119999885559082, "eval_rewards/chosen": -0.020660726353526115, "eval_rewards/margins": 0.05075635015964508, "eval_rewards/rejected": -0.07141707837581635, "eval_runtime": 539.1224, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.927, "step": 1300 }, { "epoch": 0.34, "learning_rate": 4.155437703643182e-06, "logits/chosen": 0.9241663217544556, "logits/rejected": 0.9624761343002319, "logps/chosen": -284.2118225097656, "logps/rejected": -250.8708953857422, "loss": 0.0311, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.01345390360802412, "rewards/margins": 0.06398328393697739, "rewards/rejected": -0.07743719965219498, "step": 1310 }, { "epoch": 0.35, "learning_rate": 4.138250228029882e-06, "logits/chosen": 1.0098600387573242, "logits/rejected": 1.132021188735962, "logps/chosen": -233.54672241210938, "logps/rejected": -216.55630493164062, "loss": 0.0418, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.014631894417107105, "rewards/margins": 0.07016023248434067, "rewards/rejected": -0.0847921296954155, "step": 1320 }, { "epoch": 0.35, "learning_rate": 4.120925958993994e-06, "logits/chosen": 0.9781894683837891, "logits/rejected": 1.0409111976623535, "logps/chosen": -295.82403564453125, "logps/rejected": -249.65530395507812, "loss": 0.0414, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.026302725076675415, "rewards/margins": 0.0485377199947834, "rewards/rejected": -0.07484044134616852, "step": 1330 }, { "epoch": 0.35, "learning_rate": 4.103466343106999e-06, "logits/chosen": 0.9926727414131165, "logits/rejected": 1.0091572999954224, "logps/chosen": -274.465576171875, "logps/rejected": -243.0918731689453, "loss": 0.0381, "rewards/accuracies": 0.53125, "rewards/chosen": -0.019631439819931984, "rewards/margins": 0.047348715364933014, "rewards/rejected": -0.06698014587163925, "step": 1340 }, { "epoch": 0.35, "learning_rate": 4.085872838241797e-06, "logits/chosen": 1.0105888843536377, "logits/rejected": 1.042271614074707, "logps/chosen": -286.24774169921875, "logps/rejected": -284.87847900390625, "loss": 0.0349, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.010948913171887398, "rewards/margins": 0.058446235954761505, "rewards/rejected": -0.06939514726400375, "step": 1350 }, { "epoch": 0.36, "learning_rate": 4.06814691345098e-06, "logits/chosen": 0.9800373911857605, "logits/rejected": 1.1013528108596802, "logps/chosen": -303.1207275390625, "logps/rejected": -257.15679931640625, "loss": 0.0439, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.03485560044646263, "rewards/margins": 0.04474693164229393, "rewards/rejected": -0.07960253953933716, "step": 1360 }, { "epoch": 0.36, "learning_rate": 4.050290048844171e-06, "logits/chosen": 1.0052525997161865, "logits/rejected": 1.061704158782959, "logps/chosen": -256.0435791015625, "logps/rejected": -225.7267303466797, "loss": 0.0437, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.014286425895988941, "rewards/margins": 0.03906578570604324, "rewards/rejected": -0.05335221439599991, "step": 1370 }, { "epoch": 0.36, "learning_rate": 4.032303735464422e-06, "logits/chosen": 0.9533084034919739, "logits/rejected": 1.0529754161834717, "logps/chosen": -289.2705078125, "logps/rejected": -257.12225341796875, "loss": 0.0421, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.001376188127323985, "rewards/margins": 0.05364586040377617, "rewards/rejected": -0.05502205342054367, "step": 1380 }, { "epoch": 0.36, "learning_rate": 4.014189475163727e-06, "logits/chosen": 0.9958294630050659, "logits/rejected": 1.0700651407241821, "logps/chosen": -266.70538330078125, "logps/rejected": -240.39126586914062, "loss": 0.0486, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.004494071938097477, "rewards/margins": 0.0475679449737072, "rewards/rejected": -0.0520620159804821, "step": 1390 }, { "epoch": 0.37, "learning_rate": 3.995948780477605e-06, "logits/chosen": 1.0267126560211182, "logits/rejected": 1.0607044696807861, "logps/chosen": -287.0250549316406, "logps/rejected": -246.72103881835938, "loss": 0.0432, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.006947031710296869, "rewards/margins": 0.038351211696863174, "rewards/rejected": -0.04529824107885361, "step": 1400 }, { "epoch": 0.37, "eval_logits/chosen": 0.9482428431510925, "eval_logits/rejected": 1.0463390350341797, "eval_logps/chosen": -279.0111999511719, "eval_logps/rejected": -250.108154296875, "eval_loss": 0.036706726998090744, "eval_rewards/accuracies": 0.6259999871253967, "eval_rewards/chosen": -0.014643603935837746, "eval_rewards/margins": 0.0482785664498806, "eval_rewards/rejected": -0.0629221647977829, "eval_runtime": 539.1963, "eval_samples_per_second": 3.709, "eval_steps_per_second": 0.927, "step": 1400 }, { "epoch": 0.37, "learning_rate": 3.977583174498816e-06, "logits/chosen": 0.9617465138435364, "logits/rejected": 1.0087401866912842, "logps/chosen": -274.5213317871094, "logps/rejected": -242.7264404296875, "loss": 0.0398, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.014896227046847343, "rewards/margins": 0.038757093250751495, "rewards/rejected": -0.05365331843495369, "step": 1410 }, { "epoch": 0.37, "learning_rate": 3.959094190750172e-06, "logits/chosen": 0.9357368350028992, "logits/rejected": 1.0426499843597412, "logps/chosen": -288.6091003417969, "logps/rejected": -259.63323974609375, "loss": 0.0462, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.030341049656271935, "rewards/margins": 0.037225984036922455, "rewards/rejected": -0.06756703555583954, "step": 1420 }, { "epoch": 0.37, "learning_rate": 3.9404833730564975e-06, "logits/chosen": 1.026078224182129, "logits/rejected": 1.0588185787200928, "logps/chosen": -250.0701141357422, "logps/rejected": -250.2508087158203, "loss": 0.0495, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03756723925471306, "rewards/margins": 0.042705655097961426, "rewards/rejected": -0.08027289807796478, "step": 1430 }, { "epoch": 0.38, "learning_rate": 3.921752275415712e-06, "logits/chosen": 1.0129649639129639, "logits/rejected": 1.0660035610198975, "logps/chosen": -242.80191040039062, "logps/rejected": -215.3003387451172, "loss": 0.0364, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.041556812822818756, "rewards/margins": 0.0497397780418396, "rewards/rejected": -0.09129659831523895, "step": 1440 }, { "epoch": 0.38, "learning_rate": 3.902902461869079e-06, "logits/chosen": 0.9659714698791504, "logits/rejected": 1.106687307357788, "logps/chosen": -273.6470642089844, "logps/rejected": -246.69326782226562, "loss": 0.0399, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.050172436982393265, "rewards/margins": 0.060846518725156784, "rewards/rejected": -0.11101895570755005, "step": 1450 }, { "epoch": 0.38, "learning_rate": 3.883935506370605e-06, "logits/chosen": 0.941813588142395, "logits/rejected": 1.0076746940612793, "logps/chosen": -282.126708984375, "logps/rejected": -244.9275665283203, "loss": 0.0352, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.05347307771444321, "rewards/margins": 0.053630221635103226, "rewards/rejected": -0.10710330307483673, "step": 1460 }, { "epoch": 0.38, "learning_rate": 3.864852992655617e-06, "logits/chosen": 0.9045804142951965, "logits/rejected": 1.0152260065078735, "logps/chosen": -270.4555969238281, "logps/rejected": -260.31390380859375, "loss": 0.0396, "rewards/accuracies": 0.59375, "rewards/chosen": -0.045248690992593765, "rewards/margins": 0.0546412356197834, "rewards/rejected": -0.09988992661237717, "step": 1470 }, { "epoch": 0.39, "learning_rate": 3.845656514108516e-06, "logits/chosen": 0.9225580096244812, "logits/rejected": 0.9809015989303589, "logps/chosen": -297.4796142578125, "logps/rejected": -250.21530151367188, "loss": 0.0295, "rewards/accuracies": 0.65625, "rewards/chosen": -0.041134659200906754, "rewards/margins": 0.06729653477668762, "rewards/rejected": -0.10843118280172348, "step": 1480 }, { "epoch": 0.39, "learning_rate": 3.826347673629738e-06, "logits/chosen": 0.9812415838241577, "logits/rejected": 1.0070959329605103, "logps/chosen": -247.45516967773438, "logps/rejected": -231.81179809570312, "loss": 0.0372, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.04904834181070328, "rewards/margins": 0.05793965980410576, "rewards/rejected": -0.10698799788951874, "step": 1490 }, { "epoch": 0.39, "learning_rate": 3.8069280835019062e-06, "logits/chosen": 0.9496526718139648, "logits/rejected": 1.0348269939422607, "logps/chosen": -325.68267822265625, "logps/rejected": -257.0068054199219, "loss": 0.0304, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.035646189004182816, "rewards/margins": 0.04558128863573074, "rewards/rejected": -0.08122747391462326, "step": 1500 }, { "epoch": 0.39, "eval_logits/chosen": 0.949572741985321, "eval_logits/rejected": 1.0471240282058716, "eval_logps/chosen": -282.7773132324219, "eval_logps/rejected": -254.43394470214844, "eval_loss": 0.03586630895733833, "eval_rewards/accuracies": 0.6359999775886536, "eval_rewards/chosen": -0.052304789423942566, "eval_rewards/margins": 0.05387549474835396, "eval_rewards/rejected": -0.10618028789758682, "eval_runtime": 539.1493, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.927, "step": 1500 }, { "epoch": 0.4, "learning_rate": 3.7873993652552077e-06, "logits/chosen": 0.9462829828262329, "logits/rejected": 1.0417277812957764, "logps/chosen": -263.7369689941406, "logps/rejected": -241.88095092773438, "loss": 0.036, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.059374719858169556, "rewards/margins": 0.021540379151701927, "rewards/rejected": -0.08091510832309723, "step": 1510 }, { "epoch": 0.4, "learning_rate": 3.7677631495319953e-06, "logits/chosen": 0.9473394155502319, "logits/rejected": 1.016614556312561, "logps/chosen": -263.25238037109375, "logps/rejected": -217.35693359375, "loss": 0.0371, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.04990892857313156, "rewards/margins": 0.03492476046085358, "rewards/rejected": -0.08483369648456573, "step": 1520 }, { "epoch": 0.4, "learning_rate": 3.748021075950633e-06, "logits/chosen": 1.0440593957901, "logits/rejected": 1.0327621698379517, "logps/chosen": -307.784423828125, "logps/rejected": -280.920654296875, "loss": 0.0398, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.05705567076802254, "rewards/margins": 0.05376668646931648, "rewards/rejected": -0.11082235723733902, "step": 1530 }, { "epoch": 0.4, "learning_rate": 3.7281747929685824e-06, "logits/chosen": 0.9668010473251343, "logits/rejected": 1.1068073511123657, "logps/chosen": -292.89801025390625, "logps/rejected": -257.0223388671875, "loss": 0.0377, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.04842069000005722, "rewards/margins": 0.043641868978738785, "rewards/rejected": -0.0920625552535057, "step": 1540 }, { "epoch": 0.41, "learning_rate": 3.7082259577447604e-06, "logits/chosen": 0.9432889223098755, "logits/rejected": 1.05801522731781, "logps/chosen": -324.9328308105469, "logps/rejected": -255.90939331054688, "loss": 0.0319, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.05441862344741821, "rewards/margins": 0.053149156272411346, "rewards/rejected": -0.10756777226924896, "step": 1550 }, { "epoch": 0.41, "learning_rate": 3.6881762360011688e-06, "logits/chosen": 1.0232698917388916, "logits/rejected": 1.055396318435669, "logps/chosen": -291.96514892578125, "logps/rejected": -249.9015350341797, "loss": 0.0428, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06017423793673515, "rewards/margins": 0.05147537589073181, "rewards/rejected": -0.11164961010217667, "step": 1560 }, { "epoch": 0.41, "learning_rate": 3.668027301883802e-06, "logits/chosen": 0.9909790754318237, "logits/rejected": 1.0502822399139404, "logps/chosen": -266.43011474609375, "logps/rejected": -224.57601928710938, "loss": 0.033, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.05004848912358284, "rewards/margins": 0.05305255576968193, "rewards/rejected": -0.10310103744268417, "step": 1570 }, { "epoch": 0.41, "learning_rate": 3.64778083782286e-06, "logits/chosen": 1.0207841396331787, "logits/rejected": 1.0920627117156982, "logps/chosen": -270.4779968261719, "logps/rejected": -278.3066101074219, "loss": 0.0354, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.038508545607328415, "rewards/margins": 0.05329999327659607, "rewards/rejected": -0.09180854260921478, "step": 1580 }, { "epoch": 0.42, "learning_rate": 3.627438534392268e-06, "logits/chosen": 0.9771720767021179, "logits/rejected": 1.0112661123275757, "logps/chosen": -264.5091552734375, "logps/rejected": -213.95535278320312, "loss": 0.0401, "rewards/accuracies": 0.59375, "rewards/chosen": -0.03601064160466194, "rewards/margins": 0.04099656641483307, "rewards/rejected": -0.07700721174478531, "step": 1590 }, { "epoch": 0.42, "learning_rate": 3.607002090168506e-06, "logits/chosen": 0.9807453155517578, "logits/rejected": 1.0391424894332886, "logps/chosen": -261.9506530761719, "logps/rejected": -238.798828125, "loss": 0.0436, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03824782371520996, "rewards/margins": 0.041073787957429886, "rewards/rejected": -0.07932160794734955, "step": 1600 }, { "epoch": 0.42, "eval_logits/chosen": 0.9584904909133911, "eval_logits/rejected": 1.0586249828338623, "eval_logps/chosen": -280.7698669433594, "eval_logps/rejected": -252.26162719726562, "eval_loss": 0.03589407727122307, "eval_rewards/accuracies": 0.6340000033378601, "eval_rewards/chosen": -0.03223072364926338, "eval_rewards/margins": 0.05222645774483681, "eval_rewards/rejected": -0.08445718139410019, "eval_runtime": 538.9153, "eval_samples_per_second": 3.711, "eval_steps_per_second": 0.928, "step": 1600 }, { "epoch": 0.42, "learning_rate": 3.586473211588787e-06, "logits/chosen": 0.9628156423568726, "logits/rejected": 1.0911314487457275, "logps/chosen": -301.49798583984375, "logps/rejected": -244.82583618164062, "loss": 0.0432, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.025343740358948708, "rewards/margins": 0.0616876594722271, "rewards/rejected": -0.08703140914440155, "step": 1610 }, { "epoch": 0.42, "learning_rate": 3.5658536128085623e-06, "logits/chosen": 0.9632102847099304, "logits/rejected": 1.0290127992630005, "logps/chosen": -252.96798706054688, "logps/rejected": -247.17739868164062, "loss": 0.0346, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.03504541888833046, "rewards/margins": 0.056658290326595306, "rewards/rejected": -0.09170371294021606, "step": 1620 }, { "epoch": 0.43, "learning_rate": 3.545145015558399e-06, "logits/chosen": 0.9802696108818054, "logits/rejected": 1.0333788394927979, "logps/chosen": -302.0663146972656, "logps/rejected": -291.22027587890625, "loss": 0.0495, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03271108120679855, "rewards/margins": 0.047079406678676605, "rewards/rejected": -0.07979048788547516, "step": 1630 }, { "epoch": 0.43, "learning_rate": 3.5243491490002056e-06, "logits/chosen": 0.9676446914672852, "logits/rejected": 1.0016578435897827, "logps/chosen": -277.4888000488281, "logps/rejected": -244.8700714111328, "loss": 0.0395, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.036362119019031525, "rewards/margins": 0.042628705501556396, "rewards/rejected": -0.07899081707000732, "step": 1640 }, { "epoch": 0.43, "learning_rate": 3.503467749582857e-06, "logits/chosen": 0.955204963684082, "logits/rejected": 0.9823210835456848, "logps/chosen": -251.63296508789062, "logps/rejected": -207.33932495117188, "loss": 0.0357, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0370803065598011, "rewards/margins": 0.05566862225532532, "rewards/rejected": -0.09274892508983612, "step": 1650 }, { "epoch": 0.43, "learning_rate": 3.4825025608971947e-06, "logits/chosen": 0.9128938913345337, "logits/rejected": 1.0501888990402222, "logps/chosen": -319.082275390625, "logps/rejected": -253.41708374023438, "loss": 0.0321, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.04241309314966202, "rewards/margins": 0.05771785229444504, "rewards/rejected": -0.10013093799352646, "step": 1660 }, { "epoch": 0.44, "learning_rate": 3.4614553335304407e-06, "logits/chosen": 0.9686363935470581, "logits/rejected": 1.0950806140899658, "logps/chosen": -281.142333984375, "logps/rejected": -243.0208740234375, "loss": 0.0393, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.05112028867006302, "rewards/margins": 0.06849656254053116, "rewards/rejected": -0.11961684376001358, "step": 1670 }, { "epoch": 0.44, "learning_rate": 3.4403278249200222e-06, "logits/chosen": 0.9172054529190063, "logits/rejected": 1.0074876546859741, "logps/chosen": -262.1912536621094, "logps/rejected": -244.34036254882812, "loss": 0.038, "rewards/accuracies": 0.5625, "rewards/chosen": -0.050746072083711624, "rewards/margins": 0.04991786926984787, "rewards/rejected": -0.10066394507884979, "step": 1680 }, { "epoch": 0.44, "learning_rate": 3.4191217992068293e-06, "logits/chosen": 0.9850749969482422, "logits/rejected": 0.9730724096298218, "logps/chosen": -262.35345458984375, "logps/rejected": -260.61895751953125, "loss": 0.0328, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.06715109199285507, "rewards/margins": 0.05269361659884453, "rewards/rejected": -0.1198447123169899, "step": 1690 }, { "epoch": 0.44, "learning_rate": 3.3978390270879056e-06, "logits/chosen": 0.9383825063705444, "logits/rejected": 1.040351152420044, "logps/chosen": -299.37066650390625, "logps/rejected": -256.8703918457031, "loss": 0.0405, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.047446489334106445, "rewards/margins": 0.05738651007413864, "rewards/rejected": -0.10483300685882568, "step": 1700 }, { "epoch": 0.44, "eval_logits/chosen": 0.9321679472923279, "eval_logits/rejected": 1.0312304496765137, "eval_logps/chosen": -282.8528747558594, "eval_logps/rejected": -254.86965942382812, "eval_loss": 0.03552675619721413, "eval_rewards/accuracies": 0.6334999799728394, "eval_rewards/chosen": -0.05306074023246765, "eval_rewards/margins": 0.05747658386826515, "eval_rewards/rejected": -0.1105373352766037, "eval_runtime": 539.0945, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.927, "step": 1700 }, { "epoch": 0.45, "learning_rate": 3.3764812856685995e-06, "logits/chosen": 0.9679194688796997, "logits/rejected": 0.9970897436141968, "logps/chosen": -294.0511169433594, "logps/rejected": -265.82025146484375, "loss": 0.0297, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.04057580232620239, "rewards/margins": 0.06438260525465012, "rewards/rejected": -0.1049584150314331, "step": 1710 }, { "epoch": 0.45, "learning_rate": 3.3550503583141726e-06, "logits/chosen": 0.9520149230957031, "logits/rejected": 0.9898689985275269, "logps/chosen": -327.1080627441406, "logps/rejected": -308.2015686035156, "loss": 0.0303, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.045171838253736496, "rewards/margins": 0.06369863450527191, "rewards/rejected": -0.1088704839348793, "step": 1720 }, { "epoch": 0.45, "learning_rate": 3.3335480345008907e-06, "logits/chosen": 0.9297927618026733, "logits/rejected": 1.0150766372680664, "logps/chosen": -266.30914306640625, "logps/rejected": -229.8489532470703, "loss": 0.0462, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.0414881631731987, "rewards/margins": 0.0664457231760025, "rewards/rejected": -0.1079338937997818, "step": 1730 }, { "epoch": 0.46, "learning_rate": 3.3119761096666055e-06, "logits/chosen": 1.0308209657669067, "logits/rejected": 1.0126601457595825, "logps/chosen": -317.2650451660156, "logps/rejected": -265.3246765136719, "loss": 0.04, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.06478316336870193, "rewards/margins": 0.04602901265025139, "rewards/rejected": -0.11081217229366302, "step": 1740 }, { "epoch": 0.46, "learning_rate": 3.290336385060832e-06, "logits/chosen": 0.9886214137077332, "logits/rejected": 0.9748364686965942, "logps/chosen": -289.01776123046875, "logps/rejected": -267.6888427734375, "loss": 0.0375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.05019726604223251, "rewards/margins": 0.06619967520236969, "rewards/rejected": -0.1163969412446022, "step": 1750 }, { "epoch": 0.46, "learning_rate": 3.268630667594348e-06, "logits/chosen": 0.9688504934310913, "logits/rejected": 1.0535883903503418, "logps/chosen": -284.36700439453125, "logps/rejected": -266.2789306640625, "loss": 0.033, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.05331147834658623, "rewards/margins": 0.05179852992296219, "rewards/rejected": -0.10510998964309692, "step": 1760 }, { "epoch": 0.46, "learning_rate": 3.2468607696883147e-06, "logits/chosen": 0.9664725065231323, "logits/rejected": 1.0324318408966064, "logps/chosen": -302.47113037109375, "logps/rejected": -258.18365478515625, "loss": 0.0377, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.054223883897066116, "rewards/margins": 0.06361141800880432, "rewards/rejected": -0.11783530563116074, "step": 1770 }, { "epoch": 0.47, "learning_rate": 3.225028509122944e-06, "logits/chosen": 0.9445089101791382, "logits/rejected": 1.0780378580093384, "logps/chosen": -306.8780212402344, "logps/rejected": -259.9053955078125, "loss": 0.0198, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02363925240933895, "rewards/margins": 0.06653538346290588, "rewards/rejected": -0.09017463773488998, "step": 1780 }, { "epoch": 0.47, "learning_rate": 3.2031357088857083e-06, "logits/chosen": 1.0200811624526978, "logits/rejected": 1.0404508113861084, "logps/chosen": -275.5518493652344, "logps/rejected": -261.8308410644531, "loss": 0.0388, "rewards/accuracies": 0.625, "rewards/chosen": -0.04572081193327904, "rewards/margins": 0.060186631977558136, "rewards/rejected": -0.10590744018554688, "step": 1790 }, { "epoch": 0.47, "learning_rate": 3.181184197019127e-06, "logits/chosen": 1.0228135585784912, "logits/rejected": 0.9979068636894226, "logps/chosen": -282.7207946777344, "logps/rejected": -236.26962280273438, "loss": 0.0352, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03686892241239548, "rewards/margins": 0.06626715511083603, "rewards/rejected": -0.1031360775232315, "step": 1800 }, { "epoch": 0.47, "eval_logits/chosen": 0.9538518190383911, "eval_logits/rejected": 1.053285002708435, "eval_logps/chosen": -281.23944091796875, "eval_logps/rejected": -253.37205505371094, "eval_loss": 0.03543499857187271, "eval_rewards/accuracies": 0.621999979019165, "eval_rewards/chosen": -0.0369262732565403, "eval_rewards/margins": 0.05863497406244278, "eval_rewards/rejected": -0.09556125104427338, "eval_runtime": 539.174, "eval_samples_per_second": 3.709, "eval_steps_per_second": 0.927, "step": 1800 }, { "epoch": 0.47, "learning_rate": 3.159175806468126e-06, "logits/chosen": 0.9597219228744507, "logits/rejected": 1.0474979877471924, "logps/chosen": -284.355712890625, "logps/rejected": -276.75836181640625, "loss": 0.0293, "rewards/accuracies": 0.625, "rewards/chosen": -0.030187245458364487, "rewards/margins": 0.052719276398420334, "rewards/rejected": -0.08290652930736542, "step": 1810 }, { "epoch": 0.48, "learning_rate": 3.1371123749269804e-06, "logits/chosen": 1.0226430892944336, "logits/rejected": 1.1090997457504272, "logps/chosen": -282.0672607421875, "logps/rejected": -252.12680053710938, "loss": 0.0356, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.027882922440767288, "rewards/margins": 0.07020456343889236, "rewards/rejected": -0.09808747470378876, "step": 1820 }, { "epoch": 0.48, "learning_rate": 3.114995744685877e-06, "logits/chosen": 1.0181407928466797, "logits/rejected": 0.9908281564712524, "logps/chosen": -258.36480712890625, "logps/rejected": -240.61569213867188, "loss": 0.0308, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.017861558124423027, "rewards/margins": 0.07849525660276413, "rewards/rejected": -0.0963568240404129, "step": 1830 }, { "epoch": 0.48, "learning_rate": 3.0928277624770743e-06, "logits/chosen": 0.9835958480834961, "logits/rejected": 1.0549378395080566, "logps/chosen": -285.7489013671875, "logps/rejected": -232.4912109375, "loss": 0.0297, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.011253233067691326, "rewards/margins": 0.08058271557092667, "rewards/rejected": -0.09183595329523087, "step": 1840 }, { "epoch": 0.48, "learning_rate": 3.070610279320708e-06, "logits/chosen": 0.9505823850631714, "logits/rejected": 1.0996264219284058, "logps/chosen": -286.63714599609375, "logps/rejected": -233.0901336669922, "loss": 0.0359, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.030136246234178543, "rewards/margins": 0.05022455379366875, "rewards/rejected": -0.0803607925772667, "step": 1850 }, { "epoch": 0.49, "learning_rate": 3.0483451503702264e-06, "logits/chosen": 1.0030499696731567, "logits/rejected": 1.0658283233642578, "logps/chosen": -287.8111877441406, "logps/rejected": -264.3537292480469, "loss": 0.043, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.020662177354097366, "rewards/margins": 0.05197754502296448, "rewards/rejected": -0.07263971865177155, "step": 1860 }, { "epoch": 0.49, "learning_rate": 3.0260342347574916e-06, "logits/chosen": 1.0229995250701904, "logits/rejected": 1.0849655866622925, "logps/chosen": -299.0933837890625, "logps/rejected": -248.93527221679688, "loss": 0.0295, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.036781955510377884, "rewards/margins": 0.0717974454164505, "rewards/rejected": -0.10857941210269928, "step": 1870 }, { "epoch": 0.49, "learning_rate": 3.0036793954375358e-06, "logits/chosen": 0.9629353284835815, "logits/rejected": 1.0485321283340454, "logps/chosen": -279.51690673828125, "logps/rejected": -242.5545196533203, "loss": 0.0355, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.01313974242657423, "rewards/margins": 0.07288579642772675, "rewards/rejected": -0.08602554351091385, "step": 1880 }, { "epoch": 0.49, "learning_rate": 2.981282499033009e-06, "logits/chosen": 0.9632253646850586, "logits/rejected": 1.0230735540390015, "logps/chosen": -293.87286376953125, "logps/rejected": -243.94613647460938, "loss": 0.0359, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.022772405296564102, "rewards/margins": 0.05214250087738037, "rewards/rejected": -0.07491490244865417, "step": 1890 }, { "epoch": 0.5, "learning_rate": 2.9588454156783163e-06, "logits/chosen": 1.0420585870742798, "logits/rejected": 1.01620614528656, "logps/chosen": -277.5963439941406, "logps/rejected": -251.56881713867188, "loss": 0.0392, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.020175188779830933, "rewards/margins": 0.05704687908291817, "rewards/rejected": -0.0772220641374588, "step": 1900 }, { "epoch": 0.5, "eval_logits/chosen": 0.9508064389228821, "eval_logits/rejected": 1.0498266220092773, "eval_logps/chosen": -280.359375, "eval_logps/rejected": -252.41934204101562, "eval_loss": 0.03548915684223175, "eval_rewards/accuracies": 0.6209999918937683, "eval_rewards/chosen": -0.028125399723649025, "eval_rewards/margins": 0.05790869519114494, "eval_rewards/rejected": -0.08603409677743912, "eval_runtime": 539.0909, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.927, "step": 1900 }, { "epoch": 0.5, "learning_rate": 2.9363700188634597e-06, "logits/chosen": 0.9709011912345886, "logits/rejected": 1.0315817594528198, "logps/chosen": -271.8665771484375, "logps/rejected": -249.4420928955078, "loss": 0.031, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03171471506357193, "rewards/margins": 0.04039089381694794, "rewards/rejected": -0.07210560888051987, "step": 1910 }, { "epoch": 0.5, "learning_rate": 2.9138581852776053e-06, "logits/chosen": 0.9720270037651062, "logits/rejected": 1.086348056793213, "logps/chosen": -276.81109619140625, "logps/rejected": -262.7275085449219, "loss": 0.0384, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.02635134384036064, "rewards/margins": 0.07241298258304596, "rewards/rejected": -0.0987643226981163, "step": 1920 }, { "epoch": 0.51, "learning_rate": 2.8913117946523805e-06, "logits/chosen": 1.0553147792816162, "logits/rejected": 1.024316430091858, "logps/chosen": -261.1982116699219, "logps/rejected": -221.4432373046875, "loss": 0.0406, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.041058607399463654, "rewards/margins": 0.05773719400167465, "rewards/rejected": -0.0987958088517189, "step": 1930 }, { "epoch": 0.51, "learning_rate": 2.8687327296049126e-06, "logits/chosen": 0.9523100852966309, "logits/rejected": 1.032663345336914, "logps/chosen": -280.92706298828125, "logps/rejected": -259.6051940917969, "loss": 0.0323, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02694753371179104, "rewards/margins": 0.04815928265452385, "rewards/rejected": -0.07510681450366974, "step": 1940 }, { "epoch": 0.51, "learning_rate": 2.8461228754806376e-06, "logits/chosen": 0.9898034930229187, "logits/rejected": 1.073132038116455, "logps/chosen": -250.3188018798828, "logps/rejected": -242.313232421875, "loss": 0.0325, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03240308165550232, "rewards/margins": 0.05959530547261238, "rewards/rejected": -0.0919983834028244, "step": 1950 }, { "epoch": 0.51, "learning_rate": 2.823484120195865e-06, "logits/chosen": 1.0166945457458496, "logits/rejected": 0.9883760213851929, "logps/chosen": -209.08822631835938, "logps/rejected": -223.0111846923828, "loss": 0.0343, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.029585499316453934, "rewards/margins": 0.055918287485837936, "rewards/rejected": -0.08550377935171127, "step": 1960 }, { "epoch": 0.52, "learning_rate": 2.8008183540801486e-06, "logits/chosen": 0.9123330116271973, "logits/rejected": 1.0141832828521729, "logps/chosen": -288.05084228515625, "logps/rejected": -267.970458984375, "loss": 0.0307, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.030767519026994705, "rewards/margins": 0.04243772476911545, "rewards/rejected": -0.07320524752140045, "step": 1970 }, { "epoch": 0.52, "learning_rate": 2.7781274697184353e-06, "logits/chosen": 0.9999529123306274, "logits/rejected": 1.0701429843902588, "logps/chosen": -274.42816162109375, "logps/rejected": -255.3940887451172, "loss": 0.0341, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03205768018960953, "rewards/margins": 0.0669126957654953, "rewards/rejected": -0.09897039085626602, "step": 1980 }, { "epoch": 0.52, "learning_rate": 2.7554133617930397e-06, "logits/chosen": 1.0071890354156494, "logits/rejected": 1.0170022249221802, "logps/chosen": -272.82794189453125, "logps/rejected": -268.2226867675781, "loss": 0.035, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03548605367541313, "rewards/margins": 0.03869449347257614, "rewards/rejected": -0.07418055832386017, "step": 1990 }, { "epoch": 0.52, "learning_rate": 2.7326779269254363e-06, "logits/chosen": 0.945580780506134, "logits/rejected": 1.0344423055648804, "logps/chosen": -254.6405792236328, "logps/rejected": -229.8059844970703, "loss": 0.0368, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.025666531175374985, "rewards/margins": 0.0334133505821228, "rewards/rejected": -0.059079885482788086, "step": 2000 }, { "epoch": 0.52, "eval_logits/chosen": 0.9577403664588928, "eval_logits/rejected": 1.0563304424285889, "eval_logps/chosen": -279.8615417480469, "eval_logps/rejected": -251.5159149169922, "eval_loss": 0.0354422889649868, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -0.023147189989686012, "eval_rewards/margins": 0.053852878510951996, "eval_rewards/rejected": -0.07700006663799286, "eval_runtime": 539.213, "eval_samples_per_second": 3.709, "eval_steps_per_second": 0.927, "step": 2000 }, { "epoch": 0.53, "learning_rate": 2.7099230635178954e-06, "logits/chosen": 0.9786656498908997, "logits/rejected": 1.0951110124588013, "logps/chosen": -250.3948974609375, "logps/rejected": -213.85202026367188, "loss": 0.0432, "rewards/accuracies": 0.6875, "rewards/chosen": -0.015626171603798866, "rewards/margins": 0.0794781818985939, "rewards/rejected": -0.09510435163974762, "step": 2010 }, { "epoch": 0.53, "learning_rate": 2.6871506715949608e-06, "logits/chosen": 1.0892430543899536, "logits/rejected": 1.11086106300354, "logps/chosen": -277.72430419921875, "logps/rejected": -263.95916748046875, "loss": 0.0435, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.028381651267409325, "rewards/margins": 0.07892771810293198, "rewards/rejected": -0.10730937868356705, "step": 2020 }, { "epoch": 0.53, "learning_rate": 2.6643626526448063e-06, "logits/chosen": 1.0042452812194824, "logits/rejected": 1.097080945968628, "logps/chosen": -245.4960479736328, "logps/rejected": -252.0270233154297, "loss": 0.0336, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.027020296081900597, "rewards/margins": 0.0670652836561203, "rewards/rejected": -0.09408558160066605, "step": 2030 }, { "epoch": 0.53, "learning_rate": 2.6415609094604562e-06, "logits/chosen": 0.9564634561538696, "logits/rejected": 1.0937979221343994, "logps/chosen": -304.175537109375, "logps/rejected": -259.8273010253906, "loss": 0.0401, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.035849399864673615, "rewards/margins": 0.067509725689888, "rewards/rejected": -0.10335911810398102, "step": 2040 }, { "epoch": 0.54, "learning_rate": 2.618747345980904e-06, "logits/chosen": 0.9391676783561707, "logits/rejected": 1.0340735912322998, "logps/chosen": -289.8169860839844, "logps/rejected": -250.7114715576172, "loss": 0.0301, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.02765297330915928, "rewards/margins": 0.06791722774505615, "rewards/rejected": -0.09557019919157028, "step": 2050 }, { "epoch": 0.54, "learning_rate": 2.595923867132136e-06, "logits/chosen": 1.0433982610702515, "logits/rejected": 1.1006929874420166, "logps/chosen": -227.78182983398438, "logps/rejected": -231.31103515625, "loss": 0.0364, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.024369673803448677, "rewards/margins": 0.061494432389736176, "rewards/rejected": -0.0858640968799591, "step": 2060 }, { "epoch": 0.54, "learning_rate": 2.5730923786680672e-06, "logits/chosen": 1.0085296630859375, "logits/rejected": 1.0929278135299683, "logps/chosen": -275.10479736328125, "logps/rejected": -230.3509521484375, "loss": 0.0339, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.031941868364810944, "rewards/margins": 0.05682260915637016, "rewards/rejected": -0.08876447379589081, "step": 2070 }, { "epoch": 0.54, "learning_rate": 2.5502547870114137e-06, "logits/chosen": 1.0008578300476074, "logits/rejected": 1.0413917303085327, "logps/chosen": -231.7599639892578, "logps/rejected": -252.7259063720703, "loss": 0.0245, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.024501001462340355, "rewards/margins": 0.04970362037420273, "rewards/rejected": -0.07420462369918823, "step": 2080 }, { "epoch": 0.55, "learning_rate": 2.527412999094507e-06, "logits/chosen": 1.006413459777832, "logits/rejected": 1.048346996307373, "logps/chosen": -283.4499816894531, "logps/rejected": -256.1369323730469, "loss": 0.0367, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03352126479148865, "rewards/margins": 0.05489424616098404, "rewards/rejected": -0.08841550350189209, "step": 2090 }, { "epoch": 0.55, "learning_rate": 2.504568922200064e-06, "logits/chosen": 0.9541667699813843, "logits/rejected": 1.0082509517669678, "logps/chosen": -265.67510986328125, "logps/rejected": -248.1321258544922, "loss": 0.0326, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.037846438586711884, "rewards/margins": 0.04263025149703026, "rewards/rejected": -0.08047669380903244, "step": 2100 }, { "epoch": 0.55, "eval_logits/chosen": 0.9759756326675415, "eval_logits/rejected": 1.075065016746521, "eval_logps/chosen": -281.1431884765625, "eval_logps/rejected": -252.96302795410156, "eval_loss": 0.03518374264240265, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -0.03596383333206177, "eval_rewards/margins": 0.055507466197013855, "eval_rewards/rejected": -0.09147130697965622, "eval_runtime": 539.005, "eval_samples_per_second": 3.711, "eval_steps_per_second": 0.928, "step": 2100 }, { "epoch": 0.55, "learning_rate": 2.4817244638019333e-06, "logits/chosen": 0.9972022175788879, "logits/rejected": 1.1107069253921509, "logps/chosen": -259.2168884277344, "logps/rejected": -252.2943878173828, "loss": 0.0356, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.034331098198890686, "rewards/margins": 0.06633375585079193, "rewards/rejected": -0.10066483914852142, "step": 2110 }, { "epoch": 0.55, "learning_rate": 2.4588815314058155e-06, "logits/chosen": 0.979550838470459, "logits/rejected": 1.015853762626648, "logps/chosen": -308.5239562988281, "logps/rejected": -290.33526611328125, "loss": 0.0363, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.033022016286849976, "rewards/margins": 0.03916890174150467, "rewards/rejected": -0.07219092547893524, "step": 2120 }, { "epoch": 0.56, "learning_rate": 2.4360420323899922e-06, "logits/chosen": 0.9291566610336304, "logits/rejected": 1.0274138450622559, "logps/chosen": -300.00830078125, "logps/rejected": -246.3135528564453, "loss": 0.0364, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.0385669507086277, "rewards/margins": 0.0751514807343483, "rewards/rejected": -0.1137184128165245, "step": 2130 }, { "epoch": 0.56, "learning_rate": 2.4132078738460585e-06, "logits/chosen": 1.0100805759429932, "logits/rejected": 1.0035889148712158, "logps/chosen": -273.31329345703125, "logps/rejected": -260.4046630859375, "loss": 0.0345, "rewards/accuracies": 0.59375, "rewards/chosen": -0.042252346873283386, "rewards/margins": 0.04604203626513481, "rewards/rejected": -0.0882943868637085, "step": 2140 }, { "epoch": 0.56, "learning_rate": 2.3903809624196826e-06, "logits/chosen": 1.0183110237121582, "logits/rejected": 1.1255147457122803, "logps/chosen": -260.5460205078125, "logps/rejected": -231.55355834960938, "loss": 0.0355, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.04934290796518326, "rewards/margins": 0.057183485478162766, "rewards/rejected": -0.10652639716863632, "step": 2150 }, { "epoch": 0.57, "learning_rate": 2.3675632041513978e-06, "logits/chosen": 1.0892599821090698, "logits/rejected": 1.1318366527557373, "logps/chosen": -243.21102905273438, "logps/rejected": -225.78549194335938, "loss": 0.0351, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.05033104866743088, "rewards/margins": 0.03708335757255554, "rewards/rejected": -0.08741440623998642, "step": 2160 }, { "epoch": 0.57, "learning_rate": 2.3447565043174533e-06, "logits/chosen": 1.0009520053863525, "logits/rejected": 0.9941670298576355, "logps/chosen": -273.72149658203125, "logps/rejected": -250.0045166015625, "loss": 0.0432, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.04186940938234329, "rewards/margins": 0.05003537982702255, "rewards/rejected": -0.09190478920936584, "step": 2170 }, { "epoch": 0.57, "learning_rate": 2.321962767270724e-06, "logits/chosen": 1.0270161628723145, "logits/rejected": 1.0816559791564941, "logps/chosen": -258.2284240722656, "logps/rejected": -254.64871215820312, "loss": 0.0411, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.04245874285697937, "rewards/margins": 0.06636542826890945, "rewards/rejected": -0.10882417112588882, "step": 2180 }, { "epoch": 0.57, "learning_rate": 2.299183896281692e-06, "logits/chosen": 0.9619997143745422, "logits/rejected": 1.1526567935943604, "logps/chosen": -272.2848815917969, "logps/rejected": -232.285888671875, "loss": 0.036, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03838383033871651, "rewards/margins": 0.07473193854093552, "rewards/rejected": -0.11311577260494232, "step": 2190 }, { "epoch": 0.58, "learning_rate": 2.2764217933795297e-06, "logits/chosen": 1.0228240489959717, "logits/rejected": 0.9570671319961548, "logps/chosen": -289.8697204589844, "logps/rejected": -259.46624755859375, "loss": 0.0368, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04657725244760513, "rewards/margins": 0.04652193933725357, "rewards/rejected": -0.0930991917848587, "step": 2200 }, { "epoch": 0.58, "eval_logits/chosen": 0.9639849066734314, "eval_logits/rejected": 1.064185619354248, "eval_logps/chosen": -281.45947265625, "eval_logps/rejected": -253.4691162109375, "eval_loss": 0.0351751483976841, "eval_rewards/accuracies": 0.6345000267028809, "eval_rewards/chosen": -0.03912654146552086, "eval_rewards/margins": 0.057405244559049606, "eval_rewards/rejected": -0.09653179347515106, "eval_runtime": 539.0332, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.928, "step": 2200 }, { "epoch": 0.58, "learning_rate": 2.2536783591932786e-06, "logits/chosen": 1.1154290437698364, "logits/rejected": 1.016729712486267, "logps/chosen": -273.88702392578125, "logps/rejected": -253.8929901123047, "loss": 0.0307, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.034655872732400894, "rewards/margins": 0.0648706778883934, "rewards/rejected": -0.0995265543460846, "step": 2210 }, { "epoch": 0.58, "learning_rate": 2.230955492793149e-06, "logits/chosen": 1.015891194343567, "logits/rejected": 1.1429827213287354, "logps/chosen": -274.21929931640625, "logps/rejected": -235.86441040039062, "loss": 0.0409, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.025577425956726074, "rewards/margins": 0.05668734759092331, "rewards/rejected": -0.08226476609706879, "step": 2220 }, { "epoch": 0.58, "learning_rate": 2.208255091531947e-06, "logits/chosen": 1.0092315673828125, "logits/rejected": 1.031456708908081, "logps/chosen": -266.92181396484375, "logps/rejected": -221.04495239257812, "loss": 0.0424, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.02549988031387329, "rewards/margins": 0.06533181667327881, "rewards/rejected": -0.0908316969871521, "step": 2230 }, { "epoch": 0.59, "learning_rate": 2.1855790508866435e-06, "logits/chosen": 0.9829255938529968, "logits/rejected": 1.0525522232055664, "logps/chosen": -292.2393798828125, "logps/rejected": -274.6322326660156, "loss": 0.0217, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.024583814665675163, "rewards/margins": 0.04256455600261688, "rewards/rejected": -0.0671483725309372, "step": 2240 }, { "epoch": 0.59, "learning_rate": 2.162929264300107e-06, "logits/chosen": 1.036615014076233, "logits/rejected": 1.1294059753417969, "logps/chosen": -254.82858276367188, "logps/rejected": -225.80612182617188, "loss": 0.0454, "rewards/accuracies": 0.625, "rewards/chosen": -0.03245388716459274, "rewards/margins": 0.05736144259572029, "rewards/rejected": -0.08981534093618393, "step": 2250 }, { "epoch": 0.59, "learning_rate": 2.1403076230230006e-06, "logits/chosen": 1.020202875137329, "logits/rejected": 1.0851867198944092, "logps/chosen": -270.38360595703125, "logps/rejected": -208.3204345703125, "loss": 0.0419, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.040919043123722076, "rewards/margins": 0.04137270897626877, "rewards/rejected": -0.08229174464941025, "step": 2260 }, { "epoch": 0.59, "learning_rate": 2.11771601595586e-06, "logits/chosen": 1.0087039470672607, "logits/rejected": 1.0265519618988037, "logps/chosen": -259.2693786621094, "logps/rejected": -261.12091064453125, "loss": 0.0282, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.017938809469342232, "rewards/margins": 0.04762765020132065, "rewards/rejected": -0.06556645780801773, "step": 2270 }, { "epoch": 0.6, "learning_rate": 2.0951563294913737e-06, "logits/chosen": 0.9380186796188354, "logits/rejected": 1.0882261991500854, "logps/chosen": -263.954345703125, "logps/rejected": -221.87899780273438, "loss": 0.0331, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.033337127417325974, "rewards/margins": 0.04793107137084007, "rewards/rejected": -0.08126820623874664, "step": 2280 }, { "epoch": 0.6, "learning_rate": 2.0726304473568693e-06, "logits/chosen": 1.0044240951538086, "logits/rejected": 1.0548676252365112, "logps/chosen": -250.0818634033203, "logps/rejected": -269.6571960449219, "loss": 0.0306, "rewards/accuracies": 0.59375, "rewards/chosen": -0.025777125731110573, "rewards/margins": 0.05805457755923271, "rewards/rejected": -0.08383170515298843, "step": 2290 }, { "epoch": 0.6, "learning_rate": 2.050140250457023e-06, "logits/chosen": 0.9972747564315796, "logits/rejected": 0.9751909375190735, "logps/chosen": -250.11572265625, "logps/rejected": -239.9993438720703, "loss": 0.0315, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.019835414364933968, "rewards/margins": 0.05462411791086197, "rewards/rejected": -0.07445952296257019, "step": 2300 }, { "epoch": 0.6, "eval_logits/chosen": 0.9676101803779602, "eval_logits/rejected": 1.0684521198272705, "eval_logps/chosen": -280.0627746582031, "eval_logps/rejected": -251.82423400878906, "eval_loss": 0.03506240248680115, "eval_rewards/accuracies": 0.6330000162124634, "eval_rewards/chosen": -0.025159668177366257, "eval_rewards/margins": 0.05492350831627846, "eval_rewards/rejected": -0.08008317649364471, "eval_runtime": 538.9096, "eval_samples_per_second": 3.711, "eval_steps_per_second": 0.928, "step": 2300 }, { "epoch": 0.6, "learning_rate": 2.0276876167168042e-06, "logits/chosen": 0.9994446039199829, "logits/rejected": 1.1471552848815918, "logps/chosen": -285.36883544921875, "logps/rejected": -264.88861083984375, "loss": 0.0435, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.021740112453699112, "rewards/margins": 0.062014125287532806, "rewards/rejected": -0.08375424146652222, "step": 2310 }, { "epoch": 0.61, "learning_rate": 2.0052744209246682e-06, "logits/chosen": 0.9745758771896362, "logits/rejected": 1.0511457920074463, "logps/chosen": -258.8453674316406, "logps/rejected": -252.9406280517578, "loss": 0.0345, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.029231492429971695, "rewards/margins": 0.04611852020025253, "rewards/rejected": -0.07535000890493393, "step": 2320 }, { "epoch": 0.61, "learning_rate": 1.9829025345760127e-06, "logits/chosen": 0.9900597333908081, "logits/rejected": 1.0469316244125366, "logps/chosen": -315.5625305175781, "logps/rejected": -272.3370361328125, "loss": 0.0276, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.026207396760582924, "rewards/margins": 0.05628987401723862, "rewards/rejected": -0.08249727636575699, "step": 2330 }, { "epoch": 0.61, "learning_rate": 1.9605738257169115e-06, "logits/chosen": 1.0605112314224243, "logits/rejected": 1.0293577909469604, "logps/chosen": -249.8504638671875, "logps/rejected": -250.99655151367188, "loss": 0.0377, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.038626205176115036, "rewards/margins": 0.04553366079926491, "rewards/rejected": -0.08415986597537994, "step": 2340 }, { "epoch": 0.62, "learning_rate": 1.9382901587881275e-06, "logits/chosen": 0.9544248580932617, "logits/rejected": 1.0862176418304443, "logps/chosen": -259.606689453125, "logps/rejected": -246.25537109375, "loss": 0.0303, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03279640153050423, "rewards/margins": 0.049646954983472824, "rewards/rejected": -0.08244334906339645, "step": 2350 }, { "epoch": 0.62, "learning_rate": 1.916053394469437e-06, "logits/chosen": 0.9803518056869507, "logits/rejected": 1.0460015535354614, "logps/chosen": -227.25918579101562, "logps/rejected": -231.8365478515625, "loss": 0.0443, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.025378238409757614, "rewards/margins": 0.05420111492276192, "rewards/rejected": -0.07957935333251953, "step": 2360 }, { "epoch": 0.62, "learning_rate": 1.8938653895242604e-06, "logits/chosen": 0.9943161010742188, "logits/rejected": 1.0234358310699463, "logps/chosen": -242.02169799804688, "logps/rejected": -225.488525390625, "loss": 0.0387, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0067793638445436954, "rewards/margins": 0.06738194823265076, "rewards/rejected": -0.07416132837533951, "step": 2370 }, { "epoch": 0.62, "learning_rate": 1.8717279966446267e-06, "logits/chosen": 0.9629790186882019, "logits/rejected": 1.0603584051132202, "logps/chosen": -270.99151611328125, "logps/rejected": -255.68594360351562, "loss": 0.0378, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.018644142895936966, "rewards/margins": 0.057522498071193695, "rewards/rejected": -0.07616663724184036, "step": 2380 }, { "epoch": 0.63, "learning_rate": 1.8496430642964698e-06, "logits/chosen": 0.9359694719314575, "logits/rejected": 1.0587961673736572, "logps/chosen": -254.59310913085938, "logps/rejected": -238.82177734375, "loss": 0.0316, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.014369276352226734, "rewards/margins": 0.056524503976106644, "rewards/rejected": -0.0708937793970108, "step": 2390 }, { "epoch": 0.63, "learning_rate": 1.827612436565286e-06, "logits/chosen": 0.947625458240509, "logits/rejected": 1.0177226066589355, "logps/chosen": -253.84335327148438, "logps/rejected": -233.13687133789062, "loss": 0.0341, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.021341439336538315, "rewards/margins": 0.062166161835193634, "rewards/rejected": -0.08350759744644165, "step": 2400 }, { "epoch": 0.63, "eval_logits/chosen": 0.940484881401062, "eval_logits/rejected": 1.0420405864715576, "eval_logps/chosen": -279.9447021484375, "eval_logps/rejected": -251.8425750732422, "eval_loss": 0.0352231003344059, "eval_rewards/accuracies": 0.6320000290870667, "eval_rewards/chosen": -0.023978877812623978, "eval_rewards/margins": 0.05628751218318939, "eval_rewards/rejected": -0.08026638627052307, "eval_runtime": 539.1415, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.927, "step": 2400 }, { "epoch": 0.63, "learning_rate": 1.8056379530021492e-06, "logits/chosen": 0.9212465286254883, "logits/rejected": 1.0563522577285767, "logps/chosen": -255.9601287841797, "logps/rejected": -224.5750274658203, "loss": 0.044, "rewards/accuracies": 0.625, "rewards/chosen": -0.024405932053923607, "rewards/margins": 0.059529535472393036, "rewards/rejected": -0.0839354619383812, "step": 2410 }, { "epoch": 0.63, "learning_rate": 1.7837214484701154e-06, "logits/chosen": 1.0158764123916626, "logits/rejected": 1.0589698553085327, "logps/chosen": -272.985107421875, "logps/rejected": -264.05865478515625, "loss": 0.0395, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02469342015683651, "rewards/margins": 0.07536058127880096, "rewards/rejected": -0.10005400329828262, "step": 2420 }, { "epoch": 0.64, "learning_rate": 1.7618647529910043e-06, "logits/chosen": 0.9243197441101074, "logits/rejected": 1.1003185510635376, "logps/chosen": -260.38006591796875, "logps/rejected": -240.22866821289062, "loss": 0.0376, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.01998872682452202, "rewards/margins": 0.06689772009849548, "rewards/rejected": -0.0868864506483078, "step": 2430 }, { "epoch": 0.64, "learning_rate": 1.7400696915925996e-06, "logits/chosen": 0.9673307538032532, "logits/rejected": 1.0932881832122803, "logps/chosen": -277.5309753417969, "logps/rejected": -269.5796813964844, "loss": 0.0248, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.010230915620923042, "rewards/margins": 0.076592355966568, "rewards/rejected": -0.08682326972484589, "step": 2440 }, { "epoch": 0.64, "learning_rate": 1.718338084156254e-06, "logits/chosen": 0.9628580212593079, "logits/rejected": 1.0081040859222412, "logps/chosen": -276.6579895019531, "logps/rejected": -248.1109161376953, "loss": 0.03, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.01852061226963997, "rewards/margins": 0.05575736239552498, "rewards/rejected": -0.07427798211574554, "step": 2450 }, { "epoch": 0.64, "learning_rate": 1.6966717452649372e-06, "logits/chosen": 0.9334288835525513, "logits/rejected": 1.002824068069458, "logps/chosen": -279.92205810546875, "logps/rejected": -271.9393615722656, "loss": 0.0263, "rewards/accuracies": 0.65625, "rewards/chosen": -0.017314914613962173, "rewards/margins": 0.059098273515701294, "rewards/rejected": -0.07641319185495377, "step": 2460 }, { "epoch": 0.65, "learning_rate": 1.6750724840517103e-06, "logits/chosen": 0.9612905383110046, "logits/rejected": 1.0620540380477905, "logps/chosen": -294.86712646484375, "logps/rejected": -253.3771514892578, "loss": 0.0423, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.017185799777507782, "rewards/margins": 0.07057368010282516, "rewards/rejected": -0.08775947988033295, "step": 2470 }, { "epoch": 0.65, "learning_rate": 1.6535421040486686e-06, "logits/chosen": 0.9643794894218445, "logits/rejected": 1.0106008052825928, "logps/chosen": -256.61090087890625, "logps/rejected": -232.49862670898438, "loss": 0.0411, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.018548784777522087, "rewards/margins": 0.054013751447200775, "rewards/rejected": -0.07256253063678741, "step": 2480 }, { "epoch": 0.65, "learning_rate": 1.6320824030363458e-06, "logits/chosen": 0.8658086657524109, "logits/rejected": 1.1067800521850586, "logps/chosen": -268.14715576171875, "logps/rejected": -246.0467529296875, "loss": 0.0403, "rewards/accuracies": 0.65625, "rewards/chosen": -0.025564253330230713, "rewards/margins": 0.06752271950244904, "rewards/rejected": -0.09308697283267975, "step": 2490 }, { "epoch": 0.65, "learning_rate": 1.6106951728936028e-06, "logits/chosen": 1.0061540603637695, "logits/rejected": 1.109466314315796, "logps/chosen": -231.77230834960938, "logps/rejected": -221.00439453125, "loss": 0.0488, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03300454095005989, "rewards/margins": 0.05412193387746811, "rewards/rejected": -0.0871264785528183, "step": 2500 }, { "epoch": 0.65, "eval_logits/chosen": 0.9377838373184204, "eval_logits/rejected": 1.039380669593811, "eval_logps/chosen": -280.7594299316406, "eval_logps/rejected": -252.99684143066406, "eval_loss": 0.035037338733673096, "eval_rewards/accuracies": 0.6340000033378601, "eval_rewards/chosen": -0.03212602436542511, "eval_rewards/margins": 0.059683240950107574, "eval_rewards/rejected": -0.09180926531553268, "eval_runtime": 539.216, "eval_samples_per_second": 3.709, "eval_steps_per_second": 0.927, "step": 2500 }, { "epoch": 0.66, "learning_rate": 1.5893821994479996e-06, "logits/chosen": 1.040175199508667, "logits/rejected": 1.122536301612854, "logps/chosen": -279.53924560546875, "logps/rejected": -262.05816650390625, "loss": 0.0313, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.026052657514810562, "rewards/margins": 0.08227355033159256, "rewards/rejected": -0.10832621157169342, "step": 2510 }, { "epoch": 0.66, "learning_rate": 1.5681452623266868e-06, "logits/chosen": 0.9631746411323547, "logits/rejected": 1.0485169887542725, "logps/chosen": -300.1724548339844, "logps/rejected": -261.44134521484375, "loss": 0.0336, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.030287887901067734, "rewards/margins": 0.05683339759707451, "rewards/rejected": -0.08712128549814224, "step": 2520 }, { "epoch": 0.66, "learning_rate": 1.5469861348078014e-06, "logits/chosen": 0.9725979566574097, "logits/rejected": 1.0084983110427856, "logps/chosen": -267.7999572753906, "logps/rejected": -241.93508911132812, "loss": 0.0452, "rewards/accuracies": 0.625, "rewards/chosen": -0.027236010879278183, "rewards/margins": 0.05542879179120064, "rewards/rejected": -0.08266480267047882, "step": 2530 }, { "epoch": 0.66, "learning_rate": 1.5259065836724035e-06, "logits/chosen": 0.9311686754226685, "logits/rejected": 1.0091297626495361, "logps/chosen": -268.8539123535156, "logps/rejected": -257.43206787109375, "loss": 0.0284, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0198749378323555, "rewards/margins": 0.05286857485771179, "rewards/rejected": -0.07274351269006729, "step": 2540 }, { "epoch": 0.67, "learning_rate": 1.5049083690569456e-06, "logits/chosen": 0.8933135867118835, "logits/rejected": 1.0143239498138428, "logps/chosen": -276.6771545410156, "logps/rejected": -236.60525512695312, "loss": 0.0394, "rewards/accuracies": 0.65625, "rewards/chosen": -0.040222011506557465, "rewards/margins": 0.06946249306201935, "rewards/rejected": -0.10968450456857681, "step": 2550 }, { "epoch": 0.67, "learning_rate": 1.4839932443063057e-06, "logits/chosen": 1.0430926084518433, "logits/rejected": 1.0419895648956299, "logps/chosen": -241.27578735351562, "logps/rejected": -215.1394500732422, "loss": 0.0433, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.04254927858710289, "rewards/margins": 0.054975200444459915, "rewards/rejected": -0.0975244790315628, "step": 2560 }, { "epoch": 0.67, "learning_rate": 1.4631629558273803e-06, "logits/chosen": 0.9953416585922241, "logits/rejected": 1.0442326068878174, "logps/chosen": -291.2084045410156, "logps/rejected": -256.3158264160156, "loss": 0.0382, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.02477596327662468, "rewards/margins": 0.07663208246231079, "rewards/rejected": -0.10140804201364517, "step": 2570 }, { "epoch": 0.68, "learning_rate": 1.4424192429432657e-06, "logits/chosen": 1.0064821243286133, "logits/rejected": 1.0545318126678467, "logps/chosen": -307.71038818359375, "logps/rejected": -225.4966583251953, "loss": 0.0361, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.026431897655129433, "rewards/margins": 0.06981770694255829, "rewards/rejected": -0.09624960273504257, "step": 2580 }, { "epoch": 0.68, "learning_rate": 1.421763837748016e-06, "logits/chosen": 0.9946783185005188, "logits/rejected": 1.0760682821273804, "logps/chosen": -267.80877685546875, "logps/rejected": -273.41168212890625, "loss": 0.0416, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04798274114727974, "rewards/margins": 0.05912737920880318, "rewards/rejected": -0.10711012035608292, "step": 2590 }, { "epoch": 0.68, "learning_rate": 1.401198464962021e-06, "logits/chosen": 1.0298030376434326, "logits/rejected": 0.9537370800971985, "logps/chosen": -272.90863037109375, "logps/rejected": -239.8860321044922, "loss": 0.0279, "rewards/accuracies": 0.625, "rewards/chosen": -0.03228624537587166, "rewards/margins": 0.05655151605606079, "rewards/rejected": -0.08883775770664215, "step": 2600 }, { "epoch": 0.68, "eval_logits/chosen": 0.9350239038467407, "eval_logits/rejected": 1.0360502004623413, "eval_logps/chosen": -281.37646484375, "eval_logps/rejected": -253.77207946777344, "eval_loss": 0.03485475853085518, "eval_rewards/accuracies": 0.6315000057220459, "eval_rewards/chosen": -0.038296524435281754, "eval_rewards/margins": 0.06126519292593002, "eval_rewards/rejected": -0.09956171363592148, "eval_runtime": 539.0411, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.928, "step": 2600 }, { "epoch": 0.68, "learning_rate": 1.3807248417879896e-06, "logits/chosen": 0.9518525004386902, "logits/rejected": 0.9893406629562378, "logps/chosen": -258.8128356933594, "logps/rejected": -219.7599639892578, "loss": 0.0339, "rewards/accuracies": 0.625, "rewards/chosen": -0.02622481808066368, "rewards/margins": 0.05761373043060303, "rewards/rejected": -0.083838552236557, "step": 2610 }, { "epoch": 0.69, "learning_rate": 1.3603446777675665e-06, "logits/chosen": 0.9631742238998413, "logits/rejected": 0.9899166822433472, "logps/chosen": -260.05377197265625, "logps/rejected": -240.6715087890625, "loss": 0.0313, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.033961087465286255, "rewards/margins": 0.05184347182512283, "rewards/rejected": -0.08580456674098969, "step": 2620 }, { "epoch": 0.69, "learning_rate": 1.3400596746385817e-06, "logits/chosen": 0.9519561529159546, "logits/rejected": 0.9998300671577454, "logps/chosen": -290.88958740234375, "logps/rejected": -255.8434600830078, "loss": 0.0278, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.036170247942209244, "rewards/margins": 0.07095544040203094, "rewards/rejected": -0.10712568461894989, "step": 2630 }, { "epoch": 0.69, "learning_rate": 1.3198715261929587e-06, "logits/chosen": 0.9674153327941895, "logits/rejected": 1.0848486423492432, "logps/chosen": -322.2984924316406, "logps/rejected": -246.0180206298828, "loss": 0.0367, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.029658863320946693, "rewards/margins": 0.04690408706665039, "rewards/rejected": -0.07656295597553253, "step": 2640 }, { "epoch": 0.69, "learning_rate": 1.2997819181352823e-06, "logits/chosen": 0.979369044303894, "logits/rejected": 0.9796104431152344, "logps/chosen": -299.4449462890625, "logps/rejected": -267.0638122558594, "loss": 0.0319, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.033346764743328094, "rewards/margins": 0.05519961193203926, "rewards/rejected": -0.08854638040065765, "step": 2650 }, { "epoch": 0.7, "learning_rate": 1.2797925279420454e-06, "logits/chosen": 1.0160847902297974, "logits/rejected": 1.0748382806777954, "logps/chosen": -291.21661376953125, "logps/rejected": -258.7221374511719, "loss": 0.0387, "rewards/accuracies": 0.5625, "rewards/chosen": -0.040911462157964706, "rewards/margins": 0.046320244669914246, "rewards/rejected": -0.08723169565200806, "step": 2660 }, { "epoch": 0.7, "learning_rate": 1.2599050247215764e-06, "logits/chosen": 0.9987077713012695, "logits/rejected": 1.0722219944000244, "logps/chosen": -286.01446533203125, "logps/rejected": -262.60101318359375, "loss": 0.0249, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.023940104991197586, "rewards/margins": 0.07719768583774567, "rewards/rejected": -0.10113777965307236, "step": 2670 }, { "epoch": 0.7, "learning_rate": 1.2401210690746705e-06, "logits/chosen": 0.949263870716095, "logits/rejected": 1.113747239112854, "logps/chosen": -264.36834716796875, "logps/rejected": -241.06570434570312, "loss": 0.0276, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.029313066974282265, "rewards/margins": 0.047565605491399765, "rewards/rejected": -0.07687868177890778, "step": 2680 }, { "epoch": 0.7, "learning_rate": 1.2204423129559306e-06, "logits/chosen": 0.8770235180854797, "logits/rejected": 1.056774377822876, "logps/chosen": -296.3006896972656, "logps/rejected": -270.3150939941406, "loss": 0.0346, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.026878798380494118, "rewards/margins": 0.04574307054281235, "rewards/rejected": -0.07262186706066132, "step": 2690 }, { "epoch": 0.71, "learning_rate": 1.20087039953583e-06, "logits/chosen": 0.9897807240486145, "logits/rejected": 1.0115458965301514, "logps/chosen": -281.16778564453125, "logps/rejected": -259.2882080078125, "loss": 0.0427, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.01994657889008522, "rewards/margins": 0.06008830666542053, "rewards/rejected": -0.08003488928079605, "step": 2700 }, { "epoch": 0.71, "eval_logits/chosen": 0.9318579435348511, "eval_logits/rejected": 1.0336464643478394, "eval_logps/chosen": -280.66436767578125, "eval_logps/rejected": -252.92898559570312, "eval_loss": 0.03483254089951515, "eval_rewards/accuracies": 0.6309999823570251, "eval_rewards/chosen": -0.031175779178738594, "eval_rewards/margins": 0.05995478481054306, "eval_rewards/rejected": -0.0911305621266365, "eval_runtime": 539.1556, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.927, "step": 2700 }, { "epoch": 0.71, "learning_rate": 1.181406963063507e-06, "logits/chosen": 0.987993061542511, "logits/rejected": 0.9600605964660645, "logps/chosen": -277.44580078125, "logps/rejected": -232.87661743164062, "loss": 0.0366, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.025253716856241226, "rewards/margins": 0.05700179934501648, "rewards/rejected": -0.082255519926548, "step": 2710 }, { "epoch": 0.71, "learning_rate": 1.1620536287303052e-06, "logits/chosen": 0.9831315279006958, "logits/rejected": 1.0662561655044556, "logps/chosen": -261.1286315917969, "logps/rejected": -243.3155059814453, "loss": 0.0467, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03405457362532616, "rewards/margins": 0.06027429178357124, "rewards/rejected": -0.0943288654088974, "step": 2720 }, { "epoch": 0.71, "learning_rate": 1.1428120125340717e-06, "logits/chosen": 1.0292747020721436, "logits/rejected": 0.947592556476593, "logps/chosen": -272.7021484375, "logps/rejected": -243.03701782226562, "loss": 0.0401, "rewards/accuracies": 0.625, "rewards/chosen": -0.02782285585999489, "rewards/margins": 0.05716438964009285, "rewards/rejected": -0.08498723804950714, "step": 2730 }, { "epoch": 0.72, "learning_rate": 1.123683721144223e-06, "logits/chosen": 0.977290153503418, "logits/rejected": 1.0009124279022217, "logps/chosen": -272.8582458496094, "logps/rejected": -241.51687622070312, "loss": 0.0348, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03376708924770355, "rewards/margins": 0.07209788262844086, "rewards/rejected": -0.105864979326725, "step": 2740 }, { "epoch": 0.72, "learning_rate": 1.1046703517675848e-06, "logits/chosen": 0.973223865032196, "logits/rejected": 0.9701916575431824, "logps/chosen": -247.44888305664062, "logps/rejected": -250.46353149414062, "loss": 0.0384, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.028101569041609764, "rewards/margins": 0.05619729310274124, "rewards/rejected": -0.08429885655641556, "step": 2750 }, { "epoch": 0.72, "learning_rate": 1.085773492015028e-06, "logits/chosen": 0.960196852684021, "logits/rejected": 1.067030668258667, "logps/chosen": -288.462158203125, "logps/rejected": -235.9557647705078, "loss": 0.0315, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03159577399492264, "rewards/margins": 0.07470157742500305, "rewards/rejected": -0.10629735141992569, "step": 2760 }, { "epoch": 0.72, "learning_rate": 1.0669947197689034e-06, "logits/chosen": 0.9508602023124695, "logits/rejected": 1.0008846521377563, "logps/chosen": -229.83627319335938, "logps/rejected": -243.19570922851562, "loss": 0.0447, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.04542272537946701, "rewards/margins": 0.0430048331618309, "rewards/rejected": -0.08842755109071732, "step": 2770 }, { "epoch": 0.73, "learning_rate": 1.048335603051291e-06, "logits/chosen": 0.9108030200004578, "logits/rejected": 1.035298228263855, "logps/chosen": -272.658935546875, "logps/rejected": -255.37905883789062, "loss": 0.0379, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.028898200020194054, "rewards/margins": 0.07144349068403244, "rewards/rejected": -0.10034169256687164, "step": 2780 }, { "epoch": 0.73, "learning_rate": 1.0297976998930665e-06, "logits/chosen": 0.9145883321762085, "logits/rejected": 0.9523155093193054, "logps/chosen": -278.5872497558594, "logps/rejected": -242.91748046875, "loss": 0.0345, "rewards/accuracies": 0.625, "rewards/chosen": -0.023207422345876694, "rewards/margins": 0.05597452074289322, "rewards/rejected": -0.07918194681406021, "step": 2790 }, { "epoch": 0.73, "learning_rate": 1.0113825582038078e-06, "logits/chosen": 0.9705532193183899, "logits/rejected": 1.0568289756774902, "logps/chosen": -271.0597839355469, "logps/rejected": -225.5278778076172, "loss": 0.0331, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03660514950752258, "rewards/margins": 0.05592336505651474, "rewards/rejected": -0.09252851456403732, "step": 2800 }, { "epoch": 0.73, "eval_logits/chosen": 0.9335169196128845, "eval_logits/rejected": 1.035439372062683, "eval_logps/chosen": -280.4610900878906, "eval_logps/rejected": -252.53688049316406, "eval_loss": 0.0349029041826725, "eval_rewards/accuracies": 0.6290000081062317, "eval_rewards/chosen": -0.0291427094489336, "eval_rewards/margins": 0.058066822588443756, "eval_rewards/rejected": -0.08720952272415161, "eval_runtime": 539.1069, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.927, "step": 2800 }, { "epoch": 0.74, "learning_rate": 9.930917156425477e-07, "logits/chosen": 0.9502407908439636, "logits/rejected": 1.072632908821106, "logps/chosen": -246.87118530273438, "logps/rejected": -225.63467407226562, "loss": 0.0405, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.018824134021997452, "rewards/margins": 0.0712006688117981, "rewards/rejected": -0.09002481400966644, "step": 2810 }, { "epoch": 0.74, "learning_rate": 9.749266994893756e-07, "logits/chosen": 0.9383459091186523, "logits/rejected": 1.0246083736419678, "logps/chosen": -242.57315063476562, "logps/rejected": -255.66812133789062, "loss": 0.0387, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.029849324375391006, "rewards/margins": 0.04549198970198631, "rewards/rejected": -0.07534130662679672, "step": 2820 }, { "epoch": 0.74, "learning_rate": 9.56889026517913e-07, "logits/chosen": 0.9737696647644043, "logits/rejected": 1.0874649286270142, "logps/chosen": -272.6422424316406, "logps/rejected": -241.7522430419922, "loss": 0.0379, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.027452822774648666, "rewards/margins": 0.06477371603250504, "rewards/rejected": -0.0922265350818634, "step": 2830 }, { "epoch": 0.74, "learning_rate": 9.389802028686617e-07, "logits/chosen": 0.9742962121963501, "logits/rejected": 1.0039392709732056, "logps/chosen": -219.0965118408203, "logps/rejected": -192.56277465820312, "loss": 0.0343, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.029790541157126427, "rewards/margins": 0.05616752430796623, "rewards/rejected": -0.08595806360244751, "step": 2840 }, { "epoch": 0.75, "learning_rate": 9.212017239232427e-07, "logits/chosen": 0.951312243938446, "logits/rejected": 1.0733340978622437, "logps/chosen": -271.9151916503906, "logps/rejected": -280.4118957519531, "loss": 0.0357, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.01736624911427498, "rewards/margins": 0.08112471550703049, "rewards/rejected": -0.09849096834659576, "step": 2850 }, { "epoch": 0.75, "learning_rate": 9.03555074179533e-07, "logits/chosen": 0.9455773234367371, "logits/rejected": 1.05752432346344, "logps/chosen": -267.56500244140625, "logps/rejected": -246.6649932861328, "loss": 0.0291, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02218179777264595, "rewards/margins": 0.07102219760417938, "rewards/rejected": -0.09320400655269623, "step": 2860 }, { "epoch": 0.75, "learning_rate": 8.860417271277067e-07, "logits/chosen": 0.9300365447998047, "logits/rejected": 1.022707462310791, "logps/chosen": -242.83029174804688, "logps/rejected": -241.719970703125, "loss": 0.0441, "rewards/accuracies": 0.59375, "rewards/chosen": -0.027262404561042786, "rewards/margins": 0.0598478689789772, "rewards/rejected": -0.08711027354001999, "step": 2870 }, { "epoch": 0.75, "learning_rate": 8.686631451272029e-07, "logits/chosen": 0.9614574313163757, "logits/rejected": 0.9695903658866882, "logps/chosen": -269.4878845214844, "logps/rejected": -222.73403930664062, "loss": 0.0401, "rewards/accuracies": 0.59375, "rewards/chosen": -0.035077519714832306, "rewards/margins": 0.054178714752197266, "rewards/rejected": -0.08925624191761017, "step": 2880 }, { "epoch": 0.76, "learning_rate": 8.514207792846168e-07, "logits/chosen": 1.0225694179534912, "logits/rejected": 1.0496468544006348, "logps/chosen": -252.6670379638672, "logps/rejected": -257.1852111816406, "loss": 0.0488, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02861052379012108, "rewards/margins": 0.060965172946453094, "rewards/rejected": -0.08957569301128387, "step": 2890 }, { "epoch": 0.76, "learning_rate": 8.343160693325356e-07, "logits/chosen": 0.9750697016716003, "logits/rejected": 1.0202362537384033, "logps/chosen": -232.67269897460938, "logps/rejected": -227.71505737304688, "loss": 0.0415, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03148717060685158, "rewards/margins": 0.05822090432047844, "rewards/rejected": -0.08970808237791061, "step": 2900 }, { "epoch": 0.76, "eval_logits/chosen": 0.9227569699287415, "eval_logits/rejected": 1.0247799158096313, "eval_logps/chosen": -280.527587890625, "eval_logps/rejected": -252.64686584472656, "eval_loss": 0.03488382324576378, "eval_rewards/accuracies": 0.6315000057220459, "eval_rewards/chosen": -0.02980780228972435, "eval_rewards/margins": 0.0585014745593071, "eval_rewards/rejected": -0.08830928802490234, "eval_runtime": 539.0433, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.928, "step": 2900 }, { "epoch": 0.76, "learning_rate": 8.173504435093174e-07, "logits/chosen": 0.9547770619392395, "logits/rejected": 0.9979850649833679, "logps/chosen": -285.94573974609375, "logps/rejected": -269.0919494628906, "loss": 0.0381, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.032671697437763214, "rewards/margins": 0.04868137463927269, "rewards/rejected": -0.0813530758023262, "step": 2910 }, { "epoch": 0.76, "learning_rate": 8.00525318439836e-07, "logits/chosen": 1.0028339624404907, "logits/rejected": 1.0430415868759155, "logps/chosen": -271.57373046875, "logps/rejected": -232.68222045898438, "loss": 0.0312, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03152400627732277, "rewards/margins": 0.061547745019197464, "rewards/rejected": -0.09307174384593964, "step": 2920 }, { "epoch": 0.77, "learning_rate": 7.838420990171927e-07, "logits/chosen": 0.9217666387557983, "logits/rejected": 1.0288439989089966, "logps/chosen": -313.1500549316406, "logps/rejected": -267.90985107421875, "loss": 0.0292, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.022273462265729904, "rewards/margins": 0.06082174926996231, "rewards/rejected": -0.08309520781040192, "step": 2930 }, { "epoch": 0.77, "learning_rate": 7.673021782854084e-07, "logits/chosen": 0.9779103994369507, "logits/rejected": 0.9794729351997375, "logps/chosen": -250.5184783935547, "logps/rejected": -264.76287841796875, "loss": 0.0435, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.028768843039870262, "rewards/margins": 0.060712385922670364, "rewards/rejected": -0.08948123455047607, "step": 2940 }, { "epoch": 0.77, "learning_rate": 7.509069373231039e-07, "logits/chosen": 0.9786937832832336, "logits/rejected": 0.9728788137435913, "logps/chosen": -294.4608459472656, "logps/rejected": -271.31146240234375, "loss": 0.0281, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.019416136667132378, "rewards/margins": 0.06897474080324173, "rewards/rejected": -0.08839087188243866, "step": 2950 }, { "epoch": 0.77, "learning_rate": 7.346577451281822e-07, "logits/chosen": 0.9356600046157837, "logits/rejected": 1.0507375001907349, "logps/chosen": -251.129638671875, "logps/rejected": -232.9912567138672, "loss": 0.0454, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03892569988965988, "rewards/margins": 0.061569105833768845, "rewards/rejected": -0.10049480199813843, "step": 2960 }, { "epoch": 0.78, "learning_rate": 7.185559585035138e-07, "logits/chosen": 0.9160947799682617, "logits/rejected": 0.9816699028015137, "logps/chosen": -286.2984924316406, "logps/rejected": -260.083984375, "loss": 0.0455, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01913222298026085, "rewards/margins": 0.05808521434664726, "rewards/rejected": -0.07721744477748871, "step": 2970 }, { "epoch": 0.78, "learning_rate": 7.026029219436504e-07, "logits/chosen": 0.9672778844833374, "logits/rejected": 1.019814372062683, "logps/chosen": -306.3282470703125, "logps/rejected": -272.06280517578125, "loss": 0.0323, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02404235675930977, "rewards/margins": 0.06635276973247528, "rewards/rejected": -0.09039512276649475, "step": 2980 }, { "epoch": 0.78, "learning_rate": 6.867999675225523e-07, "logits/chosen": 0.9749993085861206, "logits/rejected": 0.9944796562194824, "logps/chosen": -297.462646484375, "logps/rejected": -249.9281768798828, "loss": 0.0296, "rewards/accuracies": 0.625, "rewards/chosen": -0.019663769751787186, "rewards/margins": 0.053207218647003174, "rewards/rejected": -0.07287098467350006, "step": 2990 }, { "epoch": 0.79, "learning_rate": 6.711484147823663e-07, "logits/chosen": 0.9646242260932922, "logits/rejected": 0.9969871640205383, "logps/chosen": -270.52301025390625, "logps/rejected": -241.90933227539062, "loss": 0.0404, "rewards/accuracies": 0.65625, "rewards/chosen": -0.02490374445915222, "rewards/margins": 0.06743566691875458, "rewards/rejected": -0.0923394113779068, "step": 3000 }, { "epoch": 0.79, "eval_logits/chosen": 0.927689790725708, "eval_logits/rejected": 1.030518889427185, "eval_logps/chosen": -280.2290954589844, "eval_logps/rejected": -252.4009246826172, "eval_loss": 0.034884098917245865, "eval_rewards/accuracies": 0.6294999718666077, "eval_rewards/chosen": -0.026823006570339203, "eval_rewards/margins": 0.059027016162872314, "eval_rewards/rejected": -0.08585001528263092, "eval_runtime": 539.1402, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.927, "step": 3000 }, { "epoch": 0.79, "learning_rate": 6.556495706232413e-07, "logits/chosen": 0.9685632586479187, "logits/rejected": 1.0489590167999268, "logps/chosen": -274.52239990234375, "logps/rejected": -239.2083740234375, "loss": 0.0369, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.01814478076994419, "rewards/margins": 0.06804581731557846, "rewards/rejected": -0.0861906185746193, "step": 3010 }, { "epoch": 0.79, "learning_rate": 6.403047291942057e-07, "logits/chosen": 0.9458588361740112, "logits/rejected": 0.9609957933425903, "logps/chosen": -278.5867919921875, "logps/rejected": -264.2691955566406, "loss": 0.0291, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.01274092961102724, "rewards/margins": 0.05699128657579422, "rewards/rejected": -0.06973221898078918, "step": 3020 }, { "epoch": 0.79, "learning_rate": 6.251151717851023e-07, "logits/chosen": 0.9313241243362427, "logits/rejected": 0.9795175790786743, "logps/chosen": -291.05364990234375, "logps/rejected": -275.7203063964844, "loss": 0.0411, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.02996246889233589, "rewards/margins": 0.08125524967908859, "rewards/rejected": -0.11121772229671478, "step": 3030 }, { "epoch": 0.8, "learning_rate": 6.100821667196041e-07, "logits/chosen": 0.9516725540161133, "logits/rejected": 1.033613920211792, "logps/chosen": -304.0897216796875, "logps/rejected": -248.3233642578125, "loss": 0.0313, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.012124654836952686, "rewards/margins": 0.06473545730113983, "rewards/rejected": -0.07686010748147964, "step": 3040 }, { "epoch": 0.8, "learning_rate": 5.952069692493062e-07, "logits/chosen": 0.8957148790359497, "logits/rejected": 1.0140354633331299, "logps/chosen": -300.5768127441406, "logps/rejected": -266.0682373046875, "loss": 0.0293, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.006201650947332382, "rewards/margins": 0.07337900996208191, "rewards/rejected": -0.07958065718412399, "step": 3050 }, { "epoch": 0.8, "learning_rate": 5.80490821448918e-07, "logits/chosen": 0.9774069786071777, "logits/rejected": 1.0049412250518799, "logps/chosen": -222.11856079101562, "logps/rejected": -206.6851348876953, "loss": 0.0289, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.029702406376600266, "rewards/margins": 0.059780023992061615, "rewards/rejected": -0.08948242664337158, "step": 3060 }, { "epoch": 0.8, "learning_rate": 5.659349521125459e-07, "logits/chosen": 0.945693850517273, "logits/rejected": 1.0435597896575928, "logps/chosen": -238.2971649169922, "logps/rejected": -243.50106811523438, "loss": 0.0443, "rewards/accuracies": 0.65625, "rewards/chosen": -0.02947511151432991, "rewards/margins": 0.07011254131793976, "rewards/rejected": -0.09958765655755997, "step": 3070 }, { "epoch": 0.81, "learning_rate": 5.5154057665109e-07, "logits/chosen": 0.9794226884841919, "logits/rejected": 1.0414973497390747, "logps/chosen": -280.0814208984375, "logps/rejected": -243.6100311279297, "loss": 0.0303, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.022864539176225662, "rewards/margins": 0.06399150937795639, "rewards/rejected": -0.08685605973005295, "step": 3080 }, { "epoch": 0.81, "learning_rate": 5.373088969907586e-07, "logits/chosen": 0.9371223449707031, "logits/rejected": 1.0262264013290405, "logps/chosen": -260.27728271484375, "logps/rejected": -258.8297424316406, "loss": 0.0314, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.02002180740237236, "rewards/margins": 0.06138715147972107, "rewards/rejected": -0.08140896260738373, "step": 3090 }, { "epoch": 0.81, "learning_rate": 5.23241101472709e-07, "logits/chosen": 1.0079492330551147, "logits/rejected": 0.9960187673568726, "logps/chosen": -255.51211547851562, "logps/rejected": -243.93319702148438, "loss": 0.0362, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.02185986004769802, "rewards/margins": 0.057953812181949615, "rewards/rejected": -0.07981367409229279, "step": 3100 }, { "epoch": 0.81, "eval_logits/chosen": 0.9270116090774536, "eval_logits/rejected": 1.0295790433883667, "eval_logps/chosen": -280.18609619140625, "eval_logps/rejected": -252.307861328125, "eval_loss": 0.03481233865022659, "eval_rewards/accuracies": 0.6305000185966492, "eval_rewards/chosen": -0.026392878964543343, "eval_rewards/margins": 0.058526668697595596, "eval_rewards/rejected": -0.08491955697536469, "eval_runtime": 539.1609, "eval_samples_per_second": 3.709, "eval_steps_per_second": 0.927, "step": 3100 }, { "epoch": 0.81, "learning_rate": 5.09338364753818e-07, "logits/chosen": 0.9137361645698547, "logits/rejected": 1.0361106395721436, "logps/chosen": -293.02032470703125, "logps/rejected": -264.98883056640625, "loss": 0.0454, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.01966175064444542, "rewards/margins": 0.090638667345047, "rewards/rejected": -0.11030042171478271, "step": 3110 }, { "epoch": 0.82, "learning_rate": 4.956018477086005e-07, "logits/chosen": 1.0527143478393555, "logits/rejected": 1.0417900085449219, "logps/chosen": -281.76318359375, "logps/rejected": -230.76205444335938, "loss": 0.0363, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.02277226559817791, "rewards/margins": 0.055615413933992386, "rewards/rejected": -0.07838768512010574, "step": 3120 }, { "epoch": 0.82, "learning_rate": 4.820326973322764e-07, "logits/chosen": 0.9705474972724915, "logits/rejected": 1.0229809284210205, "logps/chosen": -295.6360778808594, "logps/rejected": -221.80020141601562, "loss": 0.033, "rewards/accuracies": 0.65625, "rewards/chosen": -0.01131758838891983, "rewards/margins": 0.0728193074464798, "rewards/rejected": -0.08413688838481903, "step": 3130 }, { "epoch": 0.82, "learning_rate": 4.686320466449981e-07, "logits/chosen": 0.9558774828910828, "logits/rejected": 1.006240725517273, "logps/chosen": -237.345947265625, "logps/rejected": -222.2642364501953, "loss": 0.0358, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.022244829684495926, "rewards/margins": 0.07573308050632477, "rewards/rejected": -0.0979778990149498, "step": 3140 }, { "epoch": 0.82, "learning_rate": 4.554010145972418e-07, "logits/chosen": 0.956200897693634, "logits/rejected": 1.0314735174179077, "logps/chosen": -285.91473388671875, "logps/rejected": -229.6345977783203, "loss": 0.0312, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.015750734135508537, "rewards/margins": 0.0518513098359108, "rewards/rejected": -0.06760205328464508, "step": 3150 }, { "epoch": 0.83, "learning_rate": 4.4234070597637455e-07, "logits/chosen": 0.9090474843978882, "logits/rejected": 1.0826170444488525, "logps/chosen": -270.742919921875, "logps/rejected": -254.69363403320312, "loss": 0.0316, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.029270146042108536, "rewards/margins": 0.04612868279218674, "rewards/rejected": -0.07539881765842438, "step": 3160 }, { "epoch": 0.83, "learning_rate": 4.2945221131440783e-07, "logits/chosen": 0.975232720375061, "logits/rejected": 1.02248215675354, "logps/chosen": -245.90774536132812, "logps/rejected": -221.59896850585938, "loss": 0.045, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.025399979203939438, "rewards/margins": 0.0584503710269928, "rewards/rejected": -0.08385033905506134, "step": 3170 }, { "epoch": 0.83, "learning_rate": 4.167366067969381e-07, "logits/chosen": 0.9106703996658325, "logits/rejected": 0.9676691293716431, "logps/chosen": -272.5838317871094, "logps/rejected": -259.0162658691406, "loss": 0.0395, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.017812054604291916, "rewards/margins": 0.05891140550374985, "rewards/rejected": -0.07672347128391266, "step": 3180 }, { "epoch": 0.83, "learning_rate": 4.041949541732826e-07, "logits/chosen": 0.9986382722854614, "logits/rejected": 1.0691004991531372, "logps/chosen": -296.02490234375, "logps/rejected": -245.5988311767578, "loss": 0.0447, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.02657441236078739, "rewards/margins": 0.05036981776356697, "rewards/rejected": -0.07694423198699951, "step": 3190 }, { "epoch": 0.84, "learning_rate": 3.9182830066782614e-07, "logits/chosen": 0.97566157579422, "logits/rejected": 0.9876262545585632, "logps/chosen": -282.0150146484375, "logps/rejected": -245.9921875, "loss": 0.0412, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03303904086351395, "rewards/margins": 0.04882500693202019, "rewards/rejected": -0.08186405152082443, "step": 3200 }, { "epoch": 0.84, "eval_logits/chosen": 0.9313199520111084, "eval_logits/rejected": 1.0337715148925781, "eval_logps/chosen": -280.28759765625, "eval_logps/rejected": -252.42367553710938, "eval_loss": 0.03475377336144447, "eval_rewards/accuracies": 0.6259999871253967, "eval_rewards/chosen": -0.027407577261328697, "eval_rewards/margins": 0.05866991728544235, "eval_rewards/rejected": -0.0860774889588356, "eval_runtime": 539.1084, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.927, "step": 3200 }, { "epoch": 0.84, "learning_rate": 3.796376788925771e-07, "logits/chosen": 0.9154554605484009, "logits/rejected": 1.0822858810424805, "logps/chosen": -247.672119140625, "logps/rejected": -225.2171630859375, "loss": 0.0476, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03178101405501366, "rewards/margins": 0.0625949501991272, "rewards/rejected": -0.09437596052885056, "step": 3210 }, { "epoch": 0.84, "learning_rate": 3.676241067609465e-07, "logits/chosen": 0.9667531847953796, "logits/rejected": 0.9605924487113953, "logps/chosen": -242.40115356445312, "logps/rejected": -237.51455688476562, "loss": 0.0326, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.02729959413409233, "rewards/margins": 0.06457889080047607, "rewards/rejected": -0.0918785035610199, "step": 3220 }, { "epoch": 0.85, "learning_rate": 3.5578858740274976e-07, "logits/chosen": 0.9239116907119751, "logits/rejected": 1.0316154956817627, "logps/chosen": -300.83447265625, "logps/rejected": -260.6551513671875, "loss": 0.035, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.020300351083278656, "rewards/margins": 0.06348638236522675, "rewards/rejected": -0.0837867259979248, "step": 3230 }, { "epoch": 0.85, "learning_rate": 3.44132109080447e-07, "logits/chosen": 0.9275467991828918, "logits/rejected": 1.080038070678711, "logps/chosen": -259.93646240234375, "logps/rejected": -237.24990844726562, "loss": 0.0382, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03450951725244522, "rewards/margins": 0.05054662749171257, "rewards/rejected": -0.08505614101886749, "step": 3240 }, { "epoch": 0.85, "learning_rate": 3.3265564510662344e-07, "logits/chosen": 0.9719650149345398, "logits/rejected": 1.0536506175994873, "logps/chosen": -254.2252960205078, "logps/rejected": -218.9423828125, "loss": 0.0369, "rewards/accuracies": 0.625, "rewards/chosen": -0.024826010689139366, "rewards/margins": 0.05269388109445572, "rewards/rejected": -0.07751990109682083, "step": 3250 }, { "epoch": 0.85, "learning_rate": 3.213601537627195e-07, "logits/chosen": 0.9981171488761902, "logits/rejected": 1.0061393976211548, "logps/chosen": -260.99664306640625, "logps/rejected": -252.4512939453125, "loss": 0.0306, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.028290893882513046, "rewards/margins": 0.059080712497234344, "rewards/rejected": -0.08737160265445709, "step": 3260 }, { "epoch": 0.86, "learning_rate": 3.1024657821901063e-07, "logits/chosen": 0.9618428945541382, "logits/rejected": 0.9869762659072876, "logps/chosen": -255.50851440429688, "logps/rejected": -237.8030548095703, "loss": 0.0263, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.023725476115942, "rewards/margins": 0.06770970672369003, "rewards/rejected": -0.09143517911434174, "step": 3270 }, { "epoch": 0.86, "learning_rate": 2.9931584645585654e-07, "logits/chosen": 1.0205438137054443, "logits/rejected": 1.0499489307403564, "logps/chosen": -285.1611328125, "logps/rejected": -280.9693908691406, "loss": 0.0364, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02360624074935913, "rewards/margins": 0.07233406603336334, "rewards/rejected": -0.09594030678272247, "step": 3280 }, { "epoch": 0.86, "learning_rate": 2.885688711862136e-07, "logits/chosen": 0.9615311622619629, "logits/rejected": 1.010024905204773, "logps/chosen": -257.10546875, "logps/rejected": -222.49844360351562, "loss": 0.0356, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.022061806172132492, "rewards/margins": 0.044591888785362244, "rewards/rejected": -0.06665369868278503, "step": 3290 }, { "epoch": 0.86, "learning_rate": 2.7800654977942486e-07, "logits/chosen": 0.9196559190750122, "logits/rejected": 1.0562456846237183, "logps/chosen": -294.05804443359375, "logps/rejected": -225.5612030029297, "loss": 0.0485, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.020386073738336563, "rewards/margins": 0.06678882986307144, "rewards/rejected": -0.0871749073266983, "step": 3300 }, { "epoch": 0.86, "eval_logits/chosen": 0.9336137771606445, "eval_logits/rejected": 1.0358667373657227, "eval_logps/chosen": -279.9648132324219, "eval_logps/rejected": -252.05458068847656, "eval_loss": 0.034665048122406006, "eval_rewards/accuracies": 0.6269999742507935, "eval_rewards/chosen": -0.02418021857738495, "eval_rewards/margins": 0.05820634588599205, "eval_rewards/rejected": -0.0823865681886673, "eval_runtime": 538.8853, "eval_samples_per_second": 3.711, "eval_steps_per_second": 0.928, "step": 3300 }, { "epoch": 0.87, "learning_rate": 2.6762976418628797e-07, "logits/chosen": 0.9499009847640991, "logits/rejected": 0.961447536945343, "logps/chosen": -252.6084747314453, "logps/rejected": -231.51760864257812, "loss": 0.0302, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.020922133699059486, "rewards/margins": 0.055929750204086304, "rewards/rejected": -0.07685188204050064, "step": 3310 }, { "epoch": 0.87, "learning_rate": 2.5743938086541354e-07, "logits/chosen": 0.9769983291625977, "logits/rejected": 1.0180495977401733, "logps/chosen": -297.083740234375, "logps/rejected": -251.3445281982422, "loss": 0.0258, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.013064468279480934, "rewards/margins": 0.08387977629899979, "rewards/rejected": -0.09694425016641617, "step": 3320 }, { "epoch": 0.87, "learning_rate": 2.4743625071087574e-07, "logits/chosen": 0.9248872995376587, "logits/rejected": 1.083939552307129, "logps/chosen": -269.6517639160156, "logps/rejected": -246.59970092773438, "loss": 0.0377, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.029898881912231445, "rewards/margins": 0.06900982558727264, "rewards/rejected": -0.09890870004892349, "step": 3330 }, { "epoch": 0.87, "learning_rate": 2.3762120898116498e-07, "logits/chosen": 0.9920533895492554, "logits/rejected": 1.0261324644088745, "logps/chosen": -267.4979553222656, "logps/rejected": -244.2932891845703, "loss": 0.0377, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0195697583258152, "rewards/margins": 0.03883281350135803, "rewards/rejected": -0.058402568101882935, "step": 3340 }, { "epoch": 0.88, "learning_rate": 2.2799507522944048e-07, "logits/chosen": 1.0060142278671265, "logits/rejected": 1.0250886678695679, "logps/chosen": -296.4592590332031, "logps/rejected": -239.81381225585938, "loss": 0.0321, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.026712555438280106, "rewards/margins": 0.03180098533630371, "rewards/rejected": -0.058513544499874115, "step": 3350 }, { "epoch": 0.88, "learning_rate": 2.1855865323510056e-07, "logits/chosen": 0.978489100933075, "logits/rejected": 1.032293677330017, "logps/chosen": -289.8194580078125, "logps/rejected": -266.70330810546875, "loss": 0.032, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.015113875269889832, "rewards/margins": 0.06603299081325531, "rewards/rejected": -0.08114685118198395, "step": 3360 }, { "epoch": 0.88, "learning_rate": 2.0931273093666575e-07, "logits/chosen": 1.0238964557647705, "logits/rejected": 1.0438212156295776, "logps/chosen": -267.2007141113281, "logps/rejected": -265.8624572753906, "loss": 0.0403, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.018140006810426712, "rewards/margins": 0.06415996700525284, "rewards/rejected": -0.08229997754096985, "step": 3370 }, { "epoch": 0.88, "learning_rate": 2.002580803659873e-07, "logits/chosen": 1.0145037174224854, "logits/rejected": 1.0263590812683105, "logps/chosen": -293.4837951660156, "logps/rejected": -262.3972473144531, "loss": 0.0357, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.03515145182609558, "rewards/margins": 0.043677233159542084, "rewards/rejected": -0.07882869243621826, "step": 3380 }, { "epoch": 0.89, "learning_rate": 1.913954575837826e-07, "logits/chosen": 1.0240306854248047, "logits/rejected": 1.094536542892456, "logps/chosen": -291.9563293457031, "logps/rejected": -263.8965148925781, "loss": 0.048, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.02634221874177456, "rewards/margins": 0.06700630486011505, "rewards/rejected": -0.09334851801395416, "step": 3390 }, { "epoch": 0.89, "learning_rate": 1.827256026165028e-07, "logits/chosen": 1.024501919746399, "logits/rejected": 1.0359153747558594, "logps/chosen": -271.8841247558594, "logps/rejected": -239.6262664794922, "loss": 0.0376, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.026435798034071922, "rewards/margins": 0.07295812666416168, "rewards/rejected": -0.09939391911029816, "step": 3400 }, { "epoch": 0.89, "eval_logits/chosen": 0.9353539347648621, "eval_logits/rejected": 1.0377308130264282, "eval_logps/chosen": -280.1902160644531, "eval_logps/rejected": -252.35890197753906, "eval_loss": 0.03463303670287132, "eval_rewards/accuracies": 0.6309999823570251, "eval_rewards/chosen": -0.026434103026986122, "eval_rewards/margins": 0.0589958056807518, "eval_rewards/rejected": -0.08542990684509277, "eval_runtime": 538.4849, "eval_samples_per_second": 3.714, "eval_steps_per_second": 0.929, "step": 3400 }, { "epoch": 0.89, "learning_rate": 1.7424923939454274e-07, "logits/chosen": 1.0309066772460938, "logits/rejected": 1.0446019172668457, "logps/chosen": -282.263427734375, "logps/rejected": -256.1120300292969, "loss": 0.0385, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02827179990708828, "rewards/margins": 0.05899345874786377, "rewards/rejected": -0.0872652679681778, "step": 3410 }, { "epoch": 0.9, "learning_rate": 1.6596707569179304e-07, "logits/chosen": 0.9415512084960938, "logits/rejected": 1.0031477212905884, "logps/chosen": -267.2585754394531, "logps/rejected": -228.9310302734375, "loss": 0.0309, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.020049545913934708, "rewards/margins": 0.05747220665216446, "rewards/rejected": -0.07752174139022827, "step": 3420 }, { "epoch": 0.9, "learning_rate": 1.578798030665385e-07, "logits/chosen": 0.9566577076911926, "logits/rejected": 0.9886308908462524, "logps/chosen": -287.398193359375, "logps/rejected": -231.68643188476562, "loss": 0.0351, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.030035072937607765, "rewards/margins": 0.03915448114275932, "rewards/rejected": -0.06918954849243164, "step": 3430 }, { "epoch": 0.9, "learning_rate": 1.499880968037165e-07, "logits/chosen": 0.9171876907348633, "logits/rejected": 1.0982364416122437, "logps/chosen": -283.79156494140625, "logps/rejected": -253.9375, "loss": 0.0395, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.02584235928952694, "rewards/margins": 0.07793084532022476, "rewards/rejected": -0.10377321392297745, "step": 3440 }, { "epoch": 0.9, "learning_rate": 1.4229261585852805e-07, "logits/chosen": 0.9618092775344849, "logits/rejected": 1.1071628332138062, "logps/chosen": -258.87579345703125, "logps/rejected": -243.849365234375, "loss": 0.0413, "rewards/accuracies": 0.65625, "rewards/chosen": -0.031573228538036346, "rewards/margins": 0.07379934191703796, "rewards/rejected": -0.10537256300449371, "step": 3450 }, { "epoch": 0.91, "learning_rate": 1.3479400280141886e-07, "logits/chosen": 0.9881182909011841, "logits/rejected": 0.9810377359390259, "logps/chosen": -240.4054718017578, "logps/rejected": -220.0426483154297, "loss": 0.0372, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.019237659871578217, "rewards/margins": 0.041987188160419464, "rewards/rejected": -0.06122484803199768, "step": 3460 }, { "epoch": 0.91, "learning_rate": 1.2749288376442044e-07, "logits/chosen": 1.032110571861267, "logits/rejected": 1.0473930835723877, "logps/chosen": -249.2979736328125, "logps/rejected": -242.74606323242188, "loss": 0.0334, "rewards/accuracies": 0.59375, "rewards/chosen": -0.030945682898163795, "rewards/margins": 0.05242834612727165, "rewards/rejected": -0.0833740234375, "step": 3470 }, { "epoch": 0.91, "learning_rate": 1.203898683888713e-07, "logits/chosen": 0.9928043484687805, "logits/rejected": 1.0439598560333252, "logps/chosen": -255.74020385742188, "logps/rejected": -241.57583618164062, "loss": 0.0381, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03444141149520874, "rewards/margins": 0.06323965638875961, "rewards/rejected": -0.09768106043338776, "step": 3480 }, { "epoch": 0.91, "learning_rate": 1.1348554977451132e-07, "logits/chosen": 1.0385875701904297, "logits/rejected": 1.0472261905670166, "logps/chosen": -243.3946075439453, "logps/rejected": -253.4709014892578, "loss": 0.0353, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03410564363002777, "rewards/margins": 0.0681803822517395, "rewards/rejected": -0.10228602588176727, "step": 3490 }, { "epoch": 0.92, "learning_rate": 1.0678050442995802e-07, "logits/chosen": 0.9589599370956421, "logits/rejected": 0.9904863238334656, "logps/chosen": -262.4172058105469, "logps/rejected": -220.4215545654297, "loss": 0.0352, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03152045980095863, "rewards/margins": 0.0699472576379776, "rewards/rejected": -0.10146770626306534, "step": 3500 }, { "epoch": 0.92, "eval_logits/chosen": 0.9392337203025818, "eval_logits/rejected": 1.0417654514312744, "eval_logps/chosen": -280.20367431640625, "eval_logps/rejected": -252.37255859375, "eval_loss": 0.03462912142276764, "eval_rewards/accuracies": 0.6259999871253967, "eval_rewards/chosen": -0.026568960398435593, "eval_rewards/margins": 0.058997511863708496, "eval_rewards/rejected": -0.08556646853685379, "eval_runtime": 538.5332, "eval_samples_per_second": 3.714, "eval_steps_per_second": 0.928, "step": 3500 }, { "epoch": 0.92, "learning_rate": 1.0027529222456755e-07, "logits/chosen": 0.9966568946838379, "logits/rejected": 0.9782639741897583, "logps/chosen": -264.34344482421875, "logps/rejected": -254.0737762451172, "loss": 0.0298, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.030217718333005905, "rewards/margins": 0.05717161297798157, "rewards/rejected": -0.08738932758569717, "step": 3510 }, { "epoch": 0.92, "learning_rate": 9.397045634168766e-08, "logits/chosen": 0.933295726776123, "logits/rejected": 1.088179349899292, "logps/chosen": -275.3388977050781, "logps/rejected": -235.18685913085938, "loss": 0.0367, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.014920748770236969, "rewards/margins": 0.06524350494146347, "rewards/rejected": -0.08016424626111984, "step": 3520 }, { "epoch": 0.92, "learning_rate": 8.78665232332998e-08, "logits/chosen": 0.9489814639091492, "logits/rejected": 1.066748857498169, "logps/chosen": -260.3916320800781, "logps/rejected": -258.72528076171875, "loss": 0.036, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.023377668112516403, "rewards/margins": 0.05880703777074814, "rewards/rejected": -0.08218470215797424, "step": 3530 }, { "epoch": 0.93, "learning_rate": 8.196400257606208e-08, "logits/chosen": 0.9277578592300415, "logits/rejected": 1.0134170055389404, "logps/chosen": -279.0546875, "logps/rejected": -249.3377685546875, "loss": 0.0347, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.018648769706487656, "rewards/margins": 0.04787365719676018, "rewards/rejected": -0.06652243435382843, "step": 3540 }, { "epoch": 0.93, "learning_rate": 7.626338722875076e-08, "logits/chosen": 1.015367031097412, "logits/rejected": 0.9656025171279907, "logps/chosen": -280.31109619140625, "logps/rejected": -254.50277709960938, "loss": 0.0347, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03300374746322632, "rewards/margins": 0.050588060170412064, "rewards/rejected": -0.08359180390834808, "step": 3550 }, { "epoch": 0.93, "learning_rate": 7.076515319110688e-08, "logits/chosen": 1.0340051651000977, "logits/rejected": 1.0160043239593506, "logps/chosen": -292.3948669433594, "logps/rejected": -252.76358032226562, "loss": 0.0371, "rewards/accuracies": 0.625, "rewards/chosen": -0.02604197897017002, "rewards/margins": 0.05999482423067093, "rewards/rejected": -0.0860368013381958, "step": 3560 }, { "epoch": 0.93, "learning_rate": 6.54697595640899e-08, "logits/chosen": 1.0719053745269775, "logits/rejected": 1.0086462497711182, "logps/chosen": -279.8869323730469, "logps/rejected": -261.8102111816406, "loss": 0.0361, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.023053551092743874, "rewards/margins": 0.05721823126077652, "rewards/rejected": -0.08027178794145584, "step": 3570 }, { "epoch": 0.94, "learning_rate": 6.037764851154426e-08, "logits/chosen": 0.9340742826461792, "logits/rejected": 1.0983483791351318, "logps/chosen": -279.68585205078125, "logps/rejected": -239.94100952148438, "loss": 0.0259, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.02659156359732151, "rewards/margins": 0.05400107428431511, "rewards/rejected": -0.08059263974428177, "step": 3580 }, { "epoch": 0.94, "learning_rate": 5.548924522327748e-08, "logits/chosen": 0.9091449975967407, "logits/rejected": 1.1083462238311768, "logps/chosen": -275.26678466796875, "logps/rejected": -258.33453369140625, "loss": 0.0285, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.019822830334305763, "rewards/margins": 0.06569498032331467, "rewards/rejected": -0.08551780879497528, "step": 3590 }, { "epoch": 0.94, "learning_rate": 5.0804957879556915e-08, "logits/chosen": 0.9512368440628052, "logits/rejected": 1.0698693990707397, "logps/chosen": -318.3247375488281, "logps/rejected": -260.92718505859375, "loss": 0.0379, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0346975177526474, "rewards/margins": 0.056962646543979645, "rewards/rejected": -0.09166016429662704, "step": 3600 }, { "epoch": 0.94, "eval_logits/chosen": 0.9390192031860352, "eval_logits/rejected": 1.0413662195205688, "eval_logps/chosen": -280.1781311035156, "eval_logps/rejected": -252.33770751953125, "eval_loss": 0.034663841128349304, "eval_rewards/accuracies": 0.6315000057220459, "eval_rewards/chosen": -0.026313286274671555, "eval_rewards/margins": 0.05890476703643799, "eval_rewards/rejected": -0.08521804958581924, "eval_runtime": 538.4137, "eval_samples_per_second": 3.715, "eval_steps_per_second": 0.929, "step": 3600 }, { "epoch": 0.94, "learning_rate": 4.632517761702815e-08, "logits/chosen": 1.0086432695388794, "logits/rejected": 1.0595029592514038, "logps/chosen": -285.149658203125, "logps/rejected": -251.66793823242188, "loss": 0.0434, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.018525902181863785, "rewards/margins": 0.078687384724617, "rewards/rejected": -0.09721329808235168, "step": 3610 }, { "epoch": 0.95, "learning_rate": 4.205027849605359e-08, "logits/chosen": 1.0005525350570679, "logits/rejected": 0.973240852355957, "logps/chosen": -231.69265747070312, "logps/rejected": -235.55581665039062, "loss": 0.0401, "rewards/accuracies": 0.59375, "rewards/chosen": -0.02195514738559723, "rewards/margins": 0.049081120640039444, "rewards/rejected": -0.07103626430034637, "step": 3620 }, { "epoch": 0.95, "learning_rate": 3.798061746947995e-08, "logits/chosen": 0.9860151410102844, "logits/rejected": 1.0056957006454468, "logps/chosen": -345.712890625, "logps/rejected": -263.0353088378906, "loss": 0.0229, "rewards/accuracies": 0.625, "rewards/chosen": -0.02045946940779686, "rewards/margins": 0.05285441130399704, "rewards/rejected": -0.0733138769865036, "step": 3630 }, { "epoch": 0.95, "learning_rate": 3.411653435283158e-08, "logits/chosen": 0.9643945693969727, "logits/rejected": 1.0569902658462524, "logps/chosen": -294.2444763183594, "logps/rejected": -257.2134704589844, "loss": 0.0379, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0335361547768116, "rewards/margins": 0.04753485321998596, "rewards/rejected": -0.08107100427150726, "step": 3640 }, { "epoch": 0.96, "learning_rate": 3.04583517959367e-08, "logits/chosen": 0.957710862159729, "logits/rejected": 0.983841598033905, "logps/chosen": -299.52471923828125, "logps/rejected": -250.6820831298828, "loss": 0.0363, "rewards/accuracies": 0.65625, "rewards/chosen": -0.008963306434452534, "rewards/margins": 0.08262725919485092, "rewards/rejected": -0.09159056842327118, "step": 3650 }, { "epoch": 0.96, "learning_rate": 2.7006375255985984e-08, "logits/chosen": 0.9700638055801392, "logits/rejected": 1.0240037441253662, "logps/chosen": -241.0752410888672, "logps/rejected": -215.42562866210938, "loss": 0.0298, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.026114290580153465, "rewards/margins": 0.048808712512254715, "rewards/rejected": -0.07492300122976303, "step": 3660 }, { "epoch": 0.96, "learning_rate": 2.3760892972027328e-08, "logits/chosen": 0.9013713002204895, "logits/rejected": 1.0410950183868408, "logps/chosen": -290.4670104980469, "logps/rejected": -238.00454711914062, "loss": 0.0394, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.023938851431012154, "rewards/margins": 0.06292831152677536, "rewards/rejected": -0.08686716854572296, "step": 3670 }, { "epoch": 0.96, "learning_rate": 2.072217594089765e-08, "logits/chosen": 1.0121930837631226, "logits/rejected": 0.9898314476013184, "logps/chosen": -247.4111785888672, "logps/rejected": -232.48184204101562, "loss": 0.0394, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.021979061886668205, "rewards/margins": 0.0453622080385685, "rewards/rejected": -0.06734126806259155, "step": 3680 }, { "epoch": 0.97, "learning_rate": 1.789047789459375e-08, "logits/chosen": 0.9309120178222656, "logits/rejected": 1.0076799392700195, "logps/chosen": -279.5455627441406, "logps/rejected": -257.4979248046875, "loss": 0.0377, "rewards/accuracies": 0.625, "rewards/chosen": -0.027569543570280075, "rewards/margins": 0.0605216808617115, "rewards/rejected": -0.08809121698141098, "step": 3690 }, { "epoch": 0.97, "learning_rate": 1.5266035279088708e-08, "logits/chosen": 0.9448448419570923, "logits/rejected": 1.0074546337127686, "logps/chosen": -320.0146179199219, "logps/rejected": -246.0650634765625, "loss": 0.0361, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.023909619078040123, "rewards/margins": 0.05834698677062988, "rewards/rejected": -0.08225660026073456, "step": 3700 }, { "epoch": 0.97, "eval_logits/chosen": 0.9376645088195801, "eval_logits/rejected": 1.0399267673492432, "eval_logps/chosen": -280.2046813964844, "eval_logps/rejected": -252.3740997314453, "eval_loss": 0.03461700677871704, "eval_rewards/accuracies": 0.6309999823570251, "eval_rewards/chosen": -0.026578795164823532, "eval_rewards/margins": 0.05900290608406067, "eval_rewards/rejected": -0.0855816975235939, "eval_runtime": 538.2979, "eval_samples_per_second": 3.715, "eval_steps_per_second": 0.929, "step": 3700 }, { "epoch": 0.97, "learning_rate": 1.2849067234584623e-08, "logits/chosen": 0.9224729537963867, "logits/rejected": 1.0151176452636719, "logps/chosen": -258.44451904296875, "logps/rejected": -240.2635498046875, "loss": 0.0317, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.02202828973531723, "rewards/margins": 0.055525414645671844, "rewards/rejected": -0.07755370438098907, "step": 3710 }, { "epoch": 0.97, "learning_rate": 1.0639775577218625e-08, "logits/chosen": 0.9609501957893372, "logits/rejected": 1.0031676292419434, "logps/chosen": -288.76519775390625, "logps/rejected": -266.21478271484375, "loss": 0.0313, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01723320782184601, "rewards/margins": 0.06596283614635468, "rewards/rejected": -0.08319603651762009, "step": 3720 }, { "epoch": 0.98, "learning_rate": 8.638344782207486e-09, "logits/chosen": 0.9164566993713379, "logits/rejected": 1.0248987674713135, "logps/chosen": -303.3111877441406, "logps/rejected": -282.0052185058594, "loss": 0.0371, "rewards/accuracies": 0.5625, "rewards/chosen": -0.036383308470249176, "rewards/margins": 0.0420096218585968, "rewards/rejected": -0.07839293777942657, "step": 3730 }, { "epoch": 0.98, "learning_rate": 6.84494196844715e-09, "logits/chosen": 0.9781646728515625, "logits/rejected": 1.0682637691497803, "logps/chosen": -265.3148193359375, "logps/rejected": -244.98489379882812, "loss": 0.0314, "rewards/accuracies": 0.625, "rewards/chosen": -0.016114359721541405, "rewards/margins": 0.0599919855594635, "rewards/rejected": -0.07610634714365005, "step": 3740 }, { "epoch": 0.98, "learning_rate": 5.259716884556121e-09, "logits/chosen": 0.9260244369506836, "logits/rejected": 1.0235192775726318, "logps/chosen": -311.13995361328125, "logps/rejected": -255.76925659179688, "loss": 0.0216, "rewards/accuracies": 0.6875, "rewards/chosen": -0.019463708624243736, "rewards/margins": 0.06179703399538994, "rewards/rejected": -0.08126074075698853, "step": 3750 }, { "epoch": 0.98, "learning_rate": 3.882801896372967e-09, "logits/chosen": 0.8902843594551086, "logits/rejected": 1.0316295623779297, "logps/chosen": -282.79034423828125, "logps/rejected": -261.2944641113281, "loss": 0.0333, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.024235766381025314, "rewards/margins": 0.04566502943634987, "rewards/rejected": -0.06990079581737518, "step": 3760 }, { "epoch": 0.99, "learning_rate": 2.7143119759026614e-09, "logits/chosen": 0.9450544118881226, "logits/rejected": 1.0837864875793457, "logps/chosen": -270.1015319824219, "logps/rejected": -273.8962707519531, "loss": 0.0297, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.007587161846458912, "rewards/margins": 0.06377876549959183, "rewards/rejected": -0.07136592268943787, "step": 3770 }, { "epoch": 0.99, "learning_rate": 1.754344691717591e-09, "logits/chosen": 0.9069304466247559, "logits/rejected": 0.979387640953064, "logps/chosen": -264.6932067871094, "logps/rejected": -218.8267822265625, "loss": 0.0347, "rewards/accuracies": 0.65625, "rewards/chosen": -0.011129969730973244, "rewards/margins": 0.06624534726142883, "rewards/rejected": -0.07737531512975693, "step": 3780 }, { "epoch": 0.99, "learning_rate": 1.0029802008096335e-09, "logits/chosen": 1.0251834392547607, "logits/rejected": 0.9546536207199097, "logps/chosen": -267.88116455078125, "logps/rejected": -228.50390625, "loss": 0.0348, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.026254802942276, "rewards/margins": 0.06516362726688385, "rewards/rejected": -0.09141843020915985, "step": 3790 }, { "epoch": 0.99, "learning_rate": 4.602812418974534e-10, "logits/chosen": 0.9103859066963196, "logits/rejected": 1.032867670059204, "logps/chosen": -254.2440643310547, "logps/rejected": -240.1820831298828, "loss": 0.0298, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.03087850846350193, "rewards/margins": 0.050477832555770874, "rewards/rejected": -0.08135633170604706, "step": 3800 }, { "epoch": 0.99, "eval_logits/chosen": 0.9387494921684265, "eval_logits/rejected": 1.0411853790283203, "eval_logps/chosen": -280.1766662597656, "eval_logps/rejected": -252.320068359375, "eval_loss": 0.03467794507741928, "eval_rewards/accuracies": 0.6274999976158142, "eval_rewards/chosen": -0.02629854343831539, "eval_rewards/margins": 0.058742720633745193, "eval_rewards/rejected": -0.08504127711057663, "eval_runtime": 538.3057, "eval_samples_per_second": 3.715, "eval_steps_per_second": 0.929, "step": 3800 }, { "epoch": 1.0, "learning_rate": 1.2629313018819312e-10, "logits/chosen": 0.9089424014091492, "logits/rejected": 1.0153186321258545, "logps/chosen": -302.22296142578125, "logps/rejected": -284.89483642578125, "loss": 0.0326, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.007386692799627781, "rewards/margins": 0.025576096028089523, "rewards/rejected": -0.03296279162168503, "step": 3810 }, { "epoch": 1.0, "learning_rate": 1.0437535929996855e-12, "logits/chosen": 0.9965023994445801, "logits/rejected": 0.9472867250442505, "logps/chosen": -285.3251037597656, "logps/rejected": -266.6712341308594, "loss": 0.035, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.012905353680253029, "rewards/margins": 0.030642932280898094, "rewards/rejected": -0.04354828968644142, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, "train_loss": 0.0001892112643969555, "train_runtime": 195.8108, "train_samples_per_second": 312.215, "train_steps_per_second": 19.514 } ], "logging_steps": 10, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }