diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6000 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 3821, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.3054830287206268e-08, + "logits/chosen": 0.9550814628601074, + "logits/rejected": 1.0664727687835693, + "logps/chosen": -190.47879028320312, + "logps/rejected": -177.6958770751953, + "loss": 0.1031, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.3054830287206266e-07, + "logits/chosen": 1.021599531173706, + "logits/rejected": 1.0737736225128174, + "logps/chosen": -277.8912048339844, + "logps/rejected": -268.34259033203125, + "loss": 0.0514, + "rewards/accuracies": 0.4166666567325592, + "rewards/chosen": 2.9820108466083184e-05, + "rewards/margins": 0.000656133983284235, + "rewards/rejected": -0.0006263138493523002, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 2.610966057441253e-07, + "logits/chosen": 1.0539672374725342, + "logits/rejected": 1.035296082496643, + "logps/chosen": -258.02105712890625, + "logps/rejected": -219.51577758789062, + "loss": 0.0679, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.00037693610647693276, + "rewards/margins": -0.0003669637371785939, + "rewards/rejected": -9.972270163416397e-06, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 3.9164490861618804e-07, + "logits/chosen": 0.9785920977592468, + "logits/rejected": 0.9956333041191101, + "logps/chosen": -234.4257354736328, + "logps/rejected": -216.3408660888672, + "loss": 0.0522, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.00037702807458117604, + "rewards/margins": 0.0003918584552593529, + "rewards/rejected": -0.0007688865880481899, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 5.221932114882506e-07, + "logits/chosen": 1.0598526000976562, + "logits/rejected": 1.0610239505767822, + "logps/chosen": -269.3299865722656, + "logps/rejected": -236.5482635498047, + "loss": 0.0646, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.0001463999942643568, + "rewards/margins": 0.0003663330862764269, + "rewards/rejected": -0.0005127330077812076, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 6.527415143603135e-07, + "logits/chosen": 1.0115251541137695, + "logits/rejected": 1.0492277145385742, + "logps/chosen": -245.1737518310547, + "logps/rejected": -241.9782257080078, + "loss": 0.053, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.0008561966242268682, + "rewards/margins": 0.00045777196646668017, + "rewards/rejected": -0.0013139685615897179, + "step": 50 + }, + { + "epoch": 0.02, + "learning_rate": 7.832898172323761e-07, + "logits/chosen": 0.9759989976882935, + "logits/rejected": 1.09335196018219, + "logps/chosen": -283.7034912109375, + "logps/rejected": -234.171142578125, + "loss": 0.0508, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.0002724650257732719, + "rewards/margins": 0.0006013559177517891, + "rewards/rejected": -0.0003288908628746867, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 9.138381201044387e-07, + "logits/chosen": 1.0061399936676025, + "logits/rejected": 1.0819300413131714, + "logps/chosen": -272.0354919433594, + "logps/rejected": -231.0594482421875, + "loss": 0.0533, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -0.0003669198777060956, + "rewards/margins": -0.0001511875307187438, + "rewards/rejected": -0.00021573244885075837, + "step": 70 + }, + { + "epoch": 0.02, + "learning_rate": 1.0443864229765013e-06, + "logits/chosen": 1.0220763683319092, + "logits/rejected": 1.0622212886810303, + "logps/chosen": -283.91650390625, + "logps/rejected": -261.65411376953125, + "loss": 0.0441, + "rewards/accuracies": 0.39375001192092896, + "rewards/chosen": -0.00052777084056288, + "rewards/margins": -0.0005939611000940204, + "rewards/rejected": 6.619028135901317e-05, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 1.1749347258485642e-06, + "logits/chosen": 1.0424718856811523, + "logits/rejected": 1.092550277709961, + "logps/chosen": -278.462890625, + "logps/rejected": -235.7613983154297, + "loss": 0.0596, + "rewards/accuracies": 0.3812499940395355, + "rewards/chosen": -0.0010484650265425444, + "rewards/margins": -0.0007164698326960206, + "rewards/rejected": -0.0003319952520541847, + "step": 90 + }, + { + "epoch": 0.03, + "learning_rate": 1.305483028720627e-06, + "logits/chosen": 0.9916040301322937, + "logits/rejected": 1.066935420036316, + "logps/chosen": -237.2812957763672, + "logps/rejected": -218.4796905517578, + "loss": 0.0659, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -4.606993752531707e-05, + "rewards/margins": 0.0002796413318719715, + "rewards/rejected": -0.000325711298501119, + "step": 100 + }, + { + "epoch": 0.03, + "eval_logits/chosen": 0.9710860252380371, + "eval_logits/rejected": 1.0635499954223633, + "eval_logps/chosen": -277.5683288574219, + "eval_logps/rejected": -243.89227294921875, + "eval_loss": 0.053576212376356125, + "eval_rewards/accuracies": 0.47450000047683716, + "eval_rewards/chosen": -0.00021531998936552554, + "eval_rewards/margins": 0.0005480629042722285, + "eval_rewards/rejected": -0.0007633829372934997, + "eval_runtime": 539.1486, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.927, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 1.4360313315926894e-06, + "logits/chosen": 0.9959138035774231, + "logits/rejected": 1.0810822248458862, + "logps/chosen": -283.58575439453125, + "logps/rejected": -250.1833038330078, + "loss": 0.0529, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.0003468608483672142, + "rewards/margins": 0.0019003556808456779, + "rewards/rejected": -0.0015534948324784636, + "step": 110 + }, + { + "epoch": 0.03, + "learning_rate": 1.5665796344647521e-06, + "logits/chosen": 1.0288623571395874, + "logits/rejected": 1.0744774341583252, + "logps/chosen": -227.82470703125, + "logps/rejected": -234.0697479248047, + "loss": 0.0857, + "rewards/accuracies": 0.39375001192092896, + "rewards/chosen": -0.0001596544898347929, + "rewards/margins": -0.0005016528302803636, + "rewards/rejected": 0.0003419983549974859, + "step": 120 + }, + { + "epoch": 0.03, + "learning_rate": 1.6971279373368146e-06, + "logits/chosen": 1.04789137840271, + "logits/rejected": 1.0942102670669556, + "logps/chosen": -282.67510986328125, + "logps/rejected": -239.3311309814453, + "loss": 0.0449, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.00027361814863979816, + "rewards/margins": 0.002355109201744199, + "rewards/rejected": -0.002081490820273757, + "step": 130 + }, + { + "epoch": 0.04, + "learning_rate": 1.8276762402088774e-06, + "logits/chosen": 1.0264707803726196, + "logits/rejected": 1.02583646774292, + "logps/chosen": -264.01715087890625, + "logps/rejected": -237.10549926757812, + "loss": 0.0474, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.0003395401581656188, + "rewards/margins": 0.001898492919281125, + "rewards/rejected": -0.0015589528484269977, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 1.9582245430809403e-06, + "logits/chosen": 1.0069233179092407, + "logits/rejected": 1.0264513492584229, + "logps/chosen": -262.6693420410156, + "logps/rejected": -235.0095977783203, + "loss": 0.0641, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.00045030430192127824, + "rewards/margins": 0.0019102304941043258, + "rewards/rejected": -0.0014599261339753866, + "step": 150 + }, + { + "epoch": 0.04, + "learning_rate": 2.0887728459530026e-06, + "logits/chosen": 0.9561678171157837, + "logits/rejected": 1.085860252380371, + "logps/chosen": -258.2762451171875, + "logps/rejected": -240.168701171875, + "loss": 0.0571, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0003362145507708192, + "rewards/margins": 0.002913826610893011, + "rewards/rejected": -0.0025776117108762264, + "step": 160 + }, + { + "epoch": 0.04, + "learning_rate": 2.2193211488250653e-06, + "logits/chosen": 0.996711254119873, + "logits/rejected": 1.0722663402557373, + "logps/chosen": -268.49578857421875, + "logps/rejected": -218.3070831298828, + "loss": 0.0502, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.0006658965139649808, + "rewards/margins": 0.0029082505498081446, + "rewards/rejected": -0.0022423542104661465, + "step": 170 + }, + { + "epoch": 0.05, + "learning_rate": 2.3498694516971284e-06, + "logits/chosen": 0.9853906631469727, + "logits/rejected": 1.033320665359497, + "logps/chosen": -272.53961181640625, + "logps/rejected": -237.8509979248047, + "loss": 0.0638, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 0.0014408377464860678, + "rewards/margins": 0.003102297894656658, + "rewards/rejected": -0.0016614599153399467, + "step": 180 + }, + { + "epoch": 0.05, + "learning_rate": 2.4804177545691907e-06, + "logits/chosen": 0.9930588006973267, + "logits/rejected": 1.0107576847076416, + "logps/chosen": -269.4462890625, + "logps/rejected": -235.57852172851562, + "loss": 0.0604, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0017847366398200393, + "rewards/margins": 0.004315118305385113, + "rewards/rejected": -0.0025303815491497517, + "step": 190 + }, + { + "epoch": 0.05, + "learning_rate": 2.610966057441254e-06, + "logits/chosen": 1.0219703912734985, + "logits/rejected": 1.1328296661376953, + "logps/chosen": -278.0802917480469, + "logps/rejected": -249.68466186523438, + "loss": 0.0597, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.004124884493649006, + "rewards/margins": 0.005005924496799707, + "rewards/rejected": -0.000881039712112397, + "step": 200 + }, + { + "epoch": 0.05, + "eval_logits/chosen": 0.9688093662261963, + "eval_logits/rejected": 1.0617414712905884, + "eval_logps/chosen": -277.1978759765625, + "eval_logps/rejected": -243.9651336669922, + "eval_loss": 0.05182640627026558, + "eval_rewards/accuracies": 0.5879999995231628, + "eval_rewards/chosen": 0.0034893574193120003, + "eval_rewards/margins": 0.00498173339292407, + "eval_rewards/rejected": -0.0014923758571967483, + "eval_runtime": 539.156, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.927, + "step": 200 + }, + { + "epoch": 0.05, + "learning_rate": 2.741514360313316e-06, + "logits/chosen": 1.0123722553253174, + "logits/rejected": 1.0923728942871094, + "logps/chosen": -260.80499267578125, + "logps/rejected": -233.2253875732422, + "loss": 0.0394, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.002722758101299405, + "rewards/margins": 0.004253658466041088, + "rewards/rejected": -0.0015309008304029703, + "step": 210 + }, + { + "epoch": 0.06, + "learning_rate": 2.872062663185379e-06, + "logits/chosen": 1.040575385093689, + "logits/rejected": 1.1116924285888672, + "logps/chosen": -277.50433349609375, + "logps/rejected": -243.0937042236328, + "loss": 0.0443, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.004540332593023777, + "rewards/margins": 0.005531441420316696, + "rewards/rejected": -0.0009911099914461374, + "step": 220 + }, + { + "epoch": 0.06, + "learning_rate": 3.0026109660574416e-06, + "logits/chosen": 1.029170036315918, + "logits/rejected": 1.0374505519866943, + "logps/chosen": -268.8113708496094, + "logps/rejected": -275.160400390625, + "loss": 0.0402, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.005796975456178188, + "rewards/margins": 0.006743866018950939, + "rewards/rejected": -0.0009468902135267854, + "step": 230 + }, + { + "epoch": 0.06, + "learning_rate": 3.1331592689295043e-06, + "logits/chosen": 1.027822732925415, + "logits/rejected": 1.0454634428024292, + "logps/chosen": -271.84796142578125, + "logps/rejected": -231.60018920898438, + "loss": 0.0469, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.009604343213140965, + "rewards/margins": 0.010616883635520935, + "rewards/rejected": -0.001012541470117867, + "step": 240 + }, + { + "epoch": 0.07, + "learning_rate": 3.263707571801567e-06, + "logits/chosen": 0.9246234893798828, + "logits/rejected": 1.081726312637329, + "logps/chosen": -262.35052490234375, + "logps/rejected": -207.2003631591797, + "loss": 0.0461, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.005290716886520386, + "rewards/margins": 0.007529892958700657, + "rewards/rejected": -0.002239175606518984, + "step": 250 + }, + { + "epoch": 0.07, + "learning_rate": 3.3942558746736293e-06, + "logits/chosen": 1.0157970190048218, + "logits/rejected": 1.0061004161834717, + "logps/chosen": -255.8148956298828, + "logps/rejected": -249.33810424804688, + "loss": 0.0541, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.005560068879276514, + "rewards/margins": 0.00910879485309124, + "rewards/rejected": -0.003548725973814726, + "step": 260 + }, + { + "epoch": 0.07, + "learning_rate": 3.524804177545692e-06, + "logits/chosen": 0.9183789491653442, + "logits/rejected": 1.0651085376739502, + "logps/chosen": -250.3922119140625, + "logps/rejected": -225.31845092773438, + "loss": 0.0573, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.004733686335384846, + "rewards/margins": 0.010133610107004642, + "rewards/rejected": -0.005399924702942371, + "step": 270 + }, + { + "epoch": 0.07, + "learning_rate": 3.6553524804177547e-06, + "logits/chosen": 0.9533087015151978, + "logits/rejected": 0.9936316609382629, + "logps/chosen": -262.45989990234375, + "logps/rejected": -245.0988006591797, + "loss": 0.0652, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.009047111496329308, + "rewards/margins": 0.01569160632789135, + "rewards/rejected": -0.006644496228545904, + "step": 280 + }, + { + "epoch": 0.08, + "learning_rate": 3.7859007832898174e-06, + "logits/chosen": 0.9586070775985718, + "logits/rejected": 1.0487323999404907, + "logps/chosen": -258.58087158203125, + "logps/rejected": -229.2060546875, + "loss": 0.0527, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.009910664521157742, + "rewards/margins": 0.01812123879790306, + "rewards/rejected": -0.008210571482777596, + "step": 290 + }, + { + "epoch": 0.08, + "learning_rate": 3.9164490861618806e-06, + "logits/chosen": 0.9238823056221008, + "logits/rejected": 1.0459530353546143, + "logps/chosen": -257.22900390625, + "logps/rejected": -227.42770385742188, + "loss": 0.0564, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.008343839086592197, + "rewards/margins": 0.01660408265888691, + "rewards/rejected": -0.008260244503617287, + "step": 300 + }, + { + "epoch": 0.08, + "eval_logits/chosen": 0.9498724341392517, + "eval_logits/rejected": 1.0439953804016113, + "eval_logps/chosen": -276.5095520019531, + "eval_logps/rejected": -244.6271514892578, + "eval_loss": 0.047470785677433014, + "eval_rewards/accuracies": 0.6175000071525574, + "eval_rewards/chosen": 0.010372455231845379, + "eval_rewards/margins": 0.018484672531485558, + "eval_rewards/rejected": -0.008112218230962753, + "eval_runtime": 539.0567, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.928, + "step": 300 + }, + { + "epoch": 0.08, + "learning_rate": 4.046997389033943e-06, + "logits/chosen": 0.9125510454177856, + "logits/rejected": 1.0743194818496704, + "logps/chosen": -256.52203369140625, + "logps/rejected": -227.2289581298828, + "loss": 0.0457, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.013696533627808094, + "rewards/margins": 0.022891724482178688, + "rewards/rejected": -0.009195187129080296, + "step": 310 + }, + { + "epoch": 0.08, + "learning_rate": 4.177545691906005e-06, + "logits/chosen": 0.8873203992843628, + "logits/rejected": 1.0165441036224365, + "logps/chosen": -282.78265380859375, + "logps/rejected": -257.1055603027344, + "loss": 0.0416, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.009485239163041115, + "rewards/margins": 0.01556326448917389, + "rewards/rejected": -0.006078026257455349, + "step": 320 + }, + { + "epoch": 0.09, + "learning_rate": 4.308093994778068e-06, + "logits/chosen": 0.967635989189148, + "logits/rejected": 1.0755988359451294, + "logps/chosen": -278.8580017089844, + "logps/rejected": -244.1697235107422, + "loss": 0.0547, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.0037938461173325777, + "rewards/margins": 0.02032056823372841, + "rewards/rejected": -0.01652671955525875, + "step": 330 + }, + { + "epoch": 0.09, + "learning_rate": 4.4386422976501306e-06, + "logits/chosen": 0.9581148028373718, + "logits/rejected": 0.9901423454284668, + "logps/chosen": -274.140625, + "logps/rejected": -268.93115234375, + "loss": 0.0454, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.004858436528593302, + "rewards/margins": 0.02044074237346649, + "rewards/rejected": -0.015582305379211903, + "step": 340 + }, + { + "epoch": 0.09, + "learning_rate": 4.569190600522193e-06, + "logits/chosen": 0.974514365196228, + "logits/rejected": 0.9625232815742493, + "logps/chosen": -284.48089599609375, + "logps/rejected": -250.8555908203125, + "loss": 0.0553, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.008898411877453327, + "rewards/margins": 0.02419520542025566, + "rewards/rejected": -0.015296794474124908, + "step": 350 + }, + { + "epoch": 0.09, + "learning_rate": 4.699738903394257e-06, + "logits/chosen": 0.9241229295730591, + "logits/rejected": 1.0142980813980103, + "logps/chosen": -301.9035949707031, + "logps/rejected": -258.56298828125, + "loss": 0.0361, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.00739449355751276, + "rewards/margins": 0.02596624568104744, + "rewards/rejected": -0.018571753054857254, + "step": 360 + }, + { + "epoch": 0.1, + "learning_rate": 4.8302872062663196e-06, + "logits/chosen": 0.9622675180435181, + "logits/rejected": 0.9503853917121887, + "logps/chosen": -305.0982971191406, + "logps/rejected": -260.784423828125, + "loss": 0.0439, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.008727970533072948, + "rewards/margins": 0.026681995019316673, + "rewards/rejected": -0.01795402355492115, + "step": 370 + }, + { + "epoch": 0.1, + "learning_rate": 4.9608355091383814e-06, + "logits/chosen": 0.9401235580444336, + "logits/rejected": 1.042701005935669, + "logps/chosen": -255.1713409423828, + "logps/rejected": -223.62197875976562, + "loss": 0.0635, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.010734304785728455, + "rewards/margins": 0.0330788716673851, + "rewards/rejected": -0.022344566881656647, + "step": 380 + }, + { + "epoch": 0.1, + "learning_rate": 4.9999488562447675e-06, + "logits/chosen": 0.9535024762153625, + "logits/rejected": 0.9772897958755493, + "logps/chosen": -298.8131103515625, + "logps/rejected": -256.53302001953125, + "loss": 0.0383, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.01018481608480215, + "rewards/margins": 0.03526074439287186, + "rewards/rejected": -0.02507592737674713, + "step": 390 + }, + { + "epoch": 0.1, + "learning_rate": 4.999698361256577e-06, + "logits/chosen": 0.9641995429992676, + "logits/rejected": 0.9660250544548035, + "logps/chosen": -278.9350891113281, + "logps/rejected": -263.6481628417969, + "loss": 0.0402, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.0038296219427138567, + "rewards/margins": 0.0380658321082592, + "rewards/rejected": -0.03423621878027916, + "step": 400 + }, + { + "epoch": 0.1, + "eval_logits/chosen": 0.8994618058204651, + "eval_logits/rejected": 0.9931817650794983, + "eval_logps/chosen": -277.37713623046875, + "eval_logps/rejected": -246.910888671875, + "eval_loss": 0.04383732005953789, + "eval_rewards/accuracies": 0.6324999928474426, + "eval_rewards/chosen": 0.0016969649586826563, + "eval_rewards/margins": 0.03264675661921501, + "eval_rewards/rejected": -0.03094978630542755, + "eval_runtime": 539.0327, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.928, + "step": 400 + }, + { + "epoch": 0.11, + "learning_rate": 4.999239142174581e-06, + "logits/chosen": 0.8831006288528442, + "logits/rejected": 0.8935713768005371, + "logps/chosen": -293.3919982910156, + "logps/rejected": -245.70181274414062, + "loss": 0.0399, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0013237474486231804, + "rewards/margins": 0.03327140584588051, + "rewards/rejected": -0.03194766119122505, + "step": 410 + }, + { + "epoch": 0.11, + "learning_rate": 4.99857123734344e-06, + "logits/chosen": 0.9195354580879211, + "logits/rejected": 1.0214719772338867, + "logps/chosen": -278.9452209472656, + "logps/rejected": -246.7372589111328, + "loss": 0.0479, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.0041291858069598675, + "rewards/margins": 0.03341008350253105, + "rewards/rejected": -0.03753926604986191, + "step": 420 + }, + { + "epoch": 0.11, + "learning_rate": 4.997694702533016e-06, + "logits/chosen": 0.8224805593490601, + "logits/rejected": 0.9571186900138855, + "logps/chosen": -265.44757080078125, + "logps/rejected": -260.35748291015625, + "loss": 0.0417, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.01658559963107109, + "rewards/margins": 0.031039753928780556, + "rewards/rejected": -0.0476253516972065, + "step": 430 + }, + { + "epoch": 0.12, + "learning_rate": 4.996609610933713e-06, + "logits/chosen": 0.8995935320854187, + "logits/rejected": 0.8752401471138, + "logps/chosen": -283.92718505859375, + "logps/rejected": -262.1937561035156, + "loss": 0.0446, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.02005813643336296, + "rewards/margins": 0.027436578646302223, + "rewards/rejected": -0.04749471694231033, + "step": 440 + }, + { + "epoch": 0.12, + "learning_rate": 4.995316053150366e-06, + "logits/chosen": 0.8733296394348145, + "logits/rejected": 0.9702059626579285, + "logps/chosen": -263.5385437011719, + "logps/rejected": -237.3717803955078, + "loss": 0.035, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.02067594602704048, + "rewards/margins": 0.025421470403671265, + "rewards/rejected": -0.046097420156002045, + "step": 450 + }, + { + "epoch": 0.12, + "learning_rate": 4.9938141371946815e-06, + "logits/chosen": 0.9152857661247253, + "logits/rejected": 1.0352412462234497, + "logps/chosen": -252.8705596923828, + "logps/rejected": -253.1604766845703, + "loss": 0.0578, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.00973983108997345, + "rewards/margins": 0.04021871089935303, + "rewards/rejected": -0.049958545714616776, + "step": 460 + }, + { + "epoch": 0.12, + "learning_rate": 4.992103988476206e-06, + "logits/chosen": 0.9032732844352722, + "logits/rejected": 0.9913337826728821, + "logps/chosen": -296.672119140625, + "logps/rejected": -250.1068878173828, + "loss": 0.0611, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.013237145729362965, + "rewards/margins": 0.0339890792965889, + "rewards/rejected": -0.047226227819919586, + "step": 470 + }, + { + "epoch": 0.13, + "learning_rate": 4.990185749791866e-06, + "logits/chosen": 0.852449893951416, + "logits/rejected": 0.9530878067016602, + "logps/chosen": -244.0757293701172, + "logps/rejected": -213.8367919921875, + "loss": 0.041, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.016842521727085114, + "rewards/margins": 0.039869144558906555, + "rewards/rejected": -0.05671166256070137, + "step": 480 + }, + { + "epoch": 0.13, + "learning_rate": 4.9880595813140395e-06, + "logits/chosen": 0.8507224321365356, + "logits/rejected": 0.9859424829483032, + "logps/chosen": -276.9876403808594, + "logps/rejected": -239.509521484375, + "loss": 0.0438, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.022139808163046837, + "rewards/margins": 0.034604597836732864, + "rewards/rejected": -0.05674440786242485, + "step": 490 + }, + { + "epoch": 0.13, + "learning_rate": 4.985725660577184e-06, + "logits/chosen": 0.9554396867752075, + "logits/rejected": 0.9389545321464539, + "logps/chosen": -246.587158203125, + "logps/rejected": -236.51171875, + "loss": 0.0421, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.031007111072540283, + "rewards/margins": 0.036111582070589066, + "rewards/rejected": -0.06711869686841965, + "step": 500 + }, + { + "epoch": 0.13, + "eval_logits/chosen": 0.8361961841583252, + "eval_logits/rejected": 0.9295023679733276, + "eval_logps/chosen": -281.6956481933594, + "eval_logps/rejected": -251.91390991210938, + "eval_loss": 0.041099708527326584, + "eval_rewards/accuracies": 0.6194999814033508, + "eval_rewards/chosen": -0.0414884127676487, + "eval_rewards/margins": 0.03949163854122162, + "eval_rewards/rejected": -0.08098004758358002, + "eval_runtime": 539.1317, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.927, + "step": 500 + }, + { + "epoch": 0.13, + "learning_rate": 4.983184182463009e-06, + "logits/chosen": 0.8190703392028809, + "logits/rejected": 0.9820553064346313, + "logps/chosen": -286.83819580078125, + "logps/rejected": -225.30502319335938, + "loss": 0.0481, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.034673698246479034, + "rewards/margins": 0.04182344675064087, + "rewards/rejected": -0.0764971375465393, + "step": 510 + }, + { + "epoch": 0.14, + "learning_rate": 4.980435359184203e-06, + "logits/chosen": 0.8918254971504211, + "logits/rejected": 0.9686266779899597, + "logps/chosen": -291.6520080566406, + "logps/rejected": -257.6617126464844, + "loss": 0.0324, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.034312546253204346, + "rewards/margins": 0.04724326729774475, + "rewards/rejected": -0.0815558210015297, + "step": 520 + }, + { + "epoch": 0.14, + "learning_rate": 4.9774794202667236e-06, + "logits/chosen": 0.8753170967102051, + "logits/rejected": 0.9559276700019836, + "logps/chosen": -268.8739929199219, + "logps/rejected": -239.81484985351562, + "loss": 0.0294, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.028610479086637497, + "rewards/margins": 0.04722968488931656, + "rewards/rejected": -0.07584016025066376, + "step": 530 + }, + { + "epoch": 0.14, + "learning_rate": 4.974316612530615e-06, + "logits/chosen": 0.8446584939956665, + "logits/rejected": 0.9035196304321289, + "logps/chosen": -245.0946502685547, + "logps/rejected": -227.1122589111328, + "loss": 0.0412, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.03522849082946777, + "rewards/margins": 0.045352503657341, + "rewards/rejected": -0.08058099448680878, + "step": 540 + }, + { + "epoch": 0.14, + "learning_rate": 4.970947200069416e-06, + "logits/chosen": 0.8838707208633423, + "logits/rejected": 0.9225630760192871, + "logps/chosen": -267.9327087402344, + "logps/rejected": -232.60745239257812, + "loss": 0.0575, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.05179372429847717, + "rewards/margins": 0.05676066875457764, + "rewards/rejected": -0.10855438560247421, + "step": 550 + }, + { + "epoch": 0.15, + "learning_rate": 4.967371464228096e-06, + "logits/chosen": 0.9162457585334778, + "logits/rejected": 0.9888601303100586, + "logps/chosen": -288.94476318359375, + "logps/rejected": -267.3609313964844, + "loss": 0.0424, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05239032953977585, + "rewards/margins": 0.040877897292375565, + "rewards/rejected": -0.09326823055744171, + "step": 560 + }, + { + "epoch": 0.15, + "learning_rate": 4.963589703579569e-06, + "logits/chosen": 0.8896541595458984, + "logits/rejected": 1.071001410484314, + "logps/chosen": -271.9025573730469, + "logps/rejected": -227.18283081054688, + "loss": 0.0489, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.039633095264434814, + "rewards/margins": 0.06890513002872467, + "rewards/rejected": -0.10853822529315948, + "step": 570 + }, + { + "epoch": 0.15, + "learning_rate": 4.9596022338997615e-06, + "logits/chosen": 0.9138982892036438, + "logits/rejected": 0.8892068862915039, + "logps/chosen": -261.0631408691406, + "logps/rejected": -233.52206420898438, + "loss": 0.0463, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.04562286287546158, + "rewards/margins": 0.03961200267076492, + "rewards/rejected": -0.0852348655462265, + "step": 580 + }, + { + "epoch": 0.15, + "learning_rate": 4.955409388141243e-06, + "logits/chosen": 0.9704087972640991, + "logits/rejected": 0.9119867086410522, + "logps/chosen": -272.70599365234375, + "logps/rejected": -236.0823211669922, + "loss": 0.0383, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.04884126037359238, + "rewards/margins": 0.03933250904083252, + "rewards/rejected": -0.0881737768650055, + "step": 590 + }, + { + "epoch": 0.16, + "learning_rate": 4.951011516405429e-06, + "logits/chosen": 0.9375241994857788, + "logits/rejected": 0.9329082369804382, + "logps/chosen": -243.24472045898438, + "logps/rejected": -266.67962646484375, + "loss": 0.0439, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.06609703600406647, + "rewards/margins": 0.03910304233431816, + "rewards/rejected": -0.10520007461309433, + "step": 600 + }, + { + "epoch": 0.16, + "eval_logits/chosen": 0.8607339262962341, + "eval_logits/rejected": 0.952020525932312, + "eval_logps/chosen": -284.5547180175781, + "eval_logps/rejected": -255.50050354003906, + "eval_loss": 0.03948886692523956, + "eval_rewards/accuracies": 0.6175000071525574, + "eval_rewards/chosen": -0.07007911801338196, + "eval_rewards/margins": 0.046766627579927444, + "eval_rewards/rejected": -0.11684573441743851, + "eval_runtime": 539.068, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.928, + "step": 600 + }, + { + "epoch": 0.16, + "learning_rate": 4.946408985913344e-06, + "logits/chosen": 0.9299923777580261, + "logits/rejected": 0.949097752571106, + "logps/chosen": -251.2418212890625, + "logps/rejected": -229.9620819091797, + "loss": 0.0418, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.08045734465122223, + "rewards/margins": 0.039681874215602875, + "rewards/rejected": -0.1201392188668251, + "step": 610 + }, + { + "epoch": 0.16, + "learning_rate": 4.941602180974958e-06, + "logits/chosen": 0.9066370129585266, + "logits/rejected": 0.9455870389938354, + "logps/chosen": -293.0451354980469, + "logps/rejected": -237.9638214111328, + "loss": 0.0459, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.07144733518362045, + "rewards/margins": 0.0446074940264225, + "rewards/rejected": -0.11605483293533325, + "step": 620 + }, + { + "epoch": 0.16, + "learning_rate": 4.936591502957101e-06, + "logits/chosen": 0.8010427355766296, + "logits/rejected": 0.9337188005447388, + "logps/chosen": -301.9491882324219, + "logps/rejected": -257.299560546875, + "loss": 0.0313, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.049509234726428986, + "rewards/margins": 0.03574910759925842, + "rewards/rejected": -0.085258349776268, + "step": 630 + }, + { + "epoch": 0.17, + "learning_rate": 4.931377370249946e-06, + "logits/chosen": 0.9087220430374146, + "logits/rejected": 0.9815553426742554, + "logps/chosen": -279.0362548828125, + "logps/rejected": -235.7110137939453, + "loss": 0.0311, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0523492693901062, + "rewards/margins": 0.03988610580563545, + "rewards/rejected": -0.09223536401987076, + "step": 640 + }, + { + "epoch": 0.17, + "learning_rate": 4.925960218232073e-06, + "logits/chosen": 0.9217544794082642, + "logits/rejected": 1.0427885055541992, + "logps/chosen": -308.48004150390625, + "logps/rejected": -286.24566650390625, + "loss": 0.0343, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05535319447517395, + "rewards/margins": 0.054420508444309235, + "rewards/rejected": -0.10977371037006378, + "step": 650 + }, + { + "epoch": 0.17, + "learning_rate": 4.920340499234116e-06, + "logits/chosen": 0.8749852180480957, + "logits/rejected": 0.9889238476753235, + "logps/chosen": -288.8839416503906, + "logps/rejected": -217.01760864257812, + "loss": 0.0459, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.051978230476379395, + "rewards/margins": 0.03869297355413437, + "rewards/rejected": -0.09067119657993317, + "step": 660 + }, + { + "epoch": 0.18, + "learning_rate": 4.914518682500995e-06, + "logits/chosen": 0.955339252948761, + "logits/rejected": 0.939558207988739, + "logps/chosen": -282.4548034667969, + "logps/rejected": -278.93646240234375, + "loss": 0.0427, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.05497046187520027, + "rewards/margins": 0.046811606734991074, + "rewards/rejected": -0.10178206861019135, + "step": 670 + }, + { + "epoch": 0.18, + "learning_rate": 4.9084952541527315e-06, + "logits/chosen": 0.9087344408035278, + "logits/rejected": 0.9323067665100098, + "logps/chosen": -258.89776611328125, + "logps/rejected": -246.7029266357422, + "loss": 0.0366, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03954412043094635, + "rewards/margins": 0.0509122833609581, + "rewards/rejected": -0.09045641124248505, + "step": 680 + }, + { + "epoch": 0.18, + "learning_rate": 4.902270717143858e-06, + "logits/chosen": 0.8921301960945129, + "logits/rejected": 0.9605242013931274, + "logps/chosen": -288.9732666015625, + "logps/rejected": -252.2064971923828, + "loss": 0.0353, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03391667455434799, + "rewards/margins": 0.05217113345861435, + "rewards/rejected": -0.08608780801296234, + "step": 690 + }, + { + "epoch": 0.18, + "learning_rate": 4.895845591221427e-06, + "logits/chosen": 0.8141648173332214, + "logits/rejected": 0.9764218330383301, + "logps/chosen": -251.57406616210938, + "logps/rejected": -232.8114776611328, + "loss": 0.0363, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.04507957026362419, + "rewards/margins": 0.047078561037778854, + "rewards/rejected": -0.09215812385082245, + "step": 700 + }, + { + "epoch": 0.18, + "eval_logits/chosen": 0.894931435585022, + "eval_logits/rejected": 0.9895482063293457, + "eval_logps/chosen": -281.16192626953125, + "eval_logps/rejected": -251.89256286621094, + "eval_loss": 0.03899623081088066, + "eval_rewards/accuracies": 0.6309999823570251, + "eval_rewards/chosen": -0.036151450127363205, + "eval_rewards/margins": 0.04461483657360077, + "eval_rewards/rejected": -0.08076628297567368, + "eval_runtime": 539.1732, + "eval_samples_per_second": 3.709, + "eval_steps_per_second": 0.927, + "step": 700 + }, + { + "epoch": 0.19, + "learning_rate": 4.8892204128816e-06, + "logits/chosen": 0.9831956028938293, + "logits/rejected": 1.0133693218231201, + "logps/chosen": -286.310546875, + "logps/rejected": -283.8514099121094, + "loss": 0.035, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.034512270241975784, + "rewards/margins": 0.047017090022563934, + "rewards/rejected": -0.08152935653924942, + "step": 710 + }, + { + "epoch": 0.19, + "learning_rate": 4.882395735324864e-06, + "logits/chosen": 0.8945713043212891, + "logits/rejected": 0.8778280019760132, + "logps/chosen": -281.03704833984375, + "logps/rejected": -255.6659698486328, + "loss": 0.0347, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.027182284742593765, + "rewards/margins": 0.04411619156599045, + "rewards/rejected": -0.07129846513271332, + "step": 720 + }, + { + "epoch": 0.19, + "learning_rate": 4.87537212840983e-06, + "logits/chosen": 0.9152518510818481, + "logits/rejected": 0.9284723997116089, + "logps/chosen": -302.0511169433594, + "logps/rejected": -256.92071533203125, + "loss": 0.0389, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02550722099840641, + "rewards/margins": 0.049370888620615005, + "rewards/rejected": -0.07487811148166656, + "step": 730 + }, + { + "epoch": 0.19, + "learning_rate": 4.8681501786056545e-06, + "logits/chosen": 0.8889036178588867, + "logits/rejected": 0.9711803197860718, + "logps/chosen": -273.5214538574219, + "logps/rejected": -221.85977172851562, + "loss": 0.0501, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.03598209470510483, + "rewards/margins": 0.03736092895269394, + "rewards/rejected": -0.07334302365779877, + "step": 740 + }, + { + "epoch": 0.2, + "learning_rate": 4.860730488943068e-06, + "logits/chosen": 0.9261956214904785, + "logits/rejected": 0.9333757162094116, + "logps/chosen": -279.0644226074219, + "logps/rejected": -245.6189422607422, + "loss": 0.0386, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.016734056174755096, + "rewards/margins": 0.05964844301342964, + "rewards/rejected": -0.07638250291347504, + "step": 750 + }, + { + "epoch": 0.2, + "learning_rate": 4.853113678964022e-06, + "logits/chosen": 0.9745391607284546, + "logits/rejected": 0.9919463396072388, + "logps/chosen": -249.7964630126953, + "logps/rejected": -235.2704315185547, + "loss": 0.0336, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.029250269755721092, + "rewards/margins": 0.05691809579730034, + "rewards/rejected": -0.08616836369037628, + "step": 760 + }, + { + "epoch": 0.2, + "learning_rate": 4.845300384669958e-06, + "logits/chosen": 0.9550157785415649, + "logits/rejected": 0.9655323028564453, + "logps/chosen": -266.6517028808594, + "logps/rejected": -224.41366577148438, + "loss": 0.0503, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.04020417481660843, + "rewards/margins": 0.03571712225675583, + "rewards/rejected": -0.07592129707336426, + "step": 770 + }, + { + "epoch": 0.2, + "learning_rate": 4.837291258468701e-06, + "logits/chosen": 0.8910456895828247, + "logits/rejected": 0.9127016067504883, + "logps/chosen": -280.16632080078125, + "logps/rejected": -249.5512237548828, + "loss": 0.0404, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.04826400801539421, + "rewards/margins": 0.05860195308923721, + "rewards/rejected": -0.10686596482992172, + "step": 780 + }, + { + "epoch": 0.21, + "learning_rate": 4.829086969119984e-06, + "logits/chosen": 0.8841217756271362, + "logits/rejected": 0.9553950428962708, + "logps/chosen": -249.34146118164062, + "logps/rejected": -241.98623657226562, + "loss": 0.0306, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.04528197646141052, + "rewards/margins": 0.04499911516904831, + "rewards/rejected": -0.09028108417987823, + "step": 790 + }, + { + "epoch": 0.21, + "learning_rate": 4.820688201679605e-06, + "logits/chosen": 0.9039901494979858, + "logits/rejected": 0.9560089111328125, + "logps/chosen": -262.4331359863281, + "logps/rejected": -234.8318328857422, + "loss": 0.0402, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.04591182619333267, + "rewards/margins": 0.0473395399749279, + "rewards/rejected": -0.09325136244297028, + "step": 800 + }, + { + "epoch": 0.21, + "eval_logits/chosen": 0.9000641107559204, + "eval_logits/rejected": 0.9937340021133423, + "eval_logps/chosen": -282.6900939941406, + "eval_logps/rejected": -253.87200927734375, + "eval_loss": 0.03816115856170654, + "eval_rewards/accuracies": 0.621999979019165, + "eval_rewards/chosen": -0.05143279209733009, + "eval_rewards/margins": 0.049128152430057526, + "eval_rewards/rejected": -0.10056094080209732, + "eval_runtime": 538.996, + "eval_samples_per_second": 3.711, + "eval_steps_per_second": 0.928, + "step": 800 + }, + { + "epoch": 0.21, + "learning_rate": 4.8120956574422315e-06, + "logits/chosen": 0.985218346118927, + "logits/rejected": 0.9626695513725281, + "logps/chosen": -285.3841247558594, + "logps/rejected": -283.31024169921875, + "loss": 0.0464, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.05067628622055054, + "rewards/margins": 0.054511237889528275, + "rewards/rejected": -0.10518752038478851, + "step": 810 + }, + { + "epoch": 0.21, + "learning_rate": 4.803310053882831e-06, + "logits/chosen": 0.916561484336853, + "logits/rejected": 0.9501992464065552, + "logps/chosen": -215.0513458251953, + "logps/rejected": -205.7407989501953, + "loss": 0.0444, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05079100281000137, + "rewards/margins": 0.042504359036684036, + "rewards/rejected": -0.09329536557197571, + "step": 820 + }, + { + "epoch": 0.22, + "learning_rate": 4.794332124596775e-06, + "logits/chosen": 0.9133389592170715, + "logits/rejected": 0.9860326647758484, + "logps/chosen": -280.44476318359375, + "logps/rejected": -256.5655212402344, + "loss": 0.0377, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.06920581310987473, + "rewards/margins": 0.037345677614212036, + "rewards/rejected": -0.10655149072408676, + "step": 830 + }, + { + "epoch": 0.22, + "learning_rate": 4.785162619238575e-06, + "logits/chosen": 0.9965925216674805, + "logits/rejected": 1.0270875692367554, + "logps/chosen": -269.73480224609375, + "logps/rejected": -253.419677734375, + "loss": 0.0389, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.0890035331249237, + "rewards/margins": 0.05010632425546646, + "rewards/rejected": -0.13910984992980957, + "step": 840 + }, + { + "epoch": 0.22, + "learning_rate": 4.775802303459288e-06, + "logits/chosen": 0.9339388012886047, + "logits/rejected": 1.0584567785263062, + "logps/chosen": -296.1967468261719, + "logps/rejected": -267.1576232910156, + "loss": 0.0395, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09242481738328934, + "rewards/margins": 0.05164768174290657, + "rewards/rejected": -0.1440725028514862, + "step": 850 + }, + { + "epoch": 0.23, + "learning_rate": 4.766251958842589e-06, + "logits/chosen": 0.9728446006774902, + "logits/rejected": 1.0179331302642822, + "logps/chosen": -308.83551025390625, + "logps/rejected": -272.84796142578125, + "loss": 0.0325, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08283834159374237, + "rewards/margins": 0.047348491847515106, + "rewards/rejected": -0.13018682599067688, + "step": 860 + }, + { + "epoch": 0.23, + "learning_rate": 4.7565123828395066e-06, + "logits/chosen": 0.9140795469284058, + "logits/rejected": 1.0925973653793335, + "logps/chosen": -319.13250732421875, + "logps/rejected": -295.2183837890625, + "loss": 0.0217, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.05658285692334175, + "rewards/margins": 0.04898856207728386, + "rewards/rejected": -0.10557142645120621, + "step": 870 + }, + { + "epoch": 0.23, + "learning_rate": 4.746584388701831e-06, + "logits/chosen": 1.016638994216919, + "logits/rejected": 1.144810438156128, + "logps/chosen": -261.43853759765625, + "logps/rejected": -228.3360595703125, + "loss": 0.0479, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.05251890420913696, + "rewards/margins": 0.07163821160793304, + "rewards/rejected": -0.12415711581707001, + "step": 880 + }, + { + "epoch": 0.23, + "learning_rate": 4.736468805414218e-06, + "logits/chosen": 1.0147383213043213, + "logits/rejected": 1.1735047101974487, + "logps/chosen": -297.06878662109375, + "logps/rejected": -270.4248046875, + "loss": 0.0401, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.045632537454366684, + "rewards/margins": 0.08101598918437958, + "rewards/rejected": -0.12664853036403656, + "step": 890 + }, + { + "epoch": 0.24, + "learning_rate": 4.7261664776249595e-06, + "logits/chosen": 1.0279176235198975, + "logits/rejected": 1.0230156183242798, + "logps/chosen": -273.8691101074219, + "logps/rejected": -256.41912841796875, + "loss": 0.0381, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.053291238844394684, + "rewards/margins": 0.059307873249053955, + "rewards/rejected": -0.11259911209344864, + "step": 900 + }, + { + "epoch": 0.24, + "eval_logits/chosen": 0.9533628225326538, + "eval_logits/rejected": 1.0464704036712646, + "eval_logps/chosen": -283.0850830078125, + "eval_logps/rejected": -254.8046875, + "eval_loss": 0.03756963834166527, + "eval_rewards/accuracies": 0.6315000057220459, + "eval_rewards/chosen": -0.05538267269730568, + "eval_rewards/margins": 0.054504893720149994, + "eval_rewards/rejected": -0.10988757014274597, + "eval_runtime": 538.9934, + "eval_samples_per_second": 3.711, + "eval_steps_per_second": 0.928, + "step": 900 + }, + { + "epoch": 0.24, + "learning_rate": 4.715678265575463e-06, + "logits/chosen": 1.033189058303833, + "logits/rejected": 1.1023738384246826, + "logps/chosen": -313.0371398925781, + "logps/rejected": -296.1219482421875, + "loss": 0.0362, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.05556187033653259, + "rewards/margins": 0.036861807107925415, + "rewards/rejected": -0.09242367744445801, + "step": 910 + }, + { + "epoch": 0.24, + "learning_rate": 4.705005045028415e-06, + "logits/chosen": 1.0147944688796997, + "logits/rejected": 1.0735704898834229, + "logps/chosen": -336.9757995605469, + "logps/rejected": -290.46820068359375, + "loss": 0.0291, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.05297808721661568, + "rewards/margins": 0.06880663335323334, + "rewards/rejected": -0.12178472429513931, + "step": 920 + }, + { + "epoch": 0.24, + "learning_rate": 4.694147707194659e-06, + "logits/chosen": 1.0121935606002808, + "logits/rejected": 1.0971285104751587, + "logps/chosen": -291.9516296386719, + "logps/rejected": -246.9907684326172, + "loss": 0.0394, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0664924904704094, + "rewards/margins": 0.05605294555425644, + "rewards/rejected": -0.12254543602466583, + "step": 930 + }, + { + "epoch": 0.25, + "learning_rate": 4.683107158658782e-06, + "logits/chosen": 0.976874053478241, + "logits/rejected": 0.9745148420333862, + "logps/chosen": -267.6925354003906, + "logps/rejected": -239.68063354492188, + "loss": 0.0418, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06668306887149811, + "rewards/margins": 0.06082264333963394, + "rewards/rejected": -0.12750570476055145, + "step": 940 + }, + { + "epoch": 0.25, + "learning_rate": 4.671884321303407e-06, + "logits/chosen": 0.9618045687675476, + "logits/rejected": 0.9925098419189453, + "logps/chosen": -293.56109619140625, + "logps/rejected": -280.6075439453125, + "loss": 0.036, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.08693472295999527, + "rewards/margins": 0.055363357067108154, + "rewards/rejected": -0.14229807257652283, + "step": 950 + }, + { + "epoch": 0.25, + "learning_rate": 4.660480132232224e-06, + "logits/chosen": 0.9112693071365356, + "logits/rejected": 0.9766784906387329, + "logps/chosen": -284.80621337890625, + "logps/rejected": -257.4582824707031, + "loss": 0.0321, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.08965489268302917, + "rewards/margins": 0.05001254007220268, + "rewards/rejected": -0.13966743648052216, + "step": 960 + }, + { + "epoch": 0.25, + "learning_rate": 4.6488955436917414e-06, + "logits/chosen": 0.9918138384819031, + "logits/rejected": 1.0566661357879639, + "logps/chosen": -285.3071594238281, + "logps/rejected": -222.27645874023438, + "loss": 0.0308, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.07797913253307343, + "rewards/margins": 0.035942643880844116, + "rewards/rejected": -0.11392178386449814, + "step": 970 + }, + { + "epoch": 0.26, + "learning_rate": 4.6371315229917644e-06, + "logits/chosen": 0.9261223077774048, + "logits/rejected": 0.991136908531189, + "logps/chosen": -268.17742919921875, + "logps/rejected": -251.2354278564453, + "loss": 0.0314, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.055728018283843994, + "rewards/margins": 0.05425562709569931, + "rewards/rejected": -0.1099836453795433, + "step": 980 + }, + { + "epoch": 0.26, + "learning_rate": 4.625189052424638e-06, + "logits/chosen": 0.9730453491210938, + "logits/rejected": 1.032597303390503, + "logps/chosen": -249.26864624023438, + "logps/rejected": -216.4525909423828, + "loss": 0.0417, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.04507770389318466, + "rewards/margins": 0.03995997831225395, + "rewards/rejected": -0.08503767102956772, + "step": 990 + }, + { + "epoch": 0.26, + "learning_rate": 4.613069129183218e-06, + "logits/chosen": 0.9484704732894897, + "logits/rejected": 0.9633451700210571, + "logps/chosen": -264.92633056640625, + "logps/rejected": -257.33941650390625, + "loss": 0.0421, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.04708124324679375, + "rewards/margins": 0.04397277534008026, + "rewards/rejected": -0.09105401486158371, + "step": 1000 + }, + { + "epoch": 0.26, + "eval_logits/chosen": 0.9447739124298096, + "eval_logits/rejected": 1.0398797988891602, + "eval_logps/chosen": -281.62677001953125, + "eval_logps/rejected": -253.11135864257812, + "eval_loss": 0.0373673252761364, + "eval_rewards/accuracies": 0.6269999742507935, + "eval_rewards/chosen": -0.040799498558044434, + "eval_rewards/margins": 0.05215470865368843, + "eval_rewards/rejected": -0.09295421838760376, + "eval_runtime": 539.0882, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.927, + "step": 1000 + }, + { + "epoch": 0.26, + "learning_rate": 4.600772765277607e-06, + "logits/chosen": 0.9384505152702332, + "logits/rejected": 1.036522388458252, + "logps/chosen": -300.46435546875, + "logps/rejected": -254.4575958251953, + "loss": 0.0411, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.04996770992875099, + "rewards/margins": 0.05875014141201973, + "rewards/rejected": -0.10871784389019012, + "step": 1010 + }, + { + "epoch": 0.27, + "learning_rate": 4.588300987450652e-06, + "logits/chosen": 1.0055780410766602, + "logits/rejected": 1.0831704139709473, + "logps/chosen": -283.211181640625, + "logps/rejected": -268.2267150878906, + "loss": 0.0358, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.05758129805326462, + "rewards/margins": 0.062214724719524384, + "rewards/rejected": -0.1197960153222084, + "step": 1020 + }, + { + "epoch": 0.27, + "learning_rate": 4.5756548370922136e-06, + "logits/chosen": 0.991938591003418, + "logits/rejected": 1.0386155843734741, + "logps/chosen": -283.02081298828125, + "logps/rejected": -256.51434326171875, + "loss": 0.0317, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.052321650087833405, + "rewards/margins": 0.03847939521074295, + "rewards/rejected": -0.09080104529857635, + "step": 1030 + }, + { + "epoch": 0.27, + "learning_rate": 4.562835370152206e-06, + "logits/chosen": 0.9276207089424133, + "logits/rejected": 1.045037865638733, + "logps/chosen": -254.05892944335938, + "logps/rejected": -240.1755828857422, + "loss": 0.0383, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.04577355459332466, + "rewards/margins": 0.036837171763181686, + "rewards/rejected": -0.08261072635650635, + "step": 1040 + }, + { + "epoch": 0.27, + "learning_rate": 4.54984365705243e-06, + "logits/chosen": 1.0454961061477661, + "logits/rejected": 1.0517162084579468, + "logps/chosen": -255.4774627685547, + "logps/rejected": -232.04190063476562, + "loss": 0.0469, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03626035898923874, + "rewards/margins": 0.043095506727695465, + "rewards/rejected": -0.07935585826635361, + "step": 1050 + }, + { + "epoch": 0.28, + "learning_rate": 4.536680782597191e-06, + "logits/chosen": 0.9645137786865234, + "logits/rejected": 1.0626866817474365, + "logps/chosen": -298.6683349609375, + "logps/rejected": -269.98724365234375, + "loss": 0.0301, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.02404281124472618, + "rewards/margins": 0.05357781797647476, + "rewards/rejected": -0.07762061804533005, + "step": 1060 + }, + { + "epoch": 0.28, + "learning_rate": 4.523347845882718e-06, + "logits/chosen": 1.0478742122650146, + "logits/rejected": 1.112269639968872, + "logps/chosen": -258.76458740234375, + "logps/rejected": -208.38107299804688, + "loss": 0.0384, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.03418079391121864, + "rewards/margins": 0.05562075227499008, + "rewards/rejected": -0.08980154246091843, + "step": 1070 + }, + { + "epoch": 0.28, + "learning_rate": 4.50984596020539e-06, + "logits/chosen": 0.9761837720870972, + "logits/rejected": 1.0493123531341553, + "logps/chosen": -300.64105224609375, + "logps/rejected": -265.405517578125, + "loss": 0.0404, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03310775384306908, + "rewards/margins": 0.04876155033707619, + "rewards/rejected": -0.08186930418014526, + "step": 1080 + }, + { + "epoch": 0.29, + "learning_rate": 4.4961762529687745e-06, + "logits/chosen": 0.9403045773506165, + "logits/rejected": 1.0544614791870117, + "logps/chosen": -305.84027099609375, + "logps/rejected": -234.25341796875, + "loss": 0.0407, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04744723066687584, + "rewards/margins": 0.04316466301679611, + "rewards/rejected": -0.09061190485954285, + "step": 1090 + }, + { + "epoch": 0.29, + "learning_rate": 4.482339865589492e-06, + "logits/chosen": 0.8826369047164917, + "logits/rejected": 1.0571672916412354, + "logps/chosen": -266.90252685546875, + "logps/rejected": -235.89877319335938, + "loss": 0.0393, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.05000295490026474, + "rewards/margins": 0.058589059859514236, + "rewards/rejected": -0.10859201848506927, + "step": 1100 + }, + { + "epoch": 0.29, + "eval_logits/chosen": 0.9608851075172424, + "eval_logits/rejected": 1.0557035207748413, + "eval_logps/chosen": -283.3030700683594, + "eval_logps/rejected": -254.34910583496094, + "eval_loss": 0.03702974691987038, + "eval_rewards/accuracies": 0.6284999847412109, + "eval_rewards/chosen": -0.05756256729364395, + "eval_rewards/margins": 0.04776925593614578, + "eval_rewards/rejected": -0.10533181577920914, + "eval_runtime": 539.1778, + "eval_samples_per_second": 3.709, + "eval_steps_per_second": 0.927, + "step": 1100 + }, + { + "epoch": 0.29, + "learning_rate": 4.468337953401909e-06, + "logits/chosen": 1.0197701454162598, + "logits/rejected": 1.0784578323364258, + "logps/chosen": -299.9832458496094, + "logps/rejected": -247.8249969482422, + "loss": 0.039, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.04413250833749771, + "rewards/margins": 0.051008790731430054, + "rewards/rejected": -0.09514130651950836, + "step": 1110 + }, + { + "epoch": 0.29, + "learning_rate": 4.45417168556166e-06, + "logits/chosen": 0.9736183285713196, + "logits/rejected": 1.0494579076766968, + "logps/chosen": -261.1857604980469, + "logps/rejected": -262.29241943359375, + "loss": 0.0333, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.03721706196665764, + "rewards/margins": 0.04604244977235794, + "rewards/rejected": -0.08325951546430588, + "step": 1120 + }, + { + "epoch": 0.3, + "learning_rate": 4.439842244948036e-06, + "logits/chosen": 0.9458913803100586, + "logits/rejected": 1.0195437669754028, + "logps/chosen": -261.69512939453125, + "logps/rejected": -244.91513061523438, + "loss": 0.0403, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03936644643545151, + "rewards/margins": 0.04326556995511055, + "rewards/rejected": -0.08263202011585236, + "step": 1130 + }, + { + "epoch": 0.3, + "learning_rate": 4.425350828065204e-06, + "logits/chosen": 1.0116994380950928, + "logits/rejected": 1.0178403854370117, + "logps/chosen": -282.7467956542969, + "logps/rejected": -266.43963623046875, + "loss": 0.0363, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03308098763227463, + "rewards/margins": 0.048868577927351, + "rewards/rejected": -0.08194957673549652, + "step": 1140 + }, + { + "epoch": 0.3, + "learning_rate": 4.410698644942303e-06, + "logits/chosen": 1.0054022073745728, + "logits/rejected": 1.0613911151885986, + "logps/chosen": -281.3987731933594, + "logps/rejected": -285.315673828125, + "loss": 0.0349, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.031625766307115555, + "rewards/margins": 0.03699468821287155, + "rewards/rejected": -0.0686204582452774, + "step": 1150 + }, + { + "epoch": 0.3, + "learning_rate": 4.395886919032406e-06, + "logits/chosen": 1.0045572519302368, + "logits/rejected": 1.0691404342651367, + "logps/chosen": -274.28741455078125, + "logps/rejected": -232.10574340820312, + "loss": 0.0388, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03036217950284481, + "rewards/margins": 0.04830170422792435, + "rewards/rejected": -0.0786639004945755, + "step": 1160 + }, + { + "epoch": 0.31, + "learning_rate": 4.380916887110366e-06, + "logits/chosen": 0.9450492858886719, + "logits/rejected": 1.0655186176300049, + "logps/chosen": -251.56259155273438, + "logps/rejected": -223.6671600341797, + "loss": 0.0337, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.03250129520893097, + "rewards/margins": 0.06061319261789322, + "rewards/rejected": -0.09311448037624359, + "step": 1170 + }, + { + "epoch": 0.31, + "learning_rate": 4.365789799169539e-06, + "logits/chosen": 1.004997968673706, + "logits/rejected": 0.9879255294799805, + "logps/chosen": -313.8050842285156, + "logps/rejected": -263.55718994140625, + "loss": 0.0399, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0565299317240715, + "rewards/margins": 0.057772088795900345, + "rewards/rejected": -0.11430201679468155, + "step": 1180 + }, + { + "epoch": 0.31, + "learning_rate": 4.350506918317416e-06, + "logits/chosen": 0.9306057095527649, + "logits/rejected": 0.8915265798568726, + "logps/chosen": -307.5498962402344, + "logps/rejected": -276.47088623046875, + "loss": 0.0349, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.047587279230356216, + "rewards/margins": 0.03898516297340393, + "rewards/rejected": -0.08657244592905045, + "step": 1190 + }, + { + "epoch": 0.31, + "learning_rate": 4.335069520670149e-06, + "logits/chosen": 0.9720792770385742, + "logits/rejected": 1.0783460140228271, + "logps/chosen": -281.57232666015625, + "logps/rejected": -217.4803009033203, + "loss": 0.0533, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.052179861813783646, + "rewards/margins": 0.06668353080749512, + "rewards/rejected": -0.11886338889598846, + "step": 1200 + }, + { + "epoch": 0.31, + "eval_logits/chosen": 0.9417441487312317, + "eval_logits/rejected": 1.0367752313613892, + "eval_logps/chosen": -283.6021728515625, + "eval_logps/rejected": -255.3543701171875, + "eval_loss": 0.0369240865111351, + "eval_rewards/accuracies": 0.6209999918937683, + "eval_rewards/chosen": -0.0605538934469223, + "eval_rewards/margins": 0.05483054369688034, + "eval_rewards/rejected": -0.11538443714380264, + "eval_runtime": 538.9866, + "eval_samples_per_second": 3.711, + "eval_steps_per_second": 0.928, + "step": 1200 + }, + { + "epoch": 0.32, + "learning_rate": 4.319478895246e-06, + "logits/chosen": 0.9820619821548462, + "logits/rejected": 0.9384095072746277, + "logps/chosen": -279.18499755859375, + "logps/rejected": -242.414794921875, + "loss": 0.0357, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.05051257088780403, + "rewards/margins": 0.04647805169224739, + "rewards/rejected": -0.09699061512947083, + "step": 1210 + }, + { + "epoch": 0.32, + "learning_rate": 4.303736343857704e-06, + "logits/chosen": 1.0831791162490845, + "logits/rejected": 1.034220814704895, + "logps/chosen": -269.3099365234375, + "logps/rejected": -252.9770965576172, + "loss": 0.0464, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.052128423005342484, + "rewards/margins": 0.06647459417581558, + "rewards/rejected": -0.11860301345586777, + "step": 1220 + }, + { + "epoch": 0.32, + "learning_rate": 4.287843181003772e-06, + "logits/chosen": 0.9971106648445129, + "logits/rejected": 1.1152498722076416, + "logps/chosen": -281.20208740234375, + "logps/rejected": -239.54006958007812, + "loss": 0.0394, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.03529990836977959, + "rewards/margins": 0.0698903501033783, + "rewards/rejected": -0.10519025474786758, + "step": 1230 + }, + { + "epoch": 0.32, + "learning_rate": 4.27180073375873e-06, + "logits/chosen": 0.9784267544746399, + "logits/rejected": 1.0021319389343262, + "logps/chosen": -281.3310241699219, + "logps/rejected": -254.4761962890625, + "loss": 0.038, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03572789579629898, + "rewards/margins": 0.06249629333615303, + "rewards/rejected": -0.09822418540716171, + "step": 1240 + }, + { + "epoch": 0.33, + "learning_rate": 4.255610341662304e-06, + "logits/chosen": 0.9751097559928894, + "logits/rejected": 1.0435435771942139, + "logps/chosen": -306.3908386230469, + "logps/rejected": -268.491455078125, + "loss": 0.0425, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01923917606472969, + "rewards/margins": 0.06259562820196152, + "rewards/rejected": -0.0818348079919815, + "step": 1250 + }, + { + "epoch": 0.33, + "learning_rate": 4.2392733566075764e-06, + "logits/chosen": 0.9983538389205933, + "logits/rejected": 1.0708853006362915, + "logps/chosen": -243.330322265625, + "logps/rejected": -224.99075317382812, + "loss": 0.0371, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0249100960791111, + "rewards/margins": 0.06250262260437012, + "rewards/rejected": -0.08741272240877151, + "step": 1260 + }, + { + "epoch": 0.33, + "learning_rate": 4.2227911427280975e-06, + "logits/chosen": 0.9960931539535522, + "logits/rejected": 1.0461426973342896, + "logps/chosen": -286.45440673828125, + "logps/rejected": -271.23980712890625, + "loss": 0.0393, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.02666524052619934, + "rewards/margins": 0.048846714198589325, + "rewards/rejected": -0.07551195472478867, + "step": 1270 + }, + { + "epoch": 0.33, + "learning_rate": 4.206165076283983e-06, + "logits/chosen": 0.9975628852844238, + "logits/rejected": 1.0400108098983765, + "logps/chosen": -238.9987030029297, + "logps/rejected": -228.86312866210938, + "loss": 0.0446, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.023251879960298538, + "rewards/margins": 0.04423128813505173, + "rewards/rejected": -0.06748317182064056, + "step": 1280 + }, + { + "epoch": 0.34, + "learning_rate": 4.189396545546995e-06, + "logits/chosen": 1.0215797424316406, + "logits/rejected": 1.1224019527435303, + "logps/chosen": -289.75323486328125, + "logps/rejected": -280.547607421875, + "loss": 0.034, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.019200313836336136, + "rewards/margins": 0.03457511216402054, + "rewards/rejected": -0.053775422275066376, + "step": 1290 + }, + { + "epoch": 0.34, + "learning_rate": 4.172486950684627e-06, + "logits/chosen": 1.0255308151245117, + "logits/rejected": 1.0572230815887451, + "logps/chosen": -279.2245178222656, + "logps/rejected": -261.9948425292969, + "loss": 0.0392, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02723405882716179, + "rewards/margins": 0.047797515988349915, + "rewards/rejected": -0.075031578540802, + "step": 1300 + }, + { + "epoch": 0.34, + "eval_logits/chosen": 0.9659793972969055, + "eval_logits/rejected": 1.0633821487426758, + "eval_logps/chosen": -279.6128845214844, + "eval_logps/rejected": -250.95762634277344, + "eval_loss": 0.0366741381585598, + "eval_rewards/accuracies": 0.6119999885559082, + "eval_rewards/chosen": -0.020660726353526115, + "eval_rewards/margins": 0.05075635015964508, + "eval_rewards/rejected": -0.07141707837581635, + "eval_runtime": 539.1224, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.927, + "step": 1300 + }, + { + "epoch": 0.34, + "learning_rate": 4.155437703643182e-06, + "logits/chosen": 0.9241663217544556, + "logits/rejected": 0.9624761343002319, + "logps/chosen": -284.2118225097656, + "logps/rejected": -250.8708953857422, + "loss": 0.0311, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.01345390360802412, + "rewards/margins": 0.06398328393697739, + "rewards/rejected": -0.07743719965219498, + "step": 1310 + }, + { + "epoch": 0.35, + "learning_rate": 4.138250228029882e-06, + "logits/chosen": 1.0098600387573242, + "logits/rejected": 1.132021188735962, + "logps/chosen": -233.54672241210938, + "logps/rejected": -216.55630493164062, + "loss": 0.0418, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.014631894417107105, + "rewards/margins": 0.07016023248434067, + "rewards/rejected": -0.0847921296954155, + "step": 1320 + }, + { + "epoch": 0.35, + "learning_rate": 4.120925958993994e-06, + "logits/chosen": 0.9781894683837891, + "logits/rejected": 1.0409111976623535, + "logps/chosen": -295.82403564453125, + "logps/rejected": -249.65530395507812, + "loss": 0.0414, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.026302725076675415, + "rewards/margins": 0.0485377199947834, + "rewards/rejected": -0.07484044134616852, + "step": 1330 + }, + { + "epoch": 0.35, + "learning_rate": 4.103466343106999e-06, + "logits/chosen": 0.9926727414131165, + "logits/rejected": 1.0091572999954224, + "logps/chosen": -274.465576171875, + "logps/rejected": -243.0918731689453, + "loss": 0.0381, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.019631439819931984, + "rewards/margins": 0.047348715364933014, + "rewards/rejected": -0.06698014587163925, + "step": 1340 + }, + { + "epoch": 0.35, + "learning_rate": 4.085872838241797e-06, + "logits/chosen": 1.0105888843536377, + "logits/rejected": 1.042271614074707, + "logps/chosen": -286.24774169921875, + "logps/rejected": -284.87847900390625, + "loss": 0.0349, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.010948913171887398, + "rewards/margins": 0.058446235954761505, + "rewards/rejected": -0.06939514726400375, + "step": 1350 + }, + { + "epoch": 0.36, + "learning_rate": 4.06814691345098e-06, + "logits/chosen": 0.9800373911857605, + "logits/rejected": 1.1013528108596802, + "logps/chosen": -303.1207275390625, + "logps/rejected": -257.15679931640625, + "loss": 0.0439, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.03485560044646263, + "rewards/margins": 0.04474693164229393, + "rewards/rejected": -0.07960253953933716, + "step": 1360 + }, + { + "epoch": 0.36, + "learning_rate": 4.050290048844171e-06, + "logits/chosen": 1.0052525997161865, + "logits/rejected": 1.061704158782959, + "logps/chosen": -256.0435791015625, + "logps/rejected": -225.7267303466797, + "loss": 0.0437, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.014286425895988941, + "rewards/margins": 0.03906578570604324, + "rewards/rejected": -0.05335221439599991, + "step": 1370 + }, + { + "epoch": 0.36, + "learning_rate": 4.032303735464422e-06, + "logits/chosen": 0.9533084034919739, + "logits/rejected": 1.0529754161834717, + "logps/chosen": -289.2705078125, + "logps/rejected": -257.12225341796875, + "loss": 0.0421, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.001376188127323985, + "rewards/margins": 0.05364586040377617, + "rewards/rejected": -0.05502205342054367, + "step": 1380 + }, + { + "epoch": 0.36, + "learning_rate": 4.014189475163727e-06, + "logits/chosen": 0.9958294630050659, + "logits/rejected": 1.0700651407241821, + "logps/chosen": -266.70538330078125, + "logps/rejected": -240.39126586914062, + "loss": 0.0486, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.004494071938097477, + "rewards/margins": 0.0475679449737072, + "rewards/rejected": -0.0520620159804821, + "step": 1390 + }, + { + "epoch": 0.37, + "learning_rate": 3.995948780477605e-06, + "logits/chosen": 1.0267126560211182, + "logits/rejected": 1.0607044696807861, + "logps/chosen": -287.0250549316406, + "logps/rejected": -246.72103881835938, + "loss": 0.0432, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.006947031710296869, + "rewards/margins": 0.038351211696863174, + "rewards/rejected": -0.04529824107885361, + "step": 1400 + }, + { + "epoch": 0.37, + "eval_logits/chosen": 0.9482428431510925, + "eval_logits/rejected": 1.0463390350341797, + "eval_logps/chosen": -279.0111999511719, + "eval_logps/rejected": -250.108154296875, + "eval_loss": 0.036706726998090744, + "eval_rewards/accuracies": 0.6259999871253967, + "eval_rewards/chosen": -0.014643603935837746, + "eval_rewards/margins": 0.0482785664498806, + "eval_rewards/rejected": -0.0629221647977829, + "eval_runtime": 539.1963, + "eval_samples_per_second": 3.709, + "eval_steps_per_second": 0.927, + "step": 1400 + }, + { + "epoch": 0.37, + "learning_rate": 3.977583174498816e-06, + "logits/chosen": 0.9617465138435364, + "logits/rejected": 1.0087401866912842, + "logps/chosen": -274.5213317871094, + "logps/rejected": -242.7264404296875, + "loss": 0.0398, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.014896227046847343, + "rewards/margins": 0.038757093250751495, + "rewards/rejected": -0.05365331843495369, + "step": 1410 + }, + { + "epoch": 0.37, + "learning_rate": 3.959094190750172e-06, + "logits/chosen": 0.9357368350028992, + "logits/rejected": 1.0426499843597412, + "logps/chosen": -288.6091003417969, + "logps/rejected": -259.63323974609375, + "loss": 0.0462, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.030341049656271935, + "rewards/margins": 0.037225984036922455, + "rewards/rejected": -0.06756703555583954, + "step": 1420 + }, + { + "epoch": 0.37, + "learning_rate": 3.9404833730564975e-06, + "logits/chosen": 1.026078224182129, + "logits/rejected": 1.0588185787200928, + "logps/chosen": -250.0701141357422, + "logps/rejected": -250.2508087158203, + "loss": 0.0495, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.03756723925471306, + "rewards/margins": 0.042705655097961426, + "rewards/rejected": -0.08027289807796478, + "step": 1430 + }, + { + "epoch": 0.38, + "learning_rate": 3.921752275415712e-06, + "logits/chosen": 1.0129649639129639, + "logits/rejected": 1.0660035610198975, + "logps/chosen": -242.80191040039062, + "logps/rejected": -215.3003387451172, + "loss": 0.0364, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.041556812822818756, + "rewards/margins": 0.0497397780418396, + "rewards/rejected": -0.09129659831523895, + "step": 1440 + }, + { + "epoch": 0.38, + "learning_rate": 3.902902461869079e-06, + "logits/chosen": 0.9659714698791504, + "logits/rejected": 1.106687307357788, + "logps/chosen": -273.6470642089844, + "logps/rejected": -246.69326782226562, + "loss": 0.0399, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.050172436982393265, + "rewards/margins": 0.060846518725156784, + "rewards/rejected": -0.11101895570755005, + "step": 1450 + }, + { + "epoch": 0.38, + "learning_rate": 3.883935506370605e-06, + "logits/chosen": 0.941813588142395, + "logits/rejected": 1.0076746940612793, + "logps/chosen": -282.126708984375, + "logps/rejected": -244.9275665283203, + "loss": 0.0352, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.05347307771444321, + "rewards/margins": 0.053630221635103226, + "rewards/rejected": -0.10710330307483673, + "step": 1460 + }, + { + "epoch": 0.38, + "learning_rate": 3.864852992655617e-06, + "logits/chosen": 0.9045804142951965, + "logits/rejected": 1.0152260065078735, + "logps/chosen": -270.4555969238281, + "logps/rejected": -260.31390380859375, + "loss": 0.0396, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.045248690992593765, + "rewards/margins": 0.0546412356197834, + "rewards/rejected": -0.09988992661237717, + "step": 1470 + }, + { + "epoch": 0.39, + "learning_rate": 3.845656514108516e-06, + "logits/chosen": 0.9225580096244812, + "logits/rejected": 0.9809015989303589, + "logps/chosen": -297.4796142578125, + "logps/rejected": -250.21530151367188, + "loss": 0.0295, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.041134659200906754, + "rewards/margins": 0.06729653477668762, + "rewards/rejected": -0.10843118280172348, + "step": 1480 + }, + { + "epoch": 0.39, + "learning_rate": 3.826347673629738e-06, + "logits/chosen": 0.9812415838241577, + "logits/rejected": 1.0070959329605103, + "logps/chosen": -247.45516967773438, + "logps/rejected": -231.81179809570312, + "loss": 0.0372, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.04904834181070328, + "rewards/margins": 0.05793965980410576, + "rewards/rejected": -0.10698799788951874, + "step": 1490 + }, + { + "epoch": 0.39, + "learning_rate": 3.8069280835019062e-06, + "logits/chosen": 0.9496526718139648, + "logits/rejected": 1.0348269939422607, + "logps/chosen": -325.68267822265625, + "logps/rejected": -257.0068054199219, + "loss": 0.0304, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.035646189004182816, + "rewards/margins": 0.04558128863573074, + "rewards/rejected": -0.08122747391462326, + "step": 1500 + }, + { + "epoch": 0.39, + "eval_logits/chosen": 0.949572741985321, + "eval_logits/rejected": 1.0471240282058716, + "eval_logps/chosen": -282.7773132324219, + "eval_logps/rejected": -254.43394470214844, + "eval_loss": 0.03586630895733833, + "eval_rewards/accuracies": 0.6359999775886536, + "eval_rewards/chosen": -0.052304789423942566, + "eval_rewards/margins": 0.05387549474835396, + "eval_rewards/rejected": -0.10618028789758682, + "eval_runtime": 539.1493, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.927, + "step": 1500 + }, + { + "epoch": 0.4, + "learning_rate": 3.7873993652552077e-06, + "logits/chosen": 0.9462829828262329, + "logits/rejected": 1.0417277812957764, + "logps/chosen": -263.7369689941406, + "logps/rejected": -241.88095092773438, + "loss": 0.036, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.059374719858169556, + "rewards/margins": 0.021540379151701927, + "rewards/rejected": -0.08091510832309723, + "step": 1510 + }, + { + "epoch": 0.4, + "learning_rate": 3.7677631495319953e-06, + "logits/chosen": 0.9473394155502319, + "logits/rejected": 1.016614556312561, + "logps/chosen": -263.25238037109375, + "logps/rejected": -217.35693359375, + "loss": 0.0371, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.04990892857313156, + "rewards/margins": 0.03492476046085358, + "rewards/rejected": -0.08483369648456573, + "step": 1520 + }, + { + "epoch": 0.4, + "learning_rate": 3.748021075950633e-06, + "logits/chosen": 1.0440593957901, + "logits/rejected": 1.0327621698379517, + "logps/chosen": -307.784423828125, + "logps/rejected": -280.920654296875, + "loss": 0.0398, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.05705567076802254, + "rewards/margins": 0.05376668646931648, + "rewards/rejected": -0.11082235723733902, + "step": 1530 + }, + { + "epoch": 0.4, + "learning_rate": 3.7281747929685824e-06, + "logits/chosen": 0.9668010473251343, + "logits/rejected": 1.1068073511123657, + "logps/chosen": -292.89801025390625, + "logps/rejected": -257.0223388671875, + "loss": 0.0377, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.04842069000005722, + "rewards/margins": 0.043641868978738785, + "rewards/rejected": -0.0920625552535057, + "step": 1540 + }, + { + "epoch": 0.41, + "learning_rate": 3.7082259577447604e-06, + "logits/chosen": 0.9432889223098755, + "logits/rejected": 1.05801522731781, + "logps/chosen": -324.9328308105469, + "logps/rejected": -255.90939331054688, + "loss": 0.0319, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.05441862344741821, + "rewards/margins": 0.053149156272411346, + "rewards/rejected": -0.10756777226924896, + "step": 1550 + }, + { + "epoch": 0.41, + "learning_rate": 3.6881762360011688e-06, + "logits/chosen": 1.0232698917388916, + "logits/rejected": 1.055396318435669, + "logps/chosen": -291.96514892578125, + "logps/rejected": -249.9015350341797, + "loss": 0.0428, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06017423793673515, + "rewards/margins": 0.05147537589073181, + "rewards/rejected": -0.11164961010217667, + "step": 1560 + }, + { + "epoch": 0.41, + "learning_rate": 3.668027301883802e-06, + "logits/chosen": 0.9909790754318237, + "logits/rejected": 1.0502822399139404, + "logps/chosen": -266.43011474609375, + "logps/rejected": -224.57601928710938, + "loss": 0.033, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.05004848912358284, + "rewards/margins": 0.05305255576968193, + "rewards/rejected": -0.10310103744268417, + "step": 1570 + }, + { + "epoch": 0.41, + "learning_rate": 3.64778083782286e-06, + "logits/chosen": 1.0207841396331787, + "logits/rejected": 1.0920627117156982, + "logps/chosen": -270.4779968261719, + "logps/rejected": -278.3066101074219, + "loss": 0.0354, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.038508545607328415, + "rewards/margins": 0.05329999327659607, + "rewards/rejected": -0.09180854260921478, + "step": 1580 + }, + { + "epoch": 0.42, + "learning_rate": 3.627438534392268e-06, + "logits/chosen": 0.9771720767021179, + "logits/rejected": 1.0112661123275757, + "logps/chosen": -264.5091552734375, + "logps/rejected": -213.95535278320312, + "loss": 0.0401, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.03601064160466194, + "rewards/margins": 0.04099656641483307, + "rewards/rejected": -0.07700721174478531, + "step": 1590 + }, + { + "epoch": 0.42, + "learning_rate": 3.607002090168506e-06, + "logits/chosen": 0.9807453155517578, + "logits/rejected": 1.0391424894332886, + "logps/chosen": -261.9506530761719, + "logps/rejected": -238.798828125, + "loss": 0.0436, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.03824782371520996, + "rewards/margins": 0.041073787957429886, + "rewards/rejected": -0.07932160794734955, + "step": 1600 + }, + { + "epoch": 0.42, + "eval_logits/chosen": 0.9584904909133911, + "eval_logits/rejected": 1.0586249828338623, + "eval_logps/chosen": -280.7698669433594, + "eval_logps/rejected": -252.26162719726562, + "eval_loss": 0.03589407727122307, + "eval_rewards/accuracies": 0.6340000033378601, + "eval_rewards/chosen": -0.03223072364926338, + "eval_rewards/margins": 0.05222645774483681, + "eval_rewards/rejected": -0.08445718139410019, + "eval_runtime": 538.9153, + "eval_samples_per_second": 3.711, + "eval_steps_per_second": 0.928, + "step": 1600 + }, + { + "epoch": 0.42, + "learning_rate": 3.586473211588787e-06, + "logits/chosen": 0.9628156423568726, + "logits/rejected": 1.0911314487457275, + "logps/chosen": -301.49798583984375, + "logps/rejected": -244.82583618164062, + "loss": 0.0432, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.025343740358948708, + "rewards/margins": 0.0616876594722271, + "rewards/rejected": -0.08703140914440155, + "step": 1610 + }, + { + "epoch": 0.42, + "learning_rate": 3.5658536128085623e-06, + "logits/chosen": 0.9632102847099304, + "logits/rejected": 1.0290127992630005, + "logps/chosen": -252.96798706054688, + "logps/rejected": -247.17739868164062, + "loss": 0.0346, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.03504541888833046, + "rewards/margins": 0.056658290326595306, + "rewards/rejected": -0.09170371294021606, + "step": 1620 + }, + { + "epoch": 0.43, + "learning_rate": 3.545145015558399e-06, + "logits/chosen": 0.9802696108818054, + "logits/rejected": 1.0333788394927979, + "logps/chosen": -302.0663146972656, + "logps/rejected": -291.22027587890625, + "loss": 0.0495, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03271108120679855, + "rewards/margins": 0.047079406678676605, + "rewards/rejected": -0.07979048788547516, + "step": 1630 + }, + { + "epoch": 0.43, + "learning_rate": 3.5243491490002056e-06, + "logits/chosen": 0.9676446914672852, + "logits/rejected": 1.0016578435897827, + "logps/chosen": -277.4888000488281, + "logps/rejected": -244.8700714111328, + "loss": 0.0395, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.036362119019031525, + "rewards/margins": 0.042628705501556396, + "rewards/rejected": -0.07899081707000732, + "step": 1640 + }, + { + "epoch": 0.43, + "learning_rate": 3.503467749582857e-06, + "logits/chosen": 0.955204963684082, + "logits/rejected": 0.9823210835456848, + "logps/chosen": -251.63296508789062, + "logps/rejected": -207.33932495117188, + "loss": 0.0357, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0370803065598011, + "rewards/margins": 0.05566862225532532, + "rewards/rejected": -0.09274892508983612, + "step": 1650 + }, + { + "epoch": 0.43, + "learning_rate": 3.4825025608971947e-06, + "logits/chosen": 0.9128938913345337, + "logits/rejected": 1.0501888990402222, + "logps/chosen": -319.082275390625, + "logps/rejected": -253.41708374023438, + "loss": 0.0321, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.04241309314966202, + "rewards/margins": 0.05771785229444504, + "rewards/rejected": -0.10013093799352646, + "step": 1660 + }, + { + "epoch": 0.44, + "learning_rate": 3.4614553335304407e-06, + "logits/chosen": 0.9686363935470581, + "logits/rejected": 1.0950806140899658, + "logps/chosen": -281.142333984375, + "logps/rejected": -243.0208740234375, + "loss": 0.0393, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.05112028867006302, + "rewards/margins": 0.06849656254053116, + "rewards/rejected": -0.11961684376001358, + "step": 1670 + }, + { + "epoch": 0.44, + "learning_rate": 3.4403278249200222e-06, + "logits/chosen": 0.9172054529190063, + "logits/rejected": 1.0074876546859741, + "logps/chosen": -262.1912536621094, + "logps/rejected": -244.34036254882812, + "loss": 0.038, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.050746072083711624, + "rewards/margins": 0.04991786926984787, + "rewards/rejected": -0.10066394507884979, + "step": 1680 + }, + { + "epoch": 0.44, + "learning_rate": 3.4191217992068293e-06, + "logits/chosen": 0.9850749969482422, + "logits/rejected": 0.9730724096298218, + "logps/chosen": -262.35345458984375, + "logps/rejected": -260.61895751953125, + "loss": 0.0328, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.06715109199285507, + "rewards/margins": 0.05269361659884453, + "rewards/rejected": -0.1198447123169899, + "step": 1690 + }, + { + "epoch": 0.44, + "learning_rate": 3.3978390270879056e-06, + "logits/chosen": 0.9383825063705444, + "logits/rejected": 1.040351152420044, + "logps/chosen": -299.37066650390625, + "logps/rejected": -256.8703918457031, + "loss": 0.0405, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.047446489334106445, + "rewards/margins": 0.05738651007413864, + "rewards/rejected": -0.10483300685882568, + "step": 1700 + }, + { + "epoch": 0.44, + "eval_logits/chosen": 0.9321679472923279, + "eval_logits/rejected": 1.0312304496765137, + "eval_logps/chosen": -282.8528747558594, + "eval_logps/rejected": -254.86965942382812, + "eval_loss": 0.03552675619721413, + "eval_rewards/accuracies": 0.6334999799728394, + "eval_rewards/chosen": -0.05306074023246765, + "eval_rewards/margins": 0.05747658386826515, + "eval_rewards/rejected": -0.1105373352766037, + "eval_runtime": 539.0945, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.927, + "step": 1700 + }, + { + "epoch": 0.45, + "learning_rate": 3.3764812856685995e-06, + "logits/chosen": 0.9679194688796997, + "logits/rejected": 0.9970897436141968, + "logps/chosen": -294.0511169433594, + "logps/rejected": -265.82025146484375, + "loss": 0.0297, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04057580232620239, + "rewards/margins": 0.06438260525465012, + "rewards/rejected": -0.1049584150314331, + "step": 1710 + }, + { + "epoch": 0.45, + "learning_rate": 3.3550503583141726e-06, + "logits/chosen": 0.9520149230957031, + "logits/rejected": 0.9898689985275269, + "logps/chosen": -327.1080627441406, + "logps/rejected": -308.2015686035156, + "loss": 0.0303, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.045171838253736496, + "rewards/margins": 0.06369863450527191, + "rewards/rejected": -0.1088704839348793, + "step": 1720 + }, + { + "epoch": 0.45, + "learning_rate": 3.3335480345008907e-06, + "logits/chosen": 0.9297927618026733, + "logits/rejected": 1.0150766372680664, + "logps/chosen": -266.30914306640625, + "logps/rejected": -229.8489532470703, + "loss": 0.0462, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.0414881631731987, + "rewards/margins": 0.0664457231760025, + "rewards/rejected": -0.1079338937997818, + "step": 1730 + }, + { + "epoch": 0.46, + "learning_rate": 3.3119761096666055e-06, + "logits/chosen": 1.0308209657669067, + "logits/rejected": 1.0126601457595825, + "logps/chosen": -317.2650451660156, + "logps/rejected": -265.3246765136719, + "loss": 0.04, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.06478316336870193, + "rewards/margins": 0.04602901265025139, + "rewards/rejected": -0.11081217229366302, + "step": 1740 + }, + { + "epoch": 0.46, + "learning_rate": 3.290336385060832e-06, + "logits/chosen": 0.9886214137077332, + "logits/rejected": 0.9748364686965942, + "logps/chosen": -289.01776123046875, + "logps/rejected": -267.6888427734375, + "loss": 0.0375, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05019726604223251, + "rewards/margins": 0.06619967520236969, + "rewards/rejected": -0.1163969412446022, + "step": 1750 + }, + { + "epoch": 0.46, + "learning_rate": 3.268630667594348e-06, + "logits/chosen": 0.9688504934310913, + "logits/rejected": 1.0535883903503418, + "logps/chosen": -284.36700439453125, + "logps/rejected": -266.2789306640625, + "loss": 0.033, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.05331147834658623, + "rewards/margins": 0.05179852992296219, + "rewards/rejected": -0.10510998964309692, + "step": 1760 + }, + { + "epoch": 0.46, + "learning_rate": 3.2468607696883147e-06, + "logits/chosen": 0.9664725065231323, + "logits/rejected": 1.0324318408966064, + "logps/chosen": -302.47113037109375, + "logps/rejected": -258.18365478515625, + "loss": 0.0377, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.054223883897066116, + "rewards/margins": 0.06361141800880432, + "rewards/rejected": -0.11783530563116074, + "step": 1770 + }, + { + "epoch": 0.47, + "learning_rate": 3.225028509122944e-06, + "logits/chosen": 0.9445089101791382, + "logits/rejected": 1.0780378580093384, + "logps/chosen": -306.8780212402344, + "logps/rejected": -259.9053955078125, + "loss": 0.0198, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.02363925240933895, + "rewards/margins": 0.06653538346290588, + "rewards/rejected": -0.09017463773488998, + "step": 1780 + }, + { + "epoch": 0.47, + "learning_rate": 3.2031357088857083e-06, + "logits/chosen": 1.0200811624526978, + "logits/rejected": 1.0404508113861084, + "logps/chosen": -275.5518493652344, + "logps/rejected": -261.8308410644531, + "loss": 0.0388, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04572081193327904, + "rewards/margins": 0.060186631977558136, + "rewards/rejected": -0.10590744018554688, + "step": 1790 + }, + { + "epoch": 0.47, + "learning_rate": 3.181184197019127e-06, + "logits/chosen": 1.0228135585784912, + "logits/rejected": 0.9979068636894226, + "logps/chosen": -282.7207946777344, + "logps/rejected": -236.26962280273438, + "loss": 0.0352, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.03686892241239548, + "rewards/margins": 0.06626715511083603, + "rewards/rejected": -0.1031360775232315, + "step": 1800 + }, + { + "epoch": 0.47, + "eval_logits/chosen": 0.9538518190383911, + "eval_logits/rejected": 1.053285002708435, + "eval_logps/chosen": -281.23944091796875, + "eval_logps/rejected": -253.37205505371094, + "eval_loss": 0.03543499857187271, + "eval_rewards/accuracies": 0.621999979019165, + "eval_rewards/chosen": -0.0369262732565403, + "eval_rewards/margins": 0.05863497406244278, + "eval_rewards/rejected": -0.09556125104427338, + "eval_runtime": 539.174, + "eval_samples_per_second": 3.709, + "eval_steps_per_second": 0.927, + "step": 1800 + }, + { + "epoch": 0.47, + "learning_rate": 3.159175806468126e-06, + "logits/chosen": 0.9597219228744507, + "logits/rejected": 1.0474979877471924, + "logps/chosen": -284.355712890625, + "logps/rejected": -276.75836181640625, + "loss": 0.0293, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.030187245458364487, + "rewards/margins": 0.052719276398420334, + "rewards/rejected": -0.08290652930736542, + "step": 1810 + }, + { + "epoch": 0.48, + "learning_rate": 3.1371123749269804e-06, + "logits/chosen": 1.0226430892944336, + "logits/rejected": 1.1090997457504272, + "logps/chosen": -282.0672607421875, + "logps/rejected": -252.12680053710938, + "loss": 0.0356, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.027882922440767288, + "rewards/margins": 0.07020456343889236, + "rewards/rejected": -0.09808747470378876, + "step": 1820 + }, + { + "epoch": 0.48, + "learning_rate": 3.114995744685877e-06, + "logits/chosen": 1.0181407928466797, + "logits/rejected": 0.9908281564712524, + "logps/chosen": -258.36480712890625, + "logps/rejected": -240.61569213867188, + "loss": 0.0308, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.017861558124423027, + "rewards/margins": 0.07849525660276413, + "rewards/rejected": -0.0963568240404129, + "step": 1830 + }, + { + "epoch": 0.48, + "learning_rate": 3.0928277624770743e-06, + "logits/chosen": 0.9835958480834961, + "logits/rejected": 1.0549378395080566, + "logps/chosen": -285.7489013671875, + "logps/rejected": -232.4912109375, + "loss": 0.0297, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.011253233067691326, + "rewards/margins": 0.08058271557092667, + "rewards/rejected": -0.09183595329523087, + "step": 1840 + }, + { + "epoch": 0.48, + "learning_rate": 3.070610279320708e-06, + "logits/chosen": 0.9505823850631714, + "logits/rejected": 1.0996264219284058, + "logps/chosen": -286.63714599609375, + "logps/rejected": -233.0901336669922, + "loss": 0.0359, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.030136246234178543, + "rewards/margins": 0.05022455379366875, + "rewards/rejected": -0.0803607925772667, + "step": 1850 + }, + { + "epoch": 0.49, + "learning_rate": 3.0483451503702264e-06, + "logits/chosen": 1.0030499696731567, + "logits/rejected": 1.0658283233642578, + "logps/chosen": -287.8111877441406, + "logps/rejected": -264.3537292480469, + "loss": 0.043, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.020662177354097366, + "rewards/margins": 0.05197754502296448, + "rewards/rejected": -0.07263971865177155, + "step": 1860 + }, + { + "epoch": 0.49, + "learning_rate": 3.0260342347574916e-06, + "logits/chosen": 1.0229995250701904, + "logits/rejected": 1.0849655866622925, + "logps/chosen": -299.0933837890625, + "logps/rejected": -248.93527221679688, + "loss": 0.0295, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.036781955510377884, + "rewards/margins": 0.0717974454164505, + "rewards/rejected": -0.10857941210269928, + "step": 1870 + }, + { + "epoch": 0.49, + "learning_rate": 3.0036793954375358e-06, + "logits/chosen": 0.9629353284835815, + "logits/rejected": 1.0485321283340454, + "logps/chosen": -279.51690673828125, + "logps/rejected": -242.5545196533203, + "loss": 0.0355, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.01313974242657423, + "rewards/margins": 0.07288579642772675, + "rewards/rejected": -0.08602554351091385, + "step": 1880 + }, + { + "epoch": 0.49, + "learning_rate": 2.981282499033009e-06, + "logits/chosen": 0.9632253646850586, + "logits/rejected": 1.0230735540390015, + "logps/chosen": -293.87286376953125, + "logps/rejected": -243.94613647460938, + "loss": 0.0359, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.022772405296564102, + "rewards/margins": 0.05214250087738037, + "rewards/rejected": -0.07491490244865417, + "step": 1890 + }, + { + "epoch": 0.5, + "learning_rate": 2.9588454156783163e-06, + "logits/chosen": 1.0420585870742798, + "logits/rejected": 1.01620614528656, + "logps/chosen": -277.5963439941406, + "logps/rejected": -251.56881713867188, + "loss": 0.0392, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.020175188779830933, + "rewards/margins": 0.05704687908291817, + "rewards/rejected": -0.0772220641374588, + "step": 1900 + }, + { + "epoch": 0.5, + "eval_logits/chosen": 0.9508064389228821, + "eval_logits/rejected": 1.0498266220092773, + "eval_logps/chosen": -280.359375, + "eval_logps/rejected": -252.41934204101562, + "eval_loss": 0.03548915684223175, + "eval_rewards/accuracies": 0.6209999918937683, + "eval_rewards/chosen": -0.028125399723649025, + "eval_rewards/margins": 0.05790869519114494, + "eval_rewards/rejected": -0.08603409677743912, + "eval_runtime": 539.0909, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.927, + "step": 1900 + }, + { + "epoch": 0.5, + "learning_rate": 2.9363700188634597e-06, + "logits/chosen": 0.9709011912345886, + "logits/rejected": 1.0315817594528198, + "logps/chosen": -271.8665771484375, + "logps/rejected": -249.4420928955078, + "loss": 0.031, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.03171471506357193, + "rewards/margins": 0.04039089381694794, + "rewards/rejected": -0.07210560888051987, + "step": 1910 + }, + { + "epoch": 0.5, + "learning_rate": 2.9138581852776053e-06, + "logits/chosen": 0.9720270037651062, + "logits/rejected": 1.086348056793213, + "logps/chosen": -276.81109619140625, + "logps/rejected": -262.7275085449219, + "loss": 0.0384, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.02635134384036064, + "rewards/margins": 0.07241298258304596, + "rewards/rejected": -0.0987643226981163, + "step": 1920 + }, + { + "epoch": 0.51, + "learning_rate": 2.8913117946523805e-06, + "logits/chosen": 1.0553147792816162, + "logits/rejected": 1.024316430091858, + "logps/chosen": -261.1982116699219, + "logps/rejected": -221.4432373046875, + "loss": 0.0406, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.041058607399463654, + "rewards/margins": 0.05773719400167465, + "rewards/rejected": -0.0987958088517189, + "step": 1930 + }, + { + "epoch": 0.51, + "learning_rate": 2.8687327296049126e-06, + "logits/chosen": 0.9523100852966309, + "logits/rejected": 1.032663345336914, + "logps/chosen": -280.92706298828125, + "logps/rejected": -259.6051940917969, + "loss": 0.0323, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.02694753371179104, + "rewards/margins": 0.04815928265452385, + "rewards/rejected": -0.07510681450366974, + "step": 1940 + }, + { + "epoch": 0.51, + "learning_rate": 2.8461228754806376e-06, + "logits/chosen": 0.9898034930229187, + "logits/rejected": 1.073132038116455, + "logps/chosen": -250.3188018798828, + "logps/rejected": -242.313232421875, + "loss": 0.0325, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.03240308165550232, + "rewards/margins": 0.05959530547261238, + "rewards/rejected": -0.0919983834028244, + "step": 1950 + }, + { + "epoch": 0.51, + "learning_rate": 2.823484120195865e-06, + "logits/chosen": 1.0166945457458496, + "logits/rejected": 0.9883760213851929, + "logps/chosen": -209.08822631835938, + "logps/rejected": -223.0111846923828, + "loss": 0.0343, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.029585499316453934, + "rewards/margins": 0.055918287485837936, + "rewards/rejected": -0.08550377935171127, + "step": 1960 + }, + { + "epoch": 0.52, + "learning_rate": 2.8008183540801486e-06, + "logits/chosen": 0.9123330116271973, + "logits/rejected": 1.0141832828521729, + "logps/chosen": -288.05084228515625, + "logps/rejected": -267.970458984375, + "loss": 0.0307, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.030767519026994705, + "rewards/margins": 0.04243772476911545, + "rewards/rejected": -0.07320524752140045, + "step": 1970 + }, + { + "epoch": 0.52, + "learning_rate": 2.7781274697184353e-06, + "logits/chosen": 0.9999529123306274, + "logits/rejected": 1.0701429843902588, + "logps/chosen": -274.42816162109375, + "logps/rejected": -255.3940887451172, + "loss": 0.0341, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03205768018960953, + "rewards/margins": 0.0669126957654953, + "rewards/rejected": -0.09897039085626602, + "step": 1980 + }, + { + "epoch": 0.52, + "learning_rate": 2.7554133617930397e-06, + "logits/chosen": 1.0071890354156494, + "logits/rejected": 1.0170022249221802, + "logps/chosen": -272.82794189453125, + "logps/rejected": -268.2226867675781, + "loss": 0.035, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03548605367541313, + "rewards/margins": 0.03869449347257614, + "rewards/rejected": -0.07418055832386017, + "step": 1990 + }, + { + "epoch": 0.52, + "learning_rate": 2.7326779269254363e-06, + "logits/chosen": 0.945580780506134, + "logits/rejected": 1.0344423055648804, + "logps/chosen": -254.6405792236328, + "logps/rejected": -229.8059844970703, + "loss": 0.0368, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.025666531175374985, + "rewards/margins": 0.0334133505821228, + "rewards/rejected": -0.059079885482788086, + "step": 2000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": 0.9577403664588928, + "eval_logits/rejected": 1.0563304424285889, + "eval_logps/chosen": -279.8615417480469, + "eval_logps/rejected": -251.5159149169922, + "eval_loss": 0.0354422889649868, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -0.023147189989686012, + "eval_rewards/margins": 0.053852878510951996, + "eval_rewards/rejected": -0.07700006663799286, + "eval_runtime": 539.213, + "eval_samples_per_second": 3.709, + "eval_steps_per_second": 0.927, + "step": 2000 + }, + { + "epoch": 0.53, + "learning_rate": 2.7099230635178954e-06, + "logits/chosen": 0.9786656498908997, + "logits/rejected": 1.0951110124588013, + "logps/chosen": -250.3948974609375, + "logps/rejected": -213.85202026367188, + "loss": 0.0432, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.015626171603798866, + "rewards/margins": 0.0794781818985939, + "rewards/rejected": -0.09510435163974762, + "step": 2010 + }, + { + "epoch": 0.53, + "learning_rate": 2.6871506715949608e-06, + "logits/chosen": 1.0892430543899536, + "logits/rejected": 1.11086106300354, + "logps/chosen": -277.72430419921875, + "logps/rejected": -263.95916748046875, + "loss": 0.0435, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.028381651267409325, + "rewards/margins": 0.07892771810293198, + "rewards/rejected": -0.10730937868356705, + "step": 2020 + }, + { + "epoch": 0.53, + "learning_rate": 2.6643626526448063e-06, + "logits/chosen": 1.0042452812194824, + "logits/rejected": 1.097080945968628, + "logps/chosen": -245.4960479736328, + "logps/rejected": -252.0270233154297, + "loss": 0.0336, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.027020296081900597, + "rewards/margins": 0.0670652836561203, + "rewards/rejected": -0.09408558160066605, + "step": 2030 + }, + { + "epoch": 0.53, + "learning_rate": 2.6415609094604562e-06, + "logits/chosen": 0.9564634561538696, + "logits/rejected": 1.0937979221343994, + "logps/chosen": -304.175537109375, + "logps/rejected": -259.8273010253906, + "loss": 0.0401, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.035849399864673615, + "rewards/margins": 0.067509725689888, + "rewards/rejected": -0.10335911810398102, + "step": 2040 + }, + { + "epoch": 0.54, + "learning_rate": 2.618747345980904e-06, + "logits/chosen": 0.9391676783561707, + "logits/rejected": 1.0340735912322998, + "logps/chosen": -289.8169860839844, + "logps/rejected": -250.7114715576172, + "loss": 0.0301, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.02765297330915928, + "rewards/margins": 0.06791722774505615, + "rewards/rejected": -0.09557019919157028, + "step": 2050 + }, + { + "epoch": 0.54, + "learning_rate": 2.595923867132136e-06, + "logits/chosen": 1.0433982610702515, + "logits/rejected": 1.1006929874420166, + "logps/chosen": -227.78182983398438, + "logps/rejected": -231.31103515625, + "loss": 0.0364, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.024369673803448677, + "rewards/margins": 0.061494432389736176, + "rewards/rejected": -0.0858640968799591, + "step": 2060 + }, + { + "epoch": 0.54, + "learning_rate": 2.5730923786680672e-06, + "logits/chosen": 1.0085296630859375, + "logits/rejected": 1.0929278135299683, + "logps/chosen": -275.10479736328125, + "logps/rejected": -230.3509521484375, + "loss": 0.0339, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.031941868364810944, + "rewards/margins": 0.05682260915637016, + "rewards/rejected": -0.08876447379589081, + "step": 2070 + }, + { + "epoch": 0.54, + "learning_rate": 2.5502547870114137e-06, + "logits/chosen": 1.0008578300476074, + "logits/rejected": 1.0413917303085327, + "logps/chosen": -231.7599639892578, + "logps/rejected": -252.7259063720703, + "loss": 0.0245, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.024501001462340355, + "rewards/margins": 0.04970362037420273, + "rewards/rejected": -0.07420462369918823, + "step": 2080 + }, + { + "epoch": 0.55, + "learning_rate": 2.527412999094507e-06, + "logits/chosen": 1.006413459777832, + "logits/rejected": 1.048346996307373, + "logps/chosen": -283.4499816894531, + "logps/rejected": -256.1369323730469, + "loss": 0.0367, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03352126479148865, + "rewards/margins": 0.05489424616098404, + "rewards/rejected": -0.08841550350189209, + "step": 2090 + }, + { + "epoch": 0.55, + "learning_rate": 2.504568922200064e-06, + "logits/chosen": 0.9541667699813843, + "logits/rejected": 1.0082509517669678, + "logps/chosen": -265.67510986328125, + "logps/rejected": -248.1321258544922, + "loss": 0.0326, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.037846438586711884, + "rewards/margins": 0.04263025149703026, + "rewards/rejected": -0.08047669380903244, + "step": 2100 + }, + { + "epoch": 0.55, + "eval_logits/chosen": 0.9759756326675415, + "eval_logits/rejected": 1.075065016746521, + "eval_logps/chosen": -281.1431884765625, + "eval_logps/rejected": -252.96302795410156, + "eval_loss": 0.03518374264240265, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -0.03596383333206177, + "eval_rewards/margins": 0.055507466197013855, + "eval_rewards/rejected": -0.09147130697965622, + "eval_runtime": 539.005, + "eval_samples_per_second": 3.711, + "eval_steps_per_second": 0.928, + "step": 2100 + }, + { + "epoch": 0.55, + "learning_rate": 2.4817244638019333e-06, + "logits/chosen": 0.9972022175788879, + "logits/rejected": 1.1107069253921509, + "logps/chosen": -259.2168884277344, + "logps/rejected": -252.2943878173828, + "loss": 0.0356, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.034331098198890686, + "rewards/margins": 0.06633375585079193, + "rewards/rejected": -0.10066483914852142, + "step": 2110 + }, + { + "epoch": 0.55, + "learning_rate": 2.4588815314058155e-06, + "logits/chosen": 0.979550838470459, + "logits/rejected": 1.015853762626648, + "logps/chosen": -308.5239562988281, + "logps/rejected": -290.33526611328125, + "loss": 0.0363, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.033022016286849976, + "rewards/margins": 0.03916890174150467, + "rewards/rejected": -0.07219092547893524, + "step": 2120 + }, + { + "epoch": 0.56, + "learning_rate": 2.4360420323899922e-06, + "logits/chosen": 0.9291566610336304, + "logits/rejected": 1.0274138450622559, + "logps/chosen": -300.00830078125, + "logps/rejected": -246.3135528564453, + "loss": 0.0364, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.0385669507086277, + "rewards/margins": 0.0751514807343483, + "rewards/rejected": -0.1137184128165245, + "step": 2130 + }, + { + "epoch": 0.56, + "learning_rate": 2.4132078738460585e-06, + "logits/chosen": 1.0100805759429932, + "logits/rejected": 1.0035889148712158, + "logps/chosen": -273.31329345703125, + "logps/rejected": -260.4046630859375, + "loss": 0.0345, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.042252346873283386, + "rewards/margins": 0.04604203626513481, + "rewards/rejected": -0.0882943868637085, + "step": 2140 + }, + { + "epoch": 0.56, + "learning_rate": 2.3903809624196826e-06, + "logits/chosen": 1.0183110237121582, + "logits/rejected": 1.1255147457122803, + "logps/chosen": -260.5460205078125, + "logps/rejected": -231.55355834960938, + "loss": 0.0355, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.04934290796518326, + "rewards/margins": 0.057183485478162766, + "rewards/rejected": -0.10652639716863632, + "step": 2150 + }, + { + "epoch": 0.57, + "learning_rate": 2.3675632041513978e-06, + "logits/chosen": 1.0892599821090698, + "logits/rejected": 1.1318366527557373, + "logps/chosen": -243.21102905273438, + "logps/rejected": -225.78549194335938, + "loss": 0.0351, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.05033104866743088, + "rewards/margins": 0.03708335757255554, + "rewards/rejected": -0.08741440623998642, + "step": 2160 + }, + { + "epoch": 0.57, + "learning_rate": 2.3447565043174533e-06, + "logits/chosen": 1.0009520053863525, + "logits/rejected": 0.9941670298576355, + "logps/chosen": -273.72149658203125, + "logps/rejected": -250.0045166015625, + "loss": 0.0432, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.04186940938234329, + "rewards/margins": 0.05003537982702255, + "rewards/rejected": -0.09190478920936584, + "step": 2170 + }, + { + "epoch": 0.57, + "learning_rate": 2.321962767270724e-06, + "logits/chosen": 1.0270161628723145, + "logits/rejected": 1.0816559791564941, + "logps/chosen": -258.2284240722656, + "logps/rejected": -254.64871215820312, + "loss": 0.0411, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04245874285697937, + "rewards/margins": 0.06636542826890945, + "rewards/rejected": -0.10882417112588882, + "step": 2180 + }, + { + "epoch": 0.57, + "learning_rate": 2.299183896281692e-06, + "logits/chosen": 0.9619997143745422, + "logits/rejected": 1.1526567935943604, + "logps/chosen": -272.2848815917969, + "logps/rejected": -232.285888671875, + "loss": 0.036, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.03838383033871651, + "rewards/margins": 0.07473193854093552, + "rewards/rejected": -0.11311577260494232, + "step": 2190 + }, + { + "epoch": 0.58, + "learning_rate": 2.2764217933795297e-06, + "logits/chosen": 1.0228240489959717, + "logits/rejected": 0.9570671319961548, + "logps/chosen": -289.8697204589844, + "logps/rejected": -259.46624755859375, + "loss": 0.0368, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04657725244760513, + "rewards/margins": 0.04652193933725357, + "rewards/rejected": -0.0930991917848587, + "step": 2200 + }, + { + "epoch": 0.58, + "eval_logits/chosen": 0.9639849066734314, + "eval_logits/rejected": 1.064185619354248, + "eval_logps/chosen": -281.45947265625, + "eval_logps/rejected": -253.4691162109375, + "eval_loss": 0.0351751483976841, + "eval_rewards/accuracies": 0.6345000267028809, + "eval_rewards/chosen": -0.03912654146552086, + "eval_rewards/margins": 0.057405244559049606, + "eval_rewards/rejected": -0.09653179347515106, + "eval_runtime": 539.0332, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.928, + "step": 2200 + }, + { + "epoch": 0.58, + "learning_rate": 2.2536783591932786e-06, + "logits/chosen": 1.1154290437698364, + "logits/rejected": 1.016729712486267, + "logps/chosen": -273.88702392578125, + "logps/rejected": -253.8929901123047, + "loss": 0.0307, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.034655872732400894, + "rewards/margins": 0.0648706778883934, + "rewards/rejected": -0.0995265543460846, + "step": 2210 + }, + { + "epoch": 0.58, + "learning_rate": 2.230955492793149e-06, + "logits/chosen": 1.015891194343567, + "logits/rejected": 1.1429827213287354, + "logps/chosen": -274.21929931640625, + "logps/rejected": -235.86441040039062, + "loss": 0.0409, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.025577425956726074, + "rewards/margins": 0.05668734759092331, + "rewards/rejected": -0.08226476609706879, + "step": 2220 + }, + { + "epoch": 0.58, + "learning_rate": 2.208255091531947e-06, + "logits/chosen": 1.0092315673828125, + "logits/rejected": 1.031456708908081, + "logps/chosen": -266.92181396484375, + "logps/rejected": -221.04495239257812, + "loss": 0.0424, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.02549988031387329, + "rewards/margins": 0.06533181667327881, + "rewards/rejected": -0.0908316969871521, + "step": 2230 + }, + { + "epoch": 0.59, + "learning_rate": 2.1855790508866435e-06, + "logits/chosen": 0.9829255938529968, + "logits/rejected": 1.0525522232055664, + "logps/chosen": -292.2393798828125, + "logps/rejected": -274.6322326660156, + "loss": 0.0217, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.024583814665675163, + "rewards/margins": 0.04256455600261688, + "rewards/rejected": -0.0671483725309372, + "step": 2240 + }, + { + "epoch": 0.59, + "learning_rate": 2.162929264300107e-06, + "logits/chosen": 1.036615014076233, + "logits/rejected": 1.1294059753417969, + "logps/chosen": -254.82858276367188, + "logps/rejected": -225.80612182617188, + "loss": 0.0454, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03245388716459274, + "rewards/margins": 0.05736144259572029, + "rewards/rejected": -0.08981534093618393, + "step": 2250 + }, + { + "epoch": 0.59, + "learning_rate": 2.1403076230230006e-06, + "logits/chosen": 1.020202875137329, + "logits/rejected": 1.0851867198944092, + "logps/chosen": -270.38360595703125, + "logps/rejected": -208.3204345703125, + "loss": 0.0419, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.040919043123722076, + "rewards/margins": 0.04137270897626877, + "rewards/rejected": -0.08229174464941025, + "step": 2260 + }, + { + "epoch": 0.59, + "learning_rate": 2.11771601595586e-06, + "logits/chosen": 1.0087039470672607, + "logits/rejected": 1.0265519618988037, + "logps/chosen": -259.2693786621094, + "logps/rejected": -261.12091064453125, + "loss": 0.0282, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.017938809469342232, + "rewards/margins": 0.04762765020132065, + "rewards/rejected": -0.06556645780801773, + "step": 2270 + }, + { + "epoch": 0.6, + "learning_rate": 2.0951563294913737e-06, + "logits/chosen": 0.9380186796188354, + "logits/rejected": 1.0882261991500854, + "logps/chosen": -263.954345703125, + "logps/rejected": -221.87899780273438, + "loss": 0.0331, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.033337127417325974, + "rewards/margins": 0.04793107137084007, + "rewards/rejected": -0.08126820623874664, + "step": 2280 + }, + { + "epoch": 0.6, + "learning_rate": 2.0726304473568693e-06, + "logits/chosen": 1.0044240951538086, + "logits/rejected": 1.0548676252365112, + "logps/chosen": -250.0818634033203, + "logps/rejected": -269.6571960449219, + "loss": 0.0306, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.025777125731110573, + "rewards/margins": 0.05805457755923271, + "rewards/rejected": -0.08383170515298843, + "step": 2290 + }, + { + "epoch": 0.6, + "learning_rate": 2.050140250457023e-06, + "logits/chosen": 0.9972747564315796, + "logits/rejected": 0.9751909375190735, + "logps/chosen": -250.11572265625, + "logps/rejected": -239.9993438720703, + "loss": 0.0315, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.019835414364933968, + "rewards/margins": 0.05462411791086197, + "rewards/rejected": -0.07445952296257019, + "step": 2300 + }, + { + "epoch": 0.6, + "eval_logits/chosen": 0.9676101803779602, + "eval_logits/rejected": 1.0684521198272705, + "eval_logps/chosen": -280.0627746582031, + "eval_logps/rejected": -251.82423400878906, + "eval_loss": 0.03506240248680115, + "eval_rewards/accuracies": 0.6330000162124634, + "eval_rewards/chosen": -0.025159668177366257, + "eval_rewards/margins": 0.05492350831627846, + "eval_rewards/rejected": -0.08008317649364471, + "eval_runtime": 538.9096, + "eval_samples_per_second": 3.711, + "eval_steps_per_second": 0.928, + "step": 2300 + }, + { + "epoch": 0.6, + "learning_rate": 2.0276876167168042e-06, + "logits/chosen": 0.9994446039199829, + "logits/rejected": 1.1471552848815918, + "logps/chosen": -285.36883544921875, + "logps/rejected": -264.88861083984375, + "loss": 0.0435, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.021740112453699112, + "rewards/margins": 0.062014125287532806, + "rewards/rejected": -0.08375424146652222, + "step": 2310 + }, + { + "epoch": 0.61, + "learning_rate": 2.0052744209246682e-06, + "logits/chosen": 0.9745758771896362, + "logits/rejected": 1.0511457920074463, + "logps/chosen": -258.8453674316406, + "logps/rejected": -252.9406280517578, + "loss": 0.0345, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.029231492429971695, + "rewards/margins": 0.04611852020025253, + "rewards/rejected": -0.07535000890493393, + "step": 2320 + }, + { + "epoch": 0.61, + "learning_rate": 1.9829025345760127e-06, + "logits/chosen": 0.9900597333908081, + "logits/rejected": 1.0469316244125366, + "logps/chosen": -315.5625305175781, + "logps/rejected": -272.3370361328125, + "loss": 0.0276, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.026207396760582924, + "rewards/margins": 0.05628987401723862, + "rewards/rejected": -0.08249727636575699, + "step": 2330 + }, + { + "epoch": 0.61, + "learning_rate": 1.9605738257169115e-06, + "logits/chosen": 1.0605112314224243, + "logits/rejected": 1.0293577909469604, + "logps/chosen": -249.8504638671875, + "logps/rejected": -250.99655151367188, + "loss": 0.0377, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.038626205176115036, + "rewards/margins": 0.04553366079926491, + "rewards/rejected": -0.08415986597537994, + "step": 2340 + }, + { + "epoch": 0.62, + "learning_rate": 1.9382901587881275e-06, + "logits/chosen": 0.9544248580932617, + "logits/rejected": 1.0862176418304443, + "logps/chosen": -259.606689453125, + "logps/rejected": -246.25537109375, + "loss": 0.0303, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03279640153050423, + "rewards/margins": 0.049646954983472824, + "rewards/rejected": -0.08244334906339645, + "step": 2350 + }, + { + "epoch": 0.62, + "learning_rate": 1.916053394469437e-06, + "logits/chosen": 0.9803518056869507, + "logits/rejected": 1.0460015535354614, + "logps/chosen": -227.25918579101562, + "logps/rejected": -231.8365478515625, + "loss": 0.0443, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.025378238409757614, + "rewards/margins": 0.05420111492276192, + "rewards/rejected": -0.07957935333251953, + "step": 2360 + }, + { + "epoch": 0.62, + "learning_rate": 1.8938653895242604e-06, + "logits/chosen": 0.9943161010742188, + "logits/rejected": 1.0234358310699463, + "logps/chosen": -242.02169799804688, + "logps/rejected": -225.488525390625, + "loss": 0.0387, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0067793638445436954, + "rewards/margins": 0.06738194823265076, + "rewards/rejected": -0.07416132837533951, + "step": 2370 + }, + { + "epoch": 0.62, + "learning_rate": 1.8717279966446267e-06, + "logits/chosen": 0.9629790186882019, + "logits/rejected": 1.0603584051132202, + "logps/chosen": -270.99151611328125, + "logps/rejected": -255.68594360351562, + "loss": 0.0378, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.018644142895936966, + "rewards/margins": 0.057522498071193695, + "rewards/rejected": -0.07616663724184036, + "step": 2380 + }, + { + "epoch": 0.63, + "learning_rate": 1.8496430642964698e-06, + "logits/chosen": 0.9359694719314575, + "logits/rejected": 1.0587961673736572, + "logps/chosen": -254.59310913085938, + "logps/rejected": -238.82177734375, + "loss": 0.0316, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.014369276352226734, + "rewards/margins": 0.056524503976106644, + "rewards/rejected": -0.0708937793970108, + "step": 2390 + }, + { + "epoch": 0.63, + "learning_rate": 1.827612436565286e-06, + "logits/chosen": 0.947625458240509, + "logits/rejected": 1.0177226066589355, + "logps/chosen": -253.84335327148438, + "logps/rejected": -233.13687133789062, + "loss": 0.0341, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.021341439336538315, + "rewards/margins": 0.062166161835193634, + "rewards/rejected": -0.08350759744644165, + "step": 2400 + }, + { + "epoch": 0.63, + "eval_logits/chosen": 0.940484881401062, + "eval_logits/rejected": 1.0420405864715576, + "eval_logps/chosen": -279.9447021484375, + "eval_logps/rejected": -251.8425750732422, + "eval_loss": 0.0352231003344059, + "eval_rewards/accuracies": 0.6320000290870667, + "eval_rewards/chosen": -0.023978877812623978, + "eval_rewards/margins": 0.05628751218318939, + "eval_rewards/rejected": -0.08026638627052307, + "eval_runtime": 539.1415, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.927, + "step": 2400 + }, + { + "epoch": 0.63, + "learning_rate": 1.8056379530021492e-06, + "logits/chosen": 0.9212465286254883, + "logits/rejected": 1.0563522577285767, + "logps/chosen": -255.9601287841797, + "logps/rejected": -224.5750274658203, + "loss": 0.044, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.024405932053923607, + "rewards/margins": 0.059529535472393036, + "rewards/rejected": -0.0839354619383812, + "step": 2410 + }, + { + "epoch": 0.63, + "learning_rate": 1.7837214484701154e-06, + "logits/chosen": 1.0158764123916626, + "logits/rejected": 1.0589698553085327, + "logps/chosen": -272.985107421875, + "logps/rejected": -264.05865478515625, + "loss": 0.0395, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.02469342015683651, + "rewards/margins": 0.07536058127880096, + "rewards/rejected": -0.10005400329828262, + "step": 2420 + }, + { + "epoch": 0.64, + "learning_rate": 1.7618647529910043e-06, + "logits/chosen": 0.9243197441101074, + "logits/rejected": 1.1003185510635376, + "logps/chosen": -260.38006591796875, + "logps/rejected": -240.22866821289062, + "loss": 0.0376, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.01998872682452202, + "rewards/margins": 0.06689772009849548, + "rewards/rejected": -0.0868864506483078, + "step": 2430 + }, + { + "epoch": 0.64, + "learning_rate": 1.7400696915925996e-06, + "logits/chosen": 0.9673307538032532, + "logits/rejected": 1.0932881832122803, + "logps/chosen": -277.5309753417969, + "logps/rejected": -269.5796813964844, + "loss": 0.0248, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.010230915620923042, + "rewards/margins": 0.076592355966568, + "rewards/rejected": -0.08682326972484589, + "step": 2440 + }, + { + "epoch": 0.64, + "learning_rate": 1.718338084156254e-06, + "logits/chosen": 0.9628580212593079, + "logits/rejected": 1.0081040859222412, + "logps/chosen": -276.6579895019531, + "logps/rejected": -248.1109161376953, + "loss": 0.03, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.01852061226963997, + "rewards/margins": 0.05575736239552498, + "rewards/rejected": -0.07427798211574554, + "step": 2450 + }, + { + "epoch": 0.64, + "learning_rate": 1.6966717452649372e-06, + "logits/chosen": 0.9334288835525513, + "logits/rejected": 1.002824068069458, + "logps/chosen": -279.92205810546875, + "logps/rejected": -271.9393615722656, + "loss": 0.0263, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.017314914613962173, + "rewards/margins": 0.059098273515701294, + "rewards/rejected": -0.07641319185495377, + "step": 2460 + }, + { + "epoch": 0.65, + "learning_rate": 1.6750724840517103e-06, + "logits/chosen": 0.9612905383110046, + "logits/rejected": 1.0620540380477905, + "logps/chosen": -294.86712646484375, + "logps/rejected": -253.3771514892578, + "loss": 0.0423, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.017185799777507782, + "rewards/margins": 0.07057368010282516, + "rewards/rejected": -0.08775947988033295, + "step": 2470 + }, + { + "epoch": 0.65, + "learning_rate": 1.6535421040486686e-06, + "logits/chosen": 0.9643794894218445, + "logits/rejected": 1.0106008052825928, + "logps/chosen": -256.61090087890625, + "logps/rejected": -232.49862670898438, + "loss": 0.0411, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.018548784777522087, + "rewards/margins": 0.054013751447200775, + "rewards/rejected": -0.07256253063678741, + "step": 2480 + }, + { + "epoch": 0.65, + "learning_rate": 1.6320824030363458e-06, + "logits/chosen": 0.8658086657524109, + "logits/rejected": 1.1067800521850586, + "logps/chosen": -268.14715576171875, + "logps/rejected": -246.0467529296875, + "loss": 0.0403, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.025564253330230713, + "rewards/margins": 0.06752271950244904, + "rewards/rejected": -0.09308697283267975, + "step": 2490 + }, + { + "epoch": 0.65, + "learning_rate": 1.6106951728936028e-06, + "logits/chosen": 1.0061540603637695, + "logits/rejected": 1.109466314315796, + "logps/chosen": -231.77230834960938, + "logps/rejected": -221.00439453125, + "loss": 0.0488, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03300454095005989, + "rewards/margins": 0.05412193387746811, + "rewards/rejected": -0.0871264785528183, + "step": 2500 + }, + { + "epoch": 0.65, + "eval_logits/chosen": 0.9377838373184204, + "eval_logits/rejected": 1.039380669593811, + "eval_logps/chosen": -280.7594299316406, + "eval_logps/rejected": -252.99684143066406, + "eval_loss": 0.035037338733673096, + "eval_rewards/accuracies": 0.6340000033378601, + "eval_rewards/chosen": -0.03212602436542511, + "eval_rewards/margins": 0.059683240950107574, + "eval_rewards/rejected": -0.09180926531553268, + "eval_runtime": 539.216, + "eval_samples_per_second": 3.709, + "eval_steps_per_second": 0.927, + "step": 2500 + }, + { + "epoch": 0.66, + "learning_rate": 1.5893821994479996e-06, + "logits/chosen": 1.040175199508667, + "logits/rejected": 1.122536301612854, + "logps/chosen": -279.53924560546875, + "logps/rejected": -262.05816650390625, + "loss": 0.0313, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.026052657514810562, + "rewards/margins": 0.08227355033159256, + "rewards/rejected": -0.10832621157169342, + "step": 2510 + }, + { + "epoch": 0.66, + "learning_rate": 1.5681452623266868e-06, + "logits/chosen": 0.9631746411323547, + "logits/rejected": 1.0485169887542725, + "logps/chosen": -300.1724548339844, + "logps/rejected": -261.44134521484375, + "loss": 0.0336, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.030287887901067734, + "rewards/margins": 0.05683339759707451, + "rewards/rejected": -0.08712128549814224, + "step": 2520 + }, + { + "epoch": 0.66, + "learning_rate": 1.5469861348078014e-06, + "logits/chosen": 0.9725979566574097, + "logits/rejected": 1.0084983110427856, + "logps/chosen": -267.7999572753906, + "logps/rejected": -241.93508911132812, + "loss": 0.0452, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.027236010879278183, + "rewards/margins": 0.05542879179120064, + "rewards/rejected": -0.08266480267047882, + "step": 2530 + }, + { + "epoch": 0.66, + "learning_rate": 1.5259065836724035e-06, + "logits/chosen": 0.9311686754226685, + "logits/rejected": 1.0091297626495361, + "logps/chosen": -268.8539123535156, + "logps/rejected": -257.43206787109375, + "loss": 0.0284, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0198749378323555, + "rewards/margins": 0.05286857485771179, + "rewards/rejected": -0.07274351269006729, + "step": 2540 + }, + { + "epoch": 0.67, + "learning_rate": 1.5049083690569456e-06, + "logits/chosen": 0.8933135867118835, + "logits/rejected": 1.0143239498138428, + "logps/chosen": -276.6771545410156, + "logps/rejected": -236.60525512695312, + "loss": 0.0394, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.040222011506557465, + "rewards/margins": 0.06946249306201935, + "rewards/rejected": -0.10968450456857681, + "step": 2550 + }, + { + "epoch": 0.67, + "learning_rate": 1.4839932443063057e-06, + "logits/chosen": 1.0430926084518433, + "logits/rejected": 1.0419895648956299, + "logps/chosen": -241.27578735351562, + "logps/rejected": -215.1394500732422, + "loss": 0.0433, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.04254927858710289, + "rewards/margins": 0.054975200444459915, + "rewards/rejected": -0.0975244790315628, + "step": 2560 + }, + { + "epoch": 0.67, + "learning_rate": 1.4631629558273803e-06, + "logits/chosen": 0.9953416585922241, + "logits/rejected": 1.0442326068878174, + "logps/chosen": -291.2084045410156, + "logps/rejected": -256.3158264160156, + "loss": 0.0382, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.02477596327662468, + "rewards/margins": 0.07663208246231079, + "rewards/rejected": -0.10140804201364517, + "step": 2570 + }, + { + "epoch": 0.68, + "learning_rate": 1.4424192429432657e-06, + "logits/chosen": 1.0064821243286133, + "logits/rejected": 1.0545318126678467, + "logps/chosen": -307.71038818359375, + "logps/rejected": -225.4966583251953, + "loss": 0.0361, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.026431897655129433, + "rewards/margins": 0.06981770694255829, + "rewards/rejected": -0.09624960273504257, + "step": 2580 + }, + { + "epoch": 0.68, + "learning_rate": 1.421763837748016e-06, + "logits/chosen": 0.9946783185005188, + "logits/rejected": 1.0760682821273804, + "logps/chosen": -267.80877685546875, + "logps/rejected": -273.41168212890625, + "loss": 0.0416, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.04798274114727974, + "rewards/margins": 0.05912737920880318, + "rewards/rejected": -0.10711012035608292, + "step": 2590 + }, + { + "epoch": 0.68, + "learning_rate": 1.401198464962021e-06, + "logits/chosen": 1.0298030376434326, + "logits/rejected": 0.9537370800971985, + "logps/chosen": -272.90863037109375, + "logps/rejected": -239.8860321044922, + "loss": 0.0279, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03228624537587166, + "rewards/margins": 0.05655151605606079, + "rewards/rejected": -0.08883775770664215, + "step": 2600 + }, + { + "epoch": 0.68, + "eval_logits/chosen": 0.9350239038467407, + "eval_logits/rejected": 1.0360502004623413, + "eval_logps/chosen": -281.37646484375, + "eval_logps/rejected": -253.77207946777344, + "eval_loss": 0.03485475853085518, + "eval_rewards/accuracies": 0.6315000057220459, + "eval_rewards/chosen": -0.038296524435281754, + "eval_rewards/margins": 0.06126519292593002, + "eval_rewards/rejected": -0.09956171363592148, + "eval_runtime": 539.0411, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.928, + "step": 2600 + }, + { + "epoch": 0.68, + "learning_rate": 1.3807248417879896e-06, + "logits/chosen": 0.9518525004386902, + "logits/rejected": 0.9893406629562378, + "logps/chosen": -258.8128356933594, + "logps/rejected": -219.7599639892578, + "loss": 0.0339, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02622481808066368, + "rewards/margins": 0.05761373043060303, + "rewards/rejected": -0.083838552236557, + "step": 2610 + }, + { + "epoch": 0.69, + "learning_rate": 1.3603446777675665e-06, + "logits/chosen": 0.9631742238998413, + "logits/rejected": 0.9899166822433472, + "logps/chosen": -260.05377197265625, + "logps/rejected": -240.6715087890625, + "loss": 0.0313, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.033961087465286255, + "rewards/margins": 0.05184347182512283, + "rewards/rejected": -0.08580456674098969, + "step": 2620 + }, + { + "epoch": 0.69, + "learning_rate": 1.3400596746385817e-06, + "logits/chosen": 0.9519561529159546, + "logits/rejected": 0.9998300671577454, + "logps/chosen": -290.88958740234375, + "logps/rejected": -255.8434600830078, + "loss": 0.0278, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.036170247942209244, + "rewards/margins": 0.07095544040203094, + "rewards/rejected": -0.10712568461894989, + "step": 2630 + }, + { + "epoch": 0.69, + "learning_rate": 1.3198715261929587e-06, + "logits/chosen": 0.9674153327941895, + "logits/rejected": 1.0848486423492432, + "logps/chosen": -322.2984924316406, + "logps/rejected": -246.0180206298828, + "loss": 0.0367, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.029658863320946693, + "rewards/margins": 0.04690408706665039, + "rewards/rejected": -0.07656295597553253, + "step": 2640 + }, + { + "epoch": 0.69, + "learning_rate": 1.2997819181352823e-06, + "logits/chosen": 0.979369044303894, + "logits/rejected": 0.9796104431152344, + "logps/chosen": -299.4449462890625, + "logps/rejected": -267.0638122558594, + "loss": 0.0319, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.033346764743328094, + "rewards/margins": 0.05519961193203926, + "rewards/rejected": -0.08854638040065765, + "step": 2650 + }, + { + "epoch": 0.7, + "learning_rate": 1.2797925279420454e-06, + "logits/chosen": 1.0160847902297974, + "logits/rejected": 1.0748382806777954, + "logps/chosen": -291.21661376953125, + "logps/rejected": -258.7221374511719, + "loss": 0.0387, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.040911462157964706, + "rewards/margins": 0.046320244669914246, + "rewards/rejected": -0.08723169565200806, + "step": 2660 + }, + { + "epoch": 0.7, + "learning_rate": 1.2599050247215764e-06, + "logits/chosen": 0.9987077713012695, + "logits/rejected": 1.0722219944000244, + "logps/chosen": -286.01446533203125, + "logps/rejected": -262.60101318359375, + "loss": 0.0249, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.023940104991197586, + "rewards/margins": 0.07719768583774567, + "rewards/rejected": -0.10113777965307236, + "step": 2670 + }, + { + "epoch": 0.7, + "learning_rate": 1.2401210690746705e-06, + "logits/chosen": 0.949263870716095, + "logits/rejected": 1.113747239112854, + "logps/chosen": -264.36834716796875, + "logps/rejected": -241.06570434570312, + "loss": 0.0276, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.029313066974282265, + "rewards/margins": 0.047565605491399765, + "rewards/rejected": -0.07687868177890778, + "step": 2680 + }, + { + "epoch": 0.7, + "learning_rate": 1.2204423129559306e-06, + "logits/chosen": 0.8770235180854797, + "logits/rejected": 1.056774377822876, + "logps/chosen": -296.3006896972656, + "logps/rejected": -270.3150939941406, + "loss": 0.0346, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.026878798380494118, + "rewards/margins": 0.04574307054281235, + "rewards/rejected": -0.07262186706066132, + "step": 2690 + }, + { + "epoch": 0.71, + "learning_rate": 1.20087039953583e-06, + "logits/chosen": 0.9897807240486145, + "logits/rejected": 1.0115458965301514, + "logps/chosen": -281.16778564453125, + "logps/rejected": -259.2882080078125, + "loss": 0.0427, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.01994657889008522, + "rewards/margins": 0.06008830666542053, + "rewards/rejected": -0.08003488928079605, + "step": 2700 + }, + { + "epoch": 0.71, + "eval_logits/chosen": 0.9318579435348511, + "eval_logits/rejected": 1.0336464643478394, + "eval_logps/chosen": -280.66436767578125, + "eval_logps/rejected": -252.92898559570312, + "eval_loss": 0.03483254089951515, + "eval_rewards/accuracies": 0.6309999823570251, + "eval_rewards/chosen": -0.031175779178738594, + "eval_rewards/margins": 0.05995478481054306, + "eval_rewards/rejected": -0.0911305621266365, + "eval_runtime": 539.1556, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.927, + "step": 2700 + }, + { + "epoch": 0.71, + "learning_rate": 1.181406963063507e-06, + "logits/chosen": 0.987993061542511, + "logits/rejected": 0.9600605964660645, + "logps/chosen": -277.44580078125, + "logps/rejected": -232.87661743164062, + "loss": 0.0366, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.025253716856241226, + "rewards/margins": 0.05700179934501648, + "rewards/rejected": -0.082255519926548, + "step": 2710 + }, + { + "epoch": 0.71, + "learning_rate": 1.1620536287303052e-06, + "logits/chosen": 0.9831315279006958, + "logits/rejected": 1.0662561655044556, + "logps/chosen": -261.1286315917969, + "logps/rejected": -243.3155059814453, + "loss": 0.0467, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.03405457362532616, + "rewards/margins": 0.06027429178357124, + "rewards/rejected": -0.0943288654088974, + "step": 2720 + }, + { + "epoch": 0.71, + "learning_rate": 1.1428120125340717e-06, + "logits/chosen": 1.0292747020721436, + "logits/rejected": 0.947592556476593, + "logps/chosen": -272.7021484375, + "logps/rejected": -243.03701782226562, + "loss": 0.0401, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02782285585999489, + "rewards/margins": 0.05716438964009285, + "rewards/rejected": -0.08498723804950714, + "step": 2730 + }, + { + "epoch": 0.72, + "learning_rate": 1.123683721144223e-06, + "logits/chosen": 0.977290153503418, + "logits/rejected": 1.0009124279022217, + "logps/chosen": -272.8582458496094, + "logps/rejected": -241.51687622070312, + "loss": 0.0348, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.03376708924770355, + "rewards/margins": 0.07209788262844086, + "rewards/rejected": -0.105864979326725, + "step": 2740 + }, + { + "epoch": 0.72, + "learning_rate": 1.1046703517675848e-06, + "logits/chosen": 0.973223865032196, + "logits/rejected": 0.9701916575431824, + "logps/chosen": -247.44888305664062, + "logps/rejected": -250.46353149414062, + "loss": 0.0384, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.028101569041609764, + "rewards/margins": 0.05619729310274124, + "rewards/rejected": -0.08429885655641556, + "step": 2750 + }, + { + "epoch": 0.72, + "learning_rate": 1.085773492015028e-06, + "logits/chosen": 0.960196852684021, + "logits/rejected": 1.067030668258667, + "logps/chosen": -288.462158203125, + "logps/rejected": -235.9557647705078, + "loss": 0.0315, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.03159577399492264, + "rewards/margins": 0.07470157742500305, + "rewards/rejected": -0.10629735141992569, + "step": 2760 + }, + { + "epoch": 0.72, + "learning_rate": 1.0669947197689034e-06, + "logits/chosen": 0.9508602023124695, + "logits/rejected": 1.0008846521377563, + "logps/chosen": -229.83627319335938, + "logps/rejected": -243.19570922851562, + "loss": 0.0447, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.04542272537946701, + "rewards/margins": 0.0430048331618309, + "rewards/rejected": -0.08842755109071732, + "step": 2770 + }, + { + "epoch": 0.73, + "learning_rate": 1.048335603051291e-06, + "logits/chosen": 0.9108030200004578, + "logits/rejected": 1.035298228263855, + "logps/chosen": -272.658935546875, + "logps/rejected": -255.37905883789062, + "loss": 0.0379, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.028898200020194054, + "rewards/margins": 0.07144349068403244, + "rewards/rejected": -0.10034169256687164, + "step": 2780 + }, + { + "epoch": 0.73, + "learning_rate": 1.0297976998930665e-06, + "logits/chosen": 0.9145883321762085, + "logits/rejected": 0.9523155093193054, + "logps/chosen": -278.5872497558594, + "logps/rejected": -242.91748046875, + "loss": 0.0345, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.023207422345876694, + "rewards/margins": 0.05597452074289322, + "rewards/rejected": -0.07918194681406021, + "step": 2790 + }, + { + "epoch": 0.73, + "learning_rate": 1.0113825582038078e-06, + "logits/chosen": 0.9705532193183899, + "logits/rejected": 1.0568289756774902, + "logps/chosen": -271.0597839355469, + "logps/rejected": -225.5278778076172, + "loss": 0.0331, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.03660514950752258, + "rewards/margins": 0.05592336505651474, + "rewards/rejected": -0.09252851456403732, + "step": 2800 + }, + { + "epoch": 0.73, + "eval_logits/chosen": 0.9335169196128845, + "eval_logits/rejected": 1.035439372062683, + "eval_logps/chosen": -280.4610900878906, + "eval_logps/rejected": -252.53688049316406, + "eval_loss": 0.0349029041826725, + "eval_rewards/accuracies": 0.6290000081062317, + "eval_rewards/chosen": -0.0291427094489336, + "eval_rewards/margins": 0.058066822588443756, + "eval_rewards/rejected": -0.08720952272415161, + "eval_runtime": 539.1069, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.927, + "step": 2800 + }, + { + "epoch": 0.74, + "learning_rate": 9.930917156425477e-07, + "logits/chosen": 0.9502407908439636, + "logits/rejected": 1.072632908821106, + "logps/chosen": -246.87118530273438, + "logps/rejected": -225.63467407226562, + "loss": 0.0405, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.018824134021997452, + "rewards/margins": 0.0712006688117981, + "rewards/rejected": -0.09002481400966644, + "step": 2810 + }, + { + "epoch": 0.74, + "learning_rate": 9.749266994893756e-07, + "logits/chosen": 0.9383459091186523, + "logits/rejected": 1.0246083736419678, + "logps/chosen": -242.57315063476562, + "logps/rejected": -255.66812133789062, + "loss": 0.0387, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.029849324375391006, + "rewards/margins": 0.04549198970198631, + "rewards/rejected": -0.07534130662679672, + "step": 2820 + }, + { + "epoch": 0.74, + "learning_rate": 9.56889026517913e-07, + "logits/chosen": 0.9737696647644043, + "logits/rejected": 1.0874649286270142, + "logps/chosen": -272.6422424316406, + "logps/rejected": -241.7522430419922, + "loss": 0.0379, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.027452822774648666, + "rewards/margins": 0.06477371603250504, + "rewards/rejected": -0.0922265350818634, + "step": 2830 + }, + { + "epoch": 0.74, + "learning_rate": 9.389802028686617e-07, + "logits/chosen": 0.9742962121963501, + "logits/rejected": 1.0039392709732056, + "logps/chosen": -219.0965118408203, + "logps/rejected": -192.56277465820312, + "loss": 0.0343, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.029790541157126427, + "rewards/margins": 0.05616752430796623, + "rewards/rejected": -0.08595806360244751, + "step": 2840 + }, + { + "epoch": 0.75, + "learning_rate": 9.212017239232427e-07, + "logits/chosen": 0.951312243938446, + "logits/rejected": 1.0733340978622437, + "logps/chosen": -271.9151916503906, + "logps/rejected": -280.4118957519531, + "loss": 0.0357, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.01736624911427498, + "rewards/margins": 0.08112471550703049, + "rewards/rejected": -0.09849096834659576, + "step": 2850 + }, + { + "epoch": 0.75, + "learning_rate": 9.03555074179533e-07, + "logits/chosen": 0.9455773234367371, + "logits/rejected": 1.05752432346344, + "logps/chosen": -267.56500244140625, + "logps/rejected": -246.6649932861328, + "loss": 0.0291, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02218179777264595, + "rewards/margins": 0.07102219760417938, + "rewards/rejected": -0.09320400655269623, + "step": 2860 + }, + { + "epoch": 0.75, + "learning_rate": 8.860417271277067e-07, + "logits/chosen": 0.9300365447998047, + "logits/rejected": 1.022707462310791, + "logps/chosen": -242.83029174804688, + "logps/rejected": -241.719970703125, + "loss": 0.0441, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.027262404561042786, + "rewards/margins": 0.0598478689789772, + "rewards/rejected": -0.08711027354001999, + "step": 2870 + }, + { + "epoch": 0.75, + "learning_rate": 8.686631451272029e-07, + "logits/chosen": 0.9614574313163757, + "logits/rejected": 0.9695903658866882, + "logps/chosen": -269.4878845214844, + "logps/rejected": -222.73403930664062, + "loss": 0.0401, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.035077519714832306, + "rewards/margins": 0.054178714752197266, + "rewards/rejected": -0.08925624191761017, + "step": 2880 + }, + { + "epoch": 0.76, + "learning_rate": 8.514207792846168e-07, + "logits/chosen": 1.0225694179534912, + "logits/rejected": 1.0496468544006348, + "logps/chosen": -252.6670379638672, + "logps/rejected": -257.1852111816406, + "loss": 0.0488, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02861052379012108, + "rewards/margins": 0.060965172946453094, + "rewards/rejected": -0.08957569301128387, + "step": 2890 + }, + { + "epoch": 0.76, + "learning_rate": 8.343160693325356e-07, + "logits/chosen": 0.9750697016716003, + "logits/rejected": 1.0202362537384033, + "logps/chosen": -232.67269897460938, + "logps/rejected": -227.71505737304688, + "loss": 0.0415, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.03148717060685158, + "rewards/margins": 0.05822090432047844, + "rewards/rejected": -0.08970808237791061, + "step": 2900 + }, + { + "epoch": 0.76, + "eval_logits/chosen": 0.9227569699287415, + "eval_logits/rejected": 1.0247799158096313, + "eval_logps/chosen": -280.527587890625, + "eval_logps/rejected": -252.64686584472656, + "eval_loss": 0.03488382324576378, + "eval_rewards/accuracies": 0.6315000057220459, + "eval_rewards/chosen": -0.02980780228972435, + "eval_rewards/margins": 0.0585014745593071, + "eval_rewards/rejected": -0.08830928802490234, + "eval_runtime": 539.0433, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.928, + "step": 2900 + }, + { + "epoch": 0.76, + "learning_rate": 8.173504435093174e-07, + "logits/chosen": 0.9547770619392395, + "logits/rejected": 0.9979850649833679, + "logps/chosen": -285.94573974609375, + "logps/rejected": -269.0919494628906, + "loss": 0.0381, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.032671697437763214, + "rewards/margins": 0.04868137463927269, + "rewards/rejected": -0.0813530758023262, + "step": 2910 + }, + { + "epoch": 0.76, + "learning_rate": 8.00525318439836e-07, + "logits/chosen": 1.0028339624404907, + "logits/rejected": 1.0430415868759155, + "logps/chosen": -271.57373046875, + "logps/rejected": -232.68222045898438, + "loss": 0.0312, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.03152400627732277, + "rewards/margins": 0.061547745019197464, + "rewards/rejected": -0.09307174384593964, + "step": 2920 + }, + { + "epoch": 0.77, + "learning_rate": 7.838420990171927e-07, + "logits/chosen": 0.9217666387557983, + "logits/rejected": 1.0288439989089966, + "logps/chosen": -313.1500549316406, + "logps/rejected": -267.90985107421875, + "loss": 0.0292, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.022273462265729904, + "rewards/margins": 0.06082174926996231, + "rewards/rejected": -0.08309520781040192, + "step": 2930 + }, + { + "epoch": 0.77, + "learning_rate": 7.673021782854084e-07, + "logits/chosen": 0.9779103994369507, + "logits/rejected": 0.9794729351997375, + "logps/chosen": -250.5184783935547, + "logps/rejected": -264.76287841796875, + "loss": 0.0435, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.028768843039870262, + "rewards/margins": 0.060712385922670364, + "rewards/rejected": -0.08948123455047607, + "step": 2940 + }, + { + "epoch": 0.77, + "learning_rate": 7.509069373231039e-07, + "logits/chosen": 0.9786937832832336, + "logits/rejected": 0.9728788137435913, + "logps/chosen": -294.4608459472656, + "logps/rejected": -271.31146240234375, + "loss": 0.0281, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.019416136667132378, + "rewards/margins": 0.06897474080324173, + "rewards/rejected": -0.08839087188243866, + "step": 2950 + }, + { + "epoch": 0.77, + "learning_rate": 7.346577451281822e-07, + "logits/chosen": 0.9356600046157837, + "logits/rejected": 1.0507375001907349, + "logps/chosen": -251.129638671875, + "logps/rejected": -232.9912567138672, + "loss": 0.0454, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.03892569988965988, + "rewards/margins": 0.061569105833768845, + "rewards/rejected": -0.10049480199813843, + "step": 2960 + }, + { + "epoch": 0.78, + "learning_rate": 7.185559585035138e-07, + "logits/chosen": 0.9160947799682617, + "logits/rejected": 0.9816699028015137, + "logps/chosen": -286.2984924316406, + "logps/rejected": -260.083984375, + "loss": 0.0455, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01913222298026085, + "rewards/margins": 0.05808521434664726, + "rewards/rejected": -0.07721744477748871, + "step": 2970 + }, + { + "epoch": 0.78, + "learning_rate": 7.026029219436504e-07, + "logits/chosen": 0.9672778844833374, + "logits/rejected": 1.019814372062683, + "logps/chosen": -306.3282470703125, + "logps/rejected": -272.06280517578125, + "loss": 0.0323, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02404235675930977, + "rewards/margins": 0.06635276973247528, + "rewards/rejected": -0.09039512276649475, + "step": 2980 + }, + { + "epoch": 0.78, + "learning_rate": 6.867999675225523e-07, + "logits/chosen": 0.9749993085861206, + "logits/rejected": 0.9944796562194824, + "logps/chosen": -297.462646484375, + "logps/rejected": -249.9281768798828, + "loss": 0.0296, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.019663769751787186, + "rewards/margins": 0.053207218647003174, + "rewards/rejected": -0.07287098467350006, + "step": 2990 + }, + { + "epoch": 0.79, + "learning_rate": 6.711484147823663e-07, + "logits/chosen": 0.9646242260932922, + "logits/rejected": 0.9969871640205383, + "logps/chosen": -270.52301025390625, + "logps/rejected": -241.90933227539062, + "loss": 0.0404, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.02490374445915222, + "rewards/margins": 0.06743566691875458, + "rewards/rejected": -0.0923394113779068, + "step": 3000 + }, + { + "epoch": 0.79, + "eval_logits/chosen": 0.927689790725708, + "eval_logits/rejected": 1.030518889427185, + "eval_logps/chosen": -280.2290954589844, + "eval_logps/rejected": -252.4009246826172, + "eval_loss": 0.034884098917245865, + "eval_rewards/accuracies": 0.6294999718666077, + "eval_rewards/chosen": -0.026823006570339203, + "eval_rewards/margins": 0.059027016162872314, + "eval_rewards/rejected": -0.08585001528263092, + "eval_runtime": 539.1402, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.927, + "step": 3000 + }, + { + "epoch": 0.79, + "learning_rate": 6.556495706232413e-07, + "logits/chosen": 0.9685632586479187, + "logits/rejected": 1.0489590167999268, + "logps/chosen": -274.52239990234375, + "logps/rejected": -239.2083740234375, + "loss": 0.0369, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.01814478076994419, + "rewards/margins": 0.06804581731557846, + "rewards/rejected": -0.0861906185746193, + "step": 3010 + }, + { + "epoch": 0.79, + "learning_rate": 6.403047291942057e-07, + "logits/chosen": 0.9458588361740112, + "logits/rejected": 0.9609957933425903, + "logps/chosen": -278.5867919921875, + "logps/rejected": -264.2691955566406, + "loss": 0.0291, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.01274092961102724, + "rewards/margins": 0.05699128657579422, + "rewards/rejected": -0.06973221898078918, + "step": 3020 + }, + { + "epoch": 0.79, + "learning_rate": 6.251151717851023e-07, + "logits/chosen": 0.9313241243362427, + "logits/rejected": 0.9795175790786743, + "logps/chosen": -291.05364990234375, + "logps/rejected": -275.7203063964844, + "loss": 0.0411, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.02996246889233589, + "rewards/margins": 0.08125524967908859, + "rewards/rejected": -0.11121772229671478, + "step": 3030 + }, + { + "epoch": 0.8, + "learning_rate": 6.100821667196041e-07, + "logits/chosen": 0.9516725540161133, + "logits/rejected": 1.033613920211792, + "logps/chosen": -304.0897216796875, + "logps/rejected": -248.3233642578125, + "loss": 0.0313, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.012124654836952686, + "rewards/margins": 0.06473545730113983, + "rewards/rejected": -0.07686010748147964, + "step": 3040 + }, + { + "epoch": 0.8, + "learning_rate": 5.952069692493062e-07, + "logits/chosen": 0.8957148790359497, + "logits/rejected": 1.0140354633331299, + "logps/chosen": -300.5768127441406, + "logps/rejected": -266.0682373046875, + "loss": 0.0293, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.006201650947332382, + "rewards/margins": 0.07337900996208191, + "rewards/rejected": -0.07958065718412399, + "step": 3050 + }, + { + "epoch": 0.8, + "learning_rate": 5.80490821448918e-07, + "logits/chosen": 0.9774069786071777, + "logits/rejected": 1.0049412250518799, + "logps/chosen": -222.11856079101562, + "logps/rejected": -206.6851348876953, + "loss": 0.0289, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.029702406376600266, + "rewards/margins": 0.059780023992061615, + "rewards/rejected": -0.08948242664337158, + "step": 3060 + }, + { + "epoch": 0.8, + "learning_rate": 5.659349521125459e-07, + "logits/chosen": 0.945693850517273, + "logits/rejected": 1.0435597896575928, + "logps/chosen": -238.2971649169922, + "logps/rejected": -243.50106811523438, + "loss": 0.0443, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.02947511151432991, + "rewards/margins": 0.07011254131793976, + "rewards/rejected": -0.09958765655755997, + "step": 3070 + }, + { + "epoch": 0.81, + "learning_rate": 5.5154057665109e-07, + "logits/chosen": 0.9794226884841919, + "logits/rejected": 1.0414973497390747, + "logps/chosen": -280.0814208984375, + "logps/rejected": -243.6100311279297, + "loss": 0.0303, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.022864539176225662, + "rewards/margins": 0.06399150937795639, + "rewards/rejected": -0.08685605973005295, + "step": 3080 + }, + { + "epoch": 0.81, + "learning_rate": 5.373088969907586e-07, + "logits/chosen": 0.9371223449707031, + "logits/rejected": 1.0262264013290405, + "logps/chosen": -260.27728271484375, + "logps/rejected": -258.8297424316406, + "loss": 0.0314, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.02002180740237236, + "rewards/margins": 0.06138715147972107, + "rewards/rejected": -0.08140896260738373, + "step": 3090 + }, + { + "epoch": 0.81, + "learning_rate": 5.23241101472709e-07, + "logits/chosen": 1.0079492330551147, + "logits/rejected": 0.9960187673568726, + "logps/chosen": -255.51211547851562, + "logps/rejected": -243.93319702148438, + "loss": 0.0362, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.02185986004769802, + "rewards/margins": 0.057953812181949615, + "rewards/rejected": -0.07981367409229279, + "step": 3100 + }, + { + "epoch": 0.81, + "eval_logits/chosen": 0.9270116090774536, + "eval_logits/rejected": 1.0295790433883667, + "eval_logps/chosen": -280.18609619140625, + "eval_logps/rejected": -252.307861328125, + "eval_loss": 0.03481233865022659, + "eval_rewards/accuracies": 0.6305000185966492, + "eval_rewards/chosen": -0.026392878964543343, + "eval_rewards/margins": 0.058526668697595596, + "eval_rewards/rejected": -0.08491955697536469, + "eval_runtime": 539.1609, + "eval_samples_per_second": 3.709, + "eval_steps_per_second": 0.927, + "step": 3100 + }, + { + "epoch": 0.81, + "learning_rate": 5.09338364753818e-07, + "logits/chosen": 0.9137361645698547, + "logits/rejected": 1.0361106395721436, + "logps/chosen": -293.02032470703125, + "logps/rejected": -264.98883056640625, + "loss": 0.0454, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.01966175064444542, + "rewards/margins": 0.090638667345047, + "rewards/rejected": -0.11030042171478271, + "step": 3110 + }, + { + "epoch": 0.82, + "learning_rate": 4.956018477086005e-07, + "logits/chosen": 1.0527143478393555, + "logits/rejected": 1.0417900085449219, + "logps/chosen": -281.76318359375, + "logps/rejected": -230.76205444335938, + "loss": 0.0363, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.02277226559817791, + "rewards/margins": 0.055615413933992386, + "rewards/rejected": -0.07838768512010574, + "step": 3120 + }, + { + "epoch": 0.82, + "learning_rate": 4.820326973322764e-07, + "logits/chosen": 0.9705474972724915, + "logits/rejected": 1.0229809284210205, + "logps/chosen": -295.6360778808594, + "logps/rejected": -221.80020141601562, + "loss": 0.033, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.01131758838891983, + "rewards/margins": 0.0728193074464798, + "rewards/rejected": -0.08413688838481903, + "step": 3130 + }, + { + "epoch": 0.82, + "learning_rate": 4.686320466449981e-07, + "logits/chosen": 0.9558774828910828, + "logits/rejected": 1.006240725517273, + "logps/chosen": -237.345947265625, + "logps/rejected": -222.2642364501953, + "loss": 0.0358, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.022244829684495926, + "rewards/margins": 0.07573308050632477, + "rewards/rejected": -0.0979778990149498, + "step": 3140 + }, + { + "epoch": 0.82, + "learning_rate": 4.554010145972418e-07, + "logits/chosen": 0.956200897693634, + "logits/rejected": 1.0314735174179077, + "logps/chosen": -285.91473388671875, + "logps/rejected": -229.6345977783203, + "loss": 0.0312, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.015750734135508537, + "rewards/margins": 0.0518513098359108, + "rewards/rejected": -0.06760205328464508, + "step": 3150 + }, + { + "epoch": 0.83, + "learning_rate": 4.4234070597637455e-07, + "logits/chosen": 0.9090474843978882, + "logits/rejected": 1.0826170444488525, + "logps/chosen": -270.742919921875, + "logps/rejected": -254.69363403320312, + "loss": 0.0316, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.029270146042108536, + "rewards/margins": 0.04612868279218674, + "rewards/rejected": -0.07539881765842438, + "step": 3160 + }, + { + "epoch": 0.83, + "learning_rate": 4.2945221131440783e-07, + "logits/chosen": 0.975232720375061, + "logits/rejected": 1.02248215675354, + "logps/chosen": -245.90774536132812, + "logps/rejected": -221.59896850585938, + "loss": 0.045, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.025399979203939438, + "rewards/margins": 0.0584503710269928, + "rewards/rejected": -0.08385033905506134, + "step": 3170 + }, + { + "epoch": 0.83, + "learning_rate": 4.167366067969381e-07, + "logits/chosen": 0.9106703996658325, + "logits/rejected": 0.9676691293716431, + "logps/chosen": -272.5838317871094, + "logps/rejected": -259.0162658691406, + "loss": 0.0395, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.017812054604291916, + "rewards/margins": 0.05891140550374985, + "rewards/rejected": -0.07672347128391266, + "step": 3180 + }, + { + "epoch": 0.83, + "learning_rate": 4.041949541732826e-07, + "logits/chosen": 0.9986382722854614, + "logits/rejected": 1.0691004991531372, + "logps/chosen": -296.02490234375, + "logps/rejected": -245.5988311767578, + "loss": 0.0447, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.02657441236078739, + "rewards/margins": 0.05036981776356697, + "rewards/rejected": -0.07694423198699951, + "step": 3190 + }, + { + "epoch": 0.84, + "learning_rate": 3.9182830066782614e-07, + "logits/chosen": 0.97566157579422, + "logits/rejected": 0.9876262545585632, + "logps/chosen": -282.0150146484375, + "logps/rejected": -245.9921875, + "loss": 0.0412, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03303904086351395, + "rewards/margins": 0.04882500693202019, + "rewards/rejected": -0.08186405152082443, + "step": 3200 + }, + { + "epoch": 0.84, + "eval_logits/chosen": 0.9313199520111084, + "eval_logits/rejected": 1.0337715148925781, + "eval_logps/chosen": -280.28759765625, + "eval_logps/rejected": -252.42367553710938, + "eval_loss": 0.03475377336144447, + "eval_rewards/accuracies": 0.6259999871253967, + "eval_rewards/chosen": -0.027407577261328697, + "eval_rewards/margins": 0.05866991728544235, + "eval_rewards/rejected": -0.0860774889588356, + "eval_runtime": 539.1084, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.927, + "step": 3200 + }, + { + "epoch": 0.84, + "learning_rate": 3.796376788925771e-07, + "logits/chosen": 0.9154554605484009, + "logits/rejected": 1.0822858810424805, + "logps/chosen": -247.672119140625, + "logps/rejected": -225.2171630859375, + "loss": 0.0476, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03178101405501366, + "rewards/margins": 0.0625949501991272, + "rewards/rejected": -0.09437596052885056, + "step": 3210 + }, + { + "epoch": 0.84, + "learning_rate": 3.676241067609465e-07, + "logits/chosen": 0.9667531847953796, + "logits/rejected": 0.9605924487113953, + "logps/chosen": -242.40115356445312, + "logps/rejected": -237.51455688476562, + "loss": 0.0326, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.02729959413409233, + "rewards/margins": 0.06457889080047607, + "rewards/rejected": -0.0918785035610199, + "step": 3220 + }, + { + "epoch": 0.85, + "learning_rate": 3.5578858740274976e-07, + "logits/chosen": 0.9239116907119751, + "logits/rejected": 1.0316154956817627, + "logps/chosen": -300.83447265625, + "logps/rejected": -260.6551513671875, + "loss": 0.035, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.020300351083278656, + "rewards/margins": 0.06348638236522675, + "rewards/rejected": -0.0837867259979248, + "step": 3230 + }, + { + "epoch": 0.85, + "learning_rate": 3.44132109080447e-07, + "logits/chosen": 0.9275467991828918, + "logits/rejected": 1.080038070678711, + "logps/chosen": -259.93646240234375, + "logps/rejected": -237.24990844726562, + "loss": 0.0382, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.03450951725244522, + "rewards/margins": 0.05054662749171257, + "rewards/rejected": -0.08505614101886749, + "step": 3240 + }, + { + "epoch": 0.85, + "learning_rate": 3.3265564510662344e-07, + "logits/chosen": 0.9719650149345398, + "logits/rejected": 1.0536506175994873, + "logps/chosen": -254.2252960205078, + "logps/rejected": -218.9423828125, + "loss": 0.0369, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.024826010689139366, + "rewards/margins": 0.05269388109445572, + "rewards/rejected": -0.07751990109682083, + "step": 3250 + }, + { + "epoch": 0.85, + "learning_rate": 3.213601537627195e-07, + "logits/chosen": 0.9981171488761902, + "logits/rejected": 1.0061393976211548, + "logps/chosen": -260.99664306640625, + "logps/rejected": -252.4512939453125, + "loss": 0.0306, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.028290893882513046, + "rewards/margins": 0.059080712497234344, + "rewards/rejected": -0.08737160265445709, + "step": 3260 + }, + { + "epoch": 0.86, + "learning_rate": 3.1024657821901063e-07, + "logits/chosen": 0.9618428945541382, + "logits/rejected": 0.9869762659072876, + "logps/chosen": -255.50851440429688, + "logps/rejected": -237.8030548095703, + "loss": 0.0263, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.023725476115942, + "rewards/margins": 0.06770970672369003, + "rewards/rejected": -0.09143517911434174, + "step": 3270 + }, + { + "epoch": 0.86, + "learning_rate": 2.9931584645585654e-07, + "logits/chosen": 1.0205438137054443, + "logits/rejected": 1.0499489307403564, + "logps/chosen": -285.1611328125, + "logps/rejected": -280.9693908691406, + "loss": 0.0364, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.02360624074935913, + "rewards/margins": 0.07233406603336334, + "rewards/rejected": -0.09594030678272247, + "step": 3280 + }, + { + "epoch": 0.86, + "learning_rate": 2.885688711862136e-07, + "logits/chosen": 0.9615311622619629, + "logits/rejected": 1.010024905204773, + "logps/chosen": -257.10546875, + "logps/rejected": -222.49844360351562, + "loss": 0.0356, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.022061806172132492, + "rewards/margins": 0.044591888785362244, + "rewards/rejected": -0.06665369868278503, + "step": 3290 + }, + { + "epoch": 0.86, + "learning_rate": 2.7800654977942486e-07, + "logits/chosen": 0.9196559190750122, + "logits/rejected": 1.0562456846237183, + "logps/chosen": -294.05804443359375, + "logps/rejected": -225.5612030029297, + "loss": 0.0485, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.020386073738336563, + "rewards/margins": 0.06678882986307144, + "rewards/rejected": -0.0871749073266983, + "step": 3300 + }, + { + "epoch": 0.86, + "eval_logits/chosen": 0.9336137771606445, + "eval_logits/rejected": 1.0358667373657227, + "eval_logps/chosen": -279.9648132324219, + "eval_logps/rejected": -252.05458068847656, + "eval_loss": 0.034665048122406006, + "eval_rewards/accuracies": 0.6269999742507935, + "eval_rewards/chosen": -0.02418021857738495, + "eval_rewards/margins": 0.05820634588599205, + "eval_rewards/rejected": -0.0823865681886673, + "eval_runtime": 538.8853, + "eval_samples_per_second": 3.711, + "eval_steps_per_second": 0.928, + "step": 3300 + }, + { + "epoch": 0.87, + "learning_rate": 2.6762976418628797e-07, + "logits/chosen": 0.9499009847640991, + "logits/rejected": 0.961447536945343, + "logps/chosen": -252.6084747314453, + "logps/rejected": -231.51760864257812, + "loss": 0.0302, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.020922133699059486, + "rewards/margins": 0.055929750204086304, + "rewards/rejected": -0.07685188204050064, + "step": 3310 + }, + { + "epoch": 0.87, + "learning_rate": 2.5743938086541354e-07, + "logits/chosen": 0.9769983291625977, + "logits/rejected": 1.0180495977401733, + "logps/chosen": -297.083740234375, + "logps/rejected": -251.3445281982422, + "loss": 0.0258, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.013064468279480934, + "rewards/margins": 0.08387977629899979, + "rewards/rejected": -0.09694425016641617, + "step": 3320 + }, + { + "epoch": 0.87, + "learning_rate": 2.4743625071087574e-07, + "logits/chosen": 0.9248872995376587, + "logits/rejected": 1.083939552307129, + "logps/chosen": -269.6517639160156, + "logps/rejected": -246.59970092773438, + "loss": 0.0377, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.029898881912231445, + "rewards/margins": 0.06900982558727264, + "rewards/rejected": -0.09890870004892349, + "step": 3330 + }, + { + "epoch": 0.87, + "learning_rate": 2.3762120898116498e-07, + "logits/chosen": 0.9920533895492554, + "logits/rejected": 1.0261324644088745, + "logps/chosen": -267.4979553222656, + "logps/rejected": -244.2932891845703, + "loss": 0.0377, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0195697583258152, + "rewards/margins": 0.03883281350135803, + "rewards/rejected": -0.058402568101882935, + "step": 3340 + }, + { + "epoch": 0.88, + "learning_rate": 2.2799507522944048e-07, + "logits/chosen": 1.0060142278671265, + "logits/rejected": 1.0250886678695679, + "logps/chosen": -296.4592590332031, + "logps/rejected": -239.81381225585938, + "loss": 0.0321, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.026712555438280106, + "rewards/margins": 0.03180098533630371, + "rewards/rejected": -0.058513544499874115, + "step": 3350 + }, + { + "epoch": 0.88, + "learning_rate": 2.1855865323510056e-07, + "logits/chosen": 0.978489100933075, + "logits/rejected": 1.032293677330017, + "logps/chosen": -289.8194580078125, + "logps/rejected": -266.70330810546875, + "loss": 0.032, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.015113875269889832, + "rewards/margins": 0.06603299081325531, + "rewards/rejected": -0.08114685118198395, + "step": 3360 + }, + { + "epoch": 0.88, + "learning_rate": 2.0931273093666575e-07, + "logits/chosen": 1.0238964557647705, + "logits/rejected": 1.0438212156295776, + "logps/chosen": -267.2007141113281, + "logps/rejected": -265.8624572753906, + "loss": 0.0403, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.018140006810426712, + "rewards/margins": 0.06415996700525284, + "rewards/rejected": -0.08229997754096985, + "step": 3370 + }, + { + "epoch": 0.88, + "learning_rate": 2.002580803659873e-07, + "logits/chosen": 1.0145037174224854, + "logits/rejected": 1.0263590812683105, + "logps/chosen": -293.4837951660156, + "logps/rejected": -262.3972473144531, + "loss": 0.0357, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.03515145182609558, + "rewards/margins": 0.043677233159542084, + "rewards/rejected": -0.07882869243621826, + "step": 3380 + }, + { + "epoch": 0.89, + "learning_rate": 1.913954575837826e-07, + "logits/chosen": 1.0240306854248047, + "logits/rejected": 1.094536542892456, + "logps/chosen": -291.9563293457031, + "logps/rejected": -263.8965148925781, + "loss": 0.048, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.02634221874177456, + "rewards/margins": 0.06700630486011505, + "rewards/rejected": -0.09334851801395416, + "step": 3390 + }, + { + "epoch": 0.89, + "learning_rate": 1.827256026165028e-07, + "logits/chosen": 1.024501919746399, + "logits/rejected": 1.0359153747558594, + "logps/chosen": -271.8841247558594, + "logps/rejected": -239.6262664794922, + "loss": 0.0376, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.026435798034071922, + "rewards/margins": 0.07295812666416168, + "rewards/rejected": -0.09939391911029816, + "step": 3400 + }, + { + "epoch": 0.89, + "eval_logits/chosen": 0.9353539347648621, + "eval_logits/rejected": 1.0377308130264282, + "eval_logps/chosen": -280.1902160644531, + "eval_logps/rejected": -252.35890197753906, + "eval_loss": 0.03463303670287132, + "eval_rewards/accuracies": 0.6309999823570251, + "eval_rewards/chosen": -0.026434103026986122, + "eval_rewards/margins": 0.0589958056807518, + "eval_rewards/rejected": -0.08542990684509277, + "eval_runtime": 538.4849, + "eval_samples_per_second": 3.714, + "eval_steps_per_second": 0.929, + "step": 3400 + }, + { + "epoch": 0.89, + "learning_rate": 1.7424923939454274e-07, + "logits/chosen": 1.0309066772460938, + "logits/rejected": 1.0446019172668457, + "logps/chosen": -282.263427734375, + "logps/rejected": -256.1120300292969, + "loss": 0.0385, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02827179990708828, + "rewards/margins": 0.05899345874786377, + "rewards/rejected": -0.0872652679681778, + "step": 3410 + }, + { + "epoch": 0.9, + "learning_rate": 1.6596707569179304e-07, + "logits/chosen": 0.9415512084960938, + "logits/rejected": 1.0031477212905884, + "logps/chosen": -267.2585754394531, + "logps/rejected": -228.9310302734375, + "loss": 0.0309, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.020049545913934708, + "rewards/margins": 0.05747220665216446, + "rewards/rejected": -0.07752174139022827, + "step": 3420 + }, + { + "epoch": 0.9, + "learning_rate": 1.578798030665385e-07, + "logits/chosen": 0.9566577076911926, + "logits/rejected": 0.9886308908462524, + "logps/chosen": -287.398193359375, + "logps/rejected": -231.68643188476562, + "loss": 0.0351, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.030035072937607765, + "rewards/margins": 0.03915448114275932, + "rewards/rejected": -0.06918954849243164, + "step": 3430 + }, + { + "epoch": 0.9, + "learning_rate": 1.499880968037165e-07, + "logits/chosen": 0.9171876907348633, + "logits/rejected": 1.0982364416122437, + "logps/chosen": -283.79156494140625, + "logps/rejected": -253.9375, + "loss": 0.0395, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.02584235928952694, + "rewards/margins": 0.07793084532022476, + "rewards/rejected": -0.10377321392297745, + "step": 3440 + }, + { + "epoch": 0.9, + "learning_rate": 1.4229261585852805e-07, + "logits/chosen": 0.9618092775344849, + "logits/rejected": 1.1071628332138062, + "logps/chosen": -258.87579345703125, + "logps/rejected": -243.849365234375, + "loss": 0.0413, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.031573228538036346, + "rewards/margins": 0.07379934191703796, + "rewards/rejected": -0.10537256300449371, + "step": 3450 + }, + { + "epoch": 0.91, + "learning_rate": 1.3479400280141886e-07, + "logits/chosen": 0.9881182909011841, + "logits/rejected": 0.9810377359390259, + "logps/chosen": -240.4054718017578, + "logps/rejected": -220.0426483154297, + "loss": 0.0372, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.019237659871578217, + "rewards/margins": 0.041987188160419464, + "rewards/rejected": -0.06122484803199768, + "step": 3460 + }, + { + "epoch": 0.91, + "learning_rate": 1.2749288376442044e-07, + "logits/chosen": 1.032110571861267, + "logits/rejected": 1.0473930835723877, + "logps/chosen": -249.2979736328125, + "logps/rejected": -242.74606323242188, + "loss": 0.0334, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.030945682898163795, + "rewards/margins": 0.05242834612727165, + "rewards/rejected": -0.0833740234375, + "step": 3470 + }, + { + "epoch": 0.91, + "learning_rate": 1.203898683888713e-07, + "logits/chosen": 0.9928043484687805, + "logits/rejected": 1.0439598560333252, + "logps/chosen": -255.74020385742188, + "logps/rejected": -241.57583618164062, + "loss": 0.0381, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.03444141149520874, + "rewards/margins": 0.06323965638875961, + "rewards/rejected": -0.09768106043338776, + "step": 3480 + }, + { + "epoch": 0.91, + "learning_rate": 1.1348554977451132e-07, + "logits/chosen": 1.0385875701904297, + "logits/rejected": 1.0472261905670166, + "logps/chosen": -243.3946075439453, + "logps/rejected": -253.4709014892578, + "loss": 0.0353, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.03410564363002777, + "rewards/margins": 0.0681803822517395, + "rewards/rejected": -0.10228602588176727, + "step": 3490 + }, + { + "epoch": 0.92, + "learning_rate": 1.0678050442995802e-07, + "logits/chosen": 0.9589599370956421, + "logits/rejected": 0.9904863238334656, + "logps/chosen": -262.4172058105469, + "logps/rejected": -220.4215545654297, + "loss": 0.0352, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.03152045980095863, + "rewards/margins": 0.0699472576379776, + "rewards/rejected": -0.10146770626306534, + "step": 3500 + }, + { + "epoch": 0.92, + "eval_logits/chosen": 0.9392337203025818, + "eval_logits/rejected": 1.0417654514312744, + "eval_logps/chosen": -280.20367431640625, + "eval_logps/rejected": -252.37255859375, + "eval_loss": 0.03462912142276764, + "eval_rewards/accuracies": 0.6259999871253967, + "eval_rewards/chosen": -0.026568960398435593, + "eval_rewards/margins": 0.058997511863708496, + "eval_rewards/rejected": -0.08556646853685379, + "eval_runtime": 538.5332, + "eval_samples_per_second": 3.714, + "eval_steps_per_second": 0.928, + "step": 3500 + }, + { + "epoch": 0.92, + "learning_rate": 1.0027529222456755e-07, + "logits/chosen": 0.9966568946838379, + "logits/rejected": 0.9782639741897583, + "logps/chosen": -264.34344482421875, + "logps/rejected": -254.0737762451172, + "loss": 0.0298, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.030217718333005905, + "rewards/margins": 0.05717161297798157, + "rewards/rejected": -0.08738932758569717, + "step": 3510 + }, + { + "epoch": 0.92, + "learning_rate": 9.397045634168766e-08, + "logits/chosen": 0.933295726776123, + "logits/rejected": 1.088179349899292, + "logps/chosen": -275.3388977050781, + "logps/rejected": -235.18685913085938, + "loss": 0.0367, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.014920748770236969, + "rewards/margins": 0.06524350494146347, + "rewards/rejected": -0.08016424626111984, + "step": 3520 + }, + { + "epoch": 0.92, + "learning_rate": 8.78665232332998e-08, + "logits/chosen": 0.9489814639091492, + "logits/rejected": 1.066748857498169, + "logps/chosen": -260.3916320800781, + "logps/rejected": -258.72528076171875, + "loss": 0.036, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.023377668112516403, + "rewards/margins": 0.05880703777074814, + "rewards/rejected": -0.08218470215797424, + "step": 3530 + }, + { + "epoch": 0.93, + "learning_rate": 8.196400257606208e-08, + "logits/chosen": 0.9277578592300415, + "logits/rejected": 1.0134170055389404, + "logps/chosen": -279.0546875, + "logps/rejected": -249.3377685546875, + "loss": 0.0347, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.018648769706487656, + "rewards/margins": 0.04787365719676018, + "rewards/rejected": -0.06652243435382843, + "step": 3540 + }, + { + "epoch": 0.93, + "learning_rate": 7.626338722875076e-08, + "logits/chosen": 1.015367031097412, + "logits/rejected": 0.9656025171279907, + "logps/chosen": -280.31109619140625, + "logps/rejected": -254.50277709960938, + "loss": 0.0347, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03300374746322632, + "rewards/margins": 0.050588060170412064, + "rewards/rejected": -0.08359180390834808, + "step": 3550 + }, + { + "epoch": 0.93, + "learning_rate": 7.076515319110688e-08, + "logits/chosen": 1.0340051651000977, + "logits/rejected": 1.0160043239593506, + "logps/chosen": -292.3948669433594, + "logps/rejected": -252.76358032226562, + "loss": 0.0371, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02604197897017002, + "rewards/margins": 0.05999482423067093, + "rewards/rejected": -0.0860368013381958, + "step": 3560 + }, + { + "epoch": 0.93, + "learning_rate": 6.54697595640899e-08, + "logits/chosen": 1.0719053745269775, + "logits/rejected": 1.0086462497711182, + "logps/chosen": -279.8869323730469, + "logps/rejected": -261.8102111816406, + "loss": 0.0361, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.023053551092743874, + "rewards/margins": 0.05721823126077652, + "rewards/rejected": -0.08027178794145584, + "step": 3570 + }, + { + "epoch": 0.94, + "learning_rate": 6.037764851154426e-08, + "logits/chosen": 0.9340742826461792, + "logits/rejected": 1.0983483791351318, + "logps/chosen": -279.68585205078125, + "logps/rejected": -239.94100952148438, + "loss": 0.0259, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.02659156359732151, + "rewards/margins": 0.05400107428431511, + "rewards/rejected": -0.08059263974428177, + "step": 3580 + }, + { + "epoch": 0.94, + "learning_rate": 5.548924522327748e-08, + "logits/chosen": 0.9091449975967407, + "logits/rejected": 1.1083462238311768, + "logps/chosen": -275.26678466796875, + "logps/rejected": -258.33453369140625, + "loss": 0.0285, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.019822830334305763, + "rewards/margins": 0.06569498032331467, + "rewards/rejected": -0.08551780879497528, + "step": 3590 + }, + { + "epoch": 0.94, + "learning_rate": 5.0804957879556915e-08, + "logits/chosen": 0.9512368440628052, + "logits/rejected": 1.0698693990707397, + "logps/chosen": -318.3247375488281, + "logps/rejected": -260.92718505859375, + "loss": 0.0379, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0346975177526474, + "rewards/margins": 0.056962646543979645, + "rewards/rejected": -0.09166016429662704, + "step": 3600 + }, + { + "epoch": 0.94, + "eval_logits/chosen": 0.9390192031860352, + "eval_logits/rejected": 1.0413662195205688, + "eval_logps/chosen": -280.1781311035156, + "eval_logps/rejected": -252.33770751953125, + "eval_loss": 0.034663841128349304, + "eval_rewards/accuracies": 0.6315000057220459, + "eval_rewards/chosen": -0.026313286274671555, + "eval_rewards/margins": 0.05890476703643799, + "eval_rewards/rejected": -0.08521804958581924, + "eval_runtime": 538.4137, + "eval_samples_per_second": 3.715, + "eval_steps_per_second": 0.929, + "step": 3600 + }, + { + "epoch": 0.94, + "learning_rate": 4.632517761702815e-08, + "logits/chosen": 1.0086432695388794, + "logits/rejected": 1.0595029592514038, + "logps/chosen": -285.149658203125, + "logps/rejected": -251.66793823242188, + "loss": 0.0434, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.018525902181863785, + "rewards/margins": 0.078687384724617, + "rewards/rejected": -0.09721329808235168, + "step": 3610 + }, + { + "epoch": 0.95, + "learning_rate": 4.205027849605359e-08, + "logits/chosen": 1.0005525350570679, + "logits/rejected": 0.973240852355957, + "logps/chosen": -231.69265747070312, + "logps/rejected": -235.55581665039062, + "loss": 0.0401, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.02195514738559723, + "rewards/margins": 0.049081120640039444, + "rewards/rejected": -0.07103626430034637, + "step": 3620 + }, + { + "epoch": 0.95, + "learning_rate": 3.798061746947995e-08, + "logits/chosen": 0.9860151410102844, + "logits/rejected": 1.0056957006454468, + "logps/chosen": -345.712890625, + "logps/rejected": -263.0353088378906, + "loss": 0.0229, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02045946940779686, + "rewards/margins": 0.05285441130399704, + "rewards/rejected": -0.0733138769865036, + "step": 3630 + }, + { + "epoch": 0.95, + "learning_rate": 3.411653435283158e-08, + "logits/chosen": 0.9643945693969727, + "logits/rejected": 1.0569902658462524, + "logps/chosen": -294.2444763183594, + "logps/rejected": -257.2134704589844, + "loss": 0.0379, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0335361547768116, + "rewards/margins": 0.04753485321998596, + "rewards/rejected": -0.08107100427150726, + "step": 3640 + }, + { + "epoch": 0.96, + "learning_rate": 3.04583517959367e-08, + "logits/chosen": 0.957710862159729, + "logits/rejected": 0.983841598033905, + "logps/chosen": -299.52471923828125, + "logps/rejected": -250.6820831298828, + "loss": 0.0363, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.008963306434452534, + "rewards/margins": 0.08262725919485092, + "rewards/rejected": -0.09159056842327118, + "step": 3650 + }, + { + "epoch": 0.96, + "learning_rate": 2.7006375255985984e-08, + "logits/chosen": 0.9700638055801392, + "logits/rejected": 1.0240037441253662, + "logps/chosen": -241.0752410888672, + "logps/rejected": -215.42562866210938, + "loss": 0.0298, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.026114290580153465, + "rewards/margins": 0.048808712512254715, + "rewards/rejected": -0.07492300122976303, + "step": 3660 + }, + { + "epoch": 0.96, + "learning_rate": 2.3760892972027328e-08, + "logits/chosen": 0.9013713002204895, + "logits/rejected": 1.0410950183868408, + "logps/chosen": -290.4670104980469, + "logps/rejected": -238.00454711914062, + "loss": 0.0394, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.023938851431012154, + "rewards/margins": 0.06292831152677536, + "rewards/rejected": -0.08686716854572296, + "step": 3670 + }, + { + "epoch": 0.96, + "learning_rate": 2.072217594089765e-08, + "logits/chosen": 1.0121930837631226, + "logits/rejected": 0.9898314476013184, + "logps/chosen": -247.4111785888672, + "logps/rejected": -232.48184204101562, + "loss": 0.0394, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.021979061886668205, + "rewards/margins": 0.0453622080385685, + "rewards/rejected": -0.06734126806259155, + "step": 3680 + }, + { + "epoch": 0.97, + "learning_rate": 1.789047789459375e-08, + "logits/chosen": 0.9309120178222656, + "logits/rejected": 1.0076799392700195, + "logps/chosen": -279.5455627441406, + "logps/rejected": -257.4979248046875, + "loss": 0.0377, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.027569543570280075, + "rewards/margins": 0.0605216808617115, + "rewards/rejected": -0.08809121698141098, + "step": 3690 + }, + { + "epoch": 0.97, + "learning_rate": 1.5266035279088708e-08, + "logits/chosen": 0.9448448419570923, + "logits/rejected": 1.0074546337127686, + "logps/chosen": -320.0146179199219, + "logps/rejected": -246.0650634765625, + "loss": 0.0361, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.023909619078040123, + "rewards/margins": 0.05834698677062988, + "rewards/rejected": -0.08225660026073456, + "step": 3700 + }, + { + "epoch": 0.97, + "eval_logits/chosen": 0.9376645088195801, + "eval_logits/rejected": 1.0399267673492432, + "eval_logps/chosen": -280.2046813964844, + "eval_logps/rejected": -252.3740997314453, + "eval_loss": 0.03461700677871704, + "eval_rewards/accuracies": 0.6309999823570251, + "eval_rewards/chosen": -0.026578795164823532, + "eval_rewards/margins": 0.05900290608406067, + "eval_rewards/rejected": -0.0855816975235939, + "eval_runtime": 538.2979, + "eval_samples_per_second": 3.715, + "eval_steps_per_second": 0.929, + "step": 3700 + }, + { + "epoch": 0.97, + "learning_rate": 1.2849067234584623e-08, + "logits/chosen": 0.9224729537963867, + "logits/rejected": 1.0151176452636719, + "logps/chosen": -258.44451904296875, + "logps/rejected": -240.2635498046875, + "loss": 0.0317, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.02202828973531723, + "rewards/margins": 0.055525414645671844, + "rewards/rejected": -0.07755370438098907, + "step": 3710 + }, + { + "epoch": 0.97, + "learning_rate": 1.0639775577218625e-08, + "logits/chosen": 0.9609501957893372, + "logits/rejected": 1.0031676292419434, + "logps/chosen": -288.76519775390625, + "logps/rejected": -266.21478271484375, + "loss": 0.0313, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01723320782184601, + "rewards/margins": 0.06596283614635468, + "rewards/rejected": -0.08319603651762009, + "step": 3720 + }, + { + "epoch": 0.98, + "learning_rate": 8.638344782207486e-09, + "logits/chosen": 0.9164566993713379, + "logits/rejected": 1.0248987674713135, + "logps/chosen": -303.3111877441406, + "logps/rejected": -282.0052185058594, + "loss": 0.0371, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.036383308470249176, + "rewards/margins": 0.0420096218585968, + "rewards/rejected": -0.07839293777942657, + "step": 3730 + }, + { + "epoch": 0.98, + "learning_rate": 6.84494196844715e-09, + "logits/chosen": 0.9781646728515625, + "logits/rejected": 1.0682637691497803, + "logps/chosen": -265.3148193359375, + "logps/rejected": -244.98489379882812, + "loss": 0.0314, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.016114359721541405, + "rewards/margins": 0.0599919855594635, + "rewards/rejected": -0.07610634714365005, + "step": 3740 + }, + { + "epoch": 0.98, + "learning_rate": 5.259716884556121e-09, + "logits/chosen": 0.9260244369506836, + "logits/rejected": 1.0235192775726318, + "logps/chosen": -311.13995361328125, + "logps/rejected": -255.76925659179688, + "loss": 0.0216, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.019463708624243736, + "rewards/margins": 0.06179703399538994, + "rewards/rejected": -0.08126074075698853, + "step": 3750 + }, + { + "epoch": 0.98, + "learning_rate": 3.882801896372967e-09, + "logits/chosen": 0.8902843594551086, + "logits/rejected": 1.0316295623779297, + "logps/chosen": -282.79034423828125, + "logps/rejected": -261.2944641113281, + "loss": 0.0333, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.024235766381025314, + "rewards/margins": 0.04566502943634987, + "rewards/rejected": -0.06990079581737518, + "step": 3760 + }, + { + "epoch": 0.99, + "learning_rate": 2.7143119759026614e-09, + "logits/chosen": 0.9450544118881226, + "logits/rejected": 1.0837864875793457, + "logps/chosen": -270.1015319824219, + "logps/rejected": -273.8962707519531, + "loss": 0.0297, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.007587161846458912, + "rewards/margins": 0.06377876549959183, + "rewards/rejected": -0.07136592268943787, + "step": 3770 + }, + { + "epoch": 0.99, + "learning_rate": 1.754344691717591e-09, + "logits/chosen": 0.9069304466247559, + "logits/rejected": 0.979387640953064, + "logps/chosen": -264.6932067871094, + "logps/rejected": -218.8267822265625, + "loss": 0.0347, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.011129969730973244, + "rewards/margins": 0.06624534726142883, + "rewards/rejected": -0.07737531512975693, + "step": 3780 + }, + { + "epoch": 0.99, + "learning_rate": 1.0029802008096335e-09, + "logits/chosen": 1.0251834392547607, + "logits/rejected": 0.9546536207199097, + "logps/chosen": -267.88116455078125, + "logps/rejected": -228.50390625, + "loss": 0.0348, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.026254802942276, + "rewards/margins": 0.06516362726688385, + "rewards/rejected": -0.09141843020915985, + "step": 3790 + }, + { + "epoch": 0.99, + "learning_rate": 4.602812418974534e-10, + "logits/chosen": 0.9103859066963196, + "logits/rejected": 1.032867670059204, + "logps/chosen": -254.2440643310547, + "logps/rejected": -240.1820831298828, + "loss": 0.0298, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03087850846350193, + "rewards/margins": 0.050477832555770874, + "rewards/rejected": -0.08135633170604706, + "step": 3800 + }, + { + "epoch": 0.99, + "eval_logits/chosen": 0.9387494921684265, + "eval_logits/rejected": 1.0411853790283203, + "eval_logps/chosen": -280.1766662597656, + "eval_logps/rejected": -252.320068359375, + "eval_loss": 0.03467794507741928, + "eval_rewards/accuracies": 0.6274999976158142, + "eval_rewards/chosen": -0.02629854343831539, + "eval_rewards/margins": 0.058742720633745193, + "eval_rewards/rejected": -0.08504127711057663, + "eval_runtime": 538.3057, + "eval_samples_per_second": 3.715, + "eval_steps_per_second": 0.929, + "step": 3800 + }, + { + "epoch": 1.0, + "learning_rate": 1.2629313018819312e-10, + "logits/chosen": 0.9093559980392456, + "logits/rejected": 1.0155284404754639, + "logps/chosen": -302.20904541015625, + "logps/rejected": -284.8764953613281, + "loss": 0.0297, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.014634281396865845, + "rewards/margins": 0.05110809952020645, + "rewards/rejected": -0.0657423883676529, + "step": 3810 + }, + { + "epoch": 1.0, + "learning_rate": 1.0437535929996855e-12, + "logits/chosen": 0.9967252612113953, + "logits/rejected": 0.9475840330123901, + "logps/chosen": -285.2461242675781, + "logps/rejected": -266.7107238769531, + "loss": 0.0281, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.02502075769007206, + "rewards/margins": 0.06247056648135185, + "rewards/rejected": -0.08749131858348846, + "step": 3820 + }, + { + "epoch": 1.0, + "step": 3821, + "total_flos": 0.0, + "train_loss": 0.038748307645445686, + "train_runtime": 55741.6245, + "train_samples_per_second": 1.097, + "train_steps_per_second": 0.069 + } + ], + "logging_steps": 10, + "max_steps": 3821, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}