{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.954476479514415, "eval_steps": 250, "global_step": 2050, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.4390243902439025e-09, "logits/chosen": 23.83146095275879, "logits/rejected": 24.366979598999023, "logps/chosen": -513.1724243164062, "logps/rejected": -471.20977783203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05, "learning_rate": 2.4390243902439023e-08, "logits/chosen": 25.113672256469727, "logits/rejected": 25.29496955871582, "logps/chosen": -440.1910400390625, "logps/rejected": -464.28997802734375, "loss": 0.6934, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": 0.0020149427000433207, "rewards/margins": 0.00028653975459747016, "rewards/rejected": 0.0017284027999266982, "step": 10 }, { "epoch": 0.1, "learning_rate": 4.878048780487805e-08, "logits/chosen": 24.36227035522461, "logits/rejected": 23.910043716430664, "logps/chosen": -433.28289794921875, "logps/rejected": -493.90667724609375, "loss": 0.6929, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.00046742885024286807, "rewards/margins": 0.0003720354288816452, "rewards/rejected": -0.0008394649485126138, "step": 20 }, { "epoch": 0.15, "learning_rate": 7.317073170731706e-08, "logits/chosen": 24.743637084960938, "logits/rejected": 24.689483642578125, "logps/chosen": -507.70458984375, "logps/rejected": -543.5693969726562, "loss": 0.6932, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": 0.0006420022109523416, "rewards/margins": -0.0008240239694714546, "rewards/rejected": 0.001466025598347187, "step": 30 }, { "epoch": 0.19, "learning_rate": 9.75609756097561e-08, "logits/chosen": 26.19744873046875, "logits/rejected": 25.904888153076172, "logps/chosen": -483.9391174316406, "logps/rejected": -522.8155517578125, "loss": 0.6918, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.000658182892948389, "rewards/margins": 0.0012660929933190346, "rewards/rejected": -0.0006079099839553237, "step": 40 }, { "epoch": 0.24, "learning_rate": 1.219512195121951e-07, "logits/chosen": 25.00406265258789, "logits/rejected": 25.312259674072266, "logps/chosen": -505.9356384277344, "logps/rejected": -498.18035888671875, "loss": 0.6935, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.0002137926931027323, "rewards/margins": -0.0007602882687933743, "rewards/rejected": 0.0009740809909999371, "step": 50 }, { "epoch": 0.29, "learning_rate": 1.4634146341463413e-07, "logits/chosen": 24.721242904663086, "logits/rejected": 24.36328887939453, "logps/chosen": -491.00518798828125, "logps/rejected": -569.5256958007812, "loss": 0.694, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0011952219065278769, "rewards/margins": -0.001965393777936697, "rewards/rejected": 0.000770171987824142, "step": 60 }, { "epoch": 0.34, "learning_rate": 1.7073170731707317e-07, "logits/chosen": 24.990764617919922, "logits/rejected": 24.78268814086914, "logps/chosen": -418.36761474609375, "logps/rejected": -448.0746154785156, "loss": 0.6932, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.0008768116822466254, "rewards/margins": 0.0001256523682968691, "rewards/rejected": 0.0007511593285016716, "step": 70 }, { "epoch": 0.39, "learning_rate": 1.951219512195122e-07, "logits/chosen": 25.365007400512695, "logits/rejected": 24.888202667236328, "logps/chosen": -414.1890563964844, "logps/rejected": -434.57550048828125, "loss": 0.6934, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0006678829668089747, "rewards/margins": -0.001990665215998888, "rewards/rejected": 0.0013227818999439478, "step": 80 }, { "epoch": 0.44, "learning_rate": 2.195121951219512e-07, "logits/chosen": 25.361377716064453, "logits/rejected": 25.293743133544922, "logps/chosen": -452.96783447265625, "logps/rejected": -467.5086975097656, "loss": 0.6927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0010917505715042353, "rewards/margins": 0.004853174090385437, "rewards/rejected": -0.0037614230532199144, "step": 90 }, { "epoch": 0.49, "learning_rate": 2.439024390243902e-07, "logits/chosen": 24.833248138427734, "logits/rejected": 24.911867141723633, "logps/chosen": -463.259033203125, "logps/rejected": -451.12750244140625, "loss": 0.6936, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.001009153900668025, "rewards/margins": -0.000572516699321568, "rewards/rejected": 0.0015816707164049149, "step": 100 }, { "epoch": 0.53, "learning_rate": 2.682926829268293e-07, "logits/chosen": 25.980321884155273, "logits/rejected": 26.036426544189453, "logps/chosen": -485.67852783203125, "logps/rejected": -501.48291015625, "loss": 0.6927, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.003403586568310857, "rewards/margins": 0.007840663194656372, "rewards/rejected": -0.004437076393514872, "step": 110 }, { "epoch": 0.58, "learning_rate": 2.9268292682926825e-07, "logits/chosen": 24.954368591308594, "logits/rejected": 25.147586822509766, "logps/chosen": -493.1083984375, "logps/rejected": -499.5880432128906, "loss": 0.6937, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.001717461971566081, "rewards/margins": -0.0020078145898878574, "rewards/rejected": 0.00029035229817964137, "step": 120 }, { "epoch": 0.63, "learning_rate": 3.170731707317073e-07, "logits/chosen": 25.536571502685547, "logits/rejected": 25.209754943847656, "logps/chosen": -443.41363525390625, "logps/rejected": -473.89483642578125, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": -0.0012768972665071487, "rewards/margins": -0.0017949879402294755, "rewards/rejected": 0.0005180907319299877, "step": 130 }, { "epoch": 0.68, "learning_rate": 3.4146341463414634e-07, "logits/chosen": 24.778900146484375, "logits/rejected": 24.485666275024414, "logps/chosen": -463.8463439941406, "logps/rejected": -505.55657958984375, "loss": 0.693, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.003415347309783101, "rewards/margins": -0.0018013190710917115, "rewards/rejected": -0.0016140276566147804, "step": 140 }, { "epoch": 0.73, "learning_rate": 3.6585365853658536e-07, "logits/chosen": 26.186752319335938, "logits/rejected": 26.14641761779785, "logps/chosen": -454.46356201171875, "logps/rejected": -493.9950256347656, "loss": 0.6929, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.0016146504785865545, "rewards/margins": 0.0017917368095368147, "rewards/rejected": -0.00017708637460600585, "step": 150 }, { "epoch": 0.78, "learning_rate": 3.902439024390244e-07, "logits/chosen": 26.05021095275879, "logits/rejected": 25.78249740600586, "logps/chosen": -446.4471740722656, "logps/rejected": -482.791015625, "loss": 0.6929, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.0032919864170253277, "rewards/margins": -0.0010836247820407152, "rewards/rejected": -0.002208361867815256, "step": 160 }, { "epoch": 0.83, "learning_rate": 4.146341463414634e-07, "logits/chosen": 25.29595184326172, "logits/rejected": 24.819242477416992, "logps/chosen": -449.38006591796875, "logps/rejected": -485.6714782714844, "loss": 0.6932, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.004366643726825714, "rewards/margins": 0.0014269489329308271, "rewards/rejected": -0.00579359196126461, "step": 170 }, { "epoch": 0.87, "learning_rate": 4.390243902439024e-07, "logits/chosen": 25.511014938354492, "logits/rejected": 25.20175552368164, "logps/chosen": -485.9794006347656, "logps/rejected": -547.3993530273438, "loss": 0.6914, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -9.034853428602219e-05, "rewards/margins": 0.007031626999378204, "rewards/rejected": -0.007121975068002939, "step": 180 }, { "epoch": 0.92, "learning_rate": 4.634146341463415e-07, "logits/chosen": 25.762130737304688, "logits/rejected": 25.043359756469727, "logps/chosen": -436.8501892089844, "logps/rejected": -504.27337646484375, "loss": 0.6927, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0031238270457834005, "rewards/margins": 0.0019209437305107713, "rewards/rejected": -0.0050447722896933556, "step": 190 }, { "epoch": 0.97, "learning_rate": 4.878048780487804e-07, "logits/chosen": 25.300086975097656, "logits/rejected": 25.226686477661133, "logps/chosen": -411.12091064453125, "logps/rejected": -453.89599609375, "loss": 0.6934, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0008737426251173019, "rewards/margins": 0.0030270516872406006, "rewards/rejected": -0.0021533078979700804, "step": 200 }, { "epoch": 1.02, "learning_rate": 4.999909394533081e-07, "logits/chosen": 25.390342712402344, "logits/rejected": 24.91876983642578, "logps/chosen": -444.7391052246094, "logps/rejected": -464.7679138183594, "loss": 0.6903, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0024602802004665136, "rewards/margins": 0.011555373668670654, "rewards/rejected": -0.014015654101967812, "step": 210 }, { "epoch": 1.07, "learning_rate": 4.99918459020214e-07, "logits/chosen": 25.847061157226562, "logits/rejected": 26.030590057373047, "logps/chosen": -463.7713928222656, "logps/rejected": -441.03118896484375, "loss": 0.6897, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.005838096607476473, "rewards/margins": 0.0016079202760010958, "rewards/rejected": -0.007446016184985638, "step": 220 }, { "epoch": 1.12, "learning_rate": 4.997735191684404e-07, "logits/chosen": 24.9683837890625, "logits/rejected": 24.894073486328125, "logps/chosen": -421.6605529785156, "logps/rejected": -454.12445068359375, "loss": 0.6908, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.0041381316259503365, "rewards/margins": 0.0008476437069475651, "rewards/rejected": -0.004985774867236614, "step": 230 }, { "epoch": 1.17, "learning_rate": 4.995561619207226e-07, "logits/chosen": 24.7789249420166, "logits/rejected": 24.977474212646484, "logps/chosen": -551.719482421875, "logps/rejected": -586.6744995117188, "loss": 0.6901, "rewards/accuracies": 0.5, "rewards/chosen": -0.019014570862054825, "rewards/margins": 0.008881723508238792, "rewards/rejected": -0.027896294370293617, "step": 240 }, { "epoch": 1.21, "learning_rate": 4.992664502959351e-07, "logits/chosen": 24.666425704956055, "logits/rejected": 24.653018951416016, "logps/chosen": -446.96844482421875, "logps/rejected": -463.4334411621094, "loss": 0.6895, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.006912347860634327, "rewards/margins": 0.005273967050015926, "rewards/rejected": -0.012186313979327679, "step": 250 }, { "epoch": 1.26, "learning_rate": 4.989044682908178e-07, "logits/chosen": 25.226409912109375, "logits/rejected": 24.97530174255371, "logps/chosen": -491.82000732421875, "logps/rejected": -502.476318359375, "loss": 0.6887, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.005164532456547022, "rewards/margins": 0.008609605953097343, "rewards/rejected": -0.013774137012660503, "step": 260 }, { "epoch": 1.31, "learning_rate": 4.984703208556244e-07, "logits/chosen": 24.473495483398438, "logits/rejected": 24.27614974975586, "logps/chosen": -405.4839782714844, "logps/rejected": -428.57958984375, "loss": 0.6873, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.004830534104257822, "rewards/margins": 0.00787128135561943, "rewards/rejected": -0.012701814994215965, "step": 270 }, { "epoch": 1.36, "learning_rate": 4.979641338636934e-07, "logits/chosen": 25.67917823791504, "logits/rejected": 25.326581954956055, "logps/chosen": -449.4097595214844, "logps/rejected": -484.1585998535156, "loss": 0.6879, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.006435055285692215, "rewards/margins": 0.010736002586781979, "rewards/rejected": -0.017171058803796768, "step": 280 }, { "epoch": 1.41, "learning_rate": 4.973860540749533e-07, "logits/chosen": 25.372289657592773, "logits/rejected": 25.274517059326172, "logps/chosen": -453.90704345703125, "logps/rejected": -508.42364501953125, "loss": 0.6872, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.017870599403977394, "rewards/margins": 0.010955670848488808, "rewards/rejected": -0.028826270252466202, "step": 290 }, { "epoch": 1.46, "learning_rate": 4.967362490933723e-07, "logits/chosen": 24.464265823364258, "logits/rejected": 24.45247459411621, "logps/chosen": -483.70562744140625, "logps/rejected": -502.24139404296875, "loss": 0.6873, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.013566325418651104, "rewards/margins": 0.0146838603541255, "rewards/rejected": -0.028250187635421753, "step": 300 }, { "epoch": 1.51, "learning_rate": 4.960149073183643e-07, "logits/chosen": 24.628841400146484, "logits/rejected": 24.8803768157959, "logps/chosen": -490.34234619140625, "logps/rejected": -478.694580078125, "loss": 0.6866, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.01605142280459404, "rewards/margins": 0.00866149552166462, "rewards/rejected": -0.02471291646361351, "step": 310 }, { "epoch": 1.55, "learning_rate": 4.95222237890166e-07, "logits/chosen": 24.773128509521484, "logits/rejected": 25.139238357543945, "logps/chosen": -479.4823303222656, "logps/rejected": -452.5008850097656, "loss": 0.6861, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.013470214791595936, "rewards/margins": 0.006181957200169563, "rewards/rejected": -0.019652169197797775, "step": 320 }, { "epoch": 1.6, "learning_rate": 4.943584706292005e-07, "logits/chosen": 23.42044448852539, "logits/rejected": 23.604700088500977, "logps/chosen": -449.1239318847656, "logps/rejected": -457.1341247558594, "loss": 0.6848, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.015130296349525452, "rewards/margins": 0.01666988618671894, "rewards/rejected": -0.03180018067359924, "step": 330 }, { "epoch": 1.65, "learning_rate": 4.934238559694447e-07, "logits/chosen": 24.050968170166016, "logits/rejected": 23.906604766845703, "logps/chosen": -468.267333984375, "logps/rejected": -493.30780029296875, "loss": 0.6877, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.024675091728568077, "rewards/margins": 0.010691693052649498, "rewards/rejected": -0.035366784781217575, "step": 340 }, { "epoch": 1.7, "learning_rate": 4.924186648858207e-07, "logits/chosen": 24.65400505065918, "logits/rejected": 24.585697174072266, "logps/chosen": -400.6718444824219, "logps/rejected": -425.316650390625, "loss": 0.6856, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.009073630906641483, "rewards/margins": 0.01387846004217863, "rewards/rejected": -0.022952087223529816, "step": 350 }, { "epoch": 1.75, "learning_rate": 4.913431888156309e-07, "logits/chosen": 23.99672508239746, "logits/rejected": 24.127235412597656, "logps/chosen": -453.24615478515625, "logps/rejected": -438.322509765625, "loss": 0.6853, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.027061685919761658, "rewards/margins": 0.006015921477228403, "rewards/rejected": -0.03307760879397392, "step": 360 }, { "epoch": 1.8, "learning_rate": 4.901977395740619e-07, "logits/chosen": 25.19208526611328, "logits/rejected": 24.945858001708984, "logps/chosen": -420.7489318847656, "logps/rejected": -421.52813720703125, "loss": 0.6833, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.013960194773972034, "rewards/margins": 0.031657829880714417, "rewards/rejected": -0.045618023723363876, "step": 370 }, { "epoch": 1.85, "learning_rate": 4.889826492637781e-07, "logits/chosen": 24.140026092529297, "logits/rejected": 24.18136978149414, "logps/chosen": -455.3021545410156, "logps/rejected": -455.71685791015625, "loss": 0.6842, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.02444566786289215, "rewards/margins": 0.021536489948630333, "rewards/rejected": -0.04598215967416763, "step": 380 }, { "epoch": 1.89, "learning_rate": 4.876982701786351e-07, "logits/chosen": 24.755725860595703, "logits/rejected": 24.589107513427734, "logps/chosen": -481.8275451660156, "logps/rejected": -526.6813354492188, "loss": 0.6825, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02834920585155487, "rewards/margins": 0.04028704762458801, "rewards/rejected": -0.06863625347614288, "step": 390 }, { "epoch": 1.94, "learning_rate": 4.863449747015383e-07, "logits/chosen": 24.02549171447754, "logits/rejected": 23.976415634155273, "logps/chosen": -439.884765625, "logps/rejected": -450.75079345703125, "loss": 0.6872, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.031968437135219574, "rewards/margins": 0.012536533176898956, "rewards/rejected": -0.04450497403740883, "step": 400 }, { "epoch": 1.99, "learning_rate": 4.849231551964771e-07, "logits/chosen": 23.458415985107422, "logits/rejected": 23.864471435546875, "logps/chosen": -444.437744140625, "logps/rejected": -470.9718322753906, "loss": 0.6817, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.027122044935822487, "rewards/margins": 0.01693027839064598, "rewards/rejected": -0.04405232518911362, "step": 410 }, { "epoch": 2.04, "learning_rate": 4.834332238947655e-07, "logits/chosen": 25.05409812927246, "logits/rejected": 25.178359985351562, "logps/chosen": -397.0162658691406, "logps/rejected": -431.53997802734375, "loss": 0.6798, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.016871152445673943, "rewards/margins": 0.027827546000480652, "rewards/rejected": -0.044698696583509445, "step": 420 }, { "epoch": 2.09, "learning_rate": 4.818756127755237e-07, "logits/chosen": 24.50407600402832, "logits/rejected": 24.129384994506836, "logps/chosen": -440.77374267578125, "logps/rejected": -477.9395446777344, "loss": 0.6728, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.027348514646291733, "rewards/margins": 0.043705716729164124, "rewards/rejected": -0.07105423510074615, "step": 430 }, { "epoch": 2.14, "learning_rate": 4.802507734404325e-07, "logits/chosen": 24.7203426361084, "logits/rejected": 24.716331481933594, "logps/chosen": -503.48583984375, "logps/rejected": -516.9214477539062, "loss": 0.6794, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.025318074971437454, "rewards/margins": 0.05121081322431564, "rewards/rejected": -0.0765288919210434, "step": 440 }, { "epoch": 2.19, "learning_rate": 4.785591769828005e-07, "logits/chosen": 24.545543670654297, "logits/rejected": 24.529743194580078, "logps/chosen": -462.376953125, "logps/rejected": -532.814453125, "loss": 0.6778, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.036473408341407776, "rewards/margins": 0.03656883165240288, "rewards/rejected": -0.07304224371910095, "step": 450 }, { "epoch": 2.23, "learning_rate": 4.76801313850978e-07, "logits/chosen": 22.99500274658203, "logits/rejected": 22.959396362304688, "logps/chosen": -456.98760986328125, "logps/rejected": -483.37432861328125, "loss": 0.6748, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.036379098892211914, "rewards/margins": 0.05310980603098869, "rewards/rejected": -0.0894889086484909, "step": 460 }, { "epoch": 2.28, "learning_rate": 4.749776937061606e-07, "logits/chosen": 24.91106414794922, "logits/rejected": 24.693490982055664, "logps/chosen": -492.05322265625, "logps/rejected": -560.6348876953125, "loss": 0.6703, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.06551917642354965, "rewards/margins": 0.06626399606466293, "rewards/rejected": -0.13178317248821259, "step": 470 }, { "epoch": 2.33, "learning_rate": 4.730888452746222e-07, "logits/chosen": 23.304880142211914, "logits/rejected": 23.373096466064453, "logps/chosen": -489.0440979003906, "logps/rejected": -516.6759033203125, "loss": 0.6716, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.05236157774925232, "rewards/margins": 0.04353173449635506, "rewards/rejected": -0.09589330852031708, "step": 480 }, { "epoch": 2.38, "learning_rate": 4.711353161944198e-07, "logits/chosen": 24.48396873474121, "logits/rejected": 24.153030395507812, "logps/chosen": -482.59100341796875, "logps/rejected": -511.71112060546875, "loss": 0.6692, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.06634119898080826, "rewards/margins": 0.05835053324699402, "rewards/rejected": -0.12469172477722168, "step": 490 }, { "epoch": 2.43, "learning_rate": 4.6911767285661583e-07, "logits/chosen": 23.99973487854004, "logits/rejected": 23.886028289794922, "logps/chosen": -451.34564208984375, "logps/rejected": -481.80126953125, "loss": 0.6719, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.04214207082986832, "rewards/margins": 0.0447273924946785, "rewards/rejected": -0.08686945587396622, "step": 500 }, { "epoch": 2.48, "learning_rate": 4.6703650024106324e-07, "logits/chosen": 24.31854248046875, "logits/rejected": 24.71249771118164, "logps/chosen": -512.5477294921875, "logps/rejected": -497.3389587402344, "loss": 0.673, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.05323835089802742, "rewards/margins": 0.03783535957336426, "rewards/rejected": -0.09107370674610138, "step": 510 }, { "epoch": 2.53, "learning_rate": 4.6489240174680026e-07, "logits/chosen": 23.387109756469727, "logits/rejected": 23.42694664001465, "logps/chosen": -469.442626953125, "logps/rejected": -464.22137451171875, "loss": 0.6719, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.06812478601932526, "rewards/margins": 0.04352138563990593, "rewards/rejected": -0.11164617538452148, "step": 520 }, { "epoch": 2.57, "learning_rate": 4.626859990171067e-07, "logits/chosen": 22.03474998474121, "logits/rejected": 22.551021575927734, "logps/chosen": -503.203369140625, "logps/rejected": -512.3323364257812, "loss": 0.6698, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.07380495220422745, "rewards/margins": 0.0384240485727787, "rewards/rejected": -0.11222900450229645, "step": 530 }, { "epoch": 2.62, "learning_rate": 4.604179317592686e-07, "logits/chosen": 22.982210159301758, "logits/rejected": 23.15829086303711, "logps/chosen": -482.0206604003906, "logps/rejected": -482.66864013671875, "loss": 0.6695, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07758830487728119, "rewards/margins": 0.05295907333493233, "rewards/rejected": -0.13054737448692322, "step": 540 }, { "epoch": 2.67, "learning_rate": 4.5808885755910673e-07, "logits/chosen": 23.076644897460938, "logits/rejected": 23.11397361755371, "logps/chosen": -428.58734130859375, "logps/rejected": -437.57757568359375, "loss": 0.6677, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.05579688400030136, "rewards/margins": 0.037121035158634186, "rewards/rejected": -0.09291792660951614, "step": 550 }, { "epoch": 2.72, "learning_rate": 4.5569945169032164e-07, "logits/chosen": 23.258991241455078, "logits/rejected": 23.74163818359375, "logps/chosen": -438.8357849121094, "logps/rejected": -455.6856384277344, "loss": 0.6698, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06992115080356598, "rewards/margins": 0.06130353733897209, "rewards/rejected": -0.13122470676898956, "step": 560 }, { "epoch": 2.77, "learning_rate": 4.532504069187094e-07, "logits/chosen": 23.573583602905273, "logits/rejected": 23.63486099243164, "logps/chosen": -433.8453063964844, "logps/rejected": -453.99713134765625, "loss": 0.672, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.05322835594415665, "rewards/margins": 0.038681793957948685, "rewards/rejected": -0.09191014617681503, "step": 570 }, { "epoch": 2.82, "learning_rate": 4.507424333013069e-07, "logits/chosen": 22.418657302856445, "logits/rejected": 22.448223114013672, "logps/chosen": -407.6749267578125, "logps/rejected": -427.78485107421875, "loss": 0.668, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.08355311304330826, "rewards/margins": 0.053498029708862305, "rewards/rejected": -0.13705115020275116, "step": 580 }, { "epoch": 2.86, "learning_rate": 4.481762579805232e-07, "logits/chosen": 22.880319595336914, "logits/rejected": 22.28330421447754, "logps/chosen": -441.55804443359375, "logps/rejected": -507.30206298828125, "loss": 0.6648, "rewards/accuracies": 0.65625, "rewards/chosen": -0.08028487116098404, "rewards/margins": 0.10107441991567612, "rewards/rejected": -0.18135927617549896, "step": 590 }, { "epoch": 2.91, "learning_rate": 4.455526249733178e-07, "logits/chosen": 22.544607162475586, "logits/rejected": 22.759769439697266, "logps/chosen": -467.66009521484375, "logps/rejected": -458.80419921875, "loss": 0.6687, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.06196471303701401, "rewards/margins": 0.0634593591094017, "rewards/rejected": -0.12542405724525452, "step": 600 }, { "epoch": 2.96, "learning_rate": 4.4287229495548573e-07, "logits/chosen": 23.092634201049805, "logits/rejected": 22.90291976928711, "logps/chosen": -462.6261291503906, "logps/rejected": -483.6346130371094, "loss": 0.6649, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.09866807609796524, "rewards/margins": 0.11319071054458618, "rewards/rejected": -0.21185874938964844, "step": 610 }, { "epoch": 3.01, "learning_rate": 4.4013604504111347e-07, "logits/chosen": 23.19097328186035, "logits/rejected": 22.682292938232422, "logps/chosen": -420.0496520996094, "logps/rejected": -469.2996520996094, "loss": 0.6614, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.059682805091142654, "rewards/margins": 0.0631704181432724, "rewards/rejected": -0.12285321950912476, "step": 620 }, { "epoch": 3.06, "learning_rate": 4.3734466855726823e-07, "logits/chosen": 23.40199089050293, "logits/rejected": 23.664241790771484, "logps/chosen": -521.8033447265625, "logps/rejected": -551.7887573242188, "loss": 0.658, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.10334376245737076, "rewards/margins": 0.07836399972438812, "rewards/rejected": -0.18170776963233948, "step": 630 }, { "epoch": 3.11, "learning_rate": 4.344989748139873e-07, "logits/chosen": 22.226797103881836, "logits/rejected": 21.844837188720703, "logps/chosen": -438.06756591796875, "logps/rejected": -524.7212524414062, "loss": 0.6528, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08786562830209732, "rewards/margins": 0.17397987842559814, "rewards/rejected": -0.26184552907943726, "step": 640 }, { "epoch": 3.16, "learning_rate": 4.315997888696322e-07, "logits/chosen": 22.644672393798828, "logits/rejected": 22.466157913208008, "logps/chosen": -456.107666015625, "logps/rejected": -488.3030700683594, "loss": 0.6502, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.09486567229032516, "rewards/margins": 0.18466800451278687, "rewards/rejected": -0.2795336842536926, "step": 650 }, { "epoch": 3.2, "learning_rate": 4.2864795129167865e-07, "logits/chosen": 21.293453216552734, "logits/rejected": 21.037721633911133, "logps/chosen": -492.1611328125, "logps/rejected": -532.6552734375, "loss": 0.6516, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.148245707154274, "rewards/margins": 0.15969568490982056, "rewards/rejected": -0.30794137716293335, "step": 660 }, { "epoch": 3.25, "learning_rate": 4.25644317913008e-07, "logits/chosen": 22.722232818603516, "logits/rejected": 22.16741943359375, "logps/chosen": -501.4042053222656, "logps/rejected": -566.7221069335938, "loss": 0.6456, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13956955075263977, "rewards/margins": 0.29204845428466797, "rewards/rejected": -0.4316180348396301, "step": 670 }, { "epoch": 3.3, "learning_rate": 4.2258975958377437e-07, "logits/chosen": 22.7330379486084, "logits/rejected": 22.67351722717285, "logps/chosen": -489.033935546875, "logps/rejected": -503.78778076171875, "loss": 0.6467, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.12569200992584229, "rewards/margins": 0.0982416495680809, "rewards/rejected": -0.22393366694450378, "step": 680 }, { "epoch": 3.35, "learning_rate": 4.194851619189169e-07, "logits/chosen": 21.356544494628906, "logits/rejected": 20.965524673461914, "logps/chosen": -456.1627502441406, "logps/rejected": -518.4822387695312, "loss": 0.6516, "rewards/accuracies": 0.71875, "rewards/chosen": -0.12254482507705688, "rewards/margins": 0.10786397755146027, "rewards/rejected": -0.23040878772735596, "step": 690 }, { "epoch": 3.4, "learning_rate": 4.163314250413913e-07, "logits/chosen": 22.082752227783203, "logits/rejected": 22.4614315032959, "logps/chosen": -464.14080810546875, "logps/rejected": -480.8935546875, "loss": 0.6588, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.137410506606102, "rewards/margins": 0.1056610569357872, "rewards/rejected": -0.24307158589363098, "step": 700 }, { "epoch": 3.45, "learning_rate": 4.131294633211954e-07, "logits/chosen": 22.201583862304688, "logits/rejected": 22.151172637939453, "logps/chosen": -499.27032470703125, "logps/rejected": -530.463623046875, "loss": 0.6526, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1417187750339508, "rewards/margins": 0.10248730331659317, "rewards/rejected": -0.24420607089996338, "step": 710 }, { "epoch": 3.5, "learning_rate": 4.098802051102635e-07, "logits/chosen": 21.357421875, "logits/rejected": 21.796932220458984, "logps/chosen": -467.2061462402344, "logps/rejected": -489.28143310546875, "loss": 0.65, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.14098785817623138, "rewards/margins": 0.11554199457168579, "rewards/rejected": -0.25652986764907837, "step": 720 }, { "epoch": 3.54, "learning_rate": 4.065845924733076e-07, "logits/chosen": 21.877614974975586, "logits/rejected": 21.869991302490234, "logps/chosen": -525.2653198242188, "logps/rejected": -533.6386108398438, "loss": 0.65, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.14704535901546478, "rewards/margins": 0.15783420205116272, "rewards/rejected": -0.3048795163631439, "step": 730 }, { "epoch": 3.59, "learning_rate": 4.0324358091468226e-07, "logits/chosen": 21.090862274169922, "logits/rejected": 20.943960189819336, "logps/chosen": -482.4337463378906, "logps/rejected": -536.8486328125, "loss": 0.6535, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.21552202105522156, "rewards/margins": 0.1812928169965744, "rewards/rejected": -0.39681482315063477, "step": 740 }, { "epoch": 3.64, "learning_rate": 3.99858139101353e-07, "logits/chosen": 21.675159454345703, "logits/rejected": 21.479019165039062, "logps/chosen": -580.7256469726562, "logps/rejected": -579.4886474609375, "loss": 0.644, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.1938856542110443, "rewards/margins": 0.13946060836315155, "rewards/rejected": -0.33334627747535706, "step": 750 }, { "epoch": 3.69, "learning_rate": 3.964292485820487e-07, "logits/chosen": 21.661422729492188, "logits/rejected": 21.30933380126953, "logps/chosen": -474.04315185546875, "logps/rejected": -520.8201904296875, "loss": 0.6441, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.13717155158519745, "rewards/margins": 0.17133645713329315, "rewards/rejected": -0.3085080087184906, "step": 760 }, { "epoch": 3.74, "learning_rate": 3.929579035026788e-07, "logits/chosen": 22.402542114257812, "logits/rejected": 22.312419891357422, "logps/chosen": -430.6495056152344, "logps/rejected": -463.7230529785156, "loss": 0.6431, "rewards/accuracies": 0.71875, "rewards/chosen": -0.07999514788389206, "rewards/margins": 0.18375879526138306, "rewards/rejected": -0.2637539505958557, "step": 770 }, { "epoch": 3.79, "learning_rate": 3.8944511031809865e-07, "logits/chosen": 21.463491439819336, "logits/rejected": 21.453006744384766, "logps/chosen": -502.7759704589844, "logps/rejected": -547.2340087890625, "loss": 0.6575, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.1663341373205185, "rewards/margins": 0.12559418380260468, "rewards/rejected": -0.29192835092544556, "step": 780 }, { "epoch": 3.84, "learning_rate": 3.858918875003053e-07, "logits/chosen": 21.34784698486328, "logits/rejected": 21.8105525970459, "logps/chosen": -517.6514282226562, "logps/rejected": -540.4078979492188, "loss": 0.6481, "rewards/accuracies": 0.75, "rewards/chosen": -0.16342517733573914, "rewards/margins": 0.11599358171224594, "rewards/rejected": -0.2794187664985657, "step": 790 }, { "epoch": 3.88, "learning_rate": 3.8229926524315013e-07, "logits/chosen": 22.361557006835938, "logits/rejected": 22.051918029785156, "logps/chosen": -479.0370178222656, "logps/rejected": -508.67523193359375, "loss": 0.6509, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.17205052077770233, "rewards/margins": 0.13092947006225586, "rewards/rejected": -0.3029800057411194, "step": 800 }, { "epoch": 3.93, "learning_rate": 3.7866828516365223e-07, "logits/chosen": 21.549245834350586, "logits/rejected": 21.51742172241211, "logps/chosen": -488.44146728515625, "logps/rejected": -505.7699279785156, "loss": 0.6473, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1810511201620102, "rewards/margins": 0.11019665002822876, "rewards/rejected": -0.29124775528907776, "step": 810 }, { "epoch": 3.98, "learning_rate": 3.75e-07, "logits/chosen": 21.30332374572754, "logits/rejected": 21.18606185913086, "logps/chosen": -468.67852783203125, "logps/rejected": -542.9542236328125, "loss": 0.6399, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.14327475428581238, "rewards/margins": 0.19075681269168854, "rewards/rejected": -0.3340315818786621, "step": 820 }, { "epoch": 4.03, "learning_rate": 3.712954733063284e-07, "logits/chosen": 20.62077522277832, "logits/rejected": 20.85430908203125, "logps/chosen": -418.8779296875, "logps/rejected": -455.548095703125, "loss": 0.6344, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11488916724920273, "rewards/margins": 0.23587051033973694, "rewards/rejected": -0.3507596552371979, "step": 830 }, { "epoch": 4.08, "learning_rate": 3.6755577914436054e-07, "logits/chosen": 21.663427352905273, "logits/rejected": 21.377941131591797, "logps/chosen": -541.1193237304688, "logps/rejected": -579.5726318359375, "loss": 0.6321, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.19904150068759918, "rewards/margins": 0.19058963656425476, "rewards/rejected": -0.3896311819553375, "step": 840 }, { "epoch": 4.13, "learning_rate": 3.637820017720022e-07, "logits/chosen": 20.406190872192383, "logits/rejected": 19.97982406616211, "logps/chosen": -452.04754638671875, "logps/rejected": -489.46826171875, "loss": 0.6266, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.16855832934379578, "rewards/margins": 0.13646240532398224, "rewards/rejected": -0.3050207495689392, "step": 850 }, { "epoch": 4.18, "learning_rate": 3.599752353289808e-07, "logits/chosen": 20.002662658691406, "logits/rejected": 20.125558853149414, "logps/chosen": -461.048095703125, "logps/rejected": -485.640869140625, "loss": 0.6367, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.18868403136730194, "rewards/margins": 0.16586804389953613, "rewards/rejected": -0.35455209016799927, "step": 860 }, { "epoch": 4.22, "learning_rate": 3.56136583519619e-07, "logits/chosen": 20.461118698120117, "logits/rejected": 20.211368560791016, "logps/chosen": -473.6031188964844, "logps/rejected": -509.35565185546875, "loss": 0.634, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1867034137248993, "rewards/margins": 0.1752786636352539, "rewards/rejected": -0.3619820773601532, "step": 870 }, { "epoch": 4.27, "learning_rate": 3.52267159292835e-07, "logits/chosen": 20.3443603515625, "logits/rejected": 20.236209869384766, "logps/chosen": -500.35247802734375, "logps/rejected": -538.0313720703125, "loss": 0.6186, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2049115002155304, "rewards/margins": 0.20946213603019714, "rewards/rejected": -0.41437363624572754, "step": 880 }, { "epoch": 4.32, "learning_rate": 3.483680845194629e-07, "logits/chosen": 19.040111541748047, "logits/rejected": 19.063495635986328, "logps/chosen": -503.2266540527344, "logps/rejected": -557.2120361328125, "loss": 0.6408, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.2737266421318054, "rewards/margins": 0.1703735888004303, "rewards/rejected": -0.4441002309322357, "step": 890 }, { "epoch": 4.37, "learning_rate": 3.444404896669864e-07, "logits/chosen": 19.740901947021484, "logits/rejected": 19.64006233215332, "logps/chosen": -513.2991333007812, "logps/rejected": -551.9752807617188, "loss": 0.6262, "rewards/accuracies": 0.75, "rewards/chosen": -0.22840385138988495, "rewards/margins": 0.22724993526935577, "rewards/rejected": -0.45565375685691833, "step": 900 }, { "epoch": 4.42, "learning_rate": 3.4048551347177943e-07, "logits/chosen": 19.359893798828125, "logits/rejected": 19.82449722290039, "logps/chosen": -478.95452880859375, "logps/rejected": -482.80731201171875, "loss": 0.6325, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.21430854499340057, "rewards/margins": 0.14684443175792694, "rewards/rejected": -0.3611529767513275, "step": 910 }, { "epoch": 4.47, "learning_rate": 3.365043026089501e-07, "logits/chosen": 19.821773529052734, "logits/rejected": 19.241750717163086, "logps/chosen": -502.0999450683594, "logps/rejected": -568.6721801757812, "loss": 0.6354, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2433268278837204, "rewards/margins": 0.28582024574279785, "rewards/rejected": -0.5291470289230347, "step": 920 }, { "epoch": 4.52, "learning_rate": 3.3249801135988236e-07, "logits/chosen": 20.231494903564453, "logits/rejected": 20.169645309448242, "logps/chosen": -473.277099609375, "logps/rejected": -506.4814453125, "loss": 0.6261, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.19130480289459229, "rewards/margins": 0.21689121425151825, "rewards/rejected": -0.40819603204727173, "step": 930 }, { "epoch": 4.56, "learning_rate": 3.284678012775727e-07, "logits/chosen": 20.264537811279297, "logits/rejected": 19.87087631225586, "logps/chosen": -489.9398498535156, "logps/rejected": -575.63671875, "loss": 0.6216, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2465265691280365, "rewards/margins": 0.2502959966659546, "rewards/rejected": -0.4968225359916687, "step": 940 }, { "epoch": 4.61, "learning_rate": 3.2441484084985866e-07, "logits/chosen": 19.91929054260254, "logits/rejected": 19.753122329711914, "logps/chosen": -473.8438415527344, "logps/rejected": -550.4216918945312, "loss": 0.6254, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2036692351102829, "rewards/margins": 0.2004026174545288, "rewards/rejected": -0.4040718674659729, "step": 950 }, { "epoch": 4.66, "learning_rate": 3.203403051606362e-07, "logits/chosen": 19.359331130981445, "logits/rejected": 18.84432601928711, "logps/chosen": -432.80950927734375, "logps/rejected": -495.875, "loss": 0.6273, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2121230810880661, "rewards/margins": 0.2295958250761032, "rewards/rejected": -0.4417189061641693, "step": 960 }, { "epoch": 4.71, "learning_rate": 3.162453755491655e-07, "logits/chosen": 19.773300170898438, "logits/rejected": 20.017602920532227, "logps/chosen": -488.0621032714844, "logps/rejected": -516.9225463867188, "loss": 0.6339, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.22858639061450958, "rewards/margins": 0.1711055487394333, "rewards/rejected": -0.3996918797492981, "step": 970 }, { "epoch": 4.76, "learning_rate": 3.1213123926756174e-07, "logits/chosen": 19.31648063659668, "logits/rejected": 19.08823013305664, "logps/chosen": -508.760009765625, "logps/rejected": -547.8890991210938, "loss": 0.6245, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.25501328706741333, "rewards/margins": 0.20005568861961365, "rewards/rejected": -0.455068975687027, "step": 980 }, { "epoch": 4.81, "learning_rate": 3.0799908913657367e-07, "logits/chosen": 21.24723243713379, "logits/rejected": 20.8863468170166, "logps/chosen": -459.1988830566406, "logps/rejected": -493.8885803222656, "loss": 0.6141, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.18732424080371857, "rewards/margins": 0.16734528541564941, "rewards/rejected": -0.3546695113182068, "step": 990 }, { "epoch": 4.86, "learning_rate": 3.0385012319974533e-07, "logits/chosen": 19.451326370239258, "logits/rejected": 18.73493766784668, "logps/chosen": -506.89697265625, "logps/rejected": -547.7899169921875, "loss": 0.6236, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.25920993089675903, "rewards/margins": 0.2783968448638916, "rewards/rejected": -0.5376068353652954, "step": 1000 }, { "epoch": 4.9, "learning_rate": 2.996855443760651e-07, "logits/chosen": 18.372570037841797, "logits/rejected": 18.149747848510742, "logps/chosen": -456.7911682128906, "logps/rejected": -510.40155029296875, "loss": 0.6235, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.29108864068984985, "rewards/margins": 0.19627229869365692, "rewards/rejected": -0.4873608648777008, "step": 1010 }, { "epoch": 4.95, "learning_rate": 2.955065601112005e-07, "logits/chosen": 18.089101791381836, "logits/rejected": 17.99216079711914, "logps/chosen": -497.9737854003906, "logps/rejected": -488.06475830078125, "loss": 0.6259, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2766265869140625, "rewards/margins": 0.1618172526359558, "rewards/rejected": -0.4384438395500183, "step": 1020 }, { "epoch": 5.0, "learning_rate": 2.913143820274212e-07, "logits/chosen": 19.157297134399414, "logits/rejected": 18.781457901000977, "logps/chosen": -478.51336669921875, "logps/rejected": -547.4224853515625, "loss": 0.6255, "rewards/accuracies": 0.75, "rewards/chosen": -0.3197309076786041, "rewards/margins": 0.22106032073497772, "rewards/rejected": -0.5407912135124207, "step": 1030 }, { "epoch": 5.05, "learning_rate": 2.8711022557231015e-07, "logits/chosen": 18.902210235595703, "logits/rejected": 18.82394027709961, "logps/chosen": -529.0880737304688, "logps/rejected": -508.85565185546875, "loss": 0.6151, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3066019117832184, "rewards/margins": 0.16971367597579956, "rewards/rejected": -0.47631555795669556, "step": 1040 }, { "epoch": 5.1, "learning_rate": 2.828953096663662e-07, "logits/chosen": 19.924898147583008, "logits/rejected": 19.486276626586914, "logps/chosen": -437.6498107910156, "logps/rejected": -507.98040771484375, "loss": 0.6111, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.21237564086914062, "rewards/margins": 0.23849359154701233, "rewards/rejected": -0.4508691728115082, "step": 1050 }, { "epoch": 5.15, "learning_rate": 2.786708563496001e-07, "logits/chosen": 19.466875076293945, "logits/rejected": 19.788875579833984, "logps/chosen": -494.97149658203125, "logps/rejected": -527.1760864257812, "loss": 0.6105, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.27766185998916626, "rewards/margins": 0.17516490817070007, "rewards/rejected": -0.4528267979621887, "step": 1060 }, { "epoch": 5.2, "learning_rate": 2.7443809042722544e-07, "logits/chosen": 18.834243774414062, "logits/rejected": 18.642332077026367, "logps/chosen": -465.34686279296875, "logps/rejected": -512.1998291015625, "loss": 0.61, "rewards/accuracies": 0.75, "rewards/chosen": -0.28242748975753784, "rewards/margins": 0.32754284143447876, "rewards/rejected": -0.6099702715873718, "step": 1070 }, { "epoch": 5.24, "learning_rate": 2.7019823911454807e-07, "logits/chosen": 19.293468475341797, "logits/rejected": 19.021459579467773, "logps/chosen": -548.0870971679688, "logps/rejected": -661.7999267578125, "loss": 0.6068, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2817661762237549, "rewards/margins": 0.36686158180236816, "rewards/rejected": -0.648627758026123, "step": 1080 }, { "epoch": 5.29, "learning_rate": 2.6595253168115705e-07, "logits/chosen": 19.84808921813965, "logits/rejected": 19.46322250366211, "logps/chosen": -505.37445068359375, "logps/rejected": -561.1935424804688, "loss": 0.6063, "rewards/accuracies": 0.78125, "rewards/chosen": -0.24887773394584656, "rewards/margins": 0.29243093729019165, "rewards/rejected": -0.5413086414337158, "step": 1090 }, { "epoch": 5.34, "learning_rate": 2.6170219909451967e-07, "logits/chosen": 18.073366165161133, "logits/rejected": 18.1170597076416, "logps/chosen": -528.2265625, "logps/rejected": -528.5629272460938, "loss": 0.6039, "rewards/accuracies": 0.75, "rewards/chosen": -0.33055204153060913, "rewards/margins": 0.24000290036201477, "rewards/rejected": -0.5705549120903015, "step": 1100 }, { "epoch": 5.39, "learning_rate": 2.5744847366308395e-07, "logits/chosen": 18.11019515991211, "logits/rejected": 18.144580841064453, "logps/chosen": -494.4580993652344, "logps/rejected": -561.71484375, "loss": 0.6052, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.3098324239253998, "rewards/margins": 0.20377469062805176, "rewards/rejected": -0.5136070847511292, "step": 1110 }, { "epoch": 5.44, "learning_rate": 2.5319258867899344e-07, "logits/chosen": 18.397960662841797, "logits/rejected": 18.59024429321289, "logps/chosen": -441.19732666015625, "logps/rejected": -474.4454040527344, "loss": 0.6058, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.2723812758922577, "rewards/margins": 0.18869531154632568, "rewards/rejected": -0.4610765874385834, "step": 1120 }, { "epoch": 5.49, "learning_rate": 2.4893577806051536e-07, "logits/chosen": 18.157339096069336, "logits/rejected": 18.65304946899414, "logps/chosen": -463.7784118652344, "logps/rejected": -516.1624145507812, "loss": 0.6077, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.26962539553642273, "rewards/margins": 0.26316118240356445, "rewards/rejected": -0.5327866077423096, "step": 1130 }, { "epoch": 5.54, "learning_rate": 2.4467927599428815e-07, "logits/chosen": 18.12697982788086, "logits/rejected": 17.87653923034668, "logps/chosen": -479.36029052734375, "logps/rejected": -521.6390380859375, "loss": 0.6157, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.3562749922275543, "rewards/margins": 0.23832616209983826, "rewards/rejected": -0.5946012139320374, "step": 1140 }, { "epoch": 5.58, "learning_rate": 2.4042431657749115e-07, "logits/chosen": 18.84893035888672, "logits/rejected": 18.976377487182617, "logps/chosen": -420.193359375, "logps/rejected": -480.762451171875, "loss": 0.6109, "rewards/accuracies": 0.78125, "rewards/chosen": -0.231888085603714, "rewards/margins": 0.21741366386413574, "rewards/rejected": -0.44930171966552734, "step": 1150 }, { "epoch": 5.63, "learning_rate": 2.3617213346003988e-07, "logits/chosen": 18.333866119384766, "logits/rejected": 18.267248153686523, "logps/chosen": -506.20098876953125, "logps/rejected": -571.8406372070312, "loss": 0.6076, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.32090744376182556, "rewards/margins": 0.24834957718849182, "rewards/rejected": -0.5692570209503174, "step": 1160 }, { "epoch": 5.68, "learning_rate": 2.319239594869112e-07, "logits/chosen": 17.52167510986328, "logits/rejected": 17.264745712280273, "logps/chosen": -436.5166015625, "logps/rejected": -520.7471923828125, "loss": 0.5941, "rewards/accuracies": 0.78125, "rewards/chosen": -0.30473703145980835, "rewards/margins": 0.257717102766037, "rewards/rejected": -0.562454104423523, "step": 1170 }, { "epoch": 5.73, "learning_rate": 2.2768102634070143e-07, "logits/chosen": 17.672739028930664, "logits/rejected": 17.970993041992188, "logps/chosen": -445.01165771484375, "logps/rejected": -475.51495361328125, "loss": 0.6056, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2711699604988098, "rewards/margins": 0.23058536648750305, "rewards/rejected": -0.5017553567886353, "step": 1180 }, { "epoch": 5.78, "learning_rate": 2.2344456418452267e-07, "logits/chosen": 17.519277572631836, "logits/rejected": 16.88127326965332, "logps/chosen": -530.1575927734375, "logps/rejected": -594.2427978515625, "loss": 0.5959, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4372480809688568, "rewards/margins": 0.3837299942970276, "rewards/rejected": -0.8209781646728516, "step": 1190 }, { "epoch": 5.83, "learning_rate": 2.1921580130533827e-07, "logits/chosen": 18.267105102539062, "logits/rejected": 18.296194076538086, "logps/chosen": -558.4171142578125, "logps/rejected": -604.7532958984375, "loss": 0.6001, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3732374906539917, "rewards/margins": 0.3241427540779114, "rewards/rejected": -0.6973801851272583, "step": 1200 }, { "epoch": 5.88, "learning_rate": 2.1499596375784279e-07, "logits/chosen": 17.248226165771484, "logits/rejected": 17.385725021362305, "logps/chosen": -491.4832458496094, "logps/rejected": -536.4549560546875, "loss": 0.6098, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.3684207499027252, "rewards/margins": 0.2918558120727539, "rewards/rejected": -0.6602765321731567, "step": 1210 }, { "epoch": 5.92, "learning_rate": 2.1078627500898936e-07, "logits/chosen": 17.595806121826172, "logits/rejected": 17.477294921875, "logps/chosen": -474.83770751953125, "logps/rejected": -536.64013671875, "loss": 0.5951, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.3259069621562958, "rewards/margins": 0.24975259602069855, "rewards/rejected": -0.5756595134735107, "step": 1220 }, { "epoch": 5.97, "learning_rate": 2.065879555832674e-07, "logits/chosen": 17.800256729125977, "logits/rejected": 17.910017013549805, "logps/chosen": -471.83135986328125, "logps/rejected": -539.9712524414062, "loss": 0.6031, "rewards/accuracies": 0.75, "rewards/chosen": -0.32458698749542236, "rewards/margins": 0.3115597665309906, "rewards/rejected": -0.6361468434333801, "step": 1230 }, { "epoch": 6.02, "learning_rate": 2.0240222270883288e-07, "logits/chosen": 18.058645248413086, "logits/rejected": 17.72833824157715, "logps/chosen": -495.13092041015625, "logps/rejected": -523.317138671875, "loss": 0.5936, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.36168938875198364, "rewards/margins": 0.30222177505493164, "rewards/rejected": -0.6639112234115601, "step": 1240 }, { "epoch": 6.07, "learning_rate": 1.9823028996459483e-07, "logits/chosen": 16.63167953491211, "logits/rejected": 17.25162696838379, "logps/chosen": -548.2149658203125, "logps/rejected": -542.8499755859375, "loss": 0.5939, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.3957754969596863, "rewards/margins": 0.31186193227767944, "rewards/rejected": -0.7076374292373657, "step": 1250 }, { "epoch": 6.12, "learning_rate": 1.9407336692835946e-07, "logits/chosen": 17.020973205566406, "logits/rejected": 16.73345947265625, "logps/chosen": -446.87579345703125, "logps/rejected": -543.0826416015625, "loss": 0.5857, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3498944342136383, "rewards/margins": 0.3392466902732849, "rewards/rejected": -0.6891411542892456, "step": 1260 }, { "epoch": 6.17, "learning_rate": 1.899326588261348e-07, "logits/chosen": 17.527660369873047, "logits/rejected": 16.82056999206543, "logps/chosen": -457.8648376464844, "logps/rejected": -546.5458984375, "loss": 0.5926, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.3373507261276245, "rewards/margins": 0.3978506922721863, "rewards/rejected": -0.7352014183998108, "step": 1270 }, { "epoch": 6.22, "learning_rate": 1.8580936618269693e-07, "logits/chosen": 17.878108978271484, "logits/rejected": 18.693519592285156, "logps/chosen": -512.388671875, "logps/rejected": -550.9382934570312, "loss": 0.5984, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.3437446653842926, "rewards/margins": 0.27838242053985596, "rewards/rejected": -0.6221270561218262, "step": 1280 }, { "epoch": 6.26, "learning_rate": 1.8170468447351857e-07, "logits/chosen": 16.762582778930664, "logits/rejected": 16.87929916381836, "logps/chosen": -499.5602111816406, "logps/rejected": -558.8322143554688, "loss": 0.5861, "rewards/accuracies": 0.75, "rewards/chosen": -0.4218806326389313, "rewards/margins": 0.30065587162971497, "rewards/rejected": -0.722536563873291, "step": 1290 }, { "epoch": 6.31, "learning_rate": 1.7761980377816284e-07, "logits/chosen": 18.27288818359375, "logits/rejected": 18.402935028076172, "logps/chosen": -531.6474609375, "logps/rejected": -601.9354248046875, "loss": 0.5913, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3757680356502533, "rewards/margins": 0.3616601228713989, "rewards/rejected": -0.7374281883239746, "step": 1300 }, { "epoch": 6.36, "learning_rate": 1.7355590843524053e-07, "logits/chosen": 16.844575881958008, "logits/rejected": 16.470760345458984, "logps/chosen": -515.7340087890625, "logps/rejected": -567.6885375976562, "loss": 0.5888, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.38387516140937805, "rewards/margins": 0.3353000283241272, "rewards/rejected": -0.7191751599311829, "step": 1310 }, { "epoch": 6.41, "learning_rate": 1.6951417669903228e-07, "logits/chosen": 17.121362686157227, "logits/rejected": 17.330198287963867, "logps/chosen": -500.6908264160156, "logps/rejected": -558.0948486328125, "loss": 0.5833, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.34819692373275757, "rewards/margins": 0.33231672644615173, "rewards/rejected": -0.6805136799812317, "step": 1320 }, { "epoch": 6.46, "learning_rate": 1.6549578039787434e-07, "logits/chosen": 17.425867080688477, "logits/rejected": 17.398069381713867, "logps/chosen": -491.4676208496094, "logps/rejected": -521.668701171875, "loss": 0.5899, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.37772536277770996, "rewards/margins": 0.28015801310539246, "rewards/rejected": -0.6578834652900696, "step": 1330 }, { "epoch": 6.51, "learning_rate": 1.615018845944081e-07, "logits/chosen": 18.616252899169922, "logits/rejected": 18.190723419189453, "logps/chosen": -510.36358642578125, "logps/rejected": -567.6229248046875, "loss": 0.5824, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.3491878807544708, "rewards/margins": 0.26081031560897827, "rewards/rejected": -0.6099982857704163, "step": 1340 }, { "epoch": 6.56, "learning_rate": 1.575336472477909e-07, "logits/chosen": 17.597698211669922, "logits/rejected": 17.078960418701172, "logps/chosen": -488.8829650878906, "logps/rejected": -548.6746826171875, "loss": 0.5956, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.35902029275894165, "rewards/margins": 0.33025866746902466, "rewards/rejected": -0.6892789602279663, "step": 1350 }, { "epoch": 6.6, "learning_rate": 1.5359221887796613e-07, "logits/chosen": 16.720373153686523, "logits/rejected": 16.681528091430664, "logps/chosen": -466.1805114746094, "logps/rejected": -475.7158203125, "loss": 0.5899, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.3708762228488922, "rewards/margins": 0.19428952038288116, "rewards/rejected": -0.5651656985282898, "step": 1360 }, { "epoch": 6.65, "learning_rate": 1.4967874223209033e-07, "logits/chosen": 17.109895706176758, "logits/rejected": 17.34455108642578, "logps/chosen": -473.8462829589844, "logps/rejected": -489.8412170410156, "loss": 0.5861, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.3177076280117035, "rewards/margins": 0.24576309323310852, "rewards/rejected": -0.563470721244812, "step": 1370 }, { "epoch": 6.7, "learning_rate": 1.4579435195321432e-07, "logits/chosen": 17.316837310791016, "logits/rejected": 16.62727928161621, "logps/chosen": -489.8931579589844, "logps/rejected": -570.500732421875, "loss": 0.5877, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.42431893944740295, "rewards/margins": 0.4839262366294861, "rewards/rejected": -0.9082452058792114, "step": 1380 }, { "epoch": 6.75, "learning_rate": 1.4194017425131323e-07, "logits/chosen": 17.513019561767578, "logits/rejected": 16.158910751342773, "logps/chosen": -538.9515380859375, "logps/rejected": -655.079345703125, "loss": 0.5822, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.4365549683570862, "rewards/margins": 0.43610841035842896, "rewards/rejected": -0.8726633191108704, "step": 1390 }, { "epoch": 6.8, "learning_rate": 1.381173265767623e-07, "logits/chosen": 16.728803634643555, "logits/rejected": 16.60254669189453, "logps/chosen": -563.25732421875, "logps/rejected": -635.5299072265625, "loss": 0.5872, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.5063729286193848, "rewards/margins": 0.488800585269928, "rewards/rejected": -0.995173454284668, "step": 1400 }, { "epoch": 6.85, "learning_rate": 1.343269172963513e-07, "logits/chosen": 16.771358489990234, "logits/rejected": 16.57272720336914, "logps/chosen": -491.09466552734375, "logps/rejected": -566.2666015625, "loss": 0.5768, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.4542843699455261, "rewards/margins": 0.352291464805603, "rewards/rejected": -0.8065758943557739, "step": 1410 }, { "epoch": 6.9, "learning_rate": 1.3057004537193422e-07, "logits/chosen": 17.443374633789062, "logits/rejected": 17.598270416259766, "logps/chosen": -503.72674560546875, "logps/rejected": -556.9064331054688, "loss": 0.5844, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.41296952962875366, "rewards/margins": 0.30032414197921753, "rewards/rejected": -0.7132936716079712, "step": 1420 }, { "epoch": 6.94, "learning_rate": 1.268478000418041e-07, "logits/chosen": 16.66560935974121, "logits/rejected": 16.443384170532227, "logps/chosen": -522.2667236328125, "logps/rejected": -560.718017578125, "loss": 0.5828, "rewards/accuracies": 0.8125, "rewards/chosen": -0.39559242129325867, "rewards/margins": 0.3799566328525543, "rewards/rejected": -0.775549054145813, "step": 1430 }, { "epoch": 6.99, "learning_rate": 1.2316126050488782e-07, "logits/chosen": 16.437084197998047, "logits/rejected": 16.211811065673828, "logps/chosen": -542.4644775390625, "logps/rejected": -548.2205810546875, "loss": 0.5879, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.4820192754268646, "rewards/margins": 0.30185410380363464, "rewards/rejected": -0.7838733792304993, "step": 1440 }, { "epoch": 7.04, "learning_rate": 1.1951149560785166e-07, "logits/chosen": 15.954843521118164, "logits/rejected": 15.96337890625, "logps/chosen": -590.3001708984375, "logps/rejected": -672.43505859375, "loss": 0.5841, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.5148499608039856, "rewards/margins": 0.5929259061813354, "rewards/rejected": -1.1077758073806763, "step": 1450 }, { "epoch": 7.09, "learning_rate": 1.1589956353520833e-07, "logits/chosen": 17.03178596496582, "logits/rejected": 17.020648956298828, "logps/chosen": -431.1114807128906, "logps/rejected": -483.35211181640625, "loss": 0.5811, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.30041372776031494, "rewards/margins": 0.21424512565135956, "rewards/rejected": -0.5146588683128357, "step": 1460 }, { "epoch": 7.14, "learning_rate": 1.1232651150251504e-07, "logits/chosen": 15.323884963989258, "logits/rejected": 15.189486503601074, "logps/chosen": -494.89739990234375, "logps/rejected": -535.5572509765625, "loss": 0.5802, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.49184292554855347, "rewards/margins": 0.30568668246269226, "rewards/rejected": -0.7975295782089233, "step": 1470 }, { "epoch": 7.19, "learning_rate": 1.0879337545275164e-07, "logits/chosen": 15.969990730285645, "logits/rejected": 15.853551864624023, "logps/chosen": -559.8504028320312, "logps/rejected": -637.55810546875, "loss": 0.5799, "rewards/accuracies": 0.78125, "rewards/chosen": -0.49292922019958496, "rewards/margins": 0.4328843653202057, "rewards/rejected": -0.9258135557174683, "step": 1480 }, { "epoch": 7.24, "learning_rate": 1.0530117975596789e-07, "logits/chosen": 16.58124351501465, "logits/rejected": 15.693090438842773, "logps/chosen": -526.1766967773438, "logps/rejected": -569.5706787109375, "loss": 0.5595, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.37696436047554016, "rewards/margins": 0.4419701099395752, "rewards/rejected": -0.8189345598220825, "step": 1490 }, { "epoch": 7.28, "learning_rate": 1.0185093691228533e-07, "logits/chosen": 16.440006256103516, "logits/rejected": 15.588663101196289, "logps/chosen": -476.3081970214844, "logps/rejected": -536.4998168945312, "loss": 0.566, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.40554875135421753, "rewards/margins": 0.4396079182624817, "rewards/rejected": -0.8451566696166992, "step": 1500 }, { "epoch": 7.33, "learning_rate": 9.844364725834056e-08, "logits/chosen": 16.686330795288086, "logits/rejected": 16.508487701416016, "logps/chosen": -488.6481018066406, "logps/rejected": -532.7208251953125, "loss": 0.5787, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.3779573440551758, "rewards/margins": 0.32084184885025024, "rewards/rejected": -0.6987992525100708, "step": 1510 }, { "epoch": 7.38, "learning_rate": 9.508029867725578e-08, "logits/chosen": 15.356477737426758, "logits/rejected": 15.598932266235352, "logps/chosen": -435.1556701660156, "logps/rejected": -488.14666748046875, "loss": 0.5834, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.3525981307029724, "rewards/margins": 0.313102662563324, "rewards/rejected": -0.6657007932662964, "step": 1520 }, { "epoch": 7.43, "learning_rate": 9.176186631221958e-08, "logits/chosen": 15.836163520812988, "logits/rejected": 15.421684265136719, "logps/chosen": -503.1708984375, "logps/rejected": -574.23779296875, "loss": 0.5743, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5309593081474304, "rewards/margins": 0.6058937907218933, "rewards/rejected": -1.1368530988693237, "step": 1530 }, { "epoch": 7.48, "learning_rate": 8.848931228376136e-08, "logits/chosen": 15.552096366882324, "logits/rejected": 15.042068481445312, "logps/chosen": -465.777099609375, "logps/rejected": -539.7342529296875, "loss": 0.5621, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.42118939757347107, "rewards/margins": 0.3839268982410431, "rewards/rejected": -0.8051162958145142, "step": 1540 }, { "epoch": 7.53, "learning_rate": 8.526358541080172e-08, "logits/chosen": 16.235004425048828, "logits/rejected": 16.21014976501465, "logps/chosen": -497.1258850097656, "logps/rejected": -547.8870849609375, "loss": 0.5804, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4342379570007324, "rewards/margins": 0.3557444214820862, "rewards/rejected": -0.7899823784828186, "step": 1550 }, { "epoch": 7.58, "learning_rate": 8.208562093555887e-08, "logits/chosen": 15.568893432617188, "logits/rejected": 15.181528091430664, "logps/chosen": -463.81097412109375, "logps/rejected": -515.5152587890625, "loss": 0.5826, "rewards/accuracies": 0.75, "rewards/chosen": -0.4401473104953766, "rewards/margins": 0.3010689616203308, "rewards/rejected": -0.7412161827087402, "step": 1560 }, { "epoch": 7.62, "learning_rate": 7.895634025239242e-08, "logits/chosen": 15.460016250610352, "logits/rejected": 14.932809829711914, "logps/chosen": -470.60552978515625, "logps/rejected": -520.3035278320312, "loss": 0.5777, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.4876958727836609, "rewards/margins": 0.33743318915367126, "rewards/rejected": -0.8251290321350098, "step": 1570 }, { "epoch": 7.67, "learning_rate": 7.587665064066085e-08, "logits/chosen": 16.43811798095703, "logits/rejected": 16.288951873779297, "logps/chosen": -470.00360107421875, "logps/rejected": -557.7108764648438, "loss": 0.5784, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4131472706794739, "rewards/margins": 0.3947201371192932, "rewards/rejected": -0.8078674077987671, "step": 1580 }, { "epoch": 7.72, "learning_rate": 7.284744500167217e-08, "logits/chosen": 14.962780952453613, "logits/rejected": 14.988690376281738, "logps/chosen": -490.5482482910156, "logps/rejected": -576.4386596679688, "loss": 0.5611, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5098827481269836, "rewards/margins": 0.4253455102443695, "rewards/rejected": -0.9352282285690308, "step": 1590 }, { "epoch": 7.77, "learning_rate": 6.986960159980326e-08, "logits/chosen": 15.525604248046875, "logits/rejected": 15.047343254089355, "logps/chosen": -457.3753967285156, "logps/rejected": -528.1970825195312, "loss": 0.5788, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.48724955320358276, "rewards/margins": 0.36003851890563965, "rewards/rejected": -0.8472881317138672, "step": 1600 }, { "epoch": 7.82, "learning_rate": 6.694398380786245e-08, "logits/chosen": 15.537455558776855, "logits/rejected": 16.031370162963867, "logps/chosen": -470.35986328125, "logps/rejected": -515.3436279296875, "loss": 0.5703, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.47040343284606934, "rewards/margins": 0.26535606384277344, "rewards/rejected": -0.735759437084198, "step": 1610 }, { "epoch": 7.87, "learning_rate": 6.40714398567701e-08, "logits/chosen": 15.932138442993164, "logits/rejected": 16.016658782958984, "logps/chosen": -534.5318603515625, "logps/rejected": -534.1517944335938, "loss": 0.5821, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.4990972578525543, "rewards/margins": 0.2680392861366272, "rewards/rejected": -0.7671364545822144, "step": 1620 }, { "epoch": 7.92, "learning_rate": 6.125280258962872e-08, "logits/chosen": 15.469932556152344, "logits/rejected": 15.58923053741455, "logps/chosen": -482.654541015625, "logps/rejected": -505.08673095703125, "loss": 0.5816, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.46863317489624023, "rewards/margins": 0.312537282705307, "rewards/rejected": -0.7811704277992249, "step": 1630 }, { "epoch": 7.96, "learning_rate": 5.848888922025552e-08, "logits/chosen": 16.112123489379883, "logits/rejected": 15.844825744628906, "logps/chosen": -530.7129516601562, "logps/rejected": -626.224365234375, "loss": 0.5729, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.5626592636108398, "rewards/margins": 0.5355942249298096, "rewards/rejected": -1.0982534885406494, "step": 1640 }, { "epoch": 8.01, "learning_rate": 5.57805010962451e-08, "logits/chosen": 16.61453628540039, "logits/rejected": 16.19707679748535, "logps/chosen": -539.4405517578125, "logps/rejected": -595.6859130859375, "loss": 0.5768, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.5616611242294312, "rewards/margins": 0.3783496618270874, "rewards/rejected": -0.9400107264518738, "step": 1650 }, { "epoch": 8.06, "learning_rate": 5.3128423466633624e-08, "logits/chosen": 15.611248970031738, "logits/rejected": 15.253950119018555, "logps/chosen": -500.92755126953125, "logps/rejected": -547.9854125976562, "loss": 0.5778, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.456888347864151, "rewards/margins": 0.2701273262500763, "rewards/rejected": -0.7270156145095825, "step": 1660 }, { "epoch": 8.11, "learning_rate": 5.053342525422918e-08, "logits/chosen": 15.895495414733887, "logits/rejected": 16.411907196044922, "logps/chosen": -491.34246826171875, "logps/rejected": -502.4891052246094, "loss": 0.5779, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.42431122064590454, "rewards/margins": 0.2600492835044861, "rewards/rejected": -0.6843605041503906, "step": 1670 }, { "epoch": 8.16, "learning_rate": 4.7996258832676716e-08, "logits/chosen": 15.992956161499023, "logits/rejected": 15.863334655761719, "logps/chosen": -570.5872192382812, "logps/rejected": -589.7510375976562, "loss": 0.5665, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.5297167897224426, "rewards/margins": 0.35133522748947144, "rewards/rejected": -0.8810520172119141, "step": 1680 }, { "epoch": 8.21, "learning_rate": 4.551765980832059e-08, "logits/chosen": 15.435505867004395, "logits/rejected": 15.565594673156738, "logps/chosen": -519.7587280273438, "logps/rejected": -555.607421875, "loss": 0.566, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.44829368591308594, "rewards/margins": 0.4504165053367615, "rewards/rejected": -0.8987102508544922, "step": 1690 }, { "epoch": 8.25, "learning_rate": 4.309834680692831e-08, "logits/chosen": 15.550143241882324, "logits/rejected": 15.454347610473633, "logps/chosen": -535.6947021484375, "logps/rejected": -586.8497314453125, "loss": 0.5675, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.5995014905929565, "rewards/margins": 0.37678369879722595, "rewards/rejected": -0.9762851595878601, "step": 1700 }, { "epoch": 8.3, "learning_rate": 4.07390212653379e-08, "logits/chosen": 15.492868423461914, "logits/rejected": 15.356470108032227, "logps/chosen": -533.2853393554688, "logps/rejected": -584.7105102539062, "loss": 0.5739, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.5474352836608887, "rewards/margins": 0.6199637651443481, "rewards/rejected": -1.1673991680145264, "step": 1710 }, { "epoch": 8.35, "learning_rate": 3.844036722808899e-08, "logits/chosen": 16.752445220947266, "logits/rejected": 16.3040771484375, "logps/chosen": -484.51416015625, "logps/rejected": -536.3505249023438, "loss": 0.5718, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.43505972623825073, "rewards/margins": 0.2838330864906311, "rewards/rejected": -0.7188928127288818, "step": 1720 }, { "epoch": 8.4, "learning_rate": 3.620305114909597e-08, "logits/chosen": 16.092082977294922, "logits/rejected": 16.506916046142578, "logps/chosen": -589.302490234375, "logps/rejected": -597.6064453125, "loss": 0.567, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5278688669204712, "rewards/margins": 0.34603801369667053, "rewards/rejected": -0.8739069104194641, "step": 1730 }, { "epoch": 8.45, "learning_rate": 3.4027721698421466e-08, "logits/chosen": 14.892633438110352, "logits/rejected": 15.243593215942383, "logps/chosen": -518.6754150390625, "logps/rejected": -537.8123779296875, "loss": 0.5702, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5415439605712891, "rewards/margins": 0.358787477016449, "rewards/rejected": -0.9003314971923828, "step": 1740 }, { "epoch": 8.5, "learning_rate": 3.191500957420626e-08, "logits/chosen": 16.676576614379883, "logits/rejected": 16.07375144958496, "logps/chosen": -466.3976135253906, "logps/rejected": -529.3568115234375, "loss": 0.5677, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.4896460175514221, "rewards/margins": 0.3948792517185211, "rewards/rejected": -0.8845251798629761, "step": 1750 }, { "epoch": 8.55, "learning_rate": 2.986552731980932e-08, "logits/chosen": 15.437856674194336, "logits/rejected": 15.470430374145508, "logps/chosen": -502.57696533203125, "logps/rejected": -518.8566284179688, "loss": 0.5697, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4669625759124756, "rewards/margins": 0.27933841943740845, "rewards/rejected": -0.7463010549545288, "step": 1760 }, { "epoch": 8.59, "learning_rate": 2.787986914621182e-08, "logits/chosen": 16.045337677001953, "logits/rejected": 15.906814575195312, "logps/chosen": -537.7979125976562, "logps/rejected": -605.097900390625, "loss": 0.564, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5049812197685242, "rewards/margins": 0.3923494517803192, "rewards/rejected": -0.897330641746521, "step": 1770 }, { "epoch": 8.64, "learning_rate": 2.5958610759736126e-08, "logits/chosen": 15.55151081085205, "logits/rejected": 14.9086332321167, "logps/chosen": -501.61322021484375, "logps/rejected": -578.8296508789062, "loss": 0.5655, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.47102347016334534, "rewards/margins": 0.46625399589538574, "rewards/rejected": -0.9372774958610535, "step": 1780 }, { "epoch": 8.69, "learning_rate": 2.410230919513023e-08, "logits/chosen": 14.670611381530762, "logits/rejected": 13.935040473937988, "logps/chosen": -508.17877197265625, "logps/rejected": -615.4990234375, "loss": 0.5551, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.4717566967010498, "rewards/margins": 0.4692964553833008, "rewards/rejected": -0.9410530924797058, "step": 1790 }, { "epoch": 8.74, "learning_rate": 2.231150265406512e-08, "logits/chosen": 14.696261405944824, "logits/rejected": 14.434527397155762, "logps/chosen": -389.96209716796875, "logps/rejected": -480.9751892089844, "loss": 0.5744, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.4124878942966461, "rewards/margins": 0.5330362319946289, "rewards/rejected": -0.9455240964889526, "step": 1800 }, { "epoch": 8.79, "learning_rate": 2.058671034909301e-08, "logits/chosen": 15.37951374053955, "logits/rejected": 15.185888290405273, "logps/chosen": -507.5043029785156, "logps/rejected": -560.7593994140625, "loss": 0.5675, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5982163548469543, "rewards/margins": 0.356797456741333, "rewards/rejected": -0.9550137519836426, "step": 1810 }, { "epoch": 8.84, "learning_rate": 1.892843235311059e-08, "logits/chosen": 14.205958366394043, "logits/rejected": 15.277796745300293, "logps/chosen": -525.5435791015625, "logps/rejected": -556.5437622070312, "loss": 0.5742, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.57078617811203, "rewards/margins": 0.3275993764400482, "rewards/rejected": -0.8983856439590454, "step": 1820 }, { "epoch": 8.89, "learning_rate": 1.733714945437212e-08, "logits/chosen": 15.41334056854248, "logits/rejected": 15.21619701385498, "logps/chosen": -524.0565795898438, "logps/rejected": -608.4319458007812, "loss": 0.5716, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.5062750577926636, "rewards/margins": 0.621990442276001, "rewards/rejected": -1.128265619277954, "step": 1830 }, { "epoch": 8.93, "learning_rate": 1.581332301709304e-08, "logits/chosen": 14.983938217163086, "logits/rejected": 14.970956802368164, "logps/chosen": -502.68621826171875, "logps/rejected": -581.88720703125, "loss": 0.5593, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.522268533706665, "rewards/margins": 0.4709092974662781, "rewards/rejected": -0.9931778907775879, "step": 1840 }, { "epoch": 8.98, "learning_rate": 1.4357394847686027e-08, "logits/chosen": 14.20927619934082, "logits/rejected": 14.463226318359375, "logps/chosen": -503.64678955078125, "logps/rejected": -510.6846618652344, "loss": 0.5691, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5376115441322327, "rewards/margins": 0.3068149983882904, "rewards/rejected": -0.8444265127182007, "step": 1850 }, { "epoch": 9.03, "learning_rate": 1.2969787066666654e-08, "logits/chosen": 15.562335014343262, "logits/rejected": 15.038406372070312, "logps/chosen": -493.4950256347656, "logps/rejected": -572.0565185546875, "loss": 0.5653, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.5538553595542908, "rewards/margins": 0.42482686042785645, "rewards/rejected": -0.9786823391914368, "step": 1860 }, { "epoch": 9.08, "learning_rate": 1.1650901986267364e-08, "logits/chosen": 15.639094352722168, "logits/rejected": 14.934486389160156, "logps/chosen": -545.4066162109375, "logps/rejected": -635.1442260742188, "loss": 0.5599, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.5437031984329224, "rewards/margins": 0.5424902439117432, "rewards/rejected": -1.086193323135376, "step": 1870 }, { "epoch": 9.13, "learning_rate": 1.0401121993794032e-08, "logits/chosen": 14.307469367980957, "logits/rejected": 13.92272663116455, "logps/chosen": -485.2925720214844, "logps/rejected": -516.3497924804688, "loss": 0.5667, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.42878469824790955, "rewards/margins": 0.43366050720214844, "rewards/rejected": -0.8624452352523804, "step": 1880 }, { "epoch": 9.18, "learning_rate": 9.220809440759592e-09, "logits/chosen": 15.5972900390625, "logits/rejected": 15.571401596069336, "logps/chosen": -468.67254638671875, "logps/rejected": -490.158447265625, "loss": 0.5921, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.4918997883796692, "rewards/margins": 0.321148544549942, "rewards/rejected": -0.813048243522644, "step": 1890 }, { "epoch": 9.23, "learning_rate": 8.1103065378266e-09, "logits/chosen": 15.903124809265137, "logits/rejected": 15.235029220581055, "logps/chosen": -504.0237731933594, "logps/rejected": -586.244140625, "loss": 0.5656, "rewards/accuracies": 0.8125, "rewards/chosen": -0.49678725004196167, "rewards/margins": 0.39936769008636475, "rewards/rejected": -0.8961549997329712, "step": 1900 }, { "epoch": 9.27, "learning_rate": 7.0699352555893825e-09, "logits/chosen": 14.437850952148438, "logits/rejected": 14.658166885375977, "logps/chosen": -552.8507690429688, "logps/rejected": -595.6864013671875, "loss": 0.565, "rewards/accuracies": 0.75, "rewards/chosen": -0.5628484487533569, "rewards/margins": 0.3715595602989197, "rewards/rejected": -0.9344080090522766, "step": 1910 }, { "epoch": 9.32, "learning_rate": 6.099997231224452e-09, "logits/chosen": 14.984025955200195, "logits/rejected": 14.939895629882812, "logps/chosen": -483.92315673828125, "logps/rejected": -557.912841796875, "loss": 0.5689, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5138859152793884, "rewards/margins": 0.3408345878124237, "rewards/rejected": -0.8547204732894897, "step": 1920 }, { "epoch": 9.37, "learning_rate": 5.200773681035969e-09, "logits/chosen": 14.894918441772461, "logits/rejected": 15.099899291992188, "logps/chosen": -491.946044921875, "logps/rejected": -533.989501953125, "loss": 0.5626, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.43056002259254456, "rewards/margins": 0.3058411478996277, "rewards/rejected": -0.7364012002944946, "step": 1930 }, { "epoch": 9.42, "learning_rate": 4.372525318922266e-09, "logits/chosen": 15.696123123168945, "logits/rejected": 14.719775199890137, "logps/chosen": -542.4873046875, "logps/rejected": -633.1751708984375, "loss": 0.5629, "rewards/accuracies": 0.84375, "rewards/chosen": -0.517546534538269, "rewards/margins": 0.4977279305458069, "rewards/rejected": -1.0152745246887207, "step": 1940 }, { "epoch": 9.47, "learning_rate": 3.6154922807863643e-09, "logits/chosen": 14.244386672973633, "logits/rejected": 13.889488220214844, "logps/chosen": -456.33135986328125, "logps/rejected": -569.2753295898438, "loss": 0.5784, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5168919563293457, "rewards/margins": 0.4235960841178894, "rewards/rejected": -0.9404880404472351, "step": 1950 }, { "epoch": 9.52, "learning_rate": 2.929894054912896e-09, "logits/chosen": 15.562467575073242, "logits/rejected": 15.46942138671875, "logps/chosen": -548.723876953125, "logps/rejected": -627.305908203125, "loss": 0.5561, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.4751965403556824, "rewards/margins": 0.44715824723243713, "rewards/rejected": -0.9223548173904419, "step": 1960 }, { "epoch": 9.57, "learning_rate": 2.3159294183312804e-09, "logits/chosen": 15.579564094543457, "logits/rejected": 15.467860221862793, "logps/chosen": -508.56231689453125, "logps/rejected": -582.9691162109375, "loss": 0.5663, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.48075562715530396, "rewards/margins": 0.35179024934768677, "rewards/rejected": -0.8325458765029907, "step": 1970 }, { "epoch": 9.61, "learning_rate": 1.7737763791840499e-09, "logits/chosen": 15.496403694152832, "logits/rejected": 14.966221809387207, "logps/chosen": -438.64410400390625, "logps/rejected": -536.9528198242188, "loss": 0.5684, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.4091852605342865, "rewards/margins": 0.39350926876068115, "rewards/rejected": -0.8026946187019348, "step": 1980 }, { "epoch": 9.66, "learning_rate": 1.3035921251163263e-09, "logits/chosen": 15.670194625854492, "logits/rejected": 15.499285697937012, "logps/chosen": -510.4190979003906, "logps/rejected": -574.4082641601562, "loss": 0.5712, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.49581804871559143, "rewards/margins": 0.3804387152194977, "rewards/rejected": -0.8762567639350891, "step": 1990 }, { "epoch": 9.71, "learning_rate": 9.055129777021663e-10, "logits/chosen": 14.768771171569824, "logits/rejected": 14.854043960571289, "logps/chosen": -461.849609375, "logps/rejected": -519.3204345703125, "loss": 0.5585, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.42806243896484375, "rewards/margins": 0.41432294249534607, "rewards/rejected": -0.842385470867157, "step": 2000 }, { "epoch": 9.76, "learning_rate": 5.796543529205389e-10, "logits/chosen": 15.0086030960083, "logits/rejected": 15.746871948242188, "logps/chosen": -584.0777587890625, "logps/rejected": -618.7239379882812, "loss": 0.5594, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6217775940895081, "rewards/margins": 0.3591920733451843, "rewards/rejected": -0.9809697270393372, "step": 2010 }, { "epoch": 9.81, "learning_rate": 3.261107276925079e-10, "logits/chosen": 15.121289253234863, "logits/rejected": 15.262140274047852, "logps/chosen": -490.37420654296875, "logps/rejected": -545.4072265625, "loss": 0.5679, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.45917099714279175, "rewards/margins": 0.5055267214775085, "rewards/rejected": -0.9646978378295898, "step": 2020 }, { "epoch": 9.86, "learning_rate": 1.4495561248931144e-10, "logits/chosen": 15.449140548706055, "logits/rejected": 16.043636322021484, "logps/chosen": -548.4398803710938, "logps/rejected": -583.19287109375, "loss": 0.5659, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.5652498006820679, "rewards/margins": 0.3585050702095032, "rewards/rejected": -0.923754870891571, "step": 2030 }, { "epoch": 9.91, "learning_rate": 3.6241530019326397e-11, "logits/chosen": 15.511439323425293, "logits/rejected": 15.162747383117676, "logps/chosen": -504.12799072265625, "logps/rejected": -518.1746215820312, "loss": 0.5789, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5089441537857056, "rewards/margins": 0.29728806018829346, "rewards/rejected": -0.8062320947647095, "step": 2040 }, { "epoch": 9.95, "learning_rate": 0.0, "logits/chosen": 14.901571273803711, "logits/rejected": 14.849283218383789, "logps/chosen": -539.0538330078125, "logps/rejected": -590.5074462890625, "loss": 0.5537, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6097260117530823, "rewards/margins": 0.40109339356422424, "rewards/rejected": -1.0108195543289185, "step": 2050 }, { "epoch": 9.95, "step": 2050, "total_flos": 0.0, "train_loss": 0.6235497470890603, "train_runtime": 23319.902, "train_samples_per_second": 8.476, "train_steps_per_second": 0.088 } ], "logging_steps": 10, "max_steps": 2050, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }