diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4200 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.998919113673212, + "eval_steps": 100, + "global_step": 2774, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.30078125, + "learning_rate": 1.798561151079137e-08, + "logits/chosen": -2.5878467559814453, + "logits/rejected": -2.596919059753418, + "logps/chosen": -50.55097579956055, + "logps/rejected": -53.270023345947266, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "grad_norm": 0.33203125, + "learning_rate": 1.7985611510791368e-07, + "logits/chosen": -2.6599929332733154, + "logits/rejected": -2.6492068767547607, + "logps/chosen": -58.52377700805664, + "logps/rejected": -61.61543273925781, + "loss": 0.6931, + "rewards/accuracies": 0.4027777910232544, + "rewards/chosen": -4.6036697312956676e-05, + "rewards/margins": 4.705908213509247e-05, + "rewards/rejected": -9.309577581007034e-05, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 0.357421875, + "learning_rate": 3.5971223021582736e-07, + "logits/chosen": -2.65588641166687, + "logits/rejected": -2.661142110824585, + "logps/chosen": -60.95711135864258, + "logps/rejected": -63.73247146606445, + "loss": 0.6932, + "rewards/accuracies": 0.4312500059604645, + "rewards/chosen": -0.00015254078607540578, + "rewards/margins": -0.00013396346184890717, + "rewards/rejected": -1.857726601883769e-05, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 0.435546875, + "learning_rate": 5.39568345323741e-07, + "logits/chosen": -2.626067876815796, + "logits/rejected": -2.6205759048461914, + "logps/chosen": -65.40022277832031, + "logps/rejected": -68.29045104980469, + "loss": 0.6933, + "rewards/accuracies": 0.4312500059604645, + "rewards/chosen": -3.5934197512688115e-05, + "rewards/margins": -0.00024712778395041823, + "rewards/rejected": 0.0002111935755237937, + "step": 30 + }, + { + "epoch": 0.03, + "grad_norm": 0.3515625, + "learning_rate": 7.194244604316547e-07, + "logits/chosen": -2.6541905403137207, + "logits/rejected": -2.6613316535949707, + "logps/chosen": -58.868675231933594, + "logps/rejected": -62.767356872558594, + "loss": 0.6931, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": 0.00014814280439168215, + "rewards/margins": 0.00017737274174578488, + "rewards/rejected": -2.922994281107094e-05, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 0.37109375, + "learning_rate": 8.992805755395684e-07, + "logits/chosen": -2.614741802215576, + "logits/rejected": -2.617932081222534, + "logps/chosen": -59.7147216796875, + "logps/rejected": -61.980995178222656, + "loss": 0.6931, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": 0.00018150641699321568, + "rewards/margins": 0.00013201191904954612, + "rewards/rejected": 4.949455615133047e-05, + "step": 50 + }, + { + "epoch": 0.04, + "grad_norm": 0.50390625, + "learning_rate": 1.079136690647482e-06, + "logits/chosen": -2.6651856899261475, + "logits/rejected": -2.6654398441314697, + "logps/chosen": -68.95173645019531, + "logps/rejected": -71.27698516845703, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00047902195365168154, + "rewards/margins": 0.0001825519575504586, + "rewards/rejected": 0.0002964699815493077, + "step": 60 + }, + { + "epoch": 0.05, + "grad_norm": 0.291015625, + "learning_rate": 1.2589928057553958e-06, + "logits/chosen": -2.6852972507476807, + "logits/rejected": -2.6725258827209473, + "logps/chosen": -68.01790618896484, + "logps/rejected": -72.10859680175781, + "loss": 0.693, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.00035084557021036744, + "rewards/margins": 0.00020036422938574106, + "rewards/rejected": 0.00015048135537654161, + "step": 70 + }, + { + "epoch": 0.06, + "grad_norm": 0.45703125, + "learning_rate": 1.4388489208633094e-06, + "logits/chosen": -2.6697287559509277, + "logits/rejected": -2.668147563934326, + "logps/chosen": -70.40176391601562, + "logps/rejected": -73.07498931884766, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.00043141478090547025, + "rewards/margins": 0.00031604542164132, + "rewards/rejected": 0.00011536936654010788, + "step": 80 + }, + { + "epoch": 0.06, + "grad_norm": 0.39453125, + "learning_rate": 1.618705035971223e-06, + "logits/chosen": -2.670775890350342, + "logits/rejected": -2.674410343170166, + "logps/chosen": -66.90149688720703, + "logps/rejected": -69.80754089355469, + "loss": 0.6929, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.0009530371753498912, + "rewards/margins": 0.0005468233721330762, + "rewards/rejected": 0.0004062137159053236, + "step": 90 + }, + { + "epoch": 0.07, + "grad_norm": 0.380859375, + "learning_rate": 1.7985611510791368e-06, + "logits/chosen": -2.657923460006714, + "logits/rejected": -2.658536911010742, + "logps/chosen": -62.22175979614258, + "logps/rejected": -66.25755310058594, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0009646881371736526, + "rewards/margins": 0.00016965254326350987, + "rewards/rejected": 0.0007950355065986514, + "step": 100 + }, + { + "epoch": 0.08, + "grad_norm": 0.3125, + "learning_rate": 1.9784172661870504e-06, + "logits/chosen": -2.6612608432769775, + "logits/rejected": -2.6600637435913086, + "logps/chosen": -66.11808013916016, + "logps/rejected": -69.09329986572266, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0011165592586621642, + "rewards/margins": 0.00037375936517491937, + "rewards/rejected": 0.0007428000681102276, + "step": 110 + }, + { + "epoch": 0.09, + "grad_norm": 0.322265625, + "learning_rate": 2.158273381294964e-06, + "logits/chosen": -2.6269524097442627, + "logits/rejected": -2.627486228942871, + "logps/chosen": -61.392478942871094, + "logps/rejected": -64.30213165283203, + "loss": 0.6928, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.001123746857047081, + "rewards/margins": 0.0006171964341774583, + "rewards/rejected": 0.0005065504228696227, + "step": 120 + }, + { + "epoch": 0.09, + "grad_norm": 0.359375, + "learning_rate": 2.3381294964028776e-06, + "logits/chosen": -2.7004921436309814, + "logits/rejected": -2.7044999599456787, + "logps/chosen": -68.17378234863281, + "logps/rejected": -70.49411010742188, + "loss": 0.6929, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0015111321117728949, + "rewards/margins": 0.0004890409181825817, + "rewards/rejected": 0.0010220912517979741, + "step": 130 + }, + { + "epoch": 0.1, + "grad_norm": 0.37890625, + "learning_rate": 2.5179856115107916e-06, + "logits/chosen": -2.646902084350586, + "logits/rejected": -2.6411759853363037, + "logps/chosen": -63.913551330566406, + "logps/rejected": -68.77268981933594, + "loss": 0.6925, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.002318110316991806, + "rewards/margins": 0.0013834238052368164, + "rewards/rejected": 0.0009346865117549896, + "step": 140 + }, + { + "epoch": 0.11, + "grad_norm": 0.453125, + "learning_rate": 2.6978417266187052e-06, + "logits/chosen": -2.6741390228271484, + "logits/rejected": -2.6727261543273926, + "logps/chosen": -65.45186614990234, + "logps/rejected": -69.5618667602539, + "loss": 0.6922, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0026420415379107, + "rewards/margins": 0.001897258684039116, + "rewards/rejected": 0.0007447830284945667, + "step": 150 + }, + { + "epoch": 0.12, + "grad_norm": 0.314453125, + "learning_rate": 2.877697841726619e-06, + "logits/chosen": -2.645352602005005, + "logits/rejected": -2.6511170864105225, + "logps/chosen": -57.53055953979492, + "logps/rejected": -61.90361404418945, + "loss": 0.692, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.003107175463810563, + "rewards/margins": 0.0023571993224322796, + "rewards/rejected": 0.0007499762577936053, + "step": 160 + }, + { + "epoch": 0.12, + "grad_norm": 0.42578125, + "learning_rate": 3.0575539568345324e-06, + "logits/chosen": -2.650054454803467, + "logits/rejected": -2.6488354206085205, + "logps/chosen": -61.54291534423828, + "logps/rejected": -64.30693054199219, + "loss": 0.6917, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0038940298836678267, + "rewards/margins": 0.0028370567597448826, + "rewards/rejected": 0.001056973123922944, + "step": 170 + }, + { + "epoch": 0.13, + "grad_norm": 0.33203125, + "learning_rate": 3.237410071942446e-06, + "logits/chosen": -2.645240306854248, + "logits/rejected": -2.642111301422119, + "logps/chosen": -61.097076416015625, + "logps/rejected": -64.71502685546875, + "loss": 0.6909, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004625825677067041, + "rewards/margins": 0.004492693580687046, + "rewards/rejected": 0.00013313218369148672, + "step": 180 + }, + { + "epoch": 0.14, + "grad_norm": 0.3515625, + "learning_rate": 3.4172661870503596e-06, + "logits/chosen": -2.7127881050109863, + "logits/rejected": -2.710603713989258, + "logps/chosen": -57.48582077026367, + "logps/rejected": -62.37571334838867, + "loss": 0.6913, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.004764446523040533, + "rewards/margins": 0.003814270021393895, + "rewards/rejected": 0.0009501769091002643, + "step": 190 + }, + { + "epoch": 0.14, + "grad_norm": 0.515625, + "learning_rate": 3.5971223021582737e-06, + "logits/chosen": -2.6896915435791016, + "logits/rejected": -2.684767246246338, + "logps/chosen": -59.758766174316406, + "logps/rejected": -64.19367980957031, + "loss": 0.6917, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.0045052021741867065, + "rewards/margins": 0.003016799222677946, + "rewards/rejected": 0.0014884021366015077, + "step": 200 + }, + { + "epoch": 0.15, + "grad_norm": 0.392578125, + "learning_rate": 3.7769784172661873e-06, + "logits/chosen": -2.6605515480041504, + "logits/rejected": -2.6634459495544434, + "logps/chosen": -58.80467987060547, + "logps/rejected": -60.49141311645508, + "loss": 0.691, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.0028493874706327915, + "rewards/margins": 0.004452340304851532, + "rewards/rejected": -0.0016029527178034186, + "step": 210 + }, + { + "epoch": 0.16, + "grad_norm": 0.41796875, + "learning_rate": 3.956834532374101e-06, + "logits/chosen": -2.6214749813079834, + "logits/rejected": -2.6205554008483887, + "logps/chosen": -63.977142333984375, + "logps/rejected": -71.72235870361328, + "loss": 0.6886, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.0048708124086260796, + "rewards/margins": 0.009143907576799393, + "rewards/rejected": -0.0042730942368507385, + "step": 220 + }, + { + "epoch": 0.17, + "grad_norm": 0.390625, + "learning_rate": 4.1366906474820145e-06, + "logits/chosen": -2.663078784942627, + "logits/rejected": -2.667092800140381, + "logps/chosen": -61.06050491333008, + "logps/rejected": -66.15110778808594, + "loss": 0.6897, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.005165424197912216, + "rewards/margins": 0.007069968618452549, + "rewards/rejected": -0.0019045437220484018, + "step": 230 + }, + { + "epoch": 0.17, + "grad_norm": 0.453125, + "learning_rate": 4.316546762589928e-06, + "logits/chosen": -2.675718069076538, + "logits/rejected": -2.6735589504241943, + "logps/chosen": -65.82478332519531, + "logps/rejected": -69.08268737792969, + "loss": 0.6891, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.00702512264251709, + "rewards/margins": 0.008214818313717842, + "rewards/rejected": -0.0011896961368620396, + "step": 240 + }, + { + "epoch": 0.18, + "grad_norm": 0.58984375, + "learning_rate": 4.496402877697842e-06, + "logits/chosen": -2.6274218559265137, + "logits/rejected": -2.6306469440460205, + "logps/chosen": -67.89946746826172, + "logps/rejected": -71.547119140625, + "loss": 0.6877, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.003348552156239748, + "rewards/margins": 0.011096605099737644, + "rewards/rejected": -0.0077480534091591835, + "step": 250 + }, + { + "epoch": 0.19, + "grad_norm": 0.462890625, + "learning_rate": 4.676258992805755e-06, + "logits/chosen": -2.6246440410614014, + "logits/rejected": -2.643188238143921, + "logps/chosen": -67.15058135986328, + "logps/rejected": -71.12448120117188, + "loss": 0.6875, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.0009993333369493484, + "rewards/margins": 0.011576562188565731, + "rewards/rejected": -0.012575894594192505, + "step": 260 + }, + { + "epoch": 0.19, + "grad_norm": 0.51171875, + "learning_rate": 4.856115107913669e-06, + "logits/chosen": -2.6977336406707764, + "logits/rejected": -2.6968212127685547, + "logps/chosen": -65.34959411621094, + "logps/rejected": -68.08098602294922, + "loss": 0.6882, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0017364125233143568, + "rewards/margins": 0.010187914595007896, + "rewards/rejected": -0.011924326419830322, + "step": 270 + }, + { + "epoch": 0.2, + "grad_norm": 0.41796875, + "learning_rate": 4.999992078993707e-06, + "logits/chosen": -2.6335489749908447, + "logits/rejected": -2.640903949737549, + "logps/chosen": -58.345176696777344, + "logps/rejected": -61.308982849121094, + "loss": 0.6871, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.000765187491197139, + "rewards/margins": 0.012545737437903881, + "rewards/rejected": -0.01178054977208376, + "step": 280 + }, + { + "epoch": 0.21, + "grad_norm": 0.44921875, + "learning_rate": 4.999714849043746e-06, + "logits/chosen": -2.662158489227295, + "logits/rejected": -2.674367904663086, + "logps/chosen": -62.21772003173828, + "logps/rejected": -65.60545349121094, + "loss": 0.6867, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.007329708430916071, + "rewards/margins": 0.013346971943974495, + "rewards/rejected": -0.020676681771874428, + "step": 290 + }, + { + "epoch": 0.22, + "grad_norm": 0.46484375, + "learning_rate": 4.999041618971537e-06, + "logits/chosen": -2.6512532234191895, + "logits/rejected": -2.6503214836120605, + "logps/chosen": -67.29080963134766, + "logps/rejected": -72.53589630126953, + "loss": 0.6861, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.010256023146212101, + "rewards/margins": 0.014919854700565338, + "rewards/rejected": -0.025175878778100014, + "step": 300 + }, + { + "epoch": 0.22, + "grad_norm": 0.56640625, + "learning_rate": 4.997972495428924e-06, + "logits/chosen": -2.615621328353882, + "logits/rejected": -2.6233325004577637, + "logps/chosen": -66.02967071533203, + "logps/rejected": -70.49574279785156, + "loss": 0.6852, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.007946187630295753, + "rewards/margins": 0.016535501927137375, + "rewards/rejected": -0.02448168769478798, + "step": 310 + }, + { + "epoch": 0.23, + "grad_norm": 0.439453125, + "learning_rate": 4.996507647784446e-06, + "logits/chosen": -2.638176441192627, + "logits/rejected": -2.6347122192382812, + "logps/chosen": -67.33381652832031, + "logps/rejected": -73.75712585449219, + "loss": 0.6856, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.01815110817551613, + "rewards/margins": 0.016093209385871887, + "rewards/rejected": -0.034244317561388016, + "step": 320 + }, + { + "epoch": 0.24, + "grad_norm": 0.427734375, + "learning_rate": 4.994647308096509e-06, + "logits/chosen": -2.629110813140869, + "logits/rejected": -2.6443512439727783, + "logps/chosen": -69.91134643554688, + "logps/rejected": -69.85363006591797, + "loss": 0.6876, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.014942710287868977, + "rewards/margins": 0.012206320650875568, + "rewards/rejected": -0.027149027213454247, + "step": 330 + }, + { + "epoch": 0.25, + "grad_norm": 0.4765625, + "learning_rate": 4.9923917710766266e-06, + "logits/chosen": -2.6785271167755127, + "logits/rejected": -2.6757400035858154, + "logps/chosen": -71.02973937988281, + "logps/rejected": -75.72981262207031, + "loss": 0.6807, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.014097088947892189, + "rewards/margins": 0.026118427515029907, + "rewards/rejected": -0.04021551460027695, + "step": 340 + }, + { + "epoch": 0.25, + "grad_norm": 0.66015625, + "learning_rate": 4.989741394042728e-06, + "logits/chosen": -2.598215103149414, + "logits/rejected": -2.5950300693511963, + "logps/chosen": -65.64091491699219, + "logps/rejected": -70.74314880371094, + "loss": 0.6838, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.028698483482003212, + "rewards/margins": 0.019990913569927216, + "rewards/rejected": -0.04868939518928528, + "step": 350 + }, + { + "epoch": 0.26, + "grad_norm": 0.5625, + "learning_rate": 4.986696596862556e-06, + "logits/chosen": -2.625063180923462, + "logits/rejected": -2.631725788116455, + "logps/chosen": -78.42835998535156, + "logps/rejected": -84.2737045288086, + "loss": 0.6802, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03507710248231888, + "rewards/margins": 0.027645844966173172, + "rewards/rejected": -0.06272295117378235, + "step": 360 + }, + { + "epoch": 0.27, + "grad_norm": 0.76953125, + "learning_rate": 4.983257861887148e-06, + "logits/chosen": -2.6487419605255127, + "logits/rejected": -2.6524715423583984, + "logps/chosen": -71.53236389160156, + "logps/rejected": -81.15141296386719, + "loss": 0.6738, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.05367087572813034, + "rewards/margins": 0.04164598509669304, + "rewards/rejected": -0.09531687200069427, + "step": 370 + }, + { + "epoch": 0.27, + "grad_norm": 0.56640625, + "learning_rate": 4.979425733874431e-06, + "logits/chosen": -2.575629472732544, + "logits/rejected": -2.5949313640594482, + "logps/chosen": -71.41996765136719, + "logps/rejected": -75.95075225830078, + "loss": 0.6794, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.07148171961307526, + "rewards/margins": 0.03022712469100952, + "rewards/rejected": -0.10170884430408478, + "step": 380 + }, + { + "epoch": 0.28, + "grad_norm": 0.75390625, + "learning_rate": 4.975200819902911e-06, + "logits/chosen": -2.608182430267334, + "logits/rejected": -2.613959550857544, + "logps/chosen": -77.80644226074219, + "logps/rejected": -86.1133804321289, + "loss": 0.6738, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10805950313806534, + "rewards/margins": 0.04201812297105789, + "rewards/rejected": -0.15007762610912323, + "step": 390 + }, + { + "epoch": 0.29, + "grad_norm": 0.7109375, + "learning_rate": 4.970583789275508e-06, + "logits/chosen": -2.565563440322876, + "logits/rejected": -2.575218677520752, + "logps/chosen": -72.14826965332031, + "logps/rejected": -76.73294830322266, + "loss": 0.6828, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.09949363768100739, + "rewards/margins": 0.02443886548280716, + "rewards/rejected": -0.12393250316381454, + "step": 400 + }, + { + "epoch": 0.3, + "grad_norm": 1.046875, + "learning_rate": 4.965575373413527e-06, + "logits/chosen": -2.5901551246643066, + "logits/rejected": -2.592224359512329, + "logps/chosen": -78.75377655029297, + "logps/rejected": -87.20631408691406, + "loss": 0.6708, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.1401161104440689, + "rewards/margins": 0.05296441912651062, + "rewards/rejected": -0.19308052957057953, + "step": 410 + }, + { + "epoch": 0.3, + "grad_norm": 0.8359375, + "learning_rate": 4.960176365740783e-06, + "logits/chosen": -2.568718671798706, + "logits/rejected": -2.5703847408294678, + "logps/chosen": -82.48625183105469, + "logps/rejected": -91.03981018066406, + "loss": 0.6763, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.1931959092617035, + "rewards/margins": 0.04230925068259239, + "rewards/rejected": -0.2355051338672638, + "step": 420 + }, + { + "epoch": 0.31, + "grad_norm": 0.96875, + "learning_rate": 4.954387621557911e-06, + "logits/chosen": -2.472228527069092, + "logits/rejected": -2.4818115234375, + "logps/chosen": -83.677978515625, + "logps/rejected": -90.09959411621094, + "loss": 0.6676, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.20174559950828552, + "rewards/margins": 0.060646455734968185, + "rewards/rejected": -0.2623920440673828, + "step": 430 + }, + { + "epoch": 0.32, + "grad_norm": 1.046875, + "learning_rate": 4.948210057906871e-06, + "logits/chosen": -2.424100637435913, + "logits/rejected": -2.4418275356292725, + "logps/chosen": -88.38543701171875, + "logps/rejected": -100.091552734375, + "loss": 0.6649, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.25682324171066284, + "rewards/margins": 0.07104425877332687, + "rewards/rejected": -0.3278675079345703, + "step": 440 + }, + { + "epoch": 0.32, + "grad_norm": 0.78515625, + "learning_rate": 4.941644653425671e-06, + "logits/chosen": -2.452075481414795, + "logits/rejected": -2.4671432971954346, + "logps/chosen": -100.57665252685547, + "logps/rejected": -104.40872955322266, + "loss": 0.6732, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.2847747802734375, + "rewards/margins": 0.05320798233151436, + "rewards/rejected": -0.33798274397850037, + "step": 450 + }, + { + "epoch": 0.33, + "grad_norm": 1.0234375, + "learning_rate": 4.9346924481933345e-06, + "logits/chosen": -2.459083318710327, + "logits/rejected": -2.4748520851135254, + "logps/chosen": -96.22517395019531, + "logps/rejected": -105.0258560180664, + "loss": 0.6636, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.23906302452087402, + "rewards/margins": 0.06904648244380951, + "rewards/rejected": -0.3081095516681671, + "step": 460 + }, + { + "epoch": 0.34, + "grad_norm": 1.4140625, + "learning_rate": 4.927354543565131e-06, + "logits/chosen": -2.404327630996704, + "logits/rejected": -2.4193339347839355, + "logps/chosen": -101.07810974121094, + "logps/rejected": -109.60540771484375, + "loss": 0.6601, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2962692379951477, + "rewards/margins": 0.07886885851621628, + "rewards/rejected": -0.3751381039619446, + "step": 470 + }, + { + "epoch": 0.35, + "grad_norm": 1.15625, + "learning_rate": 4.919632101998101e-06, + "logits/chosen": -2.4055585861206055, + "logits/rejected": -2.4047584533691406, + "logps/chosen": -83.26808166503906, + "logps/rejected": -96.07670593261719, + "loss": 0.6557, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.25196436047554016, + "rewards/margins": 0.09068725258111954, + "rewards/rejected": -0.3426516056060791, + "step": 480 + }, + { + "epoch": 0.35, + "grad_norm": 1.2109375, + "learning_rate": 4.911526346866907e-06, + "logits/chosen": -2.3670365810394287, + "logits/rejected": -2.380223512649536, + "logps/chosen": -96.45356750488281, + "logps/rejected": -111.1182861328125, + "loss": 0.6479, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.32315492630004883, + "rewards/margins": 0.11133052408695221, + "rewards/rejected": -0.43448543548583984, + "step": 490 + }, + { + "epoch": 0.36, + "grad_norm": 0.953125, + "learning_rate": 4.9030385622700225e-06, + "logits/chosen": -2.3522255420684814, + "logits/rejected": -2.358100414276123, + "logps/chosen": -96.55814361572266, + "logps/rejected": -112.30567932128906, + "loss": 0.6517, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.37474876642227173, + "rewards/margins": 0.10861654579639435, + "rewards/rejected": -0.4833652973175049, + "step": 500 + }, + { + "epoch": 0.37, + "grad_norm": 1.125, + "learning_rate": 4.89417009282631e-06, + "logits/chosen": -2.3778271675109863, + "logits/rejected": -2.390409469604492, + "logps/chosen": -98.19575500488281, + "logps/rejected": -111.83695983886719, + "loss": 0.6495, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3482569754123688, + "rewards/margins": 0.11011794954538345, + "rewards/rejected": -0.45837491750717163, + "step": 510 + }, + { + "epoch": 0.37, + "grad_norm": 1.0703125, + "learning_rate": 4.88492234346201e-06, + "logits/chosen": -2.3503499031066895, + "logits/rejected": -2.3607373237609863, + "logps/chosen": -109.85440826416016, + "logps/rejected": -122.345703125, + "loss": 0.6576, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.37264811992645264, + "rewards/margins": 0.09832003712654114, + "rewards/rejected": -0.47096818685531616, + "step": 520 + }, + { + "epoch": 0.38, + "grad_norm": 1.1328125, + "learning_rate": 4.8752967791881735e-06, + "logits/chosen": -2.356555461883545, + "logits/rejected": -2.362435817718506, + "logps/chosen": -101.36775970458984, + "logps/rejected": -111.54450988769531, + "loss": 0.6626, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.3653712868690491, + "rewards/margins": 0.08574860543012619, + "rewards/rejected": -0.45111989974975586, + "step": 530 + }, + { + "epoch": 0.39, + "grad_norm": 1.5390625, + "learning_rate": 4.865294924868578e-06, + "logits/chosen": -2.3726258277893066, + "logits/rejected": -2.3774704933166504, + "logps/chosen": -95.63130950927734, + "logps/rejected": -108.5069580078125, + "loss": 0.6528, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.3163696825504303, + "rewards/margins": 0.10675134509801865, + "rewards/rejected": -0.42312103509902954, + "step": 540 + }, + { + "epoch": 0.4, + "grad_norm": 1.03125, + "learning_rate": 4.854918364978163e-06, + "logits/chosen": -2.318713665008545, + "logits/rejected": -2.3387556076049805, + "logps/chosen": -92.5940933227539, + "logps/rejected": -102.9334945678711, + "loss": 0.6572, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.27377182245254517, + "rewards/margins": 0.09678633511066437, + "rewards/rejected": -0.3705581724643707, + "step": 550 + }, + { + "epoch": 0.4, + "grad_norm": 1.0625, + "learning_rate": 4.844168743352019e-06, + "logits/chosen": -2.3034849166870117, + "logits/rejected": -2.322415828704834, + "logps/chosen": -93.90568542480469, + "logps/rejected": -103.6268081665039, + "loss": 0.6716, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.28542959690093994, + "rewards/margins": 0.07337291538715363, + "rewards/rejected": -0.35880252718925476, + "step": 560 + }, + { + "epoch": 0.41, + "grad_norm": 1.21875, + "learning_rate": 4.833047762924975e-06, + "logits/chosen": -2.3396031856536865, + "logits/rejected": -2.3490686416625977, + "logps/chosen": -106.96073913574219, + "logps/rejected": -120.58781433105469, + "loss": 0.6514, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.35568124055862427, + "rewards/margins": 0.10962893068790436, + "rewards/rejected": -0.4653101861476898, + "step": 570 + }, + { + "epoch": 0.42, + "grad_norm": 0.78515625, + "learning_rate": 4.8215571854618216e-06, + "logits/chosen": -2.2915313243865967, + "logits/rejected": -2.3102214336395264, + "logps/chosen": -95.60445404052734, + "logps/rejected": -107.88221740722656, + "loss": 0.6512, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.3615376055240631, + "rewards/margins": 0.10991451889276505, + "rewards/rejected": -0.47145208716392517, + "step": 580 + }, + { + "epoch": 0.43, + "grad_norm": 1.9765625, + "learning_rate": 4.809698831278217e-06, + "logits/chosen": -2.359297513961792, + "logits/rejected": -2.364837884902954, + "logps/chosen": -97.43984985351562, + "logps/rejected": -118.7339096069336, + "loss": 0.6343, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3123398721218109, + "rewards/margins": 0.1503853052854538, + "rewards/rejected": -0.4627251625061035, + "step": 590 + }, + { + "epoch": 0.43, + "grad_norm": 1.015625, + "learning_rate": 4.797474578952315e-06, + "logits/chosen": -2.364551544189453, + "logits/rejected": -2.368478536605835, + "logps/chosen": -97.71867370605469, + "logps/rejected": -116.11451721191406, + "loss": 0.6416, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.3463926315307617, + "rewards/margins": 0.13709910213947296, + "rewards/rejected": -0.4834917485713959, + "step": 600 + }, + { + "epoch": 0.44, + "grad_norm": 1.171875, + "learning_rate": 4.7848863650271645e-06, + "logits/chosen": -2.346735954284668, + "logits/rejected": -2.349565029144287, + "logps/chosen": -99.36726379394531, + "logps/rejected": -108.9990463256836, + "loss": 0.6641, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.31063222885131836, + "rewards/margins": 0.08253936469554901, + "rewards/rejected": -0.39317160844802856, + "step": 610 + }, + { + "epoch": 0.45, + "grad_norm": 1.2421875, + "learning_rate": 4.771936183703927e-06, + "logits/chosen": -2.2801272869110107, + "logits/rejected": -2.286823034286499, + "logps/chosen": -90.63265228271484, + "logps/rejected": -99.37889099121094, + "loss": 0.6713, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.30601394176483154, + "rewards/margins": 0.0690702348947525, + "rewards/rejected": -0.37508416175842285, + "step": 620 + }, + { + "epoch": 0.45, + "grad_norm": 1.40625, + "learning_rate": 4.758626086525956e-06, + "logits/chosen": -2.3465566635131836, + "logits/rejected": -2.3557307720184326, + "logps/chosen": -91.6519546508789, + "logps/rejected": -107.96331787109375, + "loss": 0.6483, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.32235291600227356, + "rewards/margins": 0.11441938579082489, + "rewards/rejected": -0.43677228689193726, + "step": 630 + }, + { + "epoch": 0.46, + "grad_norm": 1.9453125, + "learning_rate": 4.7449581820538e-06, + "logits/chosen": -2.3313632011413574, + "logits/rejected": -2.3418033123016357, + "logps/chosen": -95.18330383300781, + "logps/rejected": -111.15785217285156, + "loss": 0.6478, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.32686877250671387, + "rewards/margins": 0.12550675868988037, + "rewards/rejected": -0.45237550139427185, + "step": 640 + }, + { + "epoch": 0.47, + "grad_norm": 1.46875, + "learning_rate": 4.730934635531161e-06, + "logits/chosen": -2.3043503761291504, + "logits/rejected": -2.310375690460205, + "logps/chosen": -97.12528228759766, + "logps/rejected": -108.67928314208984, + "loss": 0.6521, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.34178540110588074, + "rewards/margins": 0.11280516535043716, + "rewards/rejected": -0.4545906186103821, + "step": 650 + }, + { + "epoch": 0.48, + "grad_norm": 1.7578125, + "learning_rate": 4.716557668541893e-06, + "logits/chosen": -2.343346118927002, + "logits/rejected": -2.3510937690734863, + "logps/chosen": -97.1328125, + "logps/rejected": -114.3864974975586, + "loss": 0.6387, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31538838148117065, + "rewards/margins": 0.13619980216026306, + "rewards/rejected": -0.4515882134437561, + "step": 660 + }, + { + "epoch": 0.48, + "grad_norm": 1.328125, + "learning_rate": 4.701829558658047e-06, + "logits/chosen": -2.3206913471221924, + "logits/rejected": -2.3359267711639404, + "logps/chosen": -102.4328384399414, + "logps/rejected": -114.6532211303711, + "loss": 0.6502, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.343585729598999, + "rewards/margins": 0.11280594021081924, + "rewards/rejected": -0.4563916325569153, + "step": 670 + }, + { + "epoch": 0.49, + "grad_norm": 1.21875, + "learning_rate": 4.686752639079076e-06, + "logits/chosen": -2.28320050239563, + "logits/rejected": -2.2843213081359863, + "logps/chosen": -101.29241180419922, + "logps/rejected": -113.52595520019531, + "loss": 0.649, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.33359086513519287, + "rewards/margins": 0.11714836210012436, + "rewards/rejected": -0.45073920488357544, + "step": 680 + }, + { + "epoch": 0.5, + "grad_norm": 1.7734375, + "learning_rate": 4.671329298262208e-06, + "logits/chosen": -2.351982593536377, + "logits/rejected": -2.357144832611084, + "logps/chosen": -102.9426498413086, + "logps/rejected": -118.25224304199219, + "loss": 0.6478, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.3722483217716217, + "rewards/margins": 0.11313033103942871, + "rewards/rejected": -0.4853786528110504, + "step": 690 + }, + { + "epoch": 0.5, + "grad_norm": 1.15625, + "learning_rate": 4.655561979544069e-06, + "logits/chosen": -2.2974140644073486, + "logits/rejected": -2.307819366455078, + "logps/chosen": -101.06309509277344, + "logps/rejected": -117.390625, + "loss": 0.6451, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.3607991635799408, + "rewards/margins": 0.1265007108449936, + "rewards/rejected": -0.4872998595237732, + "step": 700 + }, + { + "epoch": 0.51, + "grad_norm": 1.4765625, + "learning_rate": 4.639453180753619e-06, + "logits/chosen": -2.248704433441162, + "logits/rejected": -2.257744789123535, + "logps/chosen": -100.77429962158203, + "logps/rejected": -117.0492935180664, + "loss": 0.647, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.43067851662635803, + "rewards/margins": 0.1362551599740982, + "rewards/rejected": -0.5669336318969727, + "step": 710 + }, + { + "epoch": 0.52, + "grad_norm": 1.2890625, + "learning_rate": 4.623005453816447e-06, + "logits/chosen": -2.3472437858581543, + "logits/rejected": -2.352238655090332, + "logps/chosen": -115.71247863769531, + "logps/rejected": -131.8263702392578, + "loss": 0.647, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.47270336747169495, + "rewards/margins": 0.12508396804332733, + "rewards/rejected": -0.5977872610092163, + "step": 720 + }, + { + "epoch": 0.53, + "grad_norm": 1.6328125, + "learning_rate": 4.606221404350504e-06, + "logits/chosen": -2.28971529006958, + "logits/rejected": -2.29419207572937, + "logps/chosen": -109.21917724609375, + "logps/rejected": -124.91645812988281, + "loss": 0.6463, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4223889708518982, + "rewards/margins": 0.13507941365242004, + "rewards/rejected": -0.5574684143066406, + "step": 730 + }, + { + "epoch": 0.53, + "grad_norm": 2.171875, + "learning_rate": 4.589103691253317e-06, + "logits/chosen": -2.250274658203125, + "logits/rejected": -2.2717387676239014, + "logps/chosen": -112.26399230957031, + "logps/rejected": -119.15122985839844, + "loss": 0.6655, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.47238197922706604, + "rewards/margins": 0.08317569643259048, + "rewards/rejected": -0.5555577278137207, + "step": 740 + }, + { + "epoch": 0.54, + "grad_norm": 1.921875, + "learning_rate": 4.571655026280785e-06, + "logits/chosen": -2.2718663215637207, + "logits/rejected": -2.284795045852661, + "logps/chosen": -112.97920227050781, + "logps/rejected": -127.8006591796875, + "loss": 0.6484, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4664887487888336, + "rewards/margins": 0.13087505102157593, + "rewards/rejected": -0.5973638296127319, + "step": 750 + }, + { + "epoch": 0.55, + "grad_norm": 1.234375, + "learning_rate": 4.553878173617576e-06, + "logits/chosen": -2.28155517578125, + "logits/rejected": -2.289883613586426, + "logps/chosen": -99.67669677734375, + "logps/rejected": -116.39395904541016, + "loss": 0.6423, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3680870532989502, + "rewards/margins": 0.13191227614879608, + "rewards/rejected": -0.49999934434890747, + "step": 760 + }, + { + "epoch": 0.55, + "grad_norm": 1.4453125, + "learning_rate": 4.5357759494392354e-06, + "logits/chosen": -2.2865400314331055, + "logits/rejected": -2.301579475402832, + "logps/chosen": -103.4640884399414, + "logps/rejected": -120.11143493652344, + "loss": 0.6447, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.42991527915000916, + "rewards/margins": 0.13045726716518402, + "rewards/rejected": -0.560372531414032, + "step": 770 + }, + { + "epoch": 0.56, + "grad_norm": 1.8828125, + "learning_rate": 4.5173512214660495e-06, + "logits/chosen": -2.290435314178467, + "logits/rejected": -2.3016152381896973, + "logps/chosen": -104.1209716796875, + "logps/rejected": -120.07255554199219, + "loss": 0.6424, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3812081217765808, + "rewards/margins": 0.13323888182640076, + "rewards/rejected": -0.5144469738006592, + "step": 780 + }, + { + "epoch": 0.57, + "grad_norm": 1.5546875, + "learning_rate": 4.498606908508754e-06, + "logits/chosen": -2.281541109085083, + "logits/rejected": -2.2845287322998047, + "logps/chosen": -108.74382019042969, + "logps/rejected": -127.21275329589844, + "loss": 0.6438, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.4051045775413513, + "rewards/margins": 0.14055274426937103, + "rewards/rejected": -0.5456573367118835, + "step": 790 + }, + { + "epoch": 0.58, + "grad_norm": 1.2578125, + "learning_rate": 4.47954598000613e-06, + "logits/chosen": -2.3543689250946045, + "logits/rejected": -2.363257646560669, + "logps/chosen": -96.17681884765625, + "logps/rejected": -110.3593521118164, + "loss": 0.6472, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.319545716047287, + "rewards/margins": 0.120999276638031, + "rewards/rejected": -0.440544992685318, + "step": 800 + }, + { + "epoch": 0.58, + "grad_norm": 1.578125, + "learning_rate": 4.460171455554603e-06, + "logits/chosen": -2.2809572219848633, + "logits/rejected": -2.2786245346069336, + "logps/chosen": -99.40967559814453, + "logps/rejected": -117.13094329833984, + "loss": 0.6423, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.37397512793540955, + "rewards/margins": 0.14184913039207458, + "rewards/rejected": -0.5158242583274841, + "step": 810 + }, + { + "epoch": 0.59, + "grad_norm": 1.890625, + "learning_rate": 4.4404864044298755e-06, + "logits/chosen": -2.23799467086792, + "logits/rejected": -2.245177745819092, + "logps/chosen": -108.29356384277344, + "logps/rejected": -121.1603012084961, + "loss": 0.653, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.43523016571998596, + "rewards/margins": 0.12470052391290665, + "rewards/rejected": -0.559930682182312, + "step": 820 + }, + { + "epoch": 0.6, + "grad_norm": 1.578125, + "learning_rate": 4.420493945100702e-06, + "logits/chosen": -2.266139507293701, + "logits/rejected": -2.2764334678649902, + "logps/chosen": -99.6155014038086, + "logps/rejected": -117.4134521484375, + "loss": 0.6368, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.40347641706466675, + "rewards/margins": 0.1434660702943802, + "rewards/rejected": -0.5469424724578857, + "step": 830 + }, + { + "epoch": 0.61, + "grad_norm": 1.3203125, + "learning_rate": 4.400197244734866e-06, + "logits/chosen": -2.3086845874786377, + "logits/rejected": -2.3136982917785645, + "logps/chosen": -105.7784652709961, + "logps/rejected": -123.13346099853516, + "loss": 0.6323, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.41168150305747986, + "rewards/margins": 0.1643691062927246, + "rewards/rejected": -0.5760505795478821, + "step": 840 + }, + { + "epoch": 0.61, + "grad_norm": 1.6171875, + "learning_rate": 4.379599518697444e-06, + "logits/chosen": -2.302346706390381, + "logits/rejected": -2.305290699005127, + "logps/chosen": -110.01615905761719, + "logps/rejected": -132.0598602294922, + "loss": 0.6283, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45303821563720703, + "rewards/margins": 0.17430761456489563, + "rewards/rejected": -0.627345860004425, + "step": 850 + }, + { + "epoch": 0.62, + "grad_norm": 2.109375, + "learning_rate": 4.3587040300414325e-06, + "logits/chosen": -2.249532461166382, + "logits/rejected": -2.2589190006256104, + "logps/chosen": -117.6961441040039, + "logps/rejected": -128.64663696289062, + "loss": 0.6567, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.40040212869644165, + "rewards/margins": 0.11040042340755463, + "rewards/rejected": -0.5108025670051575, + "step": 860 + }, + { + "epoch": 0.63, + "grad_norm": 1.296875, + "learning_rate": 4.337514088990822e-06, + "logits/chosen": -2.278533458709717, + "logits/rejected": -2.281517267227173, + "logps/chosen": -103.110595703125, + "logps/rejected": -122.44902038574219, + "loss": 0.6331, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3665826916694641, + "rewards/margins": 0.16001132130622864, + "rewards/rejected": -0.5265940427780151, + "step": 870 + }, + { + "epoch": 0.63, + "grad_norm": 2.09375, + "learning_rate": 4.316033052416196e-06, + "logits/chosen": -2.2408275604248047, + "logits/rejected": -2.2425954341888428, + "logps/chosen": -104.7763442993164, + "logps/rejected": -116.91007232666016, + "loss": 0.6591, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.4133949875831604, + "rewards/margins": 0.10547590255737305, + "rewards/rejected": -0.5188708901405334, + "step": 880 + }, + { + "epoch": 0.64, + "grad_norm": 1.4296875, + "learning_rate": 4.294264323302946e-06, + "logits/chosen": -2.3082475662231445, + "logits/rejected": -2.3192391395568848, + "logps/chosen": -103.19981384277344, + "logps/rejected": -117.45137023925781, + "loss": 0.6493, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.413215696811676, + "rewards/margins": 0.1274113655090332, + "rewards/rejected": -0.5406270027160645, + "step": 890 + }, + { + "epoch": 0.65, + "grad_norm": 2.140625, + "learning_rate": 4.272211350212171e-06, + "logits/chosen": -2.3206677436828613, + "logits/rejected": -2.3214950561523438, + "logps/chosen": -110.54658508300781, + "logps/rejected": -124.14703369140625, + "loss": 0.6599, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.38299205899238586, + "rewards/margins": 0.10237312316894531, + "rewards/rejected": -0.48536521196365356, + "step": 900 + }, + { + "epoch": 0.66, + "grad_norm": 1.7578125, + "learning_rate": 4.249877626734366e-06, + "logits/chosen": -2.2740793228149414, + "logits/rejected": -2.2952816486358643, + "logps/chosen": -108.4576644897461, + "logps/rejected": -121.01176452636719, + "loss": 0.6539, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.38486912846565247, + "rewards/margins": 0.1163020133972168, + "rewards/rejected": -0.5011711120605469, + "step": 910 + }, + { + "epoch": 0.66, + "grad_norm": 1.7109375, + "learning_rate": 4.2272666909359784e-06, + "logits/chosen": -2.2910335063934326, + "logits/rejected": -2.295705795288086, + "logps/chosen": -102.13375091552734, + "logps/rejected": -124.9188003540039, + "loss": 0.6204, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.4369734823703766, + "rewards/margins": 0.20262956619262695, + "rewards/rejected": -0.6396030783653259, + "step": 920 + }, + { + "epoch": 0.67, + "grad_norm": 1.1484375, + "learning_rate": 4.2043821247989036e-06, + "logits/chosen": -2.278778553009033, + "logits/rejected": -2.2924065589904785, + "logps/chosen": -103.04777526855469, + "logps/rejected": -120.569091796875, + "loss": 0.6403, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.38494181632995605, + "rewards/margins": 0.1459546983242035, + "rewards/rejected": -0.5308965444564819, + "step": 930 + }, + { + "epoch": 0.68, + "grad_norm": 2.171875, + "learning_rate": 4.181227553653045e-06, + "logits/chosen": -2.278262138366699, + "logits/rejected": -2.3009345531463623, + "logps/chosen": -121.49980163574219, + "logps/rejected": -137.34304809570312, + "loss": 0.6449, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4821853041648865, + "rewards/margins": 0.14446020126342773, + "rewards/rejected": -0.6266454458236694, + "step": 940 + }, + { + "epoch": 0.68, + "grad_norm": 2.203125, + "learning_rate": 4.1578066456019885e-06, + "logits/chosen": -2.2163925170898438, + "logits/rejected": -2.204552412033081, + "logps/chosen": -114.92356872558594, + "logps/rejected": -136.88050842285156, + "loss": 0.6338, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.5124049186706543, + "rewards/margins": 0.1587189882993698, + "rewards/rejected": -0.6711238622665405, + "step": 950 + }, + { + "epoch": 0.69, + "grad_norm": 1.734375, + "learning_rate": 4.1341231109419135e-06, + "logits/chosen": -2.203275442123413, + "logits/rejected": -2.2119054794311523, + "logps/chosen": -123.16712951660156, + "logps/rejected": -137.21139526367188, + "loss": 0.6585, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.5569948554039001, + "rewards/margins": 0.1151203066110611, + "rewards/rejected": -0.6721151471138, + "step": 960 + }, + { + "epoch": 0.7, + "grad_norm": 1.5859375, + "learning_rate": 4.110180701573809e-06, + "logits/chosen": -2.200212001800537, + "logits/rejected": -2.198477268218994, + "logps/chosen": -109.5115966796875, + "logps/rejected": -132.9260711669922, + "loss": 0.6196, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.4715866148471832, + "rewards/margins": 0.19370624423027039, + "rewards/rejected": -0.6652928590774536, + "step": 970 + }, + { + "epoch": 0.71, + "grad_norm": 1.0546875, + "learning_rate": 4.085983210409114e-06, + "logits/chosen": -2.227853775024414, + "logits/rejected": -2.2186429500579834, + "logps/chosen": -118.89933013916016, + "logps/rejected": -136.7680206298828, + "loss": 0.6514, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.49003705382347107, + "rewards/margins": 0.12437830865383148, + "rewards/rejected": -0.6144154071807861, + "step": 980 + }, + { + "epoch": 0.71, + "grad_norm": 1.1640625, + "learning_rate": 4.061534470768841e-06, + "logits/chosen": -2.2407491207122803, + "logits/rejected": -2.2455482482910156, + "logps/chosen": -111.09078216552734, + "logps/rejected": -124.2870101928711, + "loss": 0.6505, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.42453131079673767, + "rewards/margins": 0.11639855057001114, + "rewards/rejected": -0.540929913520813, + "step": 990 + }, + { + "epoch": 0.72, + "grad_norm": 2.484375, + "learning_rate": 4.036838355776313e-06, + "logits/chosen": -2.1629438400268555, + "logits/rejected": -2.169175386428833, + "logps/chosen": -115.92777252197266, + "logps/rejected": -131.81134033203125, + "loss": 0.6513, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.5123138427734375, + "rewards/margins": 0.13096138834953308, + "rewards/rejected": -0.6432752013206482, + "step": 1000 + }, + { + "epoch": 0.73, + "grad_norm": 1.21875, + "learning_rate": 4.011898777743594e-06, + "logits/chosen": -2.211540699005127, + "logits/rejected": -2.2166659832000732, + "logps/chosen": -101.62760162353516, + "logps/rejected": -119.66593933105469, + "loss": 0.6398, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4295649528503418, + "rewards/margins": 0.15017978847026825, + "rewards/rejected": -0.5797447562217712, + "step": 1010 + }, + { + "epoch": 0.74, + "grad_norm": 2.453125, + "learning_rate": 3.9867196875517025e-06, + "logits/chosen": -2.20629620552063, + "logits/rejected": -2.2112419605255127, + "logps/chosen": -107.81523132324219, + "logps/rejected": -119.03303527832031, + "loss": 0.6656, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.4413573145866394, + "rewards/margins": 0.09499961137771606, + "rewards/rejected": -0.5363569855690002, + "step": 1020 + }, + { + "epoch": 0.74, + "grad_norm": 1.8828125, + "learning_rate": 3.961305074024722e-06, + "logits/chosen": -2.125932216644287, + "logits/rejected": -2.130676746368408, + "logps/chosen": -112.90400695800781, + "logps/rejected": -138.2743377685547, + "loss": 0.6151, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5330706834793091, + "rewards/margins": 0.2042117565870285, + "rewards/rejected": -0.737282395362854, + "step": 1030 + }, + { + "epoch": 0.75, + "grad_norm": 1.5234375, + "learning_rate": 3.935658963297902e-06, + "logits/chosen": -2.212306261062622, + "logits/rejected": -2.2203996181488037, + "logps/chosen": -109.0052261352539, + "logps/rejected": -125.78495788574219, + "loss": 0.6437, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.44839105010032654, + "rewards/margins": 0.14669093489646912, + "rewards/rejected": -0.5950819849967957, + "step": 1040 + }, + { + "epoch": 0.76, + "grad_norm": 2.28125, + "learning_rate": 3.90978541817984e-06, + "logits/chosen": -2.1384072303771973, + "logits/rejected": -2.143054485321045, + "logps/chosen": -108.29044342041016, + "logps/rejected": -127.79345703125, + "loss": 0.6431, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.5090142488479614, + "rewards/margins": 0.16035351157188416, + "rewards/rejected": -0.669367790222168, + "step": 1050 + }, + { + "epoch": 0.76, + "grad_norm": 1.28125, + "learning_rate": 3.8836885375088635e-06, + "logits/chosen": -2.131621837615967, + "logits/rejected": -2.1531243324279785, + "logps/chosen": -115.8088150024414, + "logps/rejected": -133.919921875, + "loss": 0.6405, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5001341104507446, + "rewards/margins": 0.1682174801826477, + "rewards/rejected": -0.6683515310287476, + "step": 1060 + }, + { + "epoch": 0.77, + "grad_norm": 2.28125, + "learning_rate": 3.857372455503698e-06, + "logits/chosen": -2.1725549697875977, + "logits/rejected": -2.1732017993927, + "logps/chosen": -117.09075927734375, + "logps/rejected": -135.18533325195312, + "loss": 0.6503, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.43959444761276245, + "rewards/margins": 0.13271991908550262, + "rewards/rejected": -0.5723143815994263, + "step": 1070 + }, + { + "epoch": 0.78, + "grad_norm": 2.453125, + "learning_rate": 3.830841341108528e-06, + "logits/chosen": -2.212951421737671, + "logits/rejected": -2.2198729515075684, + "logps/chosen": -111.72041320800781, + "logps/rejected": -132.552001953125, + "loss": 0.6288, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.45366325974464417, + "rewards/margins": 0.17270301282405853, + "rewards/rejected": -0.6263662576675415, + "step": 1080 + }, + { + "epoch": 0.79, + "grad_norm": 2.28125, + "learning_rate": 3.804099397332572e-06, + "logits/chosen": -2.215224027633667, + "logits/rejected": -2.210907459259033, + "logps/chosen": -112.65693664550781, + "logps/rejected": -135.70443725585938, + "loss": 0.6231, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.41242438554763794, + "rewards/margins": 0.18879784643650055, + "rewards/rejected": -0.6012222766876221, + "step": 1090 + }, + { + "epoch": 0.79, + "grad_norm": 1.9375, + "learning_rate": 3.7771508605842372e-06, + "logits/chosen": -2.112990140914917, + "logits/rejected": -2.1238150596618652, + "logps/chosen": -116.11753845214844, + "logps/rejected": -138.2332000732422, + "loss": 0.6198, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5049333572387695, + "rewards/margins": 0.20600661635398865, + "rewards/rejected": -0.7109400033950806, + "step": 1100 + }, + { + "epoch": 0.8, + "grad_norm": 2.078125, + "learning_rate": 3.7500000000000005e-06, + "logits/chosen": -2.1379756927490234, + "logits/rejected": -2.13820219039917, + "logps/chosen": -114.88435363769531, + "logps/rejected": -133.52407836914062, + "loss": 0.6373, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4984721541404724, + "rewards/margins": 0.16139784455299377, + "rewards/rejected": -0.6598700881004333, + "step": 1110 + }, + { + "epoch": 0.81, + "grad_norm": 1.8125, + "learning_rate": 3.7226511167681014e-06, + "logits/chosen": -2.135016918182373, + "logits/rejected": -2.126314163208008, + "logps/chosen": -111.63895416259766, + "logps/rejected": -126.47679138183594, + "loss": 0.6509, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.48495396971702576, + "rewards/margins": 0.13388456404209137, + "rewards/rejected": -0.6188385486602783, + "step": 1120 + }, + { + "epoch": 0.81, + "grad_norm": 1.8203125, + "learning_rate": 3.6951085434471544e-06, + "logits/chosen": -2.1722989082336426, + "logits/rejected": -2.166605234146118, + "logps/chosen": -105.00669860839844, + "logps/rejected": -117.9762191772461, + "loss": 0.6555, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.44906917214393616, + "rewards/margins": 0.1117476224899292, + "rewards/rejected": -0.5608168840408325, + "step": 1130 + }, + { + "epoch": 0.82, + "grad_norm": 2.21875, + "learning_rate": 3.6673766432797948e-06, + "logits/chosen": -2.1750612258911133, + "logits/rejected": -2.1879947185516357, + "logps/chosen": -123.87275695800781, + "logps/rejected": -144.77828979492188, + "loss": 0.6334, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.584213137626648, + "rewards/margins": 0.1833323985338211, + "rewards/rejected": -0.7675455808639526, + "step": 1140 + }, + { + "epoch": 0.83, + "grad_norm": 1.671875, + "learning_rate": 3.6394598095014577e-06, + "logits/chosen": -2.210446834564209, + "logits/rejected": -2.213280200958252, + "logps/chosen": -107.16650390625, + "logps/rejected": -124.11837005615234, + "loss": 0.6448, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.40074795484542847, + "rewards/margins": 0.14601869881153107, + "rewards/rejected": -0.5467666387557983, + "step": 1150 + }, + { + "epoch": 0.84, + "grad_norm": 2.25, + "learning_rate": 3.611362464644415e-06, + "logits/chosen": -2.128871202468872, + "logits/rejected": -2.1372876167297363, + "logps/chosen": -116.8992919921875, + "logps/rejected": -126.83099365234375, + "loss": 0.6706, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.47680729627609253, + "rewards/margins": 0.09380488097667694, + "rewards/rejected": -0.5706123113632202, + "step": 1160 + }, + { + "epoch": 0.84, + "grad_norm": 1.734375, + "learning_rate": 3.5830890598371636e-06, + "logits/chosen": -2.23905611038208, + "logits/rejected": -2.252377510070801, + "logps/chosen": -107.7120361328125, + "logps/rejected": -124.34709167480469, + "loss": 0.6327, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4397021234035492, + "rewards/margins": 0.1647356003522873, + "rewards/rejected": -0.6044376492500305, + "step": 1170 + }, + { + "epoch": 0.85, + "grad_norm": 1.859375, + "learning_rate": 3.5546440740992856e-06, + "logits/chosen": -2.1930408477783203, + "logits/rejected": -2.2014918327331543, + "logps/chosen": -117.5343017578125, + "logps/rejected": -131.0255889892578, + "loss": 0.6547, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5026694536209106, + "rewards/margins": 0.11716220527887344, + "rewards/rejected": -0.6198316812515259, + "step": 1180 + }, + { + "epoch": 0.86, + "grad_norm": 1.765625, + "learning_rate": 3.5260320136318927e-06, + "logits/chosen": -2.1664159297943115, + "logits/rejected": -2.176593542098999, + "logps/chosen": -120.20884704589844, + "logps/rejected": -136.73406982421875, + "loss": 0.637, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.522860586643219, + "rewards/margins": 0.15705768764019012, + "rewards/rejected": -0.6799181699752808, + "step": 1190 + }, + { + "epoch": 0.86, + "grad_norm": 2.140625, + "learning_rate": 3.4972574111037587e-06, + "logits/chosen": -2.1755106449127197, + "logits/rejected": -2.1772923469543457, + "logps/chosen": -115.28816223144531, + "logps/rejected": -133.49465942382812, + "loss": 0.6426, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44179558753967285, + "rewards/margins": 0.16654905676841736, + "rewards/rejected": -0.6083446741104126, + "step": 1200 + }, + { + "epoch": 0.87, + "grad_norm": 1.484375, + "learning_rate": 3.468324824933267e-06, + "logits/chosen": -2.151540756225586, + "logits/rejected": -2.1717865467071533, + "logps/chosen": -115.64791107177734, + "logps/rejected": -132.0801239013672, + "loss": 0.6446, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.548166036605835, + "rewards/margins": 0.13710884749889374, + "rewards/rejected": -0.6852747797966003, + "step": 1210 + }, + { + "epoch": 0.88, + "grad_norm": 1.0390625, + "learning_rate": 3.4392388385662713e-06, + "logits/chosen": -2.1935017108917236, + "logits/rejected": -2.195500612258911, + "logps/chosen": -107.0605697631836, + "logps/rejected": -129.38616943359375, + "loss": 0.6337, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.43120819330215454, + "rewards/margins": 0.18481549620628357, + "rewards/rejected": -0.6160237193107605, + "step": 1220 + }, + { + "epoch": 0.89, + "grad_norm": 1.671875, + "learning_rate": 3.410004059749996e-06, + "logits/chosen": -2.164797067642212, + "logits/rejected": -2.172008514404297, + "logps/chosen": -110.7061767578125, + "logps/rejected": -132.82736206054688, + "loss": 0.6242, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.4369031488895416, + "rewards/margins": 0.19907937943935394, + "rewards/rejected": -0.6359825134277344, + "step": 1230 + }, + { + "epoch": 0.89, + "grad_norm": 1.6796875, + "learning_rate": 3.3806251198030843e-06, + "logits/chosen": -2.1183745861053467, + "logits/rejected": -2.13506817817688, + "logps/chosen": -103.0443344116211, + "logps/rejected": -128.4714813232422, + "loss": 0.6102, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4332190155982971, + "rewards/margins": 0.22365593910217285, + "rewards/rejected": -0.65687495470047, + "step": 1240 + }, + { + "epoch": 0.9, + "grad_norm": 1.484375, + "learning_rate": 3.351106672881915e-06, + "logits/chosen": -2.1771786212921143, + "logits/rejected": -2.1897802352905273, + "logps/chosen": -114.33122253417969, + "logps/rejected": -135.31690979003906, + "loss": 0.6334, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.44074517488479614, + "rewards/margins": 0.17149756848812103, + "rewards/rejected": -0.6122426986694336, + "step": 1250 + }, + { + "epoch": 0.91, + "grad_norm": 2.21875, + "learning_rate": 3.3214533952433017e-06, + "logits/chosen": -2.203437328338623, + "logits/rejected": -2.194852113723755, + "logps/chosen": -114.83846282958984, + "logps/rejected": -132.64036560058594, + "loss": 0.6596, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.5022262334823608, + "rewards/margins": 0.1002851277589798, + "rewards/rejected": -0.6025113463401794, + "step": 1260 + }, + { + "epoch": 0.92, + "grad_norm": 1.421875, + "learning_rate": 3.291669984503682e-06, + "logits/chosen": -2.09834361076355, + "logits/rejected": -2.1034817695617676, + "logps/chosen": -119.2296371459961, + "logps/rejected": -144.80642700195312, + "loss": 0.6184, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5479999780654907, + "rewards/margins": 0.22145429253578186, + "rewards/rejected": -0.7694542407989502, + "step": 1270 + }, + { + "epoch": 0.92, + "grad_norm": 2.09375, + "learning_rate": 3.261761158894937e-06, + "logits/chosen": -2.072908878326416, + "logits/rejected": -2.075850009918213, + "logps/chosen": -121.2236099243164, + "logps/rejected": -149.28993225097656, + "loss": 0.6121, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5698887705802917, + "rewards/margins": 0.23544566333293915, + "rewards/rejected": -0.8053344488143921, + "step": 1280 + }, + { + "epoch": 0.93, + "grad_norm": 3.453125, + "learning_rate": 3.231731656516936e-06, + "logits/chosen": -2.1070938110351562, + "logits/rejected": -2.1028828620910645, + "logps/chosen": -110.7509765625, + "logps/rejected": -132.27105712890625, + "loss": 0.6311, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.46972590684890747, + "rewards/margins": 0.17397567629814148, + "rewards/rejected": -0.6437015533447266, + "step": 1290 + }, + { + "epoch": 0.94, + "grad_norm": 1.5234375, + "learning_rate": 3.2015862345869335e-06, + "logits/chosen": -2.1732888221740723, + "logits/rejected": -2.181213855743408, + "logps/chosen": -111.25699615478516, + "logps/rejected": -123.9486083984375, + "loss": 0.6552, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.44834476709365845, + "rewards/margins": 0.11553524434566498, + "rewards/rejected": -0.5638800263404846, + "step": 1300 + }, + { + "epoch": 0.94, + "grad_norm": 1.7734375, + "learning_rate": 3.171329668685942e-06, + "logits/chosen": -2.0767674446105957, + "logits/rejected": -2.070704936981201, + "logps/chosen": -110.462890625, + "logps/rejected": -133.4669647216797, + "loss": 0.6249, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5272938013076782, + "rewards/margins": 0.18891780078411102, + "rewards/rejected": -0.7162116765975952, + "step": 1310 + }, + { + "epoch": 0.95, + "grad_norm": 1.078125, + "learning_rate": 3.140966752002193e-06, + "logits/chosen": -2.0980172157287598, + "logits/rejected": -2.102271556854248, + "logps/chosen": -103.88057708740234, + "logps/rejected": -130.67318725585938, + "loss": 0.6066, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.39303597807884216, + "rewards/margins": 0.2400202453136444, + "rewards/rejected": -0.6330562233924866, + "step": 1320 + }, + { + "epoch": 0.96, + "grad_norm": 1.4921875, + "learning_rate": 3.1105022945718076e-06, + "logits/chosen": -2.0586235523223877, + "logits/rejected": -2.080989360809326, + "logps/chosen": -132.25013732910156, + "logps/rejected": -150.72528076171875, + "loss": 0.641, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6205312013626099, + "rewards/margins": 0.177308589220047, + "rewards/rejected": -0.7978397607803345, + "step": 1330 + }, + { + "epoch": 0.97, + "grad_norm": 1.78125, + "learning_rate": 3.079941122516803e-06, + "logits/chosen": -2.0391013622283936, + "logits/rejected": -2.037480592727661, + "logps/chosen": -114.75526428222656, + "logps/rejected": -132.84378051757812, + "loss": 0.6534, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.5115147829055786, + "rewards/margins": 0.1457727551460266, + "rewards/rejected": -0.6572875380516052, + "step": 1340 + }, + { + "epoch": 0.97, + "grad_norm": 1.2578125, + "learning_rate": 3.0492880772805433e-06, + "logits/chosen": -2.05072283744812, + "logits/rejected": -2.057342767715454, + "logps/chosen": -120.1377182006836, + "logps/rejected": -134.4777374267578, + "loss": 0.6499, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.4882054328918457, + "rewards/margins": 0.1324242353439331, + "rewards/rejected": -0.6206297278404236, + "step": 1350 + }, + { + "epoch": 0.98, + "grad_norm": 1.5, + "learning_rate": 3.018548014860769e-06, + "logits/chosen": -2.007279872894287, + "logits/rejected": -2.0130622386932373, + "logps/chosen": -120.3442153930664, + "logps/rejected": -143.51101684570312, + "loss": 0.6304, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5948348045349121, + "rewards/margins": 0.20342986285686493, + "rewards/rejected": -0.7982646822929382, + "step": 1360 + }, + { + "epoch": 0.99, + "grad_norm": 2.3125, + "learning_rate": 2.9877258050403214e-06, + "logits/chosen": -2.031801223754883, + "logits/rejected": -2.023563861846924, + "logps/chosen": -121.27830505371094, + "logps/rejected": -140.32757568359375, + "loss": 0.6447, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6015397310256958, + "rewards/margins": 0.1492132693529129, + "rewards/rejected": -0.7507530450820923, + "step": 1370 + }, + { + "epoch": 0.99, + "grad_norm": 1.7578125, + "learning_rate": 2.9568263306156754e-06, + "logits/chosen": -2.0874016284942627, + "logits/rejected": -2.0975565910339355, + "logps/chosen": -107.88935852050781, + "logps/rejected": -121.3756103515625, + "loss": 0.6615, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4233587682247162, + "rewards/margins": 0.1184331625699997, + "rewards/rejected": -0.5417919754981995, + "step": 1380 + }, + { + "epoch": 1.0, + "grad_norm": 2.0, + "learning_rate": 2.9258544866234206e-06, + "logits/chosen": -2.070168972015381, + "logits/rejected": -2.0742554664611816, + "logps/chosen": -112.909423828125, + "logps/rejected": -130.5807342529297, + "loss": 0.6372, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.49084582924842834, + "rewards/margins": 0.15788979828357697, + "rewards/rejected": -0.6487356424331665, + "step": 1390 + }, + { + "epoch": 1.01, + "grad_norm": 1.6015625, + "learning_rate": 2.8948151795647994e-06, + "logits/chosen": -1.9922540187835693, + "logits/rejected": -2.0036025047302246, + "logps/chosen": -108.33551025390625, + "logps/rejected": -134.94259643554688, + "loss": 0.6044, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.44488996267318726, + "rewards/margins": 0.2470279037952423, + "rewards/rejected": -0.6919177770614624, + "step": 1400 + }, + { + "epoch": 1.02, + "grad_norm": 2.34375, + "learning_rate": 2.863713326628422e-06, + "logits/chosen": -1.9860804080963135, + "logits/rejected": -1.98525071144104, + "logps/chosen": -111.2559585571289, + "logps/rejected": -138.44357299804688, + "loss": 0.6063, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.4986530840396881, + "rewards/margins": 0.2310309112071991, + "rewards/rejected": -0.7296839952468872, + "step": 1410 + }, + { + "epoch": 1.02, + "grad_norm": 1.828125, + "learning_rate": 2.8325538549113006e-06, + "logits/chosen": -2.030186891555786, + "logits/rejected": -2.0408942699432373, + "logps/chosen": -113.96142578125, + "logps/rejected": -141.45428466796875, + "loss": 0.6113, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.46484389901161194, + "rewards/margins": 0.2421310842037201, + "rewards/rejected": -0.706974983215332, + "step": 1420 + }, + { + "epoch": 1.03, + "grad_norm": 2.515625, + "learning_rate": 2.8013417006383078e-06, + "logits/chosen": -1.9514284133911133, + "logits/rejected": -1.963894248008728, + "logps/chosen": -110.47412109375, + "logps/rejected": -128.65701293945312, + "loss": 0.6357, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5639623999595642, + "rewards/margins": 0.1714598834514618, + "rewards/rejected": -0.7354224324226379, + "step": 1430 + }, + { + "epoch": 1.04, + "grad_norm": 2.65625, + "learning_rate": 2.770081808380186e-06, + "logits/chosen": -2.04837703704834, + "logits/rejected": -2.05118989944458, + "logps/chosen": -125.71846771240234, + "logps/rejected": -144.2294158935547, + "loss": 0.6284, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.48360365629196167, + "rewards/margins": 0.19055330753326416, + "rewards/rejected": -0.6741569638252258, + "step": 1440 + }, + { + "epoch": 1.04, + "grad_norm": 2.328125, + "learning_rate": 2.7387791302702398e-06, + "logits/chosen": -1.980463981628418, + "logits/rejected": -1.980164885520935, + "logps/chosen": -123.45314025878906, + "logps/rejected": -152.37655639648438, + "loss": 0.6185, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6476872563362122, + "rewards/margins": 0.214513897895813, + "rewards/rejected": -0.8622010946273804, + "step": 1450 + }, + { + "epoch": 1.05, + "grad_norm": 1.4296875, + "learning_rate": 2.707438625219827e-06, + "logits/chosen": -1.942488670349121, + "logits/rejected": -1.9465347528457642, + "logps/chosen": -128.45228576660156, + "logps/rejected": -162.00949096679688, + "loss": 0.5957, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6348342895507812, + "rewards/margins": 0.292163610458374, + "rewards/rejected": -0.9269979596138, + "step": 1460 + }, + { + "epoch": 1.06, + "grad_norm": 1.2421875, + "learning_rate": 2.67606525813278e-06, + "logits/chosen": -1.9286657571792603, + "logits/rejected": -1.9460529088974, + "logps/chosen": -115.72102355957031, + "logps/rejected": -142.26600646972656, + "loss": 0.6079, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5551806092262268, + "rewards/margins": 0.2322642058134079, + "rewards/rejected": -0.7874448299407959, + "step": 1470 + }, + { + "epoch": 1.07, + "grad_norm": 1.6796875, + "learning_rate": 2.6446639991188716e-06, + "logits/chosen": -1.973655343055725, + "logits/rejected": -1.9923969507217407, + "logps/chosen": -116.53816223144531, + "logps/rejected": -137.4978790283203, + "loss": 0.6328, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.5160804986953735, + "rewards/margins": 0.17561517655849457, + "rewards/rejected": -0.6916956305503845, + "step": 1480 + }, + { + "epoch": 1.07, + "grad_norm": 2.1875, + "learning_rate": 2.6132398227064615e-06, + "logits/chosen": -2.0569424629211426, + "logits/rejected": -2.061692237854004, + "logps/chosen": -129.03684997558594, + "logps/rejected": -151.86997985839844, + "loss": 0.6229, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5522770285606384, + "rewards/margins": 0.2137361317873001, + "rewards/rejected": -0.7660132050514221, + "step": 1490 + }, + { + "epoch": 1.08, + "grad_norm": 2.46875, + "learning_rate": 2.5817977070544408e-06, + "logits/chosen": -1.9222244024276733, + "logits/rejected": -1.928789496421814, + "logps/chosen": -122.2890396118164, + "logps/rejected": -146.0366668701172, + "loss": 0.6174, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6112322807312012, + "rewards/margins": 0.2106863260269165, + "rewards/rejected": -0.8219184875488281, + "step": 1500 + }, + { + "epoch": 1.09, + "grad_norm": 2.0, + "learning_rate": 2.550342633163601e-06, + "logits/chosen": -1.994757890701294, + "logits/rejected": -1.998810052871704, + "logps/chosen": -119.19169616699219, + "logps/rejected": -146.54074096679688, + "loss": 0.6078, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.596774697303772, + "rewards/margins": 0.23899777233600616, + "rewards/rejected": -0.8357726335525513, + "step": 1510 + }, + { + "epoch": 1.1, + "grad_norm": 1.9609375, + "learning_rate": 2.5188795840875546e-06, + "logits/chosen": -1.98430597782135, + "logits/rejected": -1.989297866821289, + "logps/chosen": -124.1072769165039, + "logps/rejected": -133.15005493164062, + "loss": 0.6559, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5129493474960327, + "rewards/margins": 0.10699422657489777, + "rewards/rejected": -0.6199434995651245, + "step": 1520 + }, + { + "epoch": 1.1, + "grad_norm": 1.8828125, + "learning_rate": 2.487413544143325e-06, + "logits/chosen": -2.003361701965332, + "logits/rejected": -1.9991636276245117, + "logps/chosen": -120.54267883300781, + "logps/rejected": -145.92556762695312, + "loss": 0.6157, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5307844281196594, + "rewards/margins": 0.21387752890586853, + "rewards/rejected": -0.7446619868278503, + "step": 1530 + }, + { + "epoch": 1.11, + "grad_norm": 1.8125, + "learning_rate": 2.4559494981217464e-06, + "logits/chosen": -2.009737968444824, + "logits/rejected": -2.0052223205566406, + "logps/chosen": -115.0971450805664, + "logps/rejected": -140.93968200683594, + "loss": 0.6113, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.513275146484375, + "rewards/margins": 0.22737202048301697, + "rewards/rejected": -0.7406471967697144, + "step": 1540 + }, + { + "epoch": 1.12, + "grad_norm": 1.3828125, + "learning_rate": 2.4244924304977785e-06, + "logits/chosen": -1.9526363611221313, + "logits/rejected": -1.9619600772857666, + "logps/chosen": -117.15467834472656, + "logps/rejected": -141.62472534179688, + "loss": 0.6155, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.529160737991333, + "rewards/margins": 0.21543464064598083, + "rewards/rejected": -0.7445953488349915, + "step": 1550 + }, + { + "epoch": 1.12, + "grad_norm": 2.96875, + "learning_rate": 2.3930473246408752e-06, + "logits/chosen": -2.0411906242370605, + "logits/rejected": -2.056326389312744, + "logps/chosen": -129.92892456054688, + "logps/rejected": -157.43417358398438, + "loss": 0.6072, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.612835705280304, + "rewards/margins": 0.2452736347913742, + "rewards/rejected": -0.8581092953681946, + "step": 1560 + }, + { + "epoch": 1.13, + "grad_norm": 2.109375, + "learning_rate": 2.3616191620255307e-06, + "logits/chosen": -2.016146421432495, + "logits/rejected": -2.031141996383667, + "logps/chosen": -125.10477447509766, + "logps/rejected": -144.62144470214844, + "loss": 0.6344, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5862436890602112, + "rewards/margins": 0.16933271288871765, + "rewards/rejected": -0.7555764317512512, + "step": 1570 + }, + { + "epoch": 1.14, + "grad_norm": 1.84375, + "learning_rate": 2.3302129214421244e-06, + "logits/chosen": -1.9942152500152588, + "logits/rejected": -1.9925035238265991, + "logps/chosen": -126.97408294677734, + "logps/rejected": -157.83267211914062, + "loss": 0.5967, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5849136114120483, + "rewards/margins": 0.2714024782180786, + "rewards/rejected": -0.8563162088394165, + "step": 1580 + }, + { + "epoch": 1.15, + "grad_norm": 1.2578125, + "learning_rate": 2.2988335782081854e-06, + "logits/chosen": -1.9507849216461182, + "logits/rejected": -1.9640982151031494, + "logps/chosen": -114.99700927734375, + "logps/rejected": -141.39306640625, + "loss": 0.6105, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5272036194801331, + "rewards/margins": 0.21672149002552032, + "rewards/rejected": -0.7439250349998474, + "step": 1590 + }, + { + "epoch": 1.15, + "grad_norm": 2.53125, + "learning_rate": 2.2674861033802182e-06, + "logits/chosen": -1.9975817203521729, + "logits/rejected": -2.006187915802002, + "logps/chosen": -121.58160400390625, + "logps/rejected": -147.51416015625, + "loss": 0.6135, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5389881134033203, + "rewards/margins": 0.24041156470775604, + "rewards/rejected": -0.7793997526168823, + "step": 1600 + }, + { + "epoch": 1.16, + "grad_norm": 2.1875, + "learning_rate": 2.236175462966192e-06, + "logits/chosen": -1.9745140075683594, + "logits/rejected": -1.990915060043335, + "logps/chosen": -119.48726654052734, + "logps/rejected": -139.58609008789062, + "loss": 0.6336, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5391074419021606, + "rewards/margins": 0.17995604872703552, + "rewards/rejected": -0.7190635204315186, + "step": 1610 + }, + { + "epoch": 1.17, + "grad_norm": 1.2421875, + "learning_rate": 2.204906617138839e-06, + "logits/chosen": -2.052870750427246, + "logits/rejected": -2.0588982105255127, + "logps/chosen": -115.16014099121094, + "logps/rejected": -138.7529754638672, + "loss": 0.6187, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.46134477853775024, + "rewards/margins": 0.18981757760047913, + "rewards/rejected": -0.651162326335907, + "step": 1620 + }, + { + "epoch": 1.17, + "grad_norm": 2.3125, + "learning_rate": 2.173684519449872e-06, + "logits/chosen": -2.017367124557495, + "logits/rejected": -2.0284934043884277, + "logps/chosen": -118.7997055053711, + "logps/rejected": -136.1349639892578, + "loss": 0.6259, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.49513015151023865, + "rewards/margins": 0.1775195300579071, + "rewards/rejected": -0.672649621963501, + "step": 1630 + }, + { + "epoch": 1.18, + "grad_norm": 1.609375, + "learning_rate": 2.1425141160452495e-06, + "logits/chosen": -1.9408687353134155, + "logits/rejected": -1.9594764709472656, + "logps/chosen": -116.89726257324219, + "logps/rejected": -135.9090118408203, + "loss": 0.6256, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5176368951797485, + "rewards/margins": 0.18570610880851746, + "rewards/rejected": -0.7033429145812988, + "step": 1640 + }, + { + "epoch": 1.19, + "grad_norm": 2.046875, + "learning_rate": 2.1114003448816205e-06, + "logits/chosen": -1.9267289638519287, + "logits/rejected": -1.930748701095581, + "logps/chosen": -111.66670227050781, + "logps/rejected": -129.04257202148438, + "loss": 0.6341, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5672639608383179, + "rewards/margins": 0.16399827599525452, + "rewards/rejected": -0.7312622666358948, + "step": 1650 + }, + { + "epoch": 1.2, + "grad_norm": 2.03125, + "learning_rate": 2.080348134944063e-06, + "logits/chosen": -1.9702529907226562, + "logits/rejected": -1.9817975759506226, + "logps/chosen": -119.13056945800781, + "logps/rejected": -137.2602081298828, + "loss": 0.6419, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5436175465583801, + "rewards/margins": 0.15731260180473328, + "rewards/rejected": -0.7009302377700806, + "step": 1660 + }, + { + "epoch": 1.2, + "grad_norm": 1.890625, + "learning_rate": 2.049362405465236e-06, + "logits/chosen": -2.0406806468963623, + "logits/rejected": -2.043137550354004, + "logps/chosen": -112.21296691894531, + "logps/rejected": -136.54315185546875, + "loss": 0.6188, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.4973304271697998, + "rewards/margins": 0.20603354275226593, + "rewards/rejected": -0.7033639550209045, + "step": 1670 + }, + { + "epoch": 1.21, + "grad_norm": 1.8515625, + "learning_rate": 2.0184480651460943e-06, + "logits/chosen": -1.961282730102539, + "logits/rejected": -1.9708878993988037, + "logps/chosen": -121.48686218261719, + "logps/rejected": -150.19656372070312, + "loss": 0.5987, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5785536170005798, + "rewards/margins": 0.25495901703834534, + "rewards/rejected": -0.8335126638412476, + "step": 1680 + }, + { + "epoch": 1.22, + "grad_norm": 2.3125, + "learning_rate": 1.9876100113782534e-06, + "logits/chosen": -2.0227205753326416, + "logits/rejected": -2.0364012718200684, + "logps/chosen": -114.74979400634766, + "logps/rejected": -138.68084716796875, + "loss": 0.6135, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4629599153995514, + "rewards/margins": 0.2168576419353485, + "rewards/rejected": -0.6798175573348999, + "step": 1690 + }, + { + "epoch": 1.23, + "grad_norm": 1.75, + "learning_rate": 1.9568531294681585e-06, + "logits/chosen": -1.9471362829208374, + "logits/rejected": -1.9518378973007202, + "logps/chosen": -122.4446792602539, + "logps/rejected": -156.8361053466797, + "loss": 0.5817, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5372573137283325, + "rewards/margins": 0.3052898943424225, + "rewards/rejected": -0.8425471186637878, + "step": 1700 + }, + { + "epoch": 1.23, + "grad_norm": 2.96875, + "learning_rate": 1.926182291863162e-06, + "logits/chosen": -1.8842859268188477, + "logits/rejected": -1.8872559070587158, + "logps/chosen": -115.28511810302734, + "logps/rejected": -142.72998046875, + "loss": 0.6057, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5601629018783569, + "rewards/margins": 0.23351116478443146, + "rewards/rejected": -0.7936740517616272, + "step": 1710 + }, + { + "epoch": 1.24, + "grad_norm": 2.484375, + "learning_rate": 1.895602357379637e-06, + "logits/chosen": -1.851300597190857, + "logits/rejected": -1.8685184717178345, + "logps/chosen": -120.60140228271484, + "logps/rejected": -148.75662231445312, + "loss": 0.6097, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5954613089561462, + "rewards/margins": 0.26449450850486755, + "rewards/rejected": -0.859955906867981, + "step": 1720 + }, + { + "epoch": 1.25, + "grad_norm": 2.390625, + "learning_rate": 1.8651181704332578e-06, + "logits/chosen": -1.9334551095962524, + "logits/rejected": -1.9329140186309814, + "logps/chosen": -126.85723876953125, + "logps/rejected": -153.77548217773438, + "loss": 0.6113, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6201340556144714, + "rewards/margins": 0.23935556411743164, + "rewards/rejected": -0.8594895601272583, + "step": 1730 + }, + { + "epoch": 1.25, + "grad_norm": 1.4296875, + "learning_rate": 1.8347345602715543e-06, + "logits/chosen": -1.9892994165420532, + "logits/rejected": -2.0142014026641846, + "logps/chosen": -119.41783142089844, + "logps/rejected": -146.21707153320312, + "loss": 0.6015, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5245205163955688, + "rewards/margins": 0.25306034088134766, + "rewards/rejected": -0.7775809168815613, + "step": 1740 + }, + { + "epoch": 1.26, + "grad_norm": 1.546875, + "learning_rate": 1.8044563402088686e-06, + "logits/chosen": -1.9546706676483154, + "logits/rejected": -1.9724689722061157, + "logps/chosen": -130.3236541748047, + "logps/rejected": -160.91209411621094, + "loss": 0.5816, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6136053800582886, + "rewards/margins": 0.3152478337287903, + "rewards/rejected": -0.9288532137870789, + "step": 1750 + }, + { + "epoch": 1.27, + "grad_norm": 2.46875, + "learning_rate": 1.7742883068638447e-06, + "logits/chosen": -2.0497043132781982, + "logits/rejected": -2.048368453979492, + "logps/chosen": -127.9777603149414, + "logps/rejected": -154.79327392578125, + "loss": 0.6093, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5853675603866577, + "rewards/margins": 0.23666468262672424, + "rewards/rejected": -0.8220322728157043, + "step": 1760 + }, + { + "epoch": 1.28, + "grad_norm": 1.5625, + "learning_rate": 1.7442352393995516e-06, + "logits/chosen": -1.9354140758514404, + "logits/rejected": -1.9446359872817993, + "logps/chosen": -124.5389175415039, + "logps/rejected": -148.15814208984375, + "loss": 0.6249, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6259867548942566, + "rewards/margins": 0.19919133186340332, + "rewards/rejected": -0.8251781463623047, + "step": 1770 + }, + { + "epoch": 1.28, + "grad_norm": 1.65625, + "learning_rate": 1.7143018987663814e-06, + "logits/chosen": -1.9998855590820312, + "logits/rejected": -2.0096142292022705, + "logps/chosen": -126.11322021484375, + "logps/rejected": -145.43692016601562, + "loss": 0.6251, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5246410369873047, + "rewards/margins": 0.19330283999443054, + "rewards/rejected": -0.7179439663887024, + "step": 1780 + }, + { + "epoch": 1.29, + "grad_norm": 2.25, + "learning_rate": 1.6844930269478274e-06, + "logits/chosen": -1.9050662517547607, + "logits/rejected": -1.9045469760894775, + "logps/chosen": -123.74418640136719, + "logps/rejected": -137.86448669433594, + "loss": 0.6512, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5779796242713928, + "rewards/margins": 0.1330142766237259, + "rewards/rejected": -0.7109938859939575, + "step": 1790 + }, + { + "epoch": 1.3, + "grad_norm": 2.828125, + "learning_rate": 1.6548133462092647e-06, + "logits/chosen": -1.9649972915649414, + "logits/rejected": -1.9714637994766235, + "logps/chosen": -129.48873901367188, + "logps/rejected": -158.52069091796875, + "loss": 0.6204, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6639618277549744, + "rewards/margins": 0.22173753380775452, + "rewards/rejected": -0.885699450969696, + "step": 1800 + }, + { + "epoch": 1.3, + "grad_norm": 2.375, + "learning_rate": 1.6252675583498644e-06, + "logits/chosen": -1.9044713973999023, + "logits/rejected": -1.9047183990478516, + "logps/chosen": -114.80845642089844, + "logps/rejected": -141.17929077148438, + "loss": 0.606, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.506965696811676, + "rewards/margins": 0.22917525470256805, + "rewards/rejected": -0.7361409068107605, + "step": 1810 + }, + { + "epoch": 1.31, + "grad_norm": 2.40625, + "learning_rate": 1.5958603439577381e-06, + "logits/chosen": -1.883062720298767, + "logits/rejected": -1.8782352209091187, + "logps/chosen": -115.2265853881836, + "logps/rejected": -145.7313690185547, + "loss": 0.6037, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5807043313980103, + "rewards/margins": 0.2627798616886139, + "rewards/rejected": -0.843484103679657, + "step": 1820 + }, + { + "epoch": 1.32, + "grad_norm": 1.9375, + "learning_rate": 1.5665963616684477e-06, + "logits/chosen": -1.8872991800308228, + "logits/rejected": -1.9082088470458984, + "logps/chosen": -118.02657318115234, + "logps/rejected": -144.72972106933594, + "loss": 0.6075, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5587154626846313, + "rewards/margins": 0.24482004344463348, + "rewards/rejected": -0.8035355806350708, + "step": 1830 + }, + { + "epoch": 1.33, + "grad_norm": 2.03125, + "learning_rate": 1.5374802474269973e-06, + "logits/chosen": -1.8889667987823486, + "logits/rejected": -1.8945300579071045, + "logps/chosen": -120.35169982910156, + "logps/rejected": -144.44015502929688, + "loss": 0.6084, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5581179857254028, + "rewards/margins": 0.23101505637168884, + "rewards/rejected": -0.7891330718994141, + "step": 1840 + }, + { + "epoch": 1.33, + "grad_norm": 1.828125, + "learning_rate": 1.5085166137534124e-06, + "logits/chosen": -1.8958622217178345, + "logits/rejected": -1.8905636072158813, + "logps/chosen": -124.46125793457031, + "logps/rejected": -151.10842895507812, + "loss": 0.6129, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6295716762542725, + "rewards/margins": 0.23376984894275665, + "rewards/rejected": -0.8633416295051575, + "step": 1850 + }, + { + "epoch": 1.34, + "grad_norm": 2.140625, + "learning_rate": 1.479710049012033e-06, + "logits/chosen": -1.9351632595062256, + "logits/rejected": -1.9478946924209595, + "logps/chosen": -121.83003234863281, + "logps/rejected": -154.7652587890625, + "loss": 0.5865, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5323207974433899, + "rewards/margins": 0.2836820185184479, + "rewards/rejected": -0.8160028457641602, + "step": 1860 + }, + { + "epoch": 1.35, + "grad_norm": 1.34375, + "learning_rate": 1.4510651166846369e-06, + "logits/chosen": -1.8797328472137451, + "logits/rejected": -1.9063024520874023, + "logps/chosen": -112.57334899902344, + "logps/rejected": -139.17276000976562, + "loss": 0.6023, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5350430011749268, + "rewards/margins": 0.23087510466575623, + "rewards/rejected": -0.7659180760383606, + "step": 1870 + }, + { + "epoch": 1.35, + "grad_norm": 1.8046875, + "learning_rate": 1.4225863546474944e-06, + "logits/chosen": -1.9153077602386475, + "logits/rejected": -1.92630934715271, + "logps/chosen": -117.9030532836914, + "logps/rejected": -144.8286895751953, + "loss": 0.6007, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5061390399932861, + "rewards/margins": 0.2523689270019531, + "rewards/rejected": -0.7585079669952393, + "step": 1880 + }, + { + "epoch": 1.36, + "grad_norm": 1.625, + "learning_rate": 1.3942782744524974e-06, + "logits/chosen": -1.9394657611846924, + "logits/rejected": -1.9521135091781616, + "logps/chosen": -122.05293273925781, + "logps/rejected": -145.07711791992188, + "loss": 0.6148, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5658047199249268, + "rewards/margins": 0.20031043887138367, + "rewards/rejected": -0.7661150693893433, + "step": 1890 + }, + { + "epoch": 1.37, + "grad_norm": 2.046875, + "learning_rate": 1.3661453606124353e-06, + "logits/chosen": -1.8490660190582275, + "logits/rejected": -1.849898099899292, + "logps/chosen": -117.15348815917969, + "logps/rejected": -144.00486755371094, + "loss": 0.6162, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.5616058111190796, + "rewards/margins": 0.2321668118238449, + "rewards/rejected": -0.7937726378440857, + "step": 1900 + }, + { + "epoch": 1.38, + "grad_norm": 1.9453125, + "learning_rate": 1.3381920698905788e-06, + "logits/chosen": -1.8940210342407227, + "logits/rejected": -1.8968864679336548, + "logps/chosen": -122.25953674316406, + "logps/rejected": -151.90289306640625, + "loss": 0.6012, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.587809145450592, + "rewards/margins": 0.24534039199352264, + "rewards/rejected": -0.8331495523452759, + "step": 1910 + }, + { + "epoch": 1.38, + "grad_norm": 1.453125, + "learning_rate": 1.3104228305946385e-06, + "logits/chosen": -1.8536640405654907, + "logits/rejected": -1.8629567623138428, + "logps/chosen": -108.5637435913086, + "logps/rejected": -140.7650146484375, + "loss": 0.5988, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5375715494155884, + "rewards/margins": 0.26606285572052, + "rewards/rejected": -0.8036344647407532, + "step": 1920 + }, + { + "epoch": 1.39, + "grad_norm": 1.90625, + "learning_rate": 1.2828420418752442e-06, + "logits/chosen": -1.8929815292358398, + "logits/rejected": -1.9167912006378174, + "logps/chosen": -130.51766967773438, + "logps/rejected": -146.90655517578125, + "loss": 0.6459, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5917934775352478, + "rewards/margins": 0.16113656759262085, + "rewards/rejected": -0.7529300451278687, + "step": 1930 + }, + { + "epoch": 1.4, + "grad_norm": 2.0, + "learning_rate": 1.2554540730290437e-06, + "logits/chosen": -1.8626874685287476, + "logits/rejected": -1.8674083948135376, + "logps/chosen": -122.74143981933594, + "logps/rejected": -148.19137573242188, + "loss": 0.6133, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6179603338241577, + "rewards/margins": 0.22583599388599396, + "rewards/rejected": -0.8437963724136353, + "step": 1940 + }, + { + "epoch": 1.41, + "grad_norm": 2.78125, + "learning_rate": 1.2282632628065197e-06, + "logits/chosen": -1.8630259037017822, + "logits/rejected": -1.8686736822128296, + "logps/chosen": -127.19625091552734, + "logps/rejected": -152.7012939453125, + "loss": 0.6144, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6396566033363342, + "rewards/margins": 0.2267201840877533, + "rewards/rejected": -0.8663768768310547, + "step": 1950 + }, + { + "epoch": 1.41, + "grad_norm": 2.09375, + "learning_rate": 1.2012739187246575e-06, + "logits/chosen": -1.9101310968399048, + "logits/rejected": -1.9150078296661377, + "logps/chosen": -125.06834411621094, + "logps/rejected": -152.13970947265625, + "loss": 0.6079, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6242455840110779, + "rewards/margins": 0.23204731941223145, + "rewards/rejected": -0.8562929034233093, + "step": 1960 + }, + { + "epoch": 1.42, + "grad_norm": 1.7890625, + "learning_rate": 1.1744903163845578e-06, + "logits/chosen": -1.9141194820404053, + "logits/rejected": -1.9090967178344727, + "logps/chosen": -125.41385650634766, + "logps/rejected": -153.21505737304688, + "loss": 0.6196, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6701093912124634, + "rewards/margins": 0.23631341755390167, + "rewards/rejected": -0.9064227938652039, + "step": 1970 + }, + { + "epoch": 1.43, + "grad_norm": 2.3125, + "learning_rate": 1.1479166987940981e-06, + "logits/chosen": -1.9218595027923584, + "logits/rejected": -1.9358152151107788, + "logps/chosen": -121.7696533203125, + "logps/rejected": -142.92947387695312, + "loss": 0.6406, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6072500944137573, + "rewards/margins": 0.1642817258834839, + "rewards/rejected": -0.771531879901886, + "step": 1980 + }, + { + "epoch": 1.43, + "grad_norm": 1.890625, + "learning_rate": 1.121557275695771e-06, + "logits/chosen": -1.8173805475234985, + "logits/rejected": -1.8257992267608643, + "logps/chosen": -123.26933288574219, + "logps/rejected": -149.6290740966797, + "loss": 0.6063, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6274382472038269, + "rewards/margins": 0.2434215098619461, + "rewards/rejected": -0.8708597421646118, + "step": 1990 + }, + { + "epoch": 1.44, + "grad_norm": 2.734375, + "learning_rate": 1.0954162228997778e-06, + "logits/chosen": -1.944850206375122, + "logits/rejected": -1.9471490383148193, + "logps/chosen": -121.35750579833984, + "logps/rejected": -149.9652557373047, + "loss": 0.6078, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6109983325004578, + "rewards/margins": 0.23012125492095947, + "rewards/rejected": -0.8411195874214172, + "step": 2000 + }, + { + "epoch": 1.45, + "grad_norm": 1.8359375, + "learning_rate": 1.0694976816225072e-06, + "logits/chosen": -1.931652307510376, + "logits/rejected": -1.9369986057281494, + "logps/chosen": -121.70970153808594, + "logps/rejected": -147.0286102294922, + "loss": 0.6173, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6017236709594727, + "rewards/margins": 0.22414302825927734, + "rewards/rejected": -0.82586669921875, + "step": 2010 + }, + { + "epoch": 1.46, + "grad_norm": 2.015625, + "learning_rate": 1.043805757830495e-06, + "logits/chosen": -1.888380765914917, + "logits/rejected": -1.898215889930725, + "logps/chosen": -123.5376968383789, + "logps/rejected": -143.40316772460938, + "loss": 0.6269, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5882707834243774, + "rewards/margins": 0.17841866612434387, + "rewards/rejected": -0.7666894793510437, + "step": 2020 + }, + { + "epoch": 1.46, + "grad_norm": 1.921875, + "learning_rate": 1.0183445215899585e-06, + "logits/chosen": -1.9046001434326172, + "logits/rejected": -1.8922739028930664, + "logps/chosen": -119.24836730957031, + "logps/rejected": -143.8057861328125, + "loss": 0.6215, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5545870065689087, + "rewards/margins": 0.2050396203994751, + "rewards/rejected": -0.7596266269683838, + "step": 2030 + }, + { + "epoch": 1.47, + "grad_norm": 2.578125, + "learning_rate": 9.931180064220276e-07, + "logits/chosen": -1.92236328125, + "logits/rejected": -1.924556016921997, + "logps/chosen": -136.11215209960938, + "logps/rejected": -159.76925659179688, + "loss": 0.6302, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6600568890571594, + "rewards/margins": 0.19632843136787415, + "rewards/rejected": -0.8563854098320007, + "step": 2040 + }, + { + "epoch": 1.48, + "grad_norm": 1.671875, + "learning_rate": 9.681302086637634e-07, + "logits/chosen": -1.8995593786239624, + "logits/rejected": -1.922876000404358, + "logps/chosen": -134.8125457763672, + "logps/rejected": -150.06996154785156, + "loss": 0.6384, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.647091805934906, + "rewards/margins": 0.1497163623571396, + "rewards/rejected": -0.796808123588562, + "step": 2050 + }, + { + "epoch": 1.48, + "grad_norm": 1.40625, + "learning_rate": 9.433850868350619e-07, + "logits/chosen": -1.8294461965560913, + "logits/rejected": -1.845920205116272, + "logps/chosen": -116.8541259765625, + "logps/rejected": -143.2628936767578, + "loss": 0.6033, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5514515042304993, + "rewards/margins": 0.2609653174877167, + "rewards/rejected": -0.8124168515205383, + "step": 2060 + }, + { + "epoch": 1.49, + "grad_norm": 1.953125, + "learning_rate": 9.188865610115572e-07, + "logits/chosen": -1.921491265296936, + "logits/rejected": -1.93572998046875, + "logps/chosen": -126.38240814208984, + "logps/rejected": -145.36019897460938, + "loss": 0.6306, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5697722434997559, + "rewards/margins": 0.17015670239925385, + "rewards/rejected": -0.7399289608001709, + "step": 2070 + }, + { + "epoch": 1.5, + "grad_norm": 1.7890625, + "learning_rate": 8.946385122036066e-07, + "logits/chosen": -1.8846461772918701, + "logits/rejected": -1.8940550088882446, + "logps/chosen": -121.21919250488281, + "logps/rejected": -144.52047729492188, + "loss": 0.6212, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5412122011184692, + "rewards/margins": 0.19210803508758545, + "rewards/rejected": -0.7333202958106995, + "step": 2080 + }, + { + "epoch": 1.51, + "grad_norm": 1.8125, + "learning_rate": 8.706447817414696e-07, + "logits/chosen": -1.9248275756835938, + "logits/rejected": -1.922368049621582, + "logps/chosen": -127.80845642089844, + "logps/rejected": -151.72030639648438, + "loss": 0.6227, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6831592917442322, + "rewards/margins": 0.2034546136856079, + "rewards/rejected": -0.8866138458251953, + "step": 2090 + }, + { + "epoch": 1.51, + "grad_norm": 1.5390625, + "learning_rate": 8.469091706667748e-07, + "logits/chosen": -1.8915945291519165, + "logits/rejected": -1.895379662513733, + "logps/chosen": -122.15771484375, + "logps/rejected": -147.4517364501953, + "loss": 0.6135, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6304605007171631, + "rewards/margins": 0.22080358862876892, + "rewards/rejected": -0.8512641191482544, + "step": 2100 + }, + { + "epoch": 1.52, + "grad_norm": 1.96875, + "learning_rate": 8.234354391303606e-07, + "logits/chosen": -1.8591235876083374, + "logits/rejected": -1.8553224802017212, + "logps/chosen": -124.28314208984375, + "logps/rejected": -152.6465301513672, + "loss": 0.6102, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6334540247917175, + "rewards/margins": 0.24212419986724854, + "rewards/rejected": -0.8755782246589661, + "step": 2110 + }, + { + "epoch": 1.53, + "grad_norm": 2.15625, + "learning_rate": 8.002273057966012e-07, + "logits/chosen": -1.8992531299591064, + "logits/rejected": -1.9243097305297852, + "logps/chosen": -128.35739135742188, + "logps/rejected": -149.07540893554688, + "loss": 0.6183, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5612360835075378, + "rewards/margins": 0.19729313254356384, + "rewards/rejected": -0.7585291862487793, + "step": 2120 + }, + { + "epoch": 1.53, + "grad_norm": 2.015625, + "learning_rate": 7.772884472543066e-07, + "logits/chosen": -1.9013763666152954, + "logits/rejected": -1.9246801137924194, + "logps/chosen": -124.08580017089844, + "logps/rejected": -140.3448028564453, + "loss": 0.6467, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5578123927116394, + "rewards/margins": 0.1421774923801422, + "rewards/rejected": -0.699989914894104, + "step": 2130 + }, + { + "epoch": 1.54, + "grad_norm": 2.09375, + "learning_rate": 7.546224974342775e-07, + "logits/chosen": -1.9061437845230103, + "logits/rejected": -1.899762749671936, + "logps/chosen": -136.278076171875, + "logps/rejected": -161.5443115234375, + "loss": 0.6124, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6313563585281372, + "rewards/margins": 0.23241499066352844, + "rewards/rejected": -0.8637714385986328, + "step": 2140 + }, + { + "epoch": 1.55, + "grad_norm": 2.28125, + "learning_rate": 7.322330470336314e-07, + "logits/chosen": -1.919785499572754, + "logits/rejected": -1.9172786474227905, + "logps/chosen": -130.88584899902344, + "logps/rejected": -155.8290252685547, + "loss": 0.627, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6158424019813538, + "rewards/margins": 0.20419493317604065, + "rewards/rejected": -0.8200373649597168, + "step": 2150 + }, + { + "epoch": 1.56, + "grad_norm": 2.6875, + "learning_rate": 7.10123642946966e-07, + "logits/chosen": -1.9181368350982666, + "logits/rejected": -1.9378869533538818, + "logps/chosen": -125.4980239868164, + "logps/rejected": -148.65103149414062, + "loss": 0.6113, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5430660247802734, + "rewards/margins": 0.2324432134628296, + "rewards/rejected": -0.775509238243103, + "step": 2160 + }, + { + "epoch": 1.56, + "grad_norm": 2.296875, + "learning_rate": 6.882977877044691e-07, + "logits/chosen": -1.9170925617218018, + "logits/rejected": -1.9317693710327148, + "logps/chosen": -118.6207504272461, + "logps/rejected": -140.1538543701172, + "loss": 0.6322, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.576205313205719, + "rewards/margins": 0.18266887962818146, + "rewards/rejected": -0.758874237537384, + "step": 2170 + }, + { + "epoch": 1.57, + "grad_norm": 2.0625, + "learning_rate": 6.667589389170561e-07, + "logits/chosen": -1.913522720336914, + "logits/rejected": -1.9155197143554688, + "logps/chosen": -127.30131530761719, + "logps/rejected": -149.8275909423828, + "loss": 0.6301, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5541995763778687, + "rewards/margins": 0.18559743463993073, + "rewards/rejected": -0.7397969365119934, + "step": 2180 + }, + { + "epoch": 1.58, + "grad_norm": 1.9921875, + "learning_rate": 6.455105087286173e-07, + "logits/chosen": -1.9797407388687134, + "logits/rejected": -1.9776780605316162, + "logps/chosen": -130.6244659423828, + "logps/rejected": -150.9293670654297, + "loss": 0.6455, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6223429441452026, + "rewards/margins": 0.1531866490840912, + "rewards/rejected": -0.775529682636261, + "step": 2190 + }, + { + "epoch": 1.59, + "grad_norm": 1.3359375, + "learning_rate": 6.245558632754778e-07, + "logits/chosen": -1.8683338165283203, + "logits/rejected": -1.8908354043960571, + "logps/chosen": -125.47029876708984, + "logps/rejected": -154.7410125732422, + "loss": 0.5925, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5848723649978638, + "rewards/margins": 0.2577180862426758, + "rewards/rejected": -0.8425905108451843, + "step": 2200 + }, + { + "epoch": 1.59, + "grad_norm": 1.9375, + "learning_rate": 6.038983221531353e-07, + "logits/chosen": -1.9424070119857788, + "logits/rejected": -1.9472051858901978, + "logps/chosen": -120.87110900878906, + "logps/rejected": -145.4131622314453, + "loss": 0.604, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5482410192489624, + "rewards/margins": 0.23777303099632263, + "rewards/rejected": -0.7860140204429626, + "step": 2210 + }, + { + "epoch": 1.6, + "grad_norm": 1.9296875, + "learning_rate": 5.83541157890379e-07, + "logits/chosen": -1.9903781414031982, + "logits/rejected": -2.0020580291748047, + "logps/chosen": -123.49699401855469, + "logps/rejected": -151.511474609375, + "loss": 0.6134, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.501916766166687, + "rewards/margins": 0.23306772112846375, + "rewards/rejected": -0.7349845170974731, + "step": 2220 + }, + { + "epoch": 1.61, + "grad_norm": 2.46875, + "learning_rate": 5.634875954308638e-07, + "logits/chosen": -1.9178415536880493, + "logits/rejected": -1.908630609512329, + "logps/chosen": -129.9990692138672, + "logps/rejected": -152.32931518554688, + "loss": 0.6365, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6379269361495972, + "rewards/margins": 0.18584506213665009, + "rewards/rejected": -0.8237720727920532, + "step": 2230 + }, + { + "epoch": 1.61, + "grad_norm": 1.6953125, + "learning_rate": 5.437408116222148e-07, + "logits/chosen": -1.8094866275787354, + "logits/rejected": -1.8253847360610962, + "logps/chosen": -115.37788391113281, + "logps/rejected": -147.9441375732422, + "loss": 0.5973, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5823795199394226, + "rewards/margins": 0.27050092816352844, + "rewards/rejected": -0.852880597114563, + "step": 2240 + }, + { + "epoch": 1.62, + "grad_norm": 2.234375, + "learning_rate": 5.243039347127621e-07, + "logits/chosen": -1.9520610570907593, + "logits/rejected": -1.9586107730865479, + "logps/chosen": -133.17977905273438, + "logps/rejected": -154.48606872558594, + "loss": 0.6407, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6545882225036621, + "rewards/margins": 0.17907670140266418, + "rewards/rejected": -0.8336648941040039, + "step": 2250 + }, + { + "epoch": 1.63, + "grad_norm": 1.2421875, + "learning_rate": 5.05180043855969e-07, + "logits/chosen": -1.8850457668304443, + "logits/rejected": -1.895381212234497, + "logps/chosen": -113.73271179199219, + "logps/rejected": -137.9407501220703, + "loss": 0.6167, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5211442708969116, + "rewards/margins": 0.21343907713890076, + "rewards/rejected": -0.7345833778381348, + "step": 2260 + }, + { + "epoch": 1.64, + "grad_norm": 1.8515625, + "learning_rate": 4.86372168622635e-07, + "logits/chosen": -1.8701765537261963, + "logits/rejected": -1.897936463356018, + "logps/chosen": -124.44834899902344, + "logps/rejected": -146.36325073242188, + "loss": 0.6207, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6087551116943359, + "rewards/margins": 0.192201167345047, + "rewards/rejected": -0.8009563684463501, + "step": 2270 + }, + { + "epoch": 1.64, + "grad_norm": 2.359375, + "learning_rate": 4.678832885209622e-07, + "logits/chosen": -1.9065357446670532, + "logits/rejected": -1.9053528308868408, + "logps/chosen": -133.0020294189453, + "logps/rejected": -151.4831085205078, + "loss": 0.6418, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6242747902870178, + "rewards/margins": 0.15634949505329132, + "rewards/rejected": -0.7806242108345032, + "step": 2280 + }, + { + "epoch": 1.65, + "grad_norm": 2.03125, + "learning_rate": 4.497163325245416e-07, + "logits/chosen": -1.869490623474121, + "logits/rejected": -1.8728406429290771, + "logps/chosen": -129.19412231445312, + "logps/rejected": -148.17112731933594, + "loss": 0.6197, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.601394534111023, + "rewards/margins": 0.20290544629096985, + "rewards/rejected": -0.8042998313903809, + "step": 2290 + }, + { + "epoch": 1.66, + "grad_norm": 2.34375, + "learning_rate": 4.3187417860835386e-07, + "logits/chosen": -1.8597100973129272, + "logits/rejected": -1.8595672845840454, + "logps/chosen": -123.48005676269531, + "logps/rejected": -146.0297088623047, + "loss": 0.6135, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6197245717048645, + "rewards/margins": 0.22459180653095245, + "rewards/rejected": -0.8443164825439453, + "step": 2300 + }, + { + "epoch": 1.66, + "grad_norm": 1.7265625, + "learning_rate": 4.143596532928468e-07, + "logits/chosen": -1.8806402683258057, + "logits/rejected": -1.901450753211975, + "logps/chosen": -121.08549499511719, + "logps/rejected": -143.3876190185547, + "loss": 0.6214, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.48497408628463745, + "rewards/margins": 0.20996761322021484, + "rewards/rejected": -0.6949416995048523, + "step": 2310 + }, + { + "epoch": 1.67, + "grad_norm": 1.9921875, + "learning_rate": 3.971755311961606e-07, + "logits/chosen": -1.9731667041778564, + "logits/rejected": -1.9899402856826782, + "logps/chosen": -119.02622985839844, + "logps/rejected": -144.31222534179688, + "loss": 0.6119, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5730189681053162, + "rewards/margins": 0.22726324200630188, + "rewards/rejected": -0.8002821207046509, + "step": 2320 + }, + { + "epoch": 1.68, + "grad_norm": 2.609375, + "learning_rate": 3.8032453459457884e-07, + "logits/chosen": -1.8654229640960693, + "logits/rejected": -1.8797037601470947, + "logps/chosen": -130.2448272705078, + "logps/rejected": -158.59487915039062, + "loss": 0.6099, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7119914293289185, + "rewards/margins": 0.24810293316841125, + "rewards/rejected": -0.9600943326950073, + "step": 2330 + }, + { + "epoch": 1.69, + "grad_norm": 1.5234375, + "learning_rate": 3.6380933299127285e-07, + "logits/chosen": -1.9288314580917358, + "logits/rejected": -1.943996787071228, + "logps/chosen": -119.17585754394531, + "logps/rejected": -147.80038452148438, + "loss": 0.6022, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5984092950820923, + "rewards/margins": 0.2575072944164276, + "rewards/rejected": -0.8559166193008423, + "step": 2340 + }, + { + "epoch": 1.69, + "grad_norm": 1.515625, + "learning_rate": 3.4763254269339965e-07, + "logits/chosen": -1.8540977239608765, + "logits/rejected": -1.8645613193511963, + "logps/chosen": -138.80377197265625, + "logps/rejected": -159.77102661132812, + "loss": 0.621, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6650180816650391, + "rewards/margins": 0.20767009258270264, + "rewards/rejected": -0.8726881146430969, + "step": 2350 + }, + { + "epoch": 1.7, + "grad_norm": 2.296875, + "learning_rate": 3.3179672639763737e-07, + "logits/chosen": -1.9465539455413818, + "logits/rejected": -1.9527965784072876, + "logps/chosen": -113.87541198730469, + "logps/rejected": -147.63290405273438, + "loss": 0.5874, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.52118980884552, + "rewards/margins": 0.291698157787323, + "rewards/rejected": -0.812887966632843, + "step": 2360 + }, + { + "epoch": 1.71, + "grad_norm": 2.03125, + "learning_rate": 3.163043927842019e-07, + "logits/chosen": -1.9162803888320923, + "logits/rejected": -1.934597373008728, + "logps/chosen": -128.00485229492188, + "logps/rejected": -146.68711853027344, + "loss": 0.6268, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.596336841583252, + "rewards/margins": 0.18232461810112, + "rewards/rejected": -0.7786614298820496, + "step": 2370 + }, + { + "epoch": 1.72, + "grad_norm": 1.453125, + "learning_rate": 3.011579961194286e-07, + "logits/chosen": -1.956756591796875, + "logits/rejected": -1.9522291421890259, + "logps/chosen": -130.30319213867188, + "logps/rejected": -157.8980712890625, + "loss": 0.61, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6217106580734253, + "rewards/margins": 0.24674446880817413, + "rewards/rejected": -0.8684550523757935, + "step": 2380 + }, + { + "epoch": 1.72, + "grad_norm": 2.109375, + "learning_rate": 2.8635993586697555e-07, + "logits/chosen": -1.8693218231201172, + "logits/rejected": -1.8768870830535889, + "logps/chosen": -117.62882995605469, + "logps/rejected": -141.52174377441406, + "loss": 0.6162, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.554486870765686, + "rewards/margins": 0.21078386902809143, + "rewards/rejected": -0.7652707695960999, + "step": 2390 + }, + { + "epoch": 1.73, + "grad_norm": 2.125, + "learning_rate": 2.7191255630769855e-07, + "logits/chosen": -1.9183435440063477, + "logits/rejected": -1.9085958003997803, + "logps/chosen": -131.42051696777344, + "logps/rejected": -160.8799591064453, + "loss": 0.5994, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6411622762680054, + "rewards/margins": 0.2594824433326721, + "rewards/rejected": -0.9006446599960327, + "step": 2400 + }, + { + "epoch": 1.74, + "grad_norm": 2.09375, + "learning_rate": 2.5781814616827936e-07, + "logits/chosen": -1.9339672327041626, + "logits/rejected": -1.9259040355682373, + "logps/chosen": -126.9478530883789, + "logps/rejected": -150.4621124267578, + "loss": 0.6299, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6331970691680908, + "rewards/margins": 0.18595722317695618, + "rewards/rejected": -0.8191541433334351, + "step": 2410 + }, + { + "epoch": 1.74, + "grad_norm": 2.046875, + "learning_rate": 2.4407893825864893e-07, + "logits/chosen": -1.8841025829315186, + "logits/rejected": -1.9010101556777954, + "logps/chosen": -123.89164733886719, + "logps/rejected": -148.62289428710938, + "loss": 0.6121, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6260435581207275, + "rewards/margins": 0.23781859874725342, + "rewards/rejected": -0.863862156867981, + "step": 2420 + }, + { + "epoch": 1.75, + "grad_norm": 2.5, + "learning_rate": 2.3069710911826858e-07, + "logits/chosen": -1.873400092124939, + "logits/rejected": -1.8696212768554688, + "logps/chosen": -131.77992248535156, + "logps/rejected": -158.84988403320312, + "loss": 0.6255, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.671171247959137, + "rewards/margins": 0.2179645597934723, + "rewards/rejected": -0.8891357183456421, + "step": 2430 + }, + { + "epoch": 1.76, + "grad_norm": 1.5703125, + "learning_rate": 2.176747786713282e-07, + "logits/chosen": -1.8760631084442139, + "logits/rejected": -1.8778858184814453, + "logps/chosen": -125.80195617675781, + "logps/rejected": -149.81651306152344, + "loss": 0.6215, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6132515072822571, + "rewards/margins": 0.2072262316942215, + "rewards/rejected": -0.8204777836799622, + "step": 2440 + }, + { + "epoch": 1.77, + "grad_norm": 2.859375, + "learning_rate": 2.0501400989091036e-07, + "logits/chosen": -1.9188148975372314, + "logits/rejected": -1.9283740520477295, + "logps/chosen": -126.12955474853516, + "logps/rejected": -145.44830322265625, + "loss": 0.6261, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5981575846672058, + "rewards/margins": 0.18680432438850403, + "rewards/rejected": -0.7849618792533875, + "step": 2450 + }, + { + "epoch": 1.77, + "grad_norm": 3.578125, + "learning_rate": 1.927168084721795e-07, + "logits/chosen": -1.9040225744247437, + "logits/rejected": -1.918349027633667, + "logps/chosen": -121.2413558959961, + "logps/rejected": -146.60833740234375, + "loss": 0.6096, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5998507738113403, + "rewards/margins": 0.24554471671581268, + "rewards/rejected": -0.8453954458236694, + "step": 2460 + }, + { + "epoch": 1.78, + "grad_norm": 1.6796875, + "learning_rate": 1.8078512251464285e-07, + "logits/chosen": -1.9655431509017944, + "logits/rejected": -1.9582946300506592, + "logps/chosen": -131.67860412597656, + "logps/rejected": -152.63572692871094, + "loss": 0.6273, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5897047519683838, + "rewards/margins": 0.19095389544963837, + "rewards/rejected": -0.7806587219238281, + "step": 2470 + }, + { + "epoch": 1.79, + "grad_norm": 2.421875, + "learning_rate": 1.6922084221353607e-07, + "logits/chosen": -1.915435791015625, + "logits/rejected": -1.941033124923706, + "logps/chosen": -124.4185562133789, + "logps/rejected": -150.6072540283203, + "loss": 0.6078, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5871225595474243, + "rewards/margins": 0.2428620308637619, + "rewards/rejected": -0.8299845457077026, + "step": 2480 + }, + { + "epoch": 1.79, + "grad_norm": 1.7109375, + "learning_rate": 1.5802579956038093e-07, + "logits/chosen": -1.8886842727661133, + "logits/rejected": -1.9026222229003906, + "logps/chosen": -115.16085052490234, + "logps/rejected": -143.67367553710938, + "loss": 0.5948, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5255457758903503, + "rewards/margins": 0.26099538803100586, + "rewards/rejected": -0.7865411639213562, + "step": 2490 + }, + { + "epoch": 1.8, + "grad_norm": 2.453125, + "learning_rate": 1.472017680527685e-07, + "logits/chosen": -1.9219309091567993, + "logits/rejected": -1.9138708114624023, + "logps/chosen": -121.44775390625, + "logps/rejected": -151.65969848632812, + "loss": 0.6042, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6054600477218628, + "rewards/margins": 0.24131233990192413, + "rewards/rejected": -0.846772313117981, + "step": 2500 + }, + { + "epoch": 1.81, + "grad_norm": 2.515625, + "learning_rate": 1.3675046241339918e-07, + "logits/chosen": -1.8917341232299805, + "logits/rejected": -1.9025049209594727, + "logps/chosen": -127.67252349853516, + "logps/rejected": -148.03077697753906, + "loss": 0.6303, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5955443978309631, + "rewards/margins": 0.18720856308937073, + "rewards/rejected": -0.7827528715133667, + "step": 2510 + }, + { + "epoch": 1.82, + "grad_norm": 1.609375, + "learning_rate": 1.2667353831844585e-07, + "logits/chosen": -1.83237624168396, + "logits/rejected": -1.8439161777496338, + "logps/chosen": -128.77362060546875, + "logps/rejected": -150.86912536621094, + "loss": 0.62, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6072179675102234, + "rewards/margins": 0.20041854679584503, + "rewards/rejected": -0.807636559009552, + "step": 2520 + }, + { + "epoch": 1.82, + "grad_norm": 1.8359375, + "learning_rate": 1.1697259213525936e-07, + "logits/chosen": -1.8945062160491943, + "logits/rejected": -1.8865067958831787, + "logps/chosen": -113.77557373046875, + "logps/rejected": -145.04769897460938, + "loss": 0.5948, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5280709266662598, + "rewards/margins": 0.2756304144859314, + "rewards/rejected": -0.8037012815475464, + "step": 2530 + }, + { + "epoch": 1.83, + "grad_norm": 2.078125, + "learning_rate": 1.0764916066947795e-07, + "logits/chosen": -1.801983118057251, + "logits/rejected": -1.791394829750061, + "logps/chosen": -131.66268920898438, + "logps/rejected": -166.57620239257812, + "loss": 0.5934, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7564202547073364, + "rewards/margins": 0.3014541268348694, + "rewards/rejected": -1.057874321937561, + "step": 2540 + }, + { + "epoch": 1.84, + "grad_norm": 1.640625, + "learning_rate": 9.870472092156941e-08, + "logits/chosen": -1.8863308429718018, + "logits/rejected": -1.8997328281402588, + "logps/chosen": -120.3888931274414, + "logps/rejected": -147.81582641601562, + "loss": 0.6151, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6340813040733337, + "rewards/margins": 0.23104877769947052, + "rewards/rejected": -0.8651300668716431, + "step": 2550 + }, + { + "epoch": 1.84, + "grad_norm": 2.140625, + "learning_rate": 9.014068985284618e-08, + "logits/chosen": -1.8636146783828735, + "logits/rejected": -1.8543964624404907, + "logps/chosen": -123.03385162353516, + "logps/rejected": -138.28298950195312, + "loss": 0.6387, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5915199518203735, + "rewards/margins": 0.1568307727575302, + "rewards/rejected": -0.7483507394790649, + "step": 2560 + }, + { + "epoch": 1.85, + "grad_norm": 2.078125, + "learning_rate": 8.19584241609936e-08, + "logits/chosen": -1.983232855796814, + "logits/rejected": -1.9851760864257812, + "logps/chosen": -132.586669921875, + "logps/rejected": -164.07855224609375, + "loss": 0.5915, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6490141153335571, + "rewards/margins": 0.2791653275489807, + "rewards/rejected": -0.9281795620918274, + "step": 2570 + }, + { + "epoch": 1.86, + "grad_norm": 2.25, + "learning_rate": 7.415922006514448e-08, + "logits/chosen": -1.888055443763733, + "logits/rejected": -1.9014968872070312, + "logps/chosen": -122.8010482788086, + "logps/rejected": -147.2124786376953, + "loss": 0.6102, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.582960307598114, + "rewards/margins": 0.22722116112709045, + "rewards/rejected": -0.8101814985275269, + "step": 2580 + }, + { + "epoch": 1.87, + "grad_norm": 1.9140625, + "learning_rate": 6.674431310053519e-08, + "logits/chosen": -1.9087717533111572, + "logits/rejected": -1.9133113622665405, + "logps/chosen": -117.87506103515625, + "logps/rejected": -142.63198852539062, + "loss": 0.6151, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5861650109291077, + "rewards/margins": 0.21530351042747498, + "rewards/rejected": -0.8014683723449707, + "step": 2590 + }, + { + "epoch": 1.87, + "grad_norm": 1.625, + "learning_rate": 5.971487792277297e-08, + "logits/chosen": -1.927983045578003, + "logits/rejected": -1.9456886053085327, + "logps/chosen": -117.11873626708984, + "logps/rejected": -138.4589385986328, + "loss": 0.6214, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5543134212493896, + "rewards/margins": 0.19218505918979645, + "rewards/rejected": -0.7464984655380249, + "step": 2600 + }, + { + "epoch": 1.88, + "grad_norm": 1.8984375, + "learning_rate": 5.307202812175005e-08, + "logits/chosen": -1.8637211322784424, + "logits/rejected": -1.878003716468811, + "logps/chosen": -124.0638656616211, + "logps/rejected": -146.49771118164062, + "loss": 0.6155, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6212812662124634, + "rewards/margins": 0.21200039982795715, + "rewards/rejected": -0.8332816362380981, + "step": 2610 + }, + { + "epoch": 1.89, + "grad_norm": 1.7421875, + "learning_rate": 4.681681604523064e-08, + "logits/chosen": -1.877753496170044, + "logits/rejected": -1.8904426097869873, + "logps/chosen": -126.96830749511719, + "logps/rejected": -155.18118286132812, + "loss": 0.6053, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6443455815315247, + "rewards/margins": 0.26543739438056946, + "rewards/rejected": -0.9097830057144165, + "step": 2620 + }, + { + "epoch": 1.9, + "grad_norm": 1.625, + "learning_rate": 4.0950232632141205e-08, + "logits/chosen": -1.9946720600128174, + "logits/rejected": -2.0097415447235107, + "logps/chosen": -134.808349609375, + "logps/rejected": -155.46096801757812, + "loss": 0.622, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6130391359329224, + "rewards/margins": 0.19625858962535858, + "rewards/rejected": -0.8092976808547974, + "step": 2630 + }, + { + "epoch": 1.9, + "grad_norm": 2.015625, + "learning_rate": 3.547320725558495e-08, + "logits/chosen": -1.8689781427383423, + "logits/rejected": -1.8868701457977295, + "logps/chosen": -127.9247817993164, + "logps/rejected": -149.6826629638672, + "loss": 0.6185, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6171109676361084, + "rewards/margins": 0.20821337401866913, + "rewards/rejected": -0.8253243565559387, + "step": 2640 + }, + { + "epoch": 1.91, + "grad_norm": 1.3828125, + "learning_rate": 3.038660757561568e-08, + "logits/chosen": -1.9256298542022705, + "logits/rejected": -1.9275277853012085, + "logps/chosen": -131.28964233398438, + "logps/rejected": -156.5281524658203, + "loss": 0.6136, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5464403629302979, + "rewards/margins": 0.2162129133939743, + "rewards/rejected": -0.7626532316207886, + "step": 2650 + }, + { + "epoch": 1.92, + "grad_norm": 2.1875, + "learning_rate": 2.569123940178192e-08, + "logits/chosen": -1.8706880807876587, + "logits/rejected": -1.895275354385376, + "logps/chosen": -126.8041000366211, + "logps/rejected": -152.7987060546875, + "loss": 0.6077, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.6154114007949829, + "rewards/margins": 0.23175501823425293, + "rewards/rejected": -0.8471664190292358, + "step": 2660 + }, + { + "epoch": 1.92, + "grad_norm": 2.4375, + "learning_rate": 2.1387846565474047e-08, + "logits/chosen": -1.9511226415634155, + "logits/rejected": -1.9575185775756836, + "logps/chosen": -122.9830322265625, + "logps/rejected": -150.94505310058594, + "loss": 0.6051, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6026027798652649, + "rewards/margins": 0.24615421891212463, + "rewards/rejected": -0.8487569689750671, + "step": 2670 + }, + { + "epoch": 1.93, + "grad_norm": 1.375, + "learning_rate": 1.7477110802086583e-08, + "logits/chosen": -1.8935045003890991, + "logits/rejected": -1.9025996923446655, + "logps/chosen": -128.3074188232422, + "logps/rejected": -151.99522399902344, + "loss": 0.6256, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6137545704841614, + "rewards/margins": 0.19285576045513153, + "rewards/rejected": -0.8066104054450989, + "step": 2680 + }, + { + "epoch": 1.94, + "grad_norm": 2.0, + "learning_rate": 1.3959651643019601e-08, + "logits/chosen": -1.9040815830230713, + "logits/rejected": -1.9198287725448608, + "logps/chosen": -127.39599609375, + "logps/rejected": -146.60008239746094, + "loss": 0.6284, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5616117119789124, + "rewards/margins": 0.1830439418554306, + "rewards/rejected": -0.7446557283401489, + "step": 2690 + }, + { + "epoch": 1.95, + "grad_norm": 2.390625, + "learning_rate": 1.0836026317533887e-08, + "logits/chosen": -1.925762414932251, + "logits/rejected": -1.9249995946884155, + "logps/chosen": -134.05189514160156, + "logps/rejected": -150.71640014648438, + "loss": 0.6467, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.6878820657730103, + "rewards/margins": 0.15121865272521973, + "rewards/rejected": -0.83910071849823, + "step": 2700 + }, + { + "epoch": 1.95, + "grad_norm": 2.453125, + "learning_rate": 8.106729664475178e-09, + "logits/chosen": -1.9120187759399414, + "logits/rejected": -1.9409675598144531, + "logps/chosen": -126.06358337402344, + "logps/rejected": -143.04531860351562, + "loss": 0.6437, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.599898636341095, + "rewards/margins": 0.15934984385967255, + "rewards/rejected": -0.7592484951019287, + "step": 2710 + }, + { + "epoch": 1.96, + "grad_norm": 1.78125, + "learning_rate": 5.772194053882962e-09, + "logits/chosen": -1.8914750814437866, + "logits/rejected": -1.8805005550384521, + "logps/chosen": -124.221435546875, + "logps/rejected": -154.97128295898438, + "loss": 0.6014, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6593270301818848, + "rewards/margins": 0.2628743350505829, + "rewards/rejected": -0.9222013354301453, + "step": 2720 + }, + { + "epoch": 1.97, + "grad_norm": 1.3984375, + "learning_rate": 3.832789318495289e-09, + "logits/chosen": -1.9097976684570312, + "logits/rejected": -1.9211629629135132, + "logps/chosen": -117.1760025024414, + "logps/rejected": -139.37118530273438, + "loss": 0.6256, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5425055027008057, + "rewards/margins": 0.20210960507392883, + "rewards/rejected": -0.7446150779724121, + "step": 2730 + }, + { + "epoch": 1.97, + "grad_norm": 3.078125, + "learning_rate": 2.288822695160897e-09, + "logits/chosen": -1.8931448459625244, + "logits/rejected": -1.8963727951049805, + "logps/chosen": -138.54782104492188, + "logps/rejected": -168.485595703125, + "loss": 0.6046, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7985516786575317, + "rewards/margins": 0.25337541103363037, + "rewards/rejected": -1.051927089691162, + "step": 2740 + }, + { + "epoch": 1.98, + "grad_norm": 1.515625, + "learning_rate": 1.1405387761664888e-09, + "logits/chosen": -1.9339786767959595, + "logits/rejected": -1.9223487377166748, + "logps/chosen": -120.66938781738281, + "logps/rejected": -145.23965454101562, + "loss": 0.6159, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5394836664199829, + "rewards/margins": 0.20085513591766357, + "rewards/rejected": -0.7403386831283569, + "step": 2750 + }, + { + "epoch": 1.99, + "grad_norm": 2.140625, + "learning_rate": 3.8811947048994494e-10, + "logits/chosen": -1.9571815729141235, + "logits/rejected": -1.9642584323883057, + "logps/chosen": -129.06796264648438, + "logps/rejected": -154.98562622070312, + "loss": 0.6102, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6091276407241821, + "rewards/margins": 0.23150965571403503, + "rewards/rejected": -0.8406373262405396, + "step": 2760 + }, + { + "epoch": 2.0, + "grad_norm": 2.453125, + "learning_rate": 3.168397498115594e-11, + "logits/chosen": -1.903794288635254, + "logits/rejected": -1.9025567770004272, + "logps/chosen": -130.3432159423828, + "logps/rejected": -151.00790405273438, + "loss": 0.6335, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6614962220191956, + "rewards/margins": 0.1874585896730423, + "rewards/rejected": -0.8489547967910767, + "step": 2770 + }, + { + "epoch": 2.0, + "step": 2774, + "total_flos": 0.0, + "train_loss": 0.6373478753496264, + "train_runtime": 5106.1264, + "train_samples_per_second": 8.696, + "train_steps_per_second": 0.543 + } + ], + "logging_steps": 10, + "max_steps": 2774, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}