diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,21 +1,21 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.0, + "epoch": 0.9994242947610823, "eval_steps": 100, - "global_step": 1346, + "global_step": 868, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "grad_norm": 23.545113700609754, - "learning_rate": 3.7037037037037036e-09, - "logits/chosen": -2.017277240753174, - "logits/rejected": -1.9505600929260254, - "logps/chosen": -342.8155212402344, - "logps/rejected": -264.6424865722656, + "grad_norm": 24.168190559264126, + "learning_rate": 5.747126436781609e-09, + "logits/chosen": -1.9734797477722168, + "logits/rejected": -1.856537938117981, + "logps/chosen": -206.3428497314453, + "logps/rejected": -155.26254272460938, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -25,2234 +25,1434 @@ }, { "epoch": 0.01, - "grad_norm": 23.704110924178444, - "learning_rate": 3.7037037037037036e-08, - "logits/chosen": -1.852867603302002, - "logits/rejected": -1.7641547918319702, - "logps/chosen": -243.63710021972656, - "logps/rejected": -215.13551330566406, - "loss": 0.6933, - "rewards/accuracies": 0.4027777910232544, - "rewards/chosen": -0.0004846964729949832, - "rewards/margins": -0.001089173136278987, - "rewards/rejected": 0.0006044767214916646, + "grad_norm": 24.91157064859394, + "learning_rate": 5.747126436781609e-08, + "logits/chosen": -1.9969236850738525, + "logits/rejected": -1.9658927917480469, + "logps/chosen": -214.8179168701172, + "logps/rejected": -192.40969848632812, + "loss": 0.6932, + "rewards/accuracies": 0.4791666567325592, + "rewards/chosen": 0.0006090968381613493, + "rewards/margins": 0.0008069847244769335, + "rewards/rejected": -0.00019788791541941464, "step": 10 }, { - "epoch": 0.01, - "grad_norm": 27.48286479448467, - "learning_rate": 7.407407407407407e-08, - "logits/chosen": -1.9755146503448486, - "logits/rejected": -1.8412548303604126, - "logps/chosen": -241.4310302734375, - "logps/rejected": -210.738037109375, - "loss": 0.6927, - "rewards/accuracies": 0.53125, - "rewards/chosen": 0.0005561274592764676, - "rewards/margins": 0.0004348217917140573, - "rewards/rejected": 0.00012130556569900364, + "epoch": 0.02, + "grad_norm": 24.574313763724412, + "learning_rate": 1.1494252873563217e-07, + "logits/chosen": -2.0616955757141113, + "logits/rejected": -1.9390573501586914, + "logps/chosen": -256.38787841796875, + "logps/rejected": -191.22067260742188, + "loss": 0.6921, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0014679343439638615, + "rewards/margins": 0.002354845404624939, + "rewards/rejected": -0.0008869109442457557, "step": 20 }, { - "epoch": 0.02, - "grad_norm": 23.49895713678948, - "learning_rate": 1.111111111111111e-07, - "logits/chosen": -1.8477449417114258, - "logits/rejected": -1.781266450881958, - "logps/chosen": -277.84527587890625, - "logps/rejected": -244.1582489013672, - "loss": 0.6915, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.005596889648586512, - "rewards/margins": 0.0021990840323269367, - "rewards/rejected": 0.003397804917767644, + "epoch": 0.03, + "grad_norm": 23.51349624308262, + "learning_rate": 1.7241379310344828e-07, + "logits/chosen": -1.9969984292984009, + "logits/rejected": -1.9430469274520874, + "logps/chosen": -215.03457641601562, + "logps/rejected": -196.2289581298828, + "loss": 0.6867, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.012777608819305897, + "rewards/margins": 0.016061924397945404, + "rewards/rejected": -0.00328431511297822, "step": 30 }, { - "epoch": 0.03, - "grad_norm": 21.952979365752906, - "learning_rate": 1.4814814814814815e-07, - "logits/chosen": -1.8662084341049194, - "logits/rejected": -1.8252031803131104, - "logps/chosen": -279.81585693359375, - "logps/rejected": -256.37322998046875, - "loss": 0.6867, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.026755522936582565, - "rewards/margins": 0.01376323588192463, - "rewards/rejected": 0.01299228798598051, + "epoch": 0.05, + "grad_norm": 21.809703812897773, + "learning_rate": 2.2988505747126435e-07, + "logits/chosen": -1.9486480951309204, + "logits/rejected": -1.9111521244049072, + "logps/chosen": -212.9443817138672, + "logps/rejected": -192.1443328857422, + "loss": 0.6745, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.030324190855026245, + "rewards/margins": 0.04425760358572006, + "rewards/rejected": -0.013933415524661541, "step": 40 }, { - "epoch": 0.04, - "grad_norm": 22.515894719363914, - "learning_rate": 1.8518518518518516e-07, - "logits/chosen": -1.886828064918518, - "logits/rejected": -1.796974539756775, - "logps/chosen": -245.1302490234375, - "logps/rejected": -207.6703338623047, - "loss": 0.68, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.05396401137113571, - "rewards/margins": 0.03148679807782173, - "rewards/rejected": 0.02247721515595913, + "epoch": 0.06, + "grad_norm": 22.7618695537933, + "learning_rate": 2.873563218390804e-07, + "logits/chosen": -2.025139331817627, + "logits/rejected": -1.9583957195281982, + "logps/chosen": -201.2593231201172, + "logps/rejected": -188.67807006835938, + "loss": 0.6565, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.029964879155158997, + "rewards/margins": 0.10461034625768661, + "rewards/rejected": -0.07464545965194702, "step": 50 }, { - "epoch": 0.04, - "grad_norm": 21.11853715417876, - "learning_rate": 2.222222222222222e-07, - "logits/chosen": -1.8658056259155273, - "logits/rejected": -1.7990939617156982, - "logps/chosen": -245.4588623046875, - "logps/rejected": -228.79067993164062, - "loss": 0.6687, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.0710381492972374, - "rewards/margins": 0.053314320743083954, - "rewards/rejected": 0.01772383041679859, + "epoch": 0.07, + "grad_norm": 20.921825952553665, + "learning_rate": 3.4482758620689656e-07, + "logits/chosen": -2.064497709274292, + "logits/rejected": -1.9918781518936157, + "logps/chosen": -265.92425537109375, + "logps/rejected": -233.8927459716797, + "loss": 0.623, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.04738049954175949, + "rewards/margins": 0.17422285676002502, + "rewards/rejected": -0.22160334885120392, "step": 60 }, { - "epoch": 0.05, - "grad_norm": 21.639022509531838, - "learning_rate": 2.5925925925925923e-07, - "logits/chosen": -1.8920536041259766, - "logits/rejected": -1.8345096111297607, - "logps/chosen": -223.96511840820312, - "logps/rejected": -196.08775329589844, - "loss": 0.6547, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.06073574349284172, - "rewards/margins": 0.08626440167427063, - "rewards/rejected": -0.02552866004407406, + "epoch": 0.08, + "grad_norm": 25.686036310459873, + "learning_rate": 4.0229885057471266e-07, + "logits/chosen": -1.9937725067138672, + "logits/rejected": -1.9978179931640625, + "logps/chosen": -224.78897094726562, + "logps/rejected": -234.25808715820312, + "loss": 0.5987, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.28290849924087524, + "rewards/margins": 0.266859233379364, + "rewards/rejected": -0.549767792224884, "step": 70 }, { - "epoch": 0.06, - "grad_norm": 22.179495576107882, - "learning_rate": 2.962962962962963e-07, - "logits/chosen": -1.8825687170028687, - "logits/rejected": -1.847541093826294, - "logps/chosen": -232.0540313720703, - "logps/rejected": -240.20120239257812, - "loss": 0.6407, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": 0.03458085656166077, - "rewards/margins": 0.1154135912656784, - "rewards/rejected": -0.08083274215459824, + "epoch": 0.09, + "grad_norm": 31.860669472520566, + "learning_rate": 4.597701149425287e-07, + "logits/chosen": -1.7500314712524414, + "logits/rejected": -1.687111258506775, + "logps/chosen": -242.4202117919922, + "logps/rejected": -248.33908081054688, + "loss": 0.5806, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.4695563316345215, + "rewards/margins": 0.3295554518699646, + "rewards/rejected": -0.7991117835044861, "step": 80 }, { - "epoch": 0.07, - "grad_norm": 21.88163995061792, - "learning_rate": 3.333333333333333e-07, - "logits/chosen": -1.9384691715240479, - "logits/rejected": -1.922488808631897, - "logps/chosen": -248.4744415283203, - "logps/rejected": -261.0725402832031, - "loss": 0.6135, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.029202425852417946, - "rewards/margins": 0.2103302925825119, - "rewards/rejected": -0.2395327389240265, + "epoch": 0.1, + "grad_norm": 37.15734802388127, + "learning_rate": 4.999817969178237e-07, + "logits/chosen": -1.71634042263031, + "logits/rejected": -1.6681534051895142, + "logps/chosen": -319.0685119628906, + "logps/rejected": -339.21966552734375, + "loss": 0.5175, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9969006776809692, + "rewards/margins": 0.5603370070457458, + "rewards/rejected": -1.5572377443313599, "step": 90 }, { - "epoch": 0.07, - "grad_norm": 27.693123307166786, - "learning_rate": 3.703703703703703e-07, - "logits/chosen": -1.9232885837554932, - "logits/rejected": -1.9198648929595947, - "logps/chosen": -245.3694610595703, - "logps/rejected": -275.853515625, - "loss": 0.5905, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.23111872375011444, - "rewards/margins": 0.2522026598453522, - "rewards/rejected": -0.4833213686943054, + "epoch": 0.12, + "grad_norm": 38.96794122383384, + "learning_rate": 4.996582603056428e-07, + "logits/chosen": -1.7202441692352295, + "logits/rejected": -1.6160236597061157, + "logps/chosen": -322.7548522949219, + "logps/rejected": -370.2288818359375, + "loss": 0.5295, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.038615107536316, + "rewards/margins": 0.6558629870414734, + "rewards/rejected": -1.6944783926010132, "step": 100 }, { - "epoch": 0.07, - "eval_logits/chosen": -1.787776231765747, - "eval_logits/rejected": -1.7244033813476562, - "eval_logps/chosen": -325.57440185546875, - "eval_logps/rejected": -351.93182373046875, - "eval_loss": 0.6428781747817993, - "eval_rewards/accuracies": 0.671875, - "eval_rewards/chosen": -0.13797907531261444, - "eval_rewards/margins": 0.2060878425836563, - "eval_rewards/rejected": -0.34406691789627075, - "eval_runtime": 97.6555, - "eval_samples_per_second": 20.48, - "eval_steps_per_second": 0.328, + "epoch": 0.12, + "eval_logits/chosen": -1.7860382795333862, + "eval_logits/rejected": -1.69411301612854, + "eval_logps/chosen": -378.6120910644531, + "eval_logps/rejected": -405.63275146484375, + "eval_loss": 0.6075600981712341, + "eval_rewards/accuracies": 0.69921875, + "eval_rewards/chosen": -0.43100571632385254, + "eval_rewards/margins": 0.2942630350589752, + "eval_rewards/rejected": -0.7252687811851501, + "eval_runtime": 97.9953, + "eval_samples_per_second": 20.409, + "eval_steps_per_second": 0.327, "step": 100 }, { - "epoch": 0.08, - "grad_norm": 33.52938589908786, - "learning_rate": 4.0740740740740737e-07, - "logits/chosen": -1.8354734182357788, - "logits/rejected": -1.7754793167114258, - "logps/chosen": -295.2403869628906, - "logps/rejected": -316.46923828125, - "loss": 0.5723, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.5448485016822815, - "rewards/margins": 0.3984159529209137, - "rewards/rejected": -0.943264365196228, + "epoch": 0.13, + "grad_norm": 39.11820261051907, + "learning_rate": 4.989308132738126e-07, + "logits/chosen": -1.7690223455429077, + "logits/rejected": -1.6496288776397705, + "logps/chosen": -304.85015869140625, + "logps/rejected": -325.87982177734375, + "loss": 0.4999, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.8704218864440918, + "rewards/margins": 0.6589836478233337, + "rewards/rejected": -1.5294055938720703, "step": 110 }, { - "epoch": 0.09, - "grad_norm": 32.42547027840792, - "learning_rate": 4.444444444444444e-07, - "logits/chosen": -1.7011499404907227, - "logits/rejected": -1.708805799484253, - "logps/chosen": -307.11334228515625, - "logps/rejected": -348.78729248046875, - "loss": 0.5442, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.5989453196525574, - "rewards/margins": 0.6007151007652283, - "rewards/rejected": -1.1996605396270752, + "epoch": 0.14, + "grad_norm": 43.9866022819552, + "learning_rate": 4.978006327248536e-07, + "logits/chosen": -1.7066357135772705, + "logits/rejected": -1.625906229019165, + "logps/chosen": -341.0208740234375, + "logps/rejected": -396.2286071777344, + "loss": 0.48, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2682462930679321, + "rewards/margins": 0.8490262031555176, + "rewards/rejected": -2.1172726154327393, "step": 120 }, { - "epoch": 0.1, - "grad_norm": 33.08064593315955, - "learning_rate": 4.814814814814814e-07, - "logits/chosen": -1.70786452293396, - "logits/rejected": -1.6745007038116455, - "logps/chosen": -290.42498779296875, - "logps/rejected": -343.42510986328125, - "loss": 0.5139, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.7598094344139099, - "rewards/margins": 0.6571252346038818, - "rewards/rejected": -1.4169347286224365, + "epoch": 0.15, + "grad_norm": 51.95435254496828, + "learning_rate": 4.962695471250032e-07, + "logits/chosen": -1.7163381576538086, + "logits/rejected": -1.6467857360839844, + "logps/chosen": -330.796142578125, + "logps/rejected": -384.80596923828125, + "loss": 0.4743, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1125227212905884, + "rewards/margins": 0.8911293148994446, + "rewards/rejected": -2.0036520957946777, "step": 130 }, { - "epoch": 0.1, - "grad_norm": 33.94320124887001, - "learning_rate": 4.999789692194508e-07, - "logits/chosen": -1.8099472522735596, - "logits/rejected": -1.754595398902893, - "logps/chosen": -314.9842224121094, - "logps/rejected": -356.81011962890625, - "loss": 0.5172, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.812475860118866, - "rewards/margins": 0.6942508816719055, - "rewards/rejected": -1.5067269802093506, + "epoch": 0.16, + "grad_norm": 41.76231027113118, + "learning_rate": 4.94340033546025e-07, + "logits/chosen": -1.768608808517456, + "logits/rejected": -1.7792564630508423, + "logps/chosen": -292.7682189941406, + "logps/rejected": -356.3222351074219, + "loss": 0.494, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.0807616710662842, + "rewards/margins": 0.6589750647544861, + "rewards/rejected": -1.739736795425415, "step": 140 }, { - "epoch": 0.11, - "grad_norm": 39.07047935152003, - "learning_rate": 4.998107442045616e-07, - "logits/chosen": -1.6377861499786377, - "logits/rejected": -1.6226139068603516, - "logps/chosen": -304.92840576171875, - "logps/rejected": -393.1883239746094, - "loss": 0.5094, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.8283722996711731, - "rewards/margins": 0.8278924822807312, - "rewards/rejected": -1.6562646627426147, + "epoch": 0.17, + "grad_norm": 42.74539380847778, + "learning_rate": 4.920152136576705e-07, + "logits/chosen": -1.5584046840667725, + "logits/rejected": -1.4847816228866577, + "logps/chosen": -360.10626220703125, + "logps/rejected": -438.8833923339844, + "loss": 0.4604, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.4732930660247803, + "rewards/margins": 1.046295166015625, + "rewards/rejected": -2.519587993621826, "step": 150 }, { - "epoch": 0.12, - "grad_norm": 42.785505208166626, - "learning_rate": 4.994744073829293e-07, - "logits/chosen": -1.5746722221374512, - "logits/rejected": -1.4142063856124878, - "logps/chosen": -343.25823974609375, - "logps/rejected": -402.02691650390625, - "loss": 0.5011, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.8369730710983276, - "rewards/margins": 0.8556060791015625, - "rewards/rejected": -1.6925792694091797, + "epoch": 0.18, + "grad_norm": 52.492556284072315, + "learning_rate": 4.892988486772756e-07, + "logits/chosen": -1.4800939559936523, + "logits/rejected": -1.375672698020935, + "logps/chosen": -331.51190185546875, + "logps/rejected": -424.3758850097656, + "loss": 0.4551, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.2153559923171997, + "rewards/margins": 1.116850733757019, + "rewards/rejected": -2.3322067260742188, "step": 160 }, { - "epoch": 0.13, - "grad_norm": 48.274083606893925, - "learning_rate": 4.989701850946613e-07, - "logits/chosen": -1.5056556463241577, - "logits/rejected": -1.3766965866088867, - "logps/chosen": -335.7103271484375, - "logps/rejected": -388.94097900390625, - "loss": 0.4643, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.9376843571662903, - "rewards/margins": 0.8313838243484497, - "rewards/rejected": -1.7690680027008057, + "epoch": 0.2, + "grad_norm": 41.386384025836755, + "learning_rate": 4.861953332846629e-07, + "logits/chosen": -1.3303937911987305, + "logits/rejected": -1.2361196279525757, + "logps/chosen": -376.7776794433594, + "logps/rejected": -447.00604248046875, + "loss": 0.4547, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4968476295471191, + "rewards/margins": 1.0457650423049927, + "rewards/rejected": -2.5426125526428223, "step": 170 }, { - "epoch": 0.13, - "grad_norm": 46.176765511998994, - "learning_rate": 4.982984166595104e-07, - "logits/chosen": -1.4761296510696411, - "logits/rejected": -1.3599636554718018, - "logps/chosen": -408.171630859375, - "logps/rejected": -472.0873107910156, - "loss": 0.4577, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.2097257375717163, - "rewards/margins": 1.240505576133728, - "rewards/rejected": -2.4502310752868652, + "epoch": 0.21, + "grad_norm": 60.02192982418561, + "learning_rate": 4.827096885121953e-07, + "logits/chosen": -1.4225201606750488, + "logits/rejected": -1.2863495349884033, + "logps/chosen": -405.1893005371094, + "logps/rejected": -479.203857421875, + "loss": 0.4595, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7183666229248047, + "rewards/margins": 1.0051409006118774, + "rewards/rejected": -2.7235074043273926, "step": 180 }, { - "epoch": 0.14, - "grad_norm": 43.28509926988276, - "learning_rate": 4.974595541485259e-07, - "logits/chosen": -1.3221380710601807, - "logits/rejected": -1.204590082168579, - "logps/chosen": -335.5089416503906, - "logps/rejected": -428.30621337890625, - "loss": 0.4635, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.076790452003479, - "rewards/margins": 1.0969324111938477, - "rewards/rejected": -2.173722743988037, + "epoch": 0.22, + "grad_norm": 43.1272858041485, + "learning_rate": 4.788475536214821e-07, + "logits/chosen": -1.1765668392181396, + "logits/rejected": -1.0755670070648193, + "logps/chosen": -343.91741943359375, + "logps/rejected": -436.421142578125, + "loss": 0.4334, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.488790512084961, + "rewards/margins": 1.1071021556854248, + "rewards/rejected": -2.5958926677703857, "step": 190 }, { - "epoch": 0.15, - "grad_norm": 56.09927596713516, - "learning_rate": 4.964541620798307e-07, - "logits/chosen": -1.2160365581512451, - "logits/rejected": -1.118375539779663, - "logps/chosen": -348.90753173828125, - "logps/rejected": -468.21563720703125, - "loss": 0.4495, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2727657556533813, - "rewards/margins": 1.1830675601959229, - "rewards/rejected": -2.4558334350585938, + "epoch": 0.23, + "grad_norm": 46.70523415562261, + "learning_rate": 4.746151769798818e-07, + "logits/chosen": -1.0615050792694092, + "logits/rejected": -0.8847019076347351, + "logps/chosen": -420.40704345703125, + "logps/rejected": -515.6028442382812, + "loss": 0.436, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.8328346014022827, + "rewards/margins": 1.3578665256500244, + "rewards/rejected": -3.1907010078430176, "step": 200 }, { - "epoch": 0.15, - "eval_logits/chosen": -1.4371435642242432, - "eval_logits/rejected": -1.366525650024414, - "eval_logps/chosen": -361.1814880371094, - "eval_logps/rejected": -427.2509765625, - "eval_loss": 0.559985339641571, - "eval_rewards/accuracies": 0.74609375, - "eval_rewards/chosen": -0.4940495491027832, - "eval_rewards/margins": 0.6032084226608276, - "eval_rewards/rejected": -1.0972579717636108, - "eval_runtime": 97.4901, - "eval_samples_per_second": 20.515, - "eval_steps_per_second": 0.328, + "epoch": 0.23, + "eval_logits/chosen": -1.0563830137252808, + "eval_logits/rejected": -0.9159815311431885, + "eval_logps/chosen": -438.318115234375, + "eval_logps/rejected": -487.5738525390625, + "eval_loss": 0.5481002330780029, + "eval_rewards/accuracies": 0.7578125, + "eval_rewards/chosen": -1.0280659198760986, + "eval_rewards/margins": 0.5166138410568237, + "eval_rewards/rejected": -1.5446796417236328, + "eval_runtime": 97.9791, + "eval_samples_per_second": 20.413, + "eval_steps_per_second": 0.327, "step": 200 }, { - "epoch": 0.16, - "grad_norm": 49.36366262587358, - "learning_rate": 4.952829170387241e-07, - "logits/chosen": -1.1800302267074585, - "logits/rejected": -1.0126550197601318, - "logps/chosen": -380.48828125, - "logps/rejected": -450.0765075683594, - "loss": 0.4458, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3101383447647095, - "rewards/margins": 0.9806028604507446, - "rewards/rejected": -2.290741443634033, + "epoch": 0.24, + "grad_norm": 38.107555337824884, + "learning_rate": 4.7001940595156055e-07, + "logits/chosen": -1.0661436319351196, + "logits/rejected": -0.9267956018447876, + "logps/chosen": -323.78460693359375, + "logps/rejected": -411.57861328125, + "loss": 0.47, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.3589224815368652, + "rewards/margins": 1.0502302646636963, + "rewards/rejected": -2.4091525077819824, "step": 210 }, { - "epoch": 0.16, - "grad_norm": 57.25684926546983, - "learning_rate": 4.939466072223697e-07, - "logits/chosen": -1.2157623767852783, - "logits/rejected": -1.0489680767059326, - "logps/chosen": -372.591064453125, - "logps/rejected": -468.7542419433594, - "loss": 0.4545, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3519532680511475, - "rewards/margins": 1.1502256393432617, - "rewards/rejected": -2.50217866897583, + "epoch": 0.25, + "grad_norm": 40.45903949711078, + "learning_rate": 4.650676758194623e-07, + "logits/chosen": -0.990399181842804, + "logits/rejected": -0.7643444538116455, + "logps/chosen": -397.4464111328125, + "logps/rejected": -486.4747619628906, + "loss": 0.4237, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.74880051612854, + "rewards/margins": 1.280381441116333, + "rewards/rejected": -3.029181957244873, "step": 220 }, { - "epoch": 0.17, - "grad_norm": 40.98752146946231, - "learning_rate": 4.924461319093725e-07, - "logits/chosen": -1.1049861907958984, - "logits/rejected": -1.0018864870071411, - "logps/chosen": -361.7793884277344, - "logps/rejected": -487.15460205078125, - "loss": 0.4436, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.1743983030319214, - "rewards/margins": 1.1021788120269775, - "rewards/rejected": -2.2765772342681885, + "epoch": 0.26, + "grad_norm": 46.05669291806829, + "learning_rate": 4.5976799775611215e-07, + "logits/chosen": -0.9613090753555298, + "logits/rejected": -0.6790561676025391, + "logps/chosen": -400.0892639160156, + "logps/rejected": -520.5755615234375, + "loss": 0.4331, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.871617078781128, + "rewards/margins": 1.643728256225586, + "rewards/rejected": -3.515345335006714, "step": 230 }, { - "epoch": 0.18, - "grad_norm": 57.39176618017778, - "learning_rate": 4.907825008546038e-07, - "logits/chosen": -0.7271394729614258, - "logits/rejected": -0.6813848614692688, - "logps/chosen": -377.90118408203125, - "logps/rejected": -523.9625244140625, - "loss": 0.4333, + "epoch": 0.28, + "grad_norm": 48.193806404654026, + "learning_rate": 4.5412894586271543e-07, + "logits/chosen": -0.7735807299613953, + "logits/rejected": -0.4405640959739685, + "logps/chosen": -401.48089599609375, + "logps/rejected": -470.61663818359375, + "loss": 0.4189, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.4791629314422607, - "rewards/margins": 1.4326350688934326, - "rewards/rejected": -2.9117980003356934, + "rewards/chosen": -1.8665469884872437, + "rewards/margins": 1.244009256362915, + "rewards/rejected": -3.1105563640594482, "step": 240 }, { - "epoch": 0.19, - "grad_norm": 51.26102709104704, - "learning_rate": 4.889568336096795e-07, - "logits/chosen": -0.5312275290489197, - "logits/rejected": -0.37771934270858765, - "logps/chosen": -381.1251220703125, - "logps/rejected": -479.7431640625, - "loss": 0.4272, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.5479203462600708, - "rewards/margins": 1.1352421045303345, - "rewards/rejected": -2.6831624507904053, + "epoch": 0.29, + "grad_norm": 42.10264842470945, + "learning_rate": 4.481596432975201e-07, + "logits/chosen": -0.5647836923599243, + "logits/rejected": -0.4109571576118469, + "logps/chosen": -371.7290954589844, + "logps/rejected": -471.7470703125, + "loss": 0.4267, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.94748055934906, + "rewards/margins": 1.1521204710006714, + "rewards/rejected": -3.0996010303497314, "step": 250 }, { - "epoch": 0.19, - "grad_norm": 46.69946748969463, - "learning_rate": 4.869703587695508e-07, - "logits/chosen": -0.44748228788375854, - "logits/rejected": -0.18481455743312836, - "logps/chosen": -379.5589904785156, - "logps/rejected": -527.2100830078125, - "loss": 0.4464, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.412641167640686, - "rewards/margins": 1.667824149131775, - "rewards/rejected": -3.080465793609619, + "epoch": 0.3, + "grad_norm": 47.08572063870737, + "learning_rate": 4.41869747515886e-07, + "logits/chosen": -0.5823490023612976, + "logits/rejected": -0.4703378677368164, + "logps/chosen": -381.5385437011719, + "logps/rejected": -498.1053771972656, + "loss": 0.427, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.5624696016311646, + "rewards/margins": 1.1981754302978516, + "rewards/rejected": -2.7606451511383057, "step": 260 }, { - "epoch": 0.2, - "grad_norm": 40.8957837906737, - "learning_rate": 4.848244131457127e-07, - "logits/chosen": -0.9530747532844543, - "logits/rejected": -0.6137160062789917, - "logps/chosen": -400.1986083984375, - "logps/rejected": -499.60308837890625, - "loss": 0.4211, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.4335994720458984, - "rewards/margins": 1.4832035303115845, - "rewards/rejected": -2.9168028831481934, + "epoch": 0.31, + "grad_norm": 50.684416210598954, + "learning_rate": 4.352694346459396e-07, + "logits/chosen": -0.3632057011127472, + "logits/rejected": -0.24974389374256134, + "logps/chosen": -385.367431640625, + "logps/rejected": -490.25115966796875, + "loss": 0.4255, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.7891931533813477, + "rewards/margins": 1.2302545309066772, + "rewards/rejected": -3.0194478034973145, "step": 270 }, { - "epoch": 0.21, - "grad_norm": 45.308995144235396, - "learning_rate": 4.825204408665877e-07, - "logits/chosen": -1.2076747417449951, - "logits/rejected": -0.9289032220840454, - "logps/chosen": -426.99114990234375, - "logps/rejected": -532.0573120117188, - "loss": 0.4124, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.4818888902664185, - "rewards/margins": 1.4990845918655396, - "rewards/rejected": -2.980973720550537, + "epoch": 0.32, + "grad_norm": 40.676202653615285, + "learning_rate": 4.2836938302509256e-07, + "logits/chosen": -0.3894518315792084, + "logits/rejected": -0.02691759541630745, + "logps/chosen": -369.89105224609375, + "logps/rejected": -490.12823486328125, + "loss": 0.4298, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6734466552734375, + "rewards/margins": 1.521540641784668, + "rewards/rejected": -3.1949872970581055, "step": 280 }, { - "epoch": 0.22, - "grad_norm": 57.75176826411474, - "learning_rate": 4.800599924056907e-07, - "logits/chosen": -0.7638604044914246, - "logits/rejected": -0.7332445383071899, - "logps/chosen": -383.2490539550781, - "logps/rejected": -556.2003784179688, - "loss": 0.3833, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.5847924947738647, - "rewards/margins": 1.5942741632461548, - "rewards/rejected": -3.1790668964385986, + "epoch": 0.33, + "grad_norm": 44.13447631032205, + "learning_rate": 4.2118075592405874e-07, + "logits/chosen": -0.3429946303367615, + "logits/rejected": -0.1553725004196167, + "logps/chosen": -405.2006530761719, + "logps/rejected": -525.8575439453125, + "loss": 0.41, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8301327228546143, + "rewards/margins": 1.3930988311767578, + "rewards/rejected": -3.223231554031372, "step": 290 }, { - "epoch": 0.22, - "grad_norm": 45.582764097748154, - "learning_rate": 4.774447235382259e-07, - "logits/chosen": -0.5798165202140808, - "logits/rejected": -0.5653051733970642, - "logps/chosen": -411.58154296875, - "logps/rejected": -582.2734375, - "loss": 0.3963, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.766920804977417, - "rewards/margins": 1.7389370203018188, - "rewards/rejected": -3.5058579444885254, + "epoch": 0.35, + "grad_norm": 49.31357681048272, + "learning_rate": 4.137151834863213e-07, + "logits/chosen": 0.24964532256126404, + "logits/rejected": 0.487697035074234, + "logps/chosen": -384.4162902832031, + "logps/rejected": -526.8263549804688, + "loss": 0.4266, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.1807823181152344, + "rewards/margins": 1.4443342685699463, + "rewards/rejected": -3.6251163482666016, "step": 300 }, { - "epoch": 0.22, - "eval_logits/chosen": -1.4608731269836426, - "eval_logits/rejected": -1.2769949436187744, - "eval_logps/chosen": -423.00341796875, - "eval_logps/rejected": -521.115478515625, - "eval_loss": 0.5291498303413391, - "eval_rewards/accuracies": 0.7421875, - "eval_rewards/chosen": -1.1122692823410034, - "eval_rewards/margins": 0.9236345291137695, - "eval_rewards/rejected": -2.0359039306640625, - "eval_runtime": 97.2217, - "eval_samples_per_second": 20.572, - "eval_steps_per_second": 0.329, + "epoch": 0.35, + "eval_logits/chosen": -0.2921224534511566, + "eval_logits/rejected": -0.00792422890663147, + "eval_logps/chosen": -501.82440185546875, + "eval_logps/rejected": -573.9229125976562, + "eval_loss": 0.4992181956768036, + "eval_rewards/accuracies": 0.80859375, + "eval_rewards/chosen": -1.66312837600708, + "eval_rewards/margins": 0.7450418472290039, + "eval_rewards/rejected": -2.408170223236084, + "eval_runtime": 97.8814, + "eval_samples_per_second": 20.433, + "eval_steps_per_second": 0.327, "step": 300 }, { - "epoch": 0.23, - "grad_norm": 42.82644939529418, - "learning_rate": 4.7467639422682426e-07, - "logits/chosen": -0.6843788623809814, - "logits/rejected": -0.46269315481185913, - "logps/chosen": -417.7638244628906, - "logps/rejected": -573.83837890625, - "loss": 0.4006, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.8430830240249634, - "rewards/margins": 1.669550895690918, - "rewards/rejected": -3.512633800506592, + "epoch": 0.36, + "grad_norm": 40.75479432871009, + "learning_rate": 4.059847439122671e-07, + "logits/chosen": -0.09978775680065155, + "logits/rejected": 0.3156757950782776, + "logps/chosen": -446.403564453125, + "logps/rejected": -547.4361572265625, + "loss": 0.4217, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.313018798828125, + "rewards/margins": 1.2966973781585693, + "rewards/rejected": -3.609715700149536, "step": 310 }, { - "epoch": 0.24, - "grad_norm": 55.146360598406936, - "learning_rate": 4.7175686743716223e-07, - "logits/chosen": -1.140579104423523, - "logits/rejected": -0.8973017930984497, - "logps/chosen": -419.18048095703125, - "logps/rejected": -527.0257568359375, - "loss": 0.405, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.4635722637176514, - "rewards/margins": 1.3773781061172485, - "rewards/rejected": -2.8409504890441895, + "epoch": 0.37, + "grad_norm": 52.15388720725064, + "learning_rate": 3.98001943918432e-07, + "logits/chosen": -0.26411113142967224, + "logits/rejected": -0.02265077270567417, + "logps/chosen": -394.48846435546875, + "logps/rejected": -499.8885803222656, + "loss": 0.417, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9507391452789307, + "rewards/margins": 1.151451826095581, + "rewards/rejected": -3.1021907329559326, "step": 320 }, { - "epoch": 0.25, - "grad_norm": 45.88101703811544, - "learning_rate": 4.686881078842688e-07, - "logits/chosen": -1.0653458833694458, - "logits/rejected": -0.8751330375671387, - "logps/chosen": -386.37335205078125, - "logps/rejected": -510.29949951171875, - "loss": 0.3899, + "epoch": 0.38, + "grad_norm": 51.359405875032216, + "learning_rate": 3.8977969850346866e-07, + "logits/chosen": -0.22085031867027283, + "logits/rejected": 0.16305044293403625, + "logps/chosen": -414.28179931640625, + "logps/rejected": -521.32275390625, + "loss": 0.4006, "rewards/accuracies": 0.78125, - "rewards/chosen": -1.47976553440094, - "rewards/margins": 1.366317868232727, - "rewards/rejected": -2.846083164215088, + "rewards/chosen": -1.9662139415740967, + "rewards/margins": 1.4125010967254639, + "rewards/rejected": -3.3787150382995605, "step": 330 }, { - "epoch": 0.25, - "grad_norm": 58.11307992254104, - "learning_rate": 4.654721807103558e-07, - "logits/chosen": -0.5151967406272888, - "logits/rejected": -0.14977958798408508, - "logps/chosen": -400.7736511230469, - "logps/rejected": -529.3316650390625, - "loss": 0.3938, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.7082515954971313, - "rewards/margins": 1.6958554983139038, - "rewards/rejected": -3.404106855392456, + "epoch": 0.39, + "grad_norm": 46.94996126435224, + "learning_rate": 3.8133131005357465e-07, + "logits/chosen": -0.14262983202934265, + "logits/rejected": 0.33533433079719543, + "logps/chosen": -382.92584228515625, + "logps/rejected": -544.5906982421875, + "loss": 0.3957, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7721102237701416, + "rewards/margins": 1.8106200695037842, + "rewards/rejected": -3.5827300548553467, "step": 340 }, { - "epoch": 0.26, - "grad_norm": 48.499175539211535, - "learning_rate": 4.621112500950678e-07, - "logits/chosen": -0.8198322057723999, - "logits/rejected": -0.5934363603591919, - "logps/chosen": -429.72113037109375, - "logps/rejected": -547.5772705078125, - "loss": 0.3843, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.8615728616714478, - "rewards/margins": 1.499329924583435, - "rewards/rejected": -3.3609023094177246, + "epoch": 0.4, + "grad_norm": 44.17747510829861, + "learning_rate": 3.7267044682118435e-07, + "logits/chosen": 0.15123403072357178, + "logits/rejected": 0.505038857460022, + "logps/chosen": -381.1158447265625, + "logps/rejected": -513.79248046875, + "loss": 0.3939, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9833688735961914, + "rewards/margins": 1.5414726734161377, + "rewards/rejected": -3.524841785430908, "step": 350 }, { - "epoch": 0.27, - "grad_norm": 55.599844022581365, - "learning_rate": 4.5860757779908225e-07, - "logits/chosen": -1.0455310344696045, - "logits/rejected": -0.6826554536819458, - "logps/chosen": -413.38739013671875, - "logps/rejected": -542.2623291015625, - "loss": 0.3736, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.5897157192230225, - "rewards/margins": 1.6853986978530884, - "rewards/rejected": -3.2751145362854004, + "epoch": 0.41, + "grad_norm": 44.94408605122442, + "learning_rate": 3.638111208117425e-07, + "logits/chosen": -0.1444779336452484, + "logits/rejected": 0.1327807903289795, + "logps/chosen": -403.40716552734375, + "logps/rejected": -509.33642578125, + "loss": 0.4116, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0103116035461426, + "rewards/margins": 1.1474075317382812, + "rewards/rejected": -3.157719135284424, "step": 360 }, { - "epoch": 0.27, - "grad_norm": 74.71151634556864, - "learning_rate": 4.5496352164204304e-07, - "logits/chosen": -0.4619407057762146, - "logits/rejected": -0.23415322601795197, - "logps/chosen": -426.197998046875, - "logps/rejected": -620.7210693359375, - "loss": 0.3997, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.0138180255889893, - "rewards/margins": 2.0114035606384277, - "rewards/rejected": -4.025221347808838, + "epoch": 0.43, + "grad_norm": 38.27635210179194, + "learning_rate": 3.5476766511433605e-07, + "logits/chosen": -0.3243602216243744, + "logits/rejected": 0.23012924194335938, + "logps/chosen": -437.81951904296875, + "logps/rejected": -535.6306762695312, + "loss": 0.4207, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.0247864723205566, + "rewards/margins": 1.4481357336044312, + "rewards/rejected": -3.4729220867156982, "step": 370 }, { - "epoch": 0.28, - "grad_norm": 46.835706945950214, - "learning_rate": 4.5118153391584966e-07, - "logits/chosen": -0.7893734574317932, - "logits/rejected": -0.5286726951599121, - "logps/chosen": -348.12554931640625, - "logps/rejected": -483.89215087890625, - "loss": 0.3909, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.0020155906677246, - "rewards/margins": 1.7324419021606445, - "rewards/rejected": -2.734457492828369, + "epoch": 0.44, + "grad_norm": 45.6051115938209, + "learning_rate": 3.455547107128602e-07, + "logits/chosen": -0.22209982573986053, + "logits/rejected": 0.37390798330307007, + "logps/chosen": -420.72906494140625, + "logps/rejected": -538.1783447265625, + "loss": 0.3725, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.714195966720581, + "rewards/margins": 1.7354304790496826, + "rewards/rejected": -3.4496264457702637, "step": 380 }, { - "epoch": 0.29, - "grad_norm": 51.06658825135186, - "learning_rate": 4.472641597343713e-07, - "logits/chosen": -0.5109713077545166, - "logits/rejected": -0.07112047076225281, - "logps/chosen": -389.3044738769531, - "logps/rejected": -567.7926635742188, - "loss": 0.3846, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.6159217357635498, - "rewards/margins": 1.9207748174667358, - "rewards/rejected": -3.536696672439575, + "epoch": 0.45, + "grad_norm": 62.59138136421735, + "learning_rate": 3.361871628152338e-07, + "logits/chosen": 0.11889226734638214, + "logits/rejected": 0.4943965971469879, + "logps/chosen": -413.30584716796875, + "logps/rejected": -576.02294921875, + "loss": 0.4157, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1105499267578125, + "rewards/margins": 1.6212915182113647, + "rewards/rejected": -3.7318413257598877, "step": 390 }, { - "epoch": 0.3, - "grad_norm": 44.181665144710905, - "learning_rate": 4.4321403532069523e-07, - "logits/chosen": -0.5097373127937317, - "logits/rejected": -0.2719523012638092, - "logps/chosen": -353.91278076171875, - "logps/rejected": -517.2376708984375, - "loss": 0.4012, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.5704162120819092, - "rewards/margins": 1.8435367345809937, - "rewards/rejected": -3.4139533042907715, + "epoch": 0.46, + "grad_norm": 46.94899679949689, + "learning_rate": 3.2668017673896077e-07, + "logits/chosen": 0.019377555698156357, + "logits/rejected": 0.4903746545314789, + "logps/chosen": -378.95196533203125, + "logps/rejected": -502.42132568359375, + "loss": 0.3779, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.7431423664093018, + "rewards/margins": 1.6023037433624268, + "rewards/rejected": -3.3454463481903076, "step": 400 }, { - "epoch": 0.3, - "eval_logits/chosen": -1.3372514247894287, - "eval_logits/rejected": -1.1222751140594482, - "eval_logps/chosen": -417.65863037109375, - "eval_logps/rejected": -516.7505493164062, - "eval_loss": 0.5314938426017761, - "eval_rewards/accuracies": 0.7734375, - "eval_rewards/chosen": -1.058821201324463, - "eval_rewards/margins": 0.9334329962730408, - "eval_rewards/rejected": -1.9922541379928589, - "eval_runtime": 97.4658, - "eval_samples_per_second": 20.52, - "eval_steps_per_second": 0.328, + "epoch": 0.46, + "eval_logits/chosen": -0.29974818229675293, + "eval_logits/rejected": 0.09132258594036102, + "eval_logps/chosen": -488.41680908203125, + "eval_logps/rejected": -561.919921875, + "eval_loss": 0.4760280251502991, + "eval_rewards/accuracies": 0.81640625, + "eval_rewards/chosen": -1.529052972793579, + "eval_rewards/margins": 0.7590875029563904, + "eval_rewards/rejected": -2.288140296936035, + "eval_runtime": 97.9145, + "eval_samples_per_second": 20.426, + "eval_steps_per_second": 0.327, "step": 400 }, { - "epoch": 0.3, - "grad_norm": 50.26869622592037, - "learning_rate": 4.390338862330631e-07, - "logits/chosen": -0.7592865824699402, - "logits/rejected": -0.4464483857154846, - "logps/chosen": -401.47607421875, - "logps/rejected": -523.3784790039062, - "loss": 0.3803, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.7566916942596436, - "rewards/margins": 1.5606569051742554, - "rewards/rejected": -3.3173484802246094, + "epoch": 0.47, + "grad_norm": 42.41150092354519, + "learning_rate": 3.1704913339205103e-07, + "logits/chosen": 0.11346012353897095, + "logits/rejected": 0.6633824706077576, + "logps/chosen": -411.09710693359375, + "logps/rejected": -563.5574951171875, + "loss": 0.3878, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.9903961420059204, + "rewards/margins": 1.649317741394043, + "rewards/rejected": -3.639713764190674, "step": 410 }, { - "epoch": 0.31, - "grad_norm": 51.57934206296598, - "learning_rate": 4.3472652553068835e-07, - "logits/chosen": -0.6644355654716492, - "logits/rejected": -0.23346371948719025, - "logps/chosen": -404.8458557128906, - "logps/rejected": -540.8956298828125, - "loss": 0.3797, + "epoch": 0.48, + "grad_norm": 40.93253239819171, + "learning_rate": 3.0730961438896885e-07, + "logits/chosen": -0.09407112747430801, + "logits/rejected": 0.41040462255477905, + "logps/chosen": -475.8846130371094, + "logps/rejected": -580.8755493164062, + "loss": 0.3865, "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.7600839138031006, - "rewards/margins": 1.6869585514068604, - "rewards/rejected": -3.4470419883728027, + "rewards/chosen": -2.0496103763580322, + "rewards/margins": 1.4907814264297485, + "rewards/rejected": -3.540391445159912, "step": 420 }, { - "epoch": 0.32, - "grad_norm": 73.04228089758476, - "learning_rate": 4.3029485188068895e-07, - "logits/chosen": 0.10370206832885742, - "logits/rejected": 0.39608412981033325, - "logps/chosen": -385.42498779296875, - "logps/rejected": -570.5172729492188, - "loss": 0.3655, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.839719533920288, - "rewards/margins": 1.714897871017456, - "rewards/rejected": -3.5546176433563232, + "epoch": 0.5, + "grad_norm": 52.33032593793305, + "learning_rate": 2.9747737684186795e-07, + "logits/chosen": 0.3089054524898529, + "logits/rejected": 0.609168529510498, + "logps/chosen": -392.91888427734375, + "logps/rejected": -519.9400024414062, + "loss": 0.3899, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.8225826025009155, + "rewards/margins": 1.5612199306488037, + "rewards/rejected": -3.383802890777588, "step": 430 }, { - "epoch": 0.33, - "grad_norm": 54.512857623037554, - "learning_rate": 4.257418476074103e-07, - "logits/chosen": -0.023069072514772415, - "logits/rejected": 0.3960541784763336, - "logps/chosen": -423.490478515625, - "logps/rejected": -592.7897338867188, - "loss": 0.3638, + "epoch": 0.51, + "grad_norm": 40.99316990627953, + "learning_rate": 2.8756832786789663e-07, + "logits/chosen": 0.07930847257375717, + "logits/rejected": 0.5284430980682373, + "logps/chosen": -410.39459228515625, + "logps/rejected": -527.0020751953125, + "loss": 0.4048, "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.7877943515777588, - "rewards/margins": 2.115088701248169, - "rewards/rejected": -3.9028830528259277, + "rewards/chosen": -1.807734727859497, + "rewards/margins": 1.5688127279281616, + "rewards/rejected": -3.3765475749969482, "step": 440 }, { - "epoch": 0.33, - "grad_norm": 55.7162708155443, - "learning_rate": 4.210705766854504e-07, - "logits/chosen": 0.15324774384498596, - "logits/rejected": 0.521506667137146, - "logps/chosen": -456.01776123046875, - "logps/rejected": -625.3338623046875, - "loss": 0.352, + "epoch": 0.52, + "grad_norm": 50.98336558470055, + "learning_rate": 2.7759849885381747e-07, + "logits/chosen": 0.13231831789016724, + "logits/rejected": 0.8564577102661133, + "logps/chosen": -452.3924255371094, + "logps/rejected": -579.27490234375, + "loss": 0.3869, "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -2.096989870071411, - "rewards/margins": 1.874829649925232, - "rewards/rejected": -3.9718196392059326, + "rewards/chosen": -2.116553783416748, + "rewards/margins": 1.8397691249847412, + "rewards/rejected": -3.9563231468200684, "step": 450 }, { - "epoch": 0.34, - "grad_norm": 51.50110954656292, - "learning_rate": 4.16284182677737e-07, - "logits/chosen": 0.3847750127315521, - "logits/rejected": 0.9687877893447876, - "logps/chosen": -421.48321533203125, - "logps/rejected": -571.6495361328125, - "loss": 0.3771, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.7761863470077515, - "rewards/margins": 1.777931809425354, - "rewards/rejected": -3.5541183948516846, + "epoch": 0.53, + "grad_norm": 36.53232487094238, + "learning_rate": 2.675840195195762e-07, + "logits/chosen": 0.1366969347000122, + "logits/rejected": 0.6087800860404968, + "logps/chosen": -388.03546142578125, + "logps/rejected": -536.0306396484375, + "loss": 0.3903, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.9025928974151611, + "rewards/margins": 1.6377290487289429, + "rewards/rejected": -3.5403220653533936, "step": 460 }, { - "epoch": 0.35, - "grad_norm": 42.17081561639591, - "learning_rate": 4.113858866200466e-07, - "logits/chosen": 0.5899291634559631, - "logits/rejected": 0.9651363492012024, - "logps/chosen": -411.4060974121094, - "logps/rejected": -587.0046997070312, - "loss": 0.3551, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.751307725906372, - "rewards/margins": 1.814639687538147, - "rewards/rejected": -3.5659472942352295, + "epoch": 0.54, + "grad_norm": 48.92609199634003, + "learning_rate": 2.575410918227829e-07, + "logits/chosen": 0.14545145630836487, + "logits/rejected": 0.5727478861808777, + "logps/chosen": -420.89129638671875, + "logps/rejected": -546.9252319335938, + "loss": 0.3812, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8398689031600952, + "rewards/margins": 1.5490201711654663, + "rewards/rejected": -3.3888893127441406, "step": 470 }, { - "epoch": 0.36, - "grad_norm": 48.02610054790726, - "learning_rate": 4.063789848533865e-07, - "logits/chosen": 0.46232396364212036, - "logits/rejected": 1.0872290134429932, - "logps/chosen": -472.24139404296875, - "logps/rejected": -634.9567260742188, - "loss": 0.374, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.287501573562622, - "rewards/margins": 1.8356859683990479, - "rewards/rejected": -4.123187065124512, + "epoch": 0.55, + "grad_norm": 51.43432335912027, + "learning_rate": 2.474859637463226e-07, + "logits/chosen": 0.060339294373989105, + "logits/rejected": 0.8116003274917603, + "logps/chosen": -399.6466369628906, + "logps/rejected": -525.1762084960938, + "loss": 0.3789, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7933282852172852, + "rewards/margins": 1.7597835063934326, + "rewards/rejected": -3.553112030029297, "step": 480 }, { - "epoch": 0.36, - "grad_norm": 45.88835702974933, - "learning_rate": 4.0126684680570074e-07, - "logits/chosen": -0.3817380368709564, - "logits/rejected": 0.1566486358642578, - "logps/chosen": -461.13934326171875, - "logps/rejected": -592.1519165039062, - "loss": 0.334, + "epoch": 0.56, + "grad_norm": 43.984850774488535, + "learning_rate": 2.3743490301150355e-07, + "logits/chosen": 0.20873646438121796, + "logits/rejected": 1.0175427198410034, + "logps/chosen": -386.0199890136719, + "logps/rejected": -533.820068359375, + "loss": 0.3935, "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.8447940349578857, - "rewards/margins": 1.7669038772583008, - "rewards/rejected": -3.6116981506347656, + "rewards/chosen": -1.6440702676773071, + "rewards/margins": 1.8672387599945068, + "rewards/rejected": -3.5113091468811035, "step": 490 }, { - "epoch": 0.37, - "grad_norm": 53.85769217498667, - "learning_rate": 3.960529127243902e-07, - "logits/chosen": -0.31509625911712646, - "logits/rejected": -0.04504912719130516, - "logps/chosen": -477.027099609375, - "logps/rejected": -654.2672119140625, - "loss": 0.3559, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -2.053821086883545, - "rewards/margins": 2.070889711380005, - "rewards/rejected": -4.124711036682129, + "epoch": 0.58, + "grad_norm": 47.341362361122584, + "learning_rate": 2.274041707592724e-07, + "logits/chosen": 0.6152507066726685, + "logits/rejected": 1.061025619506836, + "logps/chosen": -425.3057556152344, + "logps/rejected": -613.043212890625, + "loss": 0.3713, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.342782974243164, + "rewards/margins": 1.9541997909545898, + "rewards/rejected": -4.296982765197754, "step": 500 }, { - "epoch": 0.37, - "eval_logits/chosen": -1.0066841840744019, - "eval_logits/rejected": -0.6833571791648865, - "eval_logps/chosen": -456.0086364746094, - "eval_logps/rejected": -568.9822387695312, - "eval_loss": 0.5275729894638062, - "eval_rewards/accuracies": 0.7578125, - "eval_rewards/chosen": -1.4423211812973022, - "eval_rewards/margins": 1.0722503662109375, - "eval_rewards/rejected": -2.5145716667175293, - "eval_runtime": 97.6519, - "eval_samples_per_second": 20.481, - "eval_steps_per_second": 0.328, + "epoch": 0.58, + "eval_logits/chosen": 0.0540677085518837, + "eval_logits/rejected": 0.4885663688182831, + "eval_logps/chosen": -471.02880859375, + "eval_logps/rejected": -557.1675415039062, + "eval_loss": 0.4526832401752472, + "eval_rewards/accuracies": 0.83203125, + "eval_rewards/chosen": -1.3551722764968872, + "eval_rewards/margins": 0.885444164276123, + "eval_rewards/rejected": -2.2406163215637207, + "eval_runtime": 98.0229, + "eval_samples_per_second": 20.403, + "eval_steps_per_second": 0.326, "step": 500 }, { - "epoch": 0.38, - "grad_norm": 53.47947486686438, - "learning_rate": 3.9074069136117594e-07, - "logits/chosen": -0.6587181687355042, - "logits/rejected": -0.11707913875579834, - "logps/chosen": -478.9352111816406, - "logps/rejected": -631.669921875, - "loss": 0.35, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -2.0555968284606934, - "rewards/margins": 1.9847618341445923, - "rewards/rejected": -4.040358543395996, + "epoch": 0.59, + "grad_norm": 44.58362843206397, + "learning_rate": 2.17409995242075e-07, + "logits/chosen": 0.7180274128913879, + "logits/rejected": 1.3152275085449219, + "logps/chosen": -407.83294677734375, + "logps/rejected": -553.6475830078125, + "loss": 0.3947, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.083230972290039, + "rewards/margins": 1.8255109786987305, + "rewards/rejected": -3.9087421894073486, "step": 510 }, { - "epoch": 0.39, - "grad_norm": 48.01190508303512, - "learning_rate": 3.8533375761086094e-07, - "logits/chosen": -0.6520954966545105, - "logits/rejected": -0.19666698575019836, - "logps/chosen": -399.66455078125, - "logps/rejected": -589.08251953125, - "loss": 0.3518, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.5765998363494873, - "rewards/margins": 2.0024795532226562, - "rewards/rejected": -3.5790793895721436, + "epoch": 0.6, + "grad_norm": 39.74511163289153, + "learning_rate": 2.0746854556892544e-07, + "logits/chosen": 0.7251445055007935, + "logits/rejected": 0.930740475654602, + "logps/chosen": -376.7163391113281, + "logps/rejected": -514.938232421875, + "loss": 0.4067, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.9309794902801514, + "rewards/margins": 1.4579864740371704, + "rewards/rejected": -3.3889663219451904, "step": 520 }, { - "epoch": 0.39, - "grad_norm": 58.201909693922666, - "learning_rate": 3.79835750105581e-07, - "logits/chosen": -0.015231219120323658, - "logits/rejected": 0.524590253829956, - "logps/chosen": -425.837890625, - "logps/rejected": -576.46630859375, - "loss": 0.364, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.9596973657608032, - "rewards/margins": 1.918087363243103, - "rewards/rejected": -3.8777847290039062, + "epoch": 0.61, + "grad_norm": 41.78948639916862, + "learning_rate": 1.9759590554616173e-07, + "logits/chosen": 0.3273155689239502, + "logits/rejected": 0.7224196195602417, + "logps/chosen": -409.301513671875, + "logps/rejected": -529.0758056640625, + "loss": 0.3983, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8711875677108765, + "rewards/margins": 1.4573709964752197, + "rewards/rejected": -3.328559160232544, "step": 530 }, { - "epoch": 0.4, - "grad_norm": 53.67325387574443, - "learning_rate": 3.742503687661627e-07, - "logits/chosen": 0.3345823585987091, - "logits/rejected": 0.8041492700576782, - "logps/chosen": -436.06170654296875, - "logps/rejected": -628.6650390625, - "loss": 0.3413, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -2.163074016571045, - "rewards/margins": 2.0728249549865723, - "rewards/rejected": -4.235899925231934, + "epoch": 0.62, + "grad_norm": 46.45949710434537, + "learning_rate": 1.8780804765620746e-07, + "logits/chosen": 0.4330478608608246, + "logits/rejected": 0.7140570878982544, + "logps/chosen": -430.78216552734375, + "logps/rejected": -596.1729736328125, + "loss": 0.3842, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9393028020858765, + "rewards/margins": 1.6490939855575562, + "rewards/rejected": -3.5883967876434326, "step": 540 }, { - "epoch": 0.41, - "grad_norm": 54.5126564713129, - "learning_rate": 3.685813723122372e-07, - "logits/chosen": 0.6497628688812256, - "logits/rejected": 1.1682524681091309, - "logps/chosen": -425.30157470703125, - "logps/rejected": -617.69482421875, - "loss": 0.3365, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.9300180673599243, - "rewards/margins": 2.057875394821167, - "rewards/rejected": -3.987893581390381, + "epoch": 0.63, + "grad_norm": 43.293723569052126, + "learning_rate": 1.7812080721643973e-07, + "logits/chosen": 0.6848554015159607, + "logits/rejected": 1.3088548183441162, + "logps/chosen": -445.448974609375, + "logps/rejected": -563.468017578125, + "loss": 0.3878, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.22265887260437, + "rewards/margins": 1.7124555110931396, + "rewards/rejected": -3.935114622116089, "step": 550 }, { - "epoch": 0.42, - "grad_norm": 62.74924566191948, - "learning_rate": 3.6283257573278466e-07, - "logits/chosen": 0.867998480796814, - "logits/rejected": 1.330685019493103, - "logps/chosen": -455.71124267578125, - "logps/rejected": -659.052978515625, - "loss": 0.3223, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -2.0765323638916016, - "rewards/margins": 2.156247615814209, - "rewards/rejected": -4.2327799797058105, + "epoch": 0.64, + "grad_norm": 45.49524925522091, + "learning_rate": 1.6854985675997063e-07, + "logits/chosen": 0.5498164296150208, + "logits/rejected": 0.9551798105239868, + "logps/chosen": -438.2308044433594, + "logps/rejected": -576.6223754882812, + "loss": 0.3709, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.190483570098877, + "rewards/margins": 1.594870686531067, + "rewards/rejected": -3.7853546142578125, "step": 560 }, { - "epoch": 0.42, - "grad_norm": 48.6969642598068, - "learning_rate": 3.5700784771881224e-07, - "logits/chosen": 1.0166234970092773, - "logits/rejected": 1.6870880126953125, - "logps/chosen": -478.86407470703125, - "logps/rejected": -635.7424926757812, - "loss": 0.3382, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -2.4357941150665283, - "rewards/margins": 1.9054218530654907, - "rewards/rejected": -4.341216087341309, + "epoch": 0.66, + "grad_norm": 49.552774447582685, + "learning_rate": 1.5911068067978818e-07, + "logits/chosen": 1.0865147113800049, + "logits/rejected": 1.311841368675232, + "logps/chosen": -423.2745666503906, + "logps/rejected": -610.8968505859375, + "loss": 0.3604, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.3493454456329346, + "rewards/margins": 1.8509175777435303, + "rewards/rejected": -4.200263023376465, "step": 570 }, { - "epoch": 0.43, - "grad_norm": 43.243072977055355, - "learning_rate": 3.511111080598925e-07, - "logits/chosen": 0.6339820623397827, - "logits/rejected": 1.3627948760986328, - "logps/chosen": -447.268798828125, - "logps/rejected": -636.5888671875, - "loss": 0.3276, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -1.9055280685424805, - "rewards/margins": 2.3114867210388184, - "rewards/rejected": -4.217014312744141, + "epoch": 0.67, + "grad_norm": 63.52041865054028, + "learning_rate": 1.4981855017728197e-07, + "logits/chosen": 0.6576471328735352, + "logits/rejected": 1.0078567266464233, + "logps/chosen": -491.30853271484375, + "logps/rejected": -656.1266479492188, + "loss": 0.3887, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.717817783355713, + "rewards/margins": 1.6682507991790771, + "rewards/rejected": -4.386068820953369, "step": 580 }, { - "epoch": 0.44, - "grad_norm": 69.40196325230258, - "learning_rate": 3.451463250063146e-07, - "logits/chosen": 0.8395903706550598, - "logits/rejected": 1.488012671470642, - "logps/chosen": -432.853271484375, - "logps/rejected": -630.223876953125, - "loss": 0.3378, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.9797086715698242, - "rewards/margins": 2.143889904022217, - "rewards/rejected": -4.123598098754883, + "epoch": 0.68, + "grad_norm": 43.46932861137291, + "learning_rate": 1.406884985556804e-07, + "logits/chosen": 0.5208752155303955, + "logits/rejected": 1.1108620166778564, + "logps/chosen": -435.3440856933594, + "logps/rejected": -587.5715942382812, + "loss": 0.3827, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.262911081314087, + "rewards/margins": 1.835397720336914, + "rewards/rejected": -4.098308563232422, "step": 590 }, { - "epoch": 0.45, - "grad_norm": 59.19017069860126, - "learning_rate": 3.3911751259862403e-07, - "logits/chosen": 0.9315579533576965, - "logits/rejected": 1.3961995840072632, - "logps/chosen": -493.1189880371094, - "logps/rejected": -684.4100341796875, - "loss": 0.3291, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.291141986846924, - "rewards/margins": 2.0969302654266357, - "rewards/rejected": -4.3880720138549805, + "epoch": 0.69, + "grad_norm": 55.685425152683656, + "learning_rate": 1.3173529689837354e-07, + "logits/chosen": 0.7303274869918823, + "logits/rejected": 1.3893494606018066, + "logps/chosen": -411.096435546875, + "logps/rejected": -538.6531982421875, + "loss": 0.3817, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.0049891471862793, + "rewards/margins": 1.6658912897109985, + "rewards/rejected": -3.670880079269409, "step": 600 }, { - "epoch": 0.45, - "eval_logits/chosen": -0.2334394007921219, - "eval_logits/rejected": 0.188625305891037, - "eval_logps/chosen": -477.9444580078125, - "eval_logps/rejected": -595.6332397460938, - "eval_loss": 0.5102677941322327, - "eval_rewards/accuracies": 0.76953125, - "eval_rewards/chosen": -1.6616793870925903, - "eval_rewards/margins": 1.1194015741348267, - "eval_rewards/rejected": -2.781080961227417, - "eval_runtime": 97.2562, - "eval_samples_per_second": 20.564, - "eval_steps_per_second": 0.329, + "epoch": 0.69, + "eval_logits/chosen": 0.45960405468940735, + "eval_logits/rejected": 0.9378364086151123, + "eval_logps/chosen": -488.27398681640625, + "eval_logps/rejected": -576.624755859375, + "eval_loss": 0.43982604146003723, + "eval_rewards/accuracies": 0.8515625, + "eval_rewards/chosen": -1.5276248455047607, + "eval_rewards/margins": 0.9075638055801392, + "eval_rewards/rejected": -2.4351885318756104, + "eval_runtime": 98.0553, + "eval_samples_per_second": 20.397, + "eval_steps_per_second": 0.326, "step": 600 }, { - "epoch": 0.45, - "grad_norm": 37.653590501774474, - "learning_rate": 3.3302872796634754e-07, - "logits/chosen": 0.9580332040786743, - "logits/rejected": 1.3357497453689575, - "logps/chosen": -427.964111328125, - "logps/rejected": -620.7327880859375, - "loss": 0.3122, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -1.95559823513031, - "rewards/margins": 2.1169991493225098, - "rewards/rejected": -4.072597503662109, + "epoch": 0.7, + "grad_norm": 51.141776378841776, + "learning_rate": 1.2297343017146726e-07, + "logits/chosen": 1.2994943857192993, + "logits/rejected": 1.7489960193634033, + "logps/chosen": -437.8944396972656, + "logps/rejected": -563.5843505859375, + "loss": 0.3939, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.239246129989624, + "rewards/margins": 1.5640047788619995, + "rewards/rejected": -3.803250789642334, "step": 610 }, { - "epoch": 0.46, - "grad_norm": 47.96131506831022, - "learning_rate": 3.2688406859772035e-07, - "logits/chosen": 0.8878351449966431, - "logits/rejected": 1.4351171255111694, - "logps/chosen": -489.7989196777344, - "logps/rejected": -665.8047485351562, - "loss": 0.3224, + "epoch": 0.71, + "grad_norm": 47.72075892004601, + "learning_rate": 1.1441707378923474e-07, + "logits/chosen": 0.8435400128364563, + "logits/rejected": 1.541442632675171, + "logps/chosen": -382.5939025878906, + "logps/rejected": -538.156005859375, + "loss": 0.3816, "rewards/accuracies": 0.84375, - "rewards/chosen": -2.195067882537842, - "rewards/margins": 2.1086602210998535, - "rewards/rejected": -4.3037285804748535, + "rewards/chosen": -1.7874940633773804, + "rewards/margins": 1.7763278484344482, + "rewards/rejected": -3.5638222694396973, "step": 620 }, { - "epoch": 0.47, - "grad_norm": 65.32009143781127, - "learning_rate": 3.206876695822541e-07, - "logits/chosen": 1.3710159063339233, - "logits/rejected": 1.7163244485855103, - "logps/chosen": -493.956298828125, - "logps/rejected": -688.6646728515625, - "loss": 0.3129, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -2.438476085662842, - "rewards/margins": 2.2680106163024902, - "rewards/rejected": -4.706486701965332, + "epoch": 0.73, + "grad_norm": 46.403069038802954, + "learning_rate": 1.06080070680377e-07, + "logits/chosen": 1.035014271736145, + "logits/rejected": 1.5653488636016846, + "logps/chosen": -415.8087463378906, + "logps/rejected": -552.2741088867188, + "loss": 0.3781, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8548622131347656, + "rewards/margins": 1.7130234241485596, + "rewards/rejected": -3.567885637283325, "step": 630 }, { - "epoch": 0.48, - "grad_norm": 66.03238810693847, - "learning_rate": 3.144437008280012e-07, - "logits/chosen": 0.709919273853302, - "logits/rejected": 1.0818461179733276, - "logps/chosen": -468.56890869140625, - "logps/rejected": -691.1434326171875, - "loss": 0.3232, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.252897262573242, - "rewards/margins": 2.3767807483673096, - "rewards/rejected": -4.629677772521973, + "epoch": 0.74, + "grad_norm": 42.088226042573645, + "learning_rate": 9.797590889219587e-08, + "logits/chosen": 0.6099433302879333, + "logits/rejected": 1.3236600160598755, + "logps/chosen": -412.41351318359375, + "logps/rejected": -552.0162353515625, + "loss": 0.389, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.8625577688217163, + "rewards/margins": 1.764461874961853, + "rewards/rejected": -3.6270194053649902, "step": 640 }, { - "epoch": 0.48, - "grad_norm": 47.885060646853404, - "learning_rate": 3.0815636425538665e-07, - "logits/chosen": 1.0194989442825317, - "logits/rejected": 1.571274995803833, - "logps/chosen": -446.6681213378906, - "logps/rejected": -611.84033203125, - "loss": 0.3429, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -2.190187454223633, - "rewards/margins": 2.0423951148986816, - "rewards/rejected": -4.232582092285156, + "epoch": 0.75, + "grad_norm": 48.75764274598144, + "learning_rate": 9.011769976891367e-08, + "logits/chosen": 1.0017446279525757, + "logits/rejected": 1.4410614967346191, + "logps/chosen": -427.5480041503906, + "logps/rejected": -576.4400634765625, + "loss": 0.3789, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.1677966117858887, + "rewards/margins": 1.6422433853149414, + "rewards/rejected": -3.810039520263672, "step": 650 }, { - "epoch": 0.49, - "grad_norm": 59.75526535732341, - "learning_rate": 3.018298909694986e-07, - "logits/chosen": 1.3580573797225952, - "logits/rejected": 1.913851022720337, - "logps/chosen": -489.56982421875, - "logps/rejected": -673.2572021484375, - "loss": 0.3288, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -2.595083236694336, - "rewards/margins": 2.0307328701019287, - "rewards/rejected": -4.6258158683776855, + "epoch": 0.76, + "grad_norm": 48.19640920928193, + "learning_rate": 8.251815673944218e-08, + "logits/chosen": 1.2581113576889038, + "logits/rejected": 1.898616075515747, + "logps/chosen": -470.30450439453125, + "logps/rejected": -610.0867919921875, + "loss": 0.3836, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.4716596603393555, + "rewards/margins": 1.8109443187713623, + "rewards/rejected": -4.2826032638549805, "step": 660 }, { - "epoch": 0.5, - "grad_norm": 51.20761564052719, - "learning_rate": 2.954685384127371e-07, - "logits/chosen": 0.8674410581588745, - "logits/rejected": 1.4072096347808838, - "logps/chosen": -482.65789794921875, - "logps/rejected": -649.311279296875, - "loss": 0.301, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -2.194945812225342, - "rewards/margins": 2.093947172164917, - "rewards/rejected": -4.288893222808838, + "epoch": 0.77, + "grad_norm": 42.83800076757557, + "learning_rate": 7.518957474892148e-08, + "logits/chosen": 1.1020927429199219, + "logits/rejected": 1.865277886390686, + "logps/chosen": -445.0255432128906, + "logps/rejected": -610.1484375, + "loss": 0.3632, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2662646770477295, + "rewards/margins": 2.038935661315918, + "rewards/rejected": -4.305201053619385, "step": 670 }, { - "epoch": 0.51, - "grad_norm": 62.65952308868226, - "learning_rate": 2.8907658749974054e-07, - "logits/chosen": 0.9979363679885864, - "logits/rejected": 1.4131087064743042, - "logps/chosen": -457.8363342285156, - "logps/rejected": -703.2235107421875, - "loss": 0.2929, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.264411449432373, - "rewards/margins": 2.5431039333343506, - "rewards/rejected": -4.807515621185303, + "epoch": 0.78, + "grad_norm": 43.94063414538923, + "learning_rate": 6.814381036730274e-08, + "logits/chosen": 1.0993672609329224, + "logits/rejected": 1.476314663887024, + "logps/chosen": -414.5472717285156, + "logps/rejected": -555.24169921875, + "loss": 0.3891, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.070984363555908, + "rewards/margins": 1.6533327102661133, + "rewards/rejected": -3.7243168354034424, "step": 680 }, { - "epoch": 0.51, - "grad_norm": 49.65473672539794, - "learning_rate": 2.8265833973651503e-07, - "logits/chosen": 0.6275979280471802, - "logits/rejected": 1.0561200380325317, - "logps/chosen": -459.69976806640625, - "logps/rejected": -684.1864013671875, - "loss": 0.2859, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -1.8421128988265991, - "rewards/margins": 2.5259382724761963, - "rewards/rejected": -4.368051528930664, + "epoch": 0.79, + "grad_norm": 48.996886773937526, + "learning_rate": 6.139226260715872e-08, + "logits/chosen": 1.0745365619659424, + "logits/rejected": 1.6123872995376587, + "logps/chosen": -420.8257751464844, + "logps/rejected": -591.067138671875, + "loss": 0.3752, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.08198618888855, + "rewards/margins": 1.920911431312561, + "rewards/rejected": -4.0028977394104, "step": 690 }, { - "epoch": 0.52, - "grad_norm": 48.72864396453521, - "learning_rate": 2.7621811432570736e-07, - "logits/chosen": 0.8585799336433411, - "logits/rejected": 1.5937745571136475, - "logps/chosen": -518.5455932617188, - "logps/rejected": -734.5382690429688, - "loss": 0.2735, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.441080093383789, - "rewards/margins": 2.6617679595947266, - "rewards/rejected": -5.102847576141357, + "epoch": 0.81, + "grad_norm": 44.913687680486454, + "learning_rate": 5.4945854481754734e-08, + "logits/chosen": 1.2772502899169922, + "logits/rejected": 1.7744108438491821, + "logps/chosen": -427.616943359375, + "logps/rejected": -586.7508544921875, + "loss": 0.3613, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.3168463706970215, + "rewards/margins": 1.800844430923462, + "rewards/rejected": -4.117690563201904, "step": 700 }, { - "epoch": 0.52, - "eval_logits/chosen": 0.18704134225845337, - "eval_logits/rejected": 0.6721899509429932, - "eval_logps/chosen": -541.279541015625, - "eval_logps/rejected": -687.587158203125, - "eval_loss": 0.5288776159286499, - "eval_rewards/accuracies": 0.76171875, - "eval_rewards/chosen": -2.2950310707092285, - "eval_rewards/margins": 1.40558922290802, - "eval_rewards/rejected": -3.70061993598938, - "eval_runtime": 97.5006, - "eval_samples_per_second": 20.513, - "eval_steps_per_second": 0.328, + "epoch": 0.81, + "eval_logits/chosen": 0.4112035036087036, + "eval_logits/rejected": 0.9228037595748901, + "eval_logps/chosen": -482.6694641113281, + "eval_logps/rejected": -572.7808837890625, + "eval_loss": 0.4307582378387451, + "eval_rewards/accuracies": 0.87109375, + "eval_rewards/chosen": -1.4715794324874878, + "eval_rewards/margins": 0.9251713752746582, + "eval_rewards/rejected": -2.3967509269714355, + "eval_runtime": 97.8788, + "eval_samples_per_second": 20.433, + "eval_steps_per_second": 0.327, "step": 700 }, { - "epoch": 0.53, - "grad_norm": 50.62866425523001, - "learning_rate": 2.6976024525996917e-07, - "logits/chosen": 1.1524347066879272, - "logits/rejected": 1.7467842102050781, - "logps/chosen": -503.6927795410156, - "logps/rejected": -780.6187744140625, - "loss": 0.286, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.7125723361968994, - "rewards/margins": 2.8134512901306152, - "rewards/rejected": -5.5260233879089355, + "epoch": 0.82, + "grad_norm": 45.470449244785705, + "learning_rate": 4.881501533321605e-08, + "logits/chosen": 1.6023136377334595, + "logits/rejected": 2.1105284690856934, + "logps/chosen": -408.48602294921875, + "logps/rejected": -587.4254150390625, + "loss": 0.355, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.2400927543640137, + "rewards/margins": 1.9537967443466187, + "rewards/rejected": -4.193889617919922, "step": 710 }, { - "epoch": 0.53, - "grad_norm": 56.03367218705217, - "learning_rate": 2.6328907840536706e-07, - "logits/chosen": 0.7062090039253235, - "logits/rejected": 1.2199087142944336, - "logps/chosen": -460.45794677734375, - "logps/rejected": -685.5617065429688, - "loss": 0.3244, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -2.43827748298645, - "rewards/margins": 2.252427577972412, - "rewards/rejected": -4.690704822540283, + "epoch": 0.83, + "grad_norm": 42.625832201611274, + "learning_rate": 4.300966395938377e-08, + "logits/chosen": 1.143576741218567, + "logits/rejected": 1.7894223928451538, + "logps/chosen": -466.27520751953125, + "logps/rejected": -622.3167724609375, + "loss": 0.3773, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.4046239852905273, + "rewards/margins": 1.8917633295059204, + "rewards/rejected": -4.296387672424316, "step": 720 }, { - "epoch": 0.54, - "grad_norm": 57.82647372234183, - "learning_rate": 2.568089685768038e-07, - "logits/chosen": 0.6572129130363464, - "logits/rejected": 1.0754339694976807, - "logps/chosen": -530.2496337890625, - "logps/rejected": -698.03662109375, - "loss": 0.313, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -2.59128475189209, - "rewards/margins": 2.117705821990967, - "rewards/rejected": -4.708990573883057, + "epoch": 0.84, + "grad_norm": 40.15824305792834, + "learning_rate": 3.7539192566655246e-08, + "logits/chosen": 1.1031806468963623, + "logits/rejected": 1.7454750537872314, + "logps/chosen": -413.6033630371094, + "logps/rejected": -561.4763793945312, + "loss": 0.3708, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.902054786682129, + "rewards/margins": 1.8310045003890991, + "rewards/rejected": -3.7330594062805176, "step": 730 }, { - "epoch": 0.55, - "grad_norm": 50.473574423912424, - "learning_rate": 2.503242766074156e-07, - "logits/chosen": 0.42826253175735474, - "logits/rejected": 1.0195951461791992, - "logps/chosen": -451.046142578125, - "logps/rejected": -653.2913818359375, - "loss": 0.2898, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -1.9979403018951416, - "rewards/margins": 2.318507432937622, - "rewards/rejected": -4.316447734832764, + "epoch": 0.85, + "grad_norm": 44.28447675992636, + "learning_rate": 3.24124515747731e-08, + "logits/chosen": 1.2360585927963257, + "logits/rejected": 1.7114673852920532, + "logps/chosen": -428.1229553222656, + "logps/rejected": -599.7882690429688, + "loss": 0.3783, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.270901679992676, + "rewards/margins": 1.863284707069397, + "rewards/rejected": -4.134186267852783, "step": 740 }, { - "epoch": 0.56, - "grad_norm": 61.13648555404995, - "learning_rate": 2.4383936641392136e-07, - "logits/chosen": 0.6429548859596252, - "logits/rejected": 1.103127360343933, - "logps/chosen": -467.82049560546875, - "logps/rejected": -702.5692749023438, - "loss": 0.2975, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -2.0785393714904785, - "rewards/margins": 2.386026382446289, - "rewards/rejected": -4.464566230773926, + "epoch": 0.86, + "grad_norm": 48.30307447289823, + "learning_rate": 2.763773529814506e-08, + "logits/chosen": 0.926419734954834, + "logits/rejected": 1.3899872303009033, + "logps/chosen": -468.01031494140625, + "logps/rejected": -613.7434692382812, + "loss": 0.3788, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.2661325931549072, + "rewards/margins": 1.9401893615722656, + "rewards/rejected": -4.206322193145752, "step": 750 }, { - "epoch": 0.56, - "grad_norm": 51.760001565819636, - "learning_rate": 2.3735860205989493e-07, - "logits/chosen": 0.7451823353767395, - "logits/rejected": 1.1489431858062744, - "logps/chosen": -462.767333984375, - "logps/rejected": -706.5615234375, - "loss": 0.2627, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.312885284423828, - "rewards/margins": 2.6091692447662354, - "rewards/rejected": -4.922054767608643, + "epoch": 0.88, + "grad_norm": 56.11779432942193, + "learning_rate": 2.3222768526860698e-08, + "logits/chosen": 1.0837215185165405, + "logits/rejected": 1.9027248620986938, + "logps/chosen": -431.9537048339844, + "logps/rejected": -601.56787109375, + "loss": 0.3908, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.2378554344177246, + "rewards/margins": 1.9761545658111572, + "rewards/rejected": -4.214009761810303, "step": 760 }, { - "epoch": 0.57, - "grad_norm": 56.13632726849474, - "learning_rate": 2.308863448189402e-07, - "logits/chosen": 0.5960752367973328, - "logits/rejected": 1.0421712398529053, - "logps/chosen": -498.1941833496094, - "logps/rejected": -695.0504760742188, - "loss": 0.2811, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.29612398147583, - "rewards/margins": 2.4551825523376465, - "rewards/rejected": -4.751306533813477, + "epoch": 0.89, + "grad_norm": 40.169886925607514, + "learning_rate": 1.9174694029115146e-08, + "logits/chosen": 1.0360382795333862, + "logits/rejected": 1.7473223209381104, + "logps/chosen": -454.27264404296875, + "logps/rejected": -561.0426635742188, + "loss": 0.3798, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.1006946563720703, + "rewards/margins": 1.62350332736969, + "rewards/rejected": -3.72419810295105, "step": 770 }, { - "epoch": 0.58, - "grad_norm": 67.7549300842345, - "learning_rate": 2.2442695023974246e-07, - "logits/chosen": 0.6856900453567505, - "logits/rejected": 1.3306076526641846, - "logps/chosen": -444.3168029785156, - "logps/rejected": -679.816650390625, - "loss": 0.2713, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.0717947483062744, - "rewards/margins": 2.6752490997314453, - "rewards/rejected": -4.747043609619141, + "epoch": 0.9, + "grad_norm": 47.263235112753776, + "learning_rate": 1.5500060995258134e-08, + "logits/chosen": 1.1812760829925537, + "logits/rejected": 1.7791898250579834, + "logps/chosen": -428.4283142089844, + "logps/rejected": -571.8958129882812, + "loss": 0.349, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.121316909790039, + "rewards/margins": 1.8303005695343018, + "rewards/rejected": -3.9516170024871826, "step": 780 }, { - "epoch": 0.59, - "grad_norm": 55.628538802719504, - "learning_rate": 2.179847652149729e-07, - "logits/chosen": 0.7401930093765259, - "logits/rejected": 1.288172960281372, - "logps/chosen": -496.6468811035156, - "logps/rejected": -687.7960205078125, - "loss": 0.295, + "epoch": 0.91, + "grad_norm": 54.446594272094224, + "learning_rate": 1.2204814442165812e-08, + "logits/chosen": 1.0776536464691162, + "logits/rejected": 1.8896220922470093, + "logps/chosen": -431.5889587402344, + "logps/rejected": -589.2385864257812, + "loss": 0.3802, "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -2.4609100818634033, - "rewards/margins": 2.223629951477051, - "rewards/rejected": -4.684540271759033, + "rewards/chosen": -2.211080312728882, + "rewards/margins": 2.0271711349487305, + "rewards/rejected": -4.238251686096191, "step": 790 }, { - "epoch": 0.59, - "grad_norm": 63.651106043315345, - "learning_rate": 2.115641250560183e-07, - "logits/chosen": 0.8801604509353638, - "logits/rejected": 1.5266039371490479, - "logps/chosen": -473.2115173339844, - "logps/rejected": -701.8800659179688, - "loss": 0.2752, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -2.4201507568359375, - "rewards/margins": 2.4442293643951416, - "rewards/rejected": -4.864380836486816, - "step": 800 - }, - { - "epoch": 0.59, - "eval_logits/chosen": -0.16280797123908997, - "eval_logits/rejected": 0.2751551866531372, - "eval_logps/chosen": -533.1201782226562, - "eval_logps/rejected": -668.2235717773438, - "eval_loss": 0.5228938460350037, - "eval_rewards/accuracies": 0.765625, - "eval_rewards/chosen": -2.2134366035461426, - "eval_rewards/margins": 1.2935477495193481, - "eval_rewards/rejected": -3.506984233856201, - "eval_runtime": 97.387, - "eval_samples_per_second": 20.537, - "eval_steps_per_second": 0.329, + "epoch": 0.92, + "grad_norm": 48.742336588451536, + "learning_rate": 9.294285595075669e-09, + "logits/chosen": 0.6350497007369995, + "logits/rejected": 1.2895339727401733, + "logps/chosen": -462.9306640625, + "logps/rejected": -595.6204223632812, + "loss": 0.4032, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.274238348007202, + "rewards/margins": 1.7595179080963135, + "rewards/rejected": -4.033755779266357, "step": 800 }, - { - "epoch": 0.6, - "grad_norm": 70.2608582618962, - "learning_rate": 2.051693505755042e-07, - "logits/chosen": 0.8354732394218445, - "logits/rejected": 1.2750941514968872, - "logps/chosen": -461.49786376953125, - "logps/rejected": -705.8599853515625, - "loss": 0.2946, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.4096267223358154, - "rewards/margins": 2.483677864074707, - "rewards/rejected": -4.893305778503418, - "step": 810 - }, - { - "epoch": 0.61, - "grad_norm": 49.246802198712466, - "learning_rate": 1.9880474517957542e-07, - "logits/chosen": 0.9254199862480164, - "logits/rejected": 1.563522458076477, - "logps/chosen": -481.2748107910156, - "logps/rejected": -658.328125, - "loss": 0.2674, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -2.385385036468506, - "rewards/margins": 2.1492881774902344, - "rewards/rejected": -4.53467321395874, - "step": 820 - }, - { - "epoch": 0.62, - "grad_norm": 88.28145029556197, - "learning_rate": 1.9247459197189e-07, - "logits/chosen": 0.8668380975723267, - "logits/rejected": 1.5001232624053955, - "logps/chosen": -488.27685546875, - "logps/rejected": -680.9069213867188, - "loss": 0.2652, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.6699295043945312, - "rewards/margins": 2.2055306434631348, - "rewards/rejected": -4.875459671020508, - "step": 830 - }, - { - "epoch": 0.62, - "grad_norm": 43.13543734061108, - "learning_rate": 1.8618315087127602e-07, - "logits/chosen": 0.6826521754264832, - "logits/rejected": 1.2443543672561646, - "logps/chosen": -499.20892333984375, - "logps/rejected": -706.3511962890625, - "loss": 0.2563, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -2.4423558712005615, - "rewards/margins": 2.461874485015869, - "rewards/rejected": -4.904230117797852, - "step": 840 - }, - { - "epoch": 0.63, - "grad_norm": 56.63843357010467, - "learning_rate": 1.7993465574499102e-07, - "logits/chosen": 0.5323538184165955, - "logits/rejected": 1.2176125049591064, - "logps/chosen": -463.47857666015625, - "logps/rejected": -663.4465942382812, - "loss": 0.2759, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.189335823059082, - "rewards/margins": 2.420409679412842, - "rewards/rejected": -4.609745502471924, - "step": 850 - }, - { - "epoch": 0.64, - "grad_norm": 56.31423994279339, - "learning_rate": 1.7373331155951233e-07, - "logits/chosen": 0.8688204884529114, - "logits/rejected": 1.4698970317840576, - "logps/chosen": -510.4227600097656, - "logps/rejected": -748.5259399414062, - "loss": 0.2649, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -2.550417900085449, - "rewards/margins": 2.730776309967041, - "rewards/rejected": -5.28119421005249, - "step": 860 - }, - { - "epoch": 0.65, - "grad_norm": 50.688626621321205, - "learning_rate": 1.6758329155077743e-07, - "logits/chosen": 1.0613950490951538, - "logits/rejected": 1.5818780660629272, - "logps/chosen": -495.5560607910156, - "logps/rejected": -708.2391967773438, - "loss": 0.2711, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -2.642883777618408, - "rewards/margins": 2.6204209327697754, - "rewards/rejected": -5.263304710388184, - "step": 870 - }, - { - "epoch": 0.65, - "grad_norm": 46.10359729315069, - "learning_rate": 1.6148873441577662e-07, - "logits/chosen": 1.0479947328567505, - "logits/rejected": 1.5524357557296753, - "logps/chosen": -480.2462463378906, - "logps/rejected": -707.98681640625, - "loss": 0.2699, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.261603355407715, - "rewards/margins": 2.4961774349212646, - "rewards/rejected": -4.757781028747559, - "step": 880 - }, - { - "epoch": 0.66, - "grad_norm": 41.346767344116245, - "learning_rate": 1.5545374152738934e-07, - "logits/chosen": 1.1905092000961304, - "logits/rejected": 1.6182410717010498, - "logps/chosen": -468.92083740234375, - "logps/rejected": -689.1092529296875, - "loss": 0.2722, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.264604091644287, - "rewards/margins": 2.391749143600464, - "rewards/rejected": -4.65635347366333, - "step": 890 - }, - { - "epoch": 0.67, - "grad_norm": 60.48896334839974, - "learning_rate": 1.4948237417433775e-07, - "logits/chosen": 1.380293369293213, - "logits/rejected": 2.2697908878326416, - "logps/chosen": -436.1393127441406, - "logps/rejected": -673.2228393554688, - "loss": 0.2492, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -2.151729106903076, - "rewards/margins": 2.624401330947876, - "rewards/rejected": -4.776130676269531, - "step": 900 - }, - { - "epoch": 0.67, - "eval_logits/chosen": 0.5183509588241577, - "eval_logits/rejected": 1.0725551843643188, - "eval_logps/chosen": -518.2382202148438, - "eval_logps/rejected": -652.8116455078125, - "eval_loss": 0.5152209997177124, - "eval_rewards/accuracies": 0.7734375, - "eval_rewards/chosen": -2.064617395401001, - "eval_rewards/margins": 1.2882475852966309, - "eval_rewards/rejected": -3.352864980697632, - "eval_runtime": 97.3137, - "eval_samples_per_second": 20.552, - "eval_steps_per_second": 0.329, - "step": 900 - }, - { - "epoch": 0.68, - "grad_norm": 59.39383985362304, - "learning_rate": 1.435786508281158e-07, - "logits/chosen": 1.9009380340576172, - "logits/rejected": 2.567354679107666, - "logps/chosen": -482.70513916015625, - "logps/rejected": -720.0316162109375, - "loss": 0.2499, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -2.3441195487976074, - "rewards/margins": 2.6516547203063965, - "rewards/rejected": -4.995774269104004, - "step": 910 - }, - { - "epoch": 0.68, - "grad_norm": 58.953283614647454, - "learning_rate": 1.3774654443873174e-07, - "logits/chosen": 1.749333381652832, - "logits/rejected": 2.4905173778533936, - "logps/chosen": -512.65625, - "logps/rejected": -763.8499145507812, - "loss": 0.2542, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -2.783947467803955, - "rewards/margins": 2.989567756652832, - "rewards/rejected": -5.773515224456787, - "step": 920 - }, - { - "epoch": 0.69, - "grad_norm": 57.229551980352035, - "learning_rate": 1.31989979761085e-07, - "logits/chosen": 1.3056137561798096, - "logits/rejected": 2.2303478717803955, - "logps/chosen": -465.61627197265625, - "logps/rejected": -746.7559814453125, - "loss": 0.2416, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -2.5093438625335693, - "rewards/margins": 3.106735944747925, - "rewards/rejected": -5.616079807281494, - "step": 930 - }, - { - "epoch": 0.7, - "grad_norm": 53.92751444407525, - "learning_rate": 1.2631283071377618e-07, - "logits/chosen": 1.6224052906036377, - "logits/rejected": 1.9630991220474243, - "logps/chosen": -458.9669494628906, - "logps/rejected": -742.6818237304688, - "loss": 0.2429, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.4590606689453125, - "rewards/margins": 2.7507693767547607, - "rewards/rejected": -5.209830284118652, - "step": 940 - }, - { - "epoch": 0.71, - "grad_norm": 48.183067890071925, - "learning_rate": 1.2071891777212744e-07, - "logits/chosen": 1.061023235321045, - "logits/rejected": 1.9151092767715454, - "logps/chosen": -507.06744384765625, - "logps/rejected": -707.039794921875, - "loss": 0.253, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.448425054550171, - "rewards/margins": 2.3641083240509033, - "rewards/rejected": -4.812533855438232, - "step": 950 - }, - { - "epoch": 0.71, - "grad_norm": 48.31856194766799, - "learning_rate": 1.1521200539716874e-07, - "logits/chosen": 1.2143045663833618, - "logits/rejected": 1.9916166067123413, - "logps/chosen": -500.71038818359375, - "logps/rejected": -771.3677978515625, - "loss": 0.2426, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.3821799755096436, - "rewards/margins": 3.1737558841705322, - "rewards/rejected": -5.555935859680176, - "step": 960 - }, - { - "epoch": 0.72, - "grad_norm": 57.66373376149326, - "learning_rate": 1.0979579950231821e-07, - "logits/chosen": 1.1112618446350098, - "logits/rejected": 2.246898889541626, - "logps/chosen": -502.126220703125, - "logps/rejected": -734.8248901367188, - "loss": 0.241, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -2.395838737487793, - "rewards/margins": 2.6420400142669678, - "rewards/rejected": -5.03787899017334, - "step": 970 - }, - { - "epoch": 0.73, - "grad_norm": 55.20670800594472, - "learning_rate": 1.0447394495946291e-07, - "logits/chosen": 1.387683391571045, - "logits/rejected": 2.400949478149414, - "logps/chosen": -515.9779052734375, - "logps/rejected": -765.4949340820312, - "loss": 0.2468, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -2.70845365524292, - "rewards/margins": 2.7117531299591064, - "rewards/rejected": -5.420206546783447, - "step": 980 - }, - { - "epoch": 0.74, - "grad_norm": 45.9412294534277, - "learning_rate": 9.925002314611841e-08, - "logits/chosen": 1.8099420070648193, - "logits/rejected": 2.5098319053649902, - "logps/chosen": -484.7242736816406, - "logps/rejected": -777.49169921875, - "loss": 0.2383, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.564988374710083, - "rewards/margins": 2.9337170124053955, - "rewards/rejected": -5.498705863952637, - "step": 990 - }, - { - "epoch": 0.74, - "grad_norm": 64.863814963629, - "learning_rate": 9.412754953531663e-08, - "logits/chosen": 1.5222892761230469, - "logits/rejected": 2.5317773818969727, - "logps/chosen": -507.424072265625, - "logps/rejected": -756.7098388671875, - "loss": 0.262, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.656026601791382, - "rewards/margins": 2.7969748973846436, - "rewards/rejected": -5.453001976013184, - "step": 1000 - }, - { - "epoch": 0.74, - "eval_logits/chosen": 0.6804571151733398, - "eval_logits/rejected": 1.3123811483383179, - "eval_logps/chosen": -556.8264770507812, - "eval_logps/rejected": -703.1602783203125, - "eval_loss": 0.5241079330444336, - "eval_rewards/accuracies": 0.76171875, - "eval_rewards/chosen": -2.4504995346069336, - "eval_rewards/margins": 1.405852198600769, - "eval_rewards/rejected": -3.856351613998413, - "eval_runtime": 97.4441, - "eval_samples_per_second": 20.525, - "eval_steps_per_second": 0.328, - "step": 1000 - }, - { - "epoch": 0.75, - "grad_norm": 69.68207773392557, - "learning_rate": 8.910997132984479e-08, - "logits/chosen": 1.820955514907837, - "logits/rejected": 2.952479839324951, - "logps/chosen": -544.1399536132812, - "logps/rejected": -808.0184936523438, - "loss": 0.2504, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.861184597015381, - "rewards/margins": 3.071931838989258, - "rewards/rejected": -5.933116436004639, - "step": 1010 - }, - { - "epoch": 0.76, - "grad_norm": 50.59071094029437, - "learning_rate": 8.42006651424274e-08, - "logits/chosen": 1.8404204845428467, - "logits/rejected": 2.6863815784454346, - "logps/chosen": -461.4169921875, - "logps/rejected": -703.1361083984375, - "loss": 0.2318, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.4329962730407715, - "rewards/margins": 2.7300188541412354, - "rewards/rejected": -5.163014888763428, - "step": 1020 - }, - { - "epoch": 0.77, - "grad_norm": 57.22762908033313, - "learning_rate": 7.940293472341217e-08, - "logits/chosen": 2.013861894607544, - "logits/rejected": 2.7502970695495605, - "logps/chosen": -477.7572326660156, - "logps/rejected": -773.4556884765625, - "loss": 0.2276, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.6210336685180664, - "rewards/margins": 3.139965057373047, - "rewards/rejected": -5.7609992027282715, - "step": 1030 - }, - { - "epoch": 0.77, - "grad_norm": 55.15868922046573, - "learning_rate": 7.472000873748918e-08, - "logits/chosen": 2.0298519134521484, - "logits/rejected": 2.990135431289673, - "logps/chosen": -528.5840454101562, - "logps/rejected": -781.4909057617188, - "loss": 0.2487, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.6361494064331055, - "rewards/margins": 2.9660372734069824, - "rewards/rejected": -5.602187156677246, - "step": 1040 - }, - { - "epoch": 0.78, - "grad_norm": 43.438291077124795, - "learning_rate": 7.015503859093927e-08, - "logits/chosen": 2.1326801776885986, - "logits/rejected": 2.5511794090270996, - "logps/chosen": -486.6455078125, - "logps/rejected": -757.7630004882812, - "loss": 0.2148, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.617185592651367, - "rewards/margins": 2.795973062515259, - "rewards/rejected": -5.413158893585205, - "step": 1050 - }, - { - "epoch": 0.79, - "grad_norm": 63.14016572546011, - "learning_rate": 6.571109631087451e-08, - "logits/chosen": 2.417752742767334, - "logits/rejected": 3.036146402359009, - "logps/chosen": -494.73046875, - "logps/rejected": -811.0126953125, - "loss": 0.2112, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -2.588284492492676, - "rewards/margins": 3.300442934036255, - "rewards/rejected": -5.888727188110352, - "step": 1060 - }, - { - "epoch": 0.79, - "grad_norm": 58.89863039830767, - "learning_rate": 6.139117247789687e-08, - "logits/chosen": 2.5516977310180664, - "logits/rejected": 3.055995464324951, - "logps/chosen": -535.7842407226562, - "logps/rejected": -800.0374145507812, - "loss": 0.2248, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.956123113632202, - "rewards/margins": 2.720890998840332, - "rewards/rejected": -5.677014350891113, - "step": 1070 - }, - { - "epoch": 0.8, - "grad_norm": 41.21215573686561, - "learning_rate": 5.719817421356685e-08, - "logits/chosen": 1.9021530151367188, - "logits/rejected": 2.7421538829803467, - "logps/chosen": -549.5343017578125, - "logps/rejected": -820.0753784179688, - "loss": 0.2033, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -2.7265052795410156, - "rewards/margins": 3.281470537185669, - "rewards/rejected": -6.007976055145264, - "step": 1080 - }, - { - "epoch": 0.81, - "grad_norm": 58.39711865385947, - "learning_rate": 5.313492322403701e-08, - "logits/chosen": 2.2018539905548096, - "logits/rejected": 2.951138496398926, - "logps/chosen": -533.9331665039062, - "logps/rejected": -891.0558471679688, - "loss": 0.1937, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -2.8866357803344727, - "rewards/margins": 3.6149306297302246, - "rewards/rejected": -6.501566410064697, - "step": 1090 - }, - { - "epoch": 0.82, - "grad_norm": 51.18256501676837, - "learning_rate": 4.9204153901165805e-08, - "logits/chosen": 1.9893665313720703, - "logits/rejected": 2.7781219482421875, - "logps/chosen": -530.7794189453125, - "logps/rejected": -824.0559692382812, - "loss": 0.2299, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.8573508262634277, - "rewards/margins": 3.2189173698425293, - "rewards/rejected": -6.076268196105957, - "step": 1100 - }, - { - "epoch": 0.82, - "eval_logits/chosen": 0.8391125202178955, - "eval_logits/rejected": 1.4834216833114624, - "eval_logps/chosen": -588.2494506835938, - "eval_logps/rejected": -741.857421875, - "eval_loss": 0.5312901139259338, - "eval_rewards/accuracies": 0.7578125, - "eval_rewards/chosen": -2.7647294998168945, - "eval_rewards/margins": 1.4785932302474976, - "eval_rewards/rejected": -4.243322849273682, - "eval_runtime": 97.5423, - "eval_samples_per_second": 20.504, - "eval_steps_per_second": 0.328, - "step": 1100 - }, - { - "epoch": 0.82, - "grad_norm": 68.60925195657734, - "learning_rate": 4.540851148239036e-08, - "logits/chosen": 1.7061752080917358, - "logits/rejected": 2.698995351791382, - "logps/chosen": -537.1931762695312, - "logps/rejected": -848.33154296875, - "loss": 0.2129, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": -2.7809014320373535, - "rewards/margins": 3.3348469734191895, - "rewards/rejected": -6.115748405456543, - "step": 1110 - }, - { - "epoch": 0.83, - "grad_norm": 48.80096479357628, - "learning_rate": 4.1750550270596206e-08, - "logits/chosen": 1.531884789466858, - "logits/rejected": 2.923696994781494, - "logps/chosen": -509.5885314941406, - "logps/rejected": -794.9307250976562, - "loss": 0.1954, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.630959987640381, - "rewards/margins": 3.3725571632385254, - "rewards/rejected": -6.003516674041748, - "step": 1120 - }, - { - "epoch": 0.84, - "grad_norm": 68.79197398198284, - "learning_rate": 3.823273191518234e-08, - "logits/chosen": 1.5292671918869019, - "logits/rejected": 2.3230159282684326, - "logps/chosen": -568.5833740234375, - "logps/rejected": -835.826171875, - "loss": 0.2178, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": -3.0106937885284424, - "rewards/margins": 3.2017643451690674, - "rewards/rejected": -6.212458610534668, - "step": 1130 - }, - { - "epoch": 0.85, - "grad_norm": 59.434543375011025, - "learning_rate": 3.485742375547745e-08, - "logits/chosen": 1.4421080350875854, - "logits/rejected": 2.442089796066284, - "logps/chosen": -553.727294921875, - "logps/rejected": -822.7138671875, - "loss": 0.2009, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -3.1090734004974365, - "rewards/margins": 2.9853668212890625, - "rewards/rejected": -6.094440460205078, - "step": 1140 - }, - { - "epoch": 0.85, - "grad_norm": 38.888275757403804, - "learning_rate": 3.162689722762365e-08, - "logits/chosen": 1.5811113119125366, - "logits/rejected": 2.2564284801483154, - "logps/chosen": -543.1163940429688, - "logps/rejected": -842.681640625, - "loss": 0.2095, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.9668571949005127, - "rewards/margins": 3.10882830619812, - "rewards/rejected": -6.075685024261475, - "step": 1150 - }, - { - "epoch": 0.86, - "grad_norm": 42.47551430381964, - "learning_rate": 2.8543326335997904e-08, - "logits/chosen": 1.768690824508667, - "logits/rejected": 2.4484939575195312, - "logps/chosen": -556.0635375976562, - "logps/rejected": -805.807373046875, - "loss": 0.2046, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.865739107131958, - "rewards/margins": 2.8989548683166504, - "rewards/rejected": -5.764693737030029, - "step": 1160 - }, - { - "epoch": 0.87, - "grad_norm": 59.36158165544989, - "learning_rate": 2.560878619020157e-08, - "logits/chosen": 1.9017894268035889, - "logits/rejected": 2.7026009559631348, - "logps/chosen": -521.269287109375, - "logps/rejected": -813.7127685546875, - "loss": 0.1964, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -2.9693474769592285, - "rewards/margins": 3.1322848796844482, - "rewards/rejected": -6.101632595062256, - "step": 1170 - }, - { - "epoch": 0.88, - "grad_norm": 49.475189963130575, - "learning_rate": 2.2825251608601466e-08, - "logits/chosen": 1.8870357275009155, - "logits/rejected": 2.8944287300109863, - "logps/chosen": -558.059814453125, - "logps/rejected": -868.568359375, - "loss": 0.1891, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -3.1376397609710693, - "rewards/margins": 3.2884891033172607, - "rewards/rejected": -6.426129341125488, - "step": 1180 - }, - { - "epoch": 0.88, - "grad_norm": 85.599165147591, - "learning_rate": 2.0194595789362474e-08, - "logits/chosen": 1.9095745086669922, - "logits/rejected": 2.530900478363037, - "logps/chosen": -577.1746826171875, - "logps/rejected": -892.88623046875, - "loss": 0.2027, - "rewards/accuracies": 0.9375, - "rewards/chosen": -3.0735995769500732, - "rewards/margins": 3.377427339553833, - "rewards/rejected": -6.451026916503906, - "step": 1190 - }, - { - "epoch": 0.89, - "grad_norm": 45.52491787365754, - "learning_rate": 1.7718589049866728e-08, - "logits/chosen": 2.376490592956543, - "logits/rejected": 3.1364424228668213, - "logps/chosen": -510.269287109375, - "logps/rejected": -829.1940307617188, - "loss": 0.1974, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.9278645515441895, - "rewards/margins": 3.433408737182617, - "rewards/rejected": -6.361273765563965, - "step": 1200 - }, - { - "epoch": 0.89, - "eval_logits/chosen": 0.8963963389396667, - "eval_logits/rejected": 1.5457934141159058, - "eval_logps/chosen": -606.617431640625, - "eval_logps/rejected": -764.6512451171875, - "eval_loss": 0.5366576910018921, - "eval_rewards/accuracies": 0.76171875, - "eval_rewards/chosen": -2.948409080505371, - "eval_rewards/margins": 1.5228519439697266, - "eval_rewards/rejected": -4.471261024475098, - "eval_runtime": 97.4355, - "eval_samples_per_second": 20.526, - "eval_steps_per_second": 0.328, - "step": 1200 - }, - { - "epoch": 0.9, - "grad_norm": 56.7147448955845, - "learning_rate": 1.539889763536645e-08, - "logits/chosen": 1.9441492557525635, - "logits/rejected": 3.0478804111480713, - "logps/chosen": -538.355224609375, - "logps/rejected": -856.01416015625, - "loss": 0.2187, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.842240571975708, - "rewards/margins": 3.5280959606170654, - "rewards/rejected": -6.370336055755615, - "step": 1210 - }, - { - "epoch": 0.91, - "grad_norm": 60.258963508413004, - "learning_rate": 1.3237082597673172e-08, - "logits/chosen": 2.1856608390808105, - "logits/rejected": 2.853616237640381, - "logps/chosen": -517.0845947265625, - "logps/rejected": -845.6990966796875, - "loss": 0.204, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -3.0185937881469727, - "rewards/margins": 3.2306289672851562, - "rewards/rejected": -6.249222755432129, - "step": 1220 - }, - { - "epoch": 0.91, - "grad_norm": 71.41232139420377, - "learning_rate": 1.1234598744637502e-08, - "logits/chosen": 1.5448696613311768, - "logits/rejected": 2.610525608062744, - "logps/chosen": -545.0371704101562, - "logps/rejected": -821.2421875, - "loss": 0.2063, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -3.1403965950012207, - "rewards/margins": 3.1843514442443848, - "rewards/rejected": -6.3247480392456055, - "step": 1230 - }, { "epoch": 0.92, - "grad_norm": 57.959377016977456, - "learning_rate": 9.392793661126414e-09, - "logits/chosen": 1.898782730102539, - "logits/rejected": 2.7061781883239746, - "logps/chosen": -582.9857177734375, - "logps/rejected": -879.3019409179688, - "loss": 0.1979, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -3.2453556060791016, - "rewards/margins": 3.297309160232544, - "rewards/rejected": -6.542665004730225, - "step": 1240 + "eval_logits/chosen": 0.34458938241004944, + "eval_logits/rejected": 0.8732683658599854, + "eval_logps/chosen": -476.1736145019531, + "eval_logps/rejected": -567.4185180664062, + "eval_loss": 0.42825520038604736, + "eval_rewards/accuracies": 0.859375, + "eval_rewards/chosen": -1.406620979309082, + "eval_rewards/margins": 0.9365058541297913, + "eval_rewards/rejected": -2.3431267738342285, + "eval_runtime": 97.856, + "eval_samples_per_second": 20.438, + "eval_steps_per_second": 0.327, + "step": 800 }, { "epoch": 0.93, - "grad_norm": 50.86760187147993, - "learning_rate": 7.71290680215711e-09, - "logits/chosen": 2.0340778827667236, - "logits/rejected": 2.8080642223358154, - "logps/chosen": -558.147705078125, - "logps/rejected": -874.9266357421875, - "loss": 0.1974, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -3.0640769004821777, - "rewards/margins": 3.380338668823242, - "rewards/rejected": -6.444415092468262, - "step": 1250 - }, - { - "epoch": 0.94, - "grad_norm": 61.973766270626015, - "learning_rate": 6.196068658797543e-09, - "logits/chosen": 1.8814232349395752, - "logits/rejected": 2.7813236713409424, - "logps/chosen": -551.5777587890625, - "logps/rejected": -826.7698974609375, - "loss": 0.1971, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.9602150917053223, - "rewards/margins": 3.0024728775024414, - "rewards/rejected": -5.9626874923706055, - "step": 1260 + "grad_norm": 49.18818695219976, + "learning_rate": 6.773183262446914e-09, + "logits/chosen": 1.1270101070404053, + "logits/rejected": 1.9546995162963867, + "logps/chosen": -432.09771728515625, + "logps/rejected": -578.98779296875, + "loss": 0.3869, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.2396090030670166, + "rewards/margins": 1.7456976175308228, + "rewards/rejected": -3.98530650138855, + "step": 810 }, { "epoch": 0.94, - "grad_norm": 67.6695850405579, - "learning_rate": 4.843299997394717e-09, - "logits/chosen": 1.856507658958435, - "logits/rejected": 2.7601516246795654, - "logps/chosen": -540.268310546875, - "logps/rejected": -846.9691162109375, - "loss": 0.2067, - "rewards/accuracies": 0.9375, - "rewards/chosen": -3.077454090118408, - "rewards/margins": 3.414836883544922, - "rewards/rejected": -6.492290496826172, - "step": 1270 - }, - { - "epoch": 0.95, - "grad_norm": 68.73319089653008, - "learning_rate": 3.655511172643372e-09, - "logits/chosen": 1.932074785232544, - "logits/rejected": 2.437225818634033, - "logps/chosen": -531.4140625, - "logps/rejected": -836.9505615234375, - "loss": 0.1876, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.8276994228363037, - "rewards/margins": 3.25665020942688, - "rewards/rejected": -6.084350109100342, - "step": 1280 + "grad_norm": 48.42955108889553, + "learning_rate": 4.645586217799452e-09, + "logits/chosen": 1.1346285343170166, + "logits/rejected": 1.6442772150039673, + "logps/chosen": -434.9295349121094, + "logps/rejected": -606.5115966796875, + "loss": 0.4024, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1286814212799072, + "rewards/margins": 1.8881809711456299, + "rewards/rejected": -4.016861915588379, + "step": 820 }, { "epoch": 0.96, - "grad_norm": 50.423800165908794, - "learning_rate": 2.633501514956532e-09, - "logits/chosen": 1.9169034957885742, - "logits/rejected": 2.7369441986083984, - "logps/chosen": -586.8289794921875, - "logps/rejected": -896.8014526367188, - "loss": 0.2044, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -3.1295228004455566, - "rewards/margins": 3.5232949256896973, - "rewards/rejected": -6.652817726135254, - "step": 1290 - }, - { - "epoch": 0.97, - "grad_norm": 57.31903342529662, - "learning_rate": 1.777958792550993e-09, - "logits/chosen": 1.5464543104171753, - "logits/rejected": 2.9688878059387207, - "logps/chosen": -587.2015380859375, - "logps/rejected": -853.0357666015625, - "loss": 0.1842, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.988502025604248, - "rewards/margins": 3.156489372253418, - "rewards/rejected": -6.144991397857666, - "step": 1300 - }, - { - "epoch": 0.97, - "eval_logits/chosen": 0.9558575749397278, - "eval_logits/rejected": 1.609464406967163, - "eval_logps/chosen": -609.159423828125, - "eval_logps/rejected": -767.4317016601562, - "eval_loss": 0.5365558862686157, - "eval_rewards/accuracies": 0.76171875, - "eval_rewards/chosen": -2.9738292694091797, - "eval_rewards/margins": 1.5252362489700317, - "eval_rewards/rejected": -4.499065399169922, - "eval_runtime": 97.3239, - "eval_samples_per_second": 20.55, - "eval_steps_per_second": 0.329, - "step": 1300 + "grad_norm": 52.31327093944872, + "learning_rate": 2.9149366008568987e-09, + "logits/chosen": 1.01073157787323, + "logits/rejected": 1.627968192100525, + "logps/chosen": -430.75146484375, + "logps/rejected": -594.0120849609375, + "loss": 0.3907, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.109837055206299, + "rewards/margins": 1.913220763206482, + "rewards/rejected": -4.023057460784912, + "step": 830 }, { "epoch": 0.97, - "grad_norm": 66.21886288694567, - "learning_rate": 1.0894587486089125e-09, - "logits/chosen": 1.8931999206542969, - "logits/rejected": 2.824298858642578, - "logps/chosen": -563.06201171875, - "logps/rejected": -834.8709716796875, - "loss": 0.2157, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -3.2370285987854004, - "rewards/margins": 3.035515546798706, - "rewards/rejected": -6.272543430328369, - "step": 1310 + "grad_norm": 51.70748969913889, + "learning_rate": 1.5840343486700215e-09, + "logits/chosen": 0.7685258984565735, + "logits/rejected": 1.560227870941162, + "logps/chosen": -438.5198669433594, + "logps/rejected": -593.31591796875, + "loss": 0.3724, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.9523073434829712, + "rewards/margins": 2.0151476860046387, + "rewards/rejected": -3.9674553871154785, + "step": 840 }, { "epoch": 0.98, - "grad_norm": 45.779926433395936, - "learning_rate": 5.684647138277098e-10, - "logits/chosen": 1.7055333852767944, - "logits/rejected": 2.308079719543457, - "logps/chosen": -531.0139770507812, - "logps/rejected": -862.2609252929688, - "loss": 0.1974, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.956573486328125, - "rewards/margins": 3.375626802444458, - "rewards/rejected": -6.332200050354004, - "step": 1320 + "grad_norm": 42.90945507861596, + "learning_rate": 6.550326657293881e-10, + "logits/chosen": 0.80241459608078, + "logits/rejected": 1.472318410873413, + "logps/chosen": -426.9049377441406, + "logps/rejected": -606.2322387695312, + "loss": 0.3471, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.0855860710144043, + "rewards/margins": 2.160794734954834, + "rewards/rejected": -4.24638032913208, + "step": 850 }, { "epoch": 0.99, - "grad_norm": 58.05458328657747, - "learning_rate": 2.153272946184559e-10, - "logits/chosen": 1.735358476638794, - "logits/rejected": 2.259385585784912, - "logps/chosen": -585.9295043945312, - "logps/rejected": -861.4645385742188, - "loss": 0.1738, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -3.10073184967041, - "rewards/margins": 2.996291399002075, - "rewards/rejected": -6.097023010253906, - "step": 1330 - }, - { - "epoch": 1.0, - "grad_norm": 46.42702960995785, - "learning_rate": 3.0284137163189004e-11, - "logits/chosen": 2.000138759613037, - "logits/rejected": 2.7859671115875244, - "logps/chosen": -530.1033935546875, - "logps/rejected": -878.3465576171875, - "loss": 0.1884, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -3.1844658851623535, - "rewards/margins": 3.3884029388427734, - "rewards/rejected": -6.572869300842285, - "step": 1340 + "grad_norm": 52.65613220022833, + "learning_rate": 1.2943454039654467e-10, + "logits/chosen": 1.3825061321258545, + "logits/rejected": 1.9333372116088867, + "logps/chosen": -426.72332763671875, + "logps/rejected": -573.5838623046875, + "loss": 0.359, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.260263442993164, + "rewards/margins": 1.6737343072891235, + "rewards/rejected": -3.9339981079101562, + "step": 860 }, { "epoch": 1.0, - "step": 1346, + "step": 868, "total_flos": 0.0, - "train_loss": 0.335402155391883, - "train_runtime": 21644.3608, - "train_samples_per_second": 7.959, - "train_steps_per_second": 0.062 + "train_loss": 0.42703192135156026, + "train_runtime": 13837.373, + "train_samples_per_second": 8.031, + "train_steps_per_second": 0.063 } ], "logging_steps": 10, - "max_steps": 1346, + "max_steps": 868, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100,