diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,13 +10,13 @@ "log_history": [ { "epoch": 0.0, - "grad_norm": 19.997434679340365, + "grad_norm": 23.545113700609754, "learning_rate": 3.7037037037037036e-09, - "logits/chosen": -1.5453667640686035, - "logits/rejected": -1.4501094818115234, - "logps/chosen": -154.90496826171875, - "logps/rejected": -163.46749877929688, - "loss": 0.6931, + "logits/chosen": -2.017277240753174, + "logits/rejected": -1.9505600929260254, + "logps/chosen": -342.8155212402344, + "logps/rejected": -264.6424865722656, + "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -25,2229 +25,2229 @@ }, { "epoch": 0.01, - "grad_norm": 23.34805628576002, + "grad_norm": 23.704110924178444, "learning_rate": 3.7037037037037036e-08, - "logits/chosen": -1.726180076599121, - "logits/rejected": -1.6590251922607422, - "logps/chosen": -212.55889892578125, - "logps/rejected": -207.66229248046875, - "loss": 0.693, - "rewards/accuracies": 0.4236111044883728, - "rewards/chosen": -0.0009580536279827356, - "rewards/margins": -0.0008072062046267092, - "rewards/rejected": -0.00015084730694070458, + "logits/chosen": -1.852867603302002, + "logits/rejected": -1.7641547918319702, + "logps/chosen": -243.63710021972656, + "logps/rejected": -215.13551330566406, + "loss": 0.6933, + "rewards/accuracies": 0.4027777910232544, + "rewards/chosen": -0.0004846964729949832, + "rewards/margins": -0.001089173136278987, + "rewards/rejected": 0.0006044767214916646, "step": 10 }, { "epoch": 0.01, - "grad_norm": 25.078221894482837, + "grad_norm": 27.48286479448467, "learning_rate": 7.407407407407407e-08, - "logits/chosen": -1.7819910049438477, - "logits/rejected": -1.7246497869491577, - "logps/chosen": -215.92410278320312, - "logps/rejected": -204.28173828125, - "loss": 0.6928, + "logits/chosen": -1.9755146503448486, + "logits/rejected": -1.8412548303604126, + "logps/chosen": -241.4310302734375, + "logps/rejected": -210.738037109375, + "loss": 0.6927, "rewards/accuracies": 0.53125, - "rewards/chosen": 0.0006128093227744102, - "rewards/margins": 0.0009098866721615195, - "rewards/rejected": -0.00029707717476412654, + "rewards/chosen": 0.0005561274592764676, + "rewards/margins": 0.0004348217917140573, + "rewards/rejected": 0.00012130556569900364, "step": 20 }, { "epoch": 0.02, - "grad_norm": 22.11850294931131, + "grad_norm": 23.49895713678948, "learning_rate": 1.111111111111111e-07, - "logits/chosen": -1.7216441631317139, - "logits/rejected": -1.6432338953018188, - "logps/chosen": -225.88699340820312, - "logps/rejected": -185.396240234375, - "loss": 0.6914, + "logits/chosen": -1.8477449417114258, + "logits/rejected": -1.781266450881958, + "logps/chosen": -277.84527587890625, + "logps/rejected": -244.1582489013672, + "loss": 0.6915, "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.004545359406620264, - "rewards/margins": 0.0039664083160459995, - "rewards/rejected": 0.0005789512651972473, + "rewards/chosen": 0.005596889648586512, + "rewards/margins": 0.0021990840323269367, + "rewards/rejected": 0.003397804917767644, "step": 30 }, { "epoch": 0.03, - "grad_norm": 21.112061167479588, + "grad_norm": 21.952979365752906, "learning_rate": 1.4814814814814815e-07, - "logits/chosen": -1.8239158391952515, - "logits/rejected": -1.7619836330413818, - "logps/chosen": -256.3050231933594, - "logps/rejected": -245.38198852539062, - "loss": 0.6877, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.019687145948410034, - "rewards/margins": 0.009989428333938122, - "rewards/rejected": 0.009697715751826763, + "logits/chosen": -1.8662084341049194, + "logits/rejected": -1.8252031803131104, + "logps/chosen": -279.81585693359375, + "logps/rejected": -256.37322998046875, + "loss": 0.6867, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.026755522936582565, + "rewards/margins": 0.01376323588192463, + "rewards/rejected": 0.01299228798598051, "step": 40 }, { "epoch": 0.04, - "grad_norm": 20.90083454488397, + "grad_norm": 22.515894719363914, "learning_rate": 1.8518518518518516e-07, - "logits/chosen": -1.7489999532699585, - "logits/rejected": -1.7239185571670532, - "logps/chosen": -217.8907928466797, - "logps/rejected": -211.642333984375, + "logits/chosen": -1.886828064918518, + "logits/rejected": -1.796974539756775, + "logps/chosen": -245.1302490234375, + "logps/rejected": -207.6703338623047, "loss": 0.68, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.03657875210046768, - "rewards/margins": 0.0253153033554554, - "rewards/rejected": 0.011263446882367134, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.05396401137113571, + "rewards/margins": 0.03148679807782173, + "rewards/rejected": 0.02247721515595913, "step": 50 }, { "epoch": 0.04, - "grad_norm": 21.77197780821128, + "grad_norm": 21.11853715417876, "learning_rate": 2.222222222222222e-07, - "logits/chosen": -1.8024189472198486, - "logits/rejected": -1.6662009954452515, - "logps/chosen": -271.22235107421875, - "logps/rejected": -233.28823852539062, - "loss": 0.6713, + "logits/chosen": -1.8658056259155273, + "logits/rejected": -1.7990939617156982, + "logps/chosen": -245.4588623046875, + "logps/rejected": -228.79067993164062, + "loss": 0.6687, "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.06959772109985352, - "rewards/margins": 0.04836159199476242, - "rewards/rejected": 0.021236125379800797, + "rewards/chosen": 0.0710381492972374, + "rewards/margins": 0.053314320743083954, + "rewards/rejected": 0.01772383041679859, "step": 60 }, { "epoch": 0.05, - "grad_norm": 21.554118928262394, + "grad_norm": 21.639022509531838, "learning_rate": 2.5925925925925923e-07, - "logits/chosen": -1.7849029302597046, - "logits/rejected": -1.718541145324707, - "logps/chosen": -224.2913055419922, - "logps/rejected": -203.49624633789062, - "loss": 0.6608, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": 0.05916479229927063, - "rewards/margins": 0.06906653940677643, - "rewards/rejected": -0.009901740588247776, + "logits/chosen": -1.8920536041259766, + "logits/rejected": -1.8345096111297607, + "logps/chosen": -223.96511840820312, + "logps/rejected": -196.08775329589844, + "loss": 0.6547, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.06073574349284172, + "rewards/margins": 0.08626440167427063, + "rewards/rejected": -0.02552866004407406, "step": 70 }, { "epoch": 0.06, - "grad_norm": 20.717805914821145, + "grad_norm": 22.179495576107882, "learning_rate": 2.962962962962963e-07, - "logits/chosen": -1.866686463356018, - "logits/rejected": -1.7345689535140991, - "logps/chosen": -252.84555053710938, - "logps/rejected": -228.15359497070312, - "loss": 0.6487, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": 0.05975146219134331, - "rewards/margins": 0.1380428969860077, - "rewards/rejected": -0.07829144597053528, + "logits/chosen": -1.8825687170028687, + "logits/rejected": -1.847541093826294, + "logps/chosen": -232.0540313720703, + "logps/rejected": -240.20120239257812, + "loss": 0.6407, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.03458085656166077, + "rewards/margins": 0.1154135912656784, + "rewards/rejected": -0.08083274215459824, "step": 80 }, { "epoch": 0.07, - "grad_norm": 24.15752327353423, + "grad_norm": 21.88163995061792, "learning_rate": 3.333333333333333e-07, - "logits/chosen": -1.7628377676010132, - "logits/rejected": -1.696528673171997, - "logps/chosen": -256.96954345703125, - "logps/rejected": -260.56292724609375, - "loss": 0.6244, - "rewards/accuracies": 0.71875, - "rewards/chosen": 0.009766384959220886, - "rewards/margins": 0.20868687331676483, - "rewards/rejected": -0.19892050325870514, + "logits/chosen": -1.9384691715240479, + "logits/rejected": -1.922488808631897, + "logps/chosen": -248.4744415283203, + "logps/rejected": -261.0725402832031, + "loss": 0.6135, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.029202425852417946, + "rewards/margins": 0.2103302925825119, + "rewards/rejected": -0.2395327389240265, "step": 90 }, { "epoch": 0.07, - "grad_norm": 22.043263558957296, + "grad_norm": 27.693123307166786, "learning_rate": 3.703703703703703e-07, - "logits/chosen": -1.758519172668457, - "logits/rejected": -1.6800458431243896, - "logps/chosen": -239.4019775390625, - "logps/rejected": -226.4539794921875, - "loss": 0.6142, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -0.1279800534248352, - "rewards/margins": 0.24427437782287598, - "rewards/rejected": -0.3722544014453888, + "logits/chosen": -1.9232885837554932, + "logits/rejected": -1.9198648929595947, + "logps/chosen": -245.3694610595703, + "logps/rejected": -275.853515625, + "loss": 0.5905, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.23111872375011444, + "rewards/margins": 0.2522026598453522, + "rewards/rejected": -0.4833213686943054, "step": 100 }, { "epoch": 0.07, - "eval_logits/chosen": -1.7870845794677734, - "eval_logits/rejected": -1.752089500427246, - "eval_logps/chosen": -302.7545166015625, - "eval_logps/rejected": -330.3116455078125, - "eval_loss": 0.6372222304344177, - "eval_rewards/accuracies": 0.69921875, - "eval_rewards/chosen": -0.21120953559875488, - "eval_rewards/margins": 0.2142910659313202, - "eval_rewards/rejected": -0.42550066113471985, - "eval_runtime": 98.1168, - "eval_samples_per_second": 20.384, - "eval_steps_per_second": 0.326, + "eval_logits/chosen": -1.787776231765747, + "eval_logits/rejected": -1.7244033813476562, + "eval_logps/chosen": -325.57440185546875, + "eval_logps/rejected": -351.93182373046875, + "eval_loss": 0.6428781747817993, + "eval_rewards/accuracies": 0.671875, + "eval_rewards/chosen": -0.13797907531261444, + "eval_rewards/margins": 0.2060878425836563, + "eval_rewards/rejected": -0.34406691789627075, + "eval_runtime": 97.6555, + "eval_samples_per_second": 20.48, + "eval_steps_per_second": 0.328, "step": 100 }, { "epoch": 0.08, - "grad_norm": 27.76371677719842, + "grad_norm": 33.52938589908786, "learning_rate": 4.0740740740740737e-07, - "logits/chosen": -1.7010570764541626, - "logits/rejected": -1.6076711416244507, - "logps/chosen": -263.64080810546875, - "logps/rejected": -244.4661407470703, - "loss": 0.5944, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.26566341519355774, - "rewards/margins": 0.27579429745674133, - "rewards/rejected": -0.5414577722549438, + "logits/chosen": -1.8354734182357788, + "logits/rejected": -1.7754793167114258, + "logps/chosen": -295.2403869628906, + "logps/rejected": -316.46923828125, + "loss": 0.5723, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.5448485016822815, + "rewards/margins": 0.3984159529209137, + "rewards/rejected": -0.943264365196228, "step": 110 }, { "epoch": 0.09, - "grad_norm": 31.731787834651325, + "grad_norm": 32.42547027840792, "learning_rate": 4.444444444444444e-07, - "logits/chosen": -1.7044484615325928, - "logits/rejected": -1.6412681341171265, - "logps/chosen": -269.258544921875, - "logps/rejected": -282.24713134765625, - "loss": 0.5696, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.39111006259918213, - "rewards/margins": 0.36286839842796326, - "rewards/rejected": -0.7539784908294678, + "logits/chosen": -1.7011499404907227, + "logits/rejected": -1.708805799484253, + "logps/chosen": -307.11334228515625, + "logps/rejected": -348.78729248046875, + "loss": 0.5442, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5989453196525574, + "rewards/margins": 0.6007151007652283, + "rewards/rejected": -1.1996605396270752, "step": 120 }, { "epoch": 0.1, - "grad_norm": 37.69053452768137, + "grad_norm": 33.08064593315955, "learning_rate": 4.814814814814814e-07, - "logits/chosen": -1.860668420791626, - "logits/rejected": -1.8003613948822021, - "logps/chosen": -286.83123779296875, - "logps/rejected": -299.9700927734375, - "loss": 0.553, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.3989219665527344, - "rewards/margins": 0.4949173927307129, - "rewards/rejected": -0.8938392400741577, + "logits/chosen": -1.70786452293396, + "logits/rejected": -1.6745007038116455, + "logps/chosen": -290.42498779296875, + "logps/rejected": -343.42510986328125, + "loss": 0.5139, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.7598094344139099, + "rewards/margins": 0.6571252346038818, + "rewards/rejected": -1.4169347286224365, "step": 130 }, { "epoch": 0.1, - "grad_norm": 45.43263624776604, + "grad_norm": 33.94320124887001, "learning_rate": 4.999789692194508e-07, - "logits/chosen": -1.7893892526626587, - "logits/rejected": -1.7304092645645142, - "logps/chosen": -324.6953430175781, - "logps/rejected": -354.299072265625, - "loss": 0.5254, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.7778688073158264, - "rewards/margins": 0.5480495095252991, - "rewards/rejected": -1.325918436050415, + "logits/chosen": -1.8099472522735596, + "logits/rejected": -1.754595398902893, + "logps/chosen": -314.9842224121094, + "logps/rejected": -356.81011962890625, + "loss": 0.5172, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.812475860118866, + "rewards/margins": 0.6942508816719055, + "rewards/rejected": -1.5067269802093506, "step": 140 }, { "epoch": 0.11, - "grad_norm": 38.76316746995152, + "grad_norm": 39.07047935152003, "learning_rate": 4.998107442045616e-07, - "logits/chosen": -1.7821582555770874, - "logits/rejected": -1.7230908870697021, - "logps/chosen": -304.8173828125, - "logps/rejected": -349.8572082519531, - "loss": 0.5181, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.702239990234375, - "rewards/margins": 0.6602993011474609, - "rewards/rejected": -1.362539291381836, + "logits/chosen": -1.6377861499786377, + "logits/rejected": -1.6226139068603516, + "logps/chosen": -304.92840576171875, + "logps/rejected": -393.1883239746094, + "loss": 0.5094, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.8283722996711731, + "rewards/margins": 0.8278924822807312, + "rewards/rejected": -1.6562646627426147, "step": 150 }, { "epoch": 0.12, - "grad_norm": 38.48791118895226, + "grad_norm": 42.785505208166626, "learning_rate": 4.994744073829293e-07, - "logits/chosen": -1.9425595998764038, - "logits/rejected": -1.8510286808013916, - "logps/chosen": -315.58038330078125, - "logps/rejected": -354.3575744628906, - "loss": 0.5104, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.928961455821991, - "rewards/margins": 0.6893821954727173, - "rewards/rejected": -1.618343710899353, + "logits/chosen": -1.5746722221374512, + "logits/rejected": -1.4142063856124878, + "logps/chosen": -343.25823974609375, + "logps/rejected": -402.02691650390625, + "loss": 0.5011, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.8369730710983276, + "rewards/margins": 0.8556060791015625, + "rewards/rejected": -1.6925792694091797, "step": 160 }, { "epoch": 0.13, - "grad_norm": 34.98469211427676, + "grad_norm": 48.274083606893925, "learning_rate": 4.989701850946613e-07, - "logits/chosen": -2.1142868995666504, - "logits/rejected": -2.0348961353302, - "logps/chosen": -336.83929443359375, - "logps/rejected": -374.04046630859375, - "loss": 0.5009, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.7460821866989136, - "rewards/margins": 0.756350040435791, - "rewards/rejected": -1.5024322271347046, + "logits/chosen": -1.5056556463241577, + "logits/rejected": -1.3766965866088867, + "logps/chosen": -335.7103271484375, + "logps/rejected": -388.94097900390625, + "loss": 0.4643, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.9376843571662903, + "rewards/margins": 0.8313838243484497, + "rewards/rejected": -1.7690680027008057, "step": 170 }, { "epoch": 0.13, - "grad_norm": 45.707896850254954, + "grad_norm": 46.176765511998994, "learning_rate": 4.982984166595104e-07, - "logits/chosen": -2.1524124145507812, - "logits/rejected": -2.127004861831665, - "logps/chosen": -349.15081787109375, - "logps/rejected": -417.07452392578125, - "loss": 0.4877, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.093063473701477, - "rewards/margins": 0.8320043683052063, - "rewards/rejected": -1.9250679016113281, + "logits/chosen": -1.4761296510696411, + "logits/rejected": -1.3599636554718018, + "logps/chosen": -408.171630859375, + "logps/rejected": -472.0873107910156, + "loss": 0.4577, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2097257375717163, + "rewards/margins": 1.240505576133728, + "rewards/rejected": -2.4502310752868652, "step": 180 }, { "epoch": 0.14, - "grad_norm": 41.90753016315753, + "grad_norm": 43.28509926988276, "learning_rate": 4.974595541485259e-07, - "logits/chosen": -2.118150234222412, - "logits/rejected": -2.0182414054870605, - "logps/chosen": -376.4796447753906, - "logps/rejected": -435.715087890625, - "loss": 0.4957, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.2654850482940674, - "rewards/margins": 0.9563212394714355, - "rewards/rejected": -2.221806049346924, + "logits/chosen": -1.3221380710601807, + "logits/rejected": -1.204590082168579, + "logps/chosen": -335.5089416503906, + "logps/rejected": -428.30621337890625, + "loss": 0.4635, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.076790452003479, + "rewards/margins": 1.0969324111938477, + "rewards/rejected": -2.173722743988037, "step": 190 }, { "epoch": 0.15, - "grad_norm": 43.92066412227129, + "grad_norm": 56.09927596713516, "learning_rate": 4.964541620798307e-07, - "logits/chosen": -2.0093464851379395, - "logits/rejected": -1.843632698059082, - "logps/chosen": -358.4123840332031, - "logps/rejected": -408.1987609863281, - "loss": 0.4726, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.1928468942642212, - "rewards/margins": 0.9090213775634766, - "rewards/rejected": -2.101868152618408, + "logits/chosen": -1.2160365581512451, + "logits/rejected": -1.118375539779663, + "logps/chosen": -348.90753173828125, + "logps/rejected": -468.21563720703125, + "loss": 0.4495, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2727657556533813, + "rewards/margins": 1.1830675601959229, + "rewards/rejected": -2.4558334350585938, "step": 200 }, { "epoch": 0.15, - "eval_logits/chosen": -2.047096014022827, - "eval_logits/rejected": -2.0018393993377686, - "eval_logps/chosen": -416.041015625, - "eval_logps/rejected": -498.22076416015625, - "eval_loss": 0.5515660047531128, - "eval_rewards/accuracies": 0.75, - "eval_rewards/chosen": -1.3440749645233154, - "eval_rewards/margins": 0.7605167627334595, - "eval_rewards/rejected": -2.1045916080474854, - "eval_runtime": 98.1193, - "eval_samples_per_second": 20.383, - "eval_steps_per_second": 0.326, + "eval_logits/chosen": -1.4371435642242432, + "eval_logits/rejected": -1.366525650024414, + "eval_logps/chosen": -361.1814880371094, + "eval_logps/rejected": -427.2509765625, + "eval_loss": 0.559985339641571, + "eval_rewards/accuracies": 0.74609375, + "eval_rewards/chosen": -0.4940495491027832, + "eval_rewards/margins": 0.6032084226608276, + "eval_rewards/rejected": -1.0972579717636108, + "eval_runtime": 97.4901, + "eval_samples_per_second": 20.515, + "eval_steps_per_second": 0.328, "step": 200 }, { "epoch": 0.16, - "grad_norm": 38.95795396663457, + "grad_norm": 49.36366262587358, "learning_rate": 4.952829170387241e-07, - "logits/chosen": -1.9518171548843384, - "logits/rejected": -1.8534870147705078, - "logps/chosen": -342.5758361816406, - "logps/rejected": -430.5548400878906, - "loss": 0.4601, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.3162614107131958, - "rewards/margins": 1.062984824180603, - "rewards/rejected": -2.3792459964752197, + "logits/chosen": -1.1800302267074585, + "logits/rejected": -1.0126550197601318, + "logps/chosen": -380.48828125, + "logps/rejected": -450.0765075683594, + "loss": 0.4458, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3101383447647095, + "rewards/margins": 0.9806028604507446, + "rewards/rejected": -2.290741443634033, "step": 210 }, { "epoch": 0.16, - "grad_norm": 43.63837784340095, + "grad_norm": 57.25684926546983, "learning_rate": 4.939466072223697e-07, - "logits/chosen": -1.9940170049667358, - "logits/rejected": -1.8776836395263672, - "logps/chosen": -409.5476989746094, - "logps/rejected": -429.2942810058594, - "loss": 0.4812, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.562101125717163, - "rewards/margins": 0.7047168612480164, - "rewards/rejected": -2.266817808151245, + "logits/chosen": -1.2157623767852783, + "logits/rejected": -1.0489680767059326, + "logps/chosen": -372.591064453125, + "logps/rejected": -468.7542419433594, + "loss": 0.4545, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3519532680511475, + "rewards/margins": 1.1502256393432617, + "rewards/rejected": -2.50217866897583, "step": 220 }, { "epoch": 0.17, - "grad_norm": 48.17484694517937, + "grad_norm": 40.98752146946231, "learning_rate": 4.924461319093725e-07, - "logits/chosen": -1.911233901977539, - "logits/rejected": -1.8419215679168701, - "logps/chosen": -366.72210693359375, - "logps/rejected": -425.0821838378906, - "loss": 0.4424, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3060904741287231, - "rewards/margins": 0.9145557284355164, - "rewards/rejected": -2.220646381378174, + "logits/chosen": -1.1049861907958984, + "logits/rejected": -1.0018864870071411, + "logps/chosen": -361.7793884277344, + "logps/rejected": -487.15460205078125, + "loss": 0.4436, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1743983030319214, + "rewards/margins": 1.1021788120269775, + "rewards/rejected": -2.2765772342681885, "step": 230 }, { "epoch": 0.18, - "grad_norm": 42.0731451177421, + "grad_norm": 57.39176618017778, "learning_rate": 4.907825008546038e-07, - "logits/chosen": -1.7983890771865845, - "logits/rejected": -1.6776669025421143, - "logps/chosen": -404.84942626953125, - "logps/rejected": -461.9938049316406, - "loss": 0.452, + "logits/chosen": -0.7271394729614258, + "logits/rejected": -0.6813848614692688, + "logps/chosen": -377.90118408203125, + "logps/rejected": -523.9625244140625, + "loss": 0.4333, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.4010937213897705, - "rewards/margins": 0.9703173637390137, - "rewards/rejected": -2.371410846710205, + "rewards/chosen": -1.4791629314422607, + "rewards/margins": 1.4326350688934326, + "rewards/rejected": -2.9117980003356934, "step": 240 }, { "epoch": 0.19, - "grad_norm": 45.788391518578905, + "grad_norm": 51.26102709104704, "learning_rate": 4.889568336096795e-07, - "logits/chosen": -1.7662121057510376, - "logits/rejected": -1.6878684759140015, - "logps/chosen": -368.25567626953125, - "logps/rejected": -454.2179260253906, - "loss": 0.4514, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.3891403675079346, - "rewards/margins": 1.1145490407943726, - "rewards/rejected": -2.5036895275115967, + "logits/chosen": -0.5312275290489197, + "logits/rejected": -0.37771934270858765, + "logps/chosen": -381.1251220703125, + "logps/rejected": -479.7431640625, + "loss": 0.4272, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5479203462600708, + "rewards/margins": 1.1352421045303345, + "rewards/rejected": -2.6831624507904053, "step": 250 }, { "epoch": 0.19, - "grad_norm": 53.99920452912666, + "grad_norm": 46.69946748969463, "learning_rate": 4.869703587695508e-07, - "logits/chosen": -1.7349084615707397, - "logits/rejected": -1.6830803155899048, - "logps/chosen": -421.4908752441406, - "logps/rejected": -481.27752685546875, - "loss": 0.4519, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3783433437347412, - "rewards/margins": 1.064690351486206, - "rewards/rejected": -2.4430336952209473, + "logits/chosen": -0.44748228788375854, + "logits/rejected": -0.18481455743312836, + "logps/chosen": -379.5589904785156, + "logps/rejected": -527.2100830078125, + "loss": 0.4464, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.412641167640686, + "rewards/margins": 1.667824149131775, + "rewards/rejected": -3.080465793609619, "step": 260 }, { "epoch": 0.2, - "grad_norm": 48.505757522556536, + "grad_norm": 40.8957837906737, "learning_rate": 4.848244131457127e-07, - "logits/chosen": -1.601601243019104, - "logits/rejected": -1.4739550352096558, - "logps/chosen": -386.08428955078125, - "logps/rejected": -498.78558349609375, - "loss": 0.4367, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.4528039693832397, - "rewards/margins": 1.1855452060699463, - "rewards/rejected": -2.6383490562438965, + "logits/chosen": -0.9530747532844543, + "logits/rejected": -0.6137160062789917, + "logps/chosen": -400.1986083984375, + "logps/rejected": -499.60308837890625, + "loss": 0.4211, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.4335994720458984, + "rewards/margins": 1.4832035303115845, + "rewards/rejected": -2.9168028831481934, "step": 270 }, { "epoch": 0.21, - "grad_norm": 49.31169957835279, + "grad_norm": 45.308995144235396, "learning_rate": 4.825204408665877e-07, - "logits/chosen": -1.697139024734497, - "logits/rejected": -1.5427892208099365, - "logps/chosen": -399.53729248046875, - "logps/rejected": -492.3568420410156, - "loss": 0.4205, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.5937035083770752, - "rewards/margins": 1.3382676839828491, - "rewards/rejected": -2.931971311569214, + "logits/chosen": -1.2076747417449951, + "logits/rejected": -0.9289032220840454, + "logps/chosen": -426.99114990234375, + "logps/rejected": -532.0573120117188, + "loss": 0.4124, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.4818888902664185, + "rewards/margins": 1.4990845918655396, + "rewards/rejected": -2.980973720550537, "step": 280 }, { "epoch": 0.22, - "grad_norm": 38.828455566499606, + "grad_norm": 57.75176826411474, "learning_rate": 4.800599924056907e-07, - "logits/chosen": -1.7314732074737549, - "logits/rejected": -1.5896819829940796, - "logps/chosen": -407.6255798339844, - "logps/rejected": -502.23016357421875, - "loss": 0.4189, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.5207576751708984, - "rewards/margins": 1.350722074508667, - "rewards/rejected": -2.8714797496795654, + "logits/chosen": -0.7638604044914246, + "logits/rejected": -0.7332445383071899, + "logps/chosen": -383.2490539550781, + "logps/rejected": -556.2003784179688, + "loss": 0.3833, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5847924947738647, + "rewards/margins": 1.5942741632461548, + "rewards/rejected": -3.1790668964385986, "step": 290 }, { "epoch": 0.22, - "grad_norm": 44.43194576384378, + "grad_norm": 45.582764097748154, "learning_rate": 4.774447235382259e-07, - "logits/chosen": -1.7066457271575928, - "logits/rejected": -1.5852010250091553, - "logps/chosen": -425.8912658691406, - "logps/rejected": -526.4098510742188, - "loss": 0.4421, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.5712147951126099, - "rewards/margins": 1.0973128080368042, - "rewards/rejected": -2.668527841567993, + "logits/chosen": -0.5798165202140808, + "logits/rejected": -0.5653051733970642, + "logps/chosen": -411.58154296875, + "logps/rejected": -582.2734375, + "loss": 0.3963, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.766920804977417, + "rewards/margins": 1.7389370203018188, + "rewards/rejected": -3.5058579444885254, "step": 300 }, { "epoch": 0.22, - "eval_logits/chosen": -1.8325203657150269, - "eval_logits/rejected": -1.7522265911102295, - "eval_logps/chosen": -396.3378601074219, - "eval_logps/rejected": -492.3901062011719, - "eval_loss": 0.5335086584091187, - "eval_rewards/accuracies": 0.75390625, - "eval_rewards/chosen": -1.147042989730835, - "eval_rewards/margins": 0.8992425799369812, - "eval_rewards/rejected": -2.046285629272461, - "eval_runtime": 98.1593, - "eval_samples_per_second": 20.375, - "eval_steps_per_second": 0.326, + "eval_logits/chosen": -1.4608731269836426, + "eval_logits/rejected": -1.2769949436187744, + "eval_logps/chosen": -423.00341796875, + "eval_logps/rejected": -521.115478515625, + "eval_loss": 0.5291498303413391, + "eval_rewards/accuracies": 0.7421875, + "eval_rewards/chosen": -1.1122692823410034, + "eval_rewards/margins": 0.9236345291137695, + "eval_rewards/rejected": -2.0359039306640625, + "eval_runtime": 97.2217, + "eval_samples_per_second": 20.572, + "eval_steps_per_second": 0.329, "step": 300 }, { "epoch": 0.23, - "grad_norm": 44.68676000662293, + "grad_norm": 42.82644939529418, "learning_rate": 4.7467639422682426e-07, - "logits/chosen": -1.5684497356414795, - "logits/rejected": -1.5880094766616821, - "logps/chosen": -429.06072998046875, - "logps/rejected": -549.975830078125, - "loss": 0.431, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.6676822900772095, - "rewards/margins": 1.3817517757415771, - "rewards/rejected": -3.049434185028076, + "logits/chosen": -0.6843788623809814, + "logits/rejected": -0.46269315481185913, + "logps/chosen": -417.7638244628906, + "logps/rejected": -573.83837890625, + "loss": 0.4006, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.8430830240249634, + "rewards/margins": 1.669550895690918, + "rewards/rejected": -3.512633800506592, "step": 310 }, { "epoch": 0.24, - "grad_norm": 49.129673207510045, + "grad_norm": 55.146360598406936, "learning_rate": 4.7175686743716223e-07, - "logits/chosen": -1.6045185327529907, - "logits/rejected": -1.5781729221343994, - "logps/chosen": -407.8392639160156, - "logps/rejected": -560.4733276367188, - "loss": 0.4164, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.750781774520874, - "rewards/margins": 1.4461250305175781, - "rewards/rejected": -3.196906566619873, + "logits/chosen": -1.140579104423523, + "logits/rejected": -0.8973017930984497, + "logps/chosen": -419.18048095703125, + "logps/rejected": -527.0257568359375, + "loss": 0.405, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4635722637176514, + "rewards/margins": 1.3773781061172485, + "rewards/rejected": -2.8409504890441895, "step": 320 }, { "epoch": 0.25, - "grad_norm": 44.13714858626996, + "grad_norm": 45.88101703811544, "learning_rate": 4.686881078842688e-07, - "logits/chosen": -1.65378737449646, - "logits/rejected": -1.5229465961456299, - "logps/chosen": -369.09075927734375, - "logps/rejected": -465.15411376953125, - "loss": 0.4162, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.5404636859893799, - "rewards/margins": 1.1851208209991455, - "rewards/rejected": -2.7255845069885254, + "logits/chosen": -1.0653458833694458, + "logits/rejected": -0.8751330375671387, + "logps/chosen": -386.37335205078125, + "logps/rejected": -510.29949951171875, + "loss": 0.3899, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.47976553440094, + "rewards/margins": 1.366317868232727, + "rewards/rejected": -2.846083164215088, "step": 330 }, { "epoch": 0.25, - "grad_norm": 67.75141126041814, + "grad_norm": 58.11307992254104, "learning_rate": 4.654721807103558e-07, - "logits/chosen": -1.668287992477417, - "logits/rejected": -1.4913742542266846, - "logps/chosen": -451.5079040527344, - "logps/rejected": -548.6111450195312, - "loss": 0.4124, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.996059775352478, - "rewards/margins": 1.3178468942642212, - "rewards/rejected": -3.3139069080352783, + "logits/chosen": -0.5151967406272888, + "logits/rejected": -0.14977958798408508, + "logps/chosen": -400.7736511230469, + "logps/rejected": -529.3316650390625, + "loss": 0.3938, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7082515954971313, + "rewards/margins": 1.6958554983139038, + "rewards/rejected": -3.404106855392456, "step": 340 }, { "epoch": 0.26, - "grad_norm": 51.63470007658604, + "grad_norm": 48.499175539211535, "learning_rate": 4.621112500950678e-07, - "logits/chosen": -1.4794889688491821, - "logits/rejected": -1.3635644912719727, - "logps/chosen": -392.33319091796875, - "logps/rejected": -509.9889221191406, - "loss": 0.3917, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.6687583923339844, - "rewards/margins": 1.3096448183059692, - "rewards/rejected": -2.9784035682678223, + "logits/chosen": -0.8198322057723999, + "logits/rejected": -0.5934363603591919, + "logps/chosen": -429.72113037109375, + "logps/rejected": -547.5772705078125, + "loss": 0.3843, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.8615728616714478, + "rewards/margins": 1.499329924583435, + "rewards/rejected": -3.3609023094177246, "step": 350 }, { "epoch": 0.27, - "grad_norm": 49.136281330589576, + "grad_norm": 55.599844022581365, "learning_rate": 4.5860757779908225e-07, - "logits/chosen": -1.3447411060333252, - "logits/rejected": -1.1584227085113525, - "logps/chosen": -392.1809997558594, - "logps/rejected": -517.8204345703125, - "loss": 0.4184, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.6987504959106445, - "rewards/margins": 1.4527620077133179, - "rewards/rejected": -3.151512384414673, + "logits/chosen": -1.0455310344696045, + "logits/rejected": -0.6826554536819458, + "logps/chosen": -413.38739013671875, + "logps/rejected": -542.2623291015625, + "loss": 0.3736, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5897157192230225, + "rewards/margins": 1.6853986978530884, + "rewards/rejected": -3.2751145362854004, "step": 360 }, { "epoch": 0.27, - "grad_norm": 54.14810763202607, + "grad_norm": 74.71151634556864, "learning_rate": 4.5496352164204304e-07, - "logits/chosen": -1.0756736993789673, - "logits/rejected": -0.9471368789672852, - "logps/chosen": -381.1976013183594, - "logps/rejected": -491.38787841796875, - "loss": 0.4015, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.6106783151626587, - "rewards/margins": 1.3218671083450317, - "rewards/rejected": -2.9325456619262695, + "logits/chosen": -0.4619407057762146, + "logits/rejected": -0.23415322601795197, + "logps/chosen": -426.197998046875, + "logps/rejected": -620.7210693359375, + "loss": 0.3997, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.0138180255889893, + "rewards/margins": 2.0114035606384277, + "rewards/rejected": -4.025221347808838, "step": 370 }, { "epoch": 0.28, - "grad_norm": 61.80294176300012, + "grad_norm": 46.835706945950214, "learning_rate": 4.5118153391584966e-07, - "logits/chosen": -0.9351271390914917, - "logits/rejected": -0.686697244644165, - "logps/chosen": -431.38116455078125, - "logps/rejected": -523.0051879882812, - "loss": 0.3966, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.933549165725708, - "rewards/margins": 1.2298634052276611, - "rewards/rejected": -3.1634128093719482, + "logits/chosen": -0.7893734574317932, + "logits/rejected": -0.5286726951599121, + "logps/chosen": -348.12554931640625, + "logps/rejected": -483.89215087890625, + "loss": 0.3909, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.0020155906677246, + "rewards/margins": 1.7324419021606445, + "rewards/rejected": -2.734457492828369, "step": 380 }, { "epoch": 0.29, - "grad_norm": 49.95605603054853, + "grad_norm": 51.06658825135186, "learning_rate": 4.472641597343713e-07, - "logits/chosen": -0.7354714274406433, - "logits/rejected": -0.6010663509368896, - "logps/chosen": -441.1346740722656, - "logps/rejected": -589.41064453125, - "loss": 0.4152, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.0311450958251953, - "rewards/margins": 1.5603392124176025, - "rewards/rejected": -3.5914840698242188, + "logits/chosen": -0.5109713077545166, + "logits/rejected": -0.07112047076225281, + "logps/chosen": -389.3044738769531, + "logps/rejected": -567.7926635742188, + "loss": 0.3846, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.6159217357635498, + "rewards/margins": 1.9207748174667358, + "rewards/rejected": -3.536696672439575, "step": 390 }, { "epoch": 0.3, - "grad_norm": 51.04567852493441, + "grad_norm": 44.181665144710905, "learning_rate": 4.4321403532069523e-07, - "logits/chosen": -0.6377015113830566, - "logits/rejected": -0.3965984880924225, - "logps/chosen": -387.339111328125, - "logps/rejected": -515.1150512695312, - "loss": 0.3828, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.7281858921051025, - "rewards/margins": 1.4771182537078857, - "rewards/rejected": -3.205303907394409, + "logits/chosen": -0.5097373127937317, + "logits/rejected": -0.2719523012638092, + "logps/chosen": -353.91278076171875, + "logps/rejected": -517.2376708984375, + "loss": 0.4012, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5704162120819092, + "rewards/margins": 1.8435367345809937, + "rewards/rejected": -3.4139533042907715, "step": 400 }, { "epoch": 0.3, - "eval_logits/chosen": -1.1203975677490234, - "eval_logits/rejected": -0.9529741406440735, - "eval_logps/chosen": -448.1488037109375, - "eval_logps/rejected": -563.927978515625, - "eval_loss": 0.523780882358551, - "eval_rewards/accuracies": 0.76953125, - "eval_rewards/chosen": -1.6651523113250732, - "eval_rewards/margins": 1.096511721611023, - "eval_rewards/rejected": -2.7616641521453857, - "eval_runtime": 98.0838, - "eval_samples_per_second": 20.391, - "eval_steps_per_second": 0.326, + "eval_logits/chosen": -1.3372514247894287, + "eval_logits/rejected": -1.1222751140594482, + "eval_logps/chosen": -417.65863037109375, + "eval_logps/rejected": -516.7505493164062, + "eval_loss": 0.5314938426017761, + "eval_rewards/accuracies": 0.7734375, + "eval_rewards/chosen": -1.058821201324463, + "eval_rewards/margins": 0.9334329962730408, + "eval_rewards/rejected": -1.9922541379928589, + "eval_runtime": 97.4658, + "eval_samples_per_second": 20.52, + "eval_steps_per_second": 0.328, "step": 400 }, { "epoch": 0.3, - "grad_norm": 48.086291234203195, + "grad_norm": 50.26869622592037, "learning_rate": 4.390338862330631e-07, - "logits/chosen": -0.7085340619087219, - "logits/rejected": -0.5028501749038696, - "logps/chosen": -433.546142578125, - "logps/rejected": -573.782958984375, - "loss": 0.3738, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -1.8388134241104126, - "rewards/margins": 1.763929009437561, - "rewards/rejected": -3.6027424335479736, + "logits/chosen": -0.7592865824699402, + "logits/rejected": -0.4464483857154846, + "logps/chosen": -401.47607421875, + "logps/rejected": -523.3784790039062, + "loss": 0.3803, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7566916942596436, + "rewards/margins": 1.5606569051742554, + "rewards/rejected": -3.3173484802246094, "step": 410 }, { "epoch": 0.31, - "grad_norm": 64.03484534800386, + "grad_norm": 51.57934206296598, "learning_rate": 4.3472652553068835e-07, - "logits/chosen": -0.47518905997276306, - "logits/rejected": -0.23270010948181152, - "logps/chosen": -484.5774841308594, - "logps/rejected": -611.3958740234375, - "loss": 0.3739, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.291532039642334, - "rewards/margins": 1.4785693883895874, - "rewards/rejected": -3.770102024078369, + "logits/chosen": -0.6644355654716492, + "logits/rejected": -0.23346371948719025, + "logps/chosen": -404.8458557128906, + "logps/rejected": -540.8956298828125, + "loss": 0.3797, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7600839138031006, + "rewards/margins": 1.6869585514068604, + "rewards/rejected": -3.4470419883728027, "step": 420 }, { "epoch": 0.32, - "grad_norm": 52.3682956987419, + "grad_norm": 73.04228089758476, "learning_rate": 4.3029485188068895e-07, - "logits/chosen": -0.6198094487190247, - "logits/rejected": -0.28696033358573914, - "logps/chosen": -403.52703857421875, - "logps/rejected": -581.9299926757812, - "loss": 0.3654, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.8076034784317017, - "rewards/margins": 1.710561990737915, - "rewards/rejected": -3.5181660652160645, + "logits/chosen": 0.10370206832885742, + "logits/rejected": 0.39608412981033325, + "logps/chosen": -385.42498779296875, + "logps/rejected": -570.5172729492188, + "loss": 0.3655, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.839719533920288, + "rewards/margins": 1.714897871017456, + "rewards/rejected": -3.5546176433563232, "step": 430 }, { "epoch": 0.33, - "grad_norm": 52.85823705565832, + "grad_norm": 54.512857623037554, "learning_rate": 4.257418476074103e-07, - "logits/chosen": -0.6958103179931641, - "logits/rejected": -0.40623918175697327, - "logps/chosen": -442.66473388671875, - "logps/rejected": -562.0119018554688, - "loss": 0.3841, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.8445909023284912, - "rewards/margins": 1.4965617656707764, - "rewards/rejected": -3.3411526679992676, + "logits/chosen": -0.023069072514772415, + "logits/rejected": 0.3960541784763336, + "logps/chosen": -423.490478515625, + "logps/rejected": -592.7897338867188, + "loss": 0.3638, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7877943515777588, + "rewards/margins": 2.115088701248169, + "rewards/rejected": -3.9028830528259277, "step": 440 }, { "epoch": 0.33, - "grad_norm": 58.890892905451516, + "grad_norm": 55.7162708155443, "learning_rate": 4.210705766854504e-07, - "logits/chosen": -0.8186171650886536, - "logits/rejected": -0.4163144528865814, - "logps/chosen": -412.9081115722656, - "logps/rejected": -538.0160522460938, - "loss": 0.3763, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -1.5352731943130493, - "rewards/margins": 1.580110788345337, - "rewards/rejected": -3.1153836250305176, + "logits/chosen": 0.15324774384498596, + "logits/rejected": 0.521506667137146, + "logps/chosen": -456.01776123046875, + "logps/rejected": -625.3338623046875, + "loss": 0.352, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.096989870071411, + "rewards/margins": 1.874829649925232, + "rewards/rejected": -3.9718196392059326, "step": 450 }, { "epoch": 0.34, - "grad_norm": 54.82348692428555, + "grad_norm": 51.50110954656292, "learning_rate": 4.16284182677737e-07, - "logits/chosen": -0.5104752779006958, - "logits/rejected": -0.022202759981155396, - "logps/chosen": -504.6559143066406, - "logps/rejected": -638.7310180664062, - "loss": 0.3747, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -2.382429599761963, - "rewards/margins": 1.8315938711166382, - "rewards/rejected": -4.214023590087891, + "logits/chosen": 0.3847750127315521, + "logits/rejected": 0.9687877893447876, + "logps/chosen": -421.48321533203125, + "logps/rejected": -571.6495361328125, + "loss": 0.3771, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7761863470077515, + "rewards/margins": 1.777931809425354, + "rewards/rejected": -3.5541183948516846, "step": 460 }, { "epoch": 0.35, - "grad_norm": 54.979690542314074, + "grad_norm": 42.17081561639591, "learning_rate": 4.113858866200466e-07, - "logits/chosen": -0.7018298506736755, - "logits/rejected": -0.30557408928871155, - "logps/chosen": -490.76947021484375, - "logps/rejected": -648.2581787109375, - "loss": 0.3667, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -2.0751655101776123, - "rewards/margins": 1.8448684215545654, - "rewards/rejected": -3.9200336933135986, + "logits/chosen": 0.5899291634559631, + "logits/rejected": 0.9651363492012024, + "logps/chosen": -411.4060974121094, + "logps/rejected": -587.0046997070312, + "loss": 0.3551, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.751307725906372, + "rewards/margins": 1.814639687538147, + "rewards/rejected": -3.5659472942352295, "step": 470 }, { "epoch": 0.36, - "grad_norm": 52.384860047470646, + "grad_norm": 48.02610054790726, "learning_rate": 4.063789848533865e-07, - "logits/chosen": -0.5765314698219299, - "logits/rejected": -0.1146412268280983, - "logps/chosen": -431.3514709472656, - "logps/rejected": -595.962646484375, - "loss": 0.364, - "rewards/accuracies": 0.84375, - "rewards/chosen": -1.9231284856796265, - "rewards/margins": 1.8943427801132202, - "rewards/rejected": -3.8174710273742676, + "logits/chosen": 0.46232396364212036, + "logits/rejected": 1.0872290134429932, + "logps/chosen": -472.24139404296875, + "logps/rejected": -634.9567260742188, + "loss": 0.374, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.287501573562622, + "rewards/margins": 1.8356859683990479, + "rewards/rejected": -4.123187065124512, "step": 480 }, { "epoch": 0.36, - "grad_norm": 48.24790366086697, + "grad_norm": 45.88835702974933, "learning_rate": 4.0126684680570074e-07, - "logits/chosen": -0.8232128024101257, - "logits/rejected": -0.3200731873512268, - "logps/chosen": -487.1802673339844, - "logps/rejected": -620.1703491210938, - "loss": 0.3632, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.8669496774673462, - "rewards/margins": 1.7825498580932617, - "rewards/rejected": -3.6494994163513184, + "logits/chosen": -0.3817380368709564, + "logits/rejected": 0.1566486358642578, + "logps/chosen": -461.13934326171875, + "logps/rejected": -592.1519165039062, + "loss": 0.334, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.8447940349578857, + "rewards/margins": 1.7669038772583008, + "rewards/rejected": -3.6116981506347656, "step": 490 }, { "epoch": 0.37, - "grad_norm": 53.47941403954316, + "grad_norm": 53.85769217498667, "learning_rate": 3.960529127243902e-07, - "logits/chosen": -0.6839994192123413, - "logits/rejected": -0.3794109523296356, - "logps/chosen": -454.06494140625, - "logps/rejected": -560.31396484375, - "loss": 0.3576, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.671570062637329, - "rewards/margins": 1.5296937227249146, - "rewards/rejected": -3.201263904571533, + "logits/chosen": -0.31509625911712646, + "logits/rejected": -0.04504912719130516, + "logps/chosen": -477.027099609375, + "logps/rejected": -654.2672119140625, + "loss": 0.3559, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.053821086883545, + "rewards/margins": 2.070889711380005, + "rewards/rejected": -4.124711036682129, "step": 500 }, { "epoch": 0.37, - "eval_logits/chosen": -1.1202046871185303, - "eval_logits/rejected": -0.8921781182289124, - "eval_logps/chosen": -444.0173034667969, - "eval_logps/rejected": -560.5327758789062, - "eval_loss": 0.5183658599853516, - "eval_rewards/accuracies": 0.76953125, - "eval_rewards/chosen": -1.6238377094268799, - "eval_rewards/margins": 1.103874921798706, - "eval_rewards/rejected": -2.727712631225586, - "eval_runtime": 98.0942, - "eval_samples_per_second": 20.389, - "eval_steps_per_second": 0.326, + "eval_logits/chosen": -1.0066841840744019, + "eval_logits/rejected": -0.6833571791648865, + "eval_logps/chosen": -456.0086364746094, + "eval_logps/rejected": -568.9822387695312, + "eval_loss": 0.5275729894638062, + "eval_rewards/accuracies": 0.7578125, + "eval_rewards/chosen": -1.4423211812973022, + "eval_rewards/margins": 1.0722503662109375, + "eval_rewards/rejected": -2.5145716667175293, + "eval_runtime": 97.6519, + "eval_samples_per_second": 20.481, + "eval_steps_per_second": 0.328, "step": 500 }, { "epoch": 0.38, - "grad_norm": 47.67862520127771, + "grad_norm": 53.47947486686438, "learning_rate": 3.9074069136117594e-07, - "logits/chosen": -0.6645992994308472, - "logits/rejected": -0.23552604019641876, - "logps/chosen": -437.04168701171875, - "logps/rejected": -573.43017578125, - "loss": 0.3616, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.357367753982544, - "rewards/margins": 1.7008861303329468, - "rewards/rejected": -4.058254241943359, + "logits/chosen": -0.6587181687355042, + "logits/rejected": -0.11707913875579834, + "logps/chosen": -478.9352111816406, + "logps/rejected": -631.669921875, + "loss": 0.35, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.0555968284606934, + "rewards/margins": 1.9847618341445923, + "rewards/rejected": -4.040358543395996, "step": 510 }, { "epoch": 0.39, - "grad_norm": 55.20432992821677, + "grad_norm": 48.01190508303512, "learning_rate": 3.8533375761086094e-07, - "logits/chosen": -0.6538324356079102, - "logits/rejected": -0.28878337144851685, - "logps/chosen": -424.3060607910156, - "logps/rejected": -554.125732421875, - "loss": 0.3651, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.6924982070922852, - "rewards/margins": 1.4187054634094238, - "rewards/rejected": -3.111203670501709, + "logits/chosen": -0.6520954966545105, + "logits/rejected": -0.19666698575019836, + "logps/chosen": -399.66455078125, + "logps/rejected": -589.08251953125, + "loss": 0.3518, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.5765998363494873, + "rewards/margins": 2.0024795532226562, + "rewards/rejected": -3.5790793895721436, "step": 520 }, { "epoch": 0.39, - "grad_norm": 48.99836031219826, + "grad_norm": 58.201909693922666, "learning_rate": 3.79835750105581e-07, - "logits/chosen": -0.5227429270744324, - "logits/rejected": -0.26728391647338867, - "logps/chosen": -419.548095703125, - "logps/rejected": -586.7620849609375, - "loss": 0.3642, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.778322458267212, - "rewards/margins": 1.765242338180542, - "rewards/rejected": -3.543564558029175, + "logits/chosen": -0.015231219120323658, + "logits/rejected": 0.524590253829956, + "logps/chosen": -425.837890625, + "logps/rejected": -576.46630859375, + "loss": 0.364, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.9596973657608032, + "rewards/margins": 1.918087363243103, + "rewards/rejected": -3.8777847290039062, "step": 530 }, { "epoch": 0.4, - "grad_norm": 58.08597953496209, + "grad_norm": 53.67325387574443, "learning_rate": 3.742503687661627e-07, - "logits/chosen": -0.20574072003364563, - "logits/rejected": 0.2035825252532959, - "logps/chosen": -465.63763427734375, - "logps/rejected": -606.3990478515625, - "loss": 0.3525, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.341270923614502, - "rewards/margins": 1.657379388809204, - "rewards/rejected": -3.998650312423706, + "logits/chosen": 0.3345823585987091, + "logits/rejected": 0.8041492700576782, + "logps/chosen": -436.06170654296875, + "logps/rejected": -628.6650390625, + "loss": 0.3413, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.163074016571045, + "rewards/margins": 2.0728249549865723, + "rewards/rejected": -4.235899925231934, "step": 540 }, { "epoch": 0.41, - "grad_norm": 45.74418223365167, + "grad_norm": 54.5126564713129, "learning_rate": 3.685813723122372e-07, - "logits/chosen": -0.06602563709020615, - "logits/rejected": 0.41205328702926636, - "logps/chosen": -458.8877868652344, - "logps/rejected": -588.2848510742188, - "loss": 0.374, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -2.2084052562713623, - "rewards/margins": 1.5406155586242676, - "rewards/rejected": -3.74902081489563, + "logits/chosen": 0.6497628688812256, + "logits/rejected": 1.1682524681091309, + "logps/chosen": -425.30157470703125, + "logps/rejected": -617.69482421875, + "loss": 0.3365, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.9300180673599243, + "rewards/margins": 2.057875394821167, + "rewards/rejected": -3.987893581390381, "step": 550 }, { "epoch": 0.42, - "grad_norm": 49.47898611306175, + "grad_norm": 62.74924566191948, "learning_rate": 3.6283257573278466e-07, - "logits/chosen": -0.08754973113536835, - "logits/rejected": 0.23806925117969513, - "logps/chosen": -423.28985595703125, - "logps/rejected": -584.5783081054688, - "loss": 0.3585, - "rewards/accuracies": 0.90625, - "rewards/chosen": -1.8439204692840576, - "rewards/margins": 1.726400375366211, - "rewards/rejected": -3.5703208446502686, + "logits/chosen": 0.867998480796814, + "logits/rejected": 1.330685019493103, + "logps/chosen": -455.71124267578125, + "logps/rejected": -659.052978515625, + "loss": 0.3223, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.0765323638916016, + "rewards/margins": 2.156247615814209, + "rewards/rejected": -4.2327799797058105, "step": 560 }, { "epoch": 0.42, - "grad_norm": 59.10150187997885, + "grad_norm": 48.6969642598068, "learning_rate": 3.5700784771881224e-07, - "logits/chosen": 0.15894003212451935, - "logits/rejected": 0.4934872090816498, - "logps/chosen": -443.24053955078125, - "logps/rejected": -601.0978393554688, - "loss": 0.3442, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -2.163815975189209, - "rewards/margins": 1.6688892841339111, - "rewards/rejected": -3.83270525932312, + "logits/chosen": 1.0166234970092773, + "logits/rejected": 1.6870880126953125, + "logps/chosen": -478.86407470703125, + "logps/rejected": -635.7424926757812, + "loss": 0.3382, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.4357941150665283, + "rewards/margins": 1.9054218530654907, + "rewards/rejected": -4.341216087341309, "step": 570 }, { "epoch": 0.43, - "grad_norm": 54.44170466261096, + "grad_norm": 43.243072977055355, "learning_rate": 3.511111080598925e-07, - "logits/chosen": 0.004490786697715521, - "logits/rejected": 0.34507110714912415, - "logps/chosen": -441.2269592285156, - "logps/rejected": -602.7559814453125, - "loss": 0.3492, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.914200782775879, - "rewards/margins": 1.9196977615356445, - "rewards/rejected": -3.8338985443115234, + "logits/chosen": 0.6339820623397827, + "logits/rejected": 1.3627948760986328, + "logps/chosen": -447.268798828125, + "logps/rejected": -636.5888671875, + "loss": 0.3276, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.9055280685424805, + "rewards/margins": 2.3114867210388184, + "rewards/rejected": -4.217014312744141, "step": 580 }, { "epoch": 0.44, - "grad_norm": 53.89900166712047, + "grad_norm": 69.40196325230258, "learning_rate": 3.451463250063146e-07, - "logits/chosen": -0.27837514877319336, - "logits/rejected": 0.13625089824199677, - "logps/chosen": -448.6805725097656, - "logps/rejected": -645.89013671875, - "loss": 0.324, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.93107008934021, - "rewards/margins": 2.10721755027771, - "rewards/rejected": -4.038287162780762, + "logits/chosen": 0.8395903706550598, + "logits/rejected": 1.488012671470642, + "logps/chosen": -432.853271484375, + "logps/rejected": -630.223876953125, + "loss": 0.3378, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9797086715698242, + "rewards/margins": 2.143889904022217, + "rewards/rejected": -4.123598098754883, "step": 590 }, { "epoch": 0.45, - "grad_norm": 51.7074718432086, + "grad_norm": 59.19017069860126, "learning_rate": 3.3911751259862403e-07, - "logits/chosen": -0.041231829673051834, - "logits/rejected": 0.34056779742240906, - "logps/chosen": -443.47412109375, - "logps/rejected": -629.924560546875, - "loss": 0.3328, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -1.894299864768982, - "rewards/margins": 2.1391186714172363, - "rewards/rejected": -4.033418655395508, + "logits/chosen": 0.9315579533576965, + "logits/rejected": 1.3961995840072632, + "logps/chosen": -493.1189880371094, + "logps/rejected": -684.4100341796875, + "loss": 0.3291, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.291141986846924, + "rewards/margins": 2.0969302654266357, + "rewards/rejected": -4.3880720138549805, "step": 600 }, { "epoch": 0.45, - "eval_logits/chosen": -0.06941252946853638, - "eval_logits/rejected": 0.24228118360042572, - "eval_logps/chosen": -493.6551513671875, - "eval_logps/rejected": -628.6859130859375, - "eval_loss": 0.5151087641716003, - "eval_rewards/accuracies": 0.765625, - "eval_rewards/chosen": -2.1202151775360107, - "eval_rewards/margins": 1.2890279293060303, - "eval_rewards/rejected": -3.409243106842041, - "eval_runtime": 98.1083, - "eval_samples_per_second": 20.386, - "eval_steps_per_second": 0.326, + "eval_logits/chosen": -0.2334394007921219, + "eval_logits/rejected": 0.188625305891037, + "eval_logps/chosen": -477.9444580078125, + "eval_logps/rejected": -595.6332397460938, + "eval_loss": 0.5102677941322327, + "eval_rewards/accuracies": 0.76953125, + "eval_rewards/chosen": -1.6616793870925903, + "eval_rewards/margins": 1.1194015741348267, + "eval_rewards/rejected": -2.781080961227417, + "eval_runtime": 97.2562, + "eval_samples_per_second": 20.564, + "eval_steps_per_second": 0.329, "step": 600 }, { "epoch": 0.45, - "grad_norm": 55.66397888256945, + "grad_norm": 37.653590501774474, "learning_rate": 3.3302872796634754e-07, - "logits/chosen": 0.49535948038101196, - "logits/rejected": 1.0743920803070068, - "logps/chosen": -468.02978515625, - "logps/rejected": -642.6104736328125, - "loss": 0.3096, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -2.297302722930908, - "rewards/margins": 2.0772812366485596, - "rewards/rejected": -4.374583721160889, + "logits/chosen": 0.9580332040786743, + "logits/rejected": 1.3357497453689575, + "logps/chosen": -427.964111328125, + "logps/rejected": -620.7327880859375, + "loss": 0.3122, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.95559823513031, + "rewards/margins": 2.1169991493225098, + "rewards/rejected": -4.072597503662109, "step": 610 }, { "epoch": 0.46, - "grad_norm": 51.7383697331199, + "grad_norm": 47.96131506831022, "learning_rate": 3.2688406859772035e-07, - "logits/chosen": 0.2597588300704956, - "logits/rejected": 0.673038125038147, - "logps/chosen": -444.8384704589844, - "logps/rejected": -659.6806030273438, - "loss": 0.3346, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.2546586990356445, - "rewards/margins": 2.226531744003296, - "rewards/rejected": -4.4811906814575195, + "logits/chosen": 0.8878351449966431, + "logits/rejected": 1.4351171255111694, + "logps/chosen": -489.7989196777344, + "logps/rejected": -665.8047485351562, + "loss": 0.3224, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.195067882537842, + "rewards/margins": 2.1086602210998535, + "rewards/rejected": -4.3037285804748535, "step": 620 }, { "epoch": 0.47, - "grad_norm": 54.19919430150042, + "grad_norm": 65.32009143781127, "learning_rate": 3.206876695822541e-07, - "logits/chosen": -0.12093131244182587, - "logits/rejected": 0.4730301797389984, - "logps/chosen": -452.8343811035156, - "logps/rejected": -602.112060546875, - "loss": 0.3051, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -1.8909651041030884, - "rewards/margins": 1.9853994846343994, - "rewards/rejected": -3.876364231109619, + "logits/chosen": 1.3710159063339233, + "logits/rejected": 1.7163244485855103, + "logps/chosen": -493.956298828125, + "logps/rejected": -688.6646728515625, + "loss": 0.3129, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.438476085662842, + "rewards/margins": 2.2680106163024902, + "rewards/rejected": -4.706486701965332, "step": 630 }, { "epoch": 0.48, - "grad_norm": 50.389338383353866, + "grad_norm": 66.03238810693847, "learning_rate": 3.144437008280012e-07, - "logits/chosen": -0.09394478052854538, - "logits/rejected": 0.33072829246520996, - "logps/chosen": -455.43731689453125, - "logps/rejected": -609.87744140625, - "loss": 0.3263, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -2.1607372760772705, - "rewards/margins": 1.885525107383728, - "rewards/rejected": -4.046262264251709, + "logits/chosen": 0.709919273853302, + "logits/rejected": 1.0818461179733276, + "logps/chosen": -468.56890869140625, + "logps/rejected": -691.1434326171875, + "loss": 0.3232, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.252897262573242, + "rewards/margins": 2.3767807483673096, + "rewards/rejected": -4.629677772521973, "step": 640 }, { "epoch": 0.48, - "grad_norm": 61.479087062458596, + "grad_norm": 47.885060646853404, "learning_rate": 3.0815636425538665e-07, - "logits/chosen": 0.20981228351593018, - "logits/rejected": 0.55927574634552, - "logps/chosen": -474.67803955078125, - "logps/rejected": -664.0802001953125, - "loss": 0.3204, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.3460288047790527, - "rewards/margins": 2.071723222732544, - "rewards/rejected": -4.417751789093018, + "logits/chosen": 1.0194989442825317, + "logits/rejected": 1.571274995803833, + "logps/chosen": -446.6681213378906, + "logps/rejected": -611.84033203125, + "loss": 0.3429, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.190187454223633, + "rewards/margins": 2.0423951148986816, + "rewards/rejected": -4.232582092285156, "step": 650 }, { "epoch": 0.49, - "grad_norm": 69.05844503540243, + "grad_norm": 59.75526535732341, "learning_rate": 3.018298909694986e-07, - "logits/chosen": 0.25689777731895447, - "logits/rejected": 0.7616270184516907, - "logps/chosen": -470.198486328125, - "logps/rejected": -662.539306640625, - "loss": 0.3289, + "logits/chosen": 1.3580573797225952, + "logits/rejected": 1.913851022720337, + "logps/chosen": -489.56982421875, + "logps/rejected": -673.2572021484375, + "loss": 0.3288, "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -2.4795608520507812, - "rewards/margins": 1.899802565574646, - "rewards/rejected": -4.379363059997559, + "rewards/chosen": -2.595083236694336, + "rewards/margins": 2.0307328701019287, + "rewards/rejected": -4.6258158683776855, "step": 660 }, { "epoch": 0.5, - "grad_norm": 45.7911918343519, + "grad_norm": 51.20761564052719, "learning_rate": 2.954685384127371e-07, - "logits/chosen": 0.062225330621004105, - "logits/rejected": 0.41303902864456177, - "logps/chosen": -428.16387939453125, - "logps/rejected": -597.1376953125, - "loss": 0.306, - "rewards/accuracies": 0.84375, - "rewards/chosen": -2.1429684162139893, - "rewards/margins": 1.8161710500717163, - "rewards/rejected": -3.959139347076416, + "logits/chosen": 0.8674410581588745, + "logits/rejected": 1.4072096347808838, + "logps/chosen": -482.65789794921875, + "logps/rejected": -649.311279296875, + "loss": 0.301, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.194945812225342, + "rewards/margins": 2.093947172164917, + "rewards/rejected": -4.288893222808838, "step": 670 }, { "epoch": 0.51, - "grad_norm": 57.36445697043156, + "grad_norm": 62.65952308868226, "learning_rate": 2.8907658749974054e-07, - "logits/chosen": -0.09550069272518158, - "logits/rejected": 0.35600176453590393, - "logps/chosen": -455.721923828125, - "logps/rejected": -670.6884765625, - "loss": 0.2969, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.200000762939453, - "rewards/margins": 2.278864622116089, - "rewards/rejected": -4.478865146636963, + "logits/chosen": 0.9979363679885864, + "logits/rejected": 1.4131087064743042, + "logps/chosen": -457.8363342285156, + "logps/rejected": -703.2235107421875, + "loss": 0.2929, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.264411449432373, + "rewards/margins": 2.5431039333343506, + "rewards/rejected": -4.807515621185303, "step": 680 }, { "epoch": 0.51, - "grad_norm": 57.63859566762506, + "grad_norm": 49.65473672539794, "learning_rate": 2.8265833973651503e-07, - "logits/chosen": 0.4362770915031433, - "logits/rejected": 1.019630789756775, - "logps/chosen": -481.36944580078125, - "logps/rejected": -682.4991455078125, - "loss": 0.3037, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -2.728414297103882, - "rewards/margins": 2.1998586654663086, - "rewards/rejected": -4.928272724151611, + "logits/chosen": 0.6275979280471802, + "logits/rejected": 1.0561200380325317, + "logps/chosen": -459.69976806640625, + "logps/rejected": -684.1864013671875, + "loss": 0.2859, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.8421128988265991, + "rewards/margins": 2.5259382724761963, + "rewards/rejected": -4.368051528930664, "step": 690 }, { "epoch": 0.52, - "grad_norm": 38.04349040823844, + "grad_norm": 48.72864396453521, "learning_rate": 2.7621811432570736e-07, - "logits/chosen": 0.0020432949531823397, - "logits/rejected": 0.678579568862915, - "logps/chosen": -452.239990234375, - "logps/rejected": -615.9015502929688, - "loss": 0.3131, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.9651463031768799, - "rewards/margins": 2.032357931137085, - "rewards/rejected": -3.997504472732544, + "logits/chosen": 0.8585799336433411, + "logits/rejected": 1.5937745571136475, + "logps/chosen": -518.5455932617188, + "logps/rejected": -734.5382690429688, + "loss": 0.2735, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.441080093383789, + "rewards/margins": 2.6617679595947266, + "rewards/rejected": -5.102847576141357, "step": 700 }, { "epoch": 0.52, - "eval_logits/chosen": -0.1655656397342682, - "eval_logits/rejected": 0.172866553068161, - "eval_logps/chosen": -451.9695739746094, - "eval_logps/rejected": -578.1397705078125, - "eval_loss": 0.5152586698532104, - "eval_rewards/accuracies": 0.765625, - "eval_rewards/chosen": -1.7033603191375732, - "eval_rewards/margins": 1.2004213333129883, - "eval_rewards/rejected": -2.9037814140319824, - "eval_runtime": 98.1096, - "eval_samples_per_second": 20.385, - "eval_steps_per_second": 0.326, + "eval_logits/chosen": 0.18704134225845337, + "eval_logits/rejected": 0.6721899509429932, + "eval_logps/chosen": -541.279541015625, + "eval_logps/rejected": -687.587158203125, + "eval_loss": 0.5288776159286499, + "eval_rewards/accuracies": 0.76171875, + "eval_rewards/chosen": -2.2950310707092285, + "eval_rewards/margins": 1.40558922290802, + "eval_rewards/rejected": -3.70061993598938, + "eval_runtime": 97.5006, + "eval_samples_per_second": 20.513, + "eval_steps_per_second": 0.328, "step": 700 }, { "epoch": 0.53, - "grad_norm": 60.9670305167988, + "grad_norm": 50.62866425523001, "learning_rate": 2.6976024525996917e-07, - "logits/chosen": 0.702204167842865, - "logits/rejected": 0.9482040405273438, - "logps/chosen": -400.1419372558594, - "logps/rejected": -616.5157470703125, - "loss": 0.2933, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -2.058786630630493, - "rewards/margins": 2.1224417686462402, - "rewards/rejected": -4.1812286376953125, + "logits/chosen": 1.1524347066879272, + "logits/rejected": 1.7467842102050781, + "logps/chosen": -503.6927795410156, + "logps/rejected": -780.6187744140625, + "loss": 0.286, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.7125723361968994, + "rewards/margins": 2.8134512901306152, + "rewards/rejected": -5.5260233879089355, "step": 710 }, { "epoch": 0.53, - "grad_norm": 57.54299069526284, + "grad_norm": 56.03367218705217, "learning_rate": 2.6328907840536706e-07, - "logits/chosen": 0.6100508570671082, - "logits/rejected": 0.9714565277099609, - "logps/chosen": -446.7682189941406, - "logps/rejected": -678.0450439453125, - "loss": 0.312, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -2.1031315326690674, - "rewards/margins": 2.523500680923462, - "rewards/rejected": -4.626631736755371, + "logits/chosen": 0.7062090039253235, + "logits/rejected": 1.2199087142944336, + "logps/chosen": -460.45794677734375, + "logps/rejected": -685.5617065429688, + "loss": 0.3244, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.43827748298645, + "rewards/margins": 2.252427577972412, + "rewards/rejected": -4.690704822540283, "step": 720 }, { "epoch": 0.54, - "grad_norm": 55.010003089665325, + "grad_norm": 57.82647372234183, "learning_rate": 2.568089685768038e-07, - "logits/chosen": 0.33207422494888306, - "logits/rejected": 0.8213400840759277, - "logps/chosen": -466.47967529296875, - "logps/rejected": -616.5966796875, - "loss": 0.3037, - "rewards/accuracies": 0.84375, - "rewards/chosen": -2.145937442779541, - "rewards/margins": 1.8470547199249268, - "rewards/rejected": -3.992992401123047, + "logits/chosen": 0.6572129130363464, + "logits/rejected": 1.0754339694976807, + "logps/chosen": -530.2496337890625, + "logps/rejected": -698.03662109375, + "loss": 0.313, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.59128475189209, + "rewards/margins": 2.117705821990967, + "rewards/rejected": -4.708990573883057, "step": 730 }, { "epoch": 0.55, - "grad_norm": 56.034615355832116, + "grad_norm": 50.473574423912424, "learning_rate": 2.503242766074156e-07, - "logits/chosen": 1.0638476610183716, - "logits/rejected": 1.7732141017913818, - "logps/chosen": -467.27789306640625, - "logps/rejected": -673.653564453125, - "loss": 0.3135, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.2686350345611572, - "rewards/margins": 2.4400157928466797, - "rewards/rejected": -4.708651542663574, + "logits/chosen": 0.42826253175735474, + "logits/rejected": 1.0195951461791992, + "logps/chosen": -451.046142578125, + "logps/rejected": -653.2913818359375, + "loss": 0.2898, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.9979403018951416, + "rewards/margins": 2.318507432937622, + "rewards/rejected": -4.316447734832764, "step": 740 }, { "epoch": 0.56, - "grad_norm": 55.33213405677105, + "grad_norm": 61.13648555404995, "learning_rate": 2.4383936641392136e-07, - "logits/chosen": 0.8964494466781616, - "logits/rejected": 1.4589967727661133, - "logps/chosen": -475.98529052734375, - "logps/rejected": -655.1957397460938, - "loss": 0.3094, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.501713514328003, - "rewards/margins": 2.241748094558716, - "rewards/rejected": -4.743461608886719, + "logits/chosen": 0.6429548859596252, + "logits/rejected": 1.103127360343933, + "logps/chosen": -467.82049560546875, + "logps/rejected": -702.5692749023438, + "loss": 0.2975, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -2.0785393714904785, + "rewards/margins": 2.386026382446289, + "rewards/rejected": -4.464566230773926, "step": 750 }, { "epoch": 0.56, - "grad_norm": 56.701790611883894, + "grad_norm": 51.760001565819636, "learning_rate": 2.3735860205989493e-07, - "logits/chosen": 0.8617841005325317, - "logits/rejected": 1.3072969913482666, - "logps/chosen": -474.41583251953125, - "logps/rejected": -685.8687744140625, - "loss": 0.2749, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -2.408444881439209, - "rewards/margins": 2.277350425720215, - "rewards/rejected": -4.685795307159424, + "logits/chosen": 0.7451823353767395, + "logits/rejected": 1.1489431858062744, + "logps/chosen": -462.767333984375, + "logps/rejected": -706.5615234375, + "loss": 0.2627, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.312885284423828, + "rewards/margins": 2.6091692447662354, + "rewards/rejected": -4.922054767608643, "step": 760 }, { "epoch": 0.57, - "grad_norm": 53.32774327458172, + "grad_norm": 56.13632726849474, "learning_rate": 2.308863448189402e-07, - "logits/chosen": 0.5902019739151001, - "logits/rejected": 0.8791207075119019, - "logps/chosen": -452.8211364746094, - "logps/rejected": -647.130126953125, - "loss": 0.3028, - "rewards/accuracies": 0.84375, - "rewards/chosen": -2.1281752586364746, - "rewards/margins": 2.103821277618408, - "rewards/rejected": -4.231996536254883, + "logits/chosen": 0.5960752367973328, + "logits/rejected": 1.0421712398529053, + "logps/chosen": -498.1941833496094, + "logps/rejected": -695.0504760742188, + "loss": 0.2811, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.29612398147583, + "rewards/margins": 2.4551825523376465, + "rewards/rejected": -4.751306533813477, "step": 770 }, { "epoch": 0.58, - "grad_norm": 66.84990640273234, + "grad_norm": 67.7549300842345, "learning_rate": 2.2442695023974246e-07, - "logits/chosen": 0.7262977361679077, - "logits/rejected": 1.3522447347640991, - "logps/chosen": -426.29339599609375, - "logps/rejected": -622.6487426757812, - "loss": 0.289, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.120136022567749, - "rewards/margins": 2.228008985519409, - "rewards/rejected": -4.348145008087158, + "logits/chosen": 0.6856900453567505, + "logits/rejected": 1.3306076526641846, + "logps/chosen": -444.3168029785156, + "logps/rejected": -679.816650390625, + "loss": 0.2713, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.0717947483062744, + "rewards/margins": 2.6752490997314453, + "rewards/rejected": -4.747043609619141, "step": 780 }, { "epoch": 0.59, - "grad_norm": 58.09209476264956, + "grad_norm": 55.628538802719504, "learning_rate": 2.179847652149729e-07, - "logits/chosen": 0.5433401465415955, - "logits/rejected": 1.0540757179260254, - "logps/chosen": -516.3043212890625, - "logps/rejected": -711.8958740234375, - "loss": 0.2875, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.7564001083374023, - "rewards/margins": 2.2523038387298584, - "rewards/rejected": -5.00870418548584, + "logits/chosen": 0.7401930093765259, + "logits/rejected": 1.288172960281372, + "logps/chosen": -496.6468811035156, + "logps/rejected": -687.7960205078125, + "loss": 0.295, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.4609100818634033, + "rewards/margins": 2.223629951477051, + "rewards/rejected": -4.684540271759033, "step": 790 }, { "epoch": 0.59, - "grad_norm": 51.81489549587344, + "grad_norm": 63.651106043315345, "learning_rate": 2.115641250560183e-07, - "logits/chosen": 0.46662241220474243, - "logits/rejected": 0.9530113339424133, - "logps/chosen": -531.3187866210938, - "logps/rejected": -718.2420043945312, - "loss": 0.2547, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.6745731830596924, - "rewards/margins": 2.1868960857391357, - "rewards/rejected": -4.86146879196167, + "logits/chosen": 0.8801604509353638, + "logits/rejected": 1.5266039371490479, + "logps/chosen": -473.2115173339844, + "logps/rejected": -701.8800659179688, + "loss": 0.2752, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.4201507568359375, + "rewards/margins": 2.4442293643951416, + "rewards/rejected": -4.864380836486816, "step": 800 }, { "epoch": 0.59, - "eval_logits/chosen": 0.12700016796588898, - "eval_logits/rejected": 0.44755151867866516, - "eval_logps/chosen": -535.29150390625, - "eval_logps/rejected": -673.45654296875, - "eval_loss": 0.5256171822547913, - "eval_rewards/accuracies": 0.76171875, - "eval_rewards/chosen": -2.5365793704986572, - "eval_rewards/margins": 1.3203709125518799, - "eval_rewards/rejected": -3.856950521469116, - "eval_runtime": 98.0354, - "eval_samples_per_second": 20.401, - "eval_steps_per_second": 0.326, + "eval_logits/chosen": -0.16280797123908997, + "eval_logits/rejected": 0.2751551866531372, + "eval_logps/chosen": -533.1201782226562, + "eval_logps/rejected": -668.2235717773438, + "eval_loss": 0.5228938460350037, + "eval_rewards/accuracies": 0.765625, + "eval_rewards/chosen": -2.2134366035461426, + "eval_rewards/margins": 1.2935477495193481, + "eval_rewards/rejected": -3.506984233856201, + "eval_runtime": 97.387, + "eval_samples_per_second": 20.537, + "eval_steps_per_second": 0.329, "step": 800 }, { "epoch": 0.6, - "grad_norm": 54.66675021265227, + "grad_norm": 70.2608582618962, "learning_rate": 2.051693505755042e-07, - "logits/chosen": 0.33729100227355957, - "logits/rejected": 0.9711275100708008, - "logps/chosen": -483.88861083984375, - "logps/rejected": -701.2562255859375, - "loss": 0.2491, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.316730260848999, - "rewards/margins": 2.6504831314086914, - "rewards/rejected": -4.9672136306762695, + "logits/chosen": 0.8354732394218445, + "logits/rejected": 1.2750941514968872, + "logps/chosen": -461.49786376953125, + "logps/rejected": -705.8599853515625, + "loss": 0.2946, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.4096267223358154, + "rewards/margins": 2.483677864074707, + "rewards/rejected": -4.893305778503418, "step": 810 }, { "epoch": 0.61, - "grad_norm": 57.83387891787277, + "grad_norm": 49.246802198712466, "learning_rate": 1.9880474517957542e-07, - "logits/chosen": 0.38913315534591675, - "logits/rejected": 0.8929936289787292, - "logps/chosen": -511.2947692871094, - "logps/rejected": -719.1864013671875, - "loss": 0.2861, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -2.5449912548065186, - "rewards/margins": 2.4298605918884277, - "rewards/rejected": -4.974852085113525, + "logits/chosen": 0.9254199862480164, + "logits/rejected": 1.563522458076477, + "logps/chosen": -481.2748107910156, + "logps/rejected": -658.328125, + "loss": 0.2674, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.385385036468506, + "rewards/margins": 2.1492881774902344, + "rewards/rejected": -4.53467321395874, "step": 820 }, { "epoch": 0.62, - "grad_norm": 54.573496843465676, + "grad_norm": 88.28145029556197, "learning_rate": 1.9247459197189e-07, - "logits/chosen": 0.2409726083278656, - "logits/rejected": 0.4742881655693054, - "logps/chosen": -487.2377014160156, - "logps/rejected": -707.2930908203125, - "loss": 0.2709, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": -2.423832416534424, - "rewards/margins": 2.337522029876709, - "rewards/rejected": -4.761354446411133, + "logits/chosen": 0.8668380975723267, + "logits/rejected": 1.5001232624053955, + "logps/chosen": -488.27685546875, + "logps/rejected": -680.9069213867188, + "loss": 0.2652, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.6699295043945312, + "rewards/margins": 2.2055306434631348, + "rewards/rejected": -4.875459671020508, "step": 830 }, { "epoch": 0.62, - "grad_norm": 57.59285615682952, + "grad_norm": 43.13543734061108, "learning_rate": 1.8618315087127602e-07, - "logits/chosen": -0.03551248461008072, - "logits/rejected": 0.5289381146430969, - "logps/chosen": -485.76348876953125, - "logps/rejected": -648.2017822265625, - "loss": 0.2908, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -2.2680952548980713, - "rewards/margins": 2.0456957817077637, - "rewards/rejected": -4.313790321350098, + "logits/chosen": 0.6826521754264832, + "logits/rejected": 1.2443543672561646, + "logps/chosen": -499.20892333984375, + "logps/rejected": -706.3511962890625, + "loss": 0.2563, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -2.4423558712005615, + "rewards/margins": 2.461874485015869, + "rewards/rejected": -4.904230117797852, "step": 840 }, { "epoch": 0.63, - "grad_norm": 73.30688641890157, + "grad_norm": 56.63843357010467, "learning_rate": 1.7993465574499102e-07, - "logits/chosen": -0.13685956597328186, - "logits/rejected": 0.5006009340286255, - "logps/chosen": -509.0750427246094, - "logps/rejected": -739.2849731445312, - "loss": 0.2572, + "logits/chosen": 0.5323538184165955, + "logits/rejected": 1.2176125049591064, + "logps/chosen": -463.47857666015625, + "logps/rejected": -663.4465942382812, + "loss": 0.2759, "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.391108512878418, - "rewards/margins": 2.590216875076294, - "rewards/rejected": -4.981324672698975, + "rewards/chosen": -2.189335823059082, + "rewards/margins": 2.420409679412842, + "rewards/rejected": -4.609745502471924, "step": 850 }, { "epoch": 0.64, - "grad_norm": 60.15832089933168, + "grad_norm": 56.31423994279339, "learning_rate": 1.7373331155951233e-07, - "logits/chosen": 0.16482150554656982, - "logits/rejected": 0.5440123677253723, - "logps/chosen": -488.3111877441406, - "logps/rejected": -694.9370727539062, - "loss": 0.2517, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.627448558807373, - "rewards/margins": 2.2580535411834717, - "rewards/rejected": -4.885501861572266, + "logits/chosen": 0.8688204884529114, + "logits/rejected": 1.4698970317840576, + "logps/chosen": -510.4227600097656, + "logps/rejected": -748.5259399414062, + "loss": 0.2649, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -2.550417900085449, + "rewards/margins": 2.730776309967041, + "rewards/rejected": -5.28119421005249, "step": 860 }, { "epoch": 0.65, - "grad_norm": 48.979507346317426, + "grad_norm": 50.688626621321205, "learning_rate": 1.6758329155077743e-07, - "logits/chosen": 0.2300490438938141, - "logits/rejected": 0.6597995758056641, - "logps/chosen": -471.4276428222656, - "logps/rejected": -696.9656372070312, - "loss": 0.2572, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": -2.564178943634033, - "rewards/margins": 2.4548990726470947, - "rewards/rejected": -5.019078254699707, + "logits/chosen": 1.0613950490951538, + "logits/rejected": 1.5818780660629272, + "logps/chosen": -495.5560607910156, + "logps/rejected": -708.2391967773438, + "loss": 0.2711, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.642883777618408, + "rewards/margins": 2.6204209327697754, + "rewards/rejected": -5.263304710388184, "step": 870 }, { "epoch": 0.65, - "grad_norm": 54.56547566856159, + "grad_norm": 46.10359729315069, "learning_rate": 1.6148873441577662e-07, - "logits/chosen": 0.0745859146118164, - "logits/rejected": 0.5892910361289978, - "logps/chosen": -490.7748107910156, - "logps/rejected": -704.1431884765625, - "loss": 0.2334, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.500243663787842, - "rewards/margins": 2.4989378452301025, - "rewards/rejected": -4.999181270599365, + "logits/chosen": 1.0479947328567505, + "logits/rejected": 1.5524357557296753, + "logps/chosen": -480.2462463378906, + "logps/rejected": -707.98681640625, + "loss": 0.2699, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.261603355407715, + "rewards/margins": 2.4961774349212646, + "rewards/rejected": -4.757781028747559, "step": 880 }, { "epoch": 0.66, - "grad_norm": 55.545059461777754, + "grad_norm": 41.346767344116245, "learning_rate": 1.5545374152738934e-07, - "logits/chosen": 0.016130054369568825, - "logits/rejected": 0.48376768827438354, - "logps/chosen": -458.7793884277344, - "logps/rejected": -710.6851196289062, - "loss": 0.25, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.172322988510132, - "rewards/margins": 2.6098039150238037, - "rewards/rejected": -4.782127380371094, + "logits/chosen": 1.1905092000961304, + "logits/rejected": 1.6182410717010498, + "logps/chosen": -468.92083740234375, + "logps/rejected": -689.1092529296875, + "loss": 0.2722, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.264604091644287, + "rewards/margins": 2.391749143600464, + "rewards/rejected": -4.65635347366333, "step": 890 }, { "epoch": 0.67, - "grad_norm": 58.8472459822313, + "grad_norm": 60.48896334839974, "learning_rate": 1.4948237417433775e-07, - "logits/chosen": -0.05456262826919556, - "logits/rejected": 0.4301723539829254, - "logps/chosen": -505.43719482421875, - "logps/rejected": -729.1197509765625, - "loss": 0.2764, - "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -2.475316286087036, - "rewards/margins": 2.53151535987854, - "rewards/rejected": -5.006831169128418, + "logits/chosen": 1.380293369293213, + "logits/rejected": 2.2697908878326416, + "logps/chosen": -436.1393127441406, + "logps/rejected": -673.2228393554688, + "loss": 0.2492, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -2.151729106903076, + "rewards/margins": 2.624401330947876, + "rewards/rejected": -4.776130676269531, "step": 900 }, { "epoch": 0.67, - "eval_logits/chosen": -0.2431109994649887, - "eval_logits/rejected": 0.051950518041849136, - "eval_logps/chosen": -538.3812866210938, - "eval_logps/rejected": -682.334228515625, - "eval_loss": 0.5221121907234192, - "eval_rewards/accuracies": 0.77734375, - "eval_rewards/chosen": -2.567476749420166, - "eval_rewards/margins": 1.3782495260238647, - "eval_rewards/rejected": -3.9457266330718994, - "eval_runtime": 98.1143, - "eval_samples_per_second": 20.384, - "eval_steps_per_second": 0.326, + "eval_logits/chosen": 0.5183509588241577, + "eval_logits/rejected": 1.0725551843643188, + "eval_logps/chosen": -518.2382202148438, + "eval_logps/rejected": -652.8116455078125, + "eval_loss": 0.5152209997177124, + "eval_rewards/accuracies": 0.7734375, + "eval_rewards/chosen": -2.064617395401001, + "eval_rewards/margins": 1.2882475852966309, + "eval_rewards/rejected": -3.352864980697632, + "eval_runtime": 97.3137, + "eval_samples_per_second": 20.552, + "eval_steps_per_second": 0.329, "step": 900 }, { "epoch": 0.68, - "grad_norm": 76.04810070991684, + "grad_norm": 59.39383985362304, "learning_rate": 1.435786508281158e-07, - "logits/chosen": 0.3129107356071472, - "logits/rejected": 0.6085219383239746, - "logps/chosen": -462.0315856933594, - "logps/rejected": -686.1160888671875, - "loss": 0.2458, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.3812341690063477, - "rewards/margins": 2.453599452972412, - "rewards/rejected": -4.83483362197876, + "logits/chosen": 1.9009380340576172, + "logits/rejected": 2.567354679107666, + "logps/chosen": -482.70513916015625, + "logps/rejected": -720.0316162109375, + "loss": 0.2499, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.3441195487976074, + "rewards/margins": 2.6516547203063965, + "rewards/rejected": -4.995774269104004, "step": 910 }, { "epoch": 0.68, - "grad_norm": 58.24143150255672, + "grad_norm": 58.953283614647454, "learning_rate": 1.3774654443873174e-07, - "logits/chosen": -0.17105606198310852, - "logits/rejected": 0.10635167360305786, - "logps/chosen": -476.74884033203125, - "logps/rejected": -704.75537109375, - "loss": 0.2662, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.2945311069488525, - "rewards/margins": 2.386672258377075, - "rewards/rejected": -4.681203365325928, + "logits/chosen": 1.749333381652832, + "logits/rejected": 2.4905173778533936, + "logps/chosen": -512.65625, + "logps/rejected": -763.8499145507812, + "loss": 0.2542, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -2.783947467803955, + "rewards/margins": 2.989567756652832, + "rewards/rejected": -5.773515224456787, "step": 920 }, { "epoch": 0.69, - "grad_norm": 61.580815446047275, + "grad_norm": 57.229551980352035, "learning_rate": 1.31989979761085e-07, - "logits/chosen": 0.19714362919330597, - "logits/rejected": 0.6800551414489746, - "logps/chosen": -470.64697265625, - "logps/rejected": -731.5118408203125, - "loss": 0.2524, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.3182313442230225, - "rewards/margins": 2.8053975105285645, - "rewards/rejected": -5.123629093170166, + "logits/chosen": 1.3056137561798096, + "logits/rejected": 2.2303478717803955, + "logps/chosen": -465.61627197265625, + "logps/rejected": -746.7559814453125, + "loss": 0.2416, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -2.5093438625335693, + "rewards/margins": 3.106735944747925, + "rewards/rejected": -5.616079807281494, "step": 930 }, { "epoch": 0.7, - "grad_norm": 66.23348721239962, + "grad_norm": 53.92751444407525, "learning_rate": 1.2631283071377618e-07, - "logits/chosen": 0.15574759244918823, - "logits/rejected": 0.7999376058578491, - "logps/chosen": -489.5287170410156, - "logps/rejected": -739.4246215820312, - "loss": 0.2535, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.4655532836914062, - "rewards/margins": 2.7967965602874756, - "rewards/rejected": -5.2623491287231445, + "logits/chosen": 1.6224052906036377, + "logits/rejected": 1.9630991220474243, + "logps/chosen": -458.9669494628906, + "logps/rejected": -742.6818237304688, + "loss": 0.2429, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.4590606689453125, + "rewards/margins": 2.7507693767547607, + "rewards/rejected": -5.209830284118652, "step": 940 }, { "epoch": 0.71, - "grad_norm": 55.370463362822846, + "grad_norm": 48.183067890071925, "learning_rate": 1.2071891777212744e-07, - "logits/chosen": 0.4210018217563629, - "logits/rejected": 1.0494170188903809, - "logps/chosen": -442.0162658691406, - "logps/rejected": -666.9036865234375, - "loss": 0.238, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.3151867389678955, - "rewards/margins": 2.4912314414978027, - "rewards/rejected": -4.806417942047119, + "logits/chosen": 1.061023235321045, + "logits/rejected": 1.9151092767715454, + "logps/chosen": -507.06744384765625, + "logps/rejected": -707.039794921875, + "loss": 0.253, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.448425054550171, + "rewards/margins": 2.3641083240509033, + "rewards/rejected": -4.812533855438232, "step": 950 }, { "epoch": 0.71, - "grad_norm": 54.37619099014209, + "grad_norm": 48.31856194766799, "learning_rate": 1.1521200539716874e-07, - "logits/chosen": 0.04333481192588806, - "logits/rejected": 0.6315183639526367, - "logps/chosen": -440.3460998535156, - "logps/rejected": -668.5031127929688, - "loss": 0.2538, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.369847297668457, - "rewards/margins": 2.58512282371521, - "rewards/rejected": -4.954970359802246, + "logits/chosen": 1.2143045663833618, + "logits/rejected": 1.9916166067123413, + "logps/chosen": -500.71038818359375, + "logps/rejected": -771.3677978515625, + "loss": 0.2426, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.3821799755096436, + "rewards/margins": 3.1737558841705322, + "rewards/rejected": -5.555935859680176, "step": 960 }, { "epoch": 0.72, - "grad_norm": 59.50016505068154, + "grad_norm": 57.66373376149326, "learning_rate": 1.0979579950231821e-07, - "logits/chosen": -0.0721534788608551, - "logits/rejected": 0.5839505195617676, - "logps/chosen": -496.072021484375, - "logps/rejected": -720.0166015625, - "loss": 0.2331, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -2.456573963165283, - "rewards/margins": 2.6809329986572266, - "rewards/rejected": -5.13750696182251, + "logits/chosen": 1.1112618446350098, + "logits/rejected": 2.246898889541626, + "logps/chosen": -502.126220703125, + "logps/rejected": -734.8248901367188, + "loss": 0.241, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.395838737487793, + "rewards/margins": 2.6420400142669678, + "rewards/rejected": -5.03787899017334, "step": 970 }, { "epoch": 0.73, - "grad_norm": 63.83570606232123, + "grad_norm": 55.20670800594472, "learning_rate": 1.0447394495946291e-07, - "logits/chosen": 0.007242382504045963, - "logits/rejected": 0.5322221517562866, - "logps/chosen": -520.4622192382812, - "logps/rejected": -716.791015625, - "loss": 0.2726, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.6428182125091553, - "rewards/margins": 2.505241870880127, - "rewards/rejected": -5.148059844970703, + "logits/chosen": 1.387683391571045, + "logits/rejected": 2.400949478149414, + "logps/chosen": -515.9779052734375, + "logps/rejected": -765.4949340820312, + "loss": 0.2468, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.70845365524292, + "rewards/margins": 2.7117531299591064, + "rewards/rejected": -5.420206546783447, "step": 980 }, { "epoch": 0.74, - "grad_norm": 76.60392151494439, + "grad_norm": 45.9412294534277, "learning_rate": 9.925002314611841e-08, - "logits/chosen": -0.2113700807094574, - "logits/rejected": 0.4387185573577881, - "logps/chosen": -535.4547119140625, - "logps/rejected": -757.9600830078125, - "loss": 0.2356, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.6833412647247314, - "rewards/margins": 2.5519299507141113, - "rewards/rejected": -5.235270977020264, + "logits/chosen": 1.8099420070648193, + "logits/rejected": 2.5098319053649902, + "logps/chosen": -484.7242736816406, + "logps/rejected": -777.49169921875, + "loss": 0.2383, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.564988374710083, + "rewards/margins": 2.9337170124053955, + "rewards/rejected": -5.498705863952637, "step": 990 }, { "epoch": 0.74, - "grad_norm": 55.208058181749024, + "grad_norm": 64.863814963629, "learning_rate": 9.412754953531663e-08, - "logits/chosen": 0.000503048300743103, - "logits/rejected": 0.7104528546333313, - "logps/chosen": -494.1576232910156, - "logps/rejected": -719.8685913085938, - "loss": 0.2261, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.6024506092071533, - "rewards/margins": 2.5579113960266113, - "rewards/rejected": -5.160361289978027, + "logits/chosen": 1.5222892761230469, + "logits/rejected": 2.5317773818969727, + "logps/chosen": -507.424072265625, + "logps/rejected": -756.7098388671875, + "loss": 0.262, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.656026601791382, + "rewards/margins": 2.7969748973846436, + "rewards/rejected": -5.453001976013184, "step": 1000 }, { "epoch": 0.74, - "eval_logits/chosen": -0.1104351058602333, - "eval_logits/rejected": 0.20232482254505157, - "eval_logps/chosen": -558.2006225585938, - "eval_logps/rejected": -712.748291015625, - "eval_loss": 0.5298039317131042, - "eval_rewards/accuracies": 0.76953125, - "eval_rewards/chosen": -2.7656705379486084, - "eval_rewards/margins": 1.4841958284378052, - "eval_rewards/rejected": -4.249866485595703, - "eval_runtime": 98.1214, - "eval_samples_per_second": 20.383, - "eval_steps_per_second": 0.326, + "eval_logits/chosen": 0.6804571151733398, + "eval_logits/rejected": 1.3123811483383179, + "eval_logps/chosen": -556.8264770507812, + "eval_logps/rejected": -703.1602783203125, + "eval_loss": 0.5241079330444336, + "eval_rewards/accuracies": 0.76171875, + "eval_rewards/chosen": -2.4504995346069336, + "eval_rewards/margins": 1.405852198600769, + "eval_rewards/rejected": -3.856351613998413, + "eval_runtime": 97.4441, + "eval_samples_per_second": 20.525, + "eval_steps_per_second": 0.328, "step": 1000 }, { "epoch": 0.75, - "grad_norm": 57.330660352565694, + "grad_norm": 69.68207773392557, "learning_rate": 8.910997132984479e-08, - "logits/chosen": 0.19682852923870087, - "logits/rejected": 0.5228155255317688, - "logps/chosen": -529.1737060546875, - "logps/rejected": -791.169921875, - "loss": 0.2475, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.5559616088867188, - "rewards/margins": 2.631944179534912, - "rewards/rejected": -5.187905311584473, + "logits/chosen": 1.820955514907837, + "logits/rejected": 2.952479839324951, + "logps/chosen": -544.1399536132812, + "logps/rejected": -808.0184936523438, + "loss": 0.2504, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.861184597015381, + "rewards/margins": 3.071931838989258, + "rewards/rejected": -5.933116436004639, "step": 1010 }, { "epoch": 0.76, - "grad_norm": 55.457376945821615, + "grad_norm": 50.59071094029437, "learning_rate": 8.42006651424274e-08, - "logits/chosen": -0.11488042026758194, - "logits/rejected": 0.43977147340774536, - "logps/chosen": -529.4292602539062, - "logps/rejected": -779.668212890625, - "loss": 0.2315, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.6139321327209473, - "rewards/margins": 2.7150697708129883, - "rewards/rejected": -5.3290019035339355, + "logits/chosen": 1.8404204845428467, + "logits/rejected": 2.6863815784454346, + "logps/chosen": -461.4169921875, + "logps/rejected": -703.1361083984375, + "loss": 0.2318, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.4329962730407715, + "rewards/margins": 2.7300188541412354, + "rewards/rejected": -5.163014888763428, "step": 1020 }, { "epoch": 0.77, - "grad_norm": 57.42354594975122, + "grad_norm": 57.22762908033313, "learning_rate": 7.940293472341217e-08, - "logits/chosen": 0.25594162940979004, - "logits/rejected": 0.661308228969574, - "logps/chosen": -465.62347412109375, - "logps/rejected": -748.9591064453125, - "loss": 0.2417, + "logits/chosen": 2.013861894607544, + "logits/rejected": 2.7502970695495605, + "logps/chosen": -477.7572326660156, + "logps/rejected": -773.4556884765625, + "loss": 0.2276, "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.5969550609588623, - "rewards/margins": 2.9456839561462402, - "rewards/rejected": -5.542638778686523, + "rewards/chosen": -2.6210336685180664, + "rewards/margins": 3.139965057373047, + "rewards/rejected": -5.7609992027282715, "step": 1030 }, { "epoch": 0.77, - "grad_norm": 44.59552618611705, + "grad_norm": 55.15868922046573, "learning_rate": 7.472000873748918e-08, - "logits/chosen": 0.21030166745185852, - "logits/rejected": 0.7939587831497192, - "logps/chosen": -488.8642578125, - "logps/rejected": -709.2621459960938, - "loss": 0.2323, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -2.7148165702819824, - "rewards/margins": 2.5755038261413574, - "rewards/rejected": -5.29032039642334, + "logits/chosen": 2.0298519134521484, + "logits/rejected": 2.990135431289673, + "logps/chosen": -528.5840454101562, + "logps/rejected": -781.4909057617188, + "loss": 0.2487, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6361494064331055, + "rewards/margins": 2.9660372734069824, + "rewards/rejected": -5.602187156677246, "step": 1040 }, { "epoch": 0.78, - "grad_norm": 64.60445708831573, + "grad_norm": 43.438291077124795, "learning_rate": 7.015503859093927e-08, - "logits/chosen": 0.2117297202348709, - "logits/rejected": 0.6777232885360718, - "logps/chosen": -483.72283935546875, - "logps/rejected": -733.4895629882812, - "loss": 0.2268, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.7278270721435547, - "rewards/margins": 2.6605074405670166, - "rewards/rejected": -5.38833475112915, + "logits/chosen": 2.1326801776885986, + "logits/rejected": 2.5511794090270996, + "logps/chosen": -486.6455078125, + "logps/rejected": -757.7630004882812, + "loss": 0.2148, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.617185592651367, + "rewards/margins": 2.795973062515259, + "rewards/rejected": -5.413158893585205, "step": 1050 }, { "epoch": 0.79, - "grad_norm": 55.01169146748481, + "grad_norm": 63.14016572546011, "learning_rate": 6.571109631087451e-08, - "logits/chosen": 0.0924178883433342, - "logits/rejected": 0.8385933041572571, - "logps/chosen": -520.66064453125, - "logps/rejected": -789.1815185546875, - "loss": 0.2073, + "logits/chosen": 2.417752742767334, + "logits/rejected": 3.036146402359009, + "logps/chosen": -494.73046875, + "logps/rejected": -811.0126953125, + "loss": 0.2112, "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -2.671236515045166, - "rewards/margins": 3.076190710067749, - "rewards/rejected": -5.747427463531494, + "rewards/chosen": -2.588284492492676, + "rewards/margins": 3.300442934036255, + "rewards/rejected": -5.888727188110352, "step": 1060 }, { "epoch": 0.79, - "grad_norm": 57.63704573643993, + "grad_norm": 58.89863039830767, "learning_rate": 6.139117247789687e-08, - "logits/chosen": 0.09710516035556793, - "logits/rejected": 0.5915436148643494, - "logps/chosen": -536.4342651367188, - "logps/rejected": -775.2525634765625, - "loss": 0.209, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": -2.6840100288391113, - "rewards/margins": 2.6324105262756348, - "rewards/rejected": -5.316420555114746, + "logits/chosen": 2.5516977310180664, + "logits/rejected": 3.055995464324951, + "logps/chosen": -535.7842407226562, + "logps/rejected": -800.0374145507812, + "loss": 0.2248, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.956123113632202, + "rewards/margins": 2.720890998840332, + "rewards/rejected": -5.677014350891113, "step": 1070 }, { "epoch": 0.8, - "grad_norm": 65.7023707481576, + "grad_norm": 41.21215573686561, "learning_rate": 5.719817421356685e-08, - "logits/chosen": -0.050966888666152954, - "logits/rejected": 0.3987392485141754, - "logps/chosen": -558.064453125, - "logps/rejected": -772.4810791015625, - "loss": 0.229, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.901360034942627, - "rewards/margins": 2.482283115386963, - "rewards/rejected": -5.383642673492432, + "logits/chosen": 1.9021530151367188, + "logits/rejected": 2.7421538829803467, + "logps/chosen": -549.5343017578125, + "logps/rejected": -820.0753784179688, + "loss": 0.2033, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -2.7265052795410156, + "rewards/margins": 3.281470537185669, + "rewards/rejected": -6.007976055145264, "step": 1080 }, { "epoch": 0.81, - "grad_norm": 67.44035324151042, + "grad_norm": 58.39711865385947, "learning_rate": 5.313492322403701e-08, - "logits/chosen": 0.33903616666793823, - "logits/rejected": 0.7725549936294556, - "logps/chosen": -484.7267150878906, - "logps/rejected": -763.6196899414062, - "loss": 0.2302, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.789487838745117, - "rewards/margins": 2.8590240478515625, - "rewards/rejected": -5.6485114097595215, + "logits/chosen": 2.2018539905548096, + "logits/rejected": 2.951138496398926, + "logps/chosen": -533.9331665039062, + "logps/rejected": -891.0558471679688, + "loss": 0.1937, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -2.8866357803344727, + "rewards/margins": 3.6149306297302246, + "rewards/rejected": -6.501566410064697, "step": 1090 }, { "epoch": 0.82, - "grad_norm": 49.24699961134825, + "grad_norm": 51.18256501676837, "learning_rate": 4.9204153901165805e-08, - "logits/chosen": 0.14409589767456055, - "logits/rejected": 0.7108417749404907, - "logps/chosen": -508.53668212890625, - "logps/rejected": -749.4099731445312, - "loss": 0.2219, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.825862407684326, - "rewards/margins": 2.6767187118530273, - "rewards/rejected": -5.502581596374512, + "logits/chosen": 1.9893665313720703, + "logits/rejected": 2.7781219482421875, + "logps/chosen": -530.7794189453125, + "logps/rejected": -824.0559692382812, + "loss": 0.2299, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.8573508262634277, + "rewards/margins": 3.2189173698425293, + "rewards/rejected": -6.076268196105957, "step": 1100 }, { "epoch": 0.82, - "eval_logits/chosen": -0.006662141531705856, - "eval_logits/rejected": 0.307841420173645, - "eval_logps/chosen": -591.4903564453125, - "eval_logps/rejected": -754.2210693359375, - "eval_loss": 0.5380497574806213, - "eval_rewards/accuracies": 0.76953125, - "eval_rewards/chosen": -3.0985679626464844, - "eval_rewards/margins": 1.566027045249939, - "eval_rewards/rejected": -4.664595127105713, - "eval_runtime": 98.1364, - "eval_samples_per_second": 20.38, - "eval_steps_per_second": 0.326, + "eval_logits/chosen": 0.8391125202178955, + "eval_logits/rejected": 1.4834216833114624, + "eval_logps/chosen": -588.2494506835938, + "eval_logps/rejected": -741.857421875, + "eval_loss": 0.5312901139259338, + "eval_rewards/accuracies": 0.7578125, + "eval_rewards/chosen": -2.7647294998168945, + "eval_rewards/margins": 1.4785932302474976, + "eval_rewards/rejected": -4.243322849273682, + "eval_runtime": 97.5423, + "eval_samples_per_second": 20.504, + "eval_steps_per_second": 0.328, "step": 1100 }, { "epoch": 0.82, - "grad_norm": 53.77081112472508, + "grad_norm": 68.60925195657734, "learning_rate": 4.540851148239036e-08, - "logits/chosen": 0.05285036563873291, - "logits/rejected": 0.5418864488601685, - "logps/chosen": -480.589599609375, - "logps/rejected": -721.0247192382812, - "loss": 0.2259, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.6466457843780518, - "rewards/margins": 2.60426664352417, - "rewards/rejected": -5.250911712646484, + "logits/chosen": 1.7061752080917358, + "logits/rejected": 2.698995351791382, + "logps/chosen": -537.1931762695312, + "logps/rejected": -848.33154296875, + "loss": 0.2129, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.7809014320373535, + "rewards/margins": 3.3348469734191895, + "rewards/rejected": -6.115748405456543, "step": 1110 }, { "epoch": 0.83, - "grad_norm": 61.18183448166774, + "grad_norm": 48.80096479357628, "learning_rate": 4.1750550270596206e-08, - "logits/chosen": 0.007505857851356268, - "logits/rejected": 0.7265870571136475, - "logps/chosen": -525.0111694335938, - "logps/rejected": -770.3839111328125, - "loss": 0.2062, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.770293712615967, - "rewards/margins": 2.8563060760498047, - "rewards/rejected": -5.6265997886657715, + "logits/chosen": 1.531884789466858, + "logits/rejected": 2.923696994781494, + "logps/chosen": -509.5885314941406, + "logps/rejected": -794.9307250976562, + "loss": 0.1954, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.630959987640381, + "rewards/margins": 3.3725571632385254, + "rewards/rejected": -6.003516674041748, "step": 1120 }, { "epoch": 0.84, - "grad_norm": 68.7383025915832, + "grad_norm": 68.79197398198284, "learning_rate": 3.823273191518234e-08, - "logits/chosen": 0.1847713440656662, - "logits/rejected": 0.7002714276313782, - "logps/chosen": -485.5477600097656, - "logps/rejected": -744.1683959960938, - "loss": 0.2208, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.557469606399536, - "rewards/margins": 2.7433865070343018, - "rewards/rejected": -5.300856113433838, + "logits/chosen": 1.5292671918869019, + "logits/rejected": 2.3230159282684326, + "logps/chosen": -568.5833740234375, + "logps/rejected": -835.826171875, + "loss": 0.2178, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.0106937885284424, + "rewards/margins": 3.2017643451690674, + "rewards/rejected": -6.212458610534668, "step": 1130 }, { "epoch": 0.85, - "grad_norm": 52.91705405093606, + "grad_norm": 59.434543375011025, "learning_rate": 3.485742375547745e-08, - "logits/chosen": 0.2934524416923523, - "logits/rejected": 0.7874400019645691, - "logps/chosen": -540.8467407226562, - "logps/rejected": -773.0992431640625, - "loss": 0.2157, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.8220906257629395, - "rewards/margins": 2.7147064208984375, - "rewards/rejected": -5.536796569824219, + "logits/chosen": 1.4421080350875854, + "logits/rejected": 2.442089796066284, + "logps/chosen": -553.727294921875, + "logps/rejected": -822.7138671875, + "loss": 0.2009, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.1090734004974365, + "rewards/margins": 2.9853668212890625, + "rewards/rejected": -6.094440460205078, "step": 1140 }, { "epoch": 0.85, - "grad_norm": 71.88387361950765, + "grad_norm": 38.888275757403804, "learning_rate": 3.162689722762365e-08, - "logits/chosen": -0.06768418103456497, - "logits/rejected": 0.4680994153022766, - "logps/chosen": -544.25439453125, - "logps/rejected": -792.7861328125, - "loss": 0.219, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.854830026626587, - "rewards/margins": 2.878235340118408, - "rewards/rejected": -5.733065128326416, + "logits/chosen": 1.5811113119125366, + "logits/rejected": 2.2564284801483154, + "logps/chosen": -543.1163940429688, + "logps/rejected": -842.681640625, + "loss": 0.2095, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.9668571949005127, + "rewards/margins": 3.10882830619812, + "rewards/rejected": -6.075685024261475, "step": 1150 }, { "epoch": 0.86, - "grad_norm": 55.10045985903261, + "grad_norm": 42.47551430381964, "learning_rate": 2.8543326335997904e-08, - "logits/chosen": 0.22700032591819763, - "logits/rejected": 0.5510324239730835, - "logps/chosen": -525.5604248046875, - "logps/rejected": -764.0299072265625, - "loss": 0.2111, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -2.728689193725586, - "rewards/margins": 2.7138609886169434, - "rewards/rejected": -5.442550182342529, + "logits/chosen": 1.768690824508667, + "logits/rejected": 2.4484939575195312, + "logps/chosen": -556.0635375976562, + "logps/rejected": -805.807373046875, + "loss": 0.2046, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.865739107131958, + "rewards/margins": 2.8989548683166504, + "rewards/rejected": -5.764693737030029, "step": 1160 }, { "epoch": 0.87, - "grad_norm": 69.28988155945117, + "grad_norm": 59.36158165544989, "learning_rate": 2.560878619020157e-08, - "logits/chosen": 0.41892099380493164, - "logits/rejected": 0.961225152015686, - "logps/chosen": -475.5420837402344, - "logps/rejected": -717.2962036132812, - "loss": 0.2244, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.7839531898498535, - "rewards/margins": 2.69148588180542, - "rewards/rejected": -5.475439071655273, + "logits/chosen": 1.9017894268035889, + "logits/rejected": 2.7026009559631348, + "logps/chosen": -521.269287109375, + "logps/rejected": -813.7127685546875, + "loss": 0.1964, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -2.9693474769592285, + "rewards/margins": 3.1322848796844482, + "rewards/rejected": -6.101632595062256, "step": 1170 }, { "epoch": 0.88, - "grad_norm": 64.73986275241352, + "grad_norm": 49.475189963130575, "learning_rate": 2.2825251608601466e-08, - "logits/chosen": 0.1623622179031372, - "logits/rejected": 0.6435664892196655, - "logps/chosen": -536.0408935546875, - "logps/rejected": -744.7503662109375, - "loss": 0.2261, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -2.80417799949646, - "rewards/margins": 2.5097708702087402, - "rewards/rejected": -5.313948631286621, + "logits/chosen": 1.8870357275009155, + "logits/rejected": 2.8944287300109863, + "logps/chosen": -558.059814453125, + "logps/rejected": -868.568359375, + "loss": 0.1891, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.1376397609710693, + "rewards/margins": 3.2884891033172607, + "rewards/rejected": -6.426129341125488, "step": 1180 }, { "epoch": 0.88, - "grad_norm": 46.21048147117716, + "grad_norm": 85.599165147591, "learning_rate": 2.0194595789362474e-08, - "logits/chosen": -0.05237337946891785, - "logits/rejected": 0.4507738947868347, - "logps/chosen": -519.0675048828125, - "logps/rejected": -777.24072265625, - "loss": 0.2245, + "logits/chosen": 1.9095745086669922, + "logits/rejected": 2.530900478363037, + "logps/chosen": -577.1746826171875, + "logps/rejected": -892.88623046875, + "loss": 0.2027, "rewards/accuracies": 0.9375, - "rewards/chosen": -2.453911542892456, - "rewards/margins": 3.05096435546875, - "rewards/rejected": -5.504876136779785, + "rewards/chosen": -3.0735995769500732, + "rewards/margins": 3.377427339553833, + "rewards/rejected": -6.451026916503906, "step": 1190 }, { "epoch": 0.89, - "grad_norm": 55.74231137792324, + "grad_norm": 45.52491787365754, "learning_rate": 1.7718589049866728e-08, - "logits/chosen": 0.32885435223579407, - "logits/rejected": 0.45243850350379944, - "logps/chosen": -502.6609802246094, - "logps/rejected": -755.8111572265625, - "loss": 0.2165, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.85229754447937, - "rewards/margins": 2.4965226650238037, - "rewards/rejected": -5.348820209503174, + "logits/chosen": 2.376490592956543, + "logits/rejected": 3.1364424228668213, + "logps/chosen": -510.269287109375, + "logps/rejected": -829.1940307617188, + "loss": 0.1974, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9278645515441895, + "rewards/margins": 3.433408737182617, + "rewards/rejected": -6.361273765563965, "step": 1200 }, { "epoch": 0.89, - "eval_logits/chosen": -0.09797076135873795, - "eval_logits/rejected": 0.20152458548545837, - "eval_logps/chosen": -580.1854858398438, - "eval_logps/rejected": -738.0179443359375, - "eval_loss": 0.5336408615112305, + "eval_logits/chosen": 0.8963963389396667, + "eval_logits/rejected": 1.5457934141159058, + "eval_logps/chosen": -606.617431640625, + "eval_logps/rejected": -764.6512451171875, + "eval_loss": 0.5366576910018921, "eval_rewards/accuracies": 0.76171875, - "eval_rewards/chosen": -2.9855194091796875, - "eval_rewards/margins": 1.5170438289642334, - "eval_rewards/rejected": -4.5025634765625, - "eval_runtime": 98.1382, - "eval_samples_per_second": 20.379, - "eval_steps_per_second": 0.326, + "eval_rewards/chosen": -2.948409080505371, + "eval_rewards/margins": 1.5228519439697266, + "eval_rewards/rejected": -4.471261024475098, + "eval_runtime": 97.4355, + "eval_samples_per_second": 20.526, + "eval_steps_per_second": 0.328, "step": 1200 }, { "epoch": 0.9, - "grad_norm": 59.87247578419851, + "grad_norm": 56.7147448955845, "learning_rate": 1.539889763536645e-08, - "logits/chosen": 0.1656707227230072, - "logits/rejected": 0.5841933488845825, - "logps/chosen": -501.6282653808594, - "logps/rejected": -756.2093505859375, - "loss": 0.2316, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.5563437938690186, - "rewards/margins": 2.784249782562256, - "rewards/rejected": -5.3405938148498535, + "logits/chosen": 1.9441492557525635, + "logits/rejected": 3.0478804111480713, + "logps/chosen": -538.355224609375, + "logps/rejected": -856.01416015625, + "loss": 0.2187, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.842240571975708, + "rewards/margins": 3.5280959606170654, + "rewards/rejected": -6.370336055755615, "step": 1210 }, { "epoch": 0.91, - "grad_norm": 60.03113730852447, + "grad_norm": 60.258963508413004, "learning_rate": 1.3237082597673172e-08, - "logits/chosen": 0.1520698368549347, - "logits/rejected": 0.7296702861785889, - "logps/chosen": -479.1258850097656, - "logps/rejected": -756.1190185546875, - "loss": 0.1913, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": -2.538914680480957, - "rewards/margins": 2.866367816925049, - "rewards/rejected": -5.405282974243164, + "logits/chosen": 2.1856608390808105, + "logits/rejected": 2.853616237640381, + "logps/chosen": -517.0845947265625, + "logps/rejected": -845.6990966796875, + "loss": 0.204, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.0185937881469727, + "rewards/margins": 3.2306289672851562, + "rewards/rejected": -6.249222755432129, "step": 1220 }, { "epoch": 0.91, - "grad_norm": 50.74907680854386, + "grad_norm": 71.41232139420377, "learning_rate": 1.1234598744637502e-08, - "logits/chosen": 0.06600452959537506, - "logits/rejected": 0.34231507778167725, - "logps/chosen": -536.6253051757812, - "logps/rejected": -802.2919311523438, - "loss": 0.2021, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -2.8277297019958496, - "rewards/margins": 2.8031983375549316, - "rewards/rejected": -5.630928039550781, + "logits/chosen": 1.5448696613311768, + "logits/rejected": 2.610525608062744, + "logps/chosen": -545.0371704101562, + "logps/rejected": -821.2421875, + "loss": 0.2063, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -3.1403965950012207, + "rewards/margins": 3.1843514442443848, + "rewards/rejected": -6.3247480392456055, "step": 1230 }, { "epoch": 0.92, - "grad_norm": 53.04165278516392, + "grad_norm": 57.959377016977456, "learning_rate": 9.392793661126414e-09, - "logits/chosen": -0.1451832503080368, - "logits/rejected": 0.4121910631656647, - "logps/chosen": -569.254150390625, - "logps/rejected": -809.6979370117188, - "loss": 0.196, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.7587039470672607, - "rewards/margins": 2.8314685821533203, - "rewards/rejected": -5.590172290802002, + "logits/chosen": 1.898782730102539, + "logits/rejected": 2.7061781883239746, + "logps/chosen": -582.9857177734375, + "logps/rejected": -879.3019409179688, + "loss": 0.1979, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.2453556060791016, + "rewards/margins": 3.297309160232544, + "rewards/rejected": -6.542665004730225, "step": 1240 }, { "epoch": 0.93, - "grad_norm": 64.9259915789077, + "grad_norm": 50.86760187147993, "learning_rate": 7.71290680215711e-09, - "logits/chosen": 0.08149626106023788, - "logits/rejected": 0.6067185401916504, - "logps/chosen": -518.8594970703125, - "logps/rejected": -728.6957397460938, - "loss": 0.227, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.8665976524353027, - "rewards/margins": 2.4750990867614746, - "rewards/rejected": -5.3416972160339355, + "logits/chosen": 2.0340778827667236, + "logits/rejected": 2.8080642223358154, + "logps/chosen": -558.147705078125, + "logps/rejected": -874.9266357421875, + "loss": 0.1974, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.0640769004821777, + "rewards/margins": 3.380338668823242, + "rewards/rejected": -6.444415092468262, "step": 1250 }, { "epoch": 0.94, - "grad_norm": 56.48468871383734, + "grad_norm": 61.973766270626015, "learning_rate": 6.196068658797543e-09, - "logits/chosen": 0.31367558240890503, - "logits/rejected": 0.856947124004364, - "logps/chosen": -480.42205810546875, - "logps/rejected": -714.3417358398438, - "loss": 0.2094, + "logits/chosen": 1.8814232349395752, + "logits/rejected": 2.7813236713409424, + "logps/chosen": -551.5777587890625, + "logps/rejected": -826.7698974609375, + "loss": 0.1971, "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": -2.750648021697998, - "rewards/margins": 2.5990233421325684, - "rewards/rejected": -5.349670886993408, + "rewards/chosen": -2.9602150917053223, + "rewards/margins": 3.0024728775024414, + "rewards/rejected": -5.9626874923706055, "step": 1260 }, { "epoch": 0.94, - "grad_norm": 55.81763854062536, + "grad_norm": 67.6695850405579, "learning_rate": 4.843299997394717e-09, - "logits/chosen": 0.29153138399124146, - "logits/rejected": 0.6525137424468994, - "logps/chosen": -505.7523498535156, - "logps/rejected": -773.7969970703125, - "loss": 0.199, - "rewards/accuracies": 0.956250011920929, - "rewards/chosen": -2.656862735748291, - "rewards/margins": 2.929708957672119, - "rewards/rejected": -5.58657169342041, + "logits/chosen": 1.856507658958435, + "logits/rejected": 2.7601516246795654, + "logps/chosen": -540.268310546875, + "logps/rejected": -846.9691162109375, + "loss": 0.2067, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.077454090118408, + "rewards/margins": 3.414836883544922, + "rewards/rejected": -6.492290496826172, "step": 1270 }, { "epoch": 0.95, - "grad_norm": 45.256008770336216, + "grad_norm": 68.73319089653008, "learning_rate": 3.655511172643372e-09, - "logits/chosen": -0.10818688571453094, - "logits/rejected": 0.48161396384239197, - "logps/chosen": -536.8485107421875, - "logps/rejected": -788.7921142578125, - "loss": 0.1927, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.5987658500671387, - "rewards/margins": 2.9826765060424805, - "rewards/rejected": -5.581442832946777, + "logits/chosen": 1.932074785232544, + "logits/rejected": 2.437225818634033, + "logps/chosen": -531.4140625, + "logps/rejected": -836.9505615234375, + "loss": 0.1876, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.8276994228363037, + "rewards/margins": 3.25665020942688, + "rewards/rejected": -6.084350109100342, "step": 1280 }, { "epoch": 0.96, - "grad_norm": 75.93254873129665, + "grad_norm": 50.423800165908794, "learning_rate": 2.633501514956532e-09, - "logits/chosen": -0.07302987575531006, - "logits/rejected": 0.7443720102310181, - "logps/chosen": -507.47894287109375, - "logps/rejected": -765.6873168945312, - "loss": 0.2005, - "rewards/accuracies": 0.981249988079071, - "rewards/chosen": -2.673506259918213, - "rewards/margins": 2.9605891704559326, - "rewards/rejected": -5.634096145629883, + "logits/chosen": 1.9169034957885742, + "logits/rejected": 2.7369441986083984, + "logps/chosen": -586.8289794921875, + "logps/rejected": -896.8014526367188, + "loss": 0.2044, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -3.1295228004455566, + "rewards/margins": 3.5232949256896973, + "rewards/rejected": -6.652817726135254, "step": 1290 }, { "epoch": 0.97, - "grad_norm": 55.65736054179639, + "grad_norm": 57.31903342529662, "learning_rate": 1.777958792550993e-09, - "logits/chosen": 0.05213532596826553, - "logits/rejected": 0.4943299889564514, - "logps/chosen": -537.5960693359375, - "logps/rejected": -807.1434326171875, - "loss": 0.1728, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.9276278018951416, - "rewards/margins": 2.9666807651519775, - "rewards/rejected": -5.894307613372803, + "logits/chosen": 1.5464543104171753, + "logits/rejected": 2.9688878059387207, + "logps/chosen": -587.2015380859375, + "logps/rejected": -853.0357666015625, + "loss": 0.1842, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.988502025604248, + "rewards/margins": 3.156489372253418, + "rewards/rejected": -6.144991397857666, "step": 1300 }, { "epoch": 0.97, - "eval_logits/chosen": -0.06342385709285736, - "eval_logits/rejected": 0.23894435167312622, - "eval_logps/chosen": -598.8973999023438, - "eval_logps/rejected": -761.6608276367188, - "eval_loss": 0.5417826175689697, - "eval_rewards/accuracies": 0.75390625, - "eval_rewards/chosen": -3.172638177871704, - "eval_rewards/margins": 1.5663537979125977, - "eval_rewards/rejected": -4.738992214202881, - "eval_runtime": 98.1336, - "eval_samples_per_second": 20.38, - "eval_steps_per_second": 0.326, + "eval_logits/chosen": 0.9558575749397278, + "eval_logits/rejected": 1.609464406967163, + "eval_logps/chosen": -609.159423828125, + "eval_logps/rejected": -767.4317016601562, + "eval_loss": 0.5365558862686157, + "eval_rewards/accuracies": 0.76171875, + "eval_rewards/chosen": -2.9738292694091797, + "eval_rewards/margins": 1.5252362489700317, + "eval_rewards/rejected": -4.499065399169922, + "eval_runtime": 97.3239, + "eval_samples_per_second": 20.55, + "eval_steps_per_second": 0.329, "step": 1300 }, { "epoch": 0.97, - "grad_norm": 50.04724298737362, + "grad_norm": 66.21886288694567, "learning_rate": 1.0894587486089125e-09, - "logits/chosen": 0.30416375398635864, - "logits/rejected": 0.5567177534103394, - "logps/chosen": -561.5665893554688, - "logps/rejected": -784.9238891601562, - "loss": 0.2226, + "logits/chosen": 1.8931999206542969, + "logits/rejected": 2.824298858642578, + "logps/chosen": -563.06201171875, + "logps/rejected": -834.8709716796875, + "loss": 0.2157, "rewards/accuracies": 0.887499988079071, - "rewards/chosen": -3.119717836380005, - "rewards/margins": 2.4494900703430176, - "rewards/rejected": -5.569207668304443, + "rewards/chosen": -3.2370285987854004, + "rewards/margins": 3.035515546798706, + "rewards/rejected": -6.272543430328369, "step": 1310 }, { "epoch": 0.98, - "grad_norm": 58.64068052496794, + "grad_norm": 45.779926433395936, "learning_rate": 5.684647138277098e-10, - "logits/chosen": 0.06530937552452087, - "logits/rejected": 0.6911398768424988, - "logps/chosen": -524.456298828125, - "logps/rejected": -789.5941162109375, - "loss": 0.2211, - "rewards/accuracies": 0.90625, - "rewards/chosen": -2.7515296936035156, - "rewards/margins": 3.075373411178589, - "rewards/rejected": -5.826903343200684, + "logits/chosen": 1.7055333852767944, + "logits/rejected": 2.308079719543457, + "logps/chosen": -531.0139770507812, + "logps/rejected": -862.2609252929688, + "loss": 0.1974, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.956573486328125, + "rewards/margins": 3.375626802444458, + "rewards/rejected": -6.332200050354004, "step": 1320 }, { "epoch": 0.99, - "grad_norm": 51.7145291726634, + "grad_norm": 58.05458328657747, "learning_rate": 2.153272946184559e-10, - "logits/chosen": 0.033466748893260956, - "logits/rejected": 0.613431453704834, - "logps/chosen": -560.579833984375, - "logps/rejected": -819.1729736328125, - "loss": 0.2092, + "logits/chosen": 1.735358476638794, + "logits/rejected": 2.259385585784912, + "logps/chosen": -585.9295043945312, + "logps/rejected": -861.4645385742188, + "loss": 0.1738, "rewards/accuracies": 0.9312499761581421, - "rewards/chosen": -2.932191848754883, - "rewards/margins": 2.935133695602417, - "rewards/rejected": -5.867325782775879, + "rewards/chosen": -3.10073184967041, + "rewards/margins": 2.996291399002075, + "rewards/rejected": -6.097023010253906, "step": 1330 }, { "epoch": 1.0, - "grad_norm": 45.056558756064035, + "grad_norm": 46.42702960995785, "learning_rate": 3.0284137163189004e-11, - "logits/chosen": -0.001438182545825839, - "logits/rejected": 0.446831613779068, - "logps/chosen": -533.5986938476562, - "logps/rejected": -776.7227783203125, - "loss": 0.1891, - "rewards/accuracies": 0.9437500238418579, - "rewards/chosen": -2.894265651702881, - "rewards/margins": 2.6473288536071777, - "rewards/rejected": -5.541594505310059, + "logits/chosen": 2.000138759613037, + "logits/rejected": 2.7859671115875244, + "logps/chosen": -530.1033935546875, + "logps/rejected": -878.3465576171875, + "loss": 0.1884, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -3.1844658851623535, + "rewards/margins": 3.3884029388427734, + "rewards/rejected": -6.572869300842285, "step": 1340 }, { "epoch": 1.0, "step": 1346, "total_flos": 0.0, - "train_loss": 0.3438705863959722, - "train_runtime": 21850.8794, - "train_samples_per_second": 7.884, + "train_loss": 0.335402155391883, + "train_runtime": 21644.3608, + "train_samples_per_second": 7.959, "train_steps_per_second": 0.062 } ],