diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -9,17 +9,13 @@ "is_world_process_zero": true, "log_history": [ { - "debug/losses": 0.23031963407993317, - "debug/policy_weights": 0.3322809934616089, - "debug/raw_losses": 0.6931471824645996, - "epoch": 0.0007958615200955034, - "grad_norm": 1.6286401468051575, + "epoch": 0.0, "learning_rate": 3.968253968253968e-09, - "logits/chosen": -2.735659122467041, - "logits/rejected": -2.7581238746643066, - "logps/chosen": -124.62968444824219, - "logps/rejected": -168.09475708007812, - "loss": 0.2239, + "logits/chosen": -2.7193620204925537, + "logits/rejected": -2.698728084564209, + "logps/chosen": -182.0961456298828, + "logps/rejected": -172.47128295898438, + "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -27,2512 +23,1962 @@ "step": 1 }, { - "debug/losses": 0.23611842095851898, - "debug/policy_weights": 0.34072285890579224, - "debug/raw_losses": 0.6929619312286377, - "epoch": 0.007958615200955034, - "grad_norm": 1.6881469544577703, + "epoch": 0.01, "learning_rate": 3.968253968253968e-08, - "logits/chosen": -2.738783836364746, - "logits/rejected": -2.7277822494506836, - "logps/chosen": -146.68910217285156, - "logps/rejected": -131.2349395751953, - "loss": 0.2295, - "rewards/accuracies": 0.4375, - "rewards/chosen": 0.00013088027480989695, - "rewards/margins": 0.00037292600609362125, - "rewards/rejected": -0.00024204571673180908, + "logits/chosen": -2.7041964530944824, + "logits/rejected": -2.6794540882110596, + "logps/chosen": -162.45831298828125, + "logps/rejected": -140.5693359375, + "loss": 0.6931, + "rewards/accuracies": 0.5486111044883728, + "rewards/chosen": 0.00032037965138442814, + "rewards/margins": 0.0004935775068588555, + "rewards/rejected": -0.00017319784092251211, "step": 10 }, { - "debug/losses": 0.2265315055847168, - "debug/policy_weights": 0.3267918825149536, - "debug/raw_losses": 0.693217396736145, - "epoch": 0.01591723040191007, - "grad_norm": 1.5615628005517825, + "epoch": 0.02, "learning_rate": 7.936507936507936e-08, - "logits/chosen": -2.706993341445923, - "logits/rejected": -2.703998327255249, - "logps/chosen": -129.48587036132812, - "logps/rejected": -130.25735473632812, - "loss": 0.2238, - "rewards/accuracies": 0.44999998807907104, - "rewards/chosen": -0.00017557166574988514, - "rewards/margins": -0.000137387789436616, - "rewards/rejected": -3.818388358922675e-05, + "logits/chosen": -2.7177577018737793, + "logits/rejected": -2.7136425971984863, + "logps/chosen": -134.47242736816406, + "logps/rejected": -143.55604553222656, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 8.780837379163131e-05, + "rewards/margins": 0.00010721785656642169, + "rewards/rejected": -1.940951551659964e-05, "step": 20 }, { - "debug/losses": 0.2132270336151123, - "debug/policy_weights": 0.30768805742263794, - "debug/raw_losses": 0.6929622888565063, - "epoch": 0.0238758456028651, - "grad_norm": 1.5392325714344397, + "epoch": 0.02, "learning_rate": 1.1904761904761903e-07, - "logits/chosen": -2.6836955547332764, - "logits/rejected": -2.680663824081421, - "logps/chosen": -141.81492614746094, - "logps/rejected": -155.6810760498047, - "loss": 0.2264, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.00016820887685753405, - "rewards/margins": 0.0003723005356732756, - "rewards/rejected": -0.00020409165881574154, + "logits/chosen": -2.6898293495178223, + "logits/rejected": -2.676154613494873, + "logps/chosen": -140.94692993164062, + "logps/rejected": -136.50369262695312, + "loss": 0.6931, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0005466601578518748, + "rewards/margins": -0.00021456097601912916, + "rewards/rejected": 0.0007612211629748344, "step": 30 }, { - "debug/losses": 0.2176501303911209, - "debug/policy_weights": 0.3141174018383026, - "debug/raw_losses": 0.6928995847702026, - "epoch": 0.03183446080382014, - "grad_norm": 1.638997322742743, + "epoch": 0.03, "learning_rate": 1.5873015873015872e-07, - "logits/chosen": -2.6918673515319824, - "logits/rejected": -2.6841723918914795, - "logps/chosen": -154.97286987304688, - "logps/rejected": -164.1558837890625, - "loss": 0.221, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.001508228713646531, - "rewards/margins": 0.0005008662701584399, - "rewards/rejected": -0.0020090951584279537, + "logits/chosen": -2.6958394050598145, + "logits/rejected": -2.686532974243164, + "logps/chosen": -134.98963928222656, + "logps/rejected": -144.46652221679688, + "loss": 0.6928, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.0015748919686302543, + "rewards/margins": 0.0009769219905138016, + "rewards/rejected": 0.0005979698617011309, "step": 40 }, { - "debug/losses": 0.2295212298631668, - "debug/policy_weights": 0.33170244097709656, - "debug/raw_losses": 0.6918987035751343, - "epoch": 0.03979307600477517, - "grad_norm": 1.4818468652902261, + "epoch": 0.04, "learning_rate": 1.984126984126984e-07, - "logits/chosen": -2.7066802978515625, - "logits/rejected": -2.6879210472106934, - "logps/chosen": -144.00912475585938, - "logps/rejected": -137.74859619140625, - "loss": 0.2233, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.002855247352272272, - "rewards/margins": 0.0025138729251921177, - "rewards/rejected": -0.005369120743125677, + "logits/chosen": -2.7042899131774902, + "logits/rejected": -2.6861345767974854, + "logps/chosen": -149.71768188476562, + "logps/rejected": -145.0757293701172, + "loss": 0.6921, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.005199921317398548, + "rewards/margins": 0.0022330707870423794, + "rewards/rejected": 0.0029668500646948814, "step": 50 }, { - "debug/losses": 0.22746217250823975, - "debug/policy_weights": 0.3287571668624878, - "debug/raw_losses": 0.6916018724441528, - "epoch": 0.0477516912057302, - "grad_norm": 1.4868186691240965, + "epoch": 0.05, "learning_rate": 2.3809523809523806e-07, - "logits/chosen": -2.715649127960205, - "logits/rejected": -2.716421604156494, - "logps/chosen": -145.9324951171875, - "logps/rejected": -159.49798583984375, - "loss": 0.2191, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.003058128524571657, - "rewards/margins": 0.0031352252699434757, - "rewards/rejected": -0.006193353794515133, + "logits/chosen": -2.705153703689575, + "logits/rejected": -2.685439348220825, + "logps/chosen": -154.3783416748047, + "logps/rejected": -151.54519653320312, + "loss": 0.6912, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.00569504126906395, + "rewards/margins": 0.0022000311873853207, + "rewards/rejected": 0.003495010081678629, "step": 60 }, { - "debug/losses": 0.2176016867160797, - "debug/policy_weights": 0.3151531517505646, - "debug/raw_losses": 0.6905783414840698, - "epoch": 0.055710306406685235, - "grad_norm": 1.5355880647836464, + "epoch": 0.06, "learning_rate": 2.7777777777777776e-07, - "logits/chosen": -2.736604690551758, - "logits/rejected": -2.727555990219116, - "logps/chosen": -149.36459350585938, - "logps/rejected": -143.43045043945312, - "loss": 0.2113, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.017263194546103477, - "rewards/margins": 0.005380765534937382, - "rewards/rejected": -0.022643957287073135, + "logits/chosen": -2.7017154693603516, + "logits/rejected": -2.6924962997436523, + "logps/chosen": -146.3284149169922, + "logps/rejected": -138.79405212402344, + "loss": 0.6885, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.010588793084025383, + "rewards/margins": 0.010192448273301125, + "rewards/rejected": 0.00039634370477870107, "step": 70 }, { - "debug/losses": 0.1883382946252823, - "debug/policy_weights": 0.2724798023700714, - "debug/raw_losses": 0.6915446519851685, - "epoch": 0.06366892160764027, - "grad_norm": 1.5327858491562372, + "epoch": 0.06, "learning_rate": 3.1746031746031743e-07, - "logits/chosen": -2.7107303142547607, - "logits/rejected": -2.6923205852508545, - "logps/chosen": -157.98587036132812, - "logps/rejected": -149.39295959472656, - "loss": 0.2011, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -0.04851604253053665, - "rewards/margins": 0.003855167655274272, - "rewards/rejected": -0.05237121507525444, + "logits/chosen": -2.7155232429504395, + "logits/rejected": -2.696071147918701, + "logps/chosen": -141.80067443847656, + "logps/rejected": -147.0068817138672, + "loss": 0.6867, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.0049073463305830956, + "rewards/margins": 0.013599385507404804, + "rewards/rejected": -0.008692039176821709, "step": 80 }, { - "debug/losses": 0.18660078942775726, - "debug/policy_weights": 0.2731621563434601, - "debug/raw_losses": 0.6829166412353516, - "epoch": 0.07162753680859531, - "grad_norm": 1.6166482985332293, + "epoch": 0.07, "learning_rate": 3.5714285714285716e-07, - "logits/chosen": -2.7208943367004395, - "logits/rejected": -2.727738618850708, - "logps/chosen": -152.66610717773438, - "logps/rejected": -173.5172882080078, - "loss": 0.185, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.06819866597652435, - "rewards/margins": 0.022195402532815933, - "rewards/rejected": -0.09039406478404999, + "logits/chosen": -2.7175304889678955, + "logits/rejected": -2.7080624103546143, + "logps/chosen": -153.12509155273438, + "logps/rejected": -146.53590393066406, + "loss": 0.6847, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.028871387243270874, + "rewards/margins": 0.017175236716866493, + "rewards/rejected": -0.046046625822782516, "step": 90 }, { - "debug/losses": 0.16780678927898407, - "debug/policy_weights": 0.24758808314800262, - "debug/raw_losses": 0.6784400939941406, - "epoch": 0.07958615200955034, - "grad_norm": 1.5200040949416276, + "epoch": 0.08, "learning_rate": 3.968253968253968e-07, - "logits/chosen": -2.6890692710876465, - "logits/rejected": -2.6723039150238037, - "logps/chosen": -149.3301239013672, - "logps/rejected": -143.3484344482422, - "loss": 0.1733, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.11518146842718124, - "rewards/margins": 0.03295627981424332, - "rewards/rejected": -0.14813776314258575, + "logits/chosen": -2.7524733543395996, + "logits/rejected": -2.7452526092529297, + "logps/chosen": -163.88070678710938, + "logps/rejected": -163.61032104492188, + "loss": 0.6789, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0662173330783844, + "rewards/margins": 0.02977912127971649, + "rewards/rejected": -0.09599645435810089, "step": 100 }, { - "epoch": 0.07958615200955034, - "eval_debug/losses": 0.1618107706308365, - "eval_debug/policy_weights": 0.23841436207294464, - "eval_debug/raw_losses": 0.6786514520645142, - "eval_logits/chosen": -2.711678981781006, - "eval_logits/rejected": -2.703619956970215, - "eval_logps/chosen": -158.4381103515625, - "eval_logps/rejected": -168.3044891357422, - "eval_loss": 0.1633094847202301, - "eval_rewards/accuracies": 0.5979477763175964, - "eval_rewards/chosen": -0.1419461965560913, - "eval_rewards/margins": 0.03410893306136131, - "eval_rewards/rejected": -0.17605511844158173, - "eval_runtime": 152.723, - "eval_samples_per_second": 55.997, - "eval_steps_per_second": 0.877, + "epoch": 0.08, + "eval_logits/chosen": -2.7336502075195312, + "eval_logits/rejected": -2.7255024909973145, + "eval_logps/chosen": -155.19271850585938, + "eval_logps/rejected": -165.35523986816406, + "eval_loss": 0.6769910454750061, + "eval_rewards/accuracies": 0.5914179086685181, + "eval_rewards/chosen": -0.10619194805622101, + "eval_rewards/margins": 0.03601696714758873, + "eval_rewards/rejected": -0.14220890402793884, + "eval_runtime": 184.2857, + "eval_samples_per_second": 46.406, + "eval_steps_per_second": 0.727, "step": 100 }, { - "debug/losses": 0.15096870064735413, - "debug/policy_weights": 0.2191181182861328, - "debug/raw_losses": 0.6909996867179871, - "epoch": 0.08754476721050537, - "grad_norm": 1.5046677998036695, + "epoch": 0.09, "learning_rate": 4.365079365079365e-07, - "logits/chosen": -2.6899752616882324, - "logits/rejected": -2.671255350112915, - "logps/chosen": -179.54322814941406, - "logps/rejected": -165.19168090820312, - "loss": 0.1473, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -0.21252474188804626, - "rewards/margins": 0.011284739710390568, - "rewards/rejected": -0.22380945086479187, + "logits/chosen": -2.738532543182373, + "logits/rejected": -2.7273170948028564, + "logps/chosen": -164.2928009033203, + "logps/rejected": -160.19398498535156, + "loss": 0.6738, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.16211798787117004, + "rewards/margins": 0.03163355216383934, + "rewards/rejected": -0.19375154376029968, "step": 110 }, { - "debug/losses": 0.1395491510629654, - "debug/policy_weights": 0.20841534435749054, - "debug/raw_losses": 0.6783668398857117, - "epoch": 0.0955033824114604, - "grad_norm": 1.388858214651452, + "epoch": 0.1, "learning_rate": 4.761904761904761e-07, - "logits/chosen": -2.663153886795044, - "logits/rejected": -2.6503779888153076, - "logps/chosen": -167.6672821044922, - "logps/rejected": -172.83718872070312, - "loss": 0.1346, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.2853550910949707, - "rewards/margins": 0.04338715597987175, - "rewards/rejected": -0.32874220609664917, + "logits/chosen": -2.7289297580718994, + "logits/rejected": -2.705962657928467, + "logps/chosen": -196.69662475585938, + "logps/rejected": -197.2833251953125, + "loss": 0.661, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.2917623221874237, + "rewards/margins": 0.08966299891471863, + "rewards/rejected": -0.38142532110214233, "step": 120 }, { - "debug/losses": 0.11419715732336044, - "debug/policy_weights": 0.17497313022613525, - "debug/raw_losses": 0.6557947397232056, - "epoch": 0.10346199761241544, - "grad_norm": 2.1752818607208244, + "epoch": 0.1, "learning_rate": 4.999845414634076e-07, - "logits/chosen": -2.6734774112701416, - "logits/rejected": -2.645984172821045, - "logps/chosen": -189.3183135986328, - "logps/rejected": -179.66036987304688, - "loss": 0.117, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.34408754110336304, - "rewards/margins": 0.09557478129863739, - "rewards/rejected": -0.439662367105484, + "logits/chosen": -2.658005475997925, + "logits/rejected": -2.6317684650421143, + "logps/chosen": -187.4532928466797, + "logps/rejected": -188.37689208984375, + "loss": 0.6542, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3956056833267212, + "rewards/margins": 0.12092368304729462, + "rewards/rejected": -0.5165294408798218, "step": 130 }, { - "debug/losses": 0.0952613353729248, - "debug/policy_weights": 0.14768476784229279, - "debug/raw_losses": 0.6522240042686462, - "epoch": 0.11142061281337047, - "grad_norm": 2.442265971491927, + "epoch": 0.11, "learning_rate": 4.998106548810311e-07, - "logits/chosen": -2.6368653774261475, - "logits/rejected": -2.6138510704040527, - "logps/chosen": -192.3903045654297, - "logps/rejected": -183.68685913085938, - "loss": 0.1083, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.46787652373313904, - "rewards/margins": 0.11621709167957306, - "rewards/rejected": -0.5840936303138733, + "logits/chosen": -2.6906683444976807, + "logits/rejected": -2.6913747787475586, + "logps/chosen": -199.67568969726562, + "logps/rejected": -253.02487182617188, + "loss": 0.6171, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.4227059781551361, + "rewards/margins": 0.27536457777023315, + "rewards/rejected": -0.6980706453323364, "step": 140 }, { - "debug/losses": 0.07598518580198288, - "debug/policy_weights": 0.12855970859527588, - "debug/raw_losses": 0.6079715490341187, - "epoch": 0.1193792280143255, - "grad_norm": 5.777724458868344, + "epoch": 0.12, "learning_rate": 4.994436933879359e-07, - "logits/chosen": -2.6171412467956543, - "logits/rejected": -2.610034942626953, - "logps/chosen": -196.5837860107422, - "logps/rejected": -225.31173706054688, - "loss": 0.0881, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.5704454183578491, - "rewards/margins": 0.22279813885688782, - "rewards/rejected": -0.7932435870170593, + "logits/chosen": -2.6662166118621826, + "logits/rejected": -2.644784927368164, + "logps/chosen": -197.07180786132812, + "logps/rejected": -198.4012908935547, + "loss": 0.6395, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.3597154915332794, + "rewards/margins": 0.13716872036457062, + "rewards/rejected": -0.49688419699668884, "step": 150 }, { - "debug/losses": 0.10459884256124496, - "debug/policy_weights": 0.1645442694425583, - "debug/raw_losses": 0.6207043528556824, - "epoch": 0.12733784321528055, - "grad_norm": 1.6699884563484173, + "epoch": 0.13, "learning_rate": 4.988839406031596e-07, - "logits/chosen": -2.6161184310913086, - "logits/rejected": -2.6225523948669434, - "logps/chosen": -160.10772705078125, - "logps/rejected": -207.2742156982422, - "loss": 0.1069, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.3946920931339264, - "rewards/margins": 0.20463863015174866, - "rewards/rejected": -0.599330723285675, + "logits/chosen": -2.647681474685669, + "logits/rejected": -2.6395888328552246, + "logps/chosen": -182.04420471191406, + "logps/rejected": -206.59780883789062, + "loss": 0.629, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3573477864265442, + "rewards/margins": 0.2222837507724762, + "rewards/rejected": -0.579631507396698, "step": 160 }, { - "debug/losses": 0.099858358502388, - "debug/policy_weights": 0.1572486311197281, - "debug/raw_losses": 0.6404817700386047, - "epoch": 0.13529645841623558, - "grad_norm": 1.696036642690539, + "epoch": 0.14, "learning_rate": 4.981318291512395e-07, - "logits/chosen": -2.5810608863830566, - "logits/rejected": -2.578481912612915, - "logps/chosen": -174.32598876953125, - "logps/rejected": -198.0808868408203, - "loss": 0.1132, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.379153311252594, - "rewards/margins": 0.15221598744392395, - "rewards/rejected": -0.5313693284988403, + "logits/chosen": -2.619232654571533, + "logits/rejected": -2.598362684249878, + "logps/chosen": -227.0933380126953, + "logps/rejected": -230.9747772216797, + "loss": 0.6242, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7434185743331909, + "rewards/margins": 0.21749505400657654, + "rewards/rejected": -0.9609137773513794, "step": 170 }, { - "debug/losses": 0.09473618865013123, - "debug/policy_weights": 0.13851876556873322, - "debug/raw_losses": 0.672447681427002, - "epoch": 0.14325507361719061, - "grad_norm": 1.9630112175338894, + "epoch": 0.14, "learning_rate": 4.971879403278432e-07, - "logits/chosen": -2.5959112644195557, - "logits/rejected": -2.5829200744628906, - "logps/chosen": -202.05296325683594, - "logps/rejected": -207.42648315429688, - "loss": 0.0946, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.5352639555931091, - "rewards/margins": 0.0995597094297409, - "rewards/rejected": -0.6348236203193665, + "logits/chosen": -2.5654754638671875, + "logits/rejected": -2.5364232063293457, + "logps/chosen": -241.6617431640625, + "logps/rejected": -245.66268920898438, + "loss": 0.6151, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7342535257339478, + "rewards/margins": 0.23685339093208313, + "rewards/rejected": -0.9711068868637085, "step": 180 }, { - "debug/losses": 0.08237513154745102, - "debug/policy_weights": 0.12221841514110565, - "debug/raw_losses": 0.6608506441116333, - "epoch": 0.15121368881814565, - "grad_norm": 1.9225034938066397, + "epoch": 0.15, "learning_rate": 4.960530036504941e-07, - "logits/chosen": -2.579829692840576, - "logits/rejected": -2.5656394958496094, - "logps/chosen": -222.9535369873047, - "logps/rejected": -232.0922393798828, - "loss": 0.0793, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.7860406637191772, - "rewards/margins": 0.1566535085439682, - "rewards/rejected": -0.9426941871643066, + "logits/chosen": -2.5271048545837402, + "logits/rejected": -2.486818790435791, + "logps/chosen": -235.6089630126953, + "logps/rejected": -251.17758178710938, + "loss": 0.6215, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.807177722454071, + "rewards/margins": 0.28561535477638245, + "rewards/rejected": -1.0927931070327759, "step": 190 }, { - "debug/losses": 0.08729609102010727, - "debug/policy_weights": 0.13735465705394745, - "debug/raw_losses": 0.6344841718673706, - "epoch": 0.15917230401910068, - "grad_norm": 1.4158943423202177, + "epoch": 0.16, "learning_rate": 4.947278962947386e-07, - "logits/chosen": -2.5332934856414795, - "logits/rejected": -2.5345585346221924, - "logps/chosen": -217.6802215576172, - "logps/rejected": -252.038330078125, - "loss": 0.0826, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.6853083372116089, - "rewards/margins": 0.20577910542488098, - "rewards/rejected": -0.8910874128341675, + "logits/chosen": -2.4217896461486816, + "logits/rejected": -2.413295269012451, + "logps/chosen": -251.0736083984375, + "logps/rejected": -268.6098937988281, + "loss": 0.6062, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.086307406425476, + "rewards/margins": 0.24874301254749298, + "rewards/rejected": -1.3350504636764526, "step": 200 }, { - "epoch": 0.15917230401910068, - "eval_debug/losses": 0.07361924648284912, - "eval_debug/policy_weights": 0.11718404293060303, - "eval_debug/raw_losses": 0.6317233443260193, - "eval_logits/chosen": -2.550771474838257, - "eval_logits/rejected": -2.540393590927124, - "eval_logps/chosen": -223.08139038085938, - "eval_logps/rejected": -250.74151611328125, - "eval_loss": 0.07500571012496948, - "eval_rewards/accuracies": 0.6361940503120422, - "eval_rewards/chosen": -0.7883791327476501, - "eval_rewards/margins": 0.21204641461372375, - "eval_rewards/rejected": -1.0004255771636963, - "eval_runtime": 153.1417, - "eval_samples_per_second": 55.844, - "eval_steps_per_second": 0.875, + "epoch": 0.16, + "eval_logits/chosen": -2.3855514526367188, + "eval_logits/rejected": -2.369593858718872, + "eval_logps/chosen": -246.6970672607422, + "eval_logps/rejected": -289.8621826171875, + "eval_loss": 0.6079375743865967, + "eval_rewards/accuracies": 0.66697758436203, + "eval_rewards/chosen": -1.021235704421997, + "eval_rewards/margins": 0.3660426437854767, + "eval_rewards/rejected": -1.3872781991958618, + "eval_runtime": 184.0759, + "eval_samples_per_second": 46.459, + "eval_steps_per_second": 0.728, "step": 200 }, { - "debug/losses": 0.06145762279629707, - "debug/policy_weights": 0.10109380632638931, - "debug/raw_losses": 0.6302995681762695, - "epoch": 0.1671309192200557, - "grad_norm": 1.435571888650618, + "epoch": 0.17, "learning_rate": 4.932136424161899e-07, - "logits/chosen": -2.521623134613037, - "logits/rejected": -2.50742769241333, - "logps/chosen": -217.82498168945312, - "logps/rejected": -248.6222381591797, - "loss": 0.0666, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.922324538230896, - "rewards/margins": 0.24683670699596405, - "rewards/rejected": -1.169161319732666, + "logits/chosen": -2.3366785049438477, + "logits/rejected": -2.3228511810302734, + "logps/chosen": -266.292236328125, + "logps/rejected": -300.22894287109375, + "loss": 0.5893, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2130026817321777, + "rewards/margins": 0.3487839996814728, + "rewards/rejected": -1.5617868900299072, "step": 210 }, { - "debug/losses": 0.04205631464719772, - "debug/policy_weights": 0.07819141447544098, - "debug/raw_losses": 0.5699876546859741, - "epoch": 0.17508953442101075, - "grad_norm": 1.336610780534668, + "epoch": 0.18, "learning_rate": 4.915114123589732e-07, - "logits/chosen": -2.550821304321289, - "logits/rejected": -2.526845932006836, - "logps/chosen": -245.73306274414062, - "logps/rejected": -283.7450256347656, - "loss": 0.0522, + "logits/chosen": -2.321228504180908, + "logits/rejected": -2.3033699989318848, + "logps/chosen": -336.34161376953125, + "logps/rejected": -373.39935302734375, + "loss": 0.612, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.0883342027664185, - "rewards/margins": 0.3927660584449768, - "rewards/rejected": -1.48110032081604, + "rewards/chosen": -1.9529145956039429, + "rewards/margins": 0.2863468527793884, + "rewards/rejected": -2.2392613887786865, "step": 220 }, { - "debug/losses": 0.04529764503240585, - "debug/policy_weights": 0.07175824046134949, - "debug/raw_losses": 0.6204754114151001, - "epoch": 0.18304814962196578, - "grad_norm": 1.4784857016304622, + "epoch": 0.18, "learning_rate": 4.896225217511849e-07, - "logits/chosen": -2.553422212600708, - "logits/rejected": -2.5453922748565674, - "logps/chosen": -263.7728576660156, - "logps/rejected": -298.7444152832031, - "loss": 0.0493, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2447619438171387, - "rewards/margins": 0.2553327679634094, - "rewards/rejected": -1.5000946521759033, + "logits/chosen": -2.4310107231140137, + "logits/rejected": -2.422048568725586, + "logps/chosen": -291.1025695800781, + "logps/rejected": -328.18963623046875, + "loss": 0.6079, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4314143657684326, + "rewards/margins": 0.3364001214504242, + "rewards/rejected": -1.7678143978118896, "step": 230 }, { - "debug/losses": 0.06539560854434967, - "debug/policy_weights": 0.10245949029922485, - "debug/raw_losses": 0.6345073580741882, - "epoch": 0.1910067648229208, - "grad_norm": 1.9058271009788008, + "epoch": 0.19, "learning_rate": 4.875484304880629e-07, - "logits/chosen": -2.576491594314575, - "logits/rejected": -2.558601140975952, - "logps/chosen": -257.90118408203125, - "logps/rejected": -279.42266845703125, - "loss": 0.0646, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.9614318609237671, - "rewards/margins": 0.2736155390739441, - "rewards/rejected": -1.2350473403930664, + "logits/chosen": -2.3412394523620605, + "logits/rejected": -2.309183120727539, + "logps/chosen": -280.8785705566406, + "logps/rejected": -308.54132080078125, + "loss": 0.613, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.309309482574463, + "rewards/margins": 0.3731766939163208, + "rewards/rejected": -1.6824861764907837, "step": 240 }, { - "debug/losses": 0.06740079820156097, - "debug/policy_weights": 0.11094751209020615, - "debug/raw_losses": 0.6248602271080017, - "epoch": 0.19896538002387584, - "grad_norm": 1.5384599137611568, + "epoch": 0.2, "learning_rate": 4.852907416036558e-07, - "logits/chosen": -2.521726131439209, - "logits/rejected": -2.512889862060547, - "logps/chosen": -222.5371856689453, - "logps/rejected": -262.0302429199219, - "loss": 0.0757, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.8324035406112671, - "rewards/margins": 0.2716120183467865, - "rewards/rejected": -1.1040157079696655, + "logits/chosen": -2.415271282196045, + "logits/rejected": -2.4072234630584717, + "logps/chosen": -243.56332397460938, + "logps/rejected": -298.7532043457031, + "loss": 0.591, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.967076301574707, + "rewards/margins": 0.4581146240234375, + "rewards/rejected": -1.4251911640167236, "step": 250 }, { - "debug/losses": 0.06672520935535431, - "debug/policy_weights": 0.11189812421798706, - "debug/raw_losses": 0.5824211239814758, - "epoch": 0.20692399522483088, - "grad_norm": 1.4195949507142585, + "epoch": 0.21, "learning_rate": 4.828512000318616e-07, - "logits/chosen": -2.535505533218384, - "logits/rejected": -2.4970450401306152, - "logps/chosen": -254.94741821289062, - "logps/rejected": -278.82440185546875, - "loss": 0.0639, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.9024659395217896, - "rewards/margins": 0.3342443108558655, - "rewards/rejected": -1.2367103099822998, + "logits/chosen": -2.3924427032470703, + "logits/rejected": -2.3613152503967285, + "logps/chosen": -266.86572265625, + "logps/rejected": -304.2983093261719, + "loss": 0.5986, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2214807271957397, + "rewards/margins": 0.4553411602973938, + "rewards/rejected": -1.6768219470977783, "step": 260 }, { - "debug/losses": 0.05524461343884468, - "debug/policy_weights": 0.09467937052249908, - "debug/raw_losses": 0.600980281829834, - "epoch": 0.2148826104257859, - "grad_norm": 2.09985567586005, + "epoch": 0.21, "learning_rate": 4.802316912577946e-07, - "logits/chosen": -2.4608120918273926, - "logits/rejected": -2.4260432720184326, - "logps/chosen": -242.8441925048828, - "logps/rejected": -261.7018127441406, - "loss": 0.0597, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.9519554972648621, - "rewards/margins": 0.32279661297798157, - "rewards/rejected": -1.2747520208358765, + "logits/chosen": -2.4108529090881348, + "logits/rejected": -2.3902478218078613, + "logps/chosen": -252.7959442138672, + "logps/rejected": -295.266357421875, + "loss": 0.5917, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0824626684188843, + "rewards/margins": 0.39643940329551697, + "rewards/rejected": -1.4789022207260132, "step": 270 }, { - "debug/losses": 0.07073532044887543, - "debug/policy_weights": 0.11124340444803238, - "debug/raw_losses": 0.6386277079582214, - "epoch": 0.22284122562674094, - "grad_norm": 2.53125193502766, + "epoch": 0.22, "learning_rate": 4.774342398605221e-07, - "logits/chosen": -2.4330108165740967, - "logits/rejected": -2.4124581813812256, - "logps/chosen": -238.8041229248047, - "logps/rejected": -258.5404357910156, - "loss": 0.0674, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.9319969415664673, - "rewards/margins": 0.2547939419746399, - "rewards/rejected": -1.186790943145752, + "logits/chosen": -2.3505263328552246, + "logits/rejected": -2.2942967414855957, + "logps/chosen": -279.871337890625, + "logps/rejected": -300.4220886230469, + "loss": 0.5979, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1787078380584717, + "rewards/margins": 0.430286169052124, + "rewards/rejected": -1.6089938879013062, "step": 280 }, { - "debug/losses": 0.06495990604162216, - "debug/policy_weights": 0.10757051408290863, - "debug/raw_losses": 0.5930755138397217, - "epoch": 0.23079984082769597, - "grad_norm": 2.544321863170394, + "epoch": 0.23, "learning_rate": 4.744610079482978e-07, - "logits/chosen": -2.4562830924987793, - "logits/rejected": -2.4185662269592285, - "logps/chosen": -274.9333190917969, - "logps/rejected": -300.23583984375, - "loss": 0.0558, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.0925931930541992, - "rewards/margins": 0.3425787389278412, - "rewards/rejected": -1.4351718425750732, + "logits/chosen": -2.3269264698028564, + "logits/rejected": -2.2910802364349365, + "logps/chosen": -255.27706909179688, + "logps/rejected": -281.60137939453125, + "loss": 0.5853, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1753785610198975, + "rewards/margins": 0.3495523929595947, + "rewards/rejected": -1.5249310731887817, "step": 290 }, { - "debug/losses": 0.04091046005487442, - "debug/policy_weights": 0.07422160357236862, - "debug/raw_losses": 0.5923314094543457, - "epoch": 0.238758456028651, - "grad_norm": 2.8319076712554905, + "epoch": 0.24, "learning_rate": 4.713142934875005e-07, - "logits/chosen": -2.3712522983551025, - "logits/rejected": -2.329404592514038, - "logps/chosen": -278.4283752441406, - "logps/rejected": -303.9710388183594, - "loss": 0.0515, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.30873441696167, - "rewards/margins": 0.4222927689552307, - "rewards/rejected": -1.7310272455215454, + "logits/chosen": -2.2868428230285645, + "logits/rejected": -2.2631592750549316, + "logps/chosen": -284.2200012207031, + "logps/rejected": -322.45269775390625, + "loss": 0.5965, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.336501955986023, + "rewards/margins": 0.3968800902366638, + "rewards/rejected": -1.733382225036621, "step": 300 }, { - "epoch": 0.238758456028651, - "eval_debug/losses": 0.0557919405400753, - "eval_debug/policy_weights": 0.09022855013608932, - "eval_debug/raw_losses": 0.6134340167045593, - "eval_logits/chosen": -2.3552868366241455, - "eval_logits/rejected": -2.339716911315918, - "eval_logps/chosen": -268.7242736816406, - "eval_logps/rejected": -311.3038635253906, - "eval_loss": 0.05731642618775368, - "eval_rewards/accuracies": 0.6567164063453674, - "eval_rewards/chosen": -1.2448077201843262, - "eval_rewards/margins": 0.36124110221862793, - "eval_rewards/rejected": -1.6060487031936646, - "eval_runtime": 153.2008, - "eval_samples_per_second": 55.822, - "eval_steps_per_second": 0.875, + "epoch": 0.24, + "eval_logits/chosen": -2.265592098236084, + "eval_logits/rejected": -2.244987964630127, + "eval_logps/chosen": -282.3620910644531, + "eval_logps/rejected": -331.2099609375, + "eval_loss": 0.5907339453697205, + "eval_rewards/accuracies": 0.6623134613037109, + "eval_rewards/chosen": -1.3778856992721558, + "eval_rewards/margins": 0.42287060618400574, + "eval_rewards/rejected": -1.8007562160491943, + "eval_runtime": 184.1898, + "eval_samples_per_second": 46.43, + "eval_steps_per_second": 0.728, "step": 300 }, { - "debug/losses": 0.0425337590277195, - "debug/policy_weights": 0.06980495154857635, - "debug/raw_losses": 0.588487982749939, - "epoch": 0.24671707122960604, - "grad_norm": 1.7020687918550936, + "epoch": 0.25, "learning_rate": 4.679965285265706e-07, - "logits/chosen": -2.2830398082733154, - "logits/rejected": -2.270655632019043, - "logps/chosen": -256.02520751953125, - "logps/rejected": -309.3921813964844, - "loss": 0.0508, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3847135305404663, - "rewards/margins": 0.4131029546260834, - "rewards/rejected": -1.7978166341781616, + "logits/chosen": -2.2354235649108887, + "logits/rejected": -2.23685884475708, + "logps/chosen": -277.09283447265625, + "logps/rejected": -347.7145080566406, + "loss": 0.5612, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3514426946640015, + "rewards/margins": 0.4907970428466797, + "rewards/rejected": -1.8422397375106812, "step": 310 }, { - "debug/losses": 0.030957188457250595, - "debug/policy_weights": 0.05928220599889755, - "debug/raw_losses": 0.5176903009414673, - "epoch": 0.2546756864305611, - "grad_norm": 1.8070710565204129, + "epoch": 0.25, "learning_rate": 4.64510277316316e-07, - "logits/chosen": -2.213348150253296, - "logits/rejected": -2.1686458587646484, - "logps/chosen": -307.92364501953125, - "logps/rejected": -366.7274169921875, - "loss": 0.0327, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.6229156255722046, - "rewards/margins": 0.6521427631378174, - "rewards/rejected": -2.2750582695007324, + "logits/chosen": -2.2262344360351562, + "logits/rejected": -2.226029634475708, + "logps/chosen": -271.74212646484375, + "logps/rejected": -332.5010986328125, + "loss": 0.5903, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3337775468826294, + "rewards/margins": 0.39512914419174194, + "rewards/rejected": -1.7289068698883057, "step": 320 }, { - "debug/losses": 0.03477818891406059, - "debug/policy_weights": 0.05532154440879822, - "debug/raw_losses": 0.6181614398956299, - "epoch": 0.26263430163151613, - "grad_norm": 1.6888122864648538, + "epoch": 0.26, "learning_rate": 4.6085823432804137e-07, - "logits/chosen": -2.2881710529327393, - "logits/rejected": -2.259243965148926, - "logps/chosen": -319.549560546875, - "logps/rejected": -354.44024658203125, - "loss": 0.0355, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.757887840270996, - "rewards/margins": 0.3513242304325104, - "rewards/rejected": -2.1092123985290527, + "logits/chosen": -2.2451891899108887, + "logits/rejected": -2.2502384185791016, + "logps/chosen": -250.6347198486328, + "logps/rejected": -333.8939208984375, + "loss": 0.5722, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1326004266738892, + "rewards/margins": 0.5066065192222595, + "rewards/rejected": -1.639206886291504, "step": 330 }, { - "debug/losses": 0.036134570837020874, - "debug/policy_weights": 0.0601133331656456, - "debug/raw_losses": 0.6012001633644104, - "epoch": 0.27059291683247116, - "grad_norm": 0.9082619220908468, + "epoch": 0.27, "learning_rate": 4.570432221710314e-07, - "logits/chosen": -2.345127820968628, - "logits/rejected": -2.328043222427368, - "logps/chosen": -311.11541748046875, - "logps/rejected": -360.7956237792969, - "loss": 0.0329, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.6493396759033203, - "rewards/margins": 0.4032120704650879, - "rewards/rejected": -2.052551746368408, + "logits/chosen": -2.0656931400299072, + "logits/rejected": -2.0213730335235596, + "logps/chosen": -318.232177734375, + "logps/rejected": -369.13311767578125, + "loss": 0.5766, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.584176778793335, + "rewards/margins": 0.5901076197624207, + "rewards/rejected": -2.1742844581604004, "step": 340 }, { - "debug/losses": 0.038389407098293304, - "debug/policy_weights": 0.0665694996714592, - "debug/raw_losses": 0.5971581339836121, - "epoch": 0.2785515320334262, - "grad_norm": 1.7673927299473366, + "epoch": 0.28, "learning_rate": 4.5306818941099866e-07, - "logits/chosen": -2.3454818725585938, - "logits/rejected": -2.2886033058166504, - "logps/chosen": -309.58526611328125, - "logps/rejected": -334.3452453613281, - "loss": 0.0377, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.5936106443405151, - "rewards/margins": 0.41180863976478577, - "rewards/rejected": -2.0054192543029785, + "logits/chosen": -1.9084612131118774, + "logits/rejected": -1.8514792919158936, + "logps/chosen": -316.9821472167969, + "logps/rejected": -352.9412841796875, + "loss": 0.5825, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5627154111862183, + "rewards/margins": 0.5152220726013184, + "rewards/rejected": -2.077937364578247, "step": 350 }, { - "debug/losses": 0.03970758244395256, - "debug/policy_weights": 0.06552158296108246, - "debug/raw_losses": 0.5963835716247559, - "epoch": 0.28651014723438123, - "grad_norm": 1.165368953491439, + "epoch": 0.29, "learning_rate": 4.4893620829118124e-07, - "logits/chosen": -2.3107028007507324, - "logits/rejected": -2.2786478996276855, - "logps/chosen": -296.3944091796875, - "logps/rejected": -329.440185546875, - "loss": 0.042, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.4465725421905518, - "rewards/margins": 0.39386042952537537, - "rewards/rejected": -1.840433120727539, + "logits/chosen": -1.8860156536102295, + "logits/rejected": -1.8301204442977905, + "logps/chosen": -309.8200378417969, + "logps/rejected": -362.0408935546875, + "loss": 0.5755, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5869390964508057, + "rewards/margins": 0.49348369240760803, + "rewards/rejected": -2.080422878265381, "step": 360 }, { - "debug/losses": 0.031909000128507614, - "debug/policy_weights": 0.05640440434217453, - "debug/raw_losses": 0.632055401802063, - "epoch": 0.29446876243533626, - "grad_norm": 1.4365386513442289, + "epoch": 0.29, "learning_rate": 4.4465047235785185e-07, - "logits/chosen": -2.3353095054626465, - "logits/rejected": -2.299114942550659, - "logps/chosen": -316.80322265625, - "logps/rejected": -334.6367492675781, - "loss": 0.038, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.719588041305542, - "rewards/margins": 0.34410953521728516, - "rewards/rejected": -2.063697576522827, + "logits/chosen": -1.6610889434814453, + "logits/rejected": -1.585129737854004, + "logps/chosen": -321.8608703613281, + "logps/rejected": -380.31036376953125, + "loss": 0.5697, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.774713158607483, + "rewards/margins": 0.6593302488327026, + "rewards/rejected": -2.4340434074401855, "step": 370 }, { - "debug/losses": 0.03546866774559021, - "debug/policy_weights": 0.05660950019955635, - "debug/raw_losses": 0.6386741399765015, - "epoch": 0.3024273776362913, - "grad_norm": 1.6981468490942673, + "epoch": 0.3, "learning_rate": 4.40214293992074e-07, - "logits/chosen": -2.302638530731201, - "logits/rejected": -2.2782890796661377, - "logps/chosen": -302.3027038574219, - "logps/rejected": -331.7867736816406, - "loss": 0.0384, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.649095892906189, - "rewards/margins": 0.3226591646671295, - "rewards/rejected": -1.971754789352417, + "logits/chosen": -1.385825753211975, + "logits/rejected": -1.31913161277771, + "logps/chosen": -377.07269287109375, + "logps/rejected": -459.5557556152344, + "loss": 0.5818, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1890993118286133, + "rewards/margins": 0.7521292567253113, + "rewards/rejected": -2.9412286281585693, "step": 380 }, { - "debug/losses": 0.04937135428190231, - "debug/policy_weights": 0.07896497845649719, - "debug/raw_losses": 0.6217660903930664, - "epoch": 0.3103859928372463, - "grad_norm": 1.7063718152736185, + "epoch": 0.31, "learning_rate": 4.3563110184961234e-07, - "logits/chosen": -2.327444076538086, - "logits/rejected": -2.305551052093506, - "logps/chosen": -294.81793212890625, - "logps/rejected": -332.69305419921875, - "loss": 0.0437, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.4561359882354736, - "rewards/margins": 0.34932222962379456, - "rewards/rejected": -1.8054578304290771, + "logits/chosen": -1.5089499950408936, + "logits/rejected": -1.4075387716293335, + "logps/chosen": -338.3626708984375, + "logps/rejected": -396.67578125, + "loss": 0.5584, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9042552709579468, + "rewards/margins": 0.5932050347328186, + "rewards/rejected": -2.49746036529541, "step": 390 }, { - "debug/losses": 0.03210610896348953, - "debug/policy_weights": 0.06029694527387619, - "debug/raw_losses": 0.5555212497711182, - "epoch": 0.31834460803820136, - "grad_norm": 1.053077631697114, + "epoch": 0.32, "learning_rate": 4.3090443821097566e-07, - "logits/chosen": -2.3126392364501953, - "logits/rejected": -2.290933847427368, - "logps/chosen": -292.9631042480469, - "logps/rejected": -358.627685546875, - "loss": 0.0343, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.5689239501953125, - "rewards/margins": 0.5151135325431824, - "rewards/rejected": -2.0840373039245605, + "logits/chosen": -1.2587625980377197, + "logits/rejected": -1.2017955780029297, + "logps/chosen": -309.43377685546875, + "logps/rejected": -372.00531005859375, + "loss": 0.5729, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.9031288623809814, + "rewards/margins": 0.5602144598960876, + "rewards/rejected": -2.4633431434631348, "step": 400 }, { - "epoch": 0.31834460803820136, - "eval_debug/losses": 0.028384286910295486, - "eval_debug/policy_weights": 0.04820017144083977, - "eval_debug/raw_losses": 0.59935063123703, - "eval_logits/chosen": -2.3006632328033447, - "eval_logits/rejected": -2.2854816913604736, - "eval_logps/chosen": -321.4913330078125, - "eval_logps/rejected": -364.0837097167969, - "eval_loss": 0.030249595642089844, - "eval_rewards/accuracies": 0.6623134613037109, - "eval_rewards/chosen": -1.7724782228469849, - "eval_rewards/margins": 0.3613690733909607, - "eval_rewards/rejected": -2.133847236633301, - "eval_runtime": 153.4178, - "eval_samples_per_second": 55.743, - "eval_steps_per_second": 0.873, + "epoch": 0.32, + "eval_logits/chosen": -1.3760210275650024, + "eval_logits/rejected": -1.2920024394989014, + "eval_logps/chosen": -312.20635986328125, + "eval_logps/rejected": -375.1720275878906, + "eval_loss": 0.5711147785186768, + "eval_rewards/accuracies": 0.6828358173370361, + "eval_rewards/chosen": -1.676328182220459, + "eval_rewards/margins": 0.5640482306480408, + "eval_rewards/rejected": -2.2403764724731445, + "eval_runtime": 184.0944, + "eval_samples_per_second": 46.454, + "eval_steps_per_second": 0.728, "step": 400 }, { - "debug/losses": 0.029884925112128258, - "debug/policy_weights": 0.05188853666186333, - "debug/raw_losses": 0.5859032869338989, - "epoch": 0.3263032232391564, - "grad_norm": 1.1682793896386887, + "epoch": 0.33, "learning_rate": 4.2603795624364195e-07, - "logits/chosen": -2.279048204421997, - "logits/rejected": -2.2422895431518555, - "logps/chosen": -302.4944152832031, - "logps/rejected": -329.99310302734375, - "loss": 0.0303, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.6612056493759155, - "rewards/margins": 0.3638765215873718, - "rewards/rejected": -2.0250821113586426, + "logits/chosen": -1.2894772291183472, + "logits/rejected": -1.23129141330719, + "logps/chosen": -299.457275390625, + "logps/rejected": -370.8555908203125, + "loss": 0.5666, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6054102182388306, + "rewards/margins": 0.5984233021736145, + "rewards/rejected": -2.203833818435669, "step": 410 }, { - "debug/losses": 0.039774149656295776, - "debug/policy_weights": 0.06452381610870361, - "debug/raw_losses": 0.5788149833679199, - "epoch": 0.3342618384401114, - "grad_norm": 1.1186339494698643, + "epoch": 0.33, "learning_rate": 4.210354171785795e-07, - "logits/chosen": -2.3332016468048096, - "logits/rejected": -2.331998109817505, - "logps/chosen": -291.0079040527344, - "logps/rejected": -344.42462158203125, - "loss": 0.0361, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.488434910774231, - "rewards/margins": 0.40906715393066406, - "rewards/rejected": -1.8975019454956055, + "logits/chosen": -1.022984266281128, + "logits/rejected": -0.9285897016525269, + "logps/chosen": -324.4284973144531, + "logps/rejected": -385.0074157714844, + "loss": 0.5596, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.921677589416504, + "rewards/margins": 0.5404387712478638, + "rewards/rejected": -2.4621164798736572, "step": 420 }, { - "debug/losses": 0.03884187713265419, - "debug/policy_weights": 0.06631166487932205, - "debug/raw_losses": 0.5891371369361877, - "epoch": 0.34222045364106646, - "grad_norm": 1.8153925533673183, + "epoch": 0.34, "learning_rate": 4.15900687403248e-07, - "logits/chosen": -2.3279812335968018, - "logits/rejected": -2.3141000270843506, - "logps/chosen": -287.6651916503906, - "logps/rejected": -336.90863037109375, - "loss": 0.0393, + "logits/chosen": -0.8059805631637573, + "logits/rejected": -0.7196700572967529, + "logps/chosen": -353.788330078125, + "logps/rejected": -411.4853515625, + "loss": 0.5865, "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.4973087310791016, - "rewards/margins": 0.46901196241378784, - "rewards/rejected": -1.9663206338882446, + "rewards/chosen": -2.1321234703063965, + "rewards/margins": 0.463266521692276, + "rewards/rejected": -2.5953898429870605, "step": 430 }, { - "debug/losses": 0.03900137543678284, - "debug/policy_weights": 0.06590630114078522, - "debug/raw_losses": 0.6019418239593506, - "epoch": 0.3501790688420215, - "grad_norm": 1.3373117259952034, + "epoch": 0.35, "learning_rate": 4.1063773547332584e-07, - "logits/chosen": -2.332784652709961, - "logits/rejected": -2.311373233795166, - "logps/chosen": -290.93402099609375, - "logps/rejected": -337.70306396484375, - "loss": 0.0411, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.4399511814117432, - "rewards/margins": 0.4576480984687805, - "rewards/rejected": -1.897599458694458, + "logits/chosen": -0.9645301699638367, + "logits/rejected": -0.7601315975189209, + "logps/chosen": -346.8272705078125, + "logps/rejected": -392.2935791015625, + "loss": 0.5591, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9153356552124023, + "rewards/margins": 0.5891679525375366, + "rewards/rejected": -2.5045037269592285, "step": 440 }, { - "debug/losses": 0.034195203334093094, - "debug/policy_weights": 0.06440727412700653, - "debug/raw_losses": 0.5752583742141724, - "epoch": 0.3581376840429765, - "grad_norm": 1.4642214637111448, + "epoch": 0.36, "learning_rate": 4.0525062904547276e-07, - "logits/chosen": -2.274158000946045, - "logits/rejected": -2.241867780685425, - "logps/chosen": -274.84783935546875, - "logps/rejected": -318.5943908691406, - "loss": 0.0424, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.4228484630584717, - "rewards/margins": 0.5137365460395813, - "rewards/rejected": -1.9365848302841187, + "logits/chosen": -0.608537495136261, + "logits/rejected": -0.47767123579978943, + "logps/chosen": -341.55364990234375, + "logps/rejected": -434.1073303222656, + "loss": 0.5687, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.105318069458008, + "rewards/margins": 0.6994394659996033, + "rewards/rejected": -2.8047571182250977, "step": 450 }, { - "debug/losses": 0.042845163494348526, - "debug/policy_weights": 0.07054711878299713, - "debug/raw_losses": 0.5637282729148865, - "epoch": 0.36609629924393156, - "grad_norm": 1.493914915308785, + "epoch": 0.37, "learning_rate": 3.997435317334988e-07, - "logits/chosen": -2.3144097328186035, - "logits/rejected": -2.2954559326171875, - "logps/chosen": -297.01629638671875, - "logps/rejected": -354.07861328125, - "loss": 0.0384, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.5000154972076416, - "rewards/margins": 0.5090978145599365, - "rewards/rejected": -2.009113311767578, + "logits/chosen": -0.6356207132339478, + "logits/rejected": -0.25634175539016724, + "logps/chosen": -384.43780517578125, + "logps/rejected": -419.24176025390625, + "loss": 0.5608, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.2970900535583496, + "rewards/margins": 0.6535100340843201, + "rewards/rejected": -2.9506001472473145, "step": 460 }, { - "debug/losses": 0.03523220494389534, - "debug/policy_weights": 0.060066692531108856, - "debug/raw_losses": 0.5853220820426941, - "epoch": 0.3740549144448866, - "grad_norm": 1.5606293995092184, + "epoch": 0.37, "learning_rate": 3.941206998903701e-07, - "logits/chosen": -2.290365219116211, - "logits/rejected": -2.2592685222625732, - "logps/chosen": -331.3341369628906, - "logps/rejected": -367.52630615234375, - "loss": 0.031, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.7656469345092773, - "rewards/margins": 0.38830217719078064, - "rewards/rejected": -2.153949022293091, + "logits/chosen": -1.0318920612335205, + "logits/rejected": -0.7451022267341614, + "logps/chosen": -338.9430236816406, + "logps/rejected": -384.64111328125, + "loss": 0.5678, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9643396139144897, + "rewards/margins": 0.5402536392211914, + "rewards/rejected": -2.5045928955078125, "step": 470 }, { - "debug/losses": 0.042910732328891754, - "debug/policy_weights": 0.06740695238113403, - "debug/raw_losses": 0.5751327276229858, - "epoch": 0.3820135296458416, - "grad_norm": 2.0400801936994357, + "epoch": 0.38, "learning_rate": 3.8838647931853684e-07, - "logits/chosen": -2.2397708892822266, - "logits/rejected": -2.2155957221984863, - "logps/chosen": -265.8719177246094, - "logps/rejected": -317.6902770996094, - "loss": 0.0417, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.4397175312042236, - "rewards/margins": 0.4852531850337982, - "rewards/rejected": -1.9249706268310547, + "logits/chosen": -0.6847028732299805, + "logits/rejected": -0.5548251867294312, + "logps/chosen": -339.61456298828125, + "logps/rejected": -435.32061767578125, + "loss": 0.5814, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1349122524261475, + "rewards/margins": 0.7573872804641724, + "rewards/rejected": -2.8923001289367676, "step": 480 }, { - "debug/losses": 0.040277738124132156, - "debug/policy_weights": 0.07625563442707062, - "debug/raw_losses": 0.5467380881309509, - "epoch": 0.38997214484679665, - "grad_norm": 1.5717280943877756, + "epoch": 0.39, "learning_rate": 3.825453019111281e-07, - "logits/chosen": -2.246934175491333, - "logits/rejected": -2.2185163497924805, - "logps/chosen": -281.0770568847656, - "logps/rejected": -348.90753173828125, - "loss": 0.0441, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3660808801651, - "rewards/margins": 0.5780023336410522, - "rewards/rejected": -1.9440832138061523, + "logits/chosen": -0.5378957986831665, + "logits/rejected": -0.28533270955085754, + "logps/chosen": -363.78570556640625, + "logps/rejected": -430.11749267578125, + "loss": 0.5327, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.134934425354004, + "rewards/margins": 0.6089809536933899, + "rewards/rejected": -2.743915319442749, "step": 490 }, { - "debug/losses": 0.0474771223962307, - "debug/policy_weights": 0.08785078674554825, - "debug/raw_losses": 0.5391517281532288, - "epoch": 0.3979307600477517, - "grad_norm": 1.89593573498899, + "epoch": 0.4, "learning_rate": 3.7660168222660824e-07, - "logits/chosen": -2.312988519668579, - "logits/rejected": -2.2645742893218994, - "logps/chosen": -305.178466796875, - "logps/rejected": -341.88360595703125, - "loss": 0.0432, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3672921657562256, - "rewards/margins": 0.5291160941123962, - "rewards/rejected": -1.8964084386825562, + "logits/chosen": -0.6318235397338867, + "logits/rejected": -0.5071814656257629, + "logps/chosen": -350.5252380371094, + "logps/rejected": -421.93353271484375, + "loss": 0.5645, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.062009572982788, + "rewards/margins": 0.5333147048950195, + "rewards/rejected": -2.5953242778778076, "step": 500 }, { - "epoch": 0.3979307600477517, - "eval_debug/losses": 0.0406743586063385, - "eval_debug/policy_weights": 0.07022719830274582, - "eval_debug/raw_losses": 0.5892212986946106, - "eval_logits/chosen": -2.264326333999634, - "eval_logits/rejected": -2.240631580352783, - "eval_logps/chosen": -294.8950500488281, - "eval_logps/rejected": -349.04681396484375, - "eval_loss": 0.043212514370679855, - "eval_rewards/accuracies": 0.6800373196601868, - "eval_rewards/chosen": -1.5065159797668457, - "eval_rewards/margins": 0.4769621789455414, - "eval_rewards/rejected": -1.9834781885147095, - "eval_runtime": 153.3333, - "eval_samples_per_second": 55.774, - "eval_steps_per_second": 0.874, + "epoch": 0.4, + "eval_logits/chosen": -0.7860146760940552, + "eval_logits/rejected": -0.6090859770774841, + "eval_logps/chosen": -351.7882995605469, + "eval_logps/rejected": -419.81939697265625, + "eval_loss": 0.5639454126358032, + "eval_rewards/accuracies": 0.6986940503120422, + "eval_rewards/chosen": -2.0721471309661865, + "eval_rewards/margins": 0.6147031188011169, + "eval_rewards/rejected": -2.6868505477905273, + "eval_runtime": 184.0949, + "eval_samples_per_second": 46.454, + "eval_steps_per_second": 0.728, "step": 500 }, { - "debug/losses": 0.04252048209309578, - "debug/policy_weights": 0.07039301842451096, - "debug/raw_losses": 0.6011983156204224, - "epoch": 0.4058893752487067, - "grad_norm": 2.2077694886776533, + "epoch": 0.41, "learning_rate": 3.705602139995416e-07, - "logits/chosen": -2.2369987964630127, - "logits/rejected": -2.203916072845459, - "logps/chosen": -297.1964111328125, - "logps/rejected": -339.70843505859375, - "loss": 0.0499, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.4493403434753418, - "rewards/margins": 0.44409775733947754, - "rewards/rejected": -1.8934379816055298, + "logits/chosen": -0.7258490920066833, + "logits/rejected": -0.4828409254550934, + "logps/chosen": -388.1371154785156, + "logps/rejected": -422.11181640625, + "loss": 0.574, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.290266513824463, + "rewards/margins": 0.4104091227054596, + "rewards/rejected": -2.7006754875183105, "step": 510 }, { - "debug/losses": 0.045193079859018326, - "debug/policy_weights": 0.07069429010152817, - "debug/raw_losses": 0.6190410256385803, - "epoch": 0.41384799044966175, - "grad_norm": 1.3854536697917073, + "epoch": 0.41, "learning_rate": 3.6442556659016475e-07, - "logits/chosen": -2.264357328414917, - "logits/rejected": -2.24722957611084, - "logps/chosen": -291.65087890625, - "logps/rejected": -329.7967224121094, - "loss": 0.0443, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.4414405822753906, - "rewards/margins": 0.3577377200126648, - "rewards/rejected": -1.7991783618927002, + "logits/chosen": -0.5335447192192078, + "logits/rejected": -0.33706527948379517, + "logps/chosen": -378.86492919921875, + "logps/rejected": -429.67724609375, + "loss": 0.5608, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.236337423324585, + "rewards/margins": 0.556148886680603, + "rewards/rejected": -2.7924864292144775, "step": 520 }, { - "debug/losses": 0.03731568530201912, - "debug/policy_weights": 0.06065317243337631, - "debug/raw_losses": 0.6242992281913757, - "epoch": 0.4218066056506168, - "grad_norm": 1.0635014002404581, + "epoch": 0.42, "learning_rate": 3.582024813755076e-07, - "logits/chosen": -2.1829025745391846, - "logits/rejected": -2.159717082977295, - "logps/chosen": -333.71392822265625, - "logps/rejected": -360.9361877441406, - "loss": 0.0308, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.8308143615722656, - "rewards/margins": 0.3124064803123474, - "rewards/rejected": -2.143220901489258, + "logits/chosen": -0.39548322558403015, + "logits/rejected": -0.10662730038166046, + "logps/chosen": -368.8847961425781, + "logps/rejected": -473.3500061035156, + "loss": 0.5485, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.3263449668884277, + "rewards/margins": 0.8236624598503113, + "rewards/rejected": -3.150007724761963, "step": 530 }, { - "debug/losses": 0.02375711500644684, - "debug/policy_weights": 0.03976626321673393, - "debug/raw_losses": 0.5894423127174377, - "epoch": 0.4297652208515718, - "grad_norm": 0.8777673540587257, + "epoch": 0.43, "learning_rate": 3.5189576808485404e-07, - "logits/chosen": -2.1102588176727295, - "logits/rejected": -2.0886712074279785, - "logps/chosen": -356.2108154296875, - "logps/rejected": -402.5682373046875, - "loss": 0.0241, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.0562400817871094, - "rewards/margins": 0.4145965576171875, - "rewards/rejected": -2.470837116241455, + "logits/chosen": 0.15742243826389313, + "logits/rejected": 0.31491726636886597, + "logps/chosen": -394.34930419921875, + "logps/rejected": -492.82232666015625, + "loss": 0.5478, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6109700202941895, + "rewards/margins": 0.8250136375427246, + "rewards/rejected": -3.435983657836914, "step": 540 }, { - "debug/losses": 0.026336893439292908, - "debug/policy_weights": 0.04056663066148758, - "debug/raw_losses": 0.6279866099357605, - "epoch": 0.43772383605252685, - "grad_norm": 1.3575157514235345, + "epoch": 0.44, "learning_rate": 3.4551030108237433e-07, - "logits/chosen": -2.0719475746154785, - "logits/rejected": -2.0144124031066895, - "logps/chosen": -366.66064453125, - "logps/rejected": -384.7783508300781, - "loss": 0.0252, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -2.140892505645752, - "rewards/margins": 0.3489706516265869, - "rewards/rejected": -2.4898629188537598, + "logits/chosen": -0.2550584375858307, + "logits/rejected": -0.06936412304639816, + "logps/chosen": -406.5508728027344, + "logps/rejected": -448.47576904296875, + "loss": 0.5562, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.5152666568756104, + "rewards/margins": 0.4819938540458679, + "rewards/rejected": -2.997260332107544, "step": 550 }, { - "debug/losses": 0.023122292011976242, - "debug/policy_weights": 0.045903079211711884, - "debug/raw_losses": 0.567331850528717, - "epoch": 0.4456824512534819, - "grad_norm": 1.197996404456996, + "epoch": 0.45, "learning_rate": 3.390510155998023e-07, - "logits/chosen": -2.1358299255371094, - "logits/rejected": -2.1018567085266113, - "logps/chosen": -359.2723693847656, - "logps/rejected": -411.61187744140625, - "loss": 0.0241, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.050138473510742, - "rewards/margins": 0.5038273334503174, - "rewards/rejected": -2.5539660453796387, + "logits/chosen": -0.5292027592658997, + "logits/rejected": -0.2619571387767792, + "logps/chosen": -371.6798095703125, + "logps/rejected": -420.7915954589844, + "loss": 0.5492, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1147050857543945, + "rewards/margins": 0.6524336338043213, + "rewards/rejected": -2.7671384811401367, "step": 560 }, { - "debug/losses": 0.029903370887041092, - "debug/policy_weights": 0.050540171563625336, - "debug/raw_losses": 0.5872690081596375, - "epoch": 0.4536410664544369, - "grad_norm": 1.5301014116065141, + "epoch": 0.45, "learning_rate": 3.325229039220684e-07, - "logits/chosen": -2.0992767810821533, - "logits/rejected": -2.076805830001831, - "logps/chosen": -361.57342529296875, - "logps/rejected": -405.8957214355469, - "loss": 0.0286, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -2.0752933025360107, - "rewards/margins": 0.4622616171836853, - "rewards/rejected": -2.537554979324341, + "logits/chosen": -0.5881962776184082, + "logits/rejected": -0.4658876061439514, + "logps/chosen": -343.7039794921875, + "logps/rejected": -406.14178466796875, + "loss": 0.57, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0860273838043213, + "rewards/margins": 0.498068630695343, + "rewards/rejected": -2.5840957164764404, "step": 570 }, { - "debug/losses": 0.03479158133268356, - "debug/policy_weights": 0.053991250693798065, - "debug/raw_losses": 0.604568600654602, - "epoch": 0.46159968165539195, - "grad_norm": 1.41615105697594, + "epoch": 0.46, "learning_rate": 3.2593101152883795e-07, - "logits/chosen": -2.110628366470337, - "logits/rejected": -2.0749945640563965, - "logps/chosen": -349.9741516113281, - "logps/rejected": -388.0547180175781, - "loss": 0.0249, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -2.010434150695801, - "rewards/margins": 0.40317314863204956, - "rewards/rejected": -2.413607120513916, + "logits/chosen": -0.6565806269645691, + "logits/rejected": -0.2549567222595215, + "logps/chosen": -374.8047180175781, + "logps/rejected": -430.33221435546875, + "loss": 0.5512, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.2211391925811768, + "rewards/margins": 0.6813799142837524, + "rewards/rejected": -2.9025187492370605, "step": 580 }, { - "debug/losses": 0.03422771021723747, - "debug/policy_weights": 0.05810137465596199, - "debug/raw_losses": 0.5813102126121521, - "epoch": 0.469558296856347, - "grad_norm": 1.4269890028062786, + "epoch": 0.47, "learning_rate": 3.192804331949349e-07, - "logits/chosen": -2.1150734424591064, - "logits/rejected": -2.0873661041259766, - "logps/chosen": -322.30841064453125, - "logps/rejected": -366.505859375, - "loss": 0.0311, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.7967876195907593, - "rewards/margins": 0.4749871790409088, - "rewards/rejected": -2.2717747688293457, + "logits/chosen": -0.07184700667858124, + "logits/rejected": 0.1699156016111374, + "logps/chosen": -422.27081298828125, + "logps/rejected": -490.69134521484375, + "loss": 0.535, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.730973720550537, + "rewards/margins": 0.7726518511772156, + "rewards/rejected": -3.5036251544952393, "step": 590 }, { - "debug/losses": 0.03016613982617855, - "debug/policy_weights": 0.05547459051012993, - "debug/raw_losses": 0.5693393349647522, - "epoch": 0.477516912057302, - "grad_norm": 1.2863972010725921, + "epoch": 0.48, "learning_rate": 3.125763090526674e-07, - "logits/chosen": -2.1585888862609863, - "logits/rejected": -2.11460542678833, - "logps/chosen": -328.352783203125, - "logps/rejected": -376.36846923828125, - "loss": 0.0342, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.735060691833496, - "rewards/margins": 0.5392208099365234, - "rewards/rejected": -2.2742817401885986, + "logits/chosen": -0.029465889558196068, + "logits/rejected": 0.15842057764530182, + "logps/chosen": -417.373046875, + "logps/rejected": -478.73291015625, + "loss": 0.5513, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.8006317615509033, + "rewards/margins": 0.6451797485351562, + "rewards/rejected": -3.4458115100860596, "step": 600 }, { - "epoch": 0.477516912057302, - "eval_debug/losses": 0.030185788869857788, - "eval_debug/policy_weights": 0.05269056186079979, - "eval_debug/raw_losses": 0.5812317132949829, - "eval_logits/chosen": -2.13511323928833, - "eval_logits/rejected": -2.11336612701416, - "eval_logps/chosen": -327.0503234863281, - "eval_logps/rejected": -381.1920471191406, - "eval_loss": 0.032057907432317734, - "eval_rewards/accuracies": 0.6875, - "eval_rewards/chosen": -1.8280682563781738, - "eval_rewards/margins": 0.4768621623516083, - "eval_rewards/rejected": -2.3049304485321045, - "eval_runtime": 153.2709, - "eval_samples_per_second": 55.797, - "eval_steps_per_second": 0.874, + "epoch": 0.48, + "eval_logits/chosen": -0.10542195290327072, + "eval_logits/rejected": 0.12242482602596283, + "eval_logps/chosen": -436.9386291503906, + "eval_logps/rejected": -505.02227783203125, + "eval_loss": 0.5582411885261536, + "eval_rewards/accuracies": 0.7108209133148193, + "eval_rewards/chosen": -2.9236514568328857, + "eval_rewards/margins": 0.6152271032333374, + "eval_rewards/rejected": -3.5388784408569336, + "eval_runtime": 184.0286, + "eval_samples_per_second": 46.471, + "eval_steps_per_second": 0.728, "step": 600 }, { - "debug/losses": 0.02869519032537937, - "debug/policy_weights": 0.045216310769319534, - "debug/raw_losses": 0.6201798319816589, - "epoch": 0.48547552725825704, - "grad_norm": 1.0413008003733695, + "epoch": 0.49, "learning_rate": 3.0582382061909623e-07, - "logits/chosen": -2.144442081451416, - "logits/rejected": -2.114197015762329, - "logps/chosen": -334.3606262207031, - "logps/rejected": -370.7452087402344, - "loss": 0.031, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.9323441982269287, - "rewards/margins": 0.3452734053134918, - "rewards/rejected": -2.277617931365967, + "logits/chosen": -0.2445104569196701, + "logits/rejected": -0.018268002197146416, + "logps/chosen": -441.7857971191406, + "logps/rejected": -502.60791015625, + "loss": 0.5594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.858261823654175, + "rewards/margins": 0.5510683655738831, + "rewards/rejected": -3.409330368041992, "step": 610 }, { - "debug/losses": 0.02686266042292118, - "debug/policy_weights": 0.04530490189790726, - "debug/raw_losses": 0.5958613753318787, - "epoch": 0.4934341424592121, - "grad_norm": 1.20614262286929, + "epoch": 0.49, "learning_rate": 2.9902818679131775e-07, - "logits/chosen": -2.1490912437438965, - "logits/rejected": -2.1047191619873047, - "logps/chosen": -345.37994384765625, - "logps/rejected": -382.47808837890625, - "loss": 0.0274, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.9065704345703125, - "rewards/margins": 0.42660021781921387, - "rewards/rejected": -2.3331706523895264, + "logits/chosen": -0.4190225601196289, + "logits/rejected": -0.22823679447174072, + "logps/chosen": -399.03924560546875, + "logps/rejected": -498.6724548339844, + "loss": 0.5499, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.650449275970459, + "rewards/margins": 0.7673205137252808, + "rewards/rejected": -3.4177703857421875, "step": 620 }, { - "debug/losses": 0.02654409408569336, - "debug/policy_weights": 0.04730083793401718, - "debug/raw_losses": 0.5461606979370117, - "epoch": 0.5013927576601671, - "grad_norm": 1.8099237202122547, + "epoch": 0.5, "learning_rate": 2.921946598128571e-07, - "logits/chosen": -2.12278413772583, - "logits/rejected": -2.0931880474090576, - "logps/chosen": -322.9065246582031, - "logps/rejected": -354.53228759765625, - "loss": 0.0319, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.7012741565704346, - "rewards/margins": 0.5234018564224243, - "rewards/rejected": -2.2246758937835693, + "logits/chosen": -0.43653860688209534, + "logits/rejected": -0.20837187767028809, + "logps/chosen": -402.82781982421875, + "logps/rejected": -485.4117736816406, + "loss": 0.5739, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.573000192642212, + "rewards/margins": 0.7478531002998352, + "rewards/rejected": -3.3208529949188232, "step": 630 }, { - "debug/losses": 0.037343092262744904, - "debug/policy_weights": 0.053458474576473236, - "debug/raw_losses": 0.6420666575431824, - "epoch": 0.5093513728611222, - "grad_norm": 1.6865475152716571, + "epoch": 0.51, "learning_rate": 2.8532852121428733e-07, - "logits/chosen": -2.1078145503997803, - "logits/rejected": -2.0716660022735596, - "logps/chosen": -296.3316345214844, - "logps/rejected": -330.1187744140625, - "loss": 0.0344, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.6080780029296875, - "rewards/margins": 0.36063456535339355, - "rewards/rejected": -1.968712568283081, + "logits/chosen": -0.43430274724960327, + "logits/rejected": -0.13240045309066772, + "logps/chosen": -397.2491149902344, + "logps/rejected": -442.12384033203125, + "loss": 0.5462, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4125733375549316, + "rewards/margins": 0.5821372866630554, + "rewards/rejected": -2.9947104454040527, "step": 640 }, { - "debug/losses": 0.035708196461200714, - "debug/policy_weights": 0.05902569368481636, - "debug/raw_losses": 0.5638469457626343, - "epoch": 0.5173099880620772, - "grad_norm": 1.7034498752676623, + "epoch": 0.52, "learning_rate": 2.7843507773121414e-07, - "logits/chosen": -2.1405997276306152, - "logits/rejected": -2.108238697052002, - "logps/chosen": -292.4433288574219, - "logps/rejected": -354.64459228515625, - "loss": 0.0376, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.5522750616073608, - "rewards/margins": 0.5501760840415955, - "rewards/rejected": -2.1024510860443115, + "logits/chosen": -0.4247920513153076, + "logits/rejected": -0.21372787654399872, + "logps/chosen": -389.4237976074219, + "logps/rejected": -458.3169860839844, + "loss": 0.5373, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.444688320159912, + "rewards/margins": 0.7236617207527161, + "rewards/rejected": -3.1683506965637207, "step": 650 }, { - "debug/losses": 0.029026631265878677, - "debug/policy_weights": 0.05990219861268997, - "debug/raw_losses": 0.531947135925293, - "epoch": 0.5252686032630323, - "grad_norm": 1.5938360839379533, + "epoch": 0.53, "learning_rate": 2.715196572027789e-07, - "logits/chosen": -2.1009621620178223, - "logits/rejected": -2.0724639892578125, - "logps/chosen": -312.18450927734375, - "logps/rejected": -384.11865234375, - "loss": 0.0339, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.6771888732910156, - "rewards/margins": 0.6227995753288269, - "rewards/rejected": -2.2999885082244873, + "logits/chosen": -0.6697942614555359, + "logits/rejected": -0.4933086931705475, + "logps/chosen": -387.529296875, + "logps/rejected": -472.73944091796875, + "loss": 0.5685, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.3496451377868652, + "rewards/margins": 0.7728831171989441, + "rewards/rejected": -3.122528314590454, "step": 660 }, { - "debug/losses": 0.029919123277068138, - "debug/policy_weights": 0.05170232057571411, - "debug/raw_losses": 0.6014626026153564, - "epoch": 0.5332272184639872, - "grad_norm": 1.5125072558657653, + "epoch": 0.53, "learning_rate": 2.645876044538521e-07, - "logits/chosen": -2.1004645824432373, - "logits/rejected": -2.0667197704315186, - "logps/chosen": -330.8302307128906, - "logps/rejected": -369.1127624511719, - "loss": 0.0319, + "logits/chosen": -1.0338900089263916, + "logits/rejected": -0.8813627362251282, + "logps/chosen": -372.53118896484375, + "logps/rejected": -426.54241943359375, + "loss": 0.5725, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.8169825077056885, - "rewards/margins": 0.46573132276535034, - "rewards/rejected": -2.2827141284942627, + "rewards/chosen": -2.201908588409424, + "rewards/margins": 0.5010865926742554, + "rewards/rejected": -2.7029950618743896, "step": 670 }, { - "debug/losses": 0.038384515792131424, - "debug/policy_weights": 0.06169766187667847, - "debug/raw_losses": 0.5807624459266663, - "epoch": 0.5411858336649423, - "grad_norm": 1.2825455723938728, + "epoch": 0.54, "learning_rate": 2.5764427716409815e-07, - "logits/chosen": -2.126770496368408, - "logits/rejected": -2.0941073894500732, - "logps/chosen": -320.5771484375, - "logps/rejected": -363.2606506347656, - "loss": 0.0316, + "logits/chosen": -0.9278701543807983, + "logits/rejected": -0.7282145023345947, + "logps/chosen": -347.2828674316406, + "logps/rejected": -416.9349060058594, + "loss": 0.5479, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.6580756902694702, - "rewards/margins": 0.4379847049713135, - "rewards/rejected": -2.096060276031494, + "rewards/chosen": -2.0276436805725098, + "rewards/margins": 0.743033230304718, + "rewards/rejected": -2.770677089691162, "step": 680 }, { - "debug/losses": 0.03115171566605568, - "debug/policy_weights": 0.051781851798295975, - "debug/raw_losses": 0.6048017144203186, - "epoch": 0.5491444488658973, - "grad_norm": 1.312607359472945, + "epoch": 0.55, "learning_rate": 2.5069504172710494e-07, - "logits/chosen": -2.095470905303955, - "logits/rejected": -2.08461332321167, - "logps/chosen": -335.7181091308594, - "logps/rejected": -388.94171142578125, - "loss": 0.0288, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.873830795288086, - "rewards/margins": 0.3542255759239197, - "rewards/rejected": -2.2280564308166504, + "logits/chosen": -0.5008482336997986, + "logits/rejected": -0.34875133633613586, + "logps/chosen": -373.7621154785156, + "logps/rejected": -485.12884521484375, + "loss": 0.5217, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.347053050994873, + "rewards/margins": 0.9024646878242493, + "rewards/rejected": -3.2495174407958984, "step": 690 }, { - "debug/losses": 0.032391004264354706, - "debug/policy_weights": 0.04837799817323685, - "debug/raw_losses": 0.63223797082901, - "epoch": 0.5571030640668524, - "grad_norm": 1.2365349964087866, + "epoch": 0.56, "learning_rate": 2.4374526910277886e-07, - "logits/chosen": -2.010230541229248, - "logits/rejected": -1.984021782875061, - "logps/chosen": -325.3149108886719, - "logps/rejected": -352.573974609375, - "loss": 0.0283, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.8190257549285889, - "rewards/margins": 0.3168783187866211, - "rewards/rejected": -2.13590407371521, + "logits/chosen": 0.06850005686283112, + "logits/rejected": 0.41385045647621155, + "logps/chosen": -411.46246337890625, + "logps/rejected": -476.6162109375, + "loss": 0.5571, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.6757898330688477, + "rewards/margins": 0.8085702657699585, + "rewards/rejected": -3.4843602180480957, "step": 700 }, { - "epoch": 0.5571030640668524, - "eval_debug/losses": 0.026820143684744835, - "eval_debug/policy_weights": 0.04667918384075165, - "eval_debug/raw_losses": 0.5765854716300964, - "eval_logits/chosen": -1.9899719953536987, - "eval_logits/rejected": -1.9676791429519653, - "eval_logps/chosen": -328.6565856933594, - "eval_logps/rejected": -378.7768859863281, - "eval_loss": 0.028307339176535606, - "eval_rewards/accuracies": 0.6940298676490784, - "eval_rewards/chosen": -1.8441307544708252, - "eval_rewards/margins": 0.4366479218006134, - "eval_rewards/rejected": -2.280778408050537, - "eval_runtime": 153.3492, - "eval_samples_per_second": 55.768, - "eval_steps_per_second": 0.874, + "epoch": 0.56, + "eval_logits/chosen": 0.035554468631744385, + "eval_logits/rejected": 0.2980235815048218, + "eval_logps/chosen": -424.2823486328125, + "eval_logps/rejected": -505.6960754394531, + "eval_loss": 0.5558871626853943, + "eval_rewards/accuracies": 0.704291045665741, + "eval_rewards/chosen": -2.797088146209717, + "eval_rewards/margins": 0.748529314994812, + "eval_rewards/rejected": -3.5456173419952393, + "eval_runtime": 184.1244, + "eval_samples_per_second": 46.447, + "eval_steps_per_second": 0.728, "step": 700 }, { - "debug/losses": 0.026440713554620743, - "debug/policy_weights": 0.04469674825668335, - "debug/raw_losses": 0.5913997292518616, - "epoch": 0.5650616792678074, - "grad_norm": 1.7865897282724608, + "epoch": 0.57, "learning_rate": 2.368003306662104e-07, - "logits/chosen": -1.9654159545898438, - "logits/rejected": -1.9266160726547241, - "logps/chosen": -352.88787841796875, - "logps/rejected": -381.1947326660156, - "loss": 0.0276, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.9729446172714233, - "rewards/margins": 0.3949921727180481, - "rewards/rejected": -2.367936611175537, + "logits/chosen": 0.07857178151607513, + "logits/rejected": 0.3302653729915619, + "logps/chosen": -413.8836975097656, + "logps/rejected": -535.0875244140625, + "loss": 0.5287, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7193782329559326, + "rewards/margins": 1.0089346170425415, + "rewards/rejected": -3.7283127307891846, "step": 710 }, { - "debug/losses": 0.028649378567934036, - "debug/policy_weights": 0.05744954198598862, - "debug/raw_losses": 0.5734940767288208, - "epoch": 0.5730202944687625, - "grad_norm": 1.4165928688447773, + "epoch": 0.57, "learning_rate": 2.2986559405621886e-07, - "logits/chosen": -1.9602625370025635, - "logits/rejected": -1.916290283203125, - "logps/chosen": -343.48406982421875, - "logps/rejected": -386.95989990234375, - "loss": 0.0308, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.8104501962661743, - "rewards/margins": 0.4908137917518616, - "rewards/rejected": -2.3012638092041016, + "logits/chosen": 0.2789291739463806, + "logits/rejected": 0.4242584705352783, + "logps/chosen": -422.7801818847656, + "logps/rejected": -522.7840576171875, + "loss": 0.5551, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.925621509552002, + "rewards/margins": 0.8043605089187622, + "rewards/rejected": -3.729982376098633, "step": 720 }, { - "debug/losses": 0.022339364513754845, - "debug/policy_weights": 0.03942544013261795, - "debug/raw_losses": 0.5562313795089722, - "epoch": 0.5809789096697174, - "grad_norm": 1.382543972122978, + "epoch": 0.58, "learning_rate": 2.2294641902678443e-07, - "logits/chosen": -1.9571382999420166, - "logits/rejected": -1.9248653650283813, - "logps/chosen": -327.402587890625, - "logps/rejected": -379.947998046875, - "loss": 0.0232, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.9223827123641968, - "rewards/margins": 0.558407723903656, - "rewards/rejected": -2.480790615081787, + "logits/chosen": -0.19327735900878906, + "logits/rejected": 0.043265581130981445, + "logps/chosen": -363.1488342285156, + "logps/rejected": -470.94970703125, + "loss": 0.5284, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.39530873298645, + "rewards/margins": 0.908363938331604, + "rewards/rejected": -3.3036727905273438, "step": 730 }, { - "debug/losses": 0.01860680803656578, - "debug/policy_weights": 0.03574604541063309, - "debug/raw_losses": 0.5566806793212891, - "epoch": 0.5889375248706725, - "grad_norm": 1.468728949042428, + "epoch": 0.59, "learning_rate": 2.160481533045751e-07, - "logits/chosen": -1.921987533569336, - "logits/rejected": -1.873268485069275, - "logps/chosen": -362.5789489746094, - "logps/rejected": -399.66748046875, - "loss": 0.0225, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.1244444847106934, - "rewards/margins": 0.46507740020751953, - "rewards/rejected": -2.589521884918213, + "logits/chosen": -0.37412697076797485, + "logits/rejected": -0.17320053279399872, + "logps/chosen": -390.2896423339844, + "logps/rejected": -428.08099365234375, + "loss": 0.5572, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.3005330562591553, + "rewards/margins": 0.48462891578674316, + "rewards/rejected": -2.7851624488830566, "step": 740 }, { - "debug/losses": 0.021815910935401917, - "debug/policy_weights": 0.039903424680233, - "debug/raw_losses": 0.5822666883468628, - "epoch": 0.5968961400716275, - "grad_norm": 1.4132116728769042, + "epoch": 0.6, "learning_rate": 2.0917612845576882e-07, - "logits/chosen": -1.9699146747589111, - "logits/rejected": -1.9025242328643799, - "logps/chosen": -354.04644775390625, - "logps/rejected": -385.09075927734375, - "loss": 0.024, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.975638747215271, - "rewards/margins": 0.5243066549301147, - "rewards/rejected": -2.4999454021453857, + "logits/chosen": -0.26352375745773315, + "logits/rejected": -0.0010178961092606187, + "logps/chosen": -373.3875427246094, + "logps/rejected": -440.09442138671875, + "loss": 0.5534, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3315823078155518, + "rewards/margins": 0.6843063235282898, + "rewards/rejected": -3.0158886909484863, "step": 750 }, { - "debug/losses": 0.02490827441215515, - "debug/policy_weights": 0.04055667296051979, - "debug/raw_losses": 0.6043840050697327, - "epoch": 0.6048547552725826, - "grad_norm": 1.5603724313512064, + "epoch": 0.6, "learning_rate": 2.0233565576536564e-07, - "logits/chosen": -1.9306650161743164, - "logits/rejected": -1.9156144857406616, - "logps/chosen": -329.97711181640625, - "logps/rejected": -378.48834228515625, - "loss": 0.0283, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.8800745010375977, - "rewards/margins": 0.4346494674682617, - "rewards/rejected": -2.3147239685058594, + "logits/chosen": -0.3354080021381378, + "logits/rejected": -0.006600166670978069, + "logps/chosen": -360.56463623046875, + "logps/rejected": -440.66961669921875, + "loss": 0.5328, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.1626803874969482, + "rewards/margins": 0.8473829030990601, + "rewards/rejected": -3.010063409805298, "step": 760 }, { - "debug/losses": 0.02524595521390438, - "debug/policy_weights": 0.04561594873666763, - "debug/raw_losses": 0.5465348958969116, - "epoch": 0.6128133704735376, - "grad_norm": 1.4684425781915056, + "epoch": 0.61, "learning_rate": 1.9553202213217537e-07, - "logits/chosen": -1.9351495504379272, - "logits/rejected": -1.901551604270935, - "logps/chosen": -314.9850769042969, - "logps/rejected": -377.1813049316406, - "loss": 0.0247, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.8458607196807861, - "rewards/margins": 0.5721122026443481, - "rewards/rejected": -2.417973041534424, + "logits/chosen": -0.021420275792479515, + "logits/rejected": 0.19946305453777313, + "logps/chosen": -389.1043395996094, + "logps/rejected": -448.04998779296875, + "loss": 0.5523, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.391838788986206, + "rewards/margins": 0.6678962707519531, + "rewards/rejected": -3.059735059738159, "step": 770 }, { - "debug/losses": 0.024128446355462074, - "debug/policy_weights": 0.04677557945251465, - "debug/raw_losses": 0.5592825412750244, - "epoch": 0.6207719856744927, - "grad_norm": 1.055203900824062, + "epoch": 0.62, "learning_rate": 1.887704859826528e-07, - "logits/chosen": -1.976380705833435, - "logits/rejected": -1.9334523677825928, - "logps/chosen": -364.484375, - "logps/rejected": -424.99957275390625, - "loss": 0.0247, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.032318353652954, - "rewards/margins": 0.5795284509658813, - "rewards/rejected": -2.611846923828125, + "logits/chosen": -0.15253478288650513, + "logits/rejected": -0.00011998042464256287, + "logps/chosen": -394.9501953125, + "logps/rejected": -462.32843017578125, + "loss": 0.5443, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.476644992828369, + "rewards/margins": 0.566824734210968, + "rewards/rejected": -3.0434699058532715, "step": 780 }, { - "debug/losses": 0.024484191089868546, - "debug/policy_weights": 0.04067380353808403, - "debug/raw_losses": 0.5770185589790344, - "epoch": 0.6287306008754476, - "grad_norm": 1.216416574340141, + "epoch": 0.63, "learning_rate": 1.8205627320673836e-07, - "logits/chosen": -1.9579511880874634, - "logits/rejected": -1.9054477214813232, - "logps/chosen": -366.54754638671875, - "logps/rejected": -416.3515625, - "loss": 0.0229, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.1298985481262207, - "rewards/margins": 0.5691512823104858, - "rewards/rejected": -2.699049472808838, + "logits/chosen": -0.17955633997917175, + "logits/rejected": 0.18167546391487122, + "logps/chosen": -390.32244873046875, + "logps/rejected": -444.895263671875, + "loss": 0.5566, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4376220703125, + "rewards/margins": 0.7008293271064758, + "rewards/rejected": -3.138451099395752, "step": 790 }, { - "debug/losses": 0.02110939845442772, - "debug/policy_weights": 0.03945222496986389, - "debug/raw_losses": 0.6036224365234375, - "epoch": 0.6366892160764027, - "grad_norm": 1.1130648846777194, + "epoch": 0.64, "learning_rate": 1.7539457311884675e-07, - "logits/chosen": -1.9825928211212158, - "logits/rejected": -1.9254627227783203, - "logps/chosen": -373.7814025878906, - "logps/rejected": -404.980712890625, - "loss": 0.023, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.1609199047088623, - "rewards/margins": 0.4238300919532776, - "rewards/rejected": -2.584749698638916, + "logits/chosen": -0.09838727861642838, + "logits/rejected": 0.11829495429992676, + "logps/chosen": -402.4017333984375, + "logps/rejected": -451.49346923828125, + "loss": 0.5609, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4441986083984375, + "rewards/margins": 0.5067789554595947, + "rewards/rejected": -2.9509775638580322, "step": 800 }, { - "epoch": 0.6366892160764027, - "eval_debug/losses": 0.02278236113488674, - "eval_debug/policy_weights": 0.04001852124929428, - "eval_debug/raw_losses": 0.5787393450737, - "eval_logits/chosen": -1.9514880180358887, - "eval_logits/rejected": -1.9267523288726807, - "eval_logps/chosen": -350.94134521484375, - "eval_logps/rejected": -407.4722595214844, - "eval_loss": 0.02435474470257759, - "eval_rewards/accuracies": 0.6884328126907349, - "eval_rewards/chosen": -2.0669784545898438, - "eval_rewards/margins": 0.5007539987564087, - "eval_rewards/rejected": -2.567732334136963, - "eval_runtime": 153.2113, - "eval_samples_per_second": 55.818, - "eval_steps_per_second": 0.875, + "epoch": 0.64, + "eval_logits/chosen": -0.03116540051996708, + "eval_logits/rejected": 0.1922437697649002, + "eval_logps/chosen": -387.7091979980469, + "eval_logps/rejected": -459.44390869140625, + "eval_loss": 0.5468714833259583, + "eval_rewards/accuracies": 0.7108209133148193, + "eval_rewards/chosen": -2.431356430053711, + "eval_rewards/margins": 0.6517390012741089, + "eval_rewards/rejected": -3.0830955505371094, + "eval_runtime": 184.1673, + "eval_samples_per_second": 46.436, + "eval_steps_per_second": 0.728, "step": 800 }, { - "debug/losses": 0.031348057091236115, - "debug/policy_weights": 0.05042605847120285, - "debug/raw_losses": 0.617703914642334, - "epoch": 0.6446478312773577, - "grad_norm": 1.2829017385682178, + "epoch": 0.64, "learning_rate": 1.687905344471226e-07, - "logits/chosen": -1.9726717472076416, - "logits/rejected": -1.9506819248199463, - "logps/chosen": -356.71759033203125, - "logps/rejected": -399.9237060546875, - "loss": 0.0275, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.9306268692016602, - "rewards/margins": 0.41608279943466187, - "rewards/rejected": -2.346709966659546, + "logits/chosen": 0.07735608518123627, + "logits/rejected": 0.3973601460456848, + "logps/chosen": -408.05999755859375, + "logps/rejected": -459.011474609375, + "loss": 0.5384, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.5008435249328613, + "rewards/margins": 0.6535352468490601, + "rewards/rejected": -3.154379367828369, "step": 810 }, { - "debug/losses": 0.031708650290966034, - "debug/policy_weights": 0.04735485464334488, - "debug/raw_losses": 0.6054385900497437, - "epoch": 0.6526064464783128, - "grad_norm": 1.2622583124342848, + "epoch": 0.65, "learning_rate": 1.6224926135406693e-07, - "logits/chosen": -1.9768617153167725, - "logits/rejected": -1.9380643367767334, - "logps/chosen": -340.0035095214844, - "logps/rejected": -370.344970703125, - "loss": 0.0259, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.954310655593872, - "rewards/margins": 0.4057098925113678, - "rewards/rejected": -2.360020399093628, + "logits/chosen": 0.1125444769859314, + "logits/rejected": 0.3865428566932678, + "logps/chosen": -404.16058349609375, + "logps/rejected": -484.68621826171875, + "loss": 0.5448, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.4405789375305176, + "rewards/margins": 0.718208909034729, + "rewards/rejected": -3.158787727355957, "step": 820 }, { - "debug/losses": 0.028013870120048523, - "debug/policy_weights": 0.04989269748330116, - "debug/raw_losses": 0.5807904005050659, - "epoch": 0.6605650616792678, - "grad_norm": 1.2623708525947415, + "epoch": 0.66, "learning_rate": 1.557758094916053e-07, - "logits/chosen": -1.9997406005859375, - "logits/rejected": -1.9534574747085571, - "logps/chosen": -351.2829895019531, - "logps/rejected": -399.71087646484375, - "loss": 0.0271, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.9434627294540405, - "rewards/margins": 0.4895332455635071, - "rewards/rejected": -2.4329960346221924, + "logits/chosen": 0.11989516019821167, + "logits/rejected": 0.30926594138145447, + "logps/chosen": -370.29876708984375, + "logps/rejected": -452.27911376953125, + "loss": 0.5418, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.3860089778900146, + "rewards/margins": 0.7260924577713013, + "rewards/rejected": -3.1121015548706055, "step": 830 }, { - "debug/losses": 0.031273625791072845, - "debug/policy_weights": 0.05692868307232857, - "debug/raw_losses": 0.5609691739082336, - "epoch": 0.6685236768802229, - "grad_norm": 1.0866287225682587, + "epoch": 0.67, "learning_rate": 1.4937518209365108e-07, - "logits/chosen": -2.0409183502197266, - "logits/rejected": -1.9729896783828735, - "logps/chosen": -362.9765319824219, - "logps/rejected": -389.9468994140625, - "loss": 0.0327, + "logits/chosen": -0.14239154756069183, + "logits/rejected": 0.14250756800174713, + "logps/chosen": -395.55755615234375, + "logps/rejected": -447.6368713378906, + "loss": 0.5573, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.8453567028045654, - "rewards/margins": 0.5211337804794312, - "rewards/rejected": -2.366490602493286, + "rewards/chosen": -2.378154754638672, + "rewards/margins": 0.6160937547683716, + "rewards/rejected": -2.994248390197754, "step": 840 }, { - "debug/losses": 0.029767373576760292, - "debug/policy_weights": 0.052564360201358795, - "debug/raw_losses": 0.6102725863456726, - "epoch": 0.6764822920811778, - "grad_norm": 1.394943527609451, + "epoch": 0.68, "learning_rate": 1.4305232610918045e-07, - "logits/chosen": -2.0147321224212646, - "logits/rejected": -1.9801139831542969, - "logps/chosen": -339.0684509277344, - "logps/rejected": -375.1840515136719, - "loss": 0.0307, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.9301865100860596, - "rewards/margins": 0.3946223855018616, - "rewards/rejected": -2.3248085975646973, + "logits/chosen": -0.16526366770267487, + "logits/rejected": 0.16432161629199982, + "logps/chosen": -373.45330810546875, + "logps/rejected": -436.6773376464844, + "loss": 0.5415, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.3134028911590576, + "rewards/margins": 0.774810791015625, + "rewards/rejected": -3.0882136821746826, "step": 850 }, { - "debug/losses": 0.03315902501344681, - "debug/policy_weights": 0.05502952262759209, - "debug/raw_losses": 0.5980508923530579, - "epoch": 0.6844409072821329, - "grad_norm": 1.1275097393699727, + "epoch": 0.68, "learning_rate": 1.3681212837880977e-07, - "logits/chosen": -2.002835512161255, - "logits/rejected": -1.995160698890686, - "logps/chosen": -307.8199157714844, - "logps/rejected": -369.13763427734375, - "loss": 0.0318, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.7375946044921875, - "rewards/margins": 0.43073099851608276, - "rewards/rejected": -2.168325901031494, + "logits/chosen": -0.1321481615304947, + "logits/rejected": 0.23287932574748993, + "logps/chosen": -364.96990966796875, + "logps/rejected": -447.7923278808594, + "loss": 0.5396, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.176964282989502, + "rewards/margins": 0.8955341577529907, + "rewards/rejected": -3.0724985599517822, "step": 860 }, { - "debug/losses": 0.029089778661727905, - "debug/policy_weights": 0.050317078828811646, - "debug/raw_losses": 0.5906243324279785, - "epoch": 0.6923995224830879, - "grad_norm": 1.4878858135274058, + "epoch": 0.69, "learning_rate": 1.3065941185782977e-07, - "logits/chosen": -1.9642223119735718, - "logits/rejected": -1.927920937538147, - "logps/chosen": -334.91162109375, - "logps/rejected": -361.5536193847656, - "loss": 0.0306, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.8268492221832275, - "rewards/margins": 0.42832955718040466, - "rewards/rejected": -2.255178928375244, + "logits/chosen": 0.05437428876757622, + "logits/rejected": 0.2819867432117462, + "logps/chosen": -383.08599853515625, + "logps/rejected": -439.3629455566406, + "loss": 0.5505, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.405247211456299, + "rewards/margins": 0.5403125882148743, + "rewards/rejected": -2.9455599784851074, "step": 870 }, { - "debug/losses": 0.029998168349266052, - "debug/policy_weights": 0.0653611272573471, - "debug/raw_losses": 0.47929373383522034, - "epoch": 0.700358137684043, - "grad_norm": 1.29187319176411, + "epoch": 0.7, "learning_rate": 1.2459893188861613e-07, - "logits/chosen": -2.0278587341308594, - "logits/rejected": -1.9883801937103271, - "logps/chosen": -307.253662109375, - "logps/rejected": -392.42962646484375, - "loss": 0.0333, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.5876758098602295, - "rewards/margins": 0.7120193839073181, - "rewards/rejected": -2.2996952533721924, + "logits/chosen": -0.12052659690380096, + "logits/rejected": 0.12284734100103378, + "logps/chosen": -367.1181640625, + "logps/rejected": -468.1044921875, + "loss": 0.5185, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.39152455329895, + "rewards/margins": 0.9137696027755737, + "rewards/rejected": -3.3052947521209717, "step": 880 }, { - "debug/losses": 0.031164586544036865, - "debug/policy_weights": 0.05676507204771042, - "debug/raw_losses": 0.5348513722419739, - "epoch": 0.708316752884998, - "grad_norm": 1.2972584564770755, + "epoch": 0.71, "learning_rate": 1.1863537252529548e-07, - "logits/chosen": -1.9931986331939697, - "logits/rejected": -1.9309186935424805, - "logps/chosen": -337.57659912109375, - "logps/rejected": -380.300048828125, - "loss": 0.0307, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.793096899986267, - "rewards/margins": 0.5044206380844116, - "rewards/rejected": -2.297517776489258, + "logits/chosen": 0.14598000049591064, + "logits/rejected": 0.38815659284591675, + "logps/chosen": -397.891357421875, + "logps/rejected": -472.38677978515625, + "loss": 0.5323, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.512676239013672, + "rewards/margins": 0.7713057994842529, + "rewards/rejected": -3.2839818000793457, "step": 890 }, { - "debug/losses": 0.0308552123606205, - "debug/policy_weights": 0.05725027993321419, - "debug/raw_losses": 0.5641220808029175, - "epoch": 0.716275368085953, - "grad_norm": 1.406582703436064, + "epoch": 0.72, "learning_rate": 1.1277334291351145e-07, - "logits/chosen": -1.92514169216156, - "logits/rejected": -1.8894935846328735, - "logps/chosen": -308.63641357421875, - "logps/rejected": -367.8720397949219, - "loss": 0.032, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.7129472494125366, - "rewards/margins": 0.5332741737365723, - "rewards/rejected": -2.2462215423583984, + "logits/chosen": 0.15319526195526123, + "logits/rejected": 0.35974830389022827, + "logps/chosen": -380.77783203125, + "logps/rejected": -449.54315185546875, + "loss": 0.5514, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.3706305027008057, + "rewards/margins": 0.6724039912223816, + "rewards/rejected": -3.043034076690674, "step": 900 }, { - "epoch": 0.716275368085953, - "eval_debug/losses": 0.03159477189183235, - "eval_debug/policy_weights": 0.055877406150102615, - "eval_debug/raw_losses": 0.5719749927520752, - "eval_logits/chosen": -1.952079176902771, - "eval_logits/rejected": -1.9262397289276123, - "eval_logps/chosen": -318.9172668457031, - "eval_logps/rejected": -378.01251220703125, - "eval_loss": 0.03346230462193489, - "eval_rewards/accuracies": 0.6847015023231506, - "eval_rewards/chosen": -1.7467377185821533, - "eval_rewards/margins": 0.5263976454734802, - "eval_rewards/rejected": -2.2731354236602783, - "eval_runtime": 153.3496, - "eval_samples_per_second": 55.768, - "eval_steps_per_second": 0.874, + "epoch": 0.72, + "eval_logits/chosen": 0.28598034381866455, + "eval_logits/rejected": 0.5382024645805359, + "eval_logps/chosen": -392.3096008300781, + "eval_logps/rejected": -471.95330810546875, + "eval_loss": 0.5473664402961731, + "eval_rewards/accuracies": 0.6996268630027771, + "eval_rewards/chosen": -2.4773612022399902, + "eval_rewards/margins": 0.7308279275894165, + "eval_rewards/rejected": -3.2081892490386963, + "eval_runtime": 184.2029, + "eval_samples_per_second": 46.427, + "eval_steps_per_second": 0.727, "step": 900 }, { - "debug/losses": 0.031236503273248672, - "debug/policy_weights": 0.06273458153009415, - "debug/raw_losses": 0.5026707053184509, - "epoch": 0.724233983286908, - "grad_norm": 1.4167921520949656, + "epoch": 0.72, "learning_rate": 1.0701737372808431e-07, - "logits/chosen": -1.9520496129989624, - "logits/rejected": -1.926009178161621, - "logps/chosen": -301.79803466796875, - "logps/rejected": -384.8517150878906, - "loss": 0.0301, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.5805127620697021, - "rewards/margins": 0.7013577818870544, - "rewards/rejected": -2.2818703651428223, + "logits/chosen": 0.15951867401599884, + "logits/rejected": 0.46630391478538513, + "logps/chosen": -383.52850341796875, + "logps/rejected": -467.2303771972656, + "loss": 0.5362, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2782187461853027, + "rewards/margins": 0.8473943471908569, + "rewards/rejected": -3.125612735748291, "step": 910 }, { - "debug/losses": 0.028870219364762306, - "debug/policy_weights": 0.05118779093027115, - "debug/raw_losses": 0.6152342557907104, - "epoch": 0.7321925984878631, - "grad_norm": 1.3852811285813345, + "epoch": 0.73, "learning_rate": 1.0137191367132078e-07, - "logits/chosen": -1.9522676467895508, - "logits/rejected": -1.9203464984893799, - "logps/chosen": -363.1385192871094, - "logps/rejected": -410.9398498535156, - "loss": 0.0272, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.070452928543091, - "rewards/margins": 0.43279123306274414, - "rewards/rejected": -2.503243923187256, + "logits/chosen": 0.2791319191455841, + "logits/rejected": 0.45174160599708557, + "logps/chosen": -372.1945495605469, + "logps/rejected": -446.6507263183594, + "loss": 0.5458, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.3160648345947266, + "rewards/margins": 0.68004310131073, + "rewards/rejected": -2.996107816696167, "step": 920 }, { - "debug/losses": 0.02983178198337555, - "debug/policy_weights": 0.04729953408241272, - "debug/raw_losses": 0.6190310716629028, - "epoch": 0.7401512136888182, - "grad_norm": 1.1110786407319169, + "epoch": 0.74, "learning_rate": 9.584132603467827e-08, - "logits/chosen": -1.9343547821044922, - "logits/rejected": -1.8795759677886963, - "logps/chosen": -377.31103515625, - "logps/rejected": -408.9967346191406, - "loss": 0.0268, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -2.1014113426208496, - "rewards/margins": 0.44571709632873535, - "rewards/rejected": -2.547128438949585, + "logits/chosen": -0.12192128598690033, + "logits/rejected": 0.1477951854467392, + "logps/chosen": -366.48321533203125, + "logps/rejected": -453.130126953125, + "loss": 0.5467, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.200005531311035, + "rewards/margins": 0.7978888750076294, + "rewards/rejected": -2.997894287109375, "step": 930 }, { - "debug/losses": 0.025911280885338783, - "debug/policy_weights": 0.04492679983377457, - "debug/raw_losses": 0.5681049227714539, - "epoch": 0.7481098288897732, - "grad_norm": 1.282258686519084, + "epoch": 0.75, "learning_rate": 9.042988532644249e-08, - "logits/chosen": -1.9340381622314453, - "logits/rejected": -1.903543472290039, - "logps/chosen": -344.40826416015625, - "logps/rejected": -411.30517578125, - "loss": 0.0266, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.0105528831481934, - "rewards/margins": 0.5551122426986694, - "rewards/rejected": -2.5656652450561523, + "logits/chosen": -0.03106372058391571, + "logits/rejected": 0.07721444219350815, + "logps/chosen": -344.21270751953125, + "logps/rejected": -438.11077880859375, + "loss": 0.5161, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.149094581604004, + "rewards/margins": 0.7353444695472717, + "rewards/rejected": -2.884438991546631, "step": 940 }, { - "debug/losses": 0.029090652242302895, - "debug/policy_weights": 0.04689662158489227, - "debug/raw_losses": 0.5897936820983887, - "epoch": 0.7560684440907283, - "grad_norm": 1.1204653206310022, + "epoch": 0.76, "learning_rate": 8.514177396802428e-08, - "logits/chosen": -1.953285574913025, - "logits/rejected": -1.9266679286956787, - "logps/chosen": -352.0096130371094, - "logps/rejected": -408.1578063964844, - "loss": 0.0257, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.1213886737823486, - "rewards/margins": 0.49127835035324097, - "rewards/rejected": -2.612666606903076, + "logits/chosen": 0.006801058538258076, + "logits/rejected": 0.20282092690467834, + "logps/chosen": -358.15167236328125, + "logps/rejected": -436.4964294433594, + "loss": 0.5385, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.2220425605773926, + "rewards/margins": 0.7004804611206055, + "rewards/rejected": -2.922523260116577, "step": 950 }, { - "debug/losses": 0.026548391208052635, - "debug/policy_weights": 0.04522048681974411, - "debug/raw_losses": 0.5894621014595032, - "epoch": 0.7640270592916832, - "grad_norm": 0.898387217227339, + "epoch": 0.76, "learning_rate": 7.998107906142839e-08, - "logits/chosen": -1.9560728073120117, - "logits/rejected": -1.9274494647979736, - "logps/chosen": -343.21002197265625, - "logps/rejected": -379.6694030761719, - "loss": 0.0259, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -2.0018842220306396, - "rewards/margins": 0.4287974238395691, - "rewards/rejected": -2.4306817054748535, + "logits/chosen": 0.41448846459388733, + "logits/rejected": 0.705254852771759, + "logps/chosen": -371.27801513671875, + "logps/rejected": -434.56866455078125, + "loss": 0.5236, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.2166616916656494, + "rewards/margins": 0.6714047193527222, + "rewards/rejected": -2.888066530227661, "step": 960 }, { - "debug/losses": 0.02163025364279747, - "debug/policy_weights": 0.04079737886786461, - "debug/raw_losses": 0.5705414414405823, - "epoch": 0.7719856744926383, - "grad_norm": 1.3955639157776591, + "epoch": 0.77, "learning_rate": 7.495178923039396e-08, - "logits/chosen": -1.9478362798690796, - "logits/rejected": -1.9617118835449219, - "logps/chosen": -331.72784423828125, - "logps/rejected": -410.7064514160156, - "loss": 0.0264, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -2.030539035797119, - "rewards/margins": 0.5192916989326477, - "rewards/rejected": -2.549830913543701, + "logits/chosen": 0.23847150802612305, + "logits/rejected": 0.48661884665489197, + "logps/chosen": -366.28179931640625, + "logps/rejected": -462.679443359375, + "loss": 0.5459, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.1916985511779785, + "rewards/margins": 0.8472123146057129, + "rewards/rejected": -3.038910388946533, "step": 970 }, { - "debug/losses": 0.023149430751800537, - "debug/policy_weights": 0.041884638369083405, - "debug/raw_losses": 0.5447368025779724, - "epoch": 0.7799442896935933, - "grad_norm": 1.5585746856477063, + "epoch": 0.78, "learning_rate": 7.005779153764682e-08, - "logits/chosen": -1.9719524383544922, - "logits/rejected": -1.9254512786865234, - "logps/chosen": -337.332275390625, - "logps/rejected": -387.82318115234375, - "loss": 0.028, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.9454883337020874, - "rewards/margins": 0.5578633546829224, - "rewards/rejected": -2.5033516883850098, + "logits/chosen": 0.41438961029052734, + "logits/rejected": 0.6912784576416016, + "logps/chosen": -382.70123291015625, + "logps/rejected": -461.8614807128906, + "loss": 0.5453, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.4304287433624268, + "rewards/margins": 0.7116767764091492, + "rewards/rejected": -3.1421055793762207, "step": 980 }, { - "debug/losses": 0.027449458837509155, - "debug/policy_weights": 0.04992467164993286, - "debug/raw_losses": 0.551071286201477, - "epoch": 0.7879029048945484, - "grad_norm": 1.6695068228207648, + "epoch": 0.79, "learning_rate": 6.530286848064698e-08, - "logits/chosen": -1.9756189584732056, - "logits/rejected": -1.950194001197815, - "logps/chosen": -336.4566955566406, - "logps/rejected": -397.8472595214844, - "loss": 0.0264, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.8330726623535156, - "rewards/margins": 0.5761424899101257, - "rewards/rejected": -2.4092154502868652, + "logits/chosen": 0.36573725938796997, + "logits/rejected": 0.5834362506866455, + "logps/chosen": -384.49749755859375, + "logps/rejected": -466.30096435546875, + "loss": 0.5528, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.5111565589904785, + "rewards/margins": 0.7234699130058289, + "rewards/rejected": -3.234626054763794, "step": 990 }, { - "debug/losses": 0.026408571749925613, - "debug/policy_weights": 0.04767809808254242, - "debug/raw_losses": 0.5441979169845581, - "epoch": 0.7958615200955034, - "grad_norm": 1.6016926469447665, + "epoch": 0.8, "learning_rate": 6.069069506815325e-08, - "logits/chosen": -1.964962363243103, - "logits/rejected": -1.9171581268310547, - "logps/chosen": -334.23931884765625, - "logps/rejected": -392.36944580078125, - "loss": 0.0294, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.8932774066925049, - "rewards/margins": 0.6260205507278442, - "rewards/rejected": -2.5192978382110596, + "logits/chosen": 0.45530566573143005, + "logits/rejected": 0.5909157991409302, + "logps/chosen": -379.1433410644531, + "logps/rejected": -468.88458251953125, + "loss": 0.527, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.5361268520355225, + "rewards/margins": 0.7407721281051636, + "rewards/rejected": -3.2768986225128174, "step": 1000 }, { - "epoch": 0.7958615200955034, - "eval_debug/losses": 0.02711501345038414, - "eval_debug/policy_weights": 0.048435505479574203, - "eval_debug/raw_losses": 0.5694997906684875, - "eval_logits/chosen": -1.9579750299453735, - "eval_logits/rejected": -1.9318426847457886, - "eval_logps/chosen": -338.3062438964844, - "eval_logps/rejected": -398.16033935546875, - "eval_loss": 0.02892610803246498, - "eval_rewards/accuracies": 0.6865671873092651, - "eval_rewards/chosen": -1.9406273365020752, - "eval_rewards/margins": 0.5339860916137695, - "eval_rewards/rejected": -2.474613666534424, - "eval_runtime": 153.1415, - "eval_samples_per_second": 55.844, - "eval_steps_per_second": 0.875, + "epoch": 0.8, + "eval_logits/chosen": 0.3871051073074341, + "eval_logits/rejected": 0.6372014284133911, + "eval_logps/chosen": -394.97113037109375, + "eval_logps/rejected": -471.8453674316406, + "eval_loss": 0.5453863739967346, + "eval_rewards/accuracies": 0.70802241563797, + "eval_rewards/chosen": -2.503976345062256, + "eval_rewards/margins": 0.7031334638595581, + "eval_rewards/rejected": -3.2071101665496826, + "eval_runtime": 184.1208, + "eval_samples_per_second": 46.448, + "eval_steps_per_second": 0.728, "step": 1000 }, { - "debug/losses": 0.03183472901582718, - "debug/policy_weights": 0.049464497715234756, - "debug/raw_losses": 0.6195154786109924, - "epoch": 0.8038201352964585, - "grad_norm": 1.4391778362577514, + "epoch": 0.8, "learning_rate": 5.6224835979863714e-08, - "logits/chosen": -1.9442205429077148, - "logits/rejected": -1.9024133682250977, - "logps/chosen": -348.94781494140625, - "logps/rejected": -381.45281982421875, - "loss": 0.0284, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.9863436222076416, - "rewards/margins": 0.40313720703125, - "rewards/rejected": -2.3894805908203125, + "logits/chosen": 0.31174296140670776, + "logits/rejected": 0.6193565130233765, + "logps/chosen": -390.387451171875, + "logps/rejected": -468.4959411621094, + "loss": 0.5568, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.32766056060791, + "rewards/margins": 0.68747878074646, + "rewards/rejected": -3.015139102935791, "step": 1010 }, { - "debug/losses": 0.022611474618315697, - "debug/policy_weights": 0.044095687568187714, - "debug/raw_losses": 0.5569943785667419, - "epoch": 0.8117787504974134, - "grad_norm": 1.8837439228048556, + "epoch": 0.81, "learning_rate": 5.190874281132851e-08, - "logits/chosen": -1.944157600402832, - "logits/rejected": -1.92160165309906, - "logps/chosen": -325.661865234375, - "logps/rejected": -386.25885009765625, - "loss": 0.0285, + "logits/chosen": 0.22277125716209412, + "logits/rejected": 0.6487134099006653, + "logps/chosen": -402.0958557128906, + "logps/rejected": -448.5992736816406, + "loss": 0.5408, "rewards/accuracies": 0.71875, - "rewards/chosen": -1.8595483303070068, - "rewards/margins": 0.5777700543403625, - "rewards/rejected": -2.4373183250427246, + "rewards/chosen": -2.359062671661377, + "rewards/margins": 0.6533006429672241, + "rewards/rejected": -3.0123631954193115, "step": 1020 }, { - "debug/losses": 0.0247525442391634, - "debug/policy_weights": 0.045843105763196945, - "debug/raw_losses": 0.5282943844795227, - "epoch": 0.8197373656983685, - "grad_norm": 1.3079251608813307, + "epoch": 0.82, "learning_rate": 4.774575140626316e-08, - "logits/chosen": -1.9093300104141235, - "logits/rejected": -1.858944296836853, - "logps/chosen": -322.7106018066406, - "logps/rejected": -379.23748779296875, - "loss": 0.0288, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.826472282409668, - "rewards/margins": 0.6417751312255859, - "rewards/rejected": -2.468247413635254, + "logits/chosen": 0.23170511424541473, + "logits/rejected": 0.47184085845947266, + "logps/chosen": -363.46917724609375, + "logps/rejected": -442.47918701171875, + "loss": 0.5309, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.142770290374756, + "rewards/margins": 0.7513145208358765, + "rewards/rejected": -2.894084930419922, "step": 1030 }, { - "debug/losses": 0.029871661216020584, - "debug/policy_weights": 0.05007849261164665, - "debug/raw_losses": 0.6156904101371765, - "epoch": 0.8276959808993235, - "grad_norm": 1.4623406105385353, + "epoch": 0.83, "learning_rate": 4.373907927832513e-08, - "logits/chosen": -1.9410374164581299, - "logits/rejected": -1.9204730987548828, - "logps/chosen": -313.7291564941406, - "logps/rejected": -367.6692199707031, - "loss": 0.0313, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.8602615594863892, - "rewards/margins": 0.42712441086769104, - "rewards/rejected": -2.287385940551758, + "logits/chosen": 0.07573021948337555, + "logits/rejected": 0.32997313141822815, + "logps/chosen": -381.45599365234375, + "logps/rejected": -443.0684509277344, + "loss": 0.5407, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2033116817474365, + "rewards/margins": 0.710732638835907, + "rewards/rejected": -2.914044141769409, "step": 1040 }, { - "debug/losses": 0.029429305344820023, - "debug/policy_weights": 0.054835814982652664, - "debug/raw_losses": 0.5309990048408508, - "epoch": 0.8356545961002786, - "grad_norm": 1.3217568428330395, + "epoch": 0.84, "learning_rate": 3.9891823124345665e-08, - "logits/chosen": -1.975529670715332, - "logits/rejected": -1.9376213550567627, - "logps/chosen": -324.4565124511719, - "logps/rejected": -385.99835205078125, - "loss": 0.032, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.7343988418579102, - "rewards/margins": 0.6463450193405151, - "rewards/rejected": -2.3807437419891357, + "logits/chosen": 0.23884686827659607, + "logits/rejected": 0.6128005385398865, + "logps/chosen": -364.00567626953125, + "logps/rejected": -433.3273010253906, + "loss": 0.5471, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.2081527709960938, + "rewards/margins": 0.7639263868331909, + "rewards/rejected": -2.972079038619995, "step": 1050 }, { - "debug/losses": 0.026505377143621445, - "debug/policy_weights": 0.050394363701343536, - "debug/raw_losses": 0.5719414949417114, - "epoch": 0.8436132113012336, - "grad_norm": 1.3080390048525132, + "epoch": 0.84, "learning_rate": 3.620695643093924e-08, - "logits/chosen": -1.9717638492584229, - "logits/rejected": -1.9583126306533813, - "logps/chosen": -307.21038818359375, - "logps/rejected": -371.9794616699219, - "loss": 0.0294, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.7042350769042969, - "rewards/margins": 0.5117149353027344, - "rewards/rejected": -2.2159502506256104, + "logits/chosen": 0.21963253617286682, + "logits/rejected": 0.6894062757492065, + "logps/chosen": -399.5767517089844, + "logps/rejected": -452.88909912109375, + "loss": 0.5154, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.3384335041046143, + "rewards/margins": 0.7010769844055176, + "rewards/rejected": -3.0395102500915527, "step": 1060 }, { - "debug/losses": 0.03151816874742508, - "debug/policy_weights": 0.051329899579286575, - "debug/raw_losses": 0.5482734441757202, - "epoch": 0.8515718265021887, - "grad_norm": 1.9249185694718636, + "epoch": 0.85, "learning_rate": 3.268732717634032e-08, - "logits/chosen": -1.942237138748169, - "logits/rejected": -1.908158540725708, - "logps/chosen": -311.0814514160156, - "logps/rejected": -363.3550720214844, - "loss": 0.0315, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.746870756149292, - "rewards/margins": 0.545746386051178, - "rewards/rejected": -2.292617082595825, + "logits/chosen": 0.3474286198616028, + "logits/rejected": 0.695271372795105, + "logps/chosen": -368.0654602050781, + "logps/rejected": -431.47222900390625, + "loss": 0.5499, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1910276412963867, + "rewards/margins": 0.7267633080482483, + "rewards/rejected": -2.9177908897399902, "step": 1070 }, { - "debug/losses": 0.0361117348074913, - "debug/policy_weights": 0.056406814604997635, - "debug/raw_losses": 0.6171997785568237, - "epoch": 0.8595304417031436, - "grad_norm": 1.2978569483513427, + "epoch": 0.86, "learning_rate": 2.9335655629243645e-08, - "logits/chosen": -1.9778077602386475, - "logits/rejected": -1.9576594829559326, - "logps/chosen": -338.1204833984375, - "logps/rejected": -390.57769775390625, - "loss": 0.032, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.8773784637451172, - "rewards/margins": 0.41199570894241333, - "rewards/rejected": -2.2893738746643066, + "logits/chosen": 0.2347393035888672, + "logits/rejected": 0.5894696712493896, + "logps/chosen": -388.94757080078125, + "logps/rejected": -447.3855895996094, + "loss": 0.525, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.254683017730713, + "rewards/margins": 0.7334609031677246, + "rewards/rejected": -2.9881439208984375, "step": 1080 }, { - "debug/losses": 0.02704630419611931, - "debug/policy_weights": 0.04527381435036659, - "debug/raw_losses": 0.6039454340934753, - "epoch": 0.8674890569040987, - "grad_norm": 1.4432056512216735, + "epoch": 0.87, "learning_rate": 2.6154532246349476e-08, - "logits/chosen": -1.9391145706176758, - "logits/rejected": -1.882846474647522, - "logps/chosen": -329.9732360839844, - "logps/rejected": -354.43182373046875, - "loss": 0.0324, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.8840316534042358, - "rewards/margins": 0.42414146661758423, - "rewards/rejected": -2.3081729412078857, + "logits/chosen": 0.25378522276878357, + "logits/rejected": 0.5771256685256958, + "logps/chosen": -358.50640869140625, + "logps/rejected": -431.145751953125, + "loss": 0.5462, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1666626930236816, + "rewards/margins": 0.756801426410675, + "rewards/rejected": -2.923464059829712, "step": 1090 }, { - "debug/losses": 0.03607811778783798, - "debug/policy_weights": 0.060448456555604935, - "debug/raw_losses": 0.6365524530410767, - "epoch": 0.8754476721050537, - "grad_norm": 2.2478985377156913, + "epoch": 0.88, "learning_rate": 2.31464156702382e-08, - "logits/chosen": -1.9797563552856445, - "logits/rejected": -1.9243662357330322, - "logps/chosen": -340.63897705078125, - "logps/rejected": -371.7919006347656, - "loss": 0.0308, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.8404861688613892, - "rewards/margins": 0.3901959955692291, - "rewards/rejected": -2.230681896209717, + "logits/chosen": 0.35370689630508423, + "logits/rejected": 0.5671936273574829, + "logps/chosen": -363.0, + "logps/rejected": -438.209228515625, + "loss": 0.5487, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2130119800567627, + "rewards/margins": 0.7499077916145325, + "rewards/rejected": -2.9629194736480713, "step": 1100 }, { - "epoch": 0.8754476721050537, - "eval_debug/losses": 0.029148757457733154, - "eval_debug/policy_weights": 0.052018992602825165, - "eval_debug/raw_losses": 0.5656781196594238, - "eval_logits/chosen": -1.9813780784606934, - "eval_logits/rejected": -1.9553614854812622, - "eval_logps/chosen": -325.3560485839844, - "eval_logps/rejected": -384.33758544921875, - "eval_loss": 0.031092733144760132, - "eval_rewards/accuracies": 0.700559675693512, - "eval_rewards/chosen": -1.81112539768219, - "eval_rewards/margins": 0.5252605676651001, - "eval_rewards/rejected": -2.336386203765869, - "eval_runtime": 153.3772, - "eval_samples_per_second": 55.758, - "eval_steps_per_second": 0.874, + "epoch": 0.88, + "eval_logits/chosen": 0.1857856959104538, + "eval_logits/rejected": 0.43363669514656067, + "eval_logps/chosen": -373.08306884765625, + "eval_logps/rejected": -450.7598876953125, + "eval_loss": 0.5444055199623108, + "eval_rewards/accuracies": 0.7089552283287048, + "eval_rewards/chosen": -2.285095453262329, + "eval_rewards/margins": 0.711159884929657, + "eval_rewards/rejected": -2.996255397796631, + "eval_runtime": 184.1211, + "eval_samples_per_second": 46.448, + "eval_steps_per_second": 0.728, "step": 1100 }, { - "debug/losses": 0.028289441019296646, - "debug/policy_weights": 0.050502192229032516, - "debug/raw_losses": 0.5873227119445801, - "epoch": 0.8834062873060088, - "grad_norm": 1.2493044367365553, + "epoch": 0.88, "learning_rate": 2.031363082912252e-08, - "logits/chosen": -1.9731247425079346, - "logits/rejected": -1.9583030939102173, - "logps/chosen": -312.7198181152344, - "logps/rejected": -365.84539794921875, - "loss": 0.0313, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.811852216720581, - "rewards/margins": 0.43998369574546814, - "rewards/rejected": -2.2518362998962402, + "logits/chosen": 0.070524200797081, + "logits/rejected": 0.4635602533817291, + "logps/chosen": -373.29327392578125, + "logps/rejected": -426.85552978515625, + "loss": 0.5513, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.2541089057922363, + "rewards/margins": 0.6198171973228455, + "rewards/rejected": -2.8739261627197266, "step": 1110 }, { - "debug/losses": 0.028473522514104843, - "debug/policy_weights": 0.049161095172166824, - "debug/raw_losses": 0.613974392414093, - "epoch": 0.8913649025069638, - "grad_norm": 1.5694190986130379, + "epoch": 0.89, "learning_rate": 1.7658367139945228e-08, - "logits/chosen": -1.9810594320297241, - "logits/rejected": -1.9477195739746094, - "logps/chosen": -323.91619873046875, - "logps/rejected": -368.2818298339844, - "loss": 0.0294, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.7960774898529053, - "rewards/margins": 0.45412111282348633, - "rewards/rejected": -2.2501983642578125, + "logits/chosen": 0.2600646913051605, + "logits/rejected": 0.5517584681510925, + "logps/chosen": -390.8568115234375, + "logps/rejected": -462.80828857421875, + "loss": 0.5471, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.377396821975708, + "rewards/margins": 0.6719989776611328, + "rewards/rejected": -3.049395799636841, "step": 1120 }, { - "debug/losses": 0.03077780269086361, - "debug/policy_weights": 0.055907707661390305, - "debug/raw_losses": 0.5727918148040771, - "epoch": 0.8993235177079189, - "grad_norm": 1.736668937190508, + "epoch": 0.9, "learning_rate": 1.5182676816211632e-08, - "logits/chosen": -1.9711143970489502, - "logits/rejected": -1.9426500797271729, - "logps/chosen": -326.6500549316406, - "logps/rejected": -389.47027587890625, - "loss": 0.0322, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.8423681259155273, - "rewards/margins": 0.4901112914085388, - "rewards/rejected": -2.332479476928711, + "logits/chosen": 0.04413030296564102, + "logits/rejected": 0.30151715874671936, + "logps/chosen": -382.0662536621094, + "logps/rejected": -447.08673095703125, + "loss": 0.5431, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.228654384613037, + "rewards/margins": 0.6926024556159973, + "rewards/rejected": -2.9212570190429688, "step": 1130 }, { - "debug/losses": 0.04067676141858101, - "debug/policy_weights": 0.06354663521051407, - "debug/raw_losses": 0.6167309880256653, - "epoch": 0.9072821329088738, - "grad_norm": 1.311295756466762, + "epoch": 0.91, "learning_rate": 1.2888473281864597e-08, - "logits/chosen": -2.0066215991973877, - "logits/rejected": -1.978035569190979, - "logps/chosen": -334.54376220703125, - "logps/rejected": -368.7451171875, - "loss": 0.0321, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.7672863006591797, - "rewards/margins": 0.3908054232597351, - "rewards/rejected": -2.1580917835235596, + "logits/chosen": 0.14212054014205933, + "logits/rejected": 0.47429710626602173, + "logps/chosen": -367.8409729003906, + "logps/rejected": -435.02764892578125, + "loss": 0.5369, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.2534115314483643, + "rewards/margins": 0.7273036241531372, + "rewards/rejected": -2.980715274810791, "step": 1140 }, { - "debug/losses": 0.029732922092080116, - "debug/policy_weights": 0.05216806009411812, - "debug/raw_losses": 0.5487850904464722, - "epoch": 0.9152407481098289, - "grad_norm": 1.2474371694735222, + "epoch": 0.92, "learning_rate": 1.0777529692427679e-08, - "logits/chosen": -1.9626529216766357, - "logits/rejected": -1.924966812133789, - "logps/chosen": -318.5897521972656, - "logps/rejected": -362.83251953125, - "loss": 0.0311, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.6781198978424072, - "rewards/margins": 0.5509058833122253, - "rewards/rejected": -2.2290260791778564, + "logits/chosen": 0.04115242511034012, + "logits/rejected": 0.28970104455947876, + "logps/chosen": -372.7949523925781, + "logps/rejected": -456.10675048828125, + "loss": 0.5265, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.300356388092041, + "rewards/margins": 0.8059718012809753, + "rewards/rejected": -3.106328248977661, "step": 1150 }, { - "debug/losses": 0.034698087722063065, - "debug/policy_weights": 0.054052066057920456, - "debug/raw_losses": 0.599198579788208, - "epoch": 0.9231993633107839, - "grad_norm": 1.4791914719616432, + "epoch": 0.92, "learning_rate": 8.851477564560061e-09, - "logits/chosen": -1.9579505920410156, - "logits/rejected": -1.9253816604614258, - "logps/chosen": -311.4327392578125, - "logps/rejected": -374.3013610839844, - "loss": 0.0312, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.692578911781311, - "rewards/margins": 0.5192922353744507, - "rewards/rejected": -2.2118711471557617, + "logits/chosen": 0.0867738351225853, + "logits/rejected": 0.4068300127983093, + "logps/chosen": -372.08636474609375, + "logps/rejected": -426.42388916015625, + "loss": 0.5342, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.331385850906372, + "rewards/margins": 0.6490964293479919, + "rewards/rejected": -2.9804821014404297, "step": 1160 }, { - "debug/losses": 0.030673867091536522, - "debug/policy_weights": 0.05762176960706711, - "debug/raw_losses": 0.5920447111129761, - "epoch": 0.931157978511739, - "grad_norm": 1.2297627601023153, + "epoch": 0.93, "learning_rate": 7.111805515081531e-09, - "logits/chosen": -1.9873937368392944, - "logits/rejected": -1.928577184677124, - "logps/chosen": -349.22552490234375, - "logps/rejected": -393.4672546386719, - "loss": 0.0309, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.9474347829818726, - "rewards/margins": 0.4757717251777649, - "rewards/rejected": -2.423206329345703, + "logits/chosen": 0.02022993005812168, + "logits/rejected": 0.41968393325805664, + "logps/chosen": -363.818603515625, + "logps/rejected": -447.7919006347656, + "loss": 0.5312, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.2372307777404785, + "rewards/margins": 0.8540315628051758, + "rewards/rejected": -3.0912623405456543, "step": 1170 }, { - "debug/losses": 0.03575451672077179, - "debug/policy_weights": 0.0605660155415535, - "debug/raw_losses": 0.5854865312576294, - "epoch": 0.939116593712694, - "grad_norm": 1.1864649900689144, + "epoch": 0.94, "learning_rate": 5.559858110443016e-09, - "logits/chosen": -2.000704288482666, - "logits/rejected": -1.9662271738052368, - "logps/chosen": -329.9343566894531, - "logps/rejected": -382.48822021484375, - "loss": 0.0302, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.740262746810913, - "rewards/margins": 0.504480242729187, - "rewards/rejected": -2.2447433471679688, + "logits/chosen": 0.29695388674736023, + "logits/rejected": 0.714096188545227, + "logps/chosen": -372.5519714355469, + "logps/rejected": -442.5354919433594, + "loss": 0.5383, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.3107995986938477, + "rewards/margins": 0.8070123791694641, + "rewards/rejected": -3.117811918258667, "step": 1180 }, { - "debug/losses": 0.024325475096702576, - "debug/policy_weights": 0.04755675792694092, - "debug/raw_losses": 0.549831211566925, - "epoch": 0.947075208913649, - "grad_norm": 1.4675887378312884, + "epoch": 0.95, "learning_rate": 4.196834827531276e-09, - "logits/chosen": -1.9613635540008545, - "logits/rejected": -1.926835060119629, - "logps/chosen": -328.3395690917969, - "logps/rejected": -393.16650390625, - "loss": 0.0289, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.8523941040039062, - "rewards/margins": 0.6523604393005371, - "rewards/rejected": -2.5047545433044434, + "logits/chosen": 0.140055850148201, + "logits/rejected": 0.3409932255744934, + "logps/chosen": -355.64324951171875, + "logps/rejected": -447.585693359375, + "loss": 0.5152, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.149151563644409, + "rewards/margins": 0.7904965877532959, + "rewards/rejected": -2.939648151397705, "step": 1190 }, { - "debug/losses": 0.024997171014547348, - "debug/policy_weights": 0.04472526162862778, - "debug/raw_losses": 0.5522318482398987, - "epoch": 0.955033824114604, - "grad_norm": 1.2840605704052697, + "epoch": 0.96, "learning_rate": 3.023789126611137e-09, - "logits/chosen": -1.9729950428009033, - "logits/rejected": -1.9184396266937256, - "logps/chosen": -324.0898742675781, - "logps/rejected": -372.49151611328125, - "loss": 0.0303, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.7675983905792236, - "rewards/margins": 0.5561206936836243, - "rewards/rejected": -2.323719024658203, + "logits/chosen": 0.03294936567544937, + "logits/rejected": 0.2933207154273987, + "logps/chosen": -363.29290771484375, + "logps/rejected": -435.640380859375, + "loss": 0.5483, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.281057357788086, + "rewards/margins": 0.7091296911239624, + "rewards/rejected": -2.990186929702759, "step": 1200 }, { - "epoch": 0.955033824114604, - "eval_debug/losses": 0.029483934864401817, - "eval_debug/policy_weights": 0.05266140401363373, - "eval_debug/raw_losses": 0.5652690529823303, - "eval_logits/chosen": -1.9727920293807983, - "eval_logits/rejected": -1.946227788925171, - "eval_logps/chosen": -324.4056091308594, - "eval_logps/rejected": -384.5557556152344, - "eval_loss": 0.03144846484065056, - "eval_rewards/accuracies": 0.6996268630027771, - "eval_rewards/chosen": -1.8016209602355957, - "eval_rewards/margins": 0.536946713924408, - "eval_rewards/rejected": -2.3385674953460693, - "eval_runtime": 153.1595, - "eval_samples_per_second": 55.837, - "eval_steps_per_second": 0.875, + "epoch": 0.96, + "eval_logits/chosen": 0.07418080419301987, + "eval_logits/rejected": 0.32435521483421326, + "eval_logps/chosen": -373.978515625, + "eval_logps/rejected": -451.6764831542969, + "eval_loss": 0.5440130829811096, + "eval_rewards/accuracies": 0.7089552283287048, + "eval_rewards/chosen": -2.2940499782562256, + "eval_rewards/margins": 0.7113713622093201, + "eval_rewards/rejected": -3.0054211616516113, + "eval_runtime": 184.0818, + "eval_samples_per_second": 46.458, + "eval_steps_per_second": 0.728, "step": 1200 }, { - "debug/losses": 0.027663961052894592, - "debug/policy_weights": 0.04848761484026909, - "debug/raw_losses": 0.5848130583763123, - "epoch": 0.9629924393155591, - "grad_norm": 1.2701407512231324, + "epoch": 0.96, "learning_rate": 2.041627637121929e-09, - "logits/chosen": -1.950147032737732, - "logits/rejected": -1.9268693923950195, - "logps/chosen": -327.3438415527344, - "logps/rejected": -394.26251220703125, - "loss": 0.0305, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.858901023864746, - "rewards/margins": 0.4966161251068115, - "rewards/rejected": -2.3555169105529785, + "logits/chosen": 0.10010697692632675, + "logits/rejected": 0.3795483410358429, + "logps/chosen": -348.8675231933594, + "logps/rejected": -437.20361328125, + "loss": 0.5398, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.087364673614502, + "rewards/margins": 0.828387439250946, + "rewards/rejected": -2.9157521724700928, "step": 1210 }, { - "debug/losses": 0.03097696043550968, - "debug/policy_weights": 0.05295131728053093, - "debug/raw_losses": 0.5842943787574768, - "epoch": 0.9709510545165141, - "grad_norm": 1.7444108884419736, + "epoch": 0.97, "learning_rate": 1.2511094569571668e-09, - "logits/chosen": -1.9334291219711304, - "logits/rejected": -1.87801194190979, - "logps/chosen": -323.7477722167969, - "logps/rejected": -347.8150634765625, - "loss": 0.0313, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.7532447576522827, - "rewards/margins": 0.47265228629112244, - "rewards/rejected": -2.2258970737457275, + "logits/chosen": 0.09991980344057083, + "logits/rejected": 0.4467397630214691, + "logps/chosen": -380.14520263671875, + "logps/rejected": -440.24658203125, + "loss": 0.5345, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.253425121307373, + "rewards/margins": 0.702509880065918, + "rewards/rejected": -2.955935001373291, "step": 1220 }, { - "debug/losses": 0.02778841182589531, - "debug/policy_weights": 0.04463866725564003, - "debug/raw_losses": 0.5899677276611328, - "epoch": 0.9789096697174692, - "grad_norm": 1.2986395101895525, + "epoch": 0.98, "learning_rate": 6.528455657691112e-10, - "logits/chosen": -1.9225807189941406, - "logits/rejected": -1.9189189672470093, - "logps/chosen": -331.2619934082031, - "logps/rejected": -391.13671875, - "loss": 0.0287, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.9252440929412842, - "rewards/margins": 0.48622721433639526, - "rewards/rejected": -2.411471128463745, + "logits/chosen": 0.11626466363668442, + "logits/rejected": 0.41348797082901, + "logps/chosen": -372.7298889160156, + "logps/rejected": -427.22576904296875, + "loss": 0.549, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2800345420837402, + "rewards/margins": 0.6291176080703735, + "rewards/rejected": -2.909151792526245, "step": 1230 }, { - "debug/losses": 0.032337166368961334, - "debug/policy_weights": 0.051962029188871384, - "debug/raw_losses": 0.5668946504592896, - "epoch": 0.9868682849184242, - "grad_norm": 1.4772091582386668, + "epoch": 0.99, "learning_rate": 2.4729835275189016e-10, - "logits/chosen": -1.9424164295196533, - "logits/rejected": -1.9091637134552002, - "logps/chosen": -328.1160583496094, - "logps/rejected": -392.4992370605469, - "loss": 0.0303, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.8741003274917603, - "rewards/margins": 0.5802657604217529, - "rewards/rejected": -2.4543659687042236, + "logits/chosen": 0.06715863198041916, + "logits/rejected": 0.29241910576820374, + "logps/chosen": -393.8903503417969, + "logps/rejected": -477.9420471191406, + "loss": 0.5462, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.323488473892212, + "rewards/margins": 0.8067766427993774, + "rewards/rejected": -3.1302647590637207, "step": 1240 }, { - "debug/losses": 0.027117431163787842, - "debug/policy_weights": 0.04854784160852432, - "debug/raw_losses": 0.5413271188735962, - "epoch": 0.9948269001193792, - "grad_norm": 1.622815424856795, + "epoch": 0.99, "learning_rate": 3.478125926756337e-11, - "logits/chosen": -1.9423835277557373, - "logits/rejected": -1.9216960668563843, - "logps/chosen": -327.0545349121094, - "logps/rejected": -397.98040771484375, - "loss": 0.0289, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.8533722162246704, - "rewards/margins": 0.5928901433944702, - "rewards/rejected": -2.4462623596191406, + "logits/chosen": 0.25983649492263794, + "logits/rejected": 0.4905417561531067, + "logps/chosen": -364.73431396484375, + "logps/rejected": -443.79296875, + "loss": 0.5474, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2353272438049316, + "rewards/margins": 0.771331787109375, + "rewards/rejected": -3.0066590309143066, "step": 1250 }, { - "epoch": 0.9996020692399522, + "epoch": 1.0, "step": 1256, "total_flos": 0.0, - "train_loss": 0.05419013021620595, - "train_runtime": 10439.8283, - "train_samples_per_second": 15.403, - "train_steps_per_second": 0.12 + "train_loss": 0.5712926928784438, + "train_runtime": 11681.9838, + "train_samples_per_second": 13.765, + "train_steps_per_second": 0.108 } ], "logging_steps": 10, "max_steps": 1256, - "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, "total_flos": 0.0, - "train_batch_size": 8, "trial_name": null, "trial_params": null }