diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -9,16 +9,13 @@ "is_world_process_zero": true, "log_history": [ { - "debug/losses": 0.22325867414474487, - "debug/policy_weights": 0.322094202041626, - "debug/raw_losses": 0.6931471824645996, "epoch": 0.0, "learning_rate": 3.968253968253968e-09, "logits/chosen": -2.7193620204925537, "logits/rejected": -2.698728084564209, "logps/chosen": -182.0961456298828, "logps/rejected": -172.47128295898438, - "loss": 0.2247, + "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -26,2365 +23,1954 @@ "step": 1 }, { - "debug/losses": 0.20191282033920288, - "debug/policy_weights": 0.2913964092731476, - "debug/raw_losses": 0.6929122805595398, "epoch": 0.01, "learning_rate": 3.968253968253968e-08, - "logits/chosen": -2.703850269317627, - "logits/rejected": -2.6792843341827393, - "logps/chosen": -162.4593505859375, - "logps/rejected": -140.56825256347656, - "loss": 0.2203, - "rewards/accuracies": 0.5138888955116272, - "rewards/chosen": 0.0003097933658864349, - "rewards/margins": 0.00047214856022037566, - "rewards/rejected": -0.00016235519433394074, + "logits/chosen": -2.7041964530944824, + "logits/rejected": -2.6794540882110596, + "logps/chosen": -162.45831298828125, + "logps/rejected": -140.5693359375, + "loss": 0.6931, + "rewards/accuracies": 0.5486111044883728, + "rewards/chosen": 0.00032037965138442814, + "rewards/margins": 0.0004935775068588555, + "rewards/rejected": -0.00017319784092251211, "step": 10 }, { - "debug/losses": 0.23432615399360657, - "debug/policy_weights": 0.33815282583236694, - "debug/raw_losses": 0.6929541826248169, "epoch": 0.02, "learning_rate": 7.936507936507936e-08, - "logits/chosen": -2.718209743499756, - "logits/rejected": -2.7140936851501465, - "logps/chosen": -134.44784545898438, - "logps/rejected": -143.55966186523438, - "loss": 0.2265, - "rewards/accuracies": 0.53125, - "rewards/chosen": 0.00033358519431203604, - "rewards/margins": 0.0003893459215760231, - "rewards/rejected": -5.576071635005064e-05, + "logits/chosen": -2.7177577018737793, + "logits/rejected": -2.7136425971984863, + "logps/chosen": -134.47242736816406, + "logps/rejected": -143.55604553222656, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 8.780837379163131e-05, + "rewards/margins": 0.00010721785656642169, + "rewards/rejected": -1.940951551659964e-05, "step": 20 }, { - "debug/losses": 0.2317180633544922, - "debug/policy_weights": 0.3342125713825226, - "debug/raw_losses": 0.6933391690254211, "epoch": 0.02, "learning_rate": 1.1904761904761903e-07, - "logits/chosen": -2.6895809173583984, - "logits/rejected": -2.6757731437683105, - "logps/chosen": -141.00173950195312, - "logps/rejected": -136.54183959960938, - "loss": 0.2243, - "rewards/accuracies": 0.44999998807907104, - "rewards/chosen": -1.5367928654086427e-06, - "rewards/margins": -0.0003812290378846228, - "rewards/rejected": 0.000379692210117355, + "logits/chosen": -2.6898293495178223, + "logits/rejected": -2.676154613494873, + "logps/chosen": -140.94692993164062, + "logps/rejected": -136.50369262695312, + "loss": 0.6931, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0005466601578518748, + "rewards/margins": -0.00021456097601912916, + "rewards/rejected": 0.0007612211629748344, "step": 30 }, { - "debug/losses": 0.2238575965166092, - "debug/policy_weights": 0.32308024168014526, - "debug/raw_losses": 0.6928759217262268, "epoch": 0.03, "learning_rate": 1.5873015873015872e-07, - "logits/chosen": -2.695948362350464, - "logits/rejected": -2.6865978240966797, - "logps/chosen": -135.11508178710938, - "logps/rejected": -144.54905700683594, - "loss": 0.223, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": 0.00032043398823589087, - "rewards/margins": 0.0005478252423927188, - "rewards/rejected": -0.00022739116684533656, + "logits/chosen": -2.6958394050598145, + "logits/rejected": -2.686532974243164, + "logps/chosen": -134.98963928222656, + "logps/rejected": -144.46652221679688, + "loss": 0.6928, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.0015748919686302543, + "rewards/margins": 0.0009769219905138016, + "rewards/rejected": 0.0005979698617011309, "step": 40 }, { - "debug/losses": 0.21922382712364197, - "debug/policy_weights": 0.3166070878505707, - "debug/raw_losses": 0.6922389268875122, "epoch": 0.04, "learning_rate": 1.984126984126984e-07, - "logits/chosen": -2.7039008140563965, - "logits/rejected": -2.685803174972534, - "logps/chosen": -150.06460571289062, - "logps/rejected": -145.3826446533203, - "loss": 0.2219, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.0017308598617091775, - "rewards/margins": 0.0018328956793993711, - "rewards/rejected": -0.00010203566489508376, + "logits/chosen": -2.7042899131774902, + "logits/rejected": -2.6861345767974854, + "logps/chosen": -149.71768188476562, + "logps/rejected": -145.0757293701172, + "loss": 0.6921, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.005199921317398548, + "rewards/margins": 0.0022330707870423794, + "rewards/rejected": 0.0029668500646948814, "step": 50 }, { - "debug/losses": 0.215969055891037, - "debug/policy_weights": 0.31185799837112427, - "debug/raw_losses": 0.6923114061355591, "epoch": 0.05, "learning_rate": 2.3809523809523806e-07, - "logits/chosen": -2.7042651176452637, - "logits/rejected": -2.68457293510437, - "logps/chosen": -155.12266540527344, - "logps/rejected": -152.2425537109375, - "loss": 0.2234, - "rewards/accuracies": 0.53125, - "rewards/chosen": -0.0017481986433267593, - "rewards/margins": 0.0017303129425272346, - "rewards/rejected": -0.0034785110037773848, + "logits/chosen": -2.705153703689575, + "logits/rejected": -2.685439348220825, + "logps/chosen": -154.3783416748047, + "logps/rejected": -151.54519653320312, + "loss": 0.6912, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.00569504126906395, + "rewards/margins": 0.0022000311873853207, + "rewards/rejected": 0.003495010081678629, "step": 60 }, { - "debug/losses": 0.21945813298225403, - "debug/policy_weights": 0.3181782364845276, - "debug/raw_losses": 0.68887859582901, "epoch": 0.06, "learning_rate": 2.7777777777777776e-07, - "logits/chosen": -2.699489116668701, - "logits/rejected": -2.69026780128479, - "logps/chosen": -147.7420196533203, - "logps/rejected": -140.0685272216797, - "loss": 0.2196, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.0035471641458570957, - "rewards/margins": 0.00880121998488903, - "rewards/rejected": -0.012348385527729988, + "logits/chosen": -2.7017154693603516, + "logits/rejected": -2.6924962997436523, + "logps/chosen": -146.3284149169922, + "logps/rejected": -138.79405212402344, + "loss": 0.6885, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.010588793084025383, + "rewards/margins": 0.010192448273301125, + "rewards/rejected": 0.00039634370477870107, "step": 70 }, { - "debug/losses": 0.2016005963087082, - "debug/policy_weights": 0.29311949014663696, - "debug/raw_losses": 0.6875921487808228, "epoch": 0.06, "learning_rate": 3.1746031746031743e-07, - "logits/chosen": -2.710141658782959, - "logits/rejected": -2.690600633621216, - "logps/chosen": -144.12033081054688, - "logps/rejected": -149.16067504882812, - "loss": 0.2084, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.018289152532815933, - "rewards/margins": 0.011940783821046352, - "rewards/rejected": -0.03022993728518486, + "logits/chosen": -2.7155232429504395, + "logits/rejected": -2.696071147918701, + "logps/chosen": -141.80067443847656, + "logps/rejected": -147.0068817138672, + "loss": 0.6867, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.0049073463305830956, + "rewards/margins": 0.013599385507404804, + "rewards/rejected": -0.008692039176821709, "step": 80 }, { - "debug/losses": 0.19069644808769226, - "debug/policy_weights": 0.2775285840034485, - "debug/raw_losses": 0.6863588094711304, "epoch": 0.07, "learning_rate": 3.5714285714285716e-07, - "logits/chosen": -2.7110238075256348, - "logits/rejected": -2.7015674114227295, - "logps/chosen": -156.34317016601562, - "logps/rejected": -149.517578125, - "loss": 0.1921, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.06105213612318039, - "rewards/margins": 0.014811128377914429, - "rewards/rejected": -0.07586327940225601, + "logits/chosen": -2.7175304889678955, + "logits/rejected": -2.7080624103546143, + "logps/chosen": -153.12509155273438, + "logps/rejected": -146.53590393066406, + "loss": 0.6847, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.028871387243270874, + "rewards/margins": 0.017175236716866493, + "rewards/rejected": -0.046046625822782516, "step": 90 }, { - "debug/losses": 0.1722869724035263, - "debug/policy_weights": 0.2533736228942871, - "debug/raw_losses": 0.6810176968574524, "epoch": 0.08, "learning_rate": 3.968253968253968e-07, - "logits/chosen": -2.7493042945861816, - "logits/rejected": -2.7416834831237793, - "logps/chosen": -167.77740478515625, - "logps/rejected": -167.22402954101562, - "loss": 0.1732, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.10518421232700348, - "rewards/margins": 0.02694917842745781, - "rewards/rejected": -0.1321333944797516, + "logits/chosen": -2.7524733543395996, + "logits/rejected": -2.7452526092529297, + "logps/chosen": -163.88070678710938, + "logps/rejected": -163.61032104492188, + "loss": 0.6789, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0662173330783844, + "rewards/margins": 0.02977912127971649, + "rewards/rejected": -0.09599645435810089, "step": 100 }, { "epoch": 0.08, - "eval_debug/losses": 0.16490881145000458, - "eval_debug/policy_weights": 0.24299795925617218, - "eval_debug/raw_losses": 0.6787429451942444, - "eval_logits/chosen": -2.732400417327881, - "eval_logits/rejected": -2.7241837978363037, - "eval_logps/chosen": -158.16163635253906, - "eval_logps/rejected": -168.13601684570312, - "eval_loss": 0.16647696495056152, - "eval_rewards/accuracies": 0.608208954334259, - "eval_rewards/chosen": -0.135881245136261, - "eval_rewards/margins": 0.03413529321551323, - "eval_rewards/rejected": -0.17001654207706451, - "eval_runtime": 184.3569, - "eval_samples_per_second": 46.388, + "eval_logits/chosen": -2.7336502075195312, + "eval_logits/rejected": -2.7255024909973145, + "eval_logps/chosen": -155.19271850585938, + "eval_logps/rejected": -165.35523986816406, + "eval_loss": 0.6769910454750061, + "eval_rewards/accuracies": 0.5914179086685181, + "eval_rewards/chosen": -0.10619194805622101, + "eval_rewards/margins": 0.03601696714758873, + "eval_rewards/rejected": -0.14220890402793884, + "eval_runtime": 184.3291, + "eval_samples_per_second": 46.395, "eval_steps_per_second": 0.727, "step": 100 }, { - "debug/losses": 0.16428187489509583, - "debug/policy_weights": 0.2392692118883133, - "debug/raw_losses": 0.6870724558830261, "epoch": 0.09, "learning_rate": 4.365079365079365e-07, - "logits/chosen": -2.7362523078918457, - "logits/rejected": -2.7247262001037598, - "logps/chosen": -166.95001220703125, - "logps/rejected": -161.81370544433594, - "loss": 0.158, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -0.18869003653526306, - "rewards/margins": 0.02125888131558895, - "rewards/rejected": -0.20994892716407776, + "logits/chosen": -2.738532543182373, + "logits/rejected": -2.7273170948028564, + "logps/chosen": -164.2928009033203, + "logps/rejected": -160.19398498535156, + "loss": 0.6738, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.16211798787117004, + "rewards/margins": 0.03163355216383934, + "rewards/rejected": -0.19375154376029968, "step": 110 }, { - "debug/losses": 0.1403961032629013, - "debug/policy_weights": 0.20996880531311035, - "debug/raw_losses": 0.6652695536613464, "epoch": 0.1, "learning_rate": 4.761904761904761e-07, - "logits/chosen": -2.7246499061584473, - "logits/rejected": -2.7004261016845703, - "logps/chosen": -197.12600708007812, - "logps/rejected": -195.83370971679688, - "loss": 0.1305, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.29605621099472046, - "rewards/margins": 0.07087300717830658, - "rewards/rejected": -0.36692923307418823, + "logits/chosen": -2.7289297580718994, + "logits/rejected": -2.705962657928467, + "logps/chosen": -196.69662475585938, + "logps/rejected": -197.2833251953125, + "loss": 0.661, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.2917623221874237, + "rewards/margins": 0.08966299891471863, + "rewards/rejected": -0.38142532110214233, "step": 120 }, { - "debug/losses": 0.08349309861660004, - "debug/policy_weights": 0.1281343251466751, - "debug/raw_losses": 0.6588132977485657, "epoch": 0.1, "learning_rate": 4.999845414634076e-07, - "logits/chosen": -2.646498918533325, - "logits/rejected": -2.619450330734253, - "logps/chosen": -200.52162170410156, - "logps/rejected": -198.05235290527344, - "loss": 0.0876, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.5262889266014099, - "rewards/margins": 0.0869952142238617, - "rewards/rejected": -0.6132841110229492, + "logits/chosen": -2.658005475997925, + "logits/rejected": -2.6317684650421143, + "logps/chosen": -187.4532928466797, + "logps/rejected": -188.37689208984375, + "loss": 0.6542, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3956056833267212, + "rewards/margins": 0.12092368304729462, + "rewards/rejected": -0.5165294408798218, "step": 130 }, { - "debug/losses": 0.10831795632839203, - "debug/policy_weights": 0.16901585459709167, - "debug/raw_losses": 0.642255425453186, "epoch": 0.11, "learning_rate": 4.998106548810311e-07, - "logits/chosen": -2.6711044311523438, - "logits/rejected": -2.6733062267303467, - "logps/chosen": -202.9497528076172, - "logps/rejected": -243.6705780029297, - "loss": 0.1068, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.45544663071632385, - "rewards/margins": 0.14908082783222198, - "rewards/rejected": -0.604527473449707, + "logits/chosen": -2.6906683444976807, + "logits/rejected": -2.6913747787475586, + "logps/chosen": -199.67568969726562, + "logps/rejected": -253.02487182617188, + "loss": 0.6171, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.4227059781551361, + "rewards/margins": 0.27536457777023315, + "rewards/rejected": -0.6980706453323364, "step": 140 }, { - "debug/losses": 0.09790829569101334, - "debug/policy_weights": 0.14507228136062622, - "debug/raw_losses": 0.6637815237045288, "epoch": 0.12, "learning_rate": 4.994436933879359e-07, - "logits/chosen": -2.6321330070495605, - "logits/rejected": -2.607536554336548, - "logps/chosen": -224.5740203857422, - "logps/rejected": -222.1067352294922, - "loss": 0.0859, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.6347376108169556, - "rewards/margins": 0.09920062869787216, - "rewards/rejected": -0.7339382171630859, + "logits/chosen": -2.6662166118621826, + "logits/rejected": -2.644784927368164, + "logps/chosen": -197.07180786132812, + "logps/rejected": -198.4012908935547, + "loss": 0.6395, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.3597154915332794, + "rewards/margins": 0.13716872036457062, + "rewards/rejected": -0.49688419699668884, "step": 150 }, { - "debug/losses": 0.09540105611085892, - "debug/policy_weights": 0.15022841095924377, - "debug/raw_losses": 0.6407302021980286, "epoch": 0.13, "learning_rate": 4.988839406031596e-07, - "logits/chosen": -2.625136613845825, - "logits/rejected": -2.610718011856079, - "logps/chosen": -208.863037109375, - "logps/rejected": -227.71347045898438, - "loss": 0.0791, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.6255360245704651, - "rewards/margins": 0.16525237262248993, - "rewards/rejected": -0.7907883524894714, + "logits/chosen": -2.647681474685669, + "logits/rejected": -2.6395888328552246, + "logps/chosen": -182.04420471191406, + "logps/rejected": -206.59780883789062, + "loss": 0.629, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3573477864265442, + "rewards/margins": 0.2222837507724762, + "rewards/rejected": -0.579631507396698, "step": 160 }, { - "debug/losses": 0.10042669624090195, - "debug/policy_weights": 0.14662522077560425, - "debug/raw_losses": 0.6630629301071167, "epoch": 0.14, "learning_rate": 4.981318291512395e-07, - "logits/chosen": -2.62790584564209, - "logits/rejected": -2.6032207012176514, - "logps/chosen": -215.50753784179688, - "logps/rejected": -209.77294921875, - "loss": 0.0919, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.6275609135627747, - "rewards/margins": 0.12133459746837616, - "rewards/rejected": -0.7488954663276672, + "logits/chosen": -2.619232654571533, + "logits/rejected": -2.598362684249878, + "logps/chosen": -227.0933380126953, + "logps/rejected": -230.9747772216797, + "loss": 0.6242, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7434185743331909, + "rewards/margins": 0.21749505400657654, + "rewards/rejected": -0.9609137773513794, "step": 170 }, { - "debug/losses": 0.07955813407897949, - "debug/policy_weights": 0.11796853691339493, - "debug/raw_losses": 0.6584170460700989, "epoch": 0.14, "learning_rate": 4.971879403278432e-07, - "logits/chosen": -2.604243040084839, - "logits/rejected": -2.5767505168914795, - "logps/chosen": -244.2146759033203, - "logps/rejected": -237.11184692382812, - "loss": 0.08, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.7597829699516296, - "rewards/margins": 0.12581536173820496, - "rewards/rejected": -0.885598361492157, + "logits/chosen": -2.5654754638671875, + "logits/rejected": -2.5364232063293457, + "logps/chosen": -241.6617431640625, + "logps/rejected": -245.66268920898438, + "loss": 0.6151, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7342535257339478, + "rewards/margins": 0.23685339093208313, + "rewards/rejected": -0.9711068868637085, "step": 180 }, { - "debug/losses": 0.07059400528669357, - "debug/policy_weights": 0.11607640981674194, - "debug/raw_losses": 0.63481605052948, "epoch": 0.15, "learning_rate": 4.960530036504941e-07, - "logits/chosen": -2.5897984504699707, - "logits/rejected": -2.5524086952209473, - "logps/chosen": -235.70455932617188, - "logps/rejected": -242.70114135742188, - "loss": 0.0765, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.8081339597702026, - "rewards/margins": 0.1998949944972992, - "rewards/rejected": -1.0080288648605347, + "logits/chosen": -2.5271048545837402, + "logits/rejected": -2.486818790435791, + "logps/chosen": -235.6089630126953, + "logps/rejected": -251.17758178710938, + "loss": 0.6215, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.807177722454071, + "rewards/margins": 0.28561535477638245, + "rewards/rejected": -1.0927931070327759, "step": 190 }, { - "debug/losses": 0.048418544232845306, - "debug/policy_weights": 0.07491892576217651, - "debug/raw_losses": 0.6455397605895996, "epoch": 0.16, "learning_rate": 4.947278962947386e-07, - "logits/chosen": -2.5036425590515137, - "logits/rejected": -2.4946646690368652, - "logps/chosen": -262.738037109375, - "logps/rejected": -272.38470458984375, - "loss": 0.0483, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2029519081115723, - "rewards/margins": 0.1698470562696457, - "rewards/rejected": -1.3727989196777344, + "logits/chosen": -2.4217896461486816, + "logits/rejected": -2.413295269012451, + "logps/chosen": -251.0736083984375, + "logps/rejected": -268.6098937988281, + "loss": 0.6062, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.086307406425476, + "rewards/margins": 0.24874301254749298, + "rewards/rejected": -1.3350504636764526, "step": 200 }, { "epoch": 0.16, - "eval_debug/losses": 0.039192084223032, - "eval_debug/policy_weights": 0.06250785291194916, - "eval_debug/raw_losses": 0.6507577300071716, - "eval_logits/chosen": -2.5177671909332275, - "eval_logits/rejected": -2.5073537826538086, - "eval_logps/chosen": -274.5804138183594, - "eval_logps/rejected": -296.9352111816406, - "eval_loss": 0.041309457272291183, - "eval_rewards/accuracies": 0.6054104566574097, - "eval_rewards/chosen": -1.3000690937042236, - "eval_rewards/margins": 0.15793955326080322, - "eval_rewards/rejected": -1.4580085277557373, - "eval_runtime": 184.2603, - "eval_samples_per_second": 46.413, - "eval_steps_per_second": 0.727, + "eval_logits/chosen": -2.3855514526367188, + "eval_logits/rejected": -2.369593858718872, + "eval_logps/chosen": -246.6970672607422, + "eval_logps/rejected": -289.8621826171875, + "eval_loss": 0.6079375743865967, + "eval_rewards/accuracies": 0.66697758436203, + "eval_rewards/chosen": -1.021235704421997, + "eval_rewards/margins": 0.3660426437854767, + "eval_rewards/rejected": -1.3872781991958618, + "eval_runtime": 184.172, + "eval_samples_per_second": 46.435, + "eval_steps_per_second": 0.728, "step": 200 }, { - "debug/losses": 0.042204149067401886, - "debug/policy_weights": 0.06353110074996948, - "debug/raw_losses": 0.6565446257591248, "epoch": 0.17, "learning_rate": 4.932136424161899e-07, - "logits/chosen": -2.542813777923584, - "logits/rejected": -2.536198854446411, - "logps/chosen": -273.3008728027344, - "logps/rejected": -285.6123962402344, - "loss": 0.0392, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2830891609191895, - "rewards/margins": 0.13253197073936462, - "rewards/rejected": -1.415621042251587, + "logits/chosen": -2.3366785049438477, + "logits/rejected": -2.3228511810302734, + "logps/chosen": -266.292236328125, + "logps/rejected": -300.22894287109375, + "loss": 0.5893, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2130026817321777, + "rewards/margins": 0.3487839996814728, + "rewards/rejected": -1.5617868900299072, "step": 210 }, { - "debug/losses": 0.04242248088121414, - "debug/policy_weights": 0.07260903716087341, - "debug/raw_losses": 0.6074560880661011, "epoch": 0.18, "learning_rate": 4.915114123589732e-07, - "logits/chosen": -2.515390157699585, - "logits/rejected": -2.498671293258667, - "logps/chosen": -251.6487579345703, - "logps/rejected": -287.68841552734375, - "loss": 0.0493, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.1059857606887817, - "rewards/margins": 0.27616703510284424, - "rewards/rejected": -1.382152795791626, + "logits/chosen": -2.321228504180908, + "logits/rejected": -2.3033699989318848, + "logps/chosen": -336.34161376953125, + "logps/rejected": -373.39935302734375, + "loss": 0.612, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.9529145956039429, + "rewards/margins": 0.2863468527793884, + "rewards/rejected": -2.2392613887786865, "step": 220 }, { - "debug/losses": 0.05788778141140938, - "debug/policy_weights": 0.09594342112541199, - "debug/raw_losses": 0.5971055030822754, "epoch": 0.18, "learning_rate": 4.896225217511849e-07, - "logits/chosen": -2.5137414932250977, - "logits/rejected": -2.504149913787842, - "logps/chosen": -248.02224731445312, - "logps/rejected": -283.90057373046875, - "loss": 0.056, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.0006108283996582, - "rewards/margins": 0.32431262731552124, - "rewards/rejected": -1.3249233961105347, + "logits/chosen": -2.4310107231140137, + "logits/rejected": -2.422048568725586, + "logps/chosen": -291.1025695800781, + "logps/rejected": -328.18963623046875, + "loss": 0.6079, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4314143657684326, + "rewards/margins": 0.3364001214504242, + "rewards/rejected": -1.7678143978118896, "step": 230 }, { - "debug/losses": 0.05111226439476013, - "debug/policy_weights": 0.08123798668384552, - "debug/raw_losses": 0.6424316763877869, "epoch": 0.19, "learning_rate": 4.875484304880629e-07, - "logits/chosen": -2.43083119392395, - "logits/rejected": -2.3957576751708984, - "logps/chosen": -262.88922119140625, - "logps/rejected": -279.9705505371094, - "loss": 0.0539, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.1294162273406982, - "rewards/margins": 0.26736220717430115, - "rewards/rejected": -1.3967783451080322, + "logits/chosen": -2.3412394523620605, + "logits/rejected": -2.309183120727539, + "logps/chosen": -280.8785705566406, + "logps/rejected": -308.54132080078125, + "loss": 0.613, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.309309482574463, + "rewards/margins": 0.3731766939163208, + "rewards/rejected": -1.6824861764907837, "step": 240 }, { - "debug/losses": 0.039814677089452744, - "debug/policy_weights": 0.07425907254219055, - "debug/raw_losses": 0.581795871257782, "epoch": 0.2, "learning_rate": 4.852907416036558e-07, - "logits/chosen": -2.3591930866241455, - "logits/rejected": -2.3617148399353027, - "logps/chosen": -280.09002685546875, - "logps/rejected": -331.4532470703125, - "loss": 0.0428, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.332343339920044, - "rewards/margins": 0.41984838247299194, - "rewards/rejected": -1.7521917819976807, + "logits/chosen": -2.415271282196045, + "logits/rejected": -2.4072234630584717, + "logps/chosen": -243.56332397460938, + "logps/rejected": -298.7532043457031, + "loss": 0.591, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.967076301574707, + "rewards/margins": 0.4581146240234375, + "rewards/rejected": -1.4251911640167236, "step": 250 }, { - "debug/losses": 0.03227003663778305, - "debug/policy_weights": 0.05068012326955795, - "debug/raw_losses": 0.6115289926528931, "epoch": 0.21, "learning_rate": 4.828512000318616e-07, - "logits/chosen": -2.2910244464874268, - "logits/rejected": -2.265855312347412, - "logps/chosen": -318.79083251953125, - "logps/rejected": -349.7292175292969, - "loss": 0.0331, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.740731954574585, - "rewards/margins": 0.39039865136146545, - "rewards/rejected": -2.1311306953430176, + "logits/chosen": -2.3924427032470703, + "logits/rejected": -2.3613152503967285, + "logps/chosen": -266.86572265625, + "logps/rejected": -304.2983093261719, + "loss": 0.5986, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2214807271957397, + "rewards/margins": 0.4553411602973938, + "rewards/rejected": -1.6768219470977783, "step": 260 }, { - "debug/losses": 0.018996205180883408, - "debug/policy_weights": 0.03658801317214966, - "debug/raw_losses": 0.5983703136444092, "epoch": 0.21, "learning_rate": 4.802316912577946e-07, - "logits/chosen": -2.3124804496765137, - "logits/rejected": -2.2964730262756348, - "logps/chosen": -354.82611083984375, - "logps/rejected": -391.5228576660156, - "loss": 0.0237, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.10276460647583, - "rewards/margins": 0.3387022912502289, - "rewards/rejected": -2.441466808319092, + "logits/chosen": -2.4108529090881348, + "logits/rejected": -2.3902478218078613, + "logps/chosen": -252.7959442138672, + "logps/rejected": -295.266357421875, + "loss": 0.5917, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0824626684188843, + "rewards/margins": 0.39643940329551697, + "rewards/rejected": -1.4789022207260132, "step": 270 }, { - "debug/losses": 0.02894631028175354, - "debug/policy_weights": 0.04743387550115585, - "debug/raw_losses": 0.638053834438324, "epoch": 0.22, "learning_rate": 4.774342398605221e-07, - "logits/chosen": -2.3621456623077393, - "logits/rejected": -2.316755771636963, - "logps/chosen": -357.7198181152344, - "logps/rejected": -356.85150146484375, - "loss": 0.028, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.9571921825408936, - "rewards/margins": 0.21609549224376678, - "rewards/rejected": -2.173287868499756, + "logits/chosen": -2.3505263328552246, + "logits/rejected": -2.2942967414855957, + "logps/chosen": -279.871337890625, + "logps/rejected": -300.4220886230469, + "loss": 0.5979, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1787078380584717, + "rewards/margins": 0.430286169052124, + "rewards/rejected": -1.6089938879013062, "step": 280 }, { - "debug/losses": 0.02991945669054985, - "debug/policy_weights": 0.04959987848997116, - "debug/raw_losses": 0.6367121934890747, "epoch": 0.23, "learning_rate": 4.744610079482978e-07, - "logits/chosen": -2.3850038051605225, - "logits/rejected": -2.360288143157959, - "logps/chosen": -294.8199462890625, - "logps/rejected": -307.3962097167969, - "loss": 0.0322, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.5708074569702148, - "rewards/margins": 0.21207168698310852, - "rewards/rejected": -1.782879114151001, + "logits/chosen": -2.3269264698028564, + "logits/rejected": -2.2910802364349365, + "logps/chosen": -255.27706909179688, + "logps/rejected": -281.60137939453125, + "loss": 0.5853, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1753785610198975, + "rewards/margins": 0.3495523929595947, + "rewards/rejected": -1.5249310731887817, "step": 290 }, { - "debug/losses": 0.03670433908700943, - "debug/policy_weights": 0.059030693024396896, - "debug/raw_losses": 0.6275792717933655, "epoch": 0.24, "learning_rate": 4.713142934875005e-07, - "logits/chosen": -2.362570285797119, - "logits/rejected": -2.3510634899139404, - "logps/chosen": -315.82220458984375, - "logps/rejected": -342.0279846191406, - "loss": 0.036, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.6525237560272217, - "rewards/margins": 0.27661144733428955, - "rewards/rejected": -1.9291350841522217, + "logits/chosen": -2.2868428230285645, + "logits/rejected": -2.2631592750549316, + "logps/chosen": -284.2200012207031, + "logps/rejected": -322.45269775390625, + "loss": 0.5965, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.336501955986023, + "rewards/margins": 0.3968800902366638, + "rewards/rejected": -1.733382225036621, "step": 300 }, { "epoch": 0.24, - "eval_debug/losses": 0.03547443822026253, - "eval_debug/policy_weights": 0.058042485266923904, - "eval_debug/raw_losses": 0.6226081848144531, - "eval_logits/chosen": -2.39660382270813, - "eval_logits/rejected": -2.3858516216278076, - "eval_logps/chosen": -303.1715393066406, - "eval_logps/rejected": -338.63232421875, - "eval_loss": 0.03670888766646385, - "eval_rewards/accuracies": 0.6464552283287048, - "eval_rewards/chosen": -1.58597993850708, - "eval_rewards/margins": 0.2889997065067291, - "eval_rewards/rejected": -1.8749797344207764, - "eval_runtime": 184.2664, - "eval_samples_per_second": 46.411, - "eval_steps_per_second": 0.727, + "eval_logits/chosen": -2.265592098236084, + "eval_logits/rejected": -2.244987964630127, + "eval_logps/chosen": -282.3620910644531, + "eval_logps/rejected": -331.2099609375, + "eval_loss": 0.5907339453697205, + "eval_rewards/accuracies": 0.6623134613037109, + "eval_rewards/chosen": -1.3778856992721558, + "eval_rewards/margins": 0.42287060618400574, + "eval_rewards/rejected": -1.8007562160491943, + "eval_runtime": 184.1739, + "eval_samples_per_second": 46.434, + "eval_steps_per_second": 0.728, "step": 300 }, { - "debug/losses": 0.036319419741630554, - "debug/policy_weights": 0.060166917741298676, - "debug/raw_losses": 0.6005429029464722, "epoch": 0.25, "learning_rate": 4.679965285265706e-07, - "logits/chosen": -2.423051118850708, - "logits/rejected": -2.4310872554779053, - "logps/chosen": -293.30133056640625, - "logps/rejected": -348.7977294921875, - "loss": 0.0379, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.513527512550354, - "rewards/margins": 0.3395446538925171, - "rewards/rejected": -1.853071928024292, + "logits/chosen": -2.2354235649108887, + "logits/rejected": -2.23685884475708, + "logps/chosen": -277.09283447265625, + "logps/rejected": -347.7145080566406, + "loss": 0.5612, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3514426946640015, + "rewards/margins": 0.4907970428466797, + "rewards/rejected": -1.8422397375106812, "step": 310 }, { - "debug/losses": 0.04854508861899376, - "debug/policy_weights": 0.06750203669071198, - "debug/raw_losses": 0.6850990056991577, "epoch": 0.25, "learning_rate": 4.64510277316316e-07, - "logits/chosen": -2.4229087829589844, - "logits/rejected": -2.42700457572937, - "logps/chosen": -294.2820739746094, - "logps/rejected": -338.6139831542969, - "loss": 0.0455, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.559177279472351, - "rewards/margins": 0.23085832595825195, - "rewards/rejected": -1.790035605430603, + "logits/chosen": -2.2262344360351562, + "logits/rejected": -2.226029634475708, + "logps/chosen": -271.74212646484375, + "logps/rejected": -332.5010986328125, + "loss": 0.5903, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3337775468826294, + "rewards/margins": 0.39512914419174194, + "rewards/rejected": -1.7289068698883057, "step": 320 }, { - "debug/losses": 0.04818582534790039, - "debug/policy_weights": 0.08540809154510498, - "debug/raw_losses": 0.5591022372245789, "epoch": 0.26, "learning_rate": 4.6085823432804137e-07, - "logits/chosen": -2.496591329574585, - "logits/rejected": -2.5066230297088623, - "logps/chosen": -267.3396301269531, - "logps/rejected": -345.18218994140625, - "loss": 0.044, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2996494770050049, - "rewards/margins": 0.4524403214454651, - "rewards/rejected": -1.7520897388458252, + "logits/chosen": -2.2451891899108887, + "logits/rejected": -2.2502384185791016, + "logps/chosen": -250.6347198486328, + "logps/rejected": -333.8939208984375, + "loss": 0.5722, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1326004266738892, + "rewards/margins": 0.5066065192222595, + "rewards/rejected": -1.639206886291504, "step": 330 }, { - "debug/losses": 0.03025621734559536, - "debug/policy_weights": 0.05608632415533066, - "debug/raw_losses": 0.5739361047744751, "epoch": 0.27, "learning_rate": 4.570432221710314e-07, - "logits/chosen": -2.5042598247528076, - "logits/rejected": -2.481506109237671, - "logps/chosen": -325.81378173828125, - "logps/rejected": -355.43731689453125, - "loss": 0.0297, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.6599925756454468, - "rewards/margins": 0.3773336112499237, - "rewards/rejected": -2.0373263359069824, + "logits/chosen": -2.0656931400299072, + "logits/rejected": -2.0213730335235596, + "logps/chosen": -318.232177734375, + "logps/rejected": -369.13311767578125, + "loss": 0.5766, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.584176778793335, + "rewards/margins": 0.5901076197624207, + "rewards/rejected": -2.1742844581604004, "step": 340 }, { - "debug/losses": 0.03563963621854782, - "debug/policy_weights": 0.06325431168079376, - "debug/raw_losses": 0.6090465188026428, "epoch": 0.28, "learning_rate": 4.5306818941099866e-07, - "logits/chosen": -2.484764814376831, - "logits/rejected": -2.4560904502868652, - "logps/chosen": -318.2516174316406, - "logps/rejected": -337.520263671875, - "loss": 0.0352, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.5754101276397705, - "rewards/margins": 0.34831732511520386, - "rewards/rejected": -1.9237273931503296, + "logits/chosen": -1.9084612131118774, + "logits/rejected": -1.8514792919158936, + "logps/chosen": -316.9821472167969, + "logps/rejected": -352.9412841796875, + "loss": 0.5825, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5627154111862183, + "rewards/margins": 0.5152220726013184, + "rewards/rejected": -2.077937364578247, "step": 350 }, { - "debug/losses": 0.04237288981676102, - "debug/policy_weights": 0.0663161650300026, - "debug/raw_losses": 0.5946708917617798, "epoch": 0.29, "learning_rate": 4.4893620829118124e-07, - "logits/chosen": -2.436422824859619, - "logits/rejected": -2.410994291305542, - "logps/chosen": -318.0573425292969, - "logps/rejected": -355.05810546875, - "loss": 0.0361, + "logits/chosen": -1.8860156536102295, + "logits/rejected": -1.8301204442977905, + "logps/chosen": -309.8200378417969, + "logps/rejected": -362.0408935546875, + "loss": 0.5755, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.6693121194839478, - "rewards/margins": 0.3412826359272003, - "rewards/rejected": -2.0105948448181152, + "rewards/chosen": -1.5869390964508057, + "rewards/margins": 0.49348369240760803, + "rewards/rejected": -2.080422878265381, "step": 360 }, { - "debug/losses": 0.02816173806786537, - "debug/policy_weights": 0.04951995611190796, - "debug/raw_losses": 0.5870548486709595, "epoch": 0.29, "learning_rate": 4.4465047235785185e-07, - "logits/chosen": -2.3947503566741943, - "logits/rejected": -2.3597781658172607, - "logps/chosen": -323.8911437988281, - "logps/rejected": -360.64093017578125, - "loss": 0.0307, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.7950156927108765, - "rewards/margins": 0.44233280420303345, - "rewards/rejected": -2.2373485565185547, + "logits/chosen": -1.6610889434814453, + "logits/rejected": -1.585129737854004, + "logps/chosen": -321.8608703613281, + "logps/rejected": -380.31036376953125, + "loss": 0.5697, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.774713158607483, + "rewards/margins": 0.6593302488327026, + "rewards/rejected": -2.4340434074401855, "step": 370 }, { - "debug/losses": 0.038584090769290924, - "debug/policy_weights": 0.06455515325069427, - "debug/raw_losses": 0.5904648900032043, "epoch": 0.3, "learning_rate": 4.40214293992074e-07, - "logits/chosen": -2.3989882469177246, - "logits/rejected": -2.3822994232177734, - "logps/chosen": -328.2476806640625, - "logps/rejected": -382.66058349609375, - "loss": 0.0315, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.700849175453186, - "rewards/margins": 0.4714280664920807, - "rewards/rejected": -2.1722772121429443, + "logits/chosen": -1.385825753211975, + "logits/rejected": -1.31913161277771, + "logps/chosen": -377.07269287109375, + "logps/rejected": -459.5557556152344, + "loss": 0.5818, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1890993118286133, + "rewards/margins": 0.7521292567253113, + "rewards/rejected": -2.9412286281585693, "step": 380 }, { - "debug/losses": 0.031948018819093704, - "debug/policy_weights": 0.05189087241888046, - "debug/raw_losses": 0.5594754815101624, "epoch": 0.31, "learning_rate": 4.3563110184961234e-07, - "logits/chosen": -2.4315905570983887, - "logits/rejected": -2.3960282802581787, - "logps/chosen": -308.19256591796875, - "logps/rejected": -356.39794921875, - "loss": 0.0337, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.6025543212890625, - "rewards/margins": 0.49212759733200073, - "rewards/rejected": -2.094681978225708, + "logits/chosen": -1.5089499950408936, + "logits/rejected": -1.4075387716293335, + "logps/chosen": -338.3626708984375, + "logps/rejected": -396.67578125, + "loss": 0.5584, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9042552709579468, + "rewards/margins": 0.5932050347328186, + "rewards/rejected": -2.49746036529541, "step": 390 }, { - "debug/losses": 0.030580919235944748, - "debug/policy_weights": 0.05365537852048874, - "debug/raw_losses": 0.6114270091056824, "epoch": 0.32, "learning_rate": 4.3090443821097566e-07, - "logits/chosen": -2.3623757362365723, - "logits/rejected": -2.349914789199829, - "logps/chosen": -275.2207336425781, - "logps/rejected": -316.4654235839844, - "loss": 0.0407, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.5609982013702393, - "rewards/margins": 0.34694620966911316, - "rewards/rejected": -1.9079444408416748, + "logits/chosen": -1.2587625980377197, + "logits/rejected": -1.2017955780029297, + "logps/chosen": -309.43377685546875, + "logps/rejected": -372.00531005859375, + "loss": 0.5729, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.9031288623809814, + "rewards/margins": 0.5602144598960876, + "rewards/rejected": -2.4633431434631348, "step": 400 }, { "epoch": 0.32, - "eval_debug/losses": 0.041328318417072296, - "eval_debug/policy_weights": 0.06832444667816162, - "eval_debug/raw_losses": 0.6047241687774658, - "eval_logits/chosen": -2.395782232284546, - "eval_logits/rejected": -2.375157594680786, - "eval_logps/chosen": -302.2591552734375, - "eval_logps/rejected": -347.9583740234375, - "eval_loss": 0.04251013696193695, - "eval_rewards/accuracies": 0.6585820913314819, - "eval_rewards/chosen": -1.5768563747406006, - "eval_rewards/margins": 0.3913840651512146, - "eval_rewards/rejected": -1.96824049949646, - "eval_runtime": 184.3716, - "eval_samples_per_second": 46.385, - "eval_steps_per_second": 0.727, + "eval_logits/chosen": -1.3760210275650024, + "eval_logits/rejected": -1.2920024394989014, + "eval_logps/chosen": -312.20635986328125, + "eval_logps/rejected": -375.1720275878906, + "eval_loss": 0.5711147785186768, + "eval_rewards/accuracies": 0.6828358173370361, + "eval_rewards/chosen": -1.676328182220459, + "eval_rewards/margins": 0.5640482306480408, + "eval_rewards/rejected": -2.2403764724731445, + "eval_runtime": 184.465, + "eval_samples_per_second": 46.361, + "eval_steps_per_second": 0.726, "step": 400 }, { - "debug/losses": 0.03703906387090683, - "debug/policy_weights": 0.059621166437864304, - "debug/raw_losses": 0.5548309087753296, "epoch": 0.33, "learning_rate": 4.2603795624364195e-07, - "logits/chosen": -2.360203742980957, - "logits/rejected": -2.3422608375549316, - "logps/chosen": -300.69024658203125, - "logps/rejected": -364.30035400390625, - "loss": 0.0394, + "logits/chosen": -1.2894772291183472, + "logits/rejected": -1.23129141330719, + "logps/chosen": -299.457275390625, + "logps/rejected": -370.8555908203125, + "loss": 0.5666, "rewards/accuracies": 0.6875, - "rewards/chosen": -1.6177396774291992, - "rewards/margins": 0.5205414295196533, - "rewards/rejected": -2.1382811069488525, + "rewards/chosen": -1.6054102182388306, + "rewards/margins": 0.5984233021736145, + "rewards/rejected": -2.203833818435669, "step": 410 }, { - "debug/losses": 0.03448301553726196, - "debug/policy_weights": 0.05729634687304497, - "debug/raw_losses": 0.6115814447402954, "epoch": 0.33, "learning_rate": 4.210354171785795e-07, - "logits/chosen": -2.3332881927490234, - "logits/rejected": -2.3176493644714355, - "logps/chosen": -316.8100280761719, - "logps/rejected": -357.63946533203125, - "loss": 0.0354, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.8454933166503906, - "rewards/margins": 0.342943012714386, - "rewards/rejected": -2.188436508178711, + "logits/chosen": -1.022984266281128, + "logits/rejected": -0.9285897016525269, + "logps/chosen": -324.4284973144531, + "logps/rejected": -385.0074157714844, + "loss": 0.5596, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.921677589416504, + "rewards/margins": 0.5404387712478638, + "rewards/rejected": -2.4621164798736572, "step": 420 }, { - "debug/losses": 0.03320378437638283, - "debug/policy_weights": 0.056101541966199875, - "debug/raw_losses": 0.6030881404876709, "epoch": 0.34, "learning_rate": 4.15900687403248e-07, - "logits/chosen": -2.4058613777160645, - "logits/rejected": -2.3972506523132324, - "logps/chosen": -301.29168701171875, - "logps/rejected": -347.4168395996094, - "loss": 0.041, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.607156753540039, - "rewards/margins": 0.3475479185581207, - "rewards/rejected": -1.9547046422958374, + "logits/chosen": -0.8059805631637573, + "logits/rejected": -0.7196700572967529, + "logps/chosen": -353.788330078125, + "logps/rejected": -411.4853515625, + "loss": 0.5865, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.1321234703063965, + "rewards/margins": 0.463266521692276, + "rewards/rejected": -2.5953898429870605, "step": 430 }, { - "debug/losses": 0.051971059292554855, - "debug/policy_weights": 0.0772247388958931, - "debug/raw_losses": 0.6139973402023315, "epoch": 0.35, "learning_rate": 4.1063773547332584e-07, - "logits/chosen": -2.55269193649292, - "logits/rejected": -2.517000436782837, - "logps/chosen": -275.25323486328125, - "logps/rejected": -292.97308349609375, - "loss": 0.048, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.1995950937271118, - "rewards/margins": 0.3117036819458008, - "rewards/rejected": -1.511298656463623, + "logits/chosen": -0.9645301699638367, + "logits/rejected": -0.7601315975189209, + "logps/chosen": -346.8272705078125, + "logps/rejected": -392.2935791015625, + "loss": 0.5591, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9153356552124023, + "rewards/margins": 0.5891679525375366, + "rewards/rejected": -2.5045037269592285, "step": 440 }, { - "debug/losses": 0.04156716167926788, - "debug/policy_weights": 0.07474641501903534, - "debug/raw_losses": 0.5771032571792603, "epoch": 0.36, "learning_rate": 4.0525062904547276e-07, - "logits/chosen": -2.4993557929992676, - "logits/rejected": -2.5066721439361572, - "logps/chosen": -253.60836791992188, - "logps/rejected": -317.843994140625, - "loss": 0.0512, + "logits/chosen": -0.608537495136261, + "logits/rejected": -0.47767123579978943, + "logps/chosen": -341.55364990234375, + "logps/rejected": -434.1073303222656, + "loss": 0.5687, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.2258648872375488, - "rewards/margins": 0.4162590503692627, - "rewards/rejected": -1.6421239376068115, + "rewards/chosen": -2.105318069458008, + "rewards/margins": 0.6994394659996033, + "rewards/rejected": -2.8047571182250977, "step": 450 }, { - "debug/losses": 0.03727804124355316, - "debug/policy_weights": 0.06203744560480118, - "debug/raw_losses": 0.5842506885528564, "epoch": 0.37, "learning_rate": 3.997435317334988e-07, - "logits/chosen": -2.4661307334899902, - "logits/rejected": -2.410177707672119, - "logps/chosen": -320.1123352050781, - "logps/rejected": -332.6155700683594, - "loss": 0.0367, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.6538352966308594, - "rewards/margins": 0.4305035173892975, - "rewards/rejected": -2.084338665008545, + "logits/chosen": -0.6356207132339478, + "logits/rejected": -0.25634175539016724, + "logps/chosen": -384.43780517578125, + "logps/rejected": -419.24176025390625, + "loss": 0.5608, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.2970900535583496, + "rewards/margins": 0.6535100340843201, + "rewards/rejected": -2.9506001472473145, "step": 460 }, { - "debug/losses": 0.033384308218955994, - "debug/policy_weights": 0.05775139480829239, - "debug/raw_losses": 0.6023306846618652, "epoch": 0.37, "learning_rate": 3.941206998903701e-07, - "logits/chosen": -2.399458169937134, - "logits/rejected": -2.3540098667144775, - "logps/chosen": -320.56854248046875, - "logps/rejected": -349.14398193359375, - "loss": 0.035, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.7805945873260498, - "rewards/margins": 0.36902719736099243, - "rewards/rejected": -2.1496214866638184, + "logits/chosen": -1.0318920612335205, + "logits/rejected": -0.7451022267341614, + "logps/chosen": -338.9430236816406, + "logps/rejected": -384.64111328125, + "loss": 0.5678, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9643396139144897, + "rewards/margins": 0.5402536392211914, + "rewards/rejected": -2.5045928955078125, "step": 470 }, { - "debug/losses": 0.038652874529361725, - "debug/policy_weights": 0.06789238750934601, - "debug/raw_losses": 0.5504260659217834, "epoch": 0.38, "learning_rate": 3.8838647931853684e-07, - "logits/chosen": -2.3865714073181152, - "logits/rejected": -2.364182710647583, - "logps/chosen": -278.394775390625, - "logps/rejected": -353.56036376953125, - "loss": 0.0415, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.5227144956588745, - "rewards/margins": 0.5519821643829346, - "rewards/rejected": -2.0746965408325195, + "logits/chosen": -0.6847028732299805, + "logits/rejected": -0.5548251867294312, + "logps/chosen": -339.61456298828125, + "logps/rejected": -435.32061767578125, + "loss": 0.5814, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1349122524261475, + "rewards/margins": 0.7573872804641724, + "rewards/rejected": -2.8923001289367676, "step": 480 }, { - "debug/losses": 0.031766049563884735, - "debug/policy_weights": 0.06175629049539566, - "debug/raw_losses": 0.571739673614502, "epoch": 0.39, "learning_rate": 3.825453019111281e-07, - "logits/chosen": -2.4041807651519775, - "logits/rejected": -2.3797965049743652, - "logps/chosen": -303.6551513671875, - "logps/rejected": -353.06341552734375, - "loss": 0.0389, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.5336285829544067, - "rewards/margins": 0.43974629044532776, - "rewards/rejected": -1.9733749628067017, + "logits/chosen": -0.5378957986831665, + "logits/rejected": -0.28533270955085754, + "logps/chosen": -363.78570556640625, + "logps/rejected": -430.11749267578125, + "loss": 0.5327, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.134934425354004, + "rewards/margins": 0.6089809536933899, + "rewards/rejected": -2.743915319442749, "step": 490 }, { - "debug/losses": 0.036833684891462326, - "debug/policy_weights": 0.058806560933589935, - "debug/raw_losses": 0.6641045808792114, "epoch": 0.4, "learning_rate": 3.7660168222660824e-07, - "logits/chosen": -2.3923141956329346, - "logits/rejected": -2.3805289268493652, - "logps/chosen": -334.06304931640625, - "logps/rejected": -380.18157958984375, - "loss": 0.033, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.8973872661590576, - "rewards/margins": 0.28041723370552063, - "rewards/rejected": -2.177804470062256, + "logits/chosen": -0.6318235397338867, + "logits/rejected": -0.5071814656257629, + "logps/chosen": -350.5252380371094, + "logps/rejected": -421.93353271484375, + "loss": 0.5645, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.062009572982788, + "rewards/margins": 0.5333147048950195, + "rewards/rejected": -2.5953242778778076, "step": 500 }, { "epoch": 0.4, - "eval_debug/losses": 0.028324006125330925, - "eval_debug/policy_weights": 0.04786182940006256, - "eval_debug/raw_losses": 0.6049469113349915, - "eval_logits/chosen": -2.375598907470703, - "eval_logits/rejected": -2.3549647331237793, - "eval_logps/chosen": -340.9717712402344, - "eval_logps/rejected": -387.99102783203125, - "eval_loss": 0.029838422313332558, - "eval_rewards/accuracies": 0.6632462739944458, - "eval_rewards/chosen": -1.9639824628829956, - "eval_rewards/margins": 0.40458402037620544, - "eval_rewards/rejected": -2.3685665130615234, - "eval_runtime": 184.2266, - "eval_samples_per_second": 46.421, - "eval_steps_per_second": 0.727, + "eval_logits/chosen": -0.7860146760940552, + "eval_logits/rejected": -0.6090859770774841, + "eval_logps/chosen": -351.7882995605469, + "eval_logps/rejected": -419.81939697265625, + "eval_loss": 0.5639454126358032, + "eval_rewards/accuracies": 0.6986940503120422, + "eval_rewards/chosen": -2.0721471309661865, + "eval_rewards/margins": 0.6147031188011169, + "eval_rewards/rejected": -2.6868505477905273, + "eval_runtime": 184.5772, + "eval_samples_per_second": 46.333, + "eval_steps_per_second": 0.726, "step": 500 }, { - "debug/losses": 0.03147561103105545, - "debug/policy_weights": 0.053833089768886566, - "debug/raw_losses": 0.6370012760162354, "epoch": 0.41, "learning_rate": 3.705602139995416e-07, - "logits/chosen": -2.3812191486358643, - "logits/rejected": -2.3351223468780518, - "logps/chosen": -363.62359619140625, - "logps/rejected": -388.4914855957031, - "loss": 0.0294, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -2.0451314449310303, - "rewards/margins": 0.3193410336971283, - "rewards/rejected": -2.3644723892211914, + "logits/chosen": -0.7258490920066833, + "logits/rejected": -0.4828409254550934, + "logps/chosen": -388.1371154785156, + "logps/rejected": -422.11181640625, + "loss": 0.574, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.290266513824463, + "rewards/margins": 0.4104091227054596, + "rewards/rejected": -2.7006754875183105, "step": 510 }, { - "debug/losses": 0.032896917313337326, - "debug/policy_weights": 0.051876455545425415, - "debug/raw_losses": 0.615132212638855, "epoch": 0.41, "learning_rate": 3.6442556659016475e-07, - "logits/chosen": -2.2910399436950684, - "logits/rejected": -2.2609081268310547, - "logps/chosen": -355.7308654785156, - "logps/rejected": -385.9818115234375, - "loss": 0.0302, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -2.0049965381622314, - "rewards/margins": 0.3505350649356842, - "rewards/rejected": -2.3555312156677246, + "logits/chosen": -0.5335447192192078, + "logits/rejected": -0.33706527948379517, + "logps/chosen": -378.86492919921875, + "logps/rejected": -429.67724609375, + "loss": 0.5608, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.236337423324585, + "rewards/margins": 0.556148886680603, + "rewards/rejected": -2.7924864292144775, "step": 520 }, { - "debug/losses": 0.02078114077448845, - "debug/policy_weights": 0.04127407819032669, - "debug/raw_losses": 0.5731953978538513, "epoch": 0.42, "learning_rate": 3.582024813755076e-07, - "logits/chosen": -2.2595269680023193, - "logits/rejected": -2.2300662994384766, - "logps/chosen": -338.6393127441406, - "logps/rejected": -413.12677001953125, - "loss": 0.0261, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.0238897800445557, - "rewards/margins": 0.5238853693008423, - "rewards/rejected": -2.5477750301361084, + "logits/chosen": -0.39548322558403015, + "logits/rejected": -0.10662730038166046, + "logps/chosen": -368.8847961425781, + "logps/rejected": -473.3500061035156, + "loss": 0.5485, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.3263449668884277, + "rewards/margins": 0.8236624598503113, + "rewards/rejected": -3.150007724761963, "step": 530 }, { - "debug/losses": 0.025779416784644127, - "debug/policy_weights": 0.04478015378117561, - "debug/raw_losses": 0.5624719858169556, "epoch": 0.43, "learning_rate": 3.5189576808485404e-07, - "logits/chosen": -2.261573314666748, - "logits/rejected": -2.247623920440674, - "logps/chosen": -312.79986572265625, - "logps/rejected": -376.02459716796875, - "loss": 0.0311, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.795475721359253, - "rewards/margins": 0.4725308418273926, - "rewards/rejected": -2.2680065631866455, + "logits/chosen": 0.15742243826389313, + "logits/rejected": 0.31491726636886597, + "logps/chosen": -394.34930419921875, + "logps/rejected": -492.82232666015625, + "loss": 0.5478, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6109700202941895, + "rewards/margins": 0.8250136375427246, + "rewards/rejected": -3.435983657836914, "step": 540 }, { - "debug/losses": 0.03554507717490196, - "debug/policy_weights": 0.059527624398469925, - "debug/raw_losses": 0.6180658936500549, "epoch": 0.44, "learning_rate": 3.4551030108237433e-07, - "logits/chosen": -2.263731002807617, - "logits/rejected": -2.239624261856079, - "logps/chosen": -332.16522216796875, - "logps/rejected": -357.5715637207031, - "loss": 0.0315, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.7714109420776367, - "rewards/margins": 0.3168071508407593, - "rewards/rejected": -2.0882182121276855, + "logits/chosen": -0.2550584375858307, + "logits/rejected": -0.06936412304639816, + "logps/chosen": -406.5508728027344, + "logps/rejected": -448.47576904296875, + "loss": 0.5562, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.5152666568756104, + "rewards/margins": 0.4819938540458679, + "rewards/rejected": -2.997260332107544, "step": 550 }, { - "debug/losses": 0.026437610387802124, - "debug/policy_weights": 0.04441109299659729, - "debug/raw_losses": 0.5916617512702942, "epoch": 0.45, "learning_rate": 3.390510155998023e-07, - "logits/chosen": -2.1818649768829346, - "logits/rejected": -2.1401560306549072, - "logps/chosen": -359.7814636230469, - "logps/rejected": -387.63983154296875, - "loss": 0.0271, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.995721459388733, - "rewards/margins": 0.4399000108242035, - "rewards/rejected": -2.435621738433838, + "logits/chosen": -0.5292027592658997, + "logits/rejected": -0.2619571387767792, + "logps/chosen": -371.6798095703125, + "logps/rejected": -420.7915954589844, + "loss": 0.5492, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1147050857543945, + "rewards/margins": 0.6524336338043213, + "rewards/rejected": -2.7671384811401367, "step": 560 }, { - "debug/losses": 0.027716809883713722, - "debug/policy_weights": 0.04560152441263199, - "debug/raw_losses": 0.5818749666213989, "epoch": 0.45, "learning_rate": 3.325229039220684e-07, - "logits/chosen": -2.1216673851013184, - "logits/rejected": -2.105167865753174, - "logps/chosen": -324.8556213378906, - "logps/rejected": -379.442138671875, - "loss": 0.0288, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.8975433111190796, - "rewards/margins": 0.41955557465553284, - "rewards/rejected": -2.317099094390869, + "logits/chosen": -0.5881962776184082, + "logits/rejected": -0.4658876061439514, + "logps/chosen": -343.7039794921875, + "logps/rejected": -406.14178466796875, + "loss": 0.57, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0860273838043213, + "rewards/margins": 0.498068630695343, + "rewards/rejected": -2.5840957164764404, "step": 570 }, { - "debug/losses": 0.030428007245063782, - "debug/policy_weights": 0.05345035344362259, - "debug/raw_losses": 0.5788132548332214, "epoch": 0.46, "learning_rate": 3.2593101152883795e-07, - "logits/chosen": -2.143578052520752, - "logits/rejected": -2.057871103286743, - "logps/chosen": -332.26580810546875, - "logps/rejected": -365.40728759765625, - "loss": 0.0336, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.7957499027252197, - "rewards/margins": 0.4575200080871582, - "rewards/rejected": -2.253269910812378, + "logits/chosen": -0.6565806269645691, + "logits/rejected": -0.2549567222595215, + "logps/chosen": -374.8047180175781, + "logps/rejected": -430.33221435546875, + "loss": 0.5512, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.2211391925811768, + "rewards/margins": 0.6813799142837524, + "rewards/rejected": -2.9025187492370605, "step": 580 }, { - "debug/losses": 0.019751299172639847, - "debug/policy_weights": 0.035338252782821655, - "debug/raw_losses": 0.5761606097221375, "epoch": 0.47, "learning_rate": 3.192804331949349e-07, - "logits/chosen": -1.954616904258728, - "logits/rejected": -1.8829944133758545, - "logps/chosen": -382.0030212402344, - "logps/rejected": -423.8499450683594, - "loss": 0.0245, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.3282957077026367, - "rewards/margins": 0.5069156885147095, - "rewards/rejected": -2.8352112770080566, + "logits/chosen": -0.07184700667858124, + "logits/rejected": 0.1699156016111374, + "logps/chosen": -422.27081298828125, + "logps/rejected": -490.69134521484375, + "loss": 0.535, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.730973720550537, + "rewards/margins": 0.7726518511772156, + "rewards/rejected": -3.5036251544952393, "step": 590 }, { - "debug/losses": 0.02557496353983879, - "debug/policy_weights": 0.04240599647164345, - "debug/raw_losses": 0.6472615003585815, "epoch": 0.48, "learning_rate": 3.125763090526674e-07, - "logits/chosen": -1.9402202367782593, - "logits/rejected": -1.8992201089859009, - "logps/chosen": -364.7431945800781, - "logps/rejected": -401.673583984375, - "loss": 0.0236, - "rewards/accuracies": 0.625, - "rewards/chosen": -2.2743334770202637, - "rewards/margins": 0.4008857309818268, - "rewards/rejected": -2.6752190589904785, + "logits/chosen": -0.029465889558196068, + "logits/rejected": 0.15842057764530182, + "logps/chosen": -417.373046875, + "logps/rejected": -478.73291015625, + "loss": 0.5513, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.8006317615509033, + "rewards/margins": 0.6451797485351562, + "rewards/rejected": -3.4458115100860596, "step": 600 }, { "epoch": 0.48, - "eval_debug/losses": 0.02388698235154152, - "eval_debug/policy_weights": 0.040445927530527115, - "eval_debug/raw_losses": 0.5903487801551819, - "eval_logits/chosen": -2.0673580169677734, - "eval_logits/rejected": -2.0261595249176025, - "eval_logps/chosen": -364.19549560546875, - "eval_logps/rejected": -417.2640075683594, - "eval_loss": 0.02519642747938633, - "eval_rewards/accuracies": 0.6716417670249939, - "eval_rewards/chosen": -2.1962194442749023, - "eval_rewards/margins": 0.4650767743587494, - "eval_rewards/rejected": -2.6612961292266846, - "eval_runtime": 184.1577, - "eval_samples_per_second": 46.438, - "eval_steps_per_second": 0.728, + "eval_logits/chosen": -0.10542195290327072, + "eval_logits/rejected": 0.12242482602596283, + "eval_logps/chosen": -436.9386291503906, + "eval_logps/rejected": -505.02227783203125, + "eval_loss": 0.5582411885261536, + "eval_rewards/accuracies": 0.7108209133148193, + "eval_rewards/chosen": -2.9236514568328857, + "eval_rewards/margins": 0.6152271032333374, + "eval_rewards/rejected": -3.5388784408569336, + "eval_runtime": 184.6013, + "eval_samples_per_second": 46.327, + "eval_steps_per_second": 0.726, "step": 600 }, { - "debug/losses": 0.030718114227056503, - "debug/policy_weights": 0.04987967759370804, - "debug/raw_losses": 0.6073102355003357, "epoch": 0.49, "learning_rate": 3.0582382061909623e-07, - "logits/chosen": -2.1803696155548096, - "logits/rejected": -2.141378402709961, - "logps/chosen": -362.0430603027344, - "logps/rejected": -409.08721923828125, - "loss": 0.0255, - "rewards/accuracies": 0.65625, - "rewards/chosen": -2.060835361480713, - "rewards/margins": 0.4132884442806244, - "rewards/rejected": -2.47412371635437, + "logits/chosen": -0.2445104569196701, + "logits/rejected": -0.018268002197146416, + "logps/chosen": -441.7857971191406, + "logps/rejected": -502.60791015625, + "loss": 0.5594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.858261823654175, + "rewards/margins": 0.5510683655738831, + "rewards/rejected": -3.409330368041992, "step": 610 }, { - "debug/losses": 0.031075570732355118, - "debug/policy_weights": 0.05002061277627945, - "debug/raw_losses": 0.5588568449020386, "epoch": 0.49, "learning_rate": 2.9902818679131775e-07, - "logits/chosen": -2.1402440071105957, - "logits/rejected": -2.120105504989624, - "logps/chosen": -318.97039794921875, - "logps/rejected": -400.02325439453125, - "loss": 0.0285, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.8497616052627563, - "rewards/margins": 0.5815163850784302, - "rewards/rejected": -2.4312777519226074, + "logits/chosen": -0.4190225601196289, + "logits/rejected": -0.22823679447174072, + "logps/chosen": -399.03924560546875, + "logps/rejected": -498.6724548339844, + "loss": 0.5499, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.650449275970459, + "rewards/margins": 0.7673205137252808, + "rewards/rejected": -3.4177703857421875, "step": 620 }, { - "debug/losses": 0.03749687597155571, - "debug/policy_weights": 0.06386871635913849, - "debug/raw_losses": 0.5822035670280457, "epoch": 0.5, "learning_rate": 2.921946598128571e-07, - "logits/chosen": -2.0595593452453613, - "logits/rejected": -2.013613224029541, - "logps/chosen": -319.7913513183594, - "logps/rejected": -374.9269104003906, - "loss": 0.0355, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.7426353693008423, - "rewards/margins": 0.4733693599700928, - "rewards/rejected": -2.2160048484802246, + "logits/chosen": -0.43653860688209534, + "logits/rejected": -0.20837187767028809, + "logps/chosen": -402.82781982421875, + "logps/rejected": -485.4117736816406, + "loss": 0.5739, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.573000192642212, + "rewards/margins": 0.7478531002998352, + "rewards/rejected": -3.3208529949188232, "step": 630 }, { - "debug/losses": 0.0409637950360775, - "debug/policy_weights": 0.0696835070848465, - "debug/raw_losses": 0.590660572052002, "epoch": 0.51, "learning_rate": 2.8532852121428733e-07, - "logits/chosen": -2.0456626415252686, - "logits/rejected": -1.981777548789978, - "logps/chosen": -332.8511657714844, - "logps/rejected": -358.7229919433594, - "loss": 0.0358, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.7685935497283936, - "rewards/margins": 0.39210933446884155, - "rewards/rejected": -2.16070294380188, + "logits/chosen": -0.43430274724960327, + "logits/rejected": -0.13240045309066772, + "logps/chosen": -397.2491149902344, + "logps/rejected": -442.12384033203125, + "loss": 0.5462, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4125733375549316, + "rewards/margins": 0.5821372866630554, + "rewards/rejected": -2.9947104454040527, "step": 640 }, { - "debug/losses": 0.03011392056941986, - "debug/policy_weights": 0.04868883639574051, - "debug/raw_losses": 0.5676226019859314, "epoch": 0.52, "learning_rate": 2.7843507773121414e-07, - "logits/chosen": -1.9788720607757568, - "logits/rejected": -1.9158132076263428, - "logps/chosen": -344.5046691894531, - "logps/rejected": -389.85186767578125, - "loss": 0.0285, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.9954973459243774, - "rewards/margins": 0.4882015287876129, - "rewards/rejected": -2.483699083328247, + "logits/chosen": -0.4247920513153076, + "logits/rejected": -0.21372787654399872, + "logps/chosen": -389.4237976074219, + "logps/rejected": -458.3169860839844, + "loss": 0.5373, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.444688320159912, + "rewards/margins": 0.7236617207527161, + "rewards/rejected": -3.1683506965637207, "step": 650 }, { - "debug/losses": 0.02614041231572628, - "debug/policy_weights": 0.042843420058488846, - "debug/raw_losses": 0.5947295427322388, "epoch": 0.53, "learning_rate": 2.715196572027789e-07, - "logits/chosen": -1.891570806503296, - "logits/rejected": -1.8399454355239868, - "logps/chosen": -383.4547119140625, - "logps/rejected": -438.5372619628906, - "loss": 0.0265, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -2.30889892578125, - "rewards/margins": 0.47160759568214417, - "rewards/rejected": -2.7805066108703613, + "logits/chosen": -0.6697942614555359, + "logits/rejected": -0.4933086931705475, + "logps/chosen": -387.529296875, + "logps/rejected": -472.73944091796875, + "loss": 0.5685, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.3496451377868652, + "rewards/margins": 0.7728831171989441, + "rewards/rejected": -3.122528314590454, "step": 660 }, { - "debug/losses": 0.03734613209962845, - "debug/policy_weights": 0.04994584247469902, - "debug/raw_losses": 0.6607553362846375, "epoch": 0.53, "learning_rate": 2.645876044538521e-07, - "logits/chosen": -1.9299882650375366, - "logits/rejected": -1.8907877206802368, - "logps/chosen": -369.05010986328125, - "logps/rejected": -400.5910949707031, - "loss": 0.0305, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -2.167097806930542, - "rewards/margins": 0.2763841152191162, - "rewards/rejected": -2.443481922149658, + "logits/chosen": -1.0338900089263916, + "logits/rejected": -0.8813627362251282, + "logps/chosen": -372.53118896484375, + "logps/rejected": -426.54241943359375, + "loss": 0.5725, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.201908588409424, + "rewards/margins": 0.5010865926742554, + "rewards/rejected": -2.7029950618743896, "step": 670 }, { - "debug/losses": 0.0313260592520237, - "debug/policy_weights": 0.05115436390042305, - "debug/raw_losses": 0.5541172027587891, "epoch": 0.54, "learning_rate": 2.5764427716409815e-07, - "logits/chosen": -1.9014215469360352, - "logits/rejected": -1.841132402420044, - "logps/chosen": -328.7130432128906, - "logps/rejected": -377.9052429199219, - "loss": 0.0337, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.841946005821228, - "rewards/margins": 0.5384342670440674, - "rewards/rejected": -2.380380153656006, + "logits/chosen": -0.9278701543807983, + "logits/rejected": -0.7282145023345947, + "logps/chosen": -347.2828674316406, + "logps/rejected": -416.9349060058594, + "loss": 0.5479, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0276436805725098, + "rewards/margins": 0.743033230304718, + "rewards/rejected": -2.770677089691162, "step": 680 }, { - "debug/losses": 0.024547632783651352, - "debug/policy_weights": 0.048949599266052246, - "debug/raw_losses": 0.5285614132881165, "epoch": 0.55, "learning_rate": 2.5069504172710494e-07, - "logits/chosen": -1.8599274158477783, - "logits/rejected": -1.8095347881317139, - "logps/chosen": -339.73712158203125, - "logps/rejected": -423.26593017578125, - "loss": 0.0297, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.0068037509918213, - "rewards/margins": 0.6240845322608948, - "rewards/rejected": -2.630887746810913, + "logits/chosen": -0.5008482336997986, + "logits/rejected": -0.34875133633613586, + "logps/chosen": -373.7621154785156, + "logps/rejected": -485.12884521484375, + "loss": 0.5217, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.347053050994873, + "rewards/margins": 0.9024646878242493, + "rewards/rejected": -3.2495174407958984, "step": 690 }, { - "debug/losses": 0.029593318700790405, - "debug/policy_weights": 0.04828739911317825, - "debug/raw_losses": 0.6130238175392151, "epoch": 0.56, "learning_rate": 2.4374526910277886e-07, - "logits/chosen": -1.7858304977416992, - "logits/rejected": -1.6872743368148804, - "logps/chosen": -347.20477294921875, - "logps/rejected": -374.71588134765625, - "loss": 0.0278, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -2.033212900161743, - "rewards/margins": 0.4321438670158386, - "rewards/rejected": -2.4653568267822266, + "logits/chosen": 0.06850005686283112, + "logits/rejected": 0.41385045647621155, + "logps/chosen": -411.46246337890625, + "logps/rejected": -476.6162109375, + "loss": 0.5571, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.6757898330688477, + "rewards/margins": 0.8085702657699585, + "rewards/rejected": -3.4843602180480957, "step": 700 }, { "epoch": 0.56, - "eval_debug/losses": 0.02802393026649952, - "eval_debug/policy_weights": 0.0477924570441246, - "eval_debug/raw_losses": 0.5799114108085632, - "eval_logits/chosen": -1.8295836448669434, - "eval_logits/rejected": -1.7539435625076294, - "eval_logps/chosen": -352.343017578125, - "eval_logps/rejected": -409.57293701171875, - "eval_loss": 0.02943836711347103, - "eval_rewards/accuracies": 0.6828358173370361, - "eval_rewards/chosen": -2.07769513130188, - "eval_rewards/margins": 0.5066906213760376, - "eval_rewards/rejected": -2.584385633468628, - "eval_runtime": 184.1705, - "eval_samples_per_second": 46.435, - "eval_steps_per_second": 0.728, + "eval_logits/chosen": 0.035554468631744385, + "eval_logits/rejected": 0.2980235815048218, + "eval_logps/chosen": -424.2823486328125, + "eval_logps/rejected": -505.6960754394531, + "eval_loss": 0.5558871626853943, + "eval_rewards/accuracies": 0.704291045665741, + "eval_rewards/chosen": -2.797088146209717, + "eval_rewards/margins": 0.748529314994812, + "eval_rewards/rejected": -3.5456173419952393, + "eval_runtime": 184.3728, + "eval_samples_per_second": 46.384, + "eval_steps_per_second": 0.727, "step": 700 }, { - "debug/losses": 0.02372685633599758, - "debug/policy_weights": 0.04611854627728462, - "debug/raw_losses": 0.48435139656066895, "epoch": 0.57, "learning_rate": 2.368003306662104e-07, - "logits/chosen": -1.8146909475326538, - "logits/rejected": -1.7254749536514282, - "logps/chosen": -346.41180419921875, - "logps/rejected": -441.62664794921875, - "loss": 0.0265, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.0446598529815674, - "rewards/margins": 0.749045193195343, - "rewards/rejected": -2.7937045097351074, + "logits/chosen": 0.07857178151607513, + "logits/rejected": 0.3302653729915619, + "logps/chosen": -413.8836975097656, + "logps/rejected": -535.0875244140625, + "loss": 0.5287, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7193782329559326, + "rewards/margins": 1.0089346170425415, + "rewards/rejected": -3.7283127307891846, "step": 710 }, { - "debug/losses": 0.014840343967080116, - "debug/policy_weights": 0.02718583308160305, - "debug/raw_losses": 0.54576575756073, "epoch": 0.57, "learning_rate": 2.2986559405621886e-07, - "logits/chosen": -1.6351581811904907, - "logits/rejected": -1.584282636642456, - "logps/chosen": -375.0662536621094, - "logps/rejected": -450.80047607421875, - "loss": 0.0212, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.4484825134277344, - "rewards/margins": 0.5616628527641296, - "rewards/rejected": -3.010145425796509, + "logits/chosen": 0.2789291739463806, + "logits/rejected": 0.4242584705352783, + "logps/chosen": -422.7801818847656, + "logps/rejected": -522.7840576171875, + "loss": 0.5551, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.925621509552002, + "rewards/margins": 0.8043605089187622, + "rewards/rejected": -3.729982376098633, "step": 720 }, { - "debug/losses": 0.01882367953658104, - "debug/policy_weights": 0.03155171126127243, - "debug/raw_losses": 0.5408294796943665, "epoch": 0.58, "learning_rate": 2.2294641902678443e-07, - "logits/chosen": -1.6009079217910767, - "logits/rejected": -1.5076144933700562, - "logps/chosen": -364.93170166015625, - "logps/rejected": -444.257568359375, - "loss": 0.0202, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.413137197494507, - "rewards/margins": 0.6236140131950378, - "rewards/rejected": -3.0367512702941895, + "logits/chosen": -0.19327735900878906, + "logits/rejected": 0.043265581130981445, + "logps/chosen": -363.1488342285156, + "logps/rejected": -470.94970703125, + "loss": 0.5284, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.39530873298645, + "rewards/margins": 0.908363938331604, + "rewards/rejected": -3.3036727905273438, "step": 730 }, { - "debug/losses": 0.02030162885785103, - "debug/policy_weights": 0.033347465097904205, - "debug/raw_losses": 0.6298730969429016, "epoch": 0.59, "learning_rate": 2.160481533045751e-07, - "logits/chosen": -1.6100019216537476, - "logits/rejected": -1.530006766319275, - "logps/chosen": -410.58001708984375, - "logps/rejected": -441.34222412109375, - "loss": 0.0209, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -2.50343656539917, - "rewards/margins": 0.4143384099006653, - "rewards/rejected": -2.9177753925323486, + "logits/chosen": -0.37412697076797485, + "logits/rejected": -0.17320053279399872, + "logps/chosen": -390.2896423339844, + "logps/rejected": -428.08099365234375, + "loss": 0.5572, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.3005330562591553, + "rewards/margins": 0.48462891578674316, + "rewards/rejected": -2.7851624488830566, "step": 740 }, { - "debug/losses": 0.016051562502980232, - "debug/policy_weights": 0.030593061819672585, - "debug/raw_losses": 0.5726362466812134, "epoch": 0.6, "learning_rate": 2.0917612845576882e-07, - "logits/chosen": -1.652693510055542, - "logits/rejected": -1.544252634048462, - "logps/chosen": -384.3175354003906, - "logps/rejected": -431.06097412109375, - "loss": 0.0221, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.440882682800293, - "rewards/margins": 0.4846716821193695, - "rewards/rejected": -2.925554037094116, + "logits/chosen": -0.26352375745773315, + "logits/rejected": -0.0010178961092606187, + "logps/chosen": -373.3875427246094, + "logps/rejected": -440.09442138671875, + "loss": 0.5534, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3315823078155518, + "rewards/margins": 0.6843063235282898, + "rewards/rejected": -3.0158886909484863, "step": 750 }, { - "debug/losses": 0.020186755806207657, - "debug/policy_weights": 0.04001495987176895, - "debug/raw_losses": 0.5100632905960083, "epoch": 0.6, "learning_rate": 2.0233565576536564e-07, - "logits/chosen": -1.7317155599594116, - "logits/rejected": -1.5801684856414795, - "logps/chosen": -361.05029296875, - "logps/rejected": -420.69500732421875, - "loss": 0.0208, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.167536735534668, - "rewards/margins": 0.6427803635597229, - "rewards/rejected": -2.810317277908325, + "logits/chosen": -0.3354080021381378, + "logits/rejected": -0.006600166670978069, + "logps/chosen": -360.56463623046875, + "logps/rejected": -440.66961669921875, + "loss": 0.5328, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.1626803874969482, + "rewards/margins": 0.8473829030990601, + "rewards/rejected": -3.010063409805298, "step": 760 }, { - "debug/losses": 0.019544925540685654, - "debug/policy_weights": 0.03376678004860878, - "debug/raw_losses": 0.6016618013381958, "epoch": 0.61, "learning_rate": 1.9553202213217537e-07, - "logits/chosen": -1.5735666751861572, - "logits/rejected": -1.4604378938674927, - "logps/chosen": -382.40289306640625, - "logps/rejected": -417.23388671875, - "loss": 0.024, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.324824094772339, - "rewards/margins": 0.4267503321170807, - "rewards/rejected": -2.7515742778778076, + "logits/chosen": -0.021420275792479515, + "logits/rejected": 0.19946305453777313, + "logps/chosen": -389.1043395996094, + "logps/rejected": -448.04998779296875, + "loss": 0.5523, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.391838788986206, + "rewards/margins": 0.6678962707519531, + "rewards/rejected": -3.059735059738159, "step": 770 }, { - "debug/losses": 0.020478922873735428, - "debug/policy_weights": 0.03708823397755623, - "debug/raw_losses": 0.5932536721229553, "epoch": 0.62, "learning_rate": 1.887704859826528e-07, - "logits/chosen": -1.6135523319244385, - "logits/rejected": -1.541106939315796, - "logps/chosen": -386.4679260253906, - "logps/rejected": -441.2215881347656, - "loss": 0.0203, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -2.3918228149414062, - "rewards/margins": 0.440577894449234, - "rewards/rejected": -2.832400321960449, + "logits/chosen": -0.15253478288650513, + "logits/rejected": -0.00011998042464256287, + "logps/chosen": -394.9501953125, + "logps/rejected": -462.32843017578125, + "loss": 0.5443, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.476644992828369, + "rewards/margins": 0.566824734210968, + "rewards/rejected": -3.0434699058532715, "step": 780 }, { - "debug/losses": 0.016194965690374374, - "debug/policy_weights": 0.03022093139588833, - "debug/raw_losses": 0.5761831998825073, "epoch": 0.63, "learning_rate": 1.8205627320673836e-07, - "logits/chosen": -1.496963620185852, - "logits/rejected": -1.3502593040466309, - "logps/chosen": -397.39892578125, - "logps/rejected": -432.4063415527344, - "loss": 0.0213, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -2.5083866119384766, - "rewards/margins": 0.5051754713058472, - "rewards/rejected": -3.013561964035034, + "logits/chosen": -0.17955633997917175, + "logits/rejected": 0.18167546391487122, + "logps/chosen": -390.32244873046875, + "logps/rejected": -444.895263671875, + "loss": 0.5566, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4376220703125, + "rewards/margins": 0.7008293271064758, + "rewards/rejected": -3.138451099395752, "step": 790 }, { - "debug/losses": 0.023704659193754196, - "debug/policy_weights": 0.037073858082294464, - "debug/raw_losses": 0.6084911227226257, "epoch": 0.64, "learning_rate": 1.7539457311884675e-07, - "logits/chosen": -1.515291452407837, - "logits/rejected": -1.3988045454025269, - "logps/chosen": -411.55078125, - "logps/rejected": -447.043701171875, - "loss": 0.0196, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -2.535688877105713, - "rewards/margins": 0.37079089879989624, - "rewards/rejected": -2.906479835510254, + "logits/chosen": -0.09838727861642838, + "logits/rejected": 0.11829495429992676, + "logps/chosen": -402.4017333984375, + "logps/rejected": -451.49346923828125, + "loss": 0.5609, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4441986083984375, + "rewards/margins": 0.5067789554595947, + "rewards/rejected": -2.9509775638580322, "step": 800 }, { "epoch": 0.64, - "eval_debug/losses": 0.02070247195661068, - "eval_debug/policy_weights": 0.035034600645303726, - "eval_debug/raw_losses": 0.5850093364715576, - "eval_logits/chosen": -1.4512754678726196, - "eval_logits/rejected": -1.3358559608459473, - "eval_logps/chosen": -392.8779296875, - "eval_logps/rejected": -447.2490539550781, - "eval_loss": 0.021671824157238007, - "eval_rewards/accuracies": 0.6791045069694519, - "eval_rewards/chosen": -2.483044385910034, - "eval_rewards/margins": 0.47810250520706177, - "eval_rewards/rejected": -2.961146831512451, - "eval_runtime": 184.0928, - "eval_samples_per_second": 46.455, - "eval_steps_per_second": 0.728, + "eval_logits/chosen": -0.03116540051996708, + "eval_logits/rejected": 0.1922437697649002, + "eval_logps/chosen": -387.7091979980469, + "eval_logps/rejected": -459.44390869140625, + "eval_loss": 0.5468714833259583, + "eval_rewards/accuracies": 0.7108209133148193, + "eval_rewards/chosen": -2.431356430053711, + "eval_rewards/margins": 0.6517390012741089, + "eval_rewards/rejected": -3.0830955505371094, + "eval_runtime": 184.5893, + "eval_samples_per_second": 46.33, + "eval_steps_per_second": 0.726, "step": 800 }, { - "debug/losses": 0.02696947753429413, - "debug/policy_weights": 0.0378401055932045, - "debug/raw_losses": 0.5993171334266663, "epoch": 0.64, "learning_rate": 1.687905344471226e-07, - "logits/chosen": -1.4443447589874268, - "logits/rejected": -1.2713860273361206, - "logps/chosen": -412.06134033203125, - "logps/rejected": -442.15606689453125, - "loss": 0.021, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.5408568382263184, - "rewards/margins": 0.4449678957462311, - "rewards/rejected": -2.9858250617980957, + "logits/chosen": 0.07735608518123627, + "logits/rejected": 0.3973601460456848, + "logps/chosen": -408.05999755859375, + "logps/rejected": -459.011474609375, + "loss": 0.5384, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.5008435249328613, + "rewards/margins": 0.6535352468490601, + "rewards/rejected": -3.154379367828369, "step": 810 }, { - "debug/losses": 0.027654822915792465, - "debug/policy_weights": 0.04146949201822281, - "debug/raw_losses": 0.6201833486557007, "epoch": 0.65, "learning_rate": 1.6224926135406693e-07, - "logits/chosen": -1.3432210683822632, - "logits/rejected": -1.225619912147522, - "logps/chosen": -406.32366943359375, - "logps/rejected": -455.39495849609375, - "loss": 0.024, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -2.462209463119507, - "rewards/margins": 0.4036657214164734, - "rewards/rejected": -2.865875482559204, + "logits/chosen": 0.1125444769859314, + "logits/rejected": 0.3865428566932678, + "logps/chosen": -404.16058349609375, + "logps/rejected": -484.68621826171875, + "loss": 0.5448, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.4405789375305176, + "rewards/margins": 0.718208909034729, + "rewards/rejected": -3.158787727355957, "step": 820 }, { - "debug/losses": 0.020849039778113365, - "debug/policy_weights": 0.03605310246348381, - "debug/raw_losses": 0.5819541811943054, "epoch": 0.66, "learning_rate": 1.557758094916053e-07, - "logits/chosen": -1.379014253616333, - "logits/rejected": -1.3085992336273193, - "logps/chosen": -368.54254150390625, - "logps/rejected": -422.4908752441406, - "loss": 0.0265, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.3684463500976562, - "rewards/margins": 0.44577255845069885, - "rewards/rejected": -2.8142189979553223, + "logits/chosen": 0.11989516019821167, + "logits/rejected": 0.30926594138145447, + "logps/chosen": -370.29876708984375, + "logps/rejected": -452.27911376953125, + "loss": 0.5418, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.3860089778900146, + "rewards/margins": 0.7260924577713013, + "rewards/rejected": -3.1121015548706055, "step": 830 }, { - "debug/losses": 0.024118471890687943, - "debug/policy_weights": 0.04134884104132652, - "debug/raw_losses": 0.6025993227958679, "epoch": 0.67, "learning_rate": 1.4937518209365108e-07, - "logits/chosen": -1.6948308944702148, - "logits/rejected": -1.5715309381484985, - "logps/chosen": -387.93450927734375, - "logps/rejected": -418.56146240234375, - "loss": 0.0261, - "rewards/accuracies": 0.65625, - "rewards/chosen": -2.3019235134124756, - "rewards/margins": 0.4015706479549408, - "rewards/rejected": -2.7034945487976074, + "logits/chosen": -0.14239154756069183, + "logits/rejected": 0.14250756800174713, + "logps/chosen": -395.55755615234375, + "logps/rejected": -447.6368713378906, + "loss": 0.5573, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.378154754638672, + "rewards/margins": 0.6160937547683716, + "rewards/rejected": -2.994248390197754, "step": 840 }, { - "debug/losses": 0.01930319517850876, - "debug/policy_weights": 0.03506116196513176, - "debug/raw_losses": 0.539021909236908, "epoch": 0.68, "learning_rate": 1.4305232610918045e-07, - "logits/chosen": -1.6697864532470703, - "logits/rejected": -1.5453600883483887, - "logps/chosen": -361.92718505859375, - "logps/rejected": -402.16436767578125, - "loss": 0.025, + "logits/chosen": -0.16526366770267487, + "logits/rejected": 0.16432161629199982, + "logps/chosen": -373.45330810546875, + "logps/rejected": -436.6773376464844, + "loss": 0.5415, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.1981418132781982, - "rewards/margins": 0.544941782951355, - "rewards/rejected": -2.743083953857422, + "rewards/chosen": -2.3134028911590576, + "rewards/margins": 0.774810791015625, + "rewards/rejected": -3.0882136821746826, "step": 850 }, { - "debug/losses": 0.02497004345059395, - "debug/policy_weights": 0.048435479402542114, - "debug/raw_losses": 0.5080103278160095, "epoch": 0.68, "learning_rate": 1.3681212837880977e-07, - "logits/chosen": -1.7606804370880127, - "logits/rejected": -1.616124153137207, - "logps/chosen": -340.9532165527344, - "logps/rejected": -400.5531311035156, - "loss": 0.0241, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.9367974996566772, - "rewards/margins": 0.6633094549179077, - "rewards/rejected": -2.600106716156006, + "logits/chosen": -0.1321481615304947, + "logits/rejected": 0.23287932574748993, + "logps/chosen": -364.96990966796875, + "logps/rejected": -447.7923278808594, + "loss": 0.5396, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.176964282989502, + "rewards/margins": 0.8955341577529907, + "rewards/rejected": -3.0724985599517822, "step": 860 }, { - "debug/losses": 0.02837424911558628, - "debug/policy_weights": 0.04567035287618637, - "debug/raw_losses": 0.6146183013916016, "epoch": 0.69, "learning_rate": 1.3065941185782977e-07, - "logits/chosen": -1.585915207862854, - "logits/rejected": -1.5111041069030762, - "logps/chosen": -354.051025390625, - "logps/rejected": -393.76806640625, - "loss": 0.0245, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -2.1148974895477295, - "rewards/margins": 0.37471339106559753, - "rewards/rejected": -2.4896111488342285, + "logits/chosen": 0.05437428876757622, + "logits/rejected": 0.2819867432117462, + "logps/chosen": -383.08599853515625, + "logps/rejected": -439.3629455566406, + "loss": 0.5505, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.405247211456299, + "rewards/margins": 0.5403125882148743, + "rewards/rejected": -2.9455599784851074, "step": 870 }, { - "debug/losses": 0.020547617226839066, - "debug/policy_weights": 0.034798312932252884, - "debug/raw_losses": 0.5676860809326172, "epoch": 0.7, "learning_rate": 1.2459893188861613e-07, - "logits/chosen": -1.5211480855941772, - "logits/rejected": -1.400282621383667, - "logps/chosen": -349.64459228515625, - "logps/rejected": -415.67022705078125, - "loss": 0.028, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.2167887687683105, - "rewards/margins": 0.5641622543334961, - "rewards/rejected": -2.7809510231018066, + "logits/chosen": -0.12052659690380096, + "logits/rejected": 0.12284734100103378, + "logps/chosen": -367.1181640625, + "logps/rejected": -468.1044921875, + "loss": 0.5185, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.39152455329895, + "rewards/margins": 0.9137696027755737, + "rewards/rejected": -3.3052947521209717, "step": 880 }, { - "debug/losses": 0.027502432465553284, - "debug/policy_weights": 0.04566425085067749, - "debug/raw_losses": 0.6056962013244629, "epoch": 0.71, "learning_rate": 1.1863537252529548e-07, - "logits/chosen": -1.4182993173599243, - "logits/rejected": -1.3039287328720093, - "logps/chosen": -380.0196228027344, - "logps/rejected": -422.4239196777344, - "loss": 0.0268, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.333958387374878, - "rewards/margins": 0.45039528608322144, - "rewards/rejected": -2.7843539714813232, + "logits/chosen": 0.14598000049591064, + "logits/rejected": 0.38815659284591675, + "logps/chosen": -397.891357421875, + "logps/rejected": -472.38677978515625, + "loss": 0.5323, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.512676239013672, + "rewards/margins": 0.7713057994842529, + "rewards/rejected": -3.2839818000793457, "step": 890 }, { - "debug/losses": 0.028682146221399307, - "debug/policy_weights": 0.04342349246144295, - "debug/raw_losses": 0.5851776003837585, "epoch": 0.72, "learning_rate": 1.1277334291351145e-07, - "logits/chosen": -1.480594277381897, - "logits/rejected": -1.3723777532577515, - "logps/chosen": -369.6292419433594, - "logps/rejected": -419.40313720703125, - "loss": 0.0277, - "rewards/accuracies": 0.65625, - "rewards/chosen": -2.2591445446014404, - "rewards/margins": 0.4824891686439514, - "rewards/rejected": -2.741633892059326, + "logits/chosen": 0.15319526195526123, + "logits/rejected": 0.35974830389022827, + "logps/chosen": -380.77783203125, + "logps/rejected": -449.54315185546875, + "loss": 0.5514, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.3706305027008057, + "rewards/margins": 0.6724039912223816, + "rewards/rejected": -3.043034076690674, "step": 900 }, { "epoch": 0.72, - "eval_debug/losses": 0.024355126544833183, - "eval_debug/policy_weights": 0.0418710894882679, - "eval_debug/raw_losses": 0.5759484767913818, - "eval_logits/chosen": -1.4823256731033325, - "eval_logits/rejected": -1.372009038925171, - "eval_logps/chosen": -374.232177734375, - "eval_logps/rejected": -431.201416015625, - "eval_loss": 0.025714728981256485, - "eval_rewards/accuracies": 0.6884328126907349, - "eval_rewards/chosen": -2.296586275100708, - "eval_rewards/margins": 0.5040841102600098, - "eval_rewards/rejected": -2.800670623779297, - "eval_runtime": 184.3135, - "eval_samples_per_second": 46.399, - "eval_steps_per_second": 0.727, + "eval_logits/chosen": 0.28598034381866455, + "eval_logits/rejected": 0.5382024645805359, + "eval_logps/chosen": -392.3096008300781, + "eval_logps/rejected": -471.95330810546875, + "eval_loss": 0.5473664402961731, + "eval_rewards/accuracies": 0.6996268630027771, + "eval_rewards/chosen": -2.4773612022399902, + "eval_rewards/margins": 0.7308279275894165, + "eval_rewards/rejected": -3.2081892490386963, + "eval_runtime": 184.5275, + "eval_samples_per_second": 46.345, + "eval_steps_per_second": 0.726, "step": 900 }, { - "debug/losses": 0.02738502062857151, - "debug/policy_weights": 0.0495600588619709, - "debug/raw_losses": 0.5319479703903198, "epoch": 0.72, "learning_rate": 1.0701737372808431e-07, - "logits/chosen": -1.6114399433135986, - "logits/rejected": -1.478167176246643, - "logps/chosen": -360.99053955078125, - "logps/rejected": -421.829833984375, - "loss": 0.0263, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.0528383255004883, - "rewards/margins": 0.6187690496444702, - "rewards/rejected": -2.671607255935669, + "logits/chosen": 0.15951867401599884, + "logits/rejected": 0.46630391478538513, + "logps/chosen": -383.52850341796875, + "logps/rejected": -467.2303771972656, + "loss": 0.5362, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2782187461853027, + "rewards/margins": 0.8473943471908569, + "rewards/rejected": -3.125612735748291, "step": 910 }, { - "debug/losses": 0.023180395364761353, - "debug/policy_weights": 0.03656435385346413, - "debug/raw_losses": 0.5755435824394226, "epoch": 0.73, "learning_rate": 1.0137191367132078e-07, - "logits/chosen": -1.5202953815460205, - "logits/rejected": -1.4201245307922363, - "logps/chosen": -359.0076904296875, - "logps/rejected": -417.811767578125, - "loss": 0.0234, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -2.1841957569122314, - "rewards/margins": 0.5235225558280945, - "rewards/rejected": -2.7077183723449707, + "logits/chosen": 0.2791319191455841, + "logits/rejected": 0.45174160599708557, + "logps/chosen": -372.1945495605469, + "logps/rejected": -446.6507263183594, + "loss": 0.5458, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.3160648345947266, + "rewards/margins": 0.68004310131073, + "rewards/rejected": -2.996107816696167, "step": 920 }, { - "debug/losses": 0.02461547963321209, - "debug/policy_weights": 0.041976023465394974, - "debug/raw_losses": 0.5439327955245972, "epoch": 0.74, "learning_rate": 9.584132603467827e-08, - "logits/chosen": -1.5991088151931763, - "logits/rejected": -1.4676315784454346, - "logps/chosen": -373.9784240722656, - "logps/rejected": -436.048583984375, - "loss": 0.0264, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.2749578952789307, - "rewards/margins": 0.5521215200424194, - "rewards/rejected": -2.8270797729492188, + "logits/chosen": -0.12192128598690033, + "logits/rejected": 0.1477951854467392, + "logps/chosen": -366.48321533203125, + "logps/rejected": -453.130126953125, + "loss": 0.5467, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.200005531311035, + "rewards/margins": 0.7978888750076294, + "rewards/rejected": -2.997894287109375, "step": 930 }, { - "debug/losses": 0.023597698658704758, - "debug/policy_weights": 0.03990933299064636, - "debug/raw_losses": 0.5475441813468933, "epoch": 0.75, "learning_rate": 9.042988532644249e-08, - "logits/chosen": -1.4806602001190186, - "logits/rejected": -1.4163165092468262, - "logps/chosen": -354.83294677734375, - "logps/rejected": -430.00897216796875, - "loss": 0.0226, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.2552969455718994, - "rewards/margins": 0.548123836517334, - "rewards/rejected": -2.8034207820892334, + "logits/chosen": -0.03106372058391571, + "logits/rejected": 0.07721444219350815, + "logps/chosen": -344.21270751953125, + "logps/rejected": -438.11077880859375, + "loss": 0.5161, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.149094581604004, + "rewards/margins": 0.7353444695472717, + "rewards/rejected": -2.884438991546631, "step": 940 }, { - "debug/losses": 0.01878216303884983, - "debug/policy_weights": 0.03427324444055557, - "debug/raw_losses": 0.5461187362670898, "epoch": 0.76, "learning_rate": 8.514177396802428e-08, - "logits/chosen": -1.3683289289474487, - "logits/rejected": -1.2690086364746094, - "logps/chosen": -381.18011474609375, - "logps/rejected": -442.29583740234375, - "loss": 0.0216, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.452326536178589, - "rewards/margins": 0.5281901359558105, - "rewards/rejected": -2.9805169105529785, + "logits/chosen": 0.006801058538258076, + "logits/rejected": 0.20282092690467834, + "logps/chosen": -358.15167236328125, + "logps/rejected": -436.4964294433594, + "loss": 0.5385, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.2220425605773926, + "rewards/margins": 0.7004804611206055, + "rewards/rejected": -2.922523260116577, "step": 950 }, { - "debug/losses": 0.017957579344511032, - "debug/policy_weights": 0.03907625377178192, - "debug/raw_losses": 0.5351197719573975, "epoch": 0.76, "learning_rate": 7.998107906142839e-08, - "logits/chosen": -1.323372483253479, - "logits/rejected": -1.1536943912506104, - "logps/chosen": -389.06451416015625, - "logps/rejected": -442.05029296875, - "loss": 0.0218, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.3945274353027344, - "rewards/margins": 0.5683561563491821, - "rewards/rejected": -2.962883472442627, + "logits/chosen": 0.41448846459388733, + "logits/rejected": 0.705254852771759, + "logps/chosen": -371.27801513671875, + "logps/rejected": -434.56866455078125, + "loss": 0.5236, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.2166616916656494, + "rewards/margins": 0.6714047193527222, + "rewards/rejected": -2.888066530227661, "step": 960 }, { - "debug/losses": 0.023620374500751495, - "debug/policy_weights": 0.0439610555768013, - "debug/raw_losses": 0.537574827671051, "epoch": 0.77, "learning_rate": 7.495178923039396e-08, - "logits/chosen": -1.3938144445419312, - "logits/rejected": -1.2645502090454102, - "logps/chosen": -365.7980651855469, - "logps/rejected": -443.1065979003906, - "loss": 0.0238, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.1868607997894287, - "rewards/margins": 0.6563215255737305, - "rewards/rejected": -2.8431825637817383, + "logits/chosen": 0.23847150802612305, + "logits/rejected": 0.48661884665489197, + "logps/chosen": -366.28179931640625, + "logps/rejected": -462.679443359375, + "loss": 0.5459, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.1916985511779785, + "rewards/margins": 0.8472123146057129, + "rewards/rejected": -3.038910388946533, "step": 970 }, { - "debug/losses": 0.021119868382811546, - "debug/policy_weights": 0.03630395606160164, - "debug/raw_losses": 0.5694866180419922, "epoch": 0.78, "learning_rate": 7.005779153764682e-08, - "logits/chosen": -1.3872743844985962, - "logits/rejected": -1.2704601287841797, - "logps/chosen": -363.4798278808594, - "logps/rejected": -420.29119873046875, - "loss": 0.0252, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.2382142543792725, - "rewards/margins": 0.48818764090538025, - "rewards/rejected": -2.7264022827148438, + "logits/chosen": 0.41438961029052734, + "logits/rejected": 0.6912784576416016, + "logps/chosen": -382.70123291015625, + "logps/rejected": -461.8614807128906, + "loss": 0.5453, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.4304287433624268, + "rewards/margins": 0.7116767764091492, + "rewards/rejected": -3.1421055793762207, "step": 980 }, { - "debug/losses": 0.025250500068068504, - "debug/policy_weights": 0.04387316480278969, - "debug/raw_losses": 0.6040468215942383, "epoch": 0.79, "learning_rate": 6.530286848064698e-08, - "logits/chosen": -1.4193531274795532, - "logits/rejected": -1.3229395151138306, - "logps/chosen": -361.82440185546875, - "logps/rejected": -421.3187561035156, - "loss": 0.0265, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -2.2844254970550537, - "rewards/margins": 0.5003790259361267, - "rewards/rejected": -2.784804582595825, + "logits/chosen": 0.36573725938796997, + "logits/rejected": 0.5834362506866455, + "logps/chosen": -384.49749755859375, + "logps/rejected": -466.30096435546875, + "loss": 0.5528, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.5111565589904785, + "rewards/margins": 0.7234699130058289, + "rewards/rejected": -3.234626054763794, "step": 990 }, { - "debug/losses": 0.021930910646915436, - "debug/policy_weights": 0.03801232576370239, - "debug/raw_losses": 0.5503931045532227, "epoch": 0.8, "learning_rate": 6.069069506815325e-08, - "logits/chosen": -1.4316551685333252, - "logits/rejected": -1.386977195739746, - "logps/chosen": -347.33074951171875, - "logps/rejected": -420.5255432128906, - "loss": 0.0252, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.218000888824463, - "rewards/margins": 0.5753072500228882, - "rewards/rejected": -2.7933077812194824, + "logits/chosen": 0.45530566573143005, + "logits/rejected": 0.5909157991409302, + "logps/chosen": -379.1433410644531, + "logps/rejected": -468.88458251953125, + "loss": 0.527, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.5361268520355225, + "rewards/margins": 0.7407721281051636, + "rewards/rejected": -3.2768986225128174, "step": 1000 }, { "epoch": 0.8, - "eval_debug/losses": 0.02548273280262947, - "eval_debug/policy_weights": 0.04403071105480194, - "eval_debug/raw_losses": 0.5723671913146973, - "eval_logits/chosen": -1.516414999961853, - "eval_logits/rejected": -1.4056260585784912, - "eval_logps/chosen": -362.14508056640625, - "eval_logps/rejected": -420.9573669433594, - "eval_loss": 0.02666386030614376, - "eval_rewards/accuracies": 0.6996268630027771, - "eval_rewards/chosen": -2.175715923309326, - "eval_rewards/margins": 0.5225144028663635, - "eval_rewards/rejected": -2.6982297897338867, - "eval_runtime": 184.2993, - "eval_samples_per_second": 46.403, + "eval_logits/chosen": 0.3871051073074341, + "eval_logits/rejected": 0.6372014284133911, + "eval_logps/chosen": -394.97113037109375, + "eval_logps/rejected": -471.8453674316406, + "eval_loss": 0.5453863739967346, + "eval_rewards/accuracies": 0.70802241563797, + "eval_rewards/chosen": -2.503976345062256, + "eval_rewards/margins": 0.7031334638595581, + "eval_rewards/rejected": -3.2071101665496826, + "eval_runtime": 184.4226, + "eval_samples_per_second": 46.372, "eval_steps_per_second": 0.727, "step": 1000 }, { - "debug/losses": 0.027735576033592224, - "debug/policy_weights": 0.04791461303830147, - "debug/raw_losses": 0.5849307775497437, "epoch": 0.8, "learning_rate": 5.6224835979863714e-08, - "logits/chosen": -1.5229387283325195, - "logits/rejected": -1.378822922706604, - "logps/chosen": -370.7265625, - "logps/rejected": -431.07147216796875, - "loss": 0.0274, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.1310508251190186, - "rewards/margins": 0.5098434686660767, - "rewards/rejected": -2.6408944129943848, + "logits/chosen": 0.31174296140670776, + "logits/rejected": 0.6193565130233765, + "logps/chosen": -390.387451171875, + "logps/rejected": -468.4959411621094, + "loss": 0.5568, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.32766056060791, + "rewards/margins": 0.68747878074646, + "rewards/rejected": -3.015139102935791, "step": 1010 }, { - "debug/losses": 0.02689420059323311, - "debug/policy_weights": 0.04838930815458298, - "debug/raw_losses": 0.5670397877693176, "epoch": 0.81, "learning_rate": 5.190874281132851e-08, - "logits/chosen": -1.60018789768219, - "logits/rejected": -1.419245719909668, - "logps/chosen": -378.01434326171875, - "logps/rejected": -410.55859375, - "loss": 0.0286, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.1182477474212646, - "rewards/margins": 0.5137091875076294, - "rewards/rejected": -2.6319568157196045, + "logits/chosen": 0.22277125716209412, + "logits/rejected": 0.6487134099006653, + "logps/chosen": -402.0958557128906, + "logps/rejected": -448.5992736816406, + "loss": 0.5408, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.359062671661377, + "rewards/margins": 0.6533006429672241, + "rewards/rejected": -3.0123631954193115, "step": 1020 }, { - "debug/losses": 0.02637534961104393, - "debug/policy_weights": 0.04681163281202316, - "debug/raw_losses": 0.5525280833244324, "epoch": 0.82, "learning_rate": 4.774575140626316e-08, - "logits/chosen": -1.5973323583602905, - "logits/rejected": -1.4668004512786865, - "logps/chosen": -343.55364990234375, - "logps/rejected": -404.0575256347656, - "loss": 0.0259, + "logits/chosen": 0.23170511424541473, + "logits/rejected": 0.47184085845947266, + "logps/chosen": -363.46917724609375, + "logps/rejected": -442.47918701171875, + "loss": 0.5309, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.943615198135376, - "rewards/margins": 0.5662528872489929, - "rewards/rejected": -2.5098681449890137, + "rewards/chosen": -2.142770290374756, + "rewards/margins": 0.7513145208358765, + "rewards/rejected": -2.894084930419922, "step": 1030 }, { - "debug/losses": 0.02873830497264862, - "debug/policy_weights": 0.05036206915974617, - "debug/raw_losses": 0.5529695749282837, "epoch": 0.83, "learning_rate": 4.373907927832513e-08, - "logits/chosen": -1.6437633037567139, - "logits/rejected": -1.518031358718872, - "logps/chosen": -358.2055358886719, - "logps/rejected": -397.07135009765625, - "loss": 0.0256, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.970807671546936, - "rewards/margins": 0.4832659661769867, - "rewards/rejected": -2.454073429107666, + "logits/chosen": 0.07573021948337555, + "logits/rejected": 0.32997313141822815, + "logps/chosen": -381.45599365234375, + "logps/rejected": -443.0684509277344, + "loss": 0.5407, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2033116817474365, + "rewards/margins": 0.710732638835907, + "rewards/rejected": -2.914044141769409, "step": 1040 }, { - "debug/losses": 0.024505551904439926, - "debug/policy_weights": 0.046079121530056, - "debug/raw_losses": 0.5681978464126587, "epoch": 0.84, "learning_rate": 3.9891823124345665e-08, - "logits/chosen": -1.5160458087921143, - "logits/rejected": -1.3487160205841064, - "logps/chosen": -352.01373291015625, - "logps/rejected": -399.7527770996094, - "loss": 0.0272, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.088233709335327, - "rewards/margins": 0.5481000542640686, - "rewards/rejected": -2.636333703994751, + "logits/chosen": 0.23884686827659607, + "logits/rejected": 0.6128005385398865, + "logps/chosen": -364.00567626953125, + "logps/rejected": -433.3273010253906, + "loss": 0.5471, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.2081527709960938, + "rewards/margins": 0.7639263868331909, + "rewards/rejected": -2.972079038619995, "step": 1050 }, { - "debug/losses": 0.021924784407019615, - "debug/policy_weights": 0.045226097106933594, - "debug/raw_losses": 0.5537627339363098, "epoch": 0.84, "learning_rate": 3.620695643093924e-08, - "logits/chosen": -1.5006005764007568, - "logits/rejected": -1.3100881576538086, - "logps/chosen": -375.2703552246094, - "logps/rejected": -414.5780334472656, - "loss": 0.0281, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.095370054244995, - "rewards/margins": 0.5610297918319702, - "rewards/rejected": -2.656399965286255, + "logits/chosen": 0.21963253617286682, + "logits/rejected": 0.6894062757492065, + "logps/chosen": -399.5767517089844, + "logps/rejected": -452.88909912109375, + "loss": 0.5154, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.3384335041046143, + "rewards/margins": 0.7010769844055176, + "rewards/rejected": -3.0395102500915527, "step": 1060 }, { - "debug/losses": 0.02421540394425392, - "debug/policy_weights": 0.04291614517569542, - "debug/raw_losses": 0.5675187706947327, "epoch": 0.85, "learning_rate": 3.268732717634032e-08, - "logits/chosen": -1.4694091081619263, - "logits/rejected": -1.3285291194915771, - "logps/chosen": -357.8580017089844, - "logps/rejected": -400.9749450683594, - "loss": 0.0269, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.0889532566070557, - "rewards/margins": 0.5238643884658813, - "rewards/rejected": -2.6128172874450684, + "logits/chosen": 0.3474286198616028, + "logits/rejected": 0.695271372795105, + "logps/chosen": -368.0654602050781, + "logps/rejected": -431.47222900390625, + "loss": 0.5499, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1910276412963867, + "rewards/margins": 0.7267633080482483, + "rewards/rejected": -2.9177908897399902, "step": 1070 }, { - "debug/losses": 0.025431480258703232, - "debug/policy_weights": 0.04796423763036728, - "debug/raw_losses": 0.5536999106407166, "epoch": 0.86, "learning_rate": 2.9335655629243645e-08, - "logits/chosen": -1.509301781654358, - "logits/rejected": -1.338833212852478, - "logps/chosen": -377.6971435546875, - "logps/rejected": -418.16839599609375, - "loss": 0.026, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.142178535461426, - "rewards/margins": 0.5537933707237244, - "rewards/rejected": -2.695971727371216, + "logits/chosen": 0.2347393035888672, + "logits/rejected": 0.5894696712493896, + "logps/chosen": -388.94757080078125, + "logps/rejected": -447.3855895996094, + "loss": 0.525, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.254683017730713, + "rewards/margins": 0.7334609031677246, + "rewards/rejected": -2.9881439208984375, "step": 1080 }, { - "debug/losses": 0.02289220504462719, - "debug/policy_weights": 0.043845418840646744, - "debug/raw_losses": 0.5494796633720398, "epoch": 0.87, "learning_rate": 2.6154532246349476e-08, - "logits/chosen": -1.4601554870605469, - "logits/rejected": -1.27918541431427, - "logps/chosen": -345.5830993652344, - "logps/rejected": -398.8944091796875, - "loss": 0.027, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.0374293327331543, - "rewards/margins": 0.5635212659835815, - "rewards/rejected": -2.6009507179260254, + "logits/chosen": 0.25378522276878357, + "logits/rejected": 0.5771256685256958, + "logps/chosen": -358.50640869140625, + "logps/rejected": -431.145751953125, + "loss": 0.5462, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1666626930236816, + "rewards/margins": 0.756801426410675, + "rewards/rejected": -2.923464059829712, "step": 1090 }, { - "debug/losses": 0.029651587828993797, - "debug/policy_weights": 0.05081306770443916, - "debug/raw_losses": 0.567287802696228, "epoch": 0.88, "learning_rate": 2.31464156702382e-08, - "logits/chosen": -1.4290579557418823, - "logits/rejected": -1.2941869497299194, - "logps/chosen": -358.2601318359375, - "logps/rejected": -411.37335205078125, - "loss": 0.0285, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.1656134128570557, - "rewards/margins": 0.528947651386261, - "rewards/rejected": -2.694561004638672, + "logits/chosen": 0.35370689630508423, + "logits/rejected": 0.5671936273574829, + "logps/chosen": -363.0, + "logps/rejected": -438.209228515625, + "loss": 0.5487, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2130119800567627, + "rewards/margins": 0.7499077916145325, + "rewards/rejected": -2.9629194736480713, "step": 1100 }, { "epoch": 0.88, - "eval_debug/losses": 0.025445688515901566, - "eval_debug/policy_weights": 0.04400235787034035, - "eval_debug/raw_losses": 0.5706825852394104, - "eval_logits/chosen": -1.4793744087219238, - "eval_logits/rejected": -1.3619236946105957, - "eval_logps/chosen": -361.6648864746094, - "eval_logps/rejected": -420.7030029296875, - "eval_loss": 0.026664981618523598, - "eval_rewards/accuracies": 0.6996268630027771, - "eval_rewards/chosen": -2.1709134578704834, - "eval_rewards/margins": 0.5247728228569031, - "eval_rewards/rejected": -2.6956863403320312, - "eval_runtime": 184.3232, - "eval_samples_per_second": 46.397, - "eval_steps_per_second": 0.727, + "eval_logits/chosen": 0.1857856959104538, + "eval_logits/rejected": 0.43363669514656067, + "eval_logps/chosen": -373.08306884765625, + "eval_logps/rejected": -450.7598876953125, + "eval_loss": 0.5444055199623108, + "eval_rewards/accuracies": 0.7089552283287048, + "eval_rewards/chosen": -2.285095453262329, + "eval_rewards/margins": 0.711159884929657, + "eval_rewards/rejected": -2.996255397796631, + "eval_runtime": 184.462, + "eval_samples_per_second": 46.362, + "eval_steps_per_second": 0.726, "step": 1100 }, { - "debug/losses": 0.026189718395471573, - "debug/policy_weights": 0.04181275889277458, - "debug/raw_losses": 0.6016842126846313, "epoch": 0.88, "learning_rate": 2.031363082912252e-08, - "logits/chosen": -1.499267816543579, - "logits/rejected": -1.3394094705581665, - "logps/chosen": -363.6783447265625, - "logps/rejected": -398.1620178222656, - "loss": 0.0265, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -2.1579599380493164, - "rewards/margins": 0.42903074622154236, - "rewards/rejected": -2.5869908332824707, + "logits/chosen": 0.070524200797081, + "logits/rejected": 0.4635602533817291, + "logps/chosen": -373.29327392578125, + "logps/rejected": -426.85552978515625, + "loss": 0.5513, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.2541089057922363, + "rewards/margins": 0.6198171973228455, + "rewards/rejected": -2.8739261627197266, "step": 1110 }, { - "debug/losses": 0.022603105753660202, - "debug/policy_weights": 0.040678009390830994, - "debug/raw_losses": 0.596676766872406, "epoch": 0.89, "learning_rate": 1.7658367139945228e-08, - "logits/chosen": -1.449702262878418, - "logits/rejected": -1.3108792304992676, - "logps/chosen": -380.9664001464844, - "logps/rejected": -442.7220764160156, - "loss": 0.0262, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.2784924507141113, - "rewards/margins": 0.5700411796569824, - "rewards/rejected": -2.8485333919525146, + "logits/chosen": 0.2600646913051605, + "logits/rejected": 0.5517584681510925, + "logps/chosen": -390.8568115234375, + "logps/rejected": -462.80828857421875, + "loss": 0.5471, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.377396821975708, + "rewards/margins": 0.6719989776611328, + "rewards/rejected": -3.049395799636841, "step": 1120 }, { - "debug/losses": 0.027733484283089638, - "debug/policy_weights": 0.051148880273103714, - "debug/raw_losses": 0.5847379565238953, "epoch": 0.9, "learning_rate": 1.5182676816211632e-08, - "logits/chosen": -1.5339306592941284, - "logits/rejected": -1.3667137622833252, - "logps/chosen": -367.2829284667969, - "logps/rejected": -413.9859313964844, - "loss": 0.0273, + "logits/chosen": 0.04413030296564102, + "logits/rejected": 0.30151715874671936, + "logps/chosen": -382.0662536621094, + "logps/rejected": -447.08673095703125, + "loss": 0.5431, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.0808215141296387, - "rewards/margins": 0.5094277858734131, - "rewards/rejected": -2.590249538421631, + "rewards/chosen": -2.228654384613037, + "rewards/margins": 0.6926024556159973, + "rewards/rejected": -2.9212570190429688, "step": 1130 }, { - "debug/losses": 0.022030893713235855, - "debug/policy_weights": 0.04428356885910034, - "debug/raw_losses": 0.5199077129364014, "epoch": 0.91, "learning_rate": 1.2888473281864597e-08, - "logits/chosen": -1.4379560947418213, - "logits/rejected": -1.2951207160949707, - "logps/chosen": -360.4403991699219, - "logps/rejected": -416.469970703125, - "loss": 0.0259, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.179405689239502, - "rewards/margins": 0.61573326587677, - "rewards/rejected": -2.7951388359069824, + "logits/chosen": 0.14212054014205933, + "logits/rejected": 0.47429710626602173, + "logps/chosen": -367.8409729003906, + "logps/rejected": -435.02764892578125, + "loss": 0.5369, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.2534115314483643, + "rewards/margins": 0.7273036241531372, + "rewards/rejected": -2.980715274810791, "step": 1140 }, { - "debug/losses": 0.026605481281876564, - "debug/policy_weights": 0.04820040613412857, - "debug/raw_losses": 0.5693732500076294, "epoch": 0.92, "learning_rate": 1.0777529692427679e-08, - "logits/chosen": -1.5079468488693237, - "logits/rejected": -1.4001545906066895, - "logps/chosen": -359.7943420410156, - "logps/rejected": -421.87249755859375, - "loss": 0.0232, + "logits/chosen": 0.04115242511034012, + "logits/rejected": 0.28970104455947876, + "logps/chosen": -372.7949523925781, + "logps/rejected": -456.10675048828125, + "loss": 0.5265, "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.1703505516052246, - "rewards/margins": 0.5936353206634521, - "rewards/rejected": -2.7639856338500977, + "rewards/chosen": -2.300356388092041, + "rewards/margins": 0.8059718012809753, + "rewards/rejected": -3.106328248977661, "step": 1150 }, { - "debug/losses": 0.024028832092881203, - "debug/policy_weights": 0.03998067229986191, - "debug/raw_losses": 0.591817319393158, "epoch": 0.92, "learning_rate": 8.851477564560061e-09, - "logits/chosen": -1.4518392086029053, - "logits/rejected": -1.2875111103057861, - "logps/chosen": -360.74237060546875, - "logps/rejected": -402.0259094238281, - "loss": 0.0256, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.2179458141326904, - "rewards/margins": 0.5185562968254089, - "rewards/rejected": -2.736502170562744, + "logits/chosen": 0.0867738351225853, + "logits/rejected": 0.4068300127983093, + "logps/chosen": -372.08636474609375, + "logps/rejected": -426.42388916015625, + "loss": 0.5342, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.331385850906372, + "rewards/margins": 0.6490964293479919, + "rewards/rejected": -2.9804821014404297, "step": 1160 }, { - "debug/losses": 0.022491086274385452, - "debug/policy_weights": 0.03993768244981766, - "debug/raw_losses": 0.5183846354484558, "epoch": 0.93, "learning_rate": 7.111805515081531e-09, - "logits/chosen": -1.5072176456451416, - "logits/rejected": -1.278832197189331, - "logps/chosen": -353.49365234375, - "logps/rejected": -418.4107971191406, - "loss": 0.0252, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.133981227874756, - "rewards/margins": 0.6634694337844849, - "rewards/rejected": -2.7974507808685303, + "logits/chosen": 0.02022993005812168, + "logits/rejected": 0.41968393325805664, + "logps/chosen": -363.818603515625, + "logps/rejected": -447.7919006347656, + "loss": 0.5312, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.2372307777404785, + "rewards/margins": 0.8540315628051758, + "rewards/rejected": -3.0912623405456543, "step": 1170 }, { - "debug/losses": 0.019128387793898582, - "debug/policy_weights": 0.036281704902648926, - "debug/raw_losses": 0.5611279010772705, "epoch": 0.94, "learning_rate": 5.559858110443016e-09, - "logits/chosen": -1.3955036401748657, - "logits/rejected": -1.1885672807693481, - "logps/chosen": -368.412353515625, - "logps/rejected": -412.42144775390625, - "loss": 0.0246, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.2694032192230225, - "rewards/margins": 0.5472678542137146, - "rewards/rejected": -2.8166708946228027, + "logits/chosen": 0.29695388674736023, + "logits/rejected": 0.714096188545227, + "logps/chosen": -372.5519714355469, + "logps/rejected": -442.5354919433594, + "loss": 0.5383, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.3107995986938477, + "rewards/margins": 0.8070123791694641, + "rewards/rejected": -3.117811918258667, "step": 1180 }, { - "debug/losses": 0.021850595250725746, - "debug/policy_weights": 0.04237665981054306, - "debug/raw_losses": 0.5156761407852173, "epoch": 0.95, "learning_rate": 4.196834827531276e-09, - "logits/chosen": -1.4580028057098389, - "logits/rejected": -1.3392555713653564, - "logps/chosen": -350.61151123046875, - "logps/rejected": -427.6637268066406, - "loss": 0.0253, + "logits/chosen": 0.140055850148201, + "logits/rejected": 0.3409932255744934, + "logps/chosen": -355.64324951171875, + "logps/rejected": -447.585693359375, + "loss": 0.5152, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.09883451461792, - "rewards/margins": 0.6415940523147583, - "rewards/rejected": -2.7404284477233887, + "rewards/chosen": -2.149151563644409, + "rewards/margins": 0.7904965877532959, + "rewards/rejected": -2.939648151397705, "step": 1190 }, { - "debug/losses": 0.02618267573416233, - "debug/policy_weights": 0.04739568382501602, - "debug/raw_losses": 0.5612160563468933, "epoch": 0.96, "learning_rate": 3.023789126611137e-09, - "logits/chosen": -1.5009291172027588, - "logits/rejected": -1.360892653465271, - "logps/chosen": -351.302978515625, - "logps/rejected": -406.3242492675781, - "loss": 0.0253, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -2.161158323287964, - "rewards/margins": 0.53586745262146, - "rewards/rejected": -2.697025775909424, + "logits/chosen": 0.03294936567544937, + "logits/rejected": 0.2933207154273987, + "logps/chosen": -363.29290771484375, + "logps/rejected": -435.640380859375, + "loss": 0.5483, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.281057357788086, + "rewards/margins": 0.7091296911239624, + "rewards/rejected": -2.990186929702759, "step": 1200 }, { "epoch": 0.96, - "eval_debug/losses": 0.024837609380483627, - "eval_debug/policy_weights": 0.04291440546512604, - "eval_debug/raw_losses": 0.5705610513687134, - "eval_logits/chosen": -1.4822590351104736, - "eval_logits/rejected": -1.3643442392349243, - "eval_logps/chosen": -363.91851806640625, - "eval_logps/rejected": -422.5250549316406, - "eval_loss": 0.026016969233751297, - "eval_rewards/accuracies": 0.7033582329750061, - "eval_rewards/chosen": -2.1934494972229004, - "eval_rewards/margins": 0.5204570889472961, - "eval_rewards/rejected": -2.713906764984131, - "eval_runtime": 184.297, - "eval_samples_per_second": 46.403, + "eval_logits/chosen": 0.07418080419301987, + "eval_logits/rejected": 0.32435521483421326, + "eval_logps/chosen": -373.978515625, + "eval_logps/rejected": -451.6764831542969, + "eval_loss": 0.5440130829811096, + "eval_rewards/accuracies": 0.7089552283287048, + "eval_rewards/chosen": -2.2940499782562256, + "eval_rewards/margins": 0.7113713622093201, + "eval_rewards/rejected": -3.0054211616516113, + "eval_runtime": 184.3589, + "eval_samples_per_second": 46.388, "eval_steps_per_second": 0.727, "step": 1200 }, { - "debug/losses": 0.022863609716296196, - "debug/policy_weights": 0.04306855797767639, - "debug/raw_losses": 0.5241585969924927, "epoch": 0.96, "learning_rate": 2.041627637121929e-09, - "logits/chosen": -1.470323920249939, - "logits/rejected": -1.3386765718460083, - "logps/chosen": -346.5189208984375, - "logps/rejected": -412.08380126953125, - "loss": 0.0248, + "logits/chosen": 0.10010697692632675, + "logits/rejected": 0.3795483410358429, + "logps/chosen": -348.8675231933594, + "logps/rejected": -437.20361328125, + "loss": 0.5398, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.0638785362243652, - "rewards/margins": 0.6006760597229004, - "rewards/rejected": -2.6645545959472656, + "rewards/chosen": -2.087364673614502, + "rewards/margins": 0.828387439250946, + "rewards/rejected": -2.9157521724700928, "step": 1210 }, { - "debug/losses": 0.023648496717214584, - "debug/policy_weights": 0.04211193323135376, - "debug/raw_losses": 0.5517353415489197, "epoch": 0.97, "learning_rate": 1.2511094569571668e-09, - "logits/chosen": -1.420866847038269, - "logits/rejected": -1.243516206741333, - "logps/chosen": -378.2394104003906, - "logps/rejected": -423.1949157714844, - "loss": 0.0263, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.234367609024048, - "rewards/margins": 0.5510514974594116, - "rewards/rejected": -2.78541898727417, + "logits/chosen": 0.09991980344057083, + "logits/rejected": 0.4467397630214691, + "logps/chosen": -380.14520263671875, + "logps/rejected": -440.24658203125, + "loss": 0.5345, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.253425121307373, + "rewards/margins": 0.702509880065918, + "rewards/rejected": -2.955935001373291, "step": 1220 }, { - "debug/losses": 0.025155076757073402, - "debug/policy_weights": 0.03890926018357277, - "debug/raw_losses": 0.6175922155380249, "epoch": 0.98, "learning_rate": 6.528455657691112e-10, - "logits/chosen": -1.4460153579711914, - "logits/rejected": -1.3199143409729004, - "logps/chosen": -366.03570556640625, - "logps/rejected": -397.4085388183594, - "loss": 0.0254, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.213092565536499, - "rewards/margins": 0.3978874087333679, - "rewards/rejected": -2.6109797954559326, + "logits/chosen": 0.11626466363668442, + "logits/rejected": 0.41348797082901, + "logps/chosen": -372.7298889160156, + "logps/rejected": -427.22576904296875, + "loss": 0.549, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2800345420837402, + "rewards/margins": 0.6291176080703735, + "rewards/rejected": -2.909151792526245, "step": 1230 }, { - "debug/losses": 0.027021676301956177, - "debug/policy_weights": 0.05005006864666939, - "debug/raw_losses": 0.5574381947517395, "epoch": 0.99, "learning_rate": 2.4729835275189016e-10, - "logits/chosen": -1.5190496444702148, - "logits/rejected": -1.406127691268921, - "logps/chosen": -380.56854248046875, - "logps/rejected": -438.9554748535156, - "loss": 0.0253, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.190269947052002, - "rewards/margins": 0.5501295924186707, - "rewards/rejected": -2.7403993606567383, + "logits/chosen": 0.06715863198041916, + "logits/rejected": 0.29241910576820374, + "logps/chosen": -393.8903503417969, + "logps/rejected": -477.9420471191406, + "loss": 0.5462, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.323488473892212, + "rewards/margins": 0.8067766427993774, + "rewards/rejected": -3.1302647590637207, "step": 1240 }, { - "debug/losses": 0.027259517461061478, - "debug/policy_weights": 0.04358803108334541, - "debug/raw_losses": 0.6284344792366028, "epoch": 0.99, "learning_rate": 3.478125926756337e-11, - "logits/chosen": -1.3990485668182373, - "logits/rejected": -1.2819174528121948, - "logps/chosen": -360.79443359375, - "logps/rejected": -410.0205078125, - "loss": 0.0254, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.1959283351898193, - "rewards/margins": 0.4730054438114166, - "rewards/rejected": -2.668933629989624, + "logits/chosen": 0.25983649492263794, + "logits/rejected": 0.4905417561531067, + "logps/chosen": -364.73431396484375, + "logps/rejected": -443.79296875, + "loss": 0.5474, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2353272438049316, + "rewards/margins": 0.771331787109375, + "rewards/rejected": -3.0066590309143066, "step": 1250 }, { "epoch": 1.0, "step": 1256, "total_flos": 0.0, - "train_loss": 0.049385896711877195, - "train_runtime": 11542.5774, - "train_samples_per_second": 13.931, + "train_loss": 0.5712926928784438, + "train_runtime": 11573.5451, + "train_samples_per_second": 13.894, "train_steps_per_second": 0.109 } ],