diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -9,17 +9,14 @@ "is_world_process_zero": true, "log_history": [ { - "debug/losses": 0.11528982222080231, - "debug/policy_weights": 0.1663280427455902, - "debug/raw_losses": 0.6931471824645996, "epoch": 0.0007958615200955034, - "grad_norm": 0.8616884804965014, + "grad_norm": 4.986090946069212, "learning_rate": 3.968253968253968e-09, "logits/chosen": -2.735659122467041, "logits/rejected": -2.7581238746643066, "logps/chosen": -124.62968444824219, "logps/rejected": -168.09475708007812, - "loss": 0.1109, + "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -27,2490 +24,2079 @@ "step": 1 }, { - "debug/losses": 0.11954596638679504, - "debug/policy_weights": 0.17245811223983765, - "debug/raw_losses": 0.6931769847869873, "epoch": 0.007958615200955034, - "grad_norm": 0.8510719579827503, + "grad_norm": 5.160120887614425, "learning_rate": 3.968253968253968e-08, - "logits/chosen": -2.7387959957122803, - "logits/rejected": -2.727739095687866, - "logps/chosen": -146.7299346923828, - "logps/rejected": -131.23277282714844, - "loss": 0.1139, - "rewards/accuracies": 0.3958333432674408, - "rewards/chosen": -0.000277455139439553, - "rewards/margins": -5.68832183489576e-05, - "rewards/rejected": -0.00022057195019442588, + "logits/chosen": -2.738856315612793, + "logits/rejected": -2.7277917861938477, + "logps/chosen": -146.72731018066406, + "logps/rejected": -131.20956420898438, + "loss": 0.6931, + "rewards/accuracies": 0.4444444477558136, + "rewards/chosen": -0.00025126771652139723, + "rewards/margins": -0.0002628751390147954, + "rewards/rejected": 1.1607427040871698e-05, "step": 10 }, { - "debug/losses": 0.11194368451833725, - "debug/policy_weights": 0.1614898294210434, - "debug/raw_losses": 0.6931231021881104, "epoch": 0.01591723040191007, - "grad_norm": 0.8013573883996123, + "grad_norm": 4.941410796258004, "learning_rate": 7.936507936507936e-08, - "logits/chosen": -2.7066047191619873, - "logits/rejected": -2.703556776046753, - "logps/chosen": -129.4934539794922, - "logps/rejected": -130.2837371826172, - "loss": 0.1096, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -0.00025156832998618484, - "rewards/margins": 5.059631075710058e-05, - "rewards/rejected": -0.0003021646407432854, + "logits/chosen": -2.7067627906799316, + "logits/rejected": -2.7038016319274902, + "logps/chosen": -129.4619598388672, + "logps/rejected": -130.26687622070312, + "loss": 0.6932, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 6.337546074064448e-05, + "rewards/margins": 0.00019685756706167012, + "rewards/rejected": -0.00013348212814889848, "step": 20 }, { - "debug/losses": 0.10068635642528534, - "debug/policy_weights": 0.14529208838939667, - "debug/raw_losses": 0.6929429769515991, "epoch": 0.0238758456028651, - "grad_norm": 0.7733830793750671, + "grad_norm": 4.798328591529667, "learning_rate": 1.1904761904761903e-07, - "logits/chosen": -2.683795690536499, - "logits/rejected": -2.680772542953491, - "logps/chosen": -141.83145141601562, - "logps/rejected": -155.7014923095703, - "loss": 0.111, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": 3.1686854526924435e-06, - "rewards/margins": 0.0004113609029445797, - "rewards/rejected": -0.0004081922525074333, + "logits/chosen": -2.684114694595337, + "logits/rejected": -2.6811366081237793, + "logps/chosen": -141.76788330078125, + "logps/rejected": -155.64646911621094, + "loss": 0.693, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.0006387188332155347, + "rewards/margins": 0.0004968013381585479, + "rewards/rejected": 0.00014191746595315635, "step": 30 }, { - "debug/losses": 0.1046997457742691, - "debug/policy_weights": 0.15105444192886353, - "debug/raw_losses": 0.6930674314498901, "epoch": 0.03183446080382014, - "grad_norm": 0.8523933397333024, + "grad_norm": 5.077340975164852, "learning_rate": 1.5873015873015872e-07, - "logits/chosen": -2.6914453506469727, - "logits/rejected": -2.683753252029419, - "logps/chosen": -155.02069091796875, - "logps/rejected": -164.1700439453125, - "loss": 0.1069, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -0.0019863867200911045, - "rewards/margins": 0.00016431634139735252, - "rewards/rejected": -0.002150703454390168, + "logits/chosen": -2.69206166267395, + "logits/rejected": -2.684312343597412, + "logps/chosen": -154.81820678710938, + "logps/rejected": -164.00318908691406, + "loss": 0.6928, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 3.8339756429195404e-05, + "rewards/margins": 0.0005206236382946372, + "rewards/rejected": -0.00048228385276161134, "step": 40 }, { - "debug/losses": 0.11338607221841812, - "debug/policy_weights": 0.16372892260551453, - "debug/raw_losses": 0.6922137141227722, "epoch": 0.03979307600477517, - "grad_norm": 0.7129242337063245, + "grad_norm": 4.879872886513438, "learning_rate": 1.984126984126984e-07, - "logits/chosen": -2.7061779499053955, - "logits/rejected": -2.6873795986175537, - "logps/chosen": -144.10765075683594, - "logps/rejected": -137.78359985351562, - "loss": 0.1088, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.0038405258674174547, - "rewards/margins": 0.00187877775169909, - "rewards/rejected": -0.005719303619116545, + "logits/chosen": -2.7071824073791504, + "logits/rejected": -2.688455820083618, + "logps/chosen": -143.71075439453125, + "logps/rejected": -137.49859619140625, + "loss": 0.6923, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.00012845727906096727, + "rewards/margins": 0.002997648436576128, + "rewards/rejected": -0.0028691913466900587, "step": 50 }, { - "debug/losses": 0.10943502187728882, - "debug/policy_weights": 0.15804630517959595, - "debug/raw_losses": 0.6921452283859253, "epoch": 0.0477516912057302, - "grad_norm": 0.735644435219337, + "grad_norm": 4.800891825988376, "learning_rate": 2.3809523809523806e-07, - "logits/chosen": -2.714186906814575, - "logits/rejected": -2.715027093887329, - "logps/chosen": -146.23638916015625, - "logps/rejected": -159.6914825439453, - "loss": 0.1042, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.006097087636590004, - "rewards/margins": 0.0020311239641159773, - "rewards/rejected": -0.0081282127648592, + "logits/chosen": -2.715952157974243, + "logits/rejected": -2.7166202068328857, + "logps/chosen": -145.3311309814453, + "logps/rejected": -158.9158935546875, + "loss": 0.6912, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.002955437172204256, + "rewards/margins": 0.003327813697978854, + "rewards/rejected": -0.00037237658398225904, "step": 60 }, { - "debug/losses": 0.1017540916800499, - "debug/policy_weights": 0.14726726710796356, - "debug/raw_losses": 0.6908984780311584, "epoch": 0.055710306406685235, - "grad_norm": 0.7373206708215507, + "grad_norm": 5.073043436012192, "learning_rate": 2.7777777777777776e-07, - "logits/chosen": -2.7332382202148438, - "logits/rejected": -2.7244322299957275, - "logps/chosen": -149.62216186523438, - "logps/rejected": -143.61502075195312, - "loss": 0.0982, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.019838612526655197, - "rewards/margins": 0.004651274066418409, - "rewards/rejected": -0.024489887058734894, + "logits/chosen": -2.737873077392578, + "logits/rejected": -2.728829860687256, + "logps/chosen": -148.14971923828125, + "logps/rejected": -142.3303985595703, + "loss": 0.6892, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.005114335101097822, + "rewards/margins": 0.0065292841754853725, + "rewards/rejected": -0.01164362020790577, "step": 70 }, { - "debug/losses": 0.07933579385280609, - "debug/policy_weights": 0.1148357018828392, - "debug/raw_losses": 0.691461980342865, "epoch": 0.06366892160764027, - "grad_norm": 0.7380219079491483, + "grad_norm": 5.019983039590536, "learning_rate": 3.1746031746031743e-07, - "logits/chosen": -2.7057547569274902, - "logits/rejected": -2.6872401237487793, - "logps/chosen": -158.06829833984375, - "logps/rejected": -149.4662628173828, - "loss": 0.0901, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -0.049340587109327316, - "rewards/margins": 0.003763629589229822, - "rewards/rejected": -0.0531042218208313, + "logits/chosen": -2.7146174907684326, + "logits/rejected": -2.6961400508880615, + "logps/chosen": -155.6132049560547, + "logps/rejected": -147.08509826660156, + "loss": 0.6878, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.024789467453956604, + "rewards/margins": 0.004503136035054922, + "rewards/rejected": -0.029292598366737366, "step": 80 }, { - "debug/losses": 0.08148403465747833, - "debug/policy_weights": 0.11854945123195648, - "debug/raw_losses": 0.6848093271255493, "epoch": 0.07162753680859531, - "grad_norm": 0.7584917889993195, + "grad_norm": 5.340063734876701, "learning_rate": 3.5714285714285716e-07, - "logits/chosen": -2.7153098583221436, - "logits/rejected": -2.7220914363861084, - "logps/chosen": -152.84352111816406, - "logps/rejected": -173.2491455078125, - "loss": 0.0776, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.06997288763523102, - "rewards/margins": 0.017739882692694664, - "rewards/rejected": -0.08771277964115143, + "logits/chosen": -2.7231202125549316, + "logits/rejected": -2.729904890060425, + "logps/chosen": -149.95083618164062, + "logps/rejected": -170.76242065429688, + "loss": 0.683, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.04104585200548172, + "rewards/margins": 0.021799778565764427, + "rewards/rejected": -0.0628456324338913, "step": 90 }, { - "debug/losses": 0.06477204710245132, - "debug/policy_weights": 0.09502540528774261, - "debug/raw_losses": 0.6833306550979614, "epoch": 0.07958615200955034, - "grad_norm": 0.6766442230291091, + "grad_norm": 5.627078193282986, "learning_rate": 3.968253968253968e-07, - "logits/chosen": -2.684262752532959, - "logits/rejected": -2.6679141521453857, - "logps/chosen": -149.81607055664062, - "logps/rejected": -142.66061401367188, - "loss": 0.0684, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.1200411468744278, - "rewards/margins": 0.021218357607722282, - "rewards/rejected": -0.14125947654247284, + "logits/chosen": -2.679619312286377, + "logits/rejected": -2.6625077724456787, + "logps/chosen": -147.9279022216797, + "logps/rejected": -142.6844482421875, + "loss": 0.679, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.10115940868854523, + "rewards/margins": 0.04033854603767395, + "rewards/rejected": -0.14149793982505798, "step": 100 }, { "epoch": 0.07958615200955034, - "eval_debug/losses": 0.061516378074884415, - "eval_debug/policy_weights": 0.08996383100748062, - "eval_debug/raw_losses": 0.6826685070991516, - "eval_logits/chosen": -2.713895320892334, - "eval_logits/rejected": -2.7060160636901855, - "eval_logps/chosen": -158.206298828125, - "eval_logps/rejected": -167.00390625, - "eval_loss": 0.06231542304158211, - "eval_rewards/accuracies": 0.6035447716712952, - "eval_rewards/chosen": -0.13962814211845398, - "eval_rewards/margins": 0.02342102862894535, - "eval_rewards/rejected": -0.16304917633533478, - "eval_runtime": 153.8511, - "eval_samples_per_second": 55.586, - "eval_steps_per_second": 0.871, + "eval_logits/chosen": -2.6942806243896484, + "eval_logits/rejected": -2.686227798461914, + "eval_logps/chosen": -158.6035614013672, + "eval_logps/rejected": -168.8750457763672, + "eval_loss": 0.6759105324745178, + "eval_rewards/accuracies": 0.5998134613037109, + "eval_rewards/chosen": -0.1436006873846054, + "eval_rewards/margins": 0.03815995156764984, + "eval_rewards/rejected": -0.18176063895225525, + "eval_runtime": 153.1197, + "eval_samples_per_second": 55.852, + "eval_steps_per_second": 0.875, "step": 100 }, { - "debug/losses": 0.05575896054506302, - "debug/policy_weights": 0.08116874098777771, - "debug/raw_losses": 0.6914528012275696, "epoch": 0.08754476721050537, - "grad_norm": 0.5916448940303697, + "grad_norm": 6.93010288991277, "learning_rate": 4.365079365079365e-07, - "logits/chosen": -2.698333263397217, - "logits/rejected": -2.679625988006592, - "logps/chosen": -176.5699005126953, - "logps/rejected": -161.7273406982422, - "loss": 0.0549, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -0.18279142677783966, - "rewards/margins": 0.006374613847583532, - "rewards/rejected": -0.18916605412960052, + "logits/chosen": -2.6722376346588135, + "logits/rejected": -2.652876138687134, + "logps/chosen": -177.9219970703125, + "logps/rejected": -164.42129516601562, + "loss": 0.6754, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19631250202655792, + "rewards/margins": 0.019793253391981125, + "rewards/rejected": -0.21610574424266815, "step": 110 }, { - "debug/losses": 0.0522412434220314, - "debug/policy_weights": 0.0772034078836441, - "debug/raw_losses": 0.6850447654724121, "epoch": 0.0955033824114604, - "grad_norm": 0.6262223454672763, + "grad_norm": 9.118856419956929, "learning_rate": 4.761904761904761e-07, - "logits/chosen": -2.6780261993408203, - "logits/rejected": -2.6650888919830322, - "logps/chosen": -161.04185485839844, - "logps/rejected": -164.0379638671875, - "loss": 0.0505, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.21910080313682556, - "rewards/margins": 0.02164912037551403, - "rewards/rejected": -0.24074992537498474, + "logits/chosen": -2.6485350131988525, + "logits/rejected": -2.6367721557617188, + "logps/chosen": -167.44517517089844, + "logps/rejected": -174.68959045410156, + "loss": 0.6673, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2831340432167053, + "rewards/margins": 0.0641321912407875, + "rewards/rejected": -0.34726622700691223, "step": 120 }, { - "debug/losses": 0.0441550612449646, - "debug/policy_weights": 0.06524286419153214, - "debug/raw_losses": 0.6730135679244995, "epoch": 0.10346199761241544, - "grad_norm": 0.688795923683336, + "grad_norm": 14.535361780051513, "learning_rate": 4.999845414634076e-07, - "logits/chosen": -2.697788953781128, - "logits/rejected": -2.6717123985290527, - "logps/chosen": -178.5142059326172, - "logps/rejected": -164.13534545898438, - "loss": 0.0469, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.23604664206504822, - "rewards/margins": 0.048365212976932526, - "rewards/rejected": -0.28441184759140015, + "logits/chosen": -2.6754541397094727, + "logits/rejected": -2.646840810775757, + "logps/chosen": -180.4161376953125, + "logps/rejected": -173.7825164794922, + "loss": 0.647, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.2550658583641052, + "rewards/margins": 0.12581779062747955, + "rewards/rejected": -0.38088366389274597, "step": 130 }, { - "debug/losses": 0.034053437411785126, - "debug/policy_weights": 0.05063674598932266, - "debug/raw_losses": 0.6727820038795471, "epoch": 0.11142061281337047, - "grad_norm": 0.5644556850252691, + "grad_norm": 9.275457035342084, "learning_rate": 4.998106548810311e-07, - "logits/chosen": -2.6465182304382324, - "logits/rejected": -2.624471426010132, - "logps/chosen": -181.93539428710938, - "logps/rejected": -166.69976806640625, - "loss": 0.0366, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.3633270859718323, - "rewards/margins": 0.050895463675260544, - "rewards/rejected": -0.4142225682735443, + "logits/chosen": -2.5903968811035156, + "logits/rejected": -2.5657451152801514, + "logps/chosen": -212.9225311279297, + "logps/rejected": -203.63796997070312, + "loss": 0.6494, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6731986999511719, + "rewards/margins": 0.11040596663951874, + "rewards/rejected": -0.783604621887207, "step": 140 }, { - "debug/losses": 0.024495644494891167, - "debug/policy_weights": 0.03860057145357132, - "debug/raw_losses": 0.6439628005027771, "epoch": 0.1193792280143255, - "grad_norm": 0.7107338165103922, + "grad_norm": 14.605167530290739, "learning_rate": 4.994436933879359e-07, - "logits/chosen": -2.622201919555664, - "logits/rejected": -2.616553544998169, - "logps/chosen": -181.92910766601562, - "logps/rejected": -199.99267578125, - "loss": 0.0267, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.4238983690738678, - "rewards/margins": 0.11615456640720367, - "rewards/rejected": -0.5400528907775879, + "logits/chosen": -2.510530948638916, + "logits/rejected": -2.503571033477783, + "logps/chosen": -217.52725219726562, + "logps/rejected": -246.1698455810547, + "loss": 0.6321, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7798799276351929, + "rewards/margins": 0.22194448113441467, + "rewards/rejected": -1.0018242597579956, "step": 150 }, { - "debug/losses": 0.02472759038209915, - "debug/policy_weights": 0.03820445016026497, - "debug/raw_losses": 0.6442066431045532, "epoch": 0.12733784321528055, - "grad_norm": 0.49576657988459333, + "grad_norm": 13.975831761557039, "learning_rate": 4.988839406031596e-07, - "logits/chosen": -2.6254782676696777, - "logits/rejected": -2.6356091499328613, - "logps/chosen": -166.76321411132812, - "logps/rejected": -206.20278930664062, - "loss": 0.0277, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.4612468183040619, - "rewards/margins": 0.12736964225769043, - "rewards/rejected": -0.5886164307594299, + "logits/chosen": -2.507648468017578, + "logits/rejected": -2.5182366371154785, + "logps/chosen": -202.53158569335938, + "logps/rejected": -266.3342590332031, + "loss": 0.6275, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8189308047294617, + "rewards/margins": 0.371000200510025, + "rewards/rejected": -1.189931035041809, "step": 160 }, { - "debug/losses": 0.010120457038283348, - "debug/policy_weights": 0.01634625717997551, - "debug/raw_losses": 0.6580480337142944, "epoch": 0.13529645841623558, - "grad_norm": 0.4478115719879286, + "grad_norm": 19.875830962562063, "learning_rate": 4.981318291512395e-07, - "logits/chosen": -2.596377372741699, - "logits/rejected": -2.5937087535858154, - "logps/chosen": -211.00350952148438, - "logps/rejected": -229.0654296875, - "loss": 0.0132, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.745928943157196, - "rewards/margins": 0.09528591483831406, - "rewards/rejected": -0.8412148356437683, + "logits/chosen": -2.433882474899292, + "logits/rejected": -2.430387020111084, + "logps/chosen": -218.954833984375, + "logps/rejected": -256.6148681640625, + "loss": 0.6212, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8254417181015015, + "rewards/margins": 0.2912672758102417, + "rewards/rejected": -1.1167091131210327, "step": 170 }, { - "debug/losses": 0.01582014001905918, - "debug/policy_weights": 0.02243804931640625, - "debug/raw_losses": 0.6765472292900085, "epoch": 0.14325507361719061, - "grad_norm": 0.5348614998389231, + "grad_norm": 24.36373306347673, "learning_rate": 4.971879403278432e-07, - "logits/chosen": -2.6075491905212402, - "logits/rejected": -2.5911221504211426, - "logps/chosen": -213.48001098632812, - "logps/rejected": -215.29647827148438, - "loss": 0.0188, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.6495342254638672, - "rewards/margins": 0.06398934125900269, - "rewards/rejected": -0.7135236859321594, + "logits/chosen": -2.3362064361572266, + "logits/rejected": -2.3115732669830322, + "logps/chosen": -261.9034423828125, + "logps/rejected": -281.554443359375, + "loss": 0.6153, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1337685585021973, + "rewards/margins": 0.24233467876911163, + "rewards/rejected": -1.376103401184082, "step": 180 }, { - "debug/losses": 0.020779911428689957, - "debug/policy_weights": 0.03059370443224907, - "debug/raw_losses": 0.6826348900794983, "epoch": 0.15121368881814565, - "grad_norm": 0.4791035716944309, + "grad_norm": 24.478766866690933, "learning_rate": 4.960530036504941e-07, - "logits/chosen": -2.6114742755889893, - "logits/rejected": -2.5984721183776855, - "logps/chosen": -211.92611694335938, - "logps/rejected": -210.9833526611328, - "loss": 0.0189, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.6757665872573853, - "rewards/margins": 0.05583900958299637, - "rewards/rejected": -0.7316056489944458, + "logits/chosen": -2.0850882530212402, + "logits/rejected": -2.040325403213501, + "logps/chosen": -273.06854248046875, + "logps/rejected": -312.0998229980469, + "loss": 0.6106, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2871907949447632, + "rewards/margins": 0.4555794596672058, + "rewards/rejected": -1.7427704334259033, "step": 190 }, { - "debug/losses": 0.018696870654821396, - "debug/policy_weights": 0.028333622962236404, - "debug/raw_losses": 0.6458210349082947, "epoch": 0.15917230401910068, - "grad_norm": 0.5624030751523189, + "grad_norm": 19.670570237645645, "learning_rate": 4.947278962947386e-07, - "logits/chosen": -2.599944591522217, - "logits/rejected": -2.60343861579895, - "logps/chosen": -212.6822052001953, - "logps/rejected": -240.33090209960938, - "loss": 0.0188, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.635327935218811, - "rewards/margins": 0.13868513703346252, - "rewards/rejected": -0.7740131616592407, + "logits/chosen": -1.907619833946228, + "logits/rejected": -1.9094089269638062, + "logps/chosen": -261.22698974609375, + "logps/rejected": -314.9784851074219, + "loss": 0.5947, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1207753419876099, + "rewards/margins": 0.399713933467865, + "rewards/rejected": -1.5204893350601196, "step": 200 }, { "epoch": 0.15917230401910068, - "eval_debug/losses": 0.01704235188663006, - "eval_debug/policy_weights": 0.026140499860048294, - "eval_debug/raw_losses": 0.6477541327476501, - "eval_logits/chosen": -2.6260132789611816, - "eval_logits/rejected": -2.6167147159576416, - "eval_logps/chosen": -210.14483642578125, - "eval_logps/rejected": -229.957763671875, - "eval_loss": 0.018111631274223328, - "eval_rewards/accuracies": 0.6352611780166626, - "eval_rewards/chosen": -0.6590133309364319, - "eval_rewards/margins": 0.133574441075325, - "eval_rewards/rejected": -0.7925877571105957, - "eval_runtime": 153.5479, - "eval_samples_per_second": 55.696, - "eval_steps_per_second": 0.873, + "eval_logits/chosen": -1.661999225616455, + "eval_logits/rejected": -1.6082700490951538, + "eval_logps/chosen": -295.57269287109375, + "eval_logps/rejected": -351.9329528808594, + "eval_loss": 0.6026676893234253, + "eval_rewards/accuracies": 0.6679104566574097, + "eval_rewards/chosen": -1.5132923126220703, + "eval_rewards/margins": 0.49904683232307434, + "eval_rewards/rejected": -2.0123391151428223, + "eval_runtime": 153.0416, + "eval_samples_per_second": 55.88, + "eval_steps_per_second": 0.876, "step": 200 }, { - "debug/losses": 0.012981243431568146, - "debug/policy_weights": 0.020270589739084244, - "debug/raw_losses": 0.6476176977157593, "epoch": 0.1671309192200557, - "grad_norm": 0.3363244656750204, + "grad_norm": 16.167356453307296, "learning_rate": 4.932136424161899e-07, - "logits/chosen": -2.5959079265594482, - "logits/rejected": -2.5857765674591064, - "logps/chosen": -199.751953125, - "logps/rejected": -219.63211059570312, - "loss": 0.0154, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.7415943145751953, - "rewards/margins": 0.1376654952764511, - "rewards/rejected": -0.8792597651481628, + "logits/chosen": -1.4656827449798584, + "logits/rejected": -1.4232580661773682, + "logps/chosen": -280.2381591796875, + "logps/rejected": -332.170654296875, + "loss": 0.6021, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.546456217765808, + "rewards/margins": 0.4581894874572754, + "rewards/rejected": -2.004645824432373, "step": 210 }, { - "debug/losses": 0.009969423525035381, - "debug/policy_weights": 0.016851870343089104, - "debug/raw_losses": 0.6304318308830261, "epoch": 0.17508953442101075, - "grad_norm": 0.369805813736416, + "grad_norm": 14.356845739543413, "learning_rate": 4.915114123589732e-07, - "logits/chosen": -2.601393699645996, - "logits/rejected": -2.5798416137695312, - "logps/chosen": -221.81802368164062, - "logps/rejected": -236.99325561523438, - "loss": 0.0131, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.8491840362548828, - "rewards/margins": 0.16439886391162872, - "rewards/rejected": -1.0135828256607056, + "logits/chosen": -1.5527619123458862, + "logits/rejected": -1.5004384517669678, + "logps/chosen": -306.37225341796875, + "logps/rejected": -345.39385986328125, + "loss": 0.6093, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6947263479232788, + "rewards/margins": 0.40286189317703247, + "rewards/rejected": -2.097588300704956, "step": 220 }, { - "debug/losses": 0.01011290680617094, - "debug/policy_weights": 0.01572282984852791, - "debug/raw_losses": 0.6608097553253174, "epoch": 0.18304814962196578, - "grad_norm": 0.47630517904372627, + "grad_norm": 18.794378620034212, "learning_rate": 4.896225217511849e-07, - "logits/chosen": -2.5869479179382324, - "logits/rejected": -2.5818216800689697, - "logps/chosen": -236.49746704101562, - "logps/rejected": -256.587158203125, - "loss": 0.0119, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.9720077514648438, - "rewards/margins": 0.10651447623968124, - "rewards/rejected": -1.0785222053527832, + "logits/chosen": -1.7889591455459595, + "logits/rejected": -1.7687686681747437, + "logps/chosen": -236.9343719482422, + "logps/rejected": -293.1578674316406, + "loss": 0.58, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.9763768911361694, + "rewards/margins": 0.46785277128219604, + "rewards/rejected": -1.4442296028137207, "step": 230 }, { - "debug/losses": 0.011557658202946186, - "debug/policy_weights": 0.018197016790509224, - "debug/raw_losses": 0.6527289152145386, "epoch": 0.1910067648229208, - "grad_norm": 0.4296392421731883, + "grad_norm": 16.085123205817453, "learning_rate": 4.875484304880629e-07, - "logits/chosen": -2.5879721641540527, - "logits/rejected": -2.5685877799987793, - "logps/chosen": -252.7036895751953, - "logps/rejected": -261.66973876953125, - "loss": 0.013, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.9094566106796265, - "rewards/margins": 0.14806167781352997, - "rewards/rejected": -1.0575182437896729, + "logits/chosen": -1.8684545755386353, + "logits/rejected": -1.8107630014419556, + "logps/chosen": -275.6334533691406, + "logps/rejected": -318.3482360839844, + "loss": 0.5724, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.13875412940979, + "rewards/margins": 0.4855495095252991, + "rewards/rejected": -1.6243034601211548, "step": 240 }, { - "debug/losses": 0.011128942482173443, - "debug/policy_weights": 0.018581857904791832, - "debug/raw_losses": 0.6574316024780273, "epoch": 0.19896538002387584, - "grad_norm": 0.4137854262926794, + "grad_norm": 18.207419558759465, "learning_rate": 4.852907416036558e-07, - "logits/chosen": -2.5383076667785645, - "logits/rejected": -2.532837390899658, - "logps/chosen": -225.3877716064453, - "logps/rejected": -250.76669311523438, - "loss": 0.014, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.8609094619750977, - "rewards/margins": 0.13047048449516296, - "rewards/rejected": -0.9913798570632935, + "logits/chosen": -1.8272058963775635, + "logits/rejected": -1.8065818548202515, + "logps/chosen": -253.82308959960938, + "logps/rejected": -313.4346618652344, + "loss": 0.5908, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1452624797821045, + "rewards/margins": 0.4727972149848938, + "rewards/rejected": -1.618059754371643, "step": 250 }, { - "debug/losses": 0.012957748956978321, - "debug/policy_weights": 0.02070624753832817, - "debug/raw_losses": 0.6470428109169006, "epoch": 0.20692399522483088, - "grad_norm": 0.43516878734921594, + "grad_norm": 17.79415319185038, "learning_rate": 4.828512000318616e-07, - "logits/chosen": -2.573000431060791, - "logits/rejected": -2.533466100692749, - "logps/chosen": -257.0665588378906, - "logps/rejected": -261.9660949707031, - "loss": 0.0122, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.9236570596694946, - "rewards/margins": 0.14447014033794403, - "rewards/rejected": -1.068127155303955, + "logits/chosen": -1.2992960214614868, + "logits/rejected": -1.1269283294677734, + "logps/chosen": -330.8904113769531, + "logps/rejected": -382.5238952636719, + "loss": 0.5689, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.661895990371704, + "rewards/margins": 0.6118096709251404, + "rewards/rejected": -2.2737057209014893, "step": 260 }, { - "debug/losses": 0.006833743304014206, - "debug/policy_weights": 0.01035793125629425, - "debug/raw_losses": 0.6686216592788696, "epoch": 0.2148826104257859, - "grad_norm": 0.7316646034006385, + "grad_norm": 14.406997827267608, "learning_rate": 4.802316912577946e-07, - "logits/chosen": -2.516526460647583, - "logits/rejected": -2.486034870147705, - "logps/chosen": -249.7852020263672, - "logps/rejected": -245.7858428955078, - "loss": 0.0089, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.0213656425476074, - "rewards/margins": 0.09422695636749268, - "rewards/rejected": -1.1155927181243896, + "logits/chosen": -1.2646944522857666, + "logits/rejected": -1.1142375469207764, + "logps/chosen": -307.0154724121094, + "logps/rejected": -348.8962707519531, + "loss": 0.5872, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5936686992645264, + "rewards/margins": 0.5530277490615845, + "rewards/rejected": -2.1466963291168213, "step": 270 }, { - "debug/losses": 0.008976435288786888, - "debug/policy_weights": 0.01276155561208725, - "debug/raw_losses": 0.6897184252738953, "epoch": 0.22284122562674094, - "grad_norm": 0.48173147031459806, + "grad_norm": 20.387615258054584, "learning_rate": 4.774342398605221e-07, - "logits/chosen": -2.5043716430664062, - "logits/rejected": -2.4910271167755127, - "logps/chosen": -249.2817840576172, - "logps/rejected": -247.11795043945312, - "loss": 0.0089, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.036773443222046, - "rewards/margins": 0.035792578011751175, - "rewards/rejected": -1.0725661516189575, + "logits/chosen": -1.1495741605758667, + "logits/rejected": -1.0322356224060059, + "logps/chosen": -358.7090759277344, + "logps/rejected": -401.15283203125, + "loss": 0.5778, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.131046772003174, + "rewards/margins": 0.4818685054779053, + "rewards/rejected": -2.6129150390625, "step": 280 }, { - "debug/losses": 0.012884433381259441, - "debug/policy_weights": 0.019073178991675377, - "debug/raw_losses": 0.654518187046051, "epoch": 0.23079984082769597, - "grad_norm": 0.3979893171562093, + "grad_norm": 17.33296023667363, "learning_rate": 4.744610079482978e-07, - "logits/chosen": -2.557462453842163, - "logits/rejected": -2.527928352355957, - "logps/chosen": -259.79571533203125, - "logps/rejected": -261.197021484375, - "loss": 0.0106, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.9412172436714172, - "rewards/margins": 0.10356676578521729, - "rewards/rejected": -1.0447839498519897, + "logits/chosen": -1.5665136575698853, + "logits/rejected": -1.3881865739822388, + "logps/chosen": -337.09918212890625, + "logps/rejected": -372.98944091796875, + "loss": 0.5975, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.7142517566680908, + "rewards/margins": 0.448456346988678, + "rewards/rejected": -2.162708282470703, "step": 290 }, { - "debug/losses": 0.008294315077364445, - "debug/policy_weights": 0.013245286419987679, - "debug/raw_losses": 0.6458945870399475, "epoch": 0.238758456028651, - "grad_norm": 0.7466334284854154, + "grad_norm": 14.305368392746832, "learning_rate": 4.713142934875005e-07, - "logits/chosen": -2.5328361988067627, - "logits/rejected": -2.5026156902313232, - "logps/chosen": -239.67001342773438, - "logps/rejected": -236.6134033203125, - "loss": 0.0106, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.9211509823799133, - "rewards/margins": 0.13629981875419617, - "rewards/rejected": -1.057450771331787, + "logits/chosen": -1.339380145072937, + "logits/rejected": -1.121058702468872, + "logps/chosen": -292.33392333984375, + "logps/rejected": -326.50115966796875, + "loss": 0.578, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4477901458740234, + "rewards/margins": 0.5085382461547852, + "rewards/rejected": -1.9563281536102295, "step": 300 }, { "epoch": 0.238758456028651, - "eval_debug/losses": 0.011732951737940311, - "eval_debug/policy_weights": 0.018165074288845062, - "eval_debug/raw_losses": 0.650356650352478, - "eval_logits/chosen": -2.540983200073242, - "eval_logits/rejected": -2.532646417617798, - "eval_logps/chosen": -232.72845458984375, - "eval_logps/rejected": -251.57742309570312, - "eval_loss": 0.012441331520676613, - "eval_rewards/accuracies": 0.6231343150138855, - "eval_rewards/chosen": -0.8848498463630676, - "eval_rewards/margins": 0.12393450736999512, - "eval_rewards/rejected": -1.0087844133377075, - "eval_runtime": 153.822, - "eval_samples_per_second": 55.597, - "eval_steps_per_second": 0.871, + "eval_logits/chosen": -1.5127747058868408, + "eval_logits/rejected": -1.3925303220748901, + "eval_logps/chosen": -271.07684326171875, + "eval_logps/rejected": -322.1283874511719, + "eval_loss": 0.575111448764801, + "eval_rewards/accuracies": 0.6893656849861145, + "eval_rewards/chosen": -1.2683334350585938, + "eval_rewards/margins": 0.4459605813026428, + "eval_rewards/rejected": -1.714294195175171, + "eval_runtime": 153.0554, + "eval_samples_per_second": 55.875, + "eval_steps_per_second": 0.876, "step": 300 }, { - "debug/losses": 0.009767532348632812, - "debug/policy_weights": 0.014414462260901928, - "debug/raw_losses": 0.6532250046730042, "epoch": 0.24671707122960604, - "grad_norm": 0.6022163924610267, + "grad_norm": 16.44125545365527, "learning_rate": 4.679965285265706e-07, - "logits/chosen": -2.516752004623413, - "logits/rejected": -2.5129215717315674, - "logps/chosen": -211.92715454101562, - "logps/rejected": -235.82760620117188, - "loss": 0.0122, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.9437325596809387, - "rewards/margins": 0.11843845993280411, - "rewards/rejected": -1.0621709823608398, + "logits/chosen": -1.284905195236206, + "logits/rejected": -1.2035940885543823, + "logps/chosen": -255.46212768554688, + "logps/rejected": -315.3814697265625, + "loss": 0.5717, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3790826797485352, + "rewards/margins": 0.47862672805786133, + "rewards/rejected": -1.857709288597107, "step": 310 }, { - "debug/losses": 0.011818929575383663, - "debug/policy_weights": 0.01776820793747902, - "debug/raw_losses": 0.6237934231758118, "epoch": 0.2546756864305611, - "grad_norm": 0.553137313091687, + "grad_norm": 18.14884709080922, "learning_rate": 4.64510277316316e-07, - "logits/chosen": -2.4835593700408936, - "logits/rejected": -2.4536149501800537, - "logps/chosen": -236.3191680908203, - "logps/rejected": -250.107666015625, - "loss": 0.0123, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9068711400032043, - "rewards/margins": 0.20198988914489746, - "rewards/rejected": -1.108860969543457, + "logits/chosen": -0.8231368064880371, + "logits/rejected": -0.5619879961013794, + "logps/chosen": -297.4189453125, + "logps/rejected": -376.95050048828125, + "loss": 0.5411, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5178688764572144, + "rewards/margins": 0.8594205975532532, + "rewards/rejected": -2.3772895336151123, "step": 320 }, { - "debug/losses": 0.01837514340877533, - "debug/policy_weights": 0.026234418153762817, - "debug/raw_losses": 0.6640917062759399, "epoch": 0.26263430163151613, - "grad_norm": 0.9265370666539059, + "grad_norm": 16.049521884862, "learning_rate": 4.6085823432804137e-07, - "logits/chosen": -2.5293900966644287, - "logits/rejected": -2.511305570602417, - "logps/chosen": -225.53518676757812, - "logps/rejected": -236.95339965820312, - "loss": 0.0187, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.8177441358566284, - "rewards/margins": 0.116599440574646, - "rewards/rejected": -0.9343435168266296, + "logits/chosen": -0.8478446006774902, + "logits/rejected": -0.6845074892044067, + "logps/chosen": -326.3879699707031, + "logps/rejected": -379.32586669921875, + "loss": 0.5815, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.8262717723846436, + "rewards/margins": 0.5317971110343933, + "rewards/rejected": -2.3580689430236816, "step": 330 }, { - "debug/losses": 0.020881159231066704, - "debug/policy_weights": 0.030892541632056236, - "debug/raw_losses": 0.6454728245735168, "epoch": 0.27059291683247116, - "grad_norm": 0.37159592470750724, + "grad_norm": 17.252563575103384, "learning_rate": 4.570432221710314e-07, - "logits/chosen": -2.5206029415130615, - "logits/rejected": -2.5169670581817627, - "logps/chosen": -216.4114990234375, - "logps/rejected": -241.2029266357422, - "loss": 0.0194, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.7023001313209534, - "rewards/margins": 0.1543247401714325, - "rewards/rejected": -0.8566249012947083, + "logits/chosen": -0.7273017764091492, + "logits/rejected": -0.5780073404312134, + "logps/chosen": -302.949462890625, + "logps/rejected": -364.41619873046875, + "loss": 0.5814, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.567679762840271, + "rewards/margins": 0.5210777521133423, + "rewards/rejected": -2.0887577533721924, "step": 340 }, { - "debug/losses": 0.021329209208488464, - "debug/policy_weights": 0.03285343572497368, - "debug/raw_losses": 0.6685600876808167, "epoch": 0.2785515320334262, - "grad_norm": 0.5225386180163766, + "grad_norm": 15.400974157549417, "learning_rate": 4.5306818941099866e-07, - "logits/chosen": -2.540334463119507, - "logits/rejected": -2.4982547760009766, - "logps/chosen": -223.546875, - "logps/rejected": -217.67105102539062, - "loss": 0.018, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.7332266569137573, - "rewards/margins": 0.10545055568218231, - "rewards/rejected": -0.8386772274971008, + "logits/chosen": -0.4381464123725891, + "logits/rejected": -0.08063732087612152, + "logps/chosen": -313.7085876464844, + "logps/rejected": -357.1277770996094, + "loss": 0.5942, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6348435878753662, + "rewards/margins": 0.5984010100364685, + "rewards/rejected": -2.2332444190979004, "step": 350 }, { - "debug/losses": 0.019619420170783997, - "debug/policy_weights": 0.029550284147262573, - "debug/raw_losses": 0.6593185067176819, "epoch": 0.28651014723438123, - "grad_norm": 0.5306100648641101, + "grad_norm": 14.504561357614989, "learning_rate": 4.4893620829118124e-07, - "logits/chosen": -2.5392906665802, - "logits/rejected": -2.5207695960998535, - "logps/chosen": -230.8947296142578, - "logps/rejected": -238.7125244140625, - "loss": 0.0197, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.7915760278701782, - "rewards/margins": 0.14158041775226593, - "rewards/rejected": -0.9331563711166382, + "logits/chosen": -0.4463214874267578, + "logits/rejected": -0.24701222777366638, + "logps/chosen": -351.8408203125, + "logps/rejected": -392.67095947265625, + "loss": 0.5536, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0010366439819336, + "rewards/margins": 0.4717040956020355, + "rewards/rejected": -2.472740888595581, "step": 360 }, { - "debug/losses": 0.015291196294128895, - "debug/policy_weights": 0.02270735241472721, - "debug/raw_losses": 0.6952095627784729, "epoch": 0.29446876243533626, - "grad_norm": 0.39445695977766165, + "grad_norm": 17.52031908845256, "learning_rate": 4.4465047235785185e-07, - "logits/chosen": -2.5752360820770264, - "logits/rejected": -2.5506844520568848, - "logps/chosen": -240.67501831054688, - "logps/rejected": -231.72927856445312, - "loss": 0.0152, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -0.9583063125610352, - "rewards/margins": 0.07631716132164001, - "rewards/rejected": -1.034623384475708, + "logits/chosen": -0.9400293231010437, + "logits/rejected": -0.6988920569419861, + "logps/chosen": -370.46893310546875, + "logps/rejected": -406.6575927734375, + "loss": 0.5671, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.2562451362609863, + "rewards/margins": 0.5276610851287842, + "rewards/rejected": -2.7839062213897705, "step": 370 }, { - "debug/losses": 0.006141385529190302, - "debug/policy_weights": 0.010329265147447586, - "debug/raw_losses": 0.6901925802230835, "epoch": 0.3024273776362913, - "grad_norm": 0.3064069246991804, + "grad_norm": 16.26258053172861, "learning_rate": 4.40214293992074e-07, - "logits/chosen": -2.5442752838134766, - "logits/rejected": -2.527510166168213, - "logps/chosen": -258.46923828125, - "logps/rejected": -265.7283935546875, - "loss": 0.009, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2107610702514648, - "rewards/margins": 0.10041014105081558, - "rewards/rejected": -1.311171054840088, + "logits/chosen": -1.0389044284820557, + "logits/rejected": -0.8688551187515259, + "logps/chosen": -315.2569580078125, + "logps/rejected": -383.2552795410156, + "loss": 0.5533, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7786381244659424, + "rewards/margins": 0.7078015208244324, + "rewards/rejected": -2.4864397048950195, "step": 380 }, { - "debug/losses": 0.01604517176747322, - "debug/policy_weights": 0.022270509973168373, - "debug/raw_losses": 0.6663911938667297, "epoch": 0.3103859928372463, - "grad_norm": 0.48121695252032026, + "grad_norm": 18.389510644608478, "learning_rate": 4.3563110184961234e-07, - "logits/chosen": -2.5838708877563477, - "logits/rejected": -2.569918155670166, - "logps/chosen": -258.06866455078125, - "logps/rejected": -273.8180847167969, - "loss": 0.0119, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.0886430740356445, - "rewards/margins": 0.1280651092529297, - "rewards/rejected": -1.2167080640792847, + "logits/chosen": -0.9544135928153992, + "logits/rejected": -0.8071931004524231, + "logps/chosen": -319.52386474609375, + "logps/rejected": -378.70819091796875, + "loss": 0.5412, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.7031948566436768, + "rewards/margins": 0.5624147653579712, + "rewards/rejected": -2.2656095027923584, "step": 390 }, { - "debug/losses": 0.010074732825160027, - "debug/policy_weights": 0.01658284291625023, - "debug/raw_losses": 0.5911771059036255, "epoch": 0.31834460803820136, - "grad_norm": 0.354283231770322, + "grad_norm": 20.77179730886419, "learning_rate": 4.3090443821097566e-07, - "logits/chosen": -2.5739798545837402, - "logits/rejected": -2.5637741088867188, - "logps/chosen": -238.1716766357422, - "logps/rejected": -285.08172607421875, - "loss": 0.0113, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.0210093259811401, - "rewards/margins": 0.3275682032108307, - "rewards/rejected": -1.348577618598938, + "logits/chosen": -0.5997304320335388, + "logits/rejected": -0.3994545638561249, + "logps/chosen": -297.0989074707031, + "logps/rejected": -398.7206726074219, + "loss": 0.5575, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6102817058563232, + "rewards/margins": 0.8746851086616516, + "rewards/rejected": -2.48496675491333, "step": 400 }, { "epoch": 0.31834460803820136, - "eval_debug/losses": 0.009367607533931732, - "eval_debug/policy_weights": 0.014644055627286434, - "eval_debug/raw_losses": 0.6485610604286194, - "eval_logits/chosen": -2.5541255474090576, - "eval_logits/rejected": -2.5431408882141113, - "eval_logps/chosen": -256.7430114746094, - "eval_logps/rejected": -282.9041748046875, - "eval_loss": 0.01066713035106659, - "eval_rewards/accuracies": 0.6259328126907349, - "eval_rewards/chosen": -1.1249948740005493, - "eval_rewards/margins": 0.197056844830513, - "eval_rewards/rejected": -1.322051763534546, - "eval_runtime": 154.223, - "eval_samples_per_second": 55.452, - "eval_steps_per_second": 0.869, + "eval_logits/chosen": -0.42633184790611267, + "eval_logits/rejected": -0.25109803676605225, + "eval_logps/chosen": -322.9848327636719, + "eval_logps/rejected": -395.5074157714844, + "eval_loss": 0.5613003373146057, + "eval_rewards/accuracies": 0.7052238583564758, + "eval_rewards/chosen": -1.7874133586883545, + "eval_rewards/margins": 0.6606705784797668, + "eval_rewards/rejected": -2.4480838775634766, + "eval_runtime": 153.0612, + "eval_samples_per_second": 55.873, + "eval_steps_per_second": 0.875, "step": 400 }, { - "debug/losses": 0.00755792111158371, - "debug/policy_weights": 0.012881780043244362, - "debug/raw_losses": 0.6399897933006287, "epoch": 0.3263032232391564, - "grad_norm": 0.3292230818250206, + "grad_norm": 14.6184384920338, "learning_rate": 4.2603795624364195e-07, - "logits/chosen": -2.546541213989258, - "logits/rejected": -2.514399766921997, - "logps/chosen": -252.35745239257812, - "logps/rejected": -261.92620849609375, - "loss": 0.0084, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.159836769104004, - "rewards/margins": 0.18457694351673126, - "rewards/rejected": -1.3444135189056396, + "logits/chosen": -0.3563244640827179, + "logits/rejected": -0.04272305592894554, + "logps/chosen": -324.42291259765625, + "logps/rejected": -374.9647521972656, + "loss": 0.5646, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8804908990859985, + "rewards/margins": 0.5943077802658081, + "rewards/rejected": -2.4747986793518066, "step": 410 }, { - "debug/losses": 0.005532898474484682, - "debug/policy_weights": 0.009266809560358524, - "debug/raw_losses": 0.674716591835022, "epoch": 0.3342618384401114, - "grad_norm": 0.23754665887440424, + "grad_norm": 16.33524211106954, "learning_rate": 4.210354171785795e-07, - "logits/chosen": -2.518486499786377, - "logits/rejected": -2.5224523544311523, - "logps/chosen": -285.84881591796875, - "logps/rejected": -313.0412292480469, - "loss": 0.0052, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.4368436336517334, - "rewards/margins": 0.1468246430158615, - "rewards/rejected": -1.5836683511734009, + "logits/chosen": -0.3055838942527771, + "logits/rejected": -0.16813552379608154, + "logps/chosen": -361.65093994140625, + "logps/rejected": -439.7191467285156, + "loss": 0.5423, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1948649883270264, + "rewards/margins": 0.6555823087692261, + "rewards/rejected": -2.850447177886963, "step": 420 }, { - "debug/losses": 0.00432454003021121, - "debug/policy_weights": 0.007013822440057993, - "debug/raw_losses": 0.668225109577179, "epoch": 0.34222045364106646, - "grad_norm": 0.2996512601494498, + "grad_norm": 16.700884894882467, "learning_rate": 4.15900687403248e-07, - "logits/chosen": -2.538316249847412, - "logits/rejected": -2.5322518348693848, - "logps/chosen": -291.5013122558594, - "logps/rejected": -313.30810546875, - "loss": 0.0048, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.5356699228286743, - "rewards/margins": 0.19464543461799622, - "rewards/rejected": -1.7303152084350586, + "logits/chosen": -0.16022978723049164, + "logits/rejected": 0.04306970164179802, + "logps/chosen": -349.4731750488281, + "logps/rejected": -418.4617614746094, + "loss": 0.5537, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.1153881549835205, + "rewards/margins": 0.6664639711380005, + "rewards/rejected": -2.7818517684936523, "step": 430 }, { - "debug/losses": 0.004692443646490574, - "debug/policy_weights": 0.006985181476920843, - "debug/raw_losses": 0.6745913624763489, "epoch": 0.3501790688420215, - "grad_norm": 0.2890119099296941, + "grad_norm": 14.476246245258272, "learning_rate": 4.1063773547332584e-07, - "logits/chosen": -2.581429958343506, - "logits/rejected": -2.5656654834747314, - "logps/chosen": -299.73126220703125, - "logps/rejected": -318.9335021972656, - "loss": 0.0052, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.5279231071472168, - "rewards/margins": 0.18198037147521973, - "rewards/rejected": -1.7099034786224365, + "logits/chosen": 0.17018680274486542, + "logits/rejected": 0.42503976821899414, + "logps/chosen": -332.9987487792969, + "logps/rejected": -387.84613037109375, + "loss": 0.5584, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.8605985641479492, + "rewards/margins": 0.5384317636489868, + "rewards/rejected": -2.3990302085876465, "step": 440 }, { - "debug/losses": 0.004112830851227045, - "debug/policy_weights": 0.006491529755294323, - "debug/raw_losses": 0.6466792225837708, "epoch": 0.3581376840429765, - "grad_norm": 0.30150249922041256, + "grad_norm": 23.02311576539631, "learning_rate": 4.0525062904547276e-07, - "logits/chosen": -2.5238678455352783, - "logits/rejected": -2.500016450881958, - "logps/chosen": -271.2080383300781, - "logps/rejected": -286.7987976074219, - "loss": 0.0064, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3864507675170898, - "rewards/margins": 0.2321782410144806, - "rewards/rejected": -1.6186290979385376, + "logits/chosen": 0.4983510971069336, + "logits/rejected": 0.8844894170761108, + "logps/chosen": -356.9858703613281, + "logps/rejected": -418.490234375, + "loss": 0.5431, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.2442288398742676, + "rewards/margins": 0.6913145184516907, + "rewards/rejected": -2.9355432987213135, "step": 450 }, { - "debug/losses": 0.007575507275760174, - "debug/policy_weights": 0.012141515500843525, - "debug/raw_losses": 0.6485113501548767, "epoch": 0.36609629924393156, - "grad_norm": 0.32086164553161495, + "grad_norm": 18.11929521024602, "learning_rate": 3.997435317334988e-07, - "logits/chosen": -2.5503549575805664, - "logits/rejected": -2.538334369659424, - "logps/chosen": -284.7257385253906, - "logps/rejected": -310.32244873046875, - "loss": 0.0066, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3771101236343384, - "rewards/margins": 0.19444192945957184, - "rewards/rejected": -1.5715519189834595, + "logits/chosen": 0.0664445012807846, + "logits/rejected": 0.3327707350254059, + "logps/chosen": -370.38568115234375, + "logps/rejected": -447.3545837402344, + "loss": 0.5506, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.2337095737457275, + "rewards/margins": 0.7081635594367981, + "rewards/rejected": -2.941873073577881, "step": 460 }, { - "debug/losses": 0.0063556404784321785, - "debug/policy_weights": 0.00990296620875597, - "debug/raw_losses": 0.6760750412940979, "epoch": 0.3740549144448866, - "grad_norm": 0.2823094153700017, + "grad_norm": 15.617407519466495, "learning_rate": 3.941206998903701e-07, - "logits/chosen": -2.594268560409546, - "logits/rejected": -2.5712995529174805, - "logps/chosen": -294.4463806152344, - "logps/rejected": -306.3409118652344, - "loss": 0.0056, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3967692852020264, - "rewards/margins": 0.14532601833343506, - "rewards/rejected": -1.5420953035354614, + "logits/chosen": -0.06704016029834747, + "logits/rejected": 0.2724049985408783, + "logps/chosen": -378.48236083984375, + "logps/rejected": -437.5526428222656, + "loss": 0.5621, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.237128973007202, + "rewards/margins": 0.6170836687088013, + "rewards/rejected": -2.854212760925293, "step": 470 }, { - "debug/losses": 0.005242775194346905, - "debug/policy_weights": 0.007469573058187962, - "debug/raw_losses": 0.6963475346565247, "epoch": 0.3820135296458416, - "grad_norm": 0.23996176279841727, + "grad_norm": 20.78463516779003, "learning_rate": 3.8838647931853684e-07, - "logits/chosen": -2.529966115951538, - "logits/rejected": -2.519165515899658, - "logps/chosen": -272.6905212402344, - "logps/rejected": -285.30841064453125, - "loss": 0.0068, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.5079033374786377, - "rewards/margins": 0.09324829280376434, - "rewards/rejected": -1.601151704788208, + "logits/chosen": -0.20896323025226593, + "logits/rejected": 0.013983624055981636, + "logps/chosen": -309.01287841796875, + "logps/rejected": -382.2357177734375, + "loss": 0.5527, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.8711271286010742, + "rewards/margins": 0.6992978453636169, + "rewards/rejected": -2.570424795150757, "step": 480 }, { - "debug/losses": 0.00598041620105505, - "debug/policy_weights": 0.010452435351908207, - "debug/raw_losses": 0.654226541519165, "epoch": 0.38997214484679665, - "grad_norm": 0.5472892421620039, + "grad_norm": 17.40353233817741, "learning_rate": 3.825453019111281e-07, - "logits/chosen": -2.539477586746216, - "logits/rejected": -2.525785207748413, - "logps/chosen": -289.7580261230469, - "logps/rejected": -322.4846496582031, - "loss": 0.0057, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.4528900384902954, - "rewards/margins": 0.22696420550346375, - "rewards/rejected": -1.679854154586792, + "logits/chosen": -0.6753722429275513, + "logits/rejected": -0.3529302477836609, + "logps/chosen": -310.6385192871094, + "logps/rejected": -395.55609130859375, + "loss": 0.5438, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6616952419281006, + "rewards/margins": 0.7488740682601929, + "rewards/rejected": -2.410569190979004, "step": 490 }, { - "debug/losses": 0.007534264586865902, - "debug/policy_weights": 0.011624102480709553, - "debug/raw_losses": 0.6891680955886841, "epoch": 0.3979307600477517, - "grad_norm": 0.2410652784920897, + "grad_norm": 19.06650086284646, "learning_rate": 3.7660168222660824e-07, - "logits/chosen": -2.5813050270080566, - "logits/rejected": -2.545656204223633, - "logps/chosen": -323.9276428222656, - "logps/rejected": -318.6961975097656, - "loss": 0.0049, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.554783821105957, - "rewards/margins": 0.10975100845098495, - "rewards/rejected": -1.6645348072052002, + "logits/chosen": -0.7103143930435181, + "logits/rejected": -0.3697218894958496, + "logps/chosen": -343.75140380859375, + "logps/rejected": -391.75445556640625, + "loss": 0.5311, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.753021478652954, + "rewards/margins": 0.6420953869819641, + "rewards/rejected": -2.3951168060302734, "step": 500 }, { "epoch": 0.3979307600477517, - "eval_debug/losses": 0.004585243761539459, - "eval_debug/policy_weights": 0.007001264486461878, - "eval_debug/raw_losses": 0.6676536202430725, - "eval_logits/chosen": -2.5501811504364014, - "eval_logits/rejected": -2.5388710498809814, - "eval_logps/chosen": -299.8376770019531, - "eval_logps/rejected": -326.1407775878906, - "eval_loss": 0.005230129696428776, - "eval_rewards/accuracies": 0.58302241563797, - "eval_rewards/chosen": -1.5559419393539429, - "eval_rewards/margins": 0.1984759271144867, - "eval_rewards/rejected": -1.7544177770614624, - "eval_runtime": 153.6915, - "eval_samples_per_second": 55.644, - "eval_steps_per_second": 0.872, + "eval_logits/chosen": -0.14436845481395721, + "eval_logits/rejected": 0.1320793181657791, + "eval_logps/chosen": -351.6741027832031, + "eval_logps/rejected": -428.5196228027344, + "eval_loss": 0.5600787997245789, + "eval_rewards/accuracies": 0.7248134613037109, + "eval_rewards/chosen": -2.074305534362793, + "eval_rewards/margins": 0.7039004564285278, + "eval_rewards/rejected": -2.7782061100006104, + "eval_runtime": 153.2793, + "eval_samples_per_second": 55.794, + "eval_steps_per_second": 0.874, "step": 500 }, { - "debug/losses": 0.0037676370702683926, - "debug/policy_weights": 0.0052662924863398075, - "debug/raw_losses": 0.7040041089057922, "epoch": 0.4058893752487067, - "grad_norm": 0.40774056325379826, + "grad_norm": 20.933791990106656, "learning_rate": 3.705602139995416e-07, - "logits/chosen": -2.4990837574005127, - "logits/rejected": -2.480976104736328, - "logps/chosen": -321.1830749511719, - "logps/rejected": -332.6640930175781, - "loss": 0.0052, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.6892070770263672, - "rewards/margins": 0.13378755748271942, - "rewards/rejected": -1.822994589805603, + "logits/chosen": -0.009120747447013855, + "logits/rejected": 0.2614721655845642, + "logps/chosen": -390.1700744628906, + "logps/rejected": -449.2955627441406, + "loss": 0.5792, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3790764808654785, + "rewards/margins": 0.6102325916290283, + "rewards/rejected": -2.989309549331665, "step": 510 }, { - "debug/losses": 0.0035227362532168627, - "debug/policy_weights": 0.004760933108627796, - "debug/raw_losses": 0.7151198387145996, "epoch": 0.41384799044966175, - "grad_norm": 0.2819755565466031, + "grad_norm": 15.961444394222205, "learning_rate": 3.6442556659016475e-07, - "logits/chosen": -2.526418447494507, - "logits/rejected": -2.5172476768493652, - "logps/chosen": -317.5640869140625, - "logps/rejected": -330.0902099609375, - "loss": 0.0043, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.7005726099014282, - "rewards/margins": 0.10154050588607788, - "rewards/rejected": -1.8021131753921509, + "logits/chosen": 0.2502554953098297, + "logits/rejected": 0.46849527955055237, + "logps/chosen": -383.04693603515625, + "logps/rejected": -437.341796875, + "loss": 0.5409, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.355401039123535, + "rewards/margins": 0.519227921962738, + "rewards/rejected": -2.874629259109497, "step": 520 }, { - "debug/losses": 0.005672266241163015, - "debug/policy_weights": 0.00919837225228548, - "debug/raw_losses": 0.6718519926071167, "epoch": 0.4218066056506168, - "grad_norm": 0.23867540906939516, + "grad_norm": 16.827064617105826, "learning_rate": 3.582024813755076e-07, - "logits/chosen": -2.5276780128479004, - "logits/rejected": -2.514214038848877, - "logps/chosen": -311.69158935546875, - "logps/rejected": -323.6425476074219, - "loss": 0.0043, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.6105915307998657, - "rewards/margins": 0.15969306230545044, - "rewards/rejected": -1.7702842950820923, + "logits/chosen": 0.6198503971099854, + "logits/rejected": 0.8526325225830078, + "logps/chosen": -389.84356689453125, + "logps/rejected": -434.67999267578125, + "loss": 0.552, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.392110824584961, + "rewards/margins": 0.48854774236679077, + "rewards/rejected": -2.8806586265563965, "step": 530 }, { - "debug/losses": 0.0027658152393996716, - "debug/policy_weights": 0.004610239528119564, - "debug/raw_losses": 0.6935795545578003, "epoch": 0.4297652208515718, - "grad_norm": 0.1493706526784262, + "grad_norm": 18.188037065315324, "learning_rate": 3.5189576808485404e-07, - "logits/chosen": -2.5311484336853027, - "logits/rejected": -2.5161919593811035, - "logps/chosen": -325.65399169921875, - "logps/rejected": -345.66705322265625, - "loss": 0.0035, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.7506721019744873, - "rewards/margins": 0.15115287899971008, - "rewards/rejected": -1.901824951171875, + "logits/chosen": 0.3544410765171051, + "logits/rejected": 0.4798678755760193, + "logps/chosen": -369.5892028808594, + "logps/rejected": -434.74053955078125, + "loss": 0.5622, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1900246143341064, + "rewards/margins": 0.6025354862213135, + "rewards/rejected": -2.79256010055542, "step": 540 }, { - "debug/losses": 0.0035218801349401474, - "debug/policy_weights": 0.005299523007124662, - "debug/raw_losses": 0.7395445108413696, "epoch": 0.43772383605252685, - "grad_norm": 0.31145154543629516, + "grad_norm": 20.54659926983385, "learning_rate": 3.4551030108237433e-07, - "logits/chosen": -2.5389254093170166, - "logits/rejected": -2.5010597705841064, - "logps/chosen": -326.8507385253906, - "logps/rejected": -310.1211853027344, - "loss": 0.0033, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.742793321609497, - "rewards/margins": 0.0004985686391592026, - "rewards/rejected": -1.7432918548583984, + "logits/chosen": -0.033793479204177856, + "logits/rejected": 0.37864160537719727, + "logps/chosen": -356.50653076171875, + "logps/rejected": -409.03839111328125, + "loss": 0.5558, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.03935170173645, + "rewards/margins": 0.6931120157241821, + "rewards/rejected": -2.7324633598327637, "step": 550 }, { - "debug/losses": 0.0035917130298912525, - "debug/policy_weights": 0.005464069079607725, - "debug/raw_losses": 0.6668697595596313, "epoch": 0.4456824512534819, - "grad_norm": 0.4133215356133092, + "grad_norm": 17.320248907302155, "learning_rate": 3.390510155998023e-07, - "logits/chosen": -2.5678248405456543, - "logits/rejected": -2.5489749908447266, - "logps/chosen": -326.417236328125, - "logps/rejected": -347.8440856933594, - "loss": 0.0036, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.721588134765625, - "rewards/margins": 0.1946999430656433, - "rewards/rejected": -1.9162880182266235, + "logits/chosen": -0.5626561045646667, + "logits/rejected": -0.217830628156662, + "logps/chosen": -339.8533630371094, + "logps/rejected": -406.0281066894531, + "loss": 0.5374, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8559491634368896, + "rewards/margins": 0.6421788334846497, + "rewards/rejected": -2.4981281757354736, "step": 560 }, { - "debug/losses": 0.00520257418975234, - "debug/policy_weights": 0.006928709335625172, - "debug/raw_losses": 0.6966046094894409, "epoch": 0.4536410664544369, - "grad_norm": 0.4197137744018701, + "grad_norm": 18.454225254996054, "learning_rate": 3.325229039220684e-07, - "logits/chosen": -2.536591053009033, - "logits/rejected": -2.528271198272705, - "logps/chosen": -323.67340087890625, - "logps/rejected": -334.86981201171875, - "loss": 0.0043, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.6962934732437134, - "rewards/margins": 0.13100206851959229, - "rewards/rejected": -1.8272953033447266, + "logits/chosen": -0.4510825574398041, + "logits/rejected": -0.2019493579864502, + "logps/chosen": -339.4858093261719, + "logps/rejected": -399.65234375, + "loss": 0.5633, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.8544174432754517, + "rewards/margins": 0.6207035779953003, + "rewards/rejected": -2.475121021270752, "step": 570 }, { - "debug/losses": 0.004648587666451931, - "debug/policy_weights": 0.007099623326212168, - "debug/raw_losses": 0.7026728391647339, "epoch": 0.46159968165539195, - "grad_norm": 0.6480993213685303, + "grad_norm": 15.265836542622413, "learning_rate": 3.2593101152883795e-07, - "logits/chosen": -2.536067485809326, - "logits/rejected": -2.517671823501587, - "logps/chosen": -311.83465576171875, - "logps/rejected": -320.65228271484375, - "loss": 0.0036, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.6290395259857178, - "rewards/margins": 0.11054316908121109, - "rewards/rejected": -1.7395826578140259, + "logits/chosen": -0.5197261571884155, + "logits/rejected": -0.3228117823600769, + "logps/chosen": -316.60516357421875, + "logps/rejected": -366.8912658691406, + "loss": 0.5638, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6767441034317017, + "rewards/margins": 0.5252284407615662, + "rewards/rejected": -2.201972484588623, "step": 580 }, { - "debug/losses": 0.0071870447136461735, - "debug/policy_weights": 0.010465339757502079, - "debug/raw_losses": 0.6819754838943481, "epoch": 0.469558296856347, - "grad_norm": 0.31849457228607825, + "grad_norm": 15.206190457344837, "learning_rate": 3.192804331949349e-07, - "logits/chosen": -2.5641560554504395, - "logits/rejected": -2.551443099975586, - "logps/chosen": -298.05108642578125, - "logps/rejected": -309.4243469238281, - "loss": 0.005, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.5542137622833252, - "rewards/margins": 0.1467454880475998, - "rewards/rejected": -1.7009594440460205, + "logits/chosen": 0.21528498828411102, + "logits/rejected": 0.4312516152858734, + "logps/chosen": -316.5174865722656, + "logps/rejected": -376.46002197265625, + "loss": 0.5289, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7388776540756226, + "rewards/margins": 0.6324383020401001, + "rewards/rejected": -2.3713161945343018, "step": 590 }, { - "debug/losses": 0.004736016970127821, - "debug/policy_weights": 0.007246729917824268, - "debug/raw_losses": 0.6467695236206055, "epoch": 0.477516912057302, - "grad_norm": 0.2671963559392113, + "grad_norm": 20.282458926057863, "learning_rate": 3.125763090526674e-07, - "logits/chosen": -2.5547337532043457, - "logits/rejected": -2.528456926345825, - "logps/chosen": -294.1816711425781, - "logps/rejected": -308.18267822265625, - "loss": 0.0057, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.393349528312683, - "rewards/margins": 0.19907453656196594, - "rewards/rejected": -1.592423915863037, + "logits/chosen": 0.4832405149936676, + "logits/rejected": 0.88775235414505, + "logps/chosen": -342.80096435546875, + "logps/rejected": -407.1078796386719, + "loss": 0.5658, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.8795427083969116, + "rewards/margins": 0.7021334171295166, + "rewards/rejected": -2.5816760063171387, "step": 600 }, { "epoch": 0.477516912057302, - "eval_debug/losses": 0.00638400437310338, - "eval_debug/policy_weights": 0.009958026930689812, - "eval_debug/raw_losses": 0.6465267539024353, - "eval_logits/chosen": -2.555931568145752, - "eval_logits/rejected": -2.5457732677459717, - "eval_logps/chosen": -274.5812072753906, - "eval_logps/rejected": -301.52093505859375, - "eval_loss": 0.007382492069154978, - "eval_rewards/accuracies": 0.6138059496879578, - "eval_rewards/chosen": -1.3033772706985474, - "eval_rewards/margins": 0.20484226942062378, - "eval_rewards/rejected": -1.5082194805145264, - "eval_runtime": 153.8374, - "eval_samples_per_second": 55.591, - "eval_steps_per_second": 0.871, + "eval_logits/chosen": 0.6660908460617065, + "eval_logits/rejected": 0.9124837517738342, + "eval_logps/chosen": -340.0068664550781, + "eval_logps/rejected": -416.9898986816406, + "eval_loss": 0.5562008619308472, + "eval_rewards/accuracies": 0.7192164063453674, + "eval_rewards/chosen": -1.9576338529586792, + "eval_rewards/margins": 0.7052750587463379, + "eval_rewards/rejected": -2.6629090309143066, + "eval_runtime": 153.1635, + "eval_samples_per_second": 55.836, + "eval_steps_per_second": 0.875, "step": 600 }, { - "debug/losses": 0.007963726297020912, - "debug/policy_weights": 0.012822565622627735, - "debug/raw_losses": 0.690829873085022, "epoch": 0.48547552725825704, - "grad_norm": 0.2758959141284049, + "grad_norm": 14.608157782099068, "learning_rate": 3.0582382061909623e-07, - "logits/chosen": -2.5585238933563232, - "logits/rejected": -2.536741018295288, - "logps/chosen": -273.50592041015625, - "logps/rejected": -286.12158203125, - "loss": 0.0079, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.323797345161438, - "rewards/margins": 0.1075838953256607, - "rewards/rejected": -1.431381106376648, + "logits/chosen": 0.36511674523353577, + "logits/rejected": 0.6286307573318481, + "logps/chosen": -337.5827941894531, + "logps/rejected": -395.5306701660156, + "loss": 0.5395, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.9645658731460571, + "rewards/margins": 0.5609062910079956, + "rewards/rejected": -2.5254719257354736, "step": 610 }, { - "debug/losses": 0.006302698515355587, - "debug/policy_weights": 0.00910632498562336, - "debug/raw_losses": 0.6518343687057495, "epoch": 0.4934341424592121, - "grad_norm": 0.37499067518315377, + "grad_norm": 14.933588886711542, "learning_rate": 2.9902818679131775e-07, - "logits/chosen": -2.564722776412964, - "logits/rejected": -2.5377914905548096, - "logps/chosen": -285.66473388671875, - "logps/rejected": -300.630126953125, - "loss": 0.0069, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3094189167022705, - "rewards/margins": 0.20527267456054688, - "rewards/rejected": -1.5146914720535278, + "logits/chosen": 0.3990648686885834, + "logits/rejected": 0.5564724206924438, + "logps/chosen": -339.22735595703125, + "logps/rejected": -399.9192810058594, + "loss": 0.5578, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.8450448513031006, + "rewards/margins": 0.6625381708145142, + "rewards/rejected": -2.507582902908325, "step": 620 }, { - "debug/losses": 0.005589304957538843, - "debug/policy_weights": 0.008459536358714104, - "debug/raw_losses": 0.6310914754867554, "epoch": 0.5013927576601671, - "grad_norm": 0.6186732376237966, + "grad_norm": 18.611562829139118, "learning_rate": 2.921946598128571e-07, - "logits/chosen": -2.538897752761841, - "logits/rejected": -2.5250511169433594, - "logps/chosen": -284.4039001464844, - "logps/rejected": -284.2347412109375, - "loss": 0.0076, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3162474632263184, - "rewards/margins": 0.20545336604118347, - "rewards/rejected": -1.5217007398605347, + "logits/chosen": 0.7237969636917114, + "logits/rejected": 1.000910997390747, + "logps/chosen": -346.236572265625, + "logps/rejected": -393.0661315917969, + "loss": 0.5371, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.9345744848251343, + "rewards/margins": 0.6754398941993713, + "rewards/rejected": -2.6100144386291504, "step": 630 }, { - "debug/losses": 0.008229405619204044, - "debug/policy_weights": 0.011773859150707722, - "debug/raw_losses": 0.6769380569458008, "epoch": 0.5093513728611222, - "grad_norm": 0.36031530008444884, + "grad_norm": 18.159841838504743, "learning_rate": 2.8532852121428733e-07, - "logits/chosen": -2.5451481342315674, - "logits/rejected": -2.5281176567077637, - "logps/chosen": -263.3935546875, - "logps/rejected": -274.42108154296875, - "loss": 0.0068, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.2786972522735596, - "rewards/margins": 0.13303811848163605, - "rewards/rejected": -1.4117352962493896, + "logits/chosen": 1.1098816394805908, + "logits/rejected": 1.3760236501693726, + "logps/chosen": -353.3837890625, + "logps/rejected": -409.08062744140625, + "loss": 0.5375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1785995960235596, + "rewards/margins": 0.5797316431999207, + "rewards/rejected": -2.758331537246704, "step": 640 }, { - "debug/losses": 0.008351767435669899, - "debug/policy_weights": 0.012713427655398846, - "debug/raw_losses": 0.623299241065979, "epoch": 0.5173099880620772, - "grad_norm": 0.506927386733817, + "grad_norm": 17.95563717396888, "learning_rate": 2.7843507773121414e-07, - "logits/chosen": -2.5620720386505127, - "logits/rejected": -2.547813892364502, - "logps/chosen": -252.62655639648438, - "logps/rejected": -284.22491455078125, - "loss": 0.0094, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.1541074514389038, - "rewards/margins": 0.2441467046737671, - "rewards/rejected": -1.398254156112671, + "logits/chosen": 0.8091678619384766, + "logits/rejected": 1.021480679512024, + "logps/chosen": -350.10638427734375, + "logps/rejected": -437.02569580078125, + "loss": 0.511, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.128905773162842, + "rewards/margins": 0.7973555326461792, + "rewards/rejected": -2.9262614250183105, "step": 650 }, { - "debug/losses": 0.009884612634778023, - "debug/policy_weights": 0.017373202368617058, - "debug/raw_losses": 0.5893998146057129, "epoch": 0.5252686032630323, - "grad_norm": 0.48450009957998225, + "grad_norm": 16.8332185564681, "learning_rate": 2.715196572027789e-07, - "logits/chosen": -2.53163480758667, - "logits/rejected": -2.520258665084839, - "logps/chosen": -247.4897918701172, - "logps/rejected": -289.49163818359375, - "loss": 0.0104, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.0302413702011108, - "rewards/margins": 0.3234770596027374, - "rewards/rejected": -1.3537184000015259, + "logits/chosen": 1.244178056716919, + "logits/rejected": 1.4877557754516602, + "logps/chosen": -353.20233154296875, + "logps/rejected": -446.7106018066406, + "loss": 0.5551, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.087367057800293, + "rewards/margins": 0.838540256023407, + "rewards/rejected": -2.9259073734283447, "step": 660 }, { - "debug/losses": 0.008643941953778267, - "debug/policy_weights": 0.01316711027175188, - "debug/raw_losses": 0.6537618637084961, "epoch": 0.5332272184639872, - "grad_norm": 0.45714247829836013, + "grad_norm": 14.658814068318406, "learning_rate": 2.645876044538521e-07, - "logits/chosen": -2.5438942909240723, - "logits/rejected": -2.5284218788146973, - "logps/chosen": -266.7118835449219, - "logps/rejected": -276.3323059082031, - "loss": 0.0091, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.175799012184143, - "rewards/margins": 0.17910988628864288, - "rewards/rejected": -1.3549087047576904, + "logits/chosen": 0.5786877274513245, + "logits/rejected": 0.8311547040939331, + "logps/chosen": -339.4533386230469, + "logps/rejected": -391.8295593261719, + "loss": 0.5523, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9032132625579834, + "rewards/margins": 0.6066684126853943, + "rewards/rejected": -2.5098819732666016, "step": 670 }, { - "debug/losses": 0.008115797303617, - "debug/policy_weights": 0.012199058197438717, - "debug/raw_losses": 0.6621893048286438, "epoch": 0.5411858336649423, - "grad_norm": 0.3750284424858676, + "grad_norm": 16.282915430254512, "learning_rate": 2.5764427716409815e-07, - "logits/chosen": -2.5285165309906006, - "logits/rejected": -2.50962495803833, - "logps/chosen": -283.6080017089844, - "logps/rejected": -298.36041259765625, - "loss": 0.0062, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2883840799331665, - "rewards/margins": 0.15867391228675842, - "rewards/rejected": -1.4470579624176025, + "logits/chosen": 0.6706225275993347, + "logits/rejected": 0.9435638189315796, + "logps/chosen": -339.28765869140625, + "logps/rejected": -400.4329528808594, + "loss": 0.551, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8451801538467407, + "rewards/margins": 0.6226030588150024, + "rewards/rejected": -2.4677834510803223, "step": 680 }, { - "debug/losses": 0.007645717356353998, - "debug/policy_weights": 0.011332646012306213, - "debug/raw_losses": 0.6681965589523315, "epoch": 0.5491444488658973, - "grad_norm": 0.44120218811957973, + "grad_norm": 18.972868571281783, "learning_rate": 2.5069504172710494e-07, - "logits/chosen": -2.5573880672454834, - "logits/rejected": -2.5541958808898926, - "logps/chosen": -282.6352233886719, - "logps/rejected": -315.0184020996094, - "loss": 0.0066, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3430020809173584, - "rewards/margins": 0.14582130312919617, - "rewards/rejected": -1.488823413848877, + "logits/chosen": 0.5228421092033386, + "logits/rejected": 0.6484982967376709, + "logps/chosen": -349.12841796875, + "logps/rejected": -433.3041076660156, + "loss": 0.5433, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.007934093475342, + "rewards/margins": 0.6637459993362427, + "rewards/rejected": -2.671679735183716, "step": 690 }, { - "debug/losses": 0.010622764006257057, - "debug/policy_weights": 0.016023432835936546, - "debug/raw_losses": 0.6925386190414429, "epoch": 0.5571030640668524, - "grad_norm": 0.5061171086550631, + "grad_norm": 20.904681899906066, "learning_rate": 2.4374526910277886e-07, - "logits/chosen": -2.541714906692505, - "logits/rejected": -2.5287177562713623, - "logps/chosen": -265.9856262207031, - "logps/rejected": -274.21514892578125, - "loss": 0.0088, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2257328033447266, - "rewards/margins": 0.126583069562912, - "rewards/rejected": -1.352315902709961, + "logits/chosen": 0.6581841111183167, + "logits/rejected": 0.9438881874084473, + "logps/chosen": -350.12841796875, + "logps/rejected": -397.9573974609375, + "loss": 0.556, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.0671610832214355, + "rewards/margins": 0.5225776433944702, + "rewards/rejected": -2.589738368988037, "step": 700 }, { "epoch": 0.5571030640668524, - "eval_debug/losses": 0.008296997286379337, - "eval_debug/policy_weights": 0.012994813732802868, - "eval_debug/raw_losses": 0.6414722800254822, - "eval_logits/chosen": -2.528498888015747, - "eval_logits/rejected": -2.5181169509887695, - "eval_logps/chosen": -263.69171142578125, - "eval_logps/rejected": -292.02899169921875, - "eval_loss": 0.010307971388101578, - "eval_rewards/accuracies": 0.621268630027771, - "eval_rewards/chosen": -1.1944820880889893, - "eval_rewards/margins": 0.21881809830665588, - "eval_rewards/rejected": -1.4133001565933228, - "eval_runtime": 153.7631, - "eval_samples_per_second": 55.618, - "eval_steps_per_second": 0.871, + "eval_logits/chosen": 0.7301986217498779, + "eval_logits/rejected": 0.9968724250793457, + "eval_logps/chosen": -355.7083740234375, + "eval_logps/rejected": -428.9443359375, + "eval_loss": 0.5501761436462402, + "eval_rewards/accuracies": 0.7201492786407471, + "eval_rewards/chosen": -2.1146485805511475, + "eval_rewards/margins": 0.6678044199943542, + "eval_rewards/rejected": -2.7824532985687256, + "eval_runtime": 153.1053, + "eval_samples_per_second": 55.857, + "eval_steps_per_second": 0.875, "step": 700 }, { - "debug/losses": 0.009021037258207798, - "debug/policy_weights": 0.012906293384730816, - "debug/raw_losses": 0.6762464046478271, "epoch": 0.5650616792678074, - "grad_norm": 0.6161976277127167, + "grad_norm": 16.070119739343724, "learning_rate": 2.368003306662104e-07, - "logits/chosen": -2.4972877502441406, - "logits/rejected": -2.476879596710205, - "logps/chosen": -283.9917297363281, - "logps/rejected": -287.7764587402344, - "loss": 0.0109, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.2839831113815308, - "rewards/margins": 0.1497708261013031, - "rewards/rejected": -1.4337539672851562, + "logits/chosen": 0.8950408101081848, + "logits/rejected": 1.2581216096878052, + "logps/chosen": -378.45245361328125, + "logps/rejected": -437.49542236328125, + "loss": 0.5443, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.228590488433838, + "rewards/margins": 0.7023531198501587, + "rewards/rejected": -2.930943489074707, "step": 710 }, { - "debug/losses": 0.008371567353606224, - "debug/policy_weights": 0.013729481026530266, - "debug/raw_losses": 0.6200276613235474, "epoch": 0.5730202944687625, - "grad_norm": 0.42374936555821074, + "grad_norm": 23.561044168016466, "learning_rate": 2.2986559405621886e-07, - "logits/chosen": -2.507690906524658, - "logits/rejected": -2.4873509407043457, - "logps/chosen": -275.37200927734375, - "logps/rejected": -295.102294921875, - "loss": 0.0097, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.1293296813964844, - "rewards/margins": 0.25335806608200073, - "rewards/rejected": -1.3826878070831299, + "logits/chosen": 0.8009999990463257, + "logits/rejected": 1.2362650632858276, + "logps/chosen": -351.4358215332031, + "logps/rejected": -408.4342346191406, + "loss": 0.5543, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.889967679977417, + "rewards/margins": 0.6260396242141724, + "rewards/rejected": -2.5160071849823, "step": 720 }, { - "debug/losses": 0.007534002419561148, - "debug/policy_weights": 0.011125156655907631, - "debug/raw_losses": 0.6151086688041687, "epoch": 0.5809789096697174, - "grad_norm": 0.5573003005885695, + "grad_norm": 17.715188324255074, "learning_rate": 2.2294641902678443e-07, - "logits/chosen": -2.481174945831299, - "logits/rejected": -2.4632718563079834, - "logps/chosen": -252.3656005859375, - "logps/rejected": -277.9593200683594, - "loss": 0.0076, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.1720126867294312, - "rewards/margins": 0.28889113664627075, - "rewards/rejected": -1.4609038829803467, + "logits/chosen": 0.6837292909622192, + "logits/rejected": 1.051099181175232, + "logps/chosen": -342.70269775390625, + "logps/rejected": -421.7893981933594, + "loss": 0.5134, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.075383424758911, + "rewards/margins": 0.8238208889961243, + "rewards/rejected": -2.8992042541503906, "step": 730 }, { - "debug/losses": 0.005707957781851292, - "debug/policy_weights": 0.009306355379521847, - "debug/raw_losses": 0.6428549885749817, "epoch": 0.5889375248706725, - "grad_norm": 0.4503498456808193, + "grad_norm": 19.531113452828787, "learning_rate": 2.160481533045751e-07, - "logits/chosen": -2.4572572708129883, - "logits/rejected": -2.4322314262390137, - "logps/chosen": -281.6376953125, - "logps/rejected": -291.15692138671875, - "loss": 0.0064, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3150320053100586, - "rewards/margins": 0.18938429653644562, - "rewards/rejected": -1.5044163465499878, + "logits/chosen": 0.7316833734512329, + "logits/rejected": 1.1876373291015625, + "logps/chosen": -371.8680725097656, + "logps/rejected": -437.3309020996094, + "loss": 0.5553, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.2173359394073486, + "rewards/margins": 0.7488197088241577, + "rewards/rejected": -2.966156005859375, "step": 740 }, { - "debug/losses": 0.0053934664465487, - "debug/policy_weights": 0.008661061525344849, - "debug/raw_losses": 0.6872018575668335, "epoch": 0.5968961400716275, - "grad_norm": 0.3047546226170542, + "grad_norm": 17.826971988142052, "learning_rate": 2.0917612845576882e-07, - "logits/chosen": -2.440650463104248, - "logits/rejected": -2.3968372344970703, - "logps/chosen": -293.74493408203125, - "logps/rejected": -286.46636962890625, - "loss": 0.0064, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3726236820220947, - "rewards/margins": 0.14107808470726013, - "rewards/rejected": -1.5137017965316772, + "logits/chosen": 0.47909075021743774, + "logits/rejected": 0.9067865610122681, + "logps/chosen": -358.71551513671875, + "logps/rejected": -414.9165954589844, + "loss": 0.5267, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.022329807281494, + "rewards/margins": 0.7758738994598389, + "rewards/rejected": -2.798203945159912, "step": 750 }, { - "debug/losses": 0.005546064116060734, - "debug/policy_weights": 0.008649295195937157, - "debug/raw_losses": 0.645238995552063, "epoch": 0.6048547552725826, - "grad_norm": 0.3407688186234949, + "grad_norm": 16.01258522652455, "learning_rate": 2.0233565576536564e-07, - "logits/chosen": -2.419266939163208, - "logits/rejected": -2.413198947906494, - "logps/chosen": -279.4403076171875, - "logps/rejected": -303.93389892578125, - "loss": 0.0065, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3747066259384155, - "rewards/margins": 0.19447334110736847, - "rewards/rejected": -1.5691800117492676, + "logits/chosen": 0.6064720153808594, + "logits/rejected": 0.7591885328292847, + "logps/chosen": -335.24798583984375, + "logps/rejected": -399.1866760253906, + "loss": 0.5594, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.9327834844589233, + "rewards/margins": 0.5889240503311157, + "rewards/rejected": -2.521707773208618, "step": 760 }, { - "debug/losses": 0.00917271338403225, - "debug/policy_weights": 0.014650100842118263, - "debug/raw_losses": 0.6324642300605774, "epoch": 0.6128133704735376, - "grad_norm": 0.3237617382662941, + "grad_norm": 15.946070991890718, "learning_rate": 1.9553202213217537e-07, - "logits/chosen": -2.3931689262390137, - "logits/rejected": -2.3785834312438965, - "logps/chosen": -262.7375183105469, - "logps/rejected": -290.7518615722656, - "loss": 0.0068, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3233850002288818, - "rewards/margins": 0.23029252886772156, - "rewards/rejected": -1.5536775588989258, + "logits/chosen": 0.1371159851551056, + "logits/rejected": 0.3473878800868988, + "logps/chosen": -303.07025146484375, + "logps/rejected": -386.58203125, + "loss": 0.5281, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7267131805419922, + "rewards/margins": 0.7852660417556763, + "rewards/rejected": -2.5119788646698, "step": 770 }, { - "debug/losses": 0.006442698650062084, - "debug/policy_weights": 0.010415828786790371, - "debug/raw_losses": 0.647124171257019, "epoch": 0.6207719856744927, - "grad_norm": 0.2720492701623735, + "grad_norm": 21.594187073993353, "learning_rate": 1.887704859826528e-07, - "logits/chosen": -2.424227476119995, - "logits/rejected": -2.4021072387695312, - "logps/chosen": -307.26513671875, - "logps/rejected": -332.822265625, - "loss": 0.0057, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.4601261615753174, - "rewards/margins": 0.22994783520698547, - "rewards/rejected": -1.6900737285614014, + "logits/chosen": 0.050722457468509674, + "logits/rejected": 0.4147067666053772, + "logps/chosen": -368.37640380859375, + "logps/rejected": -446.398681640625, + "loss": 0.5397, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0712389945983887, + "rewards/margins": 0.7545989155769348, + "rewards/rejected": -2.8258378505706787, "step": 780 }, { - "debug/losses": 0.004948030225932598, - "debug/policy_weights": 0.007730833254754543, - "debug/raw_losses": 0.6671634912490845, "epoch": 0.6287306008754476, - "grad_norm": 0.2528779472547584, + "grad_norm": 20.596254559457844, "learning_rate": 1.8205627320673836e-07, - "logits/chosen": -2.439519166946411, - "logits/rejected": -2.4123809337615967, - "logps/chosen": -307.4483337402344, - "logps/rejected": -322.8079528808594, - "loss": 0.0048, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.5389058589935303, - "rewards/margins": 0.22470760345458984, - "rewards/rejected": -1.7636134624481201, + "logits/chosen": 0.6064023375511169, + "logits/rejected": 1.075620174407959, + "logps/chosen": -376.5938720703125, + "logps/rejected": -465.5481872558594, + "loss": 0.5334, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.2303614616394043, + "rewards/margins": 0.9606544375419617, + "rewards/rejected": -3.1910159587860107, "step": 790 }, { - "debug/losses": 0.0038409889675676823, - "debug/policy_weights": 0.006472854875028133, - "debug/raw_losses": 0.6771573424339294, "epoch": 0.6366892160764027, - "grad_norm": 0.38520005295041226, + "grad_norm": 15.746309704598048, "learning_rate": 1.7539457311884675e-07, - "logits/chosen": -2.427551746368408, - "logits/rejected": -2.393444538116455, - "logps/chosen": -316.10980224609375, - "logps/rejected": -318.96435546875, - "loss": 0.0045, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.5842044353485107, - "rewards/margins": 0.14038211107254028, - "rewards/rejected": -1.7245864868164062, + "logits/chosen": 0.6997416615486145, + "logits/rejected": 1.1372315883636475, + "logps/chosen": -385.6691589355469, + "logps/rejected": -445.677978515625, + "loss": 0.5285, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.2797977924346924, + "rewards/margins": 0.7119247317314148, + "rewards/rejected": -2.991722583770752, "step": 800 }, { "epoch": 0.6366892160764027, - "eval_debug/losses": 0.0037104368675500154, - "eval_debug/policy_weights": 0.005816968157887459, - "eval_debug/raw_losses": 0.6507457494735718, - "eval_logits/chosen": -2.3915836811065674, - "eval_logits/rejected": -2.3813607692718506, - "eval_logps/chosen": -303.1590576171875, - "eval_logps/rejected": -332.9696044921875, - "eval_loss": 0.004839831031858921, - "eval_rewards/accuracies": 0.6054104566574097, - "eval_rewards/chosen": -1.5891555547714233, - "eval_rewards/margins": 0.23355057835578918, - "eval_rewards/rejected": -1.8227061033248901, - "eval_runtime": 153.8581, - "eval_samples_per_second": 55.584, - "eval_steps_per_second": 0.871, + "eval_logits/chosen": 0.6029295921325684, + "eval_logits/rejected": 0.85638028383255, + "eval_logps/chosen": -364.04046630859375, + "eval_logps/rejected": -445.2566833496094, + "eval_loss": 0.547686755657196, + "eval_rewards/accuracies": 0.7229477763175964, + "eval_rewards/chosen": -2.197970151901245, + "eval_rewards/margins": 0.7476070523262024, + "eval_rewards/rejected": -2.9455766677856445, + "eval_runtime": 153.0451, + "eval_samples_per_second": 55.879, + "eval_steps_per_second": 0.876, "step": 800 }, { - "debug/losses": 0.004317887127399445, - "debug/policy_weights": 0.007244779262691736, - "debug/raw_losses": 0.6528981924057007, "epoch": 0.6446478312773577, - "grad_norm": 0.25773730654278415, + "grad_norm": 17.76306380556218, "learning_rate": 1.687905344471226e-07, - "logits/chosen": -2.3889260292053223, - "logits/rejected": -2.3771719932556152, - "logps/chosen": -317.72149658203125, - "logps/rejected": -342.5805969238281, - "loss": 0.0056, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.5406662225723267, - "rewards/margins": 0.23261304199695587, - "rewards/rejected": -1.7732791900634766, + "logits/chosen": 0.5493108034133911, + "logits/rejected": 0.7385646104812622, + "logps/chosen": -376.4003601074219, + "logps/rejected": -441.7693786621094, + "loss": 0.5619, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.1274545192718506, + "rewards/margins": 0.6377121210098267, + "rewards/rejected": -2.765166759490967, "step": 810 }, { - "debug/losses": 0.01069083996117115, - "debug/policy_weights": 0.01425662636756897, - "debug/raw_losses": 0.7089247703552246, "epoch": 0.6526064464783128, - "grad_norm": 0.45661862942505177, + "grad_norm": 17.319583567554556, "learning_rate": 1.6224926135406693e-07, - "logits/chosen": -2.3906173706054688, - "logits/rejected": -2.3693528175354004, - "logps/chosen": -290.47161865234375, - "logps/rejected": -290.6516418457031, - "loss": 0.0059, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.4589916467666626, - "rewards/margins": 0.10409541428089142, - "rewards/rejected": -1.563086986541748, + "logits/chosen": 0.29170387983322144, + "logits/rejected": 0.5566233992576599, + "logps/chosen": -364.3298645019531, + "logps/rejected": -414.4306640625, + "loss": 0.5414, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1975741386413574, + "rewards/margins": 0.6033033728599548, + "rewards/rejected": -2.800877332687378, "step": 820 }, { - "debug/losses": 0.004518064670264721, - "debug/policy_weights": 0.007354943547397852, - "debug/raw_losses": 0.6433005928993225, "epoch": 0.6605650616792678, - "grad_norm": 0.3469646777929176, + "grad_norm": 15.746531845729336, "learning_rate": 1.557758094916053e-07, - "logits/chosen": -2.389251232147217, - "logits/rejected": -2.3616697788238525, - "logps/chosen": -308.2472229003906, - "logps/rejected": -330.23724365234375, - "loss": 0.0054, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.5131051540374756, - "rewards/margins": 0.225154310464859, - "rewards/rejected": -1.7382595539093018, + "logits/chosen": -0.04723300039768219, + "logits/rejected": 0.32069122791290283, + "logps/chosen": -368.81829833984375, + "logps/rejected": -438.3575134277344, + "loss": 0.5396, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.1188158988952637, + "rewards/margins": 0.700646162033081, + "rewards/rejected": -2.819462299346924, "step": 830 }, { - "debug/losses": 0.005165197886526585, - "debug/policy_weights": 0.007880007848143578, - "debug/raw_losses": 0.6706713438034058, "epoch": 0.6685236768802229, - "grad_norm": 0.23676325708483353, + "grad_norm": 16.062030141781605, "learning_rate": 1.4937518209365108e-07, - "logits/chosen": -2.434936761856079, - "logits/rejected": -2.3894906044006348, - "logps/chosen": -328.7256164550781, - "logps/rejected": -319.41424560546875, - "loss": 0.0059, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.5028477907180786, - "rewards/margins": 0.1583162099123001, - "rewards/rejected": -1.6611640453338623, + "logits/chosen": -0.0310503002256155, + "logits/rejected": 0.4633910059928894, + "logps/chosen": -385.63214111328125, + "logps/rejected": -429.7618103027344, + "loss": 0.556, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.071913003921509, + "rewards/margins": 0.6927269101142883, + "rewards/rejected": -2.7646398544311523, "step": 840 }, { - "debug/losses": 0.005795011762529612, - "debug/policy_weights": 0.009083392098546028, - "debug/raw_losses": 0.6539183855056763, "epoch": 0.6764822920811778, - "grad_norm": 0.4329258112752223, + "grad_norm": 17.885863472179913, "learning_rate": 1.4305232610918045e-07, - "logits/chosen": -2.390982151031494, - "logits/rejected": -2.3737387657165527, - "logps/chosen": -297.3653259277344, - "logps/rejected": -311.5605163574219, - "loss": 0.0056, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.5131553411483765, - "rewards/margins": 0.1754181981086731, - "rewards/rejected": -1.6885734796524048, + "logits/chosen": 0.18630388379096985, + "logits/rejected": 0.4484528601169586, + "logps/chosen": -366.92767333984375, + "logps/rejected": -433.56396484375, + "loss": 0.5565, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.2087783813476562, + "rewards/margins": 0.6998289227485657, + "rewards/rejected": -2.9086074829101562, "step": 850 }, { - "debug/losses": 0.004215762950479984, - "debug/policy_weights": 0.006693544331938028, - "debug/raw_losses": 0.6309518814086914, "epoch": 0.6844409072821329, - "grad_norm": 0.388110373764802, + "grad_norm": 15.92958193178829, "learning_rate": 1.3681212837880977e-07, - "logits/chosen": -2.399656295776367, - "logits/rejected": -2.402662992477417, - "logps/chosen": -275.9090270996094, - "logps/rejected": -318.91278076171875, - "loss": 0.0056, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.4184858798980713, - "rewards/margins": 0.24759134650230408, - "rewards/rejected": -1.6660772562026978, + "logits/chosen": 0.26056399941444397, + "logits/rejected": 0.38261863589286804, + "logps/chosen": -355.76263427734375, + "logps/rejected": -432.66888427734375, + "loss": 0.5366, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.217022180557251, + "rewards/margins": 0.5866159796714783, + "rewards/rejected": -2.803637981414795, "step": 860 }, { - "debug/losses": 0.005548650864511728, - "debug/policy_weights": 0.009830271825194359, - "debug/raw_losses": 0.6559033989906311, "epoch": 0.6923995224830879, - "grad_norm": 0.33751112986558723, + "grad_norm": 15.260112587373007, "learning_rate": 1.3065941185782977e-07, - "logits/chosen": -2.3506875038146973, - "logits/rejected": -2.3303661346435547, - "logps/chosen": -298.9208984375, - "logps/rejected": -301.32733154296875, - "loss": 0.0062, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.4669424295425415, - "rewards/margins": 0.1859736144542694, - "rewards/rejected": -1.6529157161712646, + "logits/chosen": 0.5801733732223511, + "logits/rejected": 0.8631863594055176, + "logps/chosen": -384.7486877441406, + "logps/rejected": -430.93670654296875, + "loss": 0.5471, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.3252196311950684, + "rewards/margins": 0.6237902641296387, + "rewards/rejected": -2.949010133743286, "step": 870 }, { - "debug/losses": 0.004481197334825993, - "debug/policy_weights": 0.008680048398673534, - "debug/raw_losses": 0.5855633020401001, "epoch": 0.700358137684043, - "grad_norm": 0.4621250249815187, + "grad_norm": 17.704754563268107, "learning_rate": 1.2459893188861613e-07, - "logits/chosen": -2.4257559776306152, - "logits/rejected": -2.4043354988098145, - "logps/chosen": -283.9574890136719, - "logps/rejected": -336.14398193359375, - "loss": 0.0056, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.354714274406433, - "rewards/margins": 0.3821246027946472, - "rewards/rejected": -1.736838698387146, + "logits/chosen": 0.1365218460559845, + "logits/rejected": 0.5395029783248901, + "logps/chosen": -343.422119140625, + "logps/rejected": -448.3148498535156, + "loss": 0.5426, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.9493602514266968, + "rewards/margins": 0.9091870188713074, + "rewards/rejected": -2.8585472106933594, "step": 880 }, { - "debug/losses": 0.00681277085095644, - "debug/policy_weights": 0.010610613040626049, - "debug/raw_losses": 0.6493979692459106, "epoch": 0.708316752884998, - "grad_norm": 0.38768279937627187, + "grad_norm": 15.681417295423357, "learning_rate": 1.1863537252529548e-07, - "logits/chosen": -2.362109899520874, - "logits/rejected": -2.325230598449707, - "logps/chosen": -305.0633239746094, - "logps/rejected": -315.7812194824219, - "loss": 0.006, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.4679639339447021, - "rewards/margins": 0.1843653917312622, - "rewards/rejected": -1.652329444885254, + "logits/chosen": 0.5491958260536194, + "logits/rejected": 1.0280582904815674, + "logps/chosen": -366.83544921875, + "logps/rejected": -434.9790954589844, + "loss": 0.5335, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0856850147247314, + "rewards/margins": 0.7586231231689453, + "rewards/rejected": -2.8443081378936768, "step": 890 }, { - "debug/losses": 0.005336861591786146, - "debug/policy_weights": 0.00900069810450077, - "debug/raw_losses": 0.6543849110603333, "epoch": 0.716275368085953, - "grad_norm": 0.17091916151059527, + "grad_norm": 16.956525191226525, "learning_rate": 1.1277334291351145e-07, - "logits/chosen": -2.3315508365631104, - "logits/rejected": -2.3106534481048584, - "logps/chosen": -279.8155822753906, - "logps/rejected": -306.58892822265625, - "loss": 0.0058, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.424738883972168, - "rewards/margins": 0.20865151286125183, - "rewards/rejected": -1.6333904266357422, + "logits/chosen": 0.774886429309845, + "logits/rejected": 1.0509330034255981, + "logps/chosen": -348.33258056640625, + "logps/rejected": -434.04931640625, + "loss": 0.5299, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1099088191986084, + "rewards/margins": 0.79808509349823, + "rewards/rejected": -2.907993793487549, "step": 900 }, { "epoch": 0.716275368085953, - "eval_debug/losses": 0.005231600254774094, - "eval_debug/policy_weights": 0.00833048578351736, - "eval_debug/raw_losses": 0.6435883641242981, - "eval_logits/chosen": -2.35436749458313, - "eval_logits/rejected": -2.34352445602417, - "eval_logps/chosen": -286.1336364746094, - "eval_logps/rejected": -315.2442321777344, - "eval_loss": 0.006573778111487627, - "eval_rewards/accuracies": 0.6054104566574097, - "eval_rewards/chosen": -1.4189012050628662, - "eval_rewards/margins": 0.22655123472213745, - "eval_rewards/rejected": -1.6454524993896484, - "eval_runtime": 153.8078, - "eval_samples_per_second": 55.602, - "eval_steps_per_second": 0.871, + "eval_logits/chosen": 0.7089307904243469, + "eval_logits/rejected": 0.9831804633140564, + "eval_logps/chosen": -355.4507751464844, + "eval_logps/rejected": -435.81591796875, + "eval_loss": 0.54501873254776, + "eval_rewards/accuracies": 0.7341417670249939, + "eval_rewards/chosen": -2.112072706222534, + "eval_rewards/margins": 0.7390963435173035, + "eval_rewards/rejected": -2.851168632507324, + "eval_runtime": 153.2199, + "eval_samples_per_second": 55.815, + "eval_steps_per_second": 0.875, "step": 900 }, { - "debug/losses": 0.007089519407600164, - "debug/policy_weights": 0.011885651387274265, - "debug/raw_losses": 0.5784580111503601, "epoch": 0.724233983286908, - "grad_norm": 0.5904644347701085, + "grad_norm": 18.36522083595644, "learning_rate": 1.0701737372808431e-07, - "logits/chosen": -2.3350112438201904, - "logits/rejected": -2.3275089263916016, - "logps/chosen": -270.8136291503906, - "logps/rejected": -324.17626953125, - "loss": 0.0062, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2706687450408936, - "rewards/margins": 0.4044477045536041, - "rewards/rejected": -1.6751163005828857, + "logits/chosen": 0.9391189813613892, + "logits/rejected": 1.0841562747955322, + "logps/chosen": -332.28680419921875, + "logps/rejected": -432.437255859375, + "loss": 0.5285, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8854007720947266, + "rewards/margins": 0.872325599193573, + "rewards/rejected": -2.7577261924743652, "step": 910 }, { - "debug/losses": 0.005501788109540939, - "debug/policy_weights": 0.009571429342031479, - "debug/raw_losses": 0.6771363019943237, "epoch": 0.7321925984878631, - "grad_norm": 0.3840464352558924, + "grad_norm": 14.771527831877176, "learning_rate": 1.0137191367132078e-07, - "logits/chosen": -2.377220630645752, - "logits/rejected": -2.363191843032837, - "logps/chosen": -311.4690246582031, - "logps/rejected": -336.18658447265625, - "loss": 0.0055, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.553757905960083, - "rewards/margins": 0.2019537389278412, - "rewards/rejected": -1.7557117938995361, + "logits/chosen": 0.9683534502983093, + "logits/rejected": 1.2063651084899902, + "logps/chosen": -393.1899108886719, + "logps/rejected": -460.9879455566406, + "loss": 0.5276, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.370966911315918, + "rewards/margins": 0.6327589154243469, + "rewards/rejected": -3.003725528717041, "step": 920 }, { - "debug/losses": 0.005613836459815502, - "debug/policy_weights": 0.008208638988435268, - "debug/raw_losses": 0.691414475440979, "epoch": 0.7401512136888182, - "grad_norm": 0.2826122830316169, + "grad_norm": 15.363649277993106, "learning_rate": 9.584132603467827e-08, - "logits/chosen": -2.379488229751587, - "logits/rejected": -2.340334415435791, - "logps/chosen": -326.0104064941406, - "logps/rejected": -330.33380126953125, - "loss": 0.0051, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.588404893875122, - "rewards/margins": 0.17209449410438538, - "rewards/rejected": -1.760499358177185, + "logits/chosen": 1.2887399196624756, + "logits/rejected": 1.730613350868225, + "logps/chosen": -417.56500244140625, + "logps/rejected": -460.6282653808594, + "loss": 0.5419, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.50395131111145, + "rewards/margins": 0.5594934225082397, + "rewards/rejected": -3.0634446144104004, "step": 930 }, { - "debug/losses": 0.005423235706984997, - "debug/policy_weights": 0.008763330057263374, - "debug/raw_losses": 0.6793016195297241, "epoch": 0.7481098288897732, - "grad_norm": 0.38257682570765483, + "grad_norm": 15.533558224288868, "learning_rate": 9.042988532644249e-08, - "logits/chosen": -2.340975046157837, - "logits/rejected": -2.329636573791504, - "logps/chosen": -295.8883972167969, - "logps/rejected": -325.37689208984375, - "loss": 0.0051, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.5253543853759766, - "rewards/margins": 0.1810278445482254, - "rewards/rejected": -1.706382393836975, + "logits/chosen": 1.2376580238342285, + "logits/rejected": 1.4853687286376953, + "logps/chosen": -384.9251403808594, + "logps/rejected": -471.8038635253906, + "loss": 0.5375, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.4157216548919678, + "rewards/margins": 0.7549302577972412, + "rewards/rejected": -3.170651912689209, "step": 940 }, { - "debug/losses": 0.006376519799232483, - "debug/policy_weights": 0.00958496518433094, - "debug/raw_losses": 0.6660071611404419, "epoch": 0.7560684440907283, - "grad_norm": 0.19253023210505116, + "grad_norm": 17.631353509539945, "learning_rate": 8.514177396802428e-08, - "logits/chosen": -2.3895061016082764, - "logits/rejected": -2.372832775115967, - "logps/chosen": -293.9468688964844, - "logps/rejected": -323.3780212402344, - "loss": 0.0053, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.5407609939575195, - "rewards/margins": 0.22410783171653748, - "rewards/rejected": -1.764868974685669, + "logits/chosen": 0.7006018757820129, + "logits/rejected": 0.9818047285079956, + "logps/chosen": -390.9950256347656, + "logps/rejected": -467.605224609375, + "loss": 0.5352, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.511242389678955, + "rewards/margins": 0.6958988308906555, + "rewards/rejected": -3.207141160964966, "step": 950 }, { - "debug/losses": 0.004370839335024357, - "debug/policy_weights": 0.0070319585502147675, - "debug/raw_losses": 0.6650201082229614, "epoch": 0.7640270592916832, - "grad_norm": 0.20541088275423153, + "grad_norm": 16.74451609121504, "learning_rate": 7.998107906142839e-08, - "logits/chosen": -2.370800733566284, - "logits/rejected": -2.353532075881958, - "logps/chosen": -291.18597412109375, - "logps/rejected": -303.9618835449219, - "loss": 0.0047, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.481643557548523, - "rewards/margins": 0.19196270406246185, - "rewards/rejected": -1.673606276512146, + "logits/chosen": 0.650057315826416, + "logits/rejected": 0.9443982243537903, + "logps/chosen": -383.0691223144531, + "logps/rejected": -440.1776428222656, + "loss": 0.5429, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.400475025177002, + "rewards/margins": 0.6352895498275757, + "rewards/rejected": -3.035764455795288, "step": 960 }, { - "debug/losses": 0.005208554677665234, - "debug/policy_weights": 0.008515411987900734, - "debug/raw_losses": 0.5917180776596069, "epoch": 0.7719856744926383, - "grad_norm": 0.35390482583145494, + "grad_norm": 17.31477017879575, "learning_rate": 7.495178923039396e-08, - "logits/chosen": -2.355196475982666, - "logits/rejected": -2.3742012977600098, - "logps/chosen": -271.4175720214844, - "logps/rejected": -340.2388916015625, - "loss": 0.0054, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.4274370670318604, - "rewards/margins": 0.4177183508872986, - "rewards/rejected": -1.8451553583145142, + "logits/chosen": 0.6368435621261597, + "logits/rejected": 0.6060078144073486, + "logps/chosen": -356.3271484375, + "logps/rejected": -451.2137145996094, + "loss": 0.5412, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.2765326499938965, + "rewards/margins": 0.678371250629425, + "rewards/rejected": -2.9549036026000977, "step": 970 }, { - "debug/losses": 0.004732693079859018, - "debug/policy_weights": 0.0064360699616372585, - "debug/raw_losses": 0.6343146562576294, "epoch": 0.7799442896935933, - "grad_norm": 0.5203224890068501, + "grad_norm": 19.22477593332992, "learning_rate": 7.005779153764682e-08, - "logits/chosen": -2.37237811088562, - "logits/rejected": -2.3444859981536865, - "logps/chosen": -286.66265869140625, - "logps/rejected": -308.6042175292969, - "loss": 0.0062, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.4387918710708618, - "rewards/margins": 0.272369921207428, - "rewards/rejected": -1.7111618518829346, + "logits/chosen": 0.31581220030784607, + "logits/rejected": 0.7790960073471069, + "logps/chosen": -355.78216552734375, + "logps/rejected": -430.12115478515625, + "loss": 0.5252, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1299870014190674, + "rewards/margins": 0.7963441610336304, + "rewards/rejected": -2.926331043243408, "step": 980 }, { - "debug/losses": 0.006554176565259695, - "debug/policy_weights": 0.00958862341940403, - "debug/raw_losses": 0.6320292949676514, "epoch": 0.7879029048945484, - "grad_norm": 0.43650906403321443, + "grad_norm": 23.587781729505224, "learning_rate": 6.530286848064698e-08, - "logits/chosen": -2.3619437217712402, - "logits/rejected": -2.350609302520752, - "logps/chosen": -294.9227294921875, - "logps/rejected": -325.01312255859375, - "loss": 0.0058, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.4177331924438477, - "rewards/margins": 0.26314035058021545, - "rewards/rejected": -1.6808735132217407, + "logits/chosen": 0.5126671195030212, + "logits/rejected": 0.7368132472038269, + "logps/chosen": -354.09619140625, + "logps/rejected": -436.5574645996094, + "loss": 0.5206, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0094680786132812, + "rewards/margins": 0.7868490815162659, + "rewards/rejected": -2.7963171005249023, "step": 990 }, { - "debug/losses": 0.004413291346281767, - "debug/policy_weights": 0.0070404470898211, - "debug/raw_losses": 0.6145377159118652, "epoch": 0.7958615200955034, - "grad_norm": 0.27350493107078006, + "grad_norm": 23.091191084575406, "learning_rate": 6.069069506815325e-08, - "logits/chosen": -2.4027926921844482, - "logits/rejected": -2.3677072525024414, - "logps/chosen": -288.64447021484375, - "logps/rejected": -314.2682189941406, - "loss": 0.006, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.4373289346694946, - "rewards/margins": 0.3009566068649292, - "rewards/rejected": -1.7382856607437134, + "logits/chosen": 0.4187610149383545, + "logits/rejected": 0.844292938709259, + "logps/chosen": -353.77496337890625, + "logps/rejected": -425.384521484375, + "loss": 0.5629, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0886340141296387, + "rewards/margins": 0.760814368724823, + "rewards/rejected": -2.8494486808776855, "step": 1000 }, { "epoch": 0.7958615200955034, - "eval_debug/losses": 0.00470565864816308, - "eval_debug/policy_weights": 0.007544398307800293, - "eval_debug/raw_losses": 0.6449161171913147, - "eval_logits/chosen": -2.370081901550293, - "eval_logits/rejected": -2.3586676120758057, - "eval_logps/chosen": -290.10247802734375, - "eval_logps/rejected": -320.6679382324219, - "eval_loss": 0.00624101934954524, - "eval_rewards/accuracies": 0.6091417670249939, - "eval_rewards/chosen": -1.4585894346237183, - "eval_rewards/margins": 0.2410999983549118, - "eval_rewards/rejected": -1.6996896266937256, - "eval_runtime": 153.7579, - "eval_samples_per_second": 55.62, - "eval_steps_per_second": 0.872, + "eval_logits/chosen": 0.4599636495113373, + "eval_logits/rejected": 0.7032696604728699, + "eval_logps/chosen": -359.07489013671875, + "eval_logps/rejected": -440.1051330566406, + "eval_loss": 0.544038712978363, + "eval_rewards/accuracies": 0.7322761416435242, + "eval_rewards/chosen": -2.148313522338867, + "eval_rewards/margins": 0.7457479238510132, + "eval_rewards/rejected": -2.894061326980591, + "eval_runtime": 152.9965, + "eval_samples_per_second": 55.897, + "eval_steps_per_second": 0.876, "step": 1000 }, { - "debug/losses": 0.007456570863723755, - "debug/policy_weights": 0.011332079768180847, - "debug/raw_losses": 0.7036976218223572, "epoch": 0.8038201352964585, - "grad_norm": 0.5119074592142228, + "grad_norm": 16.757821822715535, "learning_rate": 5.6224835979863714e-08, - "logits/chosen": -2.3976004123687744, - "logits/rejected": -2.37339448928833, - "logps/chosen": -306.21209716796875, - "logps/rejected": -308.34552001953125, - "loss": 0.0058, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.5589864253997803, - "rewards/margins": 0.09942086786031723, - "rewards/rejected": -1.65840744972229, + "logits/chosen": 0.31164878606796265, + "logits/rejected": 0.6217297911643982, + "logps/chosen": -365.158935546875, + "logps/rejected": -421.8297424316406, + "loss": 0.542, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.1484549045562744, + "rewards/margins": 0.6447950601577759, + "rewards/rejected": -2.7932498455047607, "step": 1010 }, { - "debug/losses": 0.0031814612448215485, - "debug/policy_weights": 0.005595595575869083, - "debug/raw_losses": 0.6678837537765503, "epoch": 0.8117787504974134, - "grad_norm": 0.41900320597533136, + "grad_norm": 18.84738392137227, "learning_rate": 5.190874281132851e-08, - "logits/chosen": -2.3519294261932373, - "logits/rejected": -2.347428321838379, - "logps/chosen": -293.00933837890625, - "logps/rejected": -314.662841796875, - "loss": 0.0052, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.5330232381820679, - "rewards/margins": 0.18833503127098083, - "rewards/rejected": -1.721358060836792, + "logits/chosen": 0.4319641590118408, + "logits/rejected": 0.6768335700035095, + "logps/chosen": -342.3192138671875, + "logps/rejected": -426.79150390625, + "loss": 0.5334, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0261223316192627, + "rewards/margins": 0.8165225982666016, + "rewards/rejected": -2.842644453048706, "step": 1020 }, { - "debug/losses": 0.004050451796501875, - "debug/policy_weights": 0.0068862466141581535, - "debug/raw_losses": 0.6581259369850159, "epoch": 0.8197373656983685, - "grad_norm": 0.24983618265757235, + "grad_norm": 17.342020030463754, "learning_rate": 4.774575140626316e-08, - "logits/chosen": -2.3442137241363525, - "logits/rejected": -2.3174569606781006, - "logps/chosen": -290.02569580078125, - "logps/rejected": -304.29071044921875, - "loss": 0.0049, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.4996235370635986, - "rewards/margins": 0.2191561907529831, - "rewards/rejected": -1.7187799215316772, + "logits/chosen": 0.5502648949623108, + "logits/rejected": 0.9354747533798218, + "logps/chosen": -352.39959716796875, + "logps/rejected": -427.843017578125, + "loss": 0.5377, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1233630180358887, + "rewards/margins": 0.8309398889541626, + "rewards/rejected": -2.9543025493621826, "step": 1030 }, { - "debug/losses": 0.005188937298953533, - "debug/policy_weights": 0.007965468801558018, - "debug/raw_losses": 0.6591325998306274, "epoch": 0.8276959808993235, - "grad_norm": 0.2821038337870524, + "grad_norm": 18.89903354492542, "learning_rate": 4.373907927832513e-08, - "logits/chosen": -2.342741012573242, - "logits/rejected": -2.334594964981079, - "logps/chosen": -269.0764465332031, - "logps/rejected": -300.1712951660156, - "loss": 0.0055, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.4137346744537354, - "rewards/margins": 0.19867181777954102, - "rewards/rejected": -1.6124063730239868, + "logits/chosen": 0.6125099062919617, + "logits/rejected": 0.7803068161010742, + "logps/chosen": -340.08282470703125, + "logps/rejected": -415.82928466796875, + "loss": 0.5402, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.1237986087799072, + "rewards/margins": 0.6451882123947144, + "rewards/rejected": -2.768986701965332, "step": 1040 }, { - "debug/losses": 0.005739947315305471, - "debug/policy_weights": 0.009350694715976715, - "debug/raw_losses": 0.6166882514953613, "epoch": 0.8356545961002786, - "grad_norm": 0.3906592246303309, + "grad_norm": 15.617092554442154, "learning_rate": 3.9891823124345665e-08, - "logits/chosen": -2.3692679405212402, - "logits/rejected": -2.3485307693481445, - "logps/chosen": -284.3598327636719, - "logps/rejected": -309.95416259765625, - "loss": 0.0062, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3334324359893799, - "rewards/margins": 0.28686991333961487, - "rewards/rejected": -1.6203022003173828, + "logits/chosen": 0.49313363432884216, + "logits/rejected": 0.6866206526756287, + "logps/chosen": -347.2782287597656, + "logps/rejected": -431.2796325683594, + "loss": 0.54, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.9626163244247437, + "rewards/margins": 0.8709405660629272, + "rewards/rejected": -2.833556890487671, "step": 1050 }, { - "debug/losses": 0.0042928243055939674, - "debug/policy_weights": 0.007413353770971298, - "debug/raw_losses": 0.6195831894874573, "epoch": 0.8436132113012336, - "grad_norm": 0.25428481628705635, + "grad_norm": 16.41345421019679, "learning_rate": 3.620695643093924e-08, - "logits/chosen": -2.3796534538269043, - "logits/rejected": -2.3795361518859863, - "logps/chosen": -268.11834716796875, - "logps/rejected": -309.98876953125, - "loss": 0.0056, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3133149147033691, - "rewards/margins": 0.2827284038066864, - "rewards/rejected": -1.5960432291030884, + "logits/chosen": 0.5007290840148926, + "logits/rejected": 0.6147471070289612, + "logps/chosen": -332.5653381347656, + "logps/rejected": -410.90826416015625, + "loss": 0.5381, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.957784652709961, + "rewards/margins": 0.6474533677101135, + "rewards/rejected": -2.6052377223968506, "step": 1060 }, { - "debug/losses": 0.007709466852247715, - "debug/policy_weights": 0.011627515777945518, - "debug/raw_losses": 0.6371027231216431, "epoch": 0.8515718265021887, - "grad_norm": 0.5229626352477018, + "grad_norm": 17.140432579260622, "learning_rate": 3.268732717634032e-08, - "logits/chosen": -2.3665263652801514, - "logits/rejected": -2.348654270172119, - "logps/chosen": -270.7255554199219, - "logps/rejected": -290.8115539550781, - "loss": 0.0065, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3433122634887695, - "rewards/margins": 0.22386956214904785, - "rewards/rejected": -1.5671818256378174, + "logits/chosen": 0.4870120882987976, + "logits/rejected": 0.6993613839149475, + "logps/chosen": -340.8184814453125, + "logps/rejected": -407.65557861328125, + "loss": 0.5422, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.044240951538086, + "rewards/margins": 0.6913812756538391, + "rewards/rejected": -2.7356221675872803, "step": 1070 }, { - "debug/losses": 0.005681043956428766, - "debug/policy_weights": 0.009358206763863564, - "debug/raw_losses": 0.6649767160415649, "epoch": 0.8595304417031436, - "grad_norm": 0.2210256720780983, + "grad_norm": 16.694484491754185, "learning_rate": 2.9335655629243645e-08, - "logits/chosen": -2.3961851596832275, - "logits/rejected": -2.39003324508667, - "logps/chosen": -292.87030029296875, - "logps/rejected": -323.01873779296875, - "loss": 0.0066, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.4248764514923096, - "rewards/margins": 0.1889083832502365, - "rewards/rejected": -1.6137847900390625, + "logits/chosen": 0.4208219647407532, + "logits/rejected": 0.6202067136764526, + "logps/chosen": -369.6133728027344, + "logps/rejected": -443.1822204589844, + "loss": 0.5484, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.1923069953918457, + "rewards/margins": 0.6231125593185425, + "rewards/rejected": -2.8154194355010986, "step": 1080 }, { - "debug/losses": 0.007488328032195568, - "debug/policy_weights": 0.010853718966245651, - "debug/raw_losses": 0.7004289627075195, "epoch": 0.8674890569040987, - "grad_norm": 0.2528842128922513, + "grad_norm": 22.133523730480416, "learning_rate": 2.6154532246349476e-08, - "logits/chosen": -2.3703179359436035, - "logits/rejected": -2.331704616546631, - "logps/chosen": -289.47943115234375, - "logps/rejected": -282.0934143066406, - "loss": 0.0071, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.479093313217163, - "rewards/margins": 0.10569523274898529, - "rewards/rejected": -1.58478844165802, + "logits/chosen": 0.2894337773323059, + "logits/rejected": 0.7369552254676819, + "logps/chosen": -358.96392822265625, + "logps/rejected": -397.40496826171875, + "loss": 0.5698, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.173938274383545, + "rewards/margins": 0.5639660358428955, + "rewards/rejected": -2.7379043102264404, "step": 1090 }, { - "debug/losses": 0.006660944316536188, - "debug/policy_weights": 0.010694949887692928, - "debug/raw_losses": 0.7022801041603088, "epoch": 0.8754476721050537, - "grad_norm": 0.4137044736435946, + "grad_norm": 18.678051085655653, "learning_rate": 2.31464156702382e-08, - "logits/chosen": -2.3767080307006836, - "logits/rejected": -2.339198589324951, - "logps/chosen": -301.8552551269531, - "logps/rejected": -307.1533203125, - "loss": 0.0058, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.4526488780975342, - "rewards/margins": 0.1316477358341217, - "rewards/rejected": -1.584296703338623, + "logits/chosen": 0.37609541416168213, + "logits/rejected": 0.7177656888961792, + "logps/chosen": -366.3799743652344, + "logps/rejected": -418.73046875, + "loss": 0.5351, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.097896099090576, + "rewards/margins": 0.6021716594696045, + "rewards/rejected": -2.7000677585601807, "step": 1100 }, { "epoch": 0.8754476721050537, - "eval_debug/losses": 0.0052309781312942505, - "eval_debug/policy_weights": 0.008395406417548656, - "eval_debug/raw_losses": 0.6403370499610901, - "eval_logits/chosen": -2.379594087600708, - "eval_logits/rejected": -2.367927074432373, - "eval_logps/chosen": -284.0606384277344, - "eval_logps/rejected": -315.5556945800781, - "eval_loss": 0.006968467030674219, - "eval_rewards/accuracies": 0.6063432693481445, - "eval_rewards/chosen": -1.3981714248657227, - "eval_rewards/margins": 0.25039568543434143, - "eval_rewards/rejected": -1.6485670804977417, - "eval_runtime": 153.7503, - "eval_samples_per_second": 55.623, - "eval_steps_per_second": 0.872, + "eval_logits/chosen": 0.2752957046031952, + "eval_logits/rejected": 0.5029404759407043, + "eval_logps/chosen": -359.2066345214844, + "eval_logps/rejected": -436.4062194824219, + "eval_loss": 0.542252242565155, + "eval_rewards/accuracies": 0.7304104566574097, + "eval_rewards/chosen": -2.1496312618255615, + "eval_rewards/margins": 0.7074410319328308, + "eval_rewards/rejected": -2.857072353363037, + "eval_runtime": 153.1272, + "eval_samples_per_second": 55.849, + "eval_steps_per_second": 0.875, "step": 1100 }, { - "debug/losses": 0.006308531854301691, - "debug/policy_weights": 0.01133556105196476, - "debug/raw_losses": 0.643947958946228, "epoch": 0.8834062873060088, - "grad_norm": 0.7148622916690225, + "grad_norm": 20.61267635187729, "learning_rate": 2.031363082912252e-08, - "logits/chosen": -2.3657798767089844, - "logits/rejected": -2.3629374504089355, - "logps/chosen": -265.7649230957031, - "logps/rejected": -295.9998779296875, - "loss": 0.0065, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.342302918434143, - "rewards/margins": 0.21107757091522217, - "rewards/rejected": -1.5533803701400757, + "logits/chosen": 0.4241918623447418, + "logits/rejected": 0.5393252968788147, + "logps/chosen": -339.3815612792969, + "logps/rejected": -413.20794677734375, + "loss": 0.5198, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.0784695148468018, + "rewards/margins": 0.6469917893409729, + "rewards/rejected": -2.72546124458313, "step": 1110 }, { - "debug/losses": 0.00507017970085144, - "debug/policy_weights": 0.0074596283957362175, - "debug/raw_losses": 0.6618943214416504, "epoch": 0.8913649025069638, - "grad_norm": 0.32311277344873013, + "grad_norm": 18.0074651342195, "learning_rate": 1.7658367139945228e-08, - "logits/chosen": -2.3766467571258545, - "logits/rejected": -2.355742931365967, - "logps/chosen": -284.4914855957031, - "logps/rejected": -304.9427185058594, - "loss": 0.0054, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.4018304347991943, - "rewards/margins": 0.21497675776481628, - "rewards/rejected": -1.6168073415756226, + "logits/chosen": 0.34336820244789124, + "logits/rejected": 0.5900403261184692, + "logps/chosen": -356.97467041015625, + "logps/rejected": -420.053955078125, + "loss": 0.532, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.126662492752075, + "rewards/margins": 0.6412577033042908, + "rewards/rejected": -2.7679200172424316, "step": 1120 }, { - "debug/losses": 0.005239368416368961, - "debug/policy_weights": 0.008933836594223976, - "debug/raw_losses": 0.632648229598999, "epoch": 0.8993235177079189, - "grad_norm": 0.3517353827105627, + "grad_norm": 20.08213128535877, "learning_rate": 1.5182676816211632e-08, - "logits/chosen": -2.3913938999176025, - "logits/rejected": -2.3770534992218018, - "logps/chosen": -281.8226623535156, - "logps/rejected": -323.12744140625, - "loss": 0.0066, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.394094467163086, - "rewards/margins": 0.274956613779068, - "rewards/rejected": -1.669050931930542, + "logits/chosen": 0.3494935631752014, + "logits/rejected": 0.6701606512069702, + "logps/chosen": -356.5093078613281, + "logps/rejected": -447.0511169433594, + "loss": 0.5238, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.140960931777954, + "rewards/margins": 0.7673269510269165, + "rewards/rejected": -2.908287763595581, "step": 1130 }, { - "debug/losses": 0.013167977333068848, - "debug/policy_weights": 0.019408199936151505, - "debug/raw_losses": 0.6732186079025269, "epoch": 0.9072821329088738, - "grad_norm": 0.3224072778132541, + "grad_norm": 15.635113581490817, "learning_rate": 1.2888473281864597e-08, - "logits/chosen": -2.4019036293029785, - "logits/rejected": -2.384862184524536, - "logps/chosen": -294.0433654785156, - "logps/rejected": -304.0968322753906, - "loss": 0.0077, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.362282156944275, - "rewards/margins": 0.1493268460035324, - "rewards/rejected": -1.5116089582443237, + "logits/chosen": 0.17955251038074493, + "logits/rejected": 0.35742440819740295, + "logps/chosen": -364.36993408203125, + "logps/rejected": -419.10186767578125, + "loss": 0.5323, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.065547227859497, + "rewards/margins": 0.5961118936538696, + "rewards/rejected": -2.6616594791412354, "step": 1140 }, { - "debug/losses": 0.009401632472872734, - "debug/policy_weights": 0.01385374553501606, - "debug/raw_losses": 0.6444624662399292, "epoch": 0.9152407481098289, - "grad_norm": 0.32759927360986657, + "grad_norm": 18.17543468321864, "learning_rate": 1.0777529692427679e-08, - "logits/chosen": -2.354888439178467, - "logits/rejected": -2.332421064376831, - "logps/chosen": -280.6602478027344, - "logps/rejected": -293.0055236816406, - "loss": 0.0076, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2988250255584717, - "rewards/margins": 0.23193073272705078, - "rewards/rejected": -1.5307557582855225, + "logits/chosen": 0.5689483880996704, + "logits/rejected": 0.9127931594848633, + "logps/chosen": -358.0685119628906, + "logps/rejected": -420.18426513671875, + "loss": 0.5372, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.072906970977783, + "rewards/margins": 0.7296361327171326, + "rewards/rejected": -2.8025431632995605, "step": 1150 }, { - "debug/losses": 0.006143433507531881, - "debug/policy_weights": 0.009376000612974167, - "debug/raw_losses": 0.655838131904602, "epoch": 0.9231993633107839, - "grad_norm": 0.31098315627560696, + "grad_norm": 18.913441395008096, "learning_rate": 8.851477564560061e-09, - "logits/chosen": -2.3542723655700684, - "logits/rejected": -2.3350157737731934, - "logps/chosen": -278.3203125, - "logps/rejected": -316.9741516113281, - "loss": 0.0074, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3614548444747925, - "rewards/margins": 0.2771442234516144, - "rewards/rejected": -1.6385990381240845, + "logits/chosen": 0.6391158103942871, + "logits/rejected": 0.8792598843574524, + "logps/chosen": -345.1988830566406, + "logps/rejected": -423.65643310546875, + "loss": 0.5553, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.030240535736084, + "rewards/margins": 0.6751813888549805, + "rewards/rejected": -2.7054216861724854, "step": 1160 }, { - "debug/losses": 0.00859107542783022, - "debug/policy_weights": 0.01376525778323412, - "debug/raw_losses": 0.6619387269020081, "epoch": 0.931157978511739, - "grad_norm": 0.2442134251564226, + "grad_norm": 17.617100933080067, "learning_rate": 7.111805515081531e-09, - "logits/chosen": -2.398651123046875, - "logits/rejected": -2.3627355098724365, - "logps/chosen": -293.2715759277344, - "logps/rejected": -312.73443603515625, - "loss": 0.0086, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3878949880599976, - "rewards/margins": 0.22798268496990204, - "rewards/rejected": -1.6158778667449951, + "logits/chosen": 0.2744918763637543, + "logits/rejected": 0.6285992860794067, + "logps/chosen": -381.2764587402344, + "logps/rejected": -450.02142333984375, + "loss": 0.5253, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.267944097518921, + "rewards/margins": 0.7208031415939331, + "rewards/rejected": -2.9887471199035645, "step": 1170 }, { - "debug/losses": 0.007142928894609213, - "debug/policy_weights": 0.012093516066670418, - "debug/raw_losses": 0.605353057384491, "epoch": 0.939116593712694, - "grad_norm": 0.22327237960882176, + "grad_norm": 15.79506235955512, "learning_rate": 5.559858110443016e-09, - "logits/chosen": -2.3911027908325195, - "logits/rejected": -2.3682327270507812, - "logps/chosen": -287.30517578125, - "logps/rejected": -321.6160583496094, - "loss": 0.0074, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3139712810516357, - "rewards/margins": 0.322050005197525, - "rewards/rejected": -1.636021375656128, + "logits/chosen": 0.21863842010498047, + "logits/rejected": 0.5074991583824158, + "logps/chosen": -360.7405700683594, + "logps/rejected": -429.59130859375, + "loss": 0.5093, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.048325777053833, + "rewards/margins": 0.6674487590789795, + "rewards/rejected": -2.7157740592956543, "step": 1180 }, { - "debug/losses": 0.004909296054393053, - "debug/policy_weights": 0.008050905540585518, - "debug/raw_losses": 0.6141546964645386, "epoch": 0.947075208913649, - "grad_norm": 0.40148504448495864, + "grad_norm": 15.907477201452677, "learning_rate": 4.196834827531276e-09, - "logits/chosen": -2.368664026260376, - "logits/rejected": -2.3524036407470703, - "logps/chosen": -287.7311096191406, - "logps/rejected": -321.55450439453125, - "loss": 0.0064, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.4463093280792236, - "rewards/margins": 0.3423252999782562, - "rewards/rejected": -1.7886346578598022, + "logits/chosen": 0.36164388060569763, + "logits/rejected": 0.6227206587791443, + "logps/chosen": -360.2243347167969, + "logps/rejected": -448.7967834472656, + "loss": 0.5248, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.1712417602539062, + "rewards/margins": 0.8898156881332397, + "rewards/rejected": -3.0610575675964355, "step": 1190 }, { - "debug/losses": 0.006442622747272253, - "debug/policy_weights": 0.010372470133006573, - "debug/raw_losses": 0.659237265586853, "epoch": 0.955033824114604, - "grad_norm": 0.4773837819826264, + "grad_norm": 16.740269639186682, "learning_rate": 3.023789126611137e-09, - "logits/chosen": -2.3759243488311768, - "logits/rejected": -2.3451437950134277, - "logps/chosen": -288.5528869628906, - "logps/rejected": -302.302734375, - "loss": 0.0064, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.4122283458709717, - "rewards/margins": 0.20960351824760437, - "rewards/rejected": -1.6218318939208984, + "logits/chosen": 0.4015735685825348, + "logits/rejected": 0.7451781034469604, + "logps/chosen": -357.9459533691406, + "logps/rejected": -429.60107421875, + "loss": 0.5499, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.10615873336792, + "rewards/margins": 0.7886561751365662, + "rewards/rejected": -2.894814968109131, "step": 1200 }, { "epoch": 0.955033824114604, - "eval_debug/losses": 0.005183727014809847, - "eval_debug/policy_weights": 0.008306536823511124, - "eval_debug/raw_losses": 0.6403022408485413, - "eval_logits/chosen": -2.3784806728363037, - "eval_logits/rejected": -2.366612195968628, - "eval_logps/chosen": -284.97100830078125, - "eval_logps/rejected": -316.9624938964844, - "eval_loss": 0.007046498823910952, - "eval_rewards/accuracies": 0.6100746393203735, - "eval_rewards/chosen": -1.4072749614715576, - "eval_rewards/margins": 0.2553601562976837, - "eval_rewards/rejected": -1.662635087966919, - "eval_runtime": 153.9665, - "eval_samples_per_second": 55.545, - "eval_steps_per_second": 0.87, + "eval_logits/chosen": 0.35614868998527527, + "eval_logits/rejected": 0.5902336239814758, + "eval_logps/chosen": -359.8675231933594, + "eval_logps/rejected": -438.7700500488281, + "eval_loss": 0.5416554808616638, + "eval_rewards/accuracies": 0.7313432693481445, + "eval_rewards/chosen": -2.1562399864196777, + "eval_rewards/margins": 0.7244706153869629, + "eval_rewards/rejected": -2.8807106018066406, + "eval_runtime": 153.0914, + "eval_samples_per_second": 55.862, + "eval_steps_per_second": 0.875, "step": 1200 }, { - "debug/losses": 0.005012947134673595, - "debug/policy_weights": 0.00824889075011015, - "debug/raw_losses": 0.633194625377655, "epoch": 0.9629924393155591, - "grad_norm": 0.3853967228824428, + "grad_norm": 17.694729007133922, "learning_rate": 2.041627637121929e-09, - "logits/chosen": -2.363786220550537, - "logits/rejected": -2.3533267974853516, - "logps/chosen": -281.7425842285156, - "logps/rejected": -324.53460693359375, - "loss": 0.0067, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.402888298034668, - "rewards/margins": 0.2553493082523346, - "rewards/rejected": -1.6582376956939697, + "logits/chosen": 0.3815138638019562, + "logits/rejected": 0.6700073480606079, + "logps/chosen": -365.1811218261719, + "logps/rejected": -457.5047302246094, + "loss": 0.5543, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.237273931503296, + "rewards/margins": 0.7506656050682068, + "rewards/rejected": -2.9879393577575684, "step": 1210 }, { - "debug/losses": 0.006254622247070074, - "debug/policy_weights": 0.009424760937690735, - "debug/raw_losses": 0.6736980676651001, "epoch": 0.9709510545165141, - "grad_norm": 0.4465471268013703, + "grad_norm": 18.90643800869425, "learning_rate": 1.2511094569571668e-09, - "logits/chosen": -2.328406810760498, - "logits/rejected": -2.2939095497131348, - "logps/chosen": -287.53472900390625, - "logps/rejected": -279.87188720703125, - "loss": 0.0073, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3911142349243164, - "rewards/margins": 0.1553507000207901, - "rewards/rejected": -1.5464651584625244, + "logits/chosen": 0.5528720021247864, + "logits/rejected": 0.9850804209709167, + "logps/chosen": -364.58245849609375, + "logps/rejected": -405.16729736328125, + "loss": 0.5454, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1615917682647705, + "rewards/margins": 0.6378272771835327, + "rewards/rejected": -2.7994191646575928, "step": 1220 }, { - "debug/losses": 0.004875156097114086, - "debug/policy_weights": 0.007172915153205395, - "debug/raw_losses": 0.6340065002441406, "epoch": 0.9789096697174692, - "grad_norm": 0.24064112264421297, + "grad_norm": 15.395362721693543, "learning_rate": 6.528455657691112e-10, - "logits/chosen": -2.347778797149658, - "logits/rejected": -2.35634183883667, - "logps/chosen": -281.10821533203125, - "logps/rejected": -318.0760498046875, - "loss": 0.0061, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.4237060546875, - "rewards/margins": 0.2571585774421692, - "rewards/rejected": -1.680864691734314, + "logits/chosen": 0.5471321940422058, + "logits/rejected": 0.6208599209785461, + "logps/chosen": -363.4918518066406, + "logps/rejected": -439.646240234375, + "loss": 0.5121, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2475426197052, + "rewards/margins": 0.6490240097045898, + "rewards/rejected": -2.896566867828369, "step": 1230 }, { - "debug/losses": 0.0060342904180288315, - "debug/policy_weights": 0.008632312528789043, - "debug/raw_losses": 0.655373215675354, "epoch": 0.9868682849184242, - "grad_norm": 0.20752921795869225, + "grad_norm": 18.690557653353416, "learning_rate": 2.4729835275189016e-10, - "logits/chosen": -2.363779306411743, - "logits/rejected": -2.346327304840088, - "logps/chosen": -285.58331298828125, - "logps/rejected": -317.2745361328125, - "loss": 0.006, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.4487727880477905, - "rewards/margins": 0.25334611535072327, - "rewards/rejected": -1.7021188735961914, + "logits/chosen": 0.4783777594566345, + "logits/rejected": 0.6712285876274109, + "logps/chosen": -352.80804443359375, + "logps/rejected": -443.9805603027344, + "loss": 0.5489, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.1210198402404785, + "rewards/margins": 0.8481594324111938, + "rewards/rejected": -2.969179630279541, "step": 1240 }, { - "debug/losses": 0.005698877386748791, - "debug/policy_weights": 0.00828011054545641, - "debug/raw_losses": 0.6336467266082764, "epoch": 0.9948269001193792, - "grad_norm": 0.411540994445993, + "grad_norm": 19.13672093750131, "learning_rate": 3.478125926756337e-11, - "logits/chosen": -2.3535804748535156, - "logits/rejected": -2.3520710468292236, - "logps/chosen": -285.2567138671875, - "logps/rejected": -321.4198303222656, - "loss": 0.0057, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.435394048690796, - "rewards/margins": 0.2452627718448639, - "rewards/rejected": -1.6806570291519165, + "logits/chosen": 0.5186442732810974, + "logits/rejected": 0.6572960615158081, + "logps/chosen": -358.7153625488281, + "logps/rejected": -452.71893310546875, + "loss": 0.5414, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.16998028755188, + "rewards/margins": 0.8236673474311829, + "rewards/rejected": -2.993648052215576, "step": 1250 }, { "epoch": 0.9996020692399522, "step": 1256, "total_flos": 0.0, - "train_loss": 0.016716306088907514, - "train_runtime": 10019.6903, - "train_samples_per_second": 16.048, + "train_loss": 0.56636344817034, + "train_runtime": 10031.2749, + "train_samples_per_second": 16.03, "train_steps_per_second": 0.125 } ],