diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -9,17 +9,13 @@ "is_world_process_zero": true, "log_history": [ { - "debug/losses": 0.23031963407993317, - "debug/policy_weights": 0.3322809934616089, - "debug/raw_losses": 0.6931471824645996, - "epoch": 0.0007958615200955034, - "grad_norm": 1.6287391185739195, + "epoch": 0.0, "learning_rate": 3.968253968253968e-09, - "logits/chosen": -2.735659122467041, - "logits/rejected": -2.7581238746643066, - "logps/chosen": -124.62968444824219, - "logps/rejected": -168.09475708007812, - "loss": 0.2239, + "logits/chosen": -2.7193620204925537, + "logits/rejected": -2.698728084564209, + "logps/chosen": -182.0961456298828, + "logps/rejected": -172.47128295898438, + "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -27,2512 +23,1962 @@ "step": 1 }, { - "debug/losses": 0.23633037507534027, - "debug/policy_weights": 0.34083470702171326, - "debug/raw_losses": 0.693356990814209, - "epoch": 0.007958615200955034, - "grad_norm": 1.6857448656689296, + "epoch": 0.01, "learning_rate": 3.968253968253968e-08, - "logits/chosen": -2.738861560821533, - "logits/rejected": -2.7278800010681152, - "logps/chosen": -146.718994140625, - "logps/rejected": -131.18580627441406, - "loss": 0.2295, - "rewards/accuracies": 0.4027777910232544, - "rewards/chosen": -0.00016815567505545914, - "rewards/margins": -0.00041737209539860487, - "rewards/rejected": 0.0002492164494469762, + "logits/chosen": -2.7041964530944824, + "logits/rejected": -2.6794540882110596, + "logps/chosen": -162.45831298828125, + "logps/rejected": -140.5693359375, + "loss": 0.6931, + "rewards/accuracies": 0.5486111044883728, + "rewards/chosen": 0.00032037965138442814, + "rewards/margins": 0.0004935775068588555, + "rewards/rejected": -0.00017319784092251211, "step": 10 }, { - "debug/losses": 0.22639703750610352, - "debug/policy_weights": 0.3266511857509613, - "debug/raw_losses": 0.6931136250495911, - "epoch": 0.01591723040191007, - "grad_norm": 1.5624222510478807, + "epoch": 0.02, "learning_rate": 7.936507936507936e-08, - "logits/chosen": -2.7067270278930664, - "logits/rejected": -2.703731060028076, - "logps/chosen": -129.4856414794922, - "logps/rejected": -130.27786254882812, - "loss": 0.2238, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -0.0001734123652568087, - "rewards/margins": 6.990063411649317e-05, - "rewards/rejected": -0.00024331299937330186, + "logits/chosen": -2.7177577018737793, + "logits/rejected": -2.7136425971984863, + "logps/chosen": -134.47242736816406, + "logps/rejected": -143.55604553222656, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 8.780837379163131e-05, + "rewards/margins": 0.00010721785656642169, + "rewards/rejected": -1.940951551659964e-05, "step": 20 }, { - "debug/losses": 0.21324554085731506, - "debug/policy_weights": 0.3076760470867157, - "debug/raw_losses": 0.6930493116378784, - "epoch": 0.0238758456028651, - "grad_norm": 1.5412879090281855, + "epoch": 0.02, "learning_rate": 1.1904761904761903e-07, - "logits/chosen": -2.6839892864227295, - "logits/rejected": -2.6810474395751953, - "logps/chosen": -141.8278045654297, - "logps/rejected": -155.67654418945312, - "loss": 0.2264, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": 3.942887997254729e-05, - "rewards/margins": 0.0001982362737180665, - "rewards/rejected": -0.00015880735008977354, + "logits/chosen": -2.6898293495178223, + "logits/rejected": -2.676154613494873, + "logps/chosen": -140.94692993164062, + "logps/rejected": -136.50369262695312, + "loss": 0.6931, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0005466601578518748, + "rewards/margins": -0.00021456097601912916, + "rewards/rejected": 0.0007612211629748344, "step": 30 }, { - "debug/losses": 0.2176821529865265, - "debug/policy_weights": 0.3141413629055023, - "debug/raw_losses": 0.6929017305374146, - "epoch": 0.03183446080382014, - "grad_norm": 1.6385196184238553, + "epoch": 0.03, "learning_rate": 1.5873015873015872e-07, - "logits/chosen": -2.69197678565979, - "logits/rejected": -2.6842830181121826, - "logps/chosen": -154.9615936279297, - "logps/rejected": -164.14413452148438, - "loss": 0.221, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.0013954730238765478, - "rewards/margins": 0.0004961603553965688, - "rewards/rejected": -0.0018916334956884384, + "logits/chosen": -2.6958394050598145, + "logits/rejected": -2.686532974243164, + "logps/chosen": -134.98963928222656, + "logps/rejected": -144.46652221679688, + "loss": 0.6928, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.0015748919686302543, + "rewards/margins": 0.0009769219905138016, + "rewards/rejected": 0.0005979698617011309, "step": 40 }, { - "debug/losses": 0.229542538523674, - "debug/policy_weights": 0.3317711055278778, - "debug/raw_losses": 0.6918389797210693, - "epoch": 0.03979307600477517, - "grad_norm": 1.48113487801622, + "epoch": 0.04, "learning_rate": 1.984126984126984e-07, - "logits/chosen": -2.7066245079040527, - "logits/rejected": -2.6878674030303955, - "logps/chosen": -143.980224609375, - "logps/rejected": -137.73158264160156, - "loss": 0.2233, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.0025663406122475863, - "rewards/margins": 0.0026327171362936497, - "rewards/rejected": -0.005199057050049305, + "logits/chosen": -2.7042899131774902, + "logits/rejected": -2.6861345767974854, + "logps/chosen": -149.71768188476562, + "logps/rejected": -145.0757293701172, + "loss": 0.6921, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.005199921317398548, + "rewards/margins": 0.0022330707870423794, + "rewards/rejected": 0.0029668500646948814, "step": 50 }, { - "debug/losses": 0.22738368809223175, - "debug/policy_weights": 0.32868558168411255, - "debug/raw_losses": 0.6915279626846313, - "epoch": 0.0477516912057302, - "grad_norm": 1.487501706200083, + "epoch": 0.05, "learning_rate": 2.3809523809523806e-07, - "logits/chosen": -2.7156598567962646, - "logits/rejected": -2.716393232345581, - "logps/chosen": -145.95175170898438, - "logps/rejected": -159.5319366455078, - "loss": 0.2191, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.0032509162556380033, - "rewards/margins": 0.00328192301094532, - "rewards/rejected": -0.00653283903375268, + "logits/chosen": -2.705153703689575, + "logits/rejected": -2.685439348220825, + "logps/chosen": -154.3783416748047, + "logps/rejected": -151.54519653320312, + "loss": 0.6912, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.00569504126906395, + "rewards/margins": 0.0022000311873853207, + "rewards/rejected": 0.003495010081678629, "step": 60 }, { - "debug/losses": 0.21748514473438263, - "debug/policy_weights": 0.31510016322135925, - "debug/raw_losses": 0.6903446316719055, - "epoch": 0.055710306406685235, - "grad_norm": 1.5343429096282828, + "epoch": 0.06, "learning_rate": 2.7777777777777776e-07, - "logits/chosen": -2.736347198486328, - "logits/rejected": -2.7274653911590576, - "logps/chosen": -149.3427276611328, - "logps/rejected": -143.45547485351562, - "loss": 0.2112, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.01704435609281063, - "rewards/margins": 0.005850088782608509, - "rewards/rejected": -0.022894445806741714, + "logits/chosen": -2.7017154693603516, + "logits/rejected": -2.6924962997436523, + "logps/chosen": -146.3284149169922, + "logps/rejected": -138.79405212402344, + "loss": 0.6885, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.010588793084025383, + "rewards/margins": 0.010192448273301125, + "rewards/rejected": 0.00039634370477870107, "step": 70 }, { - "debug/losses": 0.1880975216627121, - "debug/policy_weights": 0.2722209393978119, - "debug/raw_losses": 0.6913992166519165, - "epoch": 0.06366892160764027, - "grad_norm": 1.533336025559628, + "epoch": 0.06, "learning_rate": 3.1746031746031743e-07, - "logits/chosen": -2.7107605934143066, - "logits/rejected": -2.6922690868377686, - "logps/chosen": -158.02011108398438, - "logps/rejected": -149.45602416992188, - "loss": 0.201, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -0.04885869473218918, - "rewards/margins": 0.004143272526562214, - "rewards/rejected": -0.05300196260213852, + "logits/chosen": -2.7155232429504395, + "logits/rejected": -2.696071147918701, + "logps/chosen": -141.80067443847656, + "logps/rejected": -147.0068817138672, + "loss": 0.6867, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.0049073463305830956, + "rewards/margins": 0.013599385507404804, + "rewards/rejected": -0.008692039176821709, "step": 80 }, { - "debug/losses": 0.1863638460636139, - "debug/policy_weights": 0.2727832794189453, - "debug/raw_losses": 0.6830354928970337, - "epoch": 0.07162753680859531, - "grad_norm": 1.6114760928044485, + "epoch": 0.07, "learning_rate": 3.5714285714285716e-07, - "logits/chosen": -2.7209115028381348, - "logits/rejected": -2.7276768684387207, - "logps/chosen": -152.76400756835938, - "logps/rejected": -173.5898895263672, - "loss": 0.1848, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.06917759776115417, - "rewards/margins": 0.02194269374012947, - "rewards/rejected": -0.09112029522657394, + "logits/chosen": -2.7175304889678955, + "logits/rejected": -2.7080624103546143, + "logps/chosen": -153.12509155273438, + "logps/rejected": -146.53590393066406, + "loss": 0.6847, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.028871387243270874, + "rewards/margins": 0.017175236716866493, + "rewards/rejected": -0.046046625822782516, "step": 90 }, { - "debug/losses": 0.16769352555274963, - "debug/policy_weights": 0.24745841324329376, - "debug/raw_losses": 0.6783273816108704, - "epoch": 0.07958615200955034, - "grad_norm": 1.5208024805782128, + "epoch": 0.08, "learning_rate": 3.968253968253968e-07, - "logits/chosen": -2.689220666885376, - "logits/rejected": -2.672532081604004, - "logps/chosen": -149.3437042236328, - "logps/rejected": -143.38467407226562, - "loss": 0.1732, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.11531722545623779, - "rewards/margins": 0.033182911574840546, - "rewards/rejected": -0.14850012958049774, + "logits/chosen": -2.7524733543395996, + "logits/rejected": -2.7452526092529297, + "logps/chosen": -163.88070678710938, + "logps/rejected": -163.61032104492188, + "loss": 0.6789, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0662173330783844, + "rewards/margins": 0.02977912127971649, + "rewards/rejected": -0.09599645435810089, "step": 100 }, { - "epoch": 0.07958615200955034, - "eval_debug/losses": 0.16172632575035095, - "eval_debug/policy_weights": 0.2382642775774002, - "eval_debug/raw_losses": 0.6786961555480957, - "eval_logits/chosen": -2.7119109630584717, - "eval_logits/rejected": -2.703813076019287, - "eval_logps/chosen": -158.47622680664062, - "eval_logps/rejected": -168.33502197265625, - "eval_loss": 0.16316837072372437, - "eval_rewards/accuracies": 0.5960820913314819, - "eval_rewards/chosen": -0.14232736825942993, - "eval_rewards/margins": 0.03403294086456299, - "eval_rewards/rejected": -0.17636029422283173, - "eval_runtime": 153.0553, - "eval_samples_per_second": 55.875, - "eval_steps_per_second": 0.876, + "epoch": 0.08, + "eval_logits/chosen": -2.7336502075195312, + "eval_logits/rejected": -2.7255024909973145, + "eval_logps/chosen": -155.19271850585938, + "eval_logps/rejected": -165.35523986816406, + "eval_loss": 0.6769910454750061, + "eval_rewards/accuracies": 0.5914179086685181, + "eval_rewards/chosen": -0.10619194805622101, + "eval_rewards/margins": 0.03601696714758873, + "eval_rewards/rejected": -0.14220890402793884, + "eval_runtime": 184.251, + "eval_samples_per_second": 46.415, + "eval_steps_per_second": 0.727, "step": 100 }, { - "debug/losses": 0.1508895754814148, - "debug/policy_weights": 0.2189808338880539, - "debug/raw_losses": 0.6909345984458923, - "epoch": 0.08754476721050537, - "grad_norm": 1.5035580854434272, + "epoch": 0.09, "learning_rate": 4.365079365079365e-07, - "logits/chosen": -2.6903061866760254, - "logits/rejected": -2.6716604232788086, - "logps/chosen": -179.5548553466797, - "logps/rejected": -165.21966552734375, - "loss": 0.1472, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.2126411497592926, - "rewards/margins": 0.011448127217590809, - "rewards/rejected": -0.22408926486968994, + "logits/chosen": -2.738532543182373, + "logits/rejected": -2.7273170948028564, + "logps/chosen": -164.2928009033203, + "logps/rejected": -160.19398498535156, + "loss": 0.6738, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.16211798787117004, + "rewards/margins": 0.03163355216383934, + "rewards/rejected": -0.19375154376029968, "step": 110 }, { - "debug/losses": 0.137036994099617, - "debug/policy_weights": 0.2047518789768219, - "debug/raw_losses": 0.678392231464386, - "epoch": 0.0955033824114604, - "grad_norm": 1.3984947799152403, + "epoch": 0.1, "learning_rate": 4.761904761904761e-07, - "logits/chosen": -2.663886785507202, - "logits/rejected": -2.651045560836792, - "logps/chosen": -168.24790954589844, - "logps/rejected": -173.3909149169922, - "loss": 0.1331, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.29116135835647583, - "rewards/margins": 0.0431181900203228, - "rewards/rejected": -0.33427953720092773, + "logits/chosen": -2.7289297580718994, + "logits/rejected": -2.705962657928467, + "logps/chosen": -196.69662475585938, + "logps/rejected": -197.2833251953125, + "loss": 0.661, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.2917623221874237, + "rewards/margins": 0.08966299891471863, + "rewards/rejected": -0.38142532110214233, "step": 120 }, { - "debug/losses": 0.11775548756122589, - "debug/policy_weights": 0.1796623170375824, - "debug/raw_losses": 0.6566502451896667, - "epoch": 0.10346199761241544, - "grad_norm": 2.374984413363097, + "epoch": 0.1, "learning_rate": 4.999845414634076e-07, - "logits/chosen": -2.6745553016662598, - "logits/rejected": -2.6470043659210205, - "logps/chosen": -188.0329132080078, - "logps/rejected": -178.14801025390625, - "loss": 0.1191, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.33123353123664856, - "rewards/margins": 0.09330518543720245, - "rewards/rejected": -0.4245387017726898, + "logits/chosen": -2.658005475997925, + "logits/rejected": -2.6317684650421143, + "logps/chosen": -187.4532928466797, + "logps/rejected": -188.37689208984375, + "loss": 0.6542, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3956056833267212, + "rewards/margins": 0.12092368304729462, + "rewards/rejected": -0.5165294408798218, "step": 130 }, { - "debug/losses": 0.09425052255392075, - "debug/policy_weights": 0.1469433605670929, - "debug/raw_losses": 0.6522111892700195, - "epoch": 0.11142061281337047, - "grad_norm": 1.7466151736267947, + "epoch": 0.11, "learning_rate": 4.998106548810311e-07, - "logits/chosen": -2.627020835876465, - "logits/rejected": -2.603726625442505, - "logps/chosen": -197.0404510498047, - "logps/rejected": -187.454833984375, - "loss": 0.106, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.514377772808075, - "rewards/margins": 0.10739537328481674, - "rewards/rejected": -0.6217731237411499, + "logits/chosen": -2.6906683444976807, + "logits/rejected": -2.6913747787475586, + "logps/chosen": -199.67568969726562, + "logps/rejected": -253.02487182617188, + "loss": 0.6171, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.4227059781551361, + "rewards/margins": 0.27536457777023315, + "rewards/rejected": -0.6980706453323364, "step": 140 }, { - "debug/losses": 0.08491900563240051, - "debug/policy_weights": 0.1424042284488678, - "debug/raw_losses": 0.6084787845611572, - "epoch": 0.1193792280143255, - "grad_norm": 2.8243407531109206, + "epoch": 0.12, "learning_rate": 4.994436933879359e-07, - "logits/chosen": -2.5916829109191895, - "logits/rejected": -2.5848000049591064, - "logps/chosen": -188.14651489257812, - "logps/rejected": -216.0280303955078, - "loss": 0.0994, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.4860725998878479, - "rewards/margins": 0.2143338918685913, - "rewards/rejected": -0.7004064321517944, + "logits/chosen": -2.6662166118621826, + "logits/rejected": -2.644784927368164, + "logps/chosen": -197.07180786132812, + "logps/rejected": -198.4012908935547, + "loss": 0.6395, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.3597154915332794, + "rewards/margins": 0.13716872036457062, + "rewards/rejected": -0.49688419699668884, "step": 150 }, { - "debug/losses": 0.08054587990045547, - "debug/policy_weights": 0.13052348792552948, - "debug/raw_losses": 0.6103022694587708, - "epoch": 0.12733784321528055, - "grad_norm": 2.235321236398355, + "epoch": 0.13, "learning_rate": 4.988839406031596e-07, - "logits/chosen": -2.584826946258545, - "logits/rejected": -2.5921337604522705, - "logps/chosen": -179.30166625976562, - "logps/rejected": -230.94314575195312, - "loss": 0.0879, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.5866313576698303, - "rewards/margins": 0.24938829243183136, - "rewards/rejected": -0.8360196352005005, + "logits/chosen": -2.647681474685669, + "logits/rejected": -2.6395888328552246, + "logps/chosen": -182.04420471191406, + "logps/rejected": -206.59780883789062, + "loss": 0.629, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3573477864265442, + "rewards/margins": 0.2222837507724762, + "rewards/rejected": -0.579631507396698, "step": 160 }, { - "debug/losses": 0.053926557302474976, - "debug/policy_weights": 0.08985424786806107, - "debug/raw_losses": 0.6291411519050598, - "epoch": 0.13529645841623558, - "grad_norm": 1.7353626869477405, + "epoch": 0.14, "learning_rate": 4.981318291512395e-07, - "logits/chosen": -2.5308010578155518, - "logits/rejected": -2.5215961933135986, - "logps/chosen": -233.59939575195312, - "logps/rejected": -265.3108215332031, - "loss": 0.065, + "logits/chosen": -2.619232654571533, + "logits/rejected": -2.598362684249878, + "logps/chosen": -227.0933380126953, + "logps/rejected": -230.9747772216797, + "loss": 0.6242, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.971887469291687, - "rewards/margins": 0.23178091645240784, - "rewards/rejected": -1.203668475151062, + "rewards/chosen": -0.7434185743331909, + "rewards/margins": 0.21749505400657654, + "rewards/rejected": -0.9609137773513794, "step": 170 }, { - "debug/losses": 0.03688238933682442, - "debug/policy_weights": 0.05372762680053711, - "debug/raw_losses": 0.6907342672348022, - "epoch": 0.14325507361719061, - "grad_norm": 1.7970759832571033, + "epoch": 0.14, "learning_rate": 4.971879403278432e-07, - "logits/chosen": -2.511756181716919, - "logits/rejected": -2.4959020614624023, - "logps/chosen": -288.7469177246094, - "logps/rejected": -290.37908935546875, - "loss": 0.0354, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.4022036790847778, - "rewards/margins": 0.062146056443452835, - "rewards/rejected": -1.464349627494812, + "logits/chosen": -2.5654754638671875, + "logits/rejected": -2.5364232063293457, + "logps/chosen": -241.6617431640625, + "logps/rejected": -245.66268920898438, + "loss": 0.6151, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7342535257339478, + "rewards/margins": 0.23685339093208313, + "rewards/rejected": -0.9711068868637085, "step": 180 }, { - "debug/losses": 0.05163710191845894, - "debug/policy_weights": 0.08004944026470184, - "debug/raw_losses": 0.6670282483100891, - "epoch": 0.15121368881814565, - "grad_norm": 1.6751904674481997, + "epoch": 0.15, "learning_rate": 4.960530036504941e-07, - "logits/chosen": -2.5430991649627686, - "logits/rejected": -2.5239815711975098, - "logps/chosen": -263.212158203125, - "logps/rejected": -268.65362548828125, - "loss": 0.0498, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.188627004623413, - "rewards/margins": 0.11968141794204712, - "rewards/rejected": -1.3083083629608154, + "logits/chosen": -2.5271048545837402, + "logits/rejected": -2.486818790435791, + "logps/chosen": -235.6089630126953, + "logps/rejected": -251.17758178710938, + "loss": 0.6215, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.807177722454071, + "rewards/margins": 0.28561535477638245, + "rewards/rejected": -1.0927931070327759, "step": 190 }, { - "debug/losses": 0.07679580897092819, - "debug/policy_weights": 0.12060348689556122, - "debug/raw_losses": 0.6367899775505066, - "epoch": 0.15917230401910068, - "grad_norm": 1.8015846859226716, + "epoch": 0.16, "learning_rate": 4.947278962947386e-07, - "logits/chosen": -2.5448789596557617, - "logits/rejected": -2.5480525493621826, - "logps/chosen": -223.87393188476562, - "logps/rejected": -254.6379852294922, - "loss": 0.077, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.7472453713417053, - "rewards/margins": 0.16983875632286072, - "rewards/rejected": -0.9170840382575989, + "logits/chosen": -2.4217896461486816, + "logits/rejected": -2.413295269012451, + "logps/chosen": -251.0736083984375, + "logps/rejected": -268.6098937988281, + "loss": 0.6062, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.086307406425476, + "rewards/margins": 0.24874301254749298, + "rewards/rejected": -1.3350504636764526, "step": 200 }, { - "epoch": 0.15917230401910068, - "eval_debug/losses": 0.0807570293545723, - "eval_debug/policy_weights": 0.12652097642421722, - "eval_debug/raw_losses": 0.6429691910743713, - "eval_logits/chosen": -2.545888662338257, - "eval_logits/rejected": -2.5351974964141846, - "eval_logps/chosen": -218.20616149902344, - "eval_logps/rejected": -243.5860595703125, - "eval_loss": 0.08196824789047241, - "eval_rewards/accuracies": 0.6296641826629639, - "eval_rewards/chosen": -0.7396268248558044, - "eval_rewards/margins": 0.18924373388290405, - "eval_rewards/rejected": -0.9288705587387085, - "eval_runtime": 152.9399, - "eval_samples_per_second": 55.917, - "eval_steps_per_second": 0.876, + "epoch": 0.16, + "eval_logits/chosen": -2.3855514526367188, + "eval_logits/rejected": -2.369593858718872, + "eval_logps/chosen": -246.6970672607422, + "eval_logps/rejected": -289.8621826171875, + "eval_loss": 0.6079375743865967, + "eval_rewards/accuracies": 0.66697758436203, + "eval_rewards/chosen": -1.021235704421997, + "eval_rewards/margins": 0.3660426437854767, + "eval_rewards/rejected": -1.3872781991958618, + "eval_runtime": 184.1922, + "eval_samples_per_second": 46.43, + "eval_steps_per_second": 0.728, "step": 200 }, { - "debug/losses": 0.06875108927488327, - "debug/policy_weights": 0.11560399830341339, - "debug/raw_losses": 0.6261113882064819, - "epoch": 0.1671309192200557, - "grad_norm": 1.923749397238511, + "epoch": 0.17, "learning_rate": 4.932136424161899e-07, - "logits/chosen": -2.508167028427124, - "logits/rejected": -2.493638038635254, - "logps/chosen": -204.44461059570312, - "logps/rejected": -234.84375, - "loss": 0.0769, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.7885208129882812, - "rewards/margins": 0.24285559356212616, - "rewards/rejected": -1.0313764810562134, + "logits/chosen": -2.3366785049438477, + "logits/rejected": -2.3228511810302734, + "logps/chosen": -266.292236328125, + "logps/rejected": -300.22894287109375, + "loss": 0.5893, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2130026817321777, + "rewards/margins": 0.3487839996814728, + "rewards/rejected": -1.5617868900299072, "step": 210 }, { - "debug/losses": 0.07057208567857742, - "debug/policy_weights": 0.1248287707567215, - "debug/raw_losses": 0.5716283321380615, - "epoch": 0.17508953442101075, - "grad_norm": 2.155645087454887, + "epoch": 0.18, "learning_rate": 4.915114123589732e-07, - "logits/chosen": -2.5536115169525146, - "logits/rejected": -2.532297134399414, - "logps/chosen": -200.5375518798828, - "logps/rejected": -235.3711700439453, - "loss": 0.0807, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.636379063129425, - "rewards/margins": 0.36098265647888184, - "rewards/rejected": -0.9973617792129517, + "logits/chosen": -2.321228504180908, + "logits/rejected": -2.3033699989318848, + "logps/chosen": -336.34161376953125, + "logps/rejected": -373.39935302734375, + "loss": 0.612, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.9529145956039429, + "rewards/margins": 0.2863468527793884, + "rewards/rejected": -2.2392613887786865, "step": 220 }, { - "debug/losses": 0.0636017918586731, - "debug/policy_weights": 0.10361097007989883, - "debug/raw_losses": 0.6123852133750916, - "epoch": 0.18304814962196578, - "grad_norm": 2.8331926004473647, + "epoch": 0.18, "learning_rate": 4.896225217511849e-07, - "logits/chosen": -2.5085248947143555, - "logits/rejected": -2.5012524127960205, - "logps/chosen": -241.10537719726562, - "logps/rejected": -279.1767883300781, - "loss": 0.0691, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.0180869102478027, - "rewards/margins": 0.2863315939903259, - "rewards/rejected": -1.3044183254241943, + "logits/chosen": -2.4310107231140137, + "logits/rejected": -2.422048568725586, + "logps/chosen": -291.1025695800781, + "logps/rejected": -328.18963623046875, + "loss": 0.6079, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4314143657684326, + "rewards/margins": 0.3364001214504242, + "rewards/rejected": -1.7678143978118896, "step": 230 }, { - "debug/losses": 0.049545757472515106, - "debug/policy_weights": 0.08214031159877777, - "debug/raw_losses": 0.6215575933456421, - "epoch": 0.1910067648229208, - "grad_norm": 1.5874644335916546, + "epoch": 0.19, "learning_rate": 4.875484304880629e-07, - "logits/chosen": -2.4870269298553467, - "logits/rejected": -2.46873140335083, - "logps/chosen": -294.48333740234375, - "logps/rejected": -322.12176513671875, - "loss": 0.0498, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3272528648376465, - "rewards/margins": 0.3347854018211365, - "rewards/rejected": -1.6620384454727173, + "logits/chosen": -2.3412394523620605, + "logits/rejected": -2.309183120727539, + "logps/chosen": -280.8785705566406, + "logps/rejected": -308.54132080078125, + "loss": 0.613, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.309309482574463, + "rewards/margins": 0.3731766939163208, + "rewards/rejected": -1.6824861764907837, "step": 240 }, { - "debug/losses": 0.060823164880275726, - "debug/policy_weights": 0.10039641708135605, - "debug/raw_losses": 0.63308185338974, - "epoch": 0.19896538002387584, - "grad_norm": 1.78198841800004, + "epoch": 0.2, "learning_rate": 4.852907416036558e-07, - "logits/chosen": -2.4755935668945312, - "logits/rejected": -2.471782922744751, - "logps/chosen": -233.53451538085938, - "logps/rejected": -271.2043762207031, - "loss": 0.067, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.9423769116401672, - "rewards/margins": 0.25337982177734375, - "rewards/rejected": -1.1957566738128662, + "logits/chosen": -2.415271282196045, + "logits/rejected": -2.4072234630584717, + "logps/chosen": -243.56332397460938, + "logps/rejected": -298.7532043457031, + "loss": 0.591, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.967076301574707, + "rewards/margins": 0.4581146240234375, + "rewards/rejected": -1.4251911640167236, "step": 250 }, { - "debug/losses": 0.06635858118534088, - "debug/policy_weights": 0.11175660043954849, - "debug/raw_losses": 0.5817192196846008, - "epoch": 0.20692399522483088, - "grad_norm": 1.8058774364103256, + "epoch": 0.21, "learning_rate": 4.828512000318616e-07, - "logits/chosen": -2.4792320728302, - "logits/rejected": -2.426579713821411, - "logps/chosen": -257.08880615234375, - "logps/rejected": -285.12872314453125, - "loss": 0.0635, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.9238799214363098, - "rewards/margins": 0.37587353587150574, - "rewards/rejected": -1.2997533082962036, + "logits/chosen": -2.3924427032470703, + "logits/rejected": -2.3613152503967285, + "logps/chosen": -266.86572265625, + "logps/rejected": -304.2983093261719, + "loss": 0.5986, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2214807271957397, + "rewards/margins": 0.4553411602973938, + "rewards/rejected": -1.6768219470977783, "step": 260 }, { - "debug/losses": 0.05608036369085312, - "debug/policy_weights": 0.09485211223363876, - "debug/raw_losses": 0.6099108457565308, - "epoch": 0.2148826104257859, - "grad_norm": 1.7589439888686003, + "epoch": 0.21, "learning_rate": 4.802316912577946e-07, - "logits/chosen": -2.371746778488159, - "logits/rejected": -2.3314661979675293, - "logps/chosen": -249.49166870117188, - "logps/rejected": -269.15106201171875, - "loss": 0.0578, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.018430471420288, - "rewards/margins": 0.33081451058387756, - "rewards/rejected": -1.3492449522018433, + "logits/chosen": -2.4108529090881348, + "logits/rejected": -2.3902478218078613, + "logps/chosen": -252.7959442138672, + "logps/rejected": -295.266357421875, + "loss": 0.5917, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0824626684188843, + "rewards/margins": 0.39643940329551697, + "rewards/rejected": -1.4789022207260132, "step": 270 }, { - "debug/losses": 0.052897512912750244, - "debug/policy_weights": 0.07982214540243149, - "debug/raw_losses": 0.6378833055496216, - "epoch": 0.22284122562674094, - "grad_norm": 1.5995617528843298, + "epoch": 0.22, "learning_rate": 4.774342398605221e-07, - "logits/chosen": -2.2725443840026855, - "logits/rejected": -2.2477705478668213, - "logps/chosen": -273.08953857421875, - "logps/rejected": -294.1094665527344, - "loss": 0.049, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2748507261276245, - "rewards/margins": 0.26763075590133667, - "rewards/rejected": -1.542481541633606, + "logits/chosen": -2.3505263328552246, + "logits/rejected": -2.2942967414855957, + "logps/chosen": -279.871337890625, + "logps/rejected": -300.4220886230469, + "loss": 0.5979, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1787078380584717, + "rewards/margins": 0.430286169052124, + "rewards/rejected": -1.6089938879013062, "step": 280 }, { - "debug/losses": 0.06544710695743561, - "debug/policy_weights": 0.10770060122013092, - "debug/raw_losses": 0.5902801752090454, - "epoch": 0.23079984082769597, - "grad_norm": 2.5512893652177375, + "epoch": 0.23, "learning_rate": 4.744610079482978e-07, - "logits/chosen": -2.2364134788513184, - "logits/rejected": -2.1829159259796143, - "logps/chosen": -278.95745849609375, - "logps/rejected": -308.3731384277344, - "loss": 0.0543, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1328349113464355, - "rewards/margins": 0.3837105631828308, - "rewards/rejected": -1.5165454149246216, + "logits/chosen": -2.3269264698028564, + "logits/rejected": -2.2910802364349365, + "logps/chosen": -255.27706909179688, + "logps/rejected": -281.60137939453125, + "loss": 0.5853, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1753785610198975, + "rewards/margins": 0.3495523929595947, + "rewards/rejected": -1.5249310731887817, "step": 290 }, { - "debug/losses": 0.037609830498695374, - "debug/policy_weights": 0.06786644458770752, - "debug/raw_losses": 0.6028710603713989, - "epoch": 0.238758456028651, - "grad_norm": 1.753065545131153, + "epoch": 0.24, "learning_rate": 4.713142934875005e-07, - "logits/chosen": -2.1160764694213867, - "logits/rejected": -2.0635263919830322, - "logps/chosen": -289.21832275390625, - "logps/rejected": -311.9188537597656, - "loss": 0.0465, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.416634202003479, - "rewards/margins": 0.3938707709312439, - "rewards/rejected": -1.8105049133300781, + "logits/chosen": -2.2868428230285645, + "logits/rejected": -2.2631592750549316, + "logps/chosen": -284.2200012207031, + "logps/rejected": -322.45269775390625, + "loss": 0.5965, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.336501955986023, + "rewards/margins": 0.3968800902366638, + "rewards/rejected": -1.733382225036621, "step": 300 }, { - "epoch": 0.238758456028651, - "eval_debug/losses": 0.04409417510032654, - "eval_debug/policy_weights": 0.07315339893102646, - "eval_debug/raw_losses": 0.6074807643890381, - "eval_logits/chosen": -2.1516358852386475, - "eval_logits/rejected": -2.1315433979034424, - "eval_logps/chosen": -289.1007995605469, - "eval_logps/rejected": -331.553466796875, - "eval_loss": 0.04599982127547264, - "eval_rewards/accuracies": 0.66697758436203, - "eval_rewards/chosen": -1.448573112487793, - "eval_rewards/margins": 0.3599713444709778, - "eval_rewards/rejected": -1.808544397354126, - "eval_runtime": 152.7829, - "eval_samples_per_second": 55.975, - "eval_steps_per_second": 0.877, + "epoch": 0.24, + "eval_logits/chosen": -2.265592098236084, + "eval_logits/rejected": -2.244987964630127, + "eval_logps/chosen": -282.3620910644531, + "eval_logps/rejected": -331.2099609375, + "eval_loss": 0.5907339453697205, + "eval_rewards/accuracies": 0.6623134613037109, + "eval_rewards/chosen": -1.3778856992721558, + "eval_rewards/margins": 0.42287060618400574, + "eval_rewards/rejected": -1.8007562160491943, + "eval_runtime": 184.1495, + "eval_samples_per_second": 46.441, + "eval_steps_per_second": 0.728, "step": 300 }, { - "debug/losses": 0.03696579486131668, - "debug/policy_weights": 0.06142206862568855, - "debug/raw_losses": 0.5807094573974609, - "epoch": 0.24671707122960604, - "grad_norm": 1.4236707076819513, + "epoch": 0.25, "learning_rate": 4.679965285265706e-07, - "logits/chosen": -2.129534959793091, - "logits/rejected": -2.1084346771240234, - "logps/chosen": -263.85357666015625, - "logps/rejected": -315.09295654296875, - "loss": 0.0454, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.4629967212677002, - "rewards/margins": 0.39182788133621216, - "rewards/rejected": -1.8548247814178467, + "logits/chosen": -2.2354235649108887, + "logits/rejected": -2.23685884475708, + "logps/chosen": -277.09283447265625, + "logps/rejected": -347.7145080566406, + "loss": 0.5612, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3514426946640015, + "rewards/margins": 0.4907970428466797, + "rewards/rejected": -1.8422397375106812, "step": 310 }, { - "debug/losses": 0.039889153093099594, - "debug/policy_weights": 0.07385966926813126, - "debug/raw_losses": 0.515781044960022, - "epoch": 0.2546756864305611, - "grad_norm": 1.8086670273747332, + "epoch": 0.25, "learning_rate": 4.64510277316316e-07, - "logits/chosen": -2.105729103088379, - "logits/rejected": -2.050633192062378, - "logps/chosen": -284.87628173828125, - "logps/rejected": -340.914794921875, - "loss": 0.0397, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3924423456192017, - "rewards/margins": 0.624489963054657, - "rewards/rejected": -2.016932249069214, + "logits/chosen": -2.2262344360351562, + "logits/rejected": -2.226029634475708, + "logps/chosen": -271.74212646484375, + "logps/rejected": -332.5010986328125, + "loss": 0.5903, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3337775468826294, + "rewards/margins": 0.39512914419174194, + "rewards/rejected": -1.7289068698883057, "step": 320 }, { - "debug/losses": 0.03575636073946953, - "debug/policy_weights": 0.05730568245053291, - "debug/raw_losses": 0.6370795369148254, - "epoch": 0.26263430163151613, - "grad_norm": 1.4742792675036147, + "epoch": 0.26, "learning_rate": 4.6085823432804137e-07, - "logits/chosen": -2.0471785068511963, - "logits/rejected": -2.015995740890503, - "logps/chosen": -330.13970947265625, - "logps/rejected": -357.7228088378906, - "loss": 0.0358, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.8637893199920654, - "rewards/margins": 0.2782483696937561, - "rewards/rejected": -2.1420376300811768, + "logits/chosen": -2.2451891899108887, + "logits/rejected": -2.2502384185791016, + "logps/chosen": -250.6347198486328, + "logps/rejected": -333.8939208984375, + "loss": 0.5722, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1326004266738892, + "rewards/margins": 0.5066065192222595, + "rewards/rejected": -1.639206886291504, "step": 330 }, { - "debug/losses": 0.026058007031679153, - "debug/policy_weights": 0.044492077082395554, - "debug/raw_losses": 0.5921159982681274, - "epoch": 0.27059291683247116, - "grad_norm": 0.8069991257279925, + "epoch": 0.27, "learning_rate": 4.570432221710314e-07, - "logits/chosen": -2.0316271781921387, - "logits/rejected": -2.0155906677246094, - "logps/chosen": -339.8232727050781, - "logps/rejected": -390.1024475097656, - "loss": 0.0253, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.936417579650879, - "rewards/margins": 0.40920180082321167, - "rewards/rejected": -2.3456194400787354, + "logits/chosen": -2.0656931400299072, + "logits/rejected": -2.0213730335235596, + "logps/chosen": -318.232177734375, + "logps/rejected": -369.13311767578125, + "loss": 0.5766, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.584176778793335, + "rewards/margins": 0.5901076197624207, + "rewards/rejected": -2.1742844581604004, "step": 340 }, { - "debug/losses": 0.03554385155439377, - "debug/policy_weights": 0.06352627277374268, - "debug/raw_losses": 0.5905269384384155, - "epoch": 0.2785515320334262, - "grad_norm": 1.8338764172051625, + "epoch": 0.28, "learning_rate": 4.5306818941099866e-07, - "logits/chosen": -2.1141629219055176, - "logits/rejected": -2.056692123413086, - "logps/chosen": -321.1317138671875, - "logps/rejected": -345.6830749511719, - "loss": 0.0347, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.7090749740600586, - "rewards/margins": 0.4097229838371277, - "rewards/rejected": -2.118797779083252, + "logits/chosen": -1.9084612131118774, + "logits/rejected": -1.8514792919158936, + "logps/chosen": -316.9821472167969, + "logps/rejected": -352.9412841796875, + "loss": 0.5825, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5627154111862183, + "rewards/margins": 0.5152220726013184, + "rewards/rejected": -2.077937364578247, "step": 350 }, { - "debug/losses": 0.039255283772945404, - "debug/policy_weights": 0.06526477634906769, - "debug/raw_losses": 0.5919562578201294, - "epoch": 0.28651014723438123, - "grad_norm": 1.341401556005613, + "epoch": 0.29, "learning_rate": 4.4893620829118124e-07, - "logits/chosen": -2.1538476943969727, - "logits/rejected": -2.1225638389587402, - "logps/chosen": -299.4391174316406, - "logps/rejected": -328.9165954589844, - "loss": 0.042, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.477020025253296, - "rewards/margins": 0.358177125453949, - "rewards/rejected": -1.8351972103118896, + "logits/chosen": -1.8860156536102295, + "logits/rejected": -1.8301204442977905, + "logps/chosen": -309.8200378417969, + "logps/rejected": -362.0408935546875, + "loss": 0.5755, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5869390964508057, + "rewards/margins": 0.49348369240760803, + "rewards/rejected": -2.080422878265381, "step": 360 }, { - "debug/losses": 0.03851500526070595, - "debug/policy_weights": 0.0639430433511734, - "debug/raw_losses": 0.6564086079597473, - "epoch": 0.29446876243533626, - "grad_norm": 1.6981934763772588, + "epoch": 0.29, "learning_rate": 4.4465047235785185e-07, - "logits/chosen": -2.1684069633483887, - "logits/rejected": -2.133373498916626, - "logps/chosen": -311.69146728515625, - "logps/rejected": -320.17266845703125, - "loss": 0.0399, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.6684709787368774, - "rewards/margins": 0.2505863904953003, - "rewards/rejected": -1.9190574884414673, + "logits/chosen": -1.6610889434814453, + "logits/rejected": -1.585129737854004, + "logps/chosen": -321.8608703613281, + "logps/rejected": -380.31036376953125, + "loss": 0.5697, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.774713158607483, + "rewards/margins": 0.6593302488327026, + "rewards/rejected": -2.4340434074401855, "step": 370 }, { - "debug/losses": 0.03426089510321617, - "debug/policy_weights": 0.05481856316328049, - "debug/raw_losses": 0.636349081993103, - "epoch": 0.3024273776362913, - "grad_norm": 1.88904780932561, + "epoch": 0.3, "learning_rate": 4.40214293992074e-07, - "logits/chosen": -2.0536434650421143, - "logits/rejected": -2.0290040969848633, - "logps/chosen": -313.34796142578125, - "logps/rejected": -339.9054260253906, - "loss": 0.0367, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.7595481872558594, - "rewards/margins": 0.2933933436870575, - "rewards/rejected": -2.0529415607452393, + "logits/chosen": -1.385825753211975, + "logits/rejected": -1.31913161277771, + "logps/chosen": -377.07269287109375, + "logps/rejected": -459.5557556152344, + "loss": 0.5818, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1890993118286133, + "rewards/margins": 0.7521292567253113, + "rewards/rejected": -2.9412286281585693, "step": 380 }, { - "debug/losses": 0.049106162041425705, - "debug/policy_weights": 0.07827477902173996, - "debug/raw_losses": 0.6187797784805298, - "epoch": 0.3103859928372463, - "grad_norm": 1.386576352552282, + "epoch": 0.31, "learning_rate": 4.3563110184961234e-07, - "logits/chosen": -2.0677030086517334, - "logits/rejected": -2.041632890701294, - "logps/chosen": -306.24053955078125, - "logps/rejected": -343.28515625, - "loss": 0.0419, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.5703617334365845, - "rewards/margins": 0.34101730585098267, - "rewards/rejected": -1.9113788604736328, + "logits/chosen": -1.5089499950408936, + "logits/rejected": -1.4075387716293335, + "logps/chosen": -338.3626708984375, + "logps/rejected": -396.67578125, + "loss": 0.5584, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9042552709579468, + "rewards/margins": 0.5932050347328186, + "rewards/rejected": -2.49746036529541, "step": 390 }, { - "debug/losses": 0.029186096042394638, - "debug/policy_weights": 0.054026149213314056, - "debug/raw_losses": 0.5490237474441528, - "epoch": 0.31834460803820136, - "grad_norm": 1.1347734675549332, + "epoch": 0.32, "learning_rate": 4.3090443821097566e-07, - "logits/chosen": -2.0278306007385254, - "logits/rejected": -1.9964557886123657, - "logps/chosen": -312.5970458984375, - "logps/rejected": -376.74591064453125, - "loss": 0.0301, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.765263319015503, - "rewards/margins": 0.4999556541442871, - "rewards/rejected": -2.265219211578369, + "logits/chosen": -1.2587625980377197, + "logits/rejected": -1.2017955780029297, + "logps/chosen": -309.43377685546875, + "logps/rejected": -372.00531005859375, + "loss": 0.5729, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.9031288623809814, + "rewards/margins": 0.5602144598960876, + "rewards/rejected": -2.4633431434631348, "step": 400 }, { - "epoch": 0.31834460803820136, - "eval_debug/losses": 0.027978185564279556, - "eval_debug/policy_weights": 0.04763055220246315, - "eval_debug/raw_losses": 0.606778085231781, - "eval_logits/chosen": -2.021036386489868, - "eval_logits/rejected": -1.9971833229064941, - "eval_logps/chosen": -334.9689025878906, - "eval_logps/rejected": -375.8979797363281, - "eval_loss": 0.0302441269159317, - "eval_rewards/accuracies": 0.6604477763175964, - "eval_rewards/chosen": -1.9072539806365967, - "eval_rewards/margins": 0.34473562240600586, - "eval_rewards/rejected": -2.2519896030426025, - "eval_runtime": 152.8965, - "eval_samples_per_second": 55.933, - "eval_steps_per_second": 0.876, + "epoch": 0.32, + "eval_logits/chosen": -1.3760210275650024, + "eval_logits/rejected": -1.2920024394989014, + "eval_logps/chosen": -312.20635986328125, + "eval_logps/rejected": -375.1720275878906, + "eval_loss": 0.5711147785186768, + "eval_rewards/accuracies": 0.6828358173370361, + "eval_rewards/chosen": -1.676328182220459, + "eval_rewards/margins": 0.5640482306480408, + "eval_rewards/rejected": -2.2403764724731445, + "eval_runtime": 183.9293, + "eval_samples_per_second": 46.496, + "eval_steps_per_second": 0.729, "step": 400 }, { - "debug/losses": 0.033793386071920395, - "debug/policy_weights": 0.05818073824048042, - "debug/raw_losses": 0.5851433277130127, - "epoch": 0.3263032232391564, - "grad_norm": 1.5661418376002596, + "epoch": 0.33, "learning_rate": 4.2603795624364195e-07, - "logits/chosen": -2.012786865234375, - "logits/rejected": -1.9587138891220093, - "logps/chosen": -307.849365234375, - "logps/rejected": -336.9012756347656, - "loss": 0.032, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.714755654335022, - "rewards/margins": 0.37940770387649536, - "rewards/rejected": -2.094163417816162, + "logits/chosen": -1.2894772291183472, + "logits/rejected": -1.23129141330719, + "logps/chosen": -299.457275390625, + "logps/rejected": -370.8555908203125, + "loss": 0.5666, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6054102182388306, + "rewards/margins": 0.5984233021736145, + "rewards/rejected": -2.203833818435669, "step": 410 }, { - "debug/losses": 0.06058833748102188, - "debug/policy_weights": 0.09707297384738922, - "debug/raw_losses": 0.5794335603713989, - "epoch": 0.3342618384401114, - "grad_norm": 1.8472937892928514, + "epoch": 0.33, "learning_rate": 4.210354171785795e-07, - "logits/chosen": -2.123286724090576, - "logits/rejected": -2.116464138031006, - "logps/chosen": -268.4242248535156, - "logps/rejected": -322.94091796875, - "loss": 0.0525, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.262597918510437, - "rewards/margins": 0.42006754875183105, - "rewards/rejected": -1.6826655864715576, + "logits/chosen": -1.022984266281128, + "logits/rejected": -0.9285897016525269, + "logps/chosen": -324.4284973144531, + "logps/rejected": -385.0074157714844, + "loss": 0.5596, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.921677589416504, + "rewards/margins": 0.5404387712478638, + "rewards/rejected": -2.4621164798736572, "step": 420 }, { - "debug/losses": 0.046358704566955566, - "debug/policy_weights": 0.08018581569194794, - "debug/raw_losses": 0.6013703942298889, - "epoch": 0.34222045364106646, - "grad_norm": 1.3294109687375146, + "epoch": 0.34, "learning_rate": 4.15900687403248e-07, - "logits/chosen": -2.1637144088745117, - "logits/rejected": -2.14619779586792, - "logps/chosen": -269.3055114746094, - "logps/rejected": -311.72271728515625, - "loss": 0.0515, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3137115240097046, - "rewards/margins": 0.40074974298477173, - "rewards/rejected": -1.714461326599121, + "logits/chosen": -0.8059805631637573, + "logits/rejected": -0.7196700572967529, + "logps/chosen": -353.788330078125, + "logps/rejected": -411.4853515625, + "loss": 0.5865, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.1321234703063965, + "rewards/margins": 0.463266521692276, + "rewards/rejected": -2.5953898429870605, "step": 430 }, { - "debug/losses": 0.028133749961853027, - "debug/policy_weights": 0.04740751534700394, - "debug/raw_losses": 0.6014739871025085, - "epoch": 0.3501790688420215, - "grad_norm": 0.7809537654588382, + "epoch": 0.35, "learning_rate": 4.1063773547332584e-07, - "logits/chosen": -2.0739526748657227, - "logits/rejected": -2.0483832359313965, - "logps/chosen": -318.5735168457031, - "logps/rejected": -358.01007080078125, - "loss": 0.0305, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.7163457870483398, - "rewards/margins": 0.38432344794273376, - "rewards/rejected": -2.1006693840026855, + "logits/chosen": -0.9645301699638367, + "logits/rejected": -0.7601315975189209, + "logps/chosen": -346.8272705078125, + "logps/rejected": -392.2935791015625, + "loss": 0.5591, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9153356552124023, + "rewards/margins": 0.5891679525375366, + "rewards/rejected": -2.5045037269592285, "step": 440 }, { - "debug/losses": 0.020220816135406494, - "debug/policy_weights": 0.03701246529817581, - "debug/raw_losses": 0.5830426812171936, - "epoch": 0.3581376840429765, - "grad_norm": 0.9223432546306553, + "epoch": 0.36, "learning_rate": 4.0525062904547276e-07, - "logits/chosen": -2.0108728408813477, - "logits/rejected": -1.9766314029693604, - "logps/chosen": -317.477783203125, - "logps/rejected": -354.3273010253906, - "loss": 0.028, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.849148154258728, - "rewards/margins": 0.4447658658027649, - "rewards/rejected": -2.293914318084717, + "logits/chosen": -0.608537495136261, + "logits/rejected": -0.47767123579978943, + "logps/chosen": -341.55364990234375, + "logps/rejected": -434.1073303222656, + "loss": 0.5687, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.105318069458008, + "rewards/margins": 0.6994394659996033, + "rewards/rejected": -2.8047571182250977, "step": 450 }, { - "debug/losses": 0.032562464475631714, - "debug/policy_weights": 0.055046629160642624, - "debug/raw_losses": 0.5713292360305786, - "epoch": 0.36609629924393156, - "grad_norm": 1.046176079788828, + "epoch": 0.37, "learning_rate": 3.997435317334988e-07, - "logits/chosen": -2.074617385864258, - "logits/rejected": -2.0565450191497803, - "logps/chosen": -322.37738037109375, - "logps/rejected": -373.4345703125, - "loss": 0.0293, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.753626823425293, - "rewards/margins": 0.44904619455337524, - "rewards/rejected": -2.2026727199554443, + "logits/chosen": -0.6356207132339478, + "logits/rejected": -0.25634175539016724, + "logps/chosen": -384.43780517578125, + "logps/rejected": -419.24176025390625, + "loss": 0.5608, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.2970900535583496, + "rewards/margins": 0.6535100340843201, + "rewards/rejected": -2.9506001472473145, "step": 460 }, { - "debug/losses": 0.03133855015039444, - "debug/policy_weights": 0.05432797595858574, - "debug/raw_losses": 0.5810926556587219, - "epoch": 0.3740549144448866, - "grad_norm": 0.9700020848498487, + "epoch": 0.37, "learning_rate": 3.941206998903701e-07, - "logits/chosen": -2.1040358543395996, - "logits/rejected": -2.0743563175201416, - "logps/chosen": -331.5384826660156, - "logps/rejected": -365.0169677734375, - "loss": 0.0285, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.7676904201507568, - "rewards/margins": 0.36116519570350647, - "rewards/rejected": -2.1288554668426514, + "logits/chosen": -1.0318920612335205, + "logits/rejected": -0.7451022267341614, + "logps/chosen": -338.9430236816406, + "logps/rejected": -384.64111328125, + "loss": 0.5678, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9643396139144897, + "rewards/margins": 0.5402536392211914, + "rewards/rejected": -2.5045928955078125, "step": 470 }, { - "debug/losses": 0.03259238973259926, - "debug/policy_weights": 0.052516065537929535, - "debug/raw_losses": 0.5992477536201477, - "epoch": 0.3820135296458416, - "grad_norm": 1.3154459086844248, + "epoch": 0.38, "learning_rate": 3.8838647931853684e-07, - "logits/chosen": -2.0499539375305176, - "logits/rejected": -2.0263118743896484, - "logps/chosen": -289.03106689453125, - "logps/rejected": -331.93133544921875, - "loss": 0.0345, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.6713088750839233, - "rewards/margins": 0.3960718512535095, - "rewards/rejected": -2.067380666732788, + "logits/chosen": -0.6847028732299805, + "logits/rejected": -0.5548251867294312, + "logps/chosen": -339.61456298828125, + "logps/rejected": -435.32061767578125, + "loss": 0.5814, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1349122524261475, + "rewards/margins": 0.7573872804641724, + "rewards/rejected": -2.8923001289367676, "step": 480 }, { - "debug/losses": 0.03394859656691551, - "debug/policy_weights": 0.0627564936876297, - "debug/raw_losses": 0.5517407655715942, - "epoch": 0.38997214484679665, - "grad_norm": 1.4921908216218414, + "epoch": 0.39, "learning_rate": 3.825453019111281e-07, - "logits/chosen": -2.1080005168914795, - "logits/rejected": -2.0806961059570312, - "logps/chosen": -297.0895080566406, - "logps/rejected": -362.0596618652344, - "loss": 0.0356, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.5262044668197632, - "rewards/margins": 0.549400269985199, - "rewards/rejected": -2.0756049156188965, + "logits/chosen": -0.5378957986831665, + "logits/rejected": -0.28533270955085754, + "logps/chosen": -363.78570556640625, + "logps/rejected": -430.11749267578125, + "loss": 0.5327, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.134934425354004, + "rewards/margins": 0.6089809536933899, + "rewards/rejected": -2.743915319442749, "step": 490 }, { - "debug/losses": 0.04039504751563072, - "debug/policy_weights": 0.07268974184989929, - "debug/raw_losses": 0.543070375919342, - "epoch": 0.3979307600477517, - "grad_norm": 1.5546856632601864, + "epoch": 0.4, "learning_rate": 3.7660168222660824e-07, - "logits/chosen": -2.1996207237243652, - "logits/rejected": -2.1463570594787598, - "logps/chosen": -322.22967529296875, - "logps/rejected": -356.3480224609375, - "loss": 0.0365, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.537804365158081, - "rewards/margins": 0.5032481551170349, - "rewards/rejected": -2.0410525798797607, + "logits/chosen": -0.6318235397338867, + "logits/rejected": -0.5071814656257629, + "logps/chosen": -350.5252380371094, + "logps/rejected": -421.93353271484375, + "loss": 0.5645, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.062009572982788, + "rewards/margins": 0.5333147048950195, + "rewards/rejected": -2.5953242778778076, "step": 500 }, { - "epoch": 0.3979307600477517, - "eval_debug/losses": 0.03992186114192009, - "eval_debug/policy_weights": 0.06951306015253067, - "eval_debug/raw_losses": 0.5867676734924316, - "eval_logits/chosen": -2.1866133213043213, - "eval_logits/rejected": -2.1609621047973633, - "eval_logps/chosen": -295.57781982421875, - "eval_logps/rejected": -349.751708984375, - "eval_loss": 0.04235580936074257, - "eval_rewards/accuracies": 0.683768630027771, - "eval_rewards/chosen": -1.513343334197998, - "eval_rewards/margins": 0.477183997631073, - "eval_rewards/rejected": -1.9905272722244263, - "eval_runtime": 152.8577, - "eval_samples_per_second": 55.947, - "eval_steps_per_second": 0.877, + "epoch": 0.4, + "eval_logits/chosen": -0.7860146760940552, + "eval_logits/rejected": -0.6090859770774841, + "eval_logps/chosen": -351.7882995605469, + "eval_logps/rejected": -419.81939697265625, + "eval_loss": 0.5639454126358032, + "eval_rewards/accuracies": 0.6986940503120422, + "eval_rewards/chosen": -2.0721471309661865, + "eval_rewards/margins": 0.6147031188011169, + "eval_rewards/rejected": -2.6868505477905273, + "eval_runtime": 184.0154, + "eval_samples_per_second": 46.474, + "eval_steps_per_second": 0.728, "step": 500 }, { - "debug/losses": 0.041816793382167816, - "debug/policy_weights": 0.07070693373680115, - "debug/raw_losses": 0.5931949615478516, - "epoch": 0.4058893752487067, - "grad_norm": 1.8707198573695687, + "epoch": 0.41, "learning_rate": 3.705602139995416e-07, - "logits/chosen": -2.1828060150146484, - "logits/rejected": -2.147474765777588, - "logps/chosen": -293.61553955078125, - "logps/rejected": -337.2850036621094, - "loss": 0.0503, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.4135316610336304, - "rewards/margins": 0.45567187666893005, - "rewards/rejected": -1.8692035675048828, + "logits/chosen": -0.7258490920066833, + "logits/rejected": -0.4828409254550934, + "logps/chosen": -388.1371154785156, + "logps/rejected": -422.11181640625, + "loss": 0.574, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.290266513824463, + "rewards/margins": 0.4104091227054596, + "rewards/rejected": -2.7006754875183105, "step": 510 }, { - "debug/losses": 0.04330951347947121, - "debug/policy_weights": 0.06736676394939423, - "debug/raw_losses": 0.6137939691543579, - "epoch": 0.41384799044966175, - "grad_norm": 1.4619434139696437, + "epoch": 0.41, "learning_rate": 3.6442556659016475e-07, - "logits/chosen": -2.185202121734619, - "logits/rejected": -2.165663003921509, - "logps/chosen": -298.6885681152344, - "logps/rejected": -335.07366943359375, - "loss": 0.0422, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.5118178129196167, - "rewards/margins": 0.34012988209724426, - "rewards/rejected": -1.8519477844238281, + "logits/chosen": -0.5335447192192078, + "logits/rejected": -0.33706527948379517, + "logps/chosen": -378.86492919921875, + "logps/rejected": -429.67724609375, + "loss": 0.5608, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.236337423324585, + "rewards/margins": 0.556148886680603, + "rewards/rejected": -2.7924864292144775, "step": 520 }, { - "debug/losses": 0.04355059936642647, - "debug/policy_weights": 0.06907118856906891, - "debug/raw_losses": 0.6439172029495239, - "epoch": 0.4218066056506168, - "grad_norm": 1.3367245541718495, + "epoch": 0.42, "learning_rate": 3.582024813755076e-07, - "logits/chosen": -2.108959436416626, - "logits/rejected": -2.086933135986328, - "logps/chosen": -321.8992614746094, - "logps/rejected": -345.1759033203125, - "loss": 0.0345, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.71266770362854, - "rewards/margins": 0.27295029163360596, - "rewards/rejected": -1.985618233680725, + "logits/chosen": -0.39548322558403015, + "logits/rejected": -0.10662730038166046, + "logps/chosen": -368.8847961425781, + "logps/rejected": -473.3500061035156, + "loss": 0.5485, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.3263449668884277, + "rewards/margins": 0.8236624598503113, + "rewards/rejected": -3.150007724761963, "step": 530 }, { - "debug/losses": 0.03872833028435707, - "debug/policy_weights": 0.06447508931159973, - "debug/raw_losses": 0.5817040205001831, - "epoch": 0.4297652208515718, - "grad_norm": 1.3025243070628232, + "epoch": 0.43, "learning_rate": 3.5189576808485404e-07, - "logits/chosen": -2.1122708320617676, - "logits/rejected": -2.088346004486084, - "logps/chosen": -310.7181701660156, - "logps/rejected": -361.2312316894531, - "loss": 0.0377, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.601313829421997, - "rewards/margins": 0.45615309476852417, - "rewards/rejected": -2.057466983795166, + "logits/chosen": 0.15742243826389313, + "logits/rejected": 0.31491726636886597, + "logps/chosen": -394.34930419921875, + "logps/rejected": -492.82232666015625, + "loss": 0.5478, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6109700202941895, + "rewards/margins": 0.8250136375427246, + "rewards/rejected": -3.435983657836914, "step": 540 }, { - "debug/losses": 0.03740643709897995, - "debug/policy_weights": 0.05903150886297226, - "debug/raw_losses": 0.6242271065711975, - "epoch": 0.43772383605252685, - "grad_norm": 1.8261628628746243, + "epoch": 0.44, "learning_rate": 3.4551030108237433e-07, - "logits/chosen": -2.097672462463379, - "logits/rejected": -2.042898654937744, - "logps/chosen": -325.98193359375, - "logps/rejected": -347.8229675292969, - "loss": 0.0361, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.7341053485870361, - "rewards/margins": 0.3862043023109436, - "rewards/rejected": -2.120309591293335, + "logits/chosen": -0.2550584375858307, + "logits/rejected": -0.06936412304639816, + "logps/chosen": -406.5508728027344, + "logps/rejected": -448.47576904296875, + "loss": 0.5562, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.5152666568756104, + "rewards/margins": 0.4819938540458679, + "rewards/rejected": -2.997260332107544, "step": 550 }, { - "debug/losses": 0.028903227299451828, - "debug/policy_weights": 0.05456990748643875, - "debug/raw_losses": 0.5671552419662476, - "epoch": 0.4456824512534819, - "grad_norm": 1.3969384254784958, + "epoch": 0.45, "learning_rate": 3.390510155998023e-07, - "logits/chosen": -2.142831325531006, - "logits/rejected": -2.1043479442596436, - "logps/chosen": -335.3097229003906, - "logps/rejected": -388.9020080566406, - "loss": 0.0296, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.8105127811431885, - "rewards/margins": 0.5163544416427612, - "rewards/rejected": -2.32686710357666, + "logits/chosen": -0.5292027592658997, + "logits/rejected": -0.2619571387767792, + "logps/chosen": -371.6798095703125, + "logps/rejected": -420.7915954589844, + "loss": 0.5492, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1147050857543945, + "rewards/margins": 0.6524336338043213, + "rewards/rejected": -2.7671384811401367, "step": 560 }, { - "debug/losses": 0.031607236713171005, - "debug/policy_weights": 0.05318068340420723, - "debug/raw_losses": 0.5985600352287292, - "epoch": 0.4536410664544369, - "grad_norm": 1.647741129882177, + "epoch": 0.45, "learning_rate": 3.325229039220684e-07, - "logits/chosen": -2.085341691970825, - "logits/rejected": -2.064438581466675, - "logps/chosen": -350.662353515625, - "logps/rejected": -387.7210998535156, - "loss": 0.0299, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.9661823511123657, - "rewards/margins": 0.3896261155605316, - "rewards/rejected": -2.3558084964752197, + "logits/chosen": -0.5881962776184082, + "logits/rejected": -0.4658876061439514, + "logps/chosen": -343.7039794921875, + "logps/rejected": -406.14178466796875, + "loss": 0.57, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0860273838043213, + "rewards/margins": 0.498068630695343, + "rewards/rejected": -2.5840957164764404, "step": 570 }, { - "debug/losses": 0.03731871768832207, - "debug/policy_weights": 0.0585642084479332, - "debug/raw_losses": 0.5958778858184814, - "epoch": 0.46159968165539195, - "grad_norm": 1.4791769718410137, + "epoch": 0.46, "learning_rate": 3.2593101152883795e-07, - "logits/chosen": -2.095475435256958, - "logits/rejected": -2.058140277862549, - "logps/chosen": -331.99822998046875, - "logps/rejected": -371.34918212890625, - "loss": 0.0269, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.8306747674942017, - "rewards/margins": 0.4158768653869629, - "rewards/rejected": -2.246551513671875, + "logits/chosen": -0.6565806269645691, + "logits/rejected": -0.2549567222595215, + "logps/chosen": -374.8047180175781, + "logps/rejected": -430.33221435546875, + "loss": 0.5512, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.2211391925811768, + "rewards/margins": 0.6813799142837524, + "rewards/rejected": -2.9025187492370605, "step": 580 }, { - "debug/losses": 0.03436756879091263, - "debug/policy_weights": 0.05549240857362747, - "debug/raw_losses": 0.5947594046592712, - "epoch": 0.469558296856347, - "grad_norm": 1.311005334728305, + "epoch": 0.47, "learning_rate": 3.192804331949349e-07, - "logits/chosen": -2.0892815589904785, - "logits/rejected": -2.064006805419922, - "logps/chosen": -329.79632568359375, - "logps/rejected": -367.3946228027344, - "loss": 0.0293, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.8716663122177124, - "rewards/margins": 0.40899592638015747, - "rewards/rejected": -2.2806622982025146, + "logits/chosen": -0.07184700667858124, + "logits/rejected": 0.1699156016111374, + "logps/chosen": -422.27081298828125, + "logps/rejected": -490.69134521484375, + "loss": 0.535, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.730973720550537, + "rewards/margins": 0.7726518511772156, + "rewards/rejected": -3.5036251544952393, "step": 590 }, { - "debug/losses": 0.02820407785475254, - "debug/policy_weights": 0.05099906399846077, - "debug/raw_losses": 0.572905421257019, - "epoch": 0.477516912057302, - "grad_norm": 1.2557571913266798, + "epoch": 0.48, "learning_rate": 3.125763090526674e-07, - "logits/chosen": -2.1351561546325684, - "logits/rejected": -2.0871920585632324, - "logps/chosen": -335.27978515625, - "logps/rejected": -378.8961486816406, - "loss": 0.0314, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.804330587387085, - "rewards/margins": 0.49522823095321655, - "rewards/rejected": -2.299558639526367, + "logits/chosen": -0.029465889558196068, + "logits/rejected": 0.15842057764530182, + "logps/chosen": -417.373046875, + "logps/rejected": -478.73291015625, + "loss": 0.5513, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.8006317615509033, + "rewards/margins": 0.6451797485351562, + "rewards/rejected": -3.4458115100860596, "step": 600 }, { - "epoch": 0.477516912057302, - "eval_debug/losses": 0.028778724372386932, - "eval_debug/policy_weights": 0.05042354390025139, - "eval_debug/raw_losses": 0.5833220481872559, - "eval_logits/chosen": -2.128926992416382, - "eval_logits/rejected": -2.1045045852661133, - "eval_logps/chosen": -328.2419128417969, - "eval_logps/rejected": -378.8386535644531, - "eval_loss": 0.030947599560022354, - "eval_rewards/accuracies": 0.6772388219833374, - "eval_rewards/chosen": -1.839984655380249, - "eval_rewards/margins": 0.4414121210575104, - "eval_rewards/rejected": -2.2813963890075684, - "eval_runtime": 152.7586, - "eval_samples_per_second": 55.984, - "eval_steps_per_second": 0.877, + "epoch": 0.48, + "eval_logits/chosen": -0.10542195290327072, + "eval_logits/rejected": 0.12242482602596283, + "eval_logps/chosen": -436.9386291503906, + "eval_logps/rejected": -505.02227783203125, + "eval_loss": 0.5582411885261536, + "eval_rewards/accuracies": 0.7108209133148193, + "eval_rewards/chosen": -2.9236514568328857, + "eval_rewards/margins": 0.6152271032333374, + "eval_rewards/rejected": -3.5388784408569336, + "eval_runtime": 183.9235, + "eval_samples_per_second": 46.498, + "eval_steps_per_second": 0.729, "step": 600 }, { - "debug/losses": 0.028522927314043045, - "debug/policy_weights": 0.04586270451545715, - "debug/raw_losses": 0.6142871379852295, - "epoch": 0.48547552725825704, - "grad_norm": 1.182590096149601, + "epoch": 0.49, "learning_rate": 3.0582382061909623e-07, - "logits/chosen": -2.143110752105713, - "logits/rejected": -2.1091790199279785, - "logps/chosen": -331.83636474609375, - "logps/rejected": -368.21844482421875, - "loss": 0.0302, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.9071013927459717, - "rewards/margins": 0.3452487289905548, - "rewards/rejected": -2.252350091934204, + "logits/chosen": -0.2445104569196701, + "logits/rejected": -0.018268002197146416, + "logps/chosen": -441.7857971191406, + "logps/rejected": -502.60791015625, + "loss": 0.5594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.858261823654175, + "rewards/margins": 0.5510683655738831, + "rewards/rejected": -3.409330368041992, "step": 610 }, { - "debug/losses": 0.03037545084953308, - "debug/policy_weights": 0.0514327809214592, - "debug/raw_losses": 0.5810720324516296, - "epoch": 0.4934341424592121, - "grad_norm": 1.593561174181043, + "epoch": 0.49, "learning_rate": 2.9902818679131775e-07, - "logits/chosen": -2.1445069313049316, - "logits/rejected": -2.091860294342041, - "logps/chosen": -326.50469970703125, - "logps/rejected": -367.2829284667969, - "loss": 0.0321, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.7178184986114502, - "rewards/margins": 0.4634013772010803, - "rewards/rejected": -2.1812198162078857, + "logits/chosen": -0.4190225601196289, + "logits/rejected": -0.22823679447174072, + "logps/chosen": -399.03924560546875, + "logps/rejected": -498.6724548339844, + "loss": 0.5499, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.650449275970459, + "rewards/margins": 0.7673205137252808, + "rewards/rejected": -3.4177703857421875, "step": 620 }, { - "debug/losses": 0.030525147914886475, - "debug/policy_weights": 0.05667508766055107, - "debug/raw_losses": 0.5397539734840393, - "epoch": 0.5013927576601671, - "grad_norm": 2.8222395351404543, + "epoch": 0.5, "learning_rate": 2.921946598128571e-07, - "logits/chosen": -2.10142183303833, - "logits/rejected": -2.067218780517578, - "logps/chosen": -305.54437255859375, - "logps/rejected": -338.94415283203125, - "loss": 0.0373, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.5276521444320679, - "rewards/margins": 0.5411427617073059, - "rewards/rejected": -2.0687947273254395, + "logits/chosen": -0.43653860688209534, + "logits/rejected": -0.20837187767028809, + "logps/chosen": -402.82781982421875, + "logps/rejected": -485.4117736816406, + "loss": 0.5739, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.573000192642212, + "rewards/margins": 0.7478531002998352, + "rewards/rejected": -3.3208529949188232, "step": 630 }, { - "debug/losses": 0.03578261286020279, - "debug/policy_weights": 0.05523357912898064, - "debug/raw_losses": 0.62241131067276, - "epoch": 0.5093513728611222, - "grad_norm": 1.8331946866279472, + "epoch": 0.51, "learning_rate": 2.8532852121428733e-07, - "logits/chosen": -2.059525728225708, - "logits/rejected": -2.024341344833374, - "logps/chosen": -295.8885192871094, - "logps/rejected": -332.0082092285156, - "loss": 0.035, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.6036468744277954, - "rewards/margins": 0.38395947217941284, - "rewards/rejected": -1.987606406211853, + "logits/chosen": -0.43430274724960327, + "logits/rejected": -0.13240045309066772, + "logps/chosen": -397.2491149902344, + "logps/rejected": -442.12384033203125, + "loss": 0.5462, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4125733375549316, + "rewards/margins": 0.5821372866630554, + "rewards/rejected": -2.9947104454040527, "step": 640 }, { - "debug/losses": 0.0353676900267601, - "debug/policy_weights": 0.0586540512740612, - "debug/raw_losses": 0.5654557943344116, - "epoch": 0.5173099880620772, - "grad_norm": 1.6263620123295128, + "epoch": 0.52, "learning_rate": 2.7843507773121414e-07, - "logits/chosen": -2.0878262519836426, - "logits/rejected": -2.050427198410034, - "logps/chosen": -297.3978271484375, - "logps/rejected": -357.1239929199219, - "loss": 0.0368, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.6018199920654297, - "rewards/margins": 0.5254247784614563, - "rewards/rejected": -2.1272449493408203, + "logits/chosen": -0.4247920513153076, + "logits/rejected": -0.21372787654399872, + "logps/chosen": -389.4237976074219, + "logps/rejected": -458.3169860839844, + "loss": 0.5373, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.444688320159912, + "rewards/margins": 0.7236617207527161, + "rewards/rejected": -3.1683506965637207, "step": 650 }, { - "debug/losses": 0.029138093814253807, - "debug/policy_weights": 0.06430795043706894, - "debug/raw_losses": 0.5185871720314026, - "epoch": 0.5252686032630323, - "grad_norm": 1.4058645383465609, + "epoch": 0.53, "learning_rate": 2.715196572027789e-07, - "logits/chosen": -2.038132667541504, - "logits/rejected": -2.007830858230591, - "logps/chosen": -304.67694091796875, - "logps/rejected": -377.6889343261719, - "loss": 0.0357, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.602113127708435, - "rewards/margins": 0.6335779428482056, - "rewards/rejected": -2.2356910705566406, + "logits/chosen": -0.6697942614555359, + "logits/rejected": -0.4933086931705475, + "logps/chosen": -387.529296875, + "logps/rejected": -472.73944091796875, + "loss": 0.5685, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.3496451377868652, + "rewards/margins": 0.7728831171989441, + "rewards/rejected": -3.122528314590454, "step": 660 }, { - "debug/losses": 0.03085913695394993, - "debug/policy_weights": 0.05442778393626213, - "debug/raw_losses": 0.6082974672317505, - "epoch": 0.5332272184639872, - "grad_norm": 1.6041217842152309, + "epoch": 0.53, "learning_rate": 2.645876044538521e-07, - "logits/chosen": -2.0396082401275635, - "logits/rejected": -2.0031583309173584, - "logps/chosen": -327.6360168457031, - "logps/rejected": -360.5151062011719, - "loss": 0.033, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.7850401401519775, - "rewards/margins": 0.4116973280906677, - "rewards/rejected": -2.19673752784729, + "logits/chosen": -1.0338900089263916, + "logits/rejected": -0.8813627362251282, + "logps/chosen": -372.53118896484375, + "logps/rejected": -426.54241943359375, + "loss": 0.5725, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.201908588409424, + "rewards/margins": 0.5010865926742554, + "rewards/rejected": -2.7029950618743896, "step": 670 }, { - "debug/losses": 0.034388747066259384, - "debug/policy_weights": 0.057945240288972855, - "debug/raw_losses": 0.5781614184379578, - "epoch": 0.5411858336649423, - "grad_norm": 1.2956026443956346, + "epoch": 0.54, "learning_rate": 2.5764427716409815e-07, - "logits/chosen": -2.0535218715667725, - "logits/rejected": -2.0209546089172363, - "logps/chosen": -322.48492431640625, - "logps/rejected": -363.35797119140625, - "loss": 0.0298, + "logits/chosen": -0.9278701543807983, + "logits/rejected": -0.7282145023345947, + "logps/chosen": -347.2828674316406, + "logps/rejected": -416.9349060058594, + "loss": 0.5479, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.67715322971344, - "rewards/margins": 0.41988006234169006, - "rewards/rejected": -2.0970332622528076, + "rewards/chosen": -2.0276436805725098, + "rewards/margins": 0.743033230304718, + "rewards/rejected": -2.770677089691162, "step": 680 }, { - "debug/losses": 0.0360778272151947, - "debug/policy_weights": 0.059746015816926956, - "debug/raw_losses": 0.6053534150123596, - "epoch": 0.5491444488658973, - "grad_norm": 1.6135200911375513, + "epoch": 0.55, "learning_rate": 2.5069504172710494e-07, - "logits/chosen": -2.088912010192871, - "logits/rejected": -2.0784342288970947, - "logps/chosen": -319.89501953125, - "logps/rejected": -373.1657409667969, - "loss": 0.0349, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.7156000137329102, - "rewards/margins": 0.3546966016292572, - "rewards/rejected": -2.0702967643737793, + "logits/chosen": -0.5008482336997986, + "logits/rejected": -0.34875133633613586, + "logps/chosen": -373.7621154785156, + "logps/rejected": -485.12884521484375, + "loss": 0.5217, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.347053050994873, + "rewards/margins": 0.9024646878242493, + "rewards/rejected": -3.2495174407958984, "step": 690 }, { - "debug/losses": 0.049276988953351974, - "debug/policy_weights": 0.07480012625455856, - "debug/raw_losses": 0.6235750913619995, - "epoch": 0.5571030640668524, - "grad_norm": 1.5808440748566825, + "epoch": 0.56, "learning_rate": 2.4374526910277886e-07, - "logits/chosen": -2.0845532417297363, - "logits/rejected": -2.0596041679382324, - "logps/chosen": -284.52301025390625, - "logps/rejected": -313.69476318359375, - "loss": 0.0424, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.411106824874878, - "rewards/margins": 0.33600515127182007, - "rewards/rejected": -1.7471120357513428, + "logits/chosen": 0.06850005686283112, + "logits/rejected": 0.41385045647621155, + "logps/chosen": -411.46246337890625, + "logps/rejected": -476.6162109375, + "loss": 0.5571, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.6757898330688477, + "rewards/margins": 0.8085702657699585, + "rewards/rejected": -3.4843602180480957, "step": 700 }, { - "epoch": 0.5571030640668524, - "eval_debug/losses": 0.04337022453546524, - "eval_debug/policy_weights": 0.07531096041202545, - "eval_debug/raw_losses": 0.5755950212478638, - "eval_logits/chosen": -2.0900442600250244, - "eval_logits/rejected": -2.0673747062683105, - "eval_logps/chosen": -280.4930114746094, - "eval_logps/rejected": -331.8508605957031, - "eval_loss": 0.04617752134799957, - "eval_rewards/accuracies": 0.691231369972229, - "eval_rewards/chosen": -1.3624950647354126, - "eval_rewards/margins": 0.449023574590683, - "eval_rewards/rejected": -1.811518669128418, - "eval_runtime": 152.8491, - "eval_samples_per_second": 55.951, - "eval_steps_per_second": 0.877, + "epoch": 0.56, + "eval_logits/chosen": 0.035554468631744385, + "eval_logits/rejected": 0.2980235815048218, + "eval_logps/chosen": -424.2823486328125, + "eval_logps/rejected": -505.6960754394531, + "eval_loss": 0.5558871626853943, + "eval_rewards/accuracies": 0.704291045665741, + "eval_rewards/chosen": -2.797088146209717, + "eval_rewards/margins": 0.748529314994812, + "eval_rewards/rejected": -3.5456173419952393, + "eval_runtime": 183.8747, + "eval_samples_per_second": 46.51, + "eval_steps_per_second": 0.729, "step": 700 }, { - "debug/losses": 0.0449453704059124, - "debug/policy_weights": 0.0735592395067215, - "debug/raw_losses": 0.5990960597991943, - "epoch": 0.5650616792678074, - "grad_norm": 2.20481488012055, + "epoch": 0.57, "learning_rate": 2.368003306662104e-07, - "logits/chosen": -2.0511984825134277, - "logits/rejected": -2.01592755317688, - "logps/chosen": -304.49285888671875, - "logps/rejected": -334.17608642578125, - "loss": 0.0457, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.4889942407608032, - "rewards/margins": 0.4087558686733246, - "rewards/rejected": -1.8977501392364502, + "logits/chosen": 0.07857178151607513, + "logits/rejected": 0.3302653729915619, + "logps/chosen": -413.8836975097656, + "logps/rejected": -535.0875244140625, + "loss": 0.5287, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7193782329559326, + "rewards/margins": 1.0089346170425415, + "rewards/rejected": -3.7283127307891846, "step": 710 }, { - "debug/losses": 0.037745922803878784, - "debug/policy_weights": 0.07329835742712021, - "debug/raw_losses": 0.5745851397514343, - "epoch": 0.5730202944687625, - "grad_norm": 1.6814472517336854, + "epoch": 0.57, "learning_rate": 2.2986559405621886e-07, - "logits/chosen": -2.0232553482055664, - "logits/rejected": -1.9815248250961304, - "logps/chosen": -316.2637634277344, - "logps/rejected": -357.62969970703125, - "loss": 0.0387, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.5382474660873413, - "rewards/margins": 0.46971410512924194, - "rewards/rejected": -2.0079617500305176, + "logits/chosen": 0.2789291739463806, + "logits/rejected": 0.4242584705352783, + "logps/chosen": -422.7801818847656, + "logps/rejected": -522.7840576171875, + "loss": 0.5551, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.925621509552002, + "rewards/margins": 0.8043605089187622, + "rewards/rejected": -3.729982376098633, "step": 720 }, { - "debug/losses": 0.021795693784952164, - "debug/policy_weights": 0.03945660963654518, - "debug/raw_losses": 0.5574513673782349, - "epoch": 0.5809789096697174, - "grad_norm": 1.2278801956439722, + "epoch": 0.58, "learning_rate": 2.2294641902678443e-07, - "logits/chosen": -1.9514557123184204, - "logits/rejected": -1.9241917133331299, - "logps/chosen": -324.55279541015625, - "logps/rejected": -370.191650390625, - "loss": 0.0231, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.8938844203948975, - "rewards/margins": 0.4893426299095154, - "rewards/rejected": -2.3832271099090576, + "logits/chosen": -0.19327735900878906, + "logits/rejected": 0.043265581130981445, + "logps/chosen": -363.1488342285156, + "logps/rejected": -470.94970703125, + "loss": 0.5284, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.39530873298645, + "rewards/margins": 0.908363938331604, + "rewards/rejected": -3.3036727905273438, "step": 730 }, { - "debug/losses": 0.021131381392478943, - "debug/policy_weights": 0.040012426674366, - "debug/raw_losses": 0.5742698907852173, - "epoch": 0.5889375248706725, - "grad_norm": 1.454963087082975, + "epoch": 0.59, "learning_rate": 2.160481533045751e-07, - "logits/chosen": -1.9214550256729126, - "logits/rejected": -1.877915382385254, - "logps/chosen": -347.30078125, - "logps/rejected": -377.1596984863281, - "loss": 0.0255, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.9716631174087524, - "rewards/margins": 0.39278116822242737, - "rewards/rejected": -2.3644442558288574, + "logits/chosen": -0.37412697076797485, + "logits/rejected": -0.17320053279399872, + "logps/chosen": -390.2896423339844, + "logps/rejected": -428.08099365234375, + "loss": 0.5572, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.3005330562591553, + "rewards/margins": 0.48462891578674316, + "rewards/rejected": -2.7851624488830566, "step": 740 }, { - "debug/losses": 0.027965540066361427, - "debug/policy_weights": 0.05136016011238098, - "debug/raw_losses": 0.570824146270752, - "epoch": 0.5968961400716275, - "grad_norm": 1.3911526389472733, + "epoch": 0.6, "learning_rate": 2.0917612845576882e-07, - "logits/chosen": -1.9628822803497314, - "logits/rejected": -1.8985135555267334, - "logps/chosen": -327.02178955078125, - "logps/rejected": -358.8661804199219, - "loss": 0.03, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.705392599105835, - "rewards/margins": 0.5323067903518677, - "rewards/rejected": -2.237699270248413, + "logits/chosen": -0.26352375745773315, + "logits/rejected": -0.0010178961092606187, + "logps/chosen": -373.3875427246094, + "logps/rejected": -440.09442138671875, + "loss": 0.5534, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3315823078155518, + "rewards/margins": 0.6843063235282898, + "rewards/rejected": -3.0158886909484863, "step": 750 }, { - "debug/losses": 0.029450882226228714, - "debug/policy_weights": 0.05074804276227951, - "debug/raw_losses": 0.5880511403083801, - "epoch": 0.6048547552725826, - "grad_norm": 2.2333339678589184, + "epoch": 0.6, "learning_rate": 2.0233565576536564e-07, - "logits/chosen": -1.9341914653778076, - "logits/rejected": -1.9149129390716553, - "logps/chosen": -303.7951965332031, - "logps/rejected": -353.73541259765625, - "loss": 0.0359, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.618255376815796, - "rewards/margins": 0.4489392340183258, - "rewards/rejected": -2.067194700241089, + "logits/chosen": -0.3354080021381378, + "logits/rejected": -0.006600166670978069, + "logps/chosen": -360.56463623046875, + "logps/rejected": -440.66961669921875, + "loss": 0.5328, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.1626803874969482, + "rewards/margins": 0.8473829030990601, + "rewards/rejected": -3.010063409805298, "step": 760 }, { - "debug/losses": 0.03321235626935959, - "debug/policy_weights": 0.06097835302352905, - "debug/raw_losses": 0.5398764610290527, - "epoch": 0.6128133704735376, - "grad_norm": 1.6557317661926283, + "epoch": 0.61, "learning_rate": 1.9553202213217537e-07, - "logits/chosen": -1.9118038415908813, - "logits/rejected": -1.8792883157730103, - "logps/chosen": -286.04901123046875, - "logps/rejected": -348.7887268066406, - "loss": 0.0346, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.5565006732940674, - "rewards/margins": 0.5775460004806519, - "rewards/rejected": -2.134046792984009, + "logits/chosen": -0.021420275792479515, + "logits/rejected": 0.19946305453777313, + "logps/chosen": -389.1043395996094, + "logps/rejected": -448.04998779296875, + "loss": 0.5523, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.391838788986206, + "rewards/margins": 0.6678962707519531, + "rewards/rejected": -3.059735059738159, "step": 770 }, { - "debug/losses": 0.03244846314191818, - "debug/policy_weights": 0.061699897050857544, - "debug/raw_losses": 0.5690463185310364, - "epoch": 0.6207719856744927, - "grad_norm": 1.4912972959386048, + "epoch": 0.62, "learning_rate": 1.887704859826528e-07, - "logits/chosen": -1.927679419517517, - "logits/rejected": -1.8800761699676514, - "logps/chosen": -330.97998046875, - "logps/rejected": -388.7947692871094, - "loss": 0.0324, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.6972744464874268, - "rewards/margins": 0.5525250434875488, - "rewards/rejected": -2.2497994899749756, + "logits/chosen": -0.15253478288650513, + "logits/rejected": -0.00011998042464256287, + "logps/chosen": -394.9501953125, + "logps/rejected": -462.32843017578125, + "loss": 0.5443, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.476644992828369, + "rewards/margins": 0.566824734210968, + "rewards/rejected": -3.0434699058532715, "step": 780 }, { - "debug/losses": 0.031903307884931564, - "debug/policy_weights": 0.05280442163348198, - "debug/raw_losses": 0.5780637860298157, - "epoch": 0.6287306008754476, - "grad_norm": 1.4895600667578988, + "epoch": 0.63, "learning_rate": 1.8205627320673836e-07, - "logits/chosen": -1.8867695331573486, - "logits/rejected": -1.8334290981292725, - "logps/chosen": -339.6418762207031, - "logps/rejected": -387.8199462890625, - "loss": 0.0294, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.8608417510986328, - "rewards/margins": 0.5528916120529175, - "rewards/rejected": -2.4137332439422607, + "logits/chosen": -0.17955633997917175, + "logits/rejected": 0.18167546391487122, + "logps/chosen": -390.32244873046875, + "logps/rejected": -444.895263671875, + "loss": 0.5566, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4376220703125, + "rewards/margins": 0.7008293271064758, + "rewards/rejected": -3.138451099395752, "step": 790 }, { - "debug/losses": 0.027057424187660217, - "debug/policy_weights": 0.049826182425022125, - "debug/raw_losses": 0.5931520462036133, - "epoch": 0.6366892160764027, - "grad_norm": 1.1976056858635522, + "epoch": 0.64, "learning_rate": 1.7539457311884675e-07, - "logits/chosen": -1.8889570236206055, - "logits/rejected": -1.8268101215362549, - "logps/chosen": -345.4725036621094, - "logps/rejected": -377.6209411621094, - "loss": 0.0289, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.8778314590454102, - "rewards/margins": 0.4333206117153168, - "rewards/rejected": -2.311152219772339, + "logits/chosen": -0.09838727861642838, + "logits/rejected": 0.11829495429992676, + "logps/chosen": -402.4017333984375, + "logps/rejected": -451.49346923828125, + "loss": 0.5609, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4441986083984375, + "rewards/margins": 0.5067789554595947, + "rewards/rejected": -2.9509775638580322, "step": 800 }, { - "epoch": 0.6366892160764027, - "eval_debug/losses": 0.02687034010887146, - "eval_debug/policy_weights": 0.048049140721559525, - "eval_debug/raw_losses": 0.5768545269966125, - "eval_logits/chosen": -1.8538771867752075, - "eval_logits/rejected": -1.8270140886306763, - "eval_logps/chosen": -329.3051452636719, - "eval_logps/rejected": -385.80999755859375, - "eval_loss": 0.029471097514033318, - "eval_rewards/accuracies": 0.6977611780166626, - "eval_rewards/chosen": -1.8506169319152832, - "eval_rewards/margins": 0.5004932284355164, - "eval_rewards/rejected": -2.3511102199554443, - "eval_runtime": 152.9352, - "eval_samples_per_second": 55.919, - "eval_steps_per_second": 0.876, + "epoch": 0.64, + "eval_logits/chosen": -0.03116540051996708, + "eval_logits/rejected": 0.1922437697649002, + "eval_logps/chosen": -387.7091979980469, + "eval_logps/rejected": -459.44390869140625, + "eval_loss": 0.5468714833259583, + "eval_rewards/accuracies": 0.7108209133148193, + "eval_rewards/chosen": -2.431356430053711, + "eval_rewards/margins": 0.6517390012741089, + "eval_rewards/rejected": -3.0830955505371094, + "eval_runtime": 183.9774, + "eval_samples_per_second": 46.484, + "eval_steps_per_second": 0.728, "step": 800 }, { - "debug/losses": 0.03517676144838333, - "debug/policy_weights": 0.05810894817113876, - "debug/raw_losses": 0.6028115153312683, - "epoch": 0.6446478312773577, - "grad_norm": 1.3469519811703725, + "epoch": 0.64, "learning_rate": 1.687905344471226e-07, - "logits/chosen": -1.871289610862732, - "logits/rejected": -1.8393253087997437, - "logps/chosen": -336.5460510253906, - "logps/rejected": -381.5574645996094, - "loss": 0.032, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.7289113998413086, - "rewards/margins": 0.43413639068603516, - "rewards/rejected": -2.1630477905273438, + "logits/chosen": 0.07735608518123627, + "logits/rejected": 0.3973601460456848, + "logps/chosen": -408.05999755859375, + "logps/rejected": -459.011474609375, + "loss": 0.5384, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.5008435249328613, + "rewards/margins": 0.6535352468490601, + "rewards/rejected": -3.154379367828369, "step": 810 }, { - "debug/losses": 0.03737642616033554, - "debug/policy_weights": 0.053779907524585724, - "debug/raw_losses": 0.6097862124443054, - "epoch": 0.6526064464783128, - "grad_norm": 1.2305683677129626, + "epoch": 0.65, "learning_rate": 1.6224926135406693e-07, - "logits/chosen": -1.8764880895614624, - "logits/rejected": -1.8340880870819092, - "logps/chosen": -320.94805908203125, - "logps/rejected": -350.2289123535156, - "loss": 0.0304, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.7637560367584229, - "rewards/margins": 0.3951038420200348, - "rewards/rejected": -2.158859968185425, + "logits/chosen": 0.1125444769859314, + "logits/rejected": 0.3865428566932678, + "logps/chosen": -404.16058349609375, + "logps/rejected": -484.68621826171875, + "loss": 0.5448, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.4405789375305176, + "rewards/margins": 0.718208909034729, + "rewards/rejected": -3.158787727355957, "step": 820 }, { - "debug/losses": 0.031071752309799194, - "debug/policy_weights": 0.05692852661013603, - "debug/raw_losses": 0.5603014826774597, - "epoch": 0.6605650616792678, - "grad_norm": 1.4651619498114408, + "epoch": 0.66, "learning_rate": 1.557758094916053e-07, - "logits/chosen": -1.906121015548706, - "logits/rejected": -1.8540939092636108, - "logps/chosen": -327.9610900878906, - "logps/rejected": -378.42193603515625, - "loss": 0.0311, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.7102444171905518, - "rewards/margins": 0.5098624229431152, - "rewards/rejected": -2.220106601715088, + "logits/chosen": 0.11989516019821167, + "logits/rejected": 0.30926594138145447, + "logps/chosen": -370.29876708984375, + "logps/rejected": -452.27911376953125, + "loss": 0.5418, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.3860089778900146, + "rewards/margins": 0.7260924577713013, + "rewards/rejected": -3.1121015548706055, "step": 830 }, { - "debug/losses": 0.03221515566110611, - "debug/policy_weights": 0.05942277982831001, - "debug/raw_losses": 0.5662818551063538, - "epoch": 0.6685236768802229, - "grad_norm": 1.4409855909584606, + "epoch": 0.67, "learning_rate": 1.4937518209365108e-07, - "logits/chosen": -1.925093650817871, - "logits/rejected": -1.8490371704101562, - "logps/chosen": -352.86773681640625, - "logps/rejected": -377.3597412109375, - "loss": 0.0346, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.7442684173583984, - "rewards/margins": 0.4963502287864685, - "rewards/rejected": -2.2406187057495117, + "logits/chosen": -0.14239154756069183, + "logits/rejected": 0.14250756800174713, + "logps/chosen": -395.55755615234375, + "logps/rejected": -447.6368713378906, + "loss": 0.5573, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.378154754638672, + "rewards/margins": 0.6160937547683716, + "rewards/rejected": -2.994248390197754, "step": 840 }, { - "debug/losses": 0.02671756222844124, - "debug/policy_weights": 0.04855426400899887, - "debug/raw_losses": 0.6005613207817078, - "epoch": 0.6764822920811778, - "grad_norm": 1.4529693634395395, + "epoch": 0.68, "learning_rate": 1.4305232610918045e-07, - "logits/chosen": -1.8812288045883179, - "logits/rejected": -1.8472543954849243, - "logps/chosen": -335.77374267578125, - "logps/rejected": -373.2962951660156, - "loss": 0.0302, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.8972389698028564, - "rewards/margins": 0.40869230031967163, - "rewards/rejected": -2.3059310913085938, + "logits/chosen": -0.16526366770267487, + "logits/rejected": 0.16432161629199982, + "logps/chosen": -373.45330810546875, + "logps/rejected": -436.6773376464844, + "loss": 0.5415, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.3134028911590576, + "rewards/margins": 0.774810791015625, + "rewards/rejected": -3.0882136821746826, "step": 850 }, { - "debug/losses": 0.029225418344140053, - "debug/policy_weights": 0.049057330936193466, - "debug/raw_losses": 0.6105281114578247, - "epoch": 0.6844409072821329, - "grad_norm": 1.1419302383364476, + "epoch": 0.68, "learning_rate": 1.3681212837880977e-07, - "logits/chosen": -1.8963346481323242, - "logits/rejected": -1.8906829357147217, - "logps/chosen": -313.63372802734375, - "logps/rejected": -370.35540771484375, - "loss": 0.0301, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.7957327365875244, - "rewards/margins": 0.38477063179016113, - "rewards/rejected": -2.1805036067962646, + "logits/chosen": -0.1321481615304947, + "logits/rejected": 0.23287932574748993, + "logps/chosen": -364.96990966796875, + "logps/rejected": -447.7923278808594, + "loss": 0.5396, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.176964282989502, + "rewards/margins": 0.8955341577529907, + "rewards/rejected": -3.0724985599517822, "step": 860 }, { - "debug/losses": 0.026798686012625694, - "debug/policy_weights": 0.044307220727205276, - "debug/raw_losses": 0.5882238745689392, - "epoch": 0.6923995224830879, - "grad_norm": 1.3414173043772086, + "epoch": 0.69, "learning_rate": 1.3065941185782977e-07, - "logits/chosen": -1.8557084798812866, - "logits/rejected": -1.815911054611206, - "logps/chosen": -340.4590759277344, - "logps/rejected": -364.7593078613281, - "loss": 0.0285, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.882323980331421, - "rewards/margins": 0.4049120843410492, - "rewards/rejected": -2.287236213684082, + "logits/chosen": 0.05437428876757622, + "logits/rejected": 0.2819867432117462, + "logps/chosen": -383.08599853515625, + "logps/rejected": -439.3629455566406, + "loss": 0.5505, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.405247211456299, + "rewards/margins": 0.5403125882148743, + "rewards/rejected": -2.9455599784851074, "step": 870 }, { - "debug/losses": 0.02769031748175621, - "debug/policy_weights": 0.06145339086651802, - "debug/raw_losses": 0.48969849944114685, - "epoch": 0.700358137684043, - "grad_norm": 1.3672144922117222, + "epoch": 0.7, "learning_rate": 1.2459893188861613e-07, - "logits/chosen": -1.9420058727264404, - "logits/rejected": -1.9016504287719727, - "logps/chosen": -313.2261657714844, - "logps/rejected": -395.3963317871094, - "loss": 0.0307, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.6474010944366455, - "rewards/margins": 0.6819612383842468, - "rewards/rejected": -2.329362154006958, + "logits/chosen": -0.12052659690380096, + "logits/rejected": 0.12284734100103378, + "logps/chosen": -367.1181640625, + "logps/rejected": -468.1044921875, + "loss": 0.5185, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.39152455329895, + "rewards/margins": 0.9137696027755737, + "rewards/rejected": -3.3052947521209717, "step": 880 }, { - "debug/losses": 0.030598634853959084, - "debug/policy_weights": 0.05649831146001816, - "debug/raw_losses": 0.5550761222839355, - "epoch": 0.708316752884998, - "grad_norm": 1.515347005278819, + "epoch": 0.71, "learning_rate": 1.1863537252529548e-07, - "logits/chosen": -1.8932174444198608, - "logits/rejected": -1.8344228267669678, - "logps/chosen": -336.3061218261719, - "logps/rejected": -373.91888427734375, - "loss": 0.0291, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.7803919315338135, - "rewards/margins": 0.45331424474716187, - "rewards/rejected": -2.233705997467041, + "logits/chosen": 0.14598000049591064, + "logits/rejected": 0.38815659284591675, + "logps/chosen": -397.891357421875, + "logps/rejected": -472.38677978515625, + "loss": 0.5323, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.512676239013672, + "rewards/margins": 0.7713057994842529, + "rewards/rejected": -3.2839818000793457, "step": 890 }, { - "debug/losses": 0.029710734263062477, - "debug/policy_weights": 0.057327818125486374, - "debug/raw_losses": 0.5665749311447144, - "epoch": 0.716275368085953, - "grad_norm": 1.3873792779779055, + "epoch": 0.72, "learning_rate": 1.1277334291351145e-07, - "logits/chosen": -1.8338550329208374, - "logits/rejected": -1.802924394607544, - "logps/chosen": -308.0507507324219, - "logps/rejected": -367.1876220703125, - "loss": 0.0314, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.7070906162261963, - "rewards/margins": 0.5322864055633545, - "rewards/rejected": -2.239377021789551, + "logits/chosen": 0.15319526195526123, + "logits/rejected": 0.35974830389022827, + "logps/chosen": -380.77783203125, + "logps/rejected": -449.54315185546875, + "loss": 0.5514, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.3706305027008057, + "rewards/margins": 0.6724039912223816, + "rewards/rejected": -3.043034076690674, "step": 900 }, { - "epoch": 0.716275368085953, - "eval_debug/losses": 0.03087555803358555, - "eval_debug/policy_weights": 0.05567449331283569, - "eval_debug/raw_losses": 0.5706273913383484, - "eval_logits/chosen": -1.867327094078064, - "eval_logits/rejected": -1.8414264917373657, - "eval_logps/chosen": -317.96832275390625, - "eval_logps/rejected": -375.444091796875, - "eval_loss": 0.0338822603225708, - "eval_rewards/accuracies": 0.6893656849861145, - "eval_rewards/chosen": -1.7372483015060425, - "eval_rewards/margins": 0.5102024078369141, - "eval_rewards/rejected": -2.247450590133667, - "eval_runtime": 152.7672, - "eval_samples_per_second": 55.981, - "eval_steps_per_second": 0.877, + "epoch": 0.72, + "eval_logits/chosen": 0.28598034381866455, + "eval_logits/rejected": 0.5382024645805359, + "eval_logps/chosen": -392.3096008300781, + "eval_logps/rejected": -471.95330810546875, + "eval_loss": 0.5473664402961731, + "eval_rewards/accuracies": 0.6996268630027771, + "eval_rewards/chosen": -2.4773612022399902, + "eval_rewards/margins": 0.7308279275894165, + "eval_rewards/rejected": -3.2081892490386963, + "eval_runtime": 183.9377, + "eval_samples_per_second": 46.494, + "eval_steps_per_second": 0.729, "step": 900 }, { - "debug/losses": 0.03138936683535576, - "debug/policy_weights": 0.06450549513101578, - "debug/raw_losses": 0.5024587512016296, - "epoch": 0.724233983286908, - "grad_norm": 1.472288824809844, + "epoch": 0.72, "learning_rate": 1.0701737372808431e-07, - "logits/chosen": -1.8538185358047485, - "logits/rejected": -1.8261350393295288, - "logps/chosen": -295.5444030761719, - "logps/rejected": -376.8030700683594, - "loss": 0.0314, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.5179767608642578, - "rewards/margins": 0.6834074258804321, - "rewards/rejected": -2.2013843059539795, + "logits/chosen": 0.15951867401599884, + "logits/rejected": 0.46630391478538513, + "logps/chosen": -383.52850341796875, + "logps/rejected": -467.2303771972656, + "loss": 0.5362, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2782187461853027, + "rewards/margins": 0.8473943471908569, + "rewards/rejected": -3.125612735748291, "step": 910 }, { - "debug/losses": 0.031300343573093414, - "debug/policy_weights": 0.055743057280778885, - "debug/raw_losses": 0.61448734998703, - "epoch": 0.7321925984878631, - "grad_norm": 1.7081189272422854, + "epoch": 0.73, "learning_rate": 1.0137191367132078e-07, - "logits/chosen": -1.8605804443359375, - "logits/rejected": -1.8302139043807983, - "logps/chosen": -348.89617919921875, - "logps/rejected": -396.13519287109375, - "loss": 0.0289, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.9280294179916382, - "rewards/margins": 0.42716842889785767, - "rewards/rejected": -2.3551979064941406, + "logits/chosen": 0.2791319191455841, + "logits/rejected": 0.45174160599708557, + "logps/chosen": -372.1945495605469, + "logps/rejected": -446.6507263183594, + "loss": 0.5458, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.3160648345947266, + "rewards/margins": 0.68004310131073, + "rewards/rejected": -2.996107816696167, "step": 920 }, { - "debug/losses": 0.03180256113409996, - "debug/policy_weights": 0.05098626762628555, - "debug/raw_losses": 0.6134781837463379, - "epoch": 0.7401512136888182, - "grad_norm": 1.2410467414958612, + "epoch": 0.74, "learning_rate": 9.584132603467827e-08, - "logits/chosen": -1.845842719078064, - "logits/rejected": -1.7887229919433594, - "logps/chosen": -364.4795227050781, - "logps/rejected": -396.46282958984375, - "loss": 0.0289, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.973096251487732, - "rewards/margins": 0.44869351387023926, - "rewards/rejected": -2.4217896461486816, + "logits/chosen": -0.12192128598690033, + "logits/rejected": 0.1477951854467392, + "logps/chosen": -366.48321533203125, + "logps/rejected": -453.130126953125, + "loss": 0.5467, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.200005531311035, + "rewards/margins": 0.7978888750076294, + "rewards/rejected": -2.997894287109375, "step": 930 }, { - "debug/losses": 0.02910168096423149, - "debug/policy_weights": 0.04697355628013611, - "debug/raw_losses": 0.584063708782196, - "epoch": 0.7481098288897732, - "grad_norm": 1.6349371306622613, + "epoch": 0.75, "learning_rate": 9.042988532644249e-08, - "logits/chosen": -1.8313400745391846, - "logits/rejected": -1.8036092519760132, - "logps/chosen": -337.94219970703125, - "logps/rejected": -399.6029357910156, - "loss": 0.0287, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.945892333984375, - "rewards/margins": 0.5027505159378052, - "rewards/rejected": -2.4486427307128906, + "logits/chosen": -0.03106372058391571, + "logits/rejected": 0.07721444219350815, + "logps/chosen": -344.21270751953125, + "logps/rejected": -438.11077880859375, + "loss": 0.5161, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.149094581604004, + "rewards/margins": 0.7353444695472717, + "rewards/rejected": -2.884438991546631, "step": 940 }, { - "debug/losses": 0.027888696640729904, - "debug/policy_weights": 0.046953700482845306, - "debug/raw_losses": 0.5813563466072083, - "epoch": 0.7560684440907283, - "grad_norm": 1.026768048816564, + "epoch": 0.76, "learning_rate": 8.514177396802428e-08, - "logits/chosen": -1.8680336475372314, - "logits/rejected": -1.840049386024475, - "logps/chosen": -344.02215576171875, - "logps/rejected": -397.464111328125, - "loss": 0.0271, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.0415139198303223, - "rewards/margins": 0.4642157554626465, - "rewards/rejected": -2.505729913711548, + "logits/chosen": 0.006801058538258076, + "logits/rejected": 0.20282092690467834, + "logps/chosen": -358.15167236328125, + "logps/rejected": -436.4964294433594, + "loss": 0.5385, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.2220425605773926, + "rewards/margins": 0.7004804611206055, + "rewards/rejected": -2.922523260116577, "step": 950 }, { - "debug/losses": 0.026357349008321762, - "debug/policy_weights": 0.048806965351104736, - "debug/raw_losses": 0.5880208015441895, - "epoch": 0.7640270592916832, - "grad_norm": 1.050274133658511, + "epoch": 0.76, "learning_rate": 7.998107906142839e-08, - "logits/chosen": -1.8624064922332764, - "logits/rejected": -1.8342987298965454, - "logps/chosen": -333.5138244628906, - "logps/rejected": -370.5243225097656, - "loss": 0.0272, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.9049221277236938, - "rewards/margins": 0.43430835008621216, - "rewards/rejected": -2.3392302989959717, + "logits/chosen": 0.41448846459388733, + "logits/rejected": 0.705254852771759, + "logps/chosen": -371.27801513671875, + "logps/rejected": -434.56866455078125, + "loss": 0.5236, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.2166616916656494, + "rewards/margins": 0.6714047193527222, + "rewards/rejected": -2.888066530227661, "step": 960 }, { - "debug/losses": 0.02456454373896122, - "debug/policy_weights": 0.044480856508016586, - "debug/raw_losses": 0.5699892640113831, - "epoch": 0.7719856744926383, - "grad_norm": 1.5164224739604937, + "epoch": 0.77, "learning_rate": 7.495178923039396e-08, - "logits/chosen": -1.8286335468292236, - "logits/rejected": -1.851117491722107, - "logps/chosen": -319.8497009277344, - "logps/rejected": -401.72100830078125, - "loss": 0.0276, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.911757469177246, - "rewards/margins": 0.5482187867164612, - "rewards/rejected": -2.4599764347076416, + "logits/chosen": 0.23847150802612305, + "logits/rejected": 0.48661884665489197, + "logps/chosen": -366.28179931640625, + "logps/rejected": -462.679443359375, + "loss": 0.5459, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.1916985511779785, + "rewards/margins": 0.8472123146057129, + "rewards/rejected": -3.038910388946533, "step": 970 }, { - "debug/losses": 0.026758376508951187, - "debug/policy_weights": 0.047099605202674866, - "debug/raw_losses": 0.5511727333068848, - "epoch": 0.7799442896935933, - "grad_norm": 2.2837305176620237, + "epoch": 0.78, "learning_rate": 7.005779153764682e-08, - "logits/chosen": -1.8559293746948242, - "logits/rejected": -1.802053689956665, - "logps/chosen": -325.4739074707031, - "logps/rejected": -375.23028564453125, - "loss": 0.0308, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.826904535293579, - "rewards/margins": 0.5505177974700928, - "rewards/rejected": -2.37742280960083, + "logits/chosen": 0.41438961029052734, + "logits/rejected": 0.6912784576416016, + "logps/chosen": -382.70123291015625, + "logps/rejected": -461.8614807128906, + "loss": 0.5453, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.4304287433624268, + "rewards/margins": 0.7116767764091492, + "rewards/rejected": -3.1421055793762207, "step": 980 }, { - "debug/losses": 0.030480515211820602, - "debug/policy_weights": 0.05681230500340462, - "debug/raw_losses": 0.5534275770187378, - "epoch": 0.7879029048945484, - "grad_norm": 1.8186545450772766, + "epoch": 0.79, "learning_rate": 6.530286848064698e-08, - "logits/chosen": -1.839967966079712, - "logits/rejected": -1.8160829544067383, - "logps/chosen": -325.0166320800781, - "logps/rejected": -384.3879699707031, - "loss": 0.0292, + "logits/chosen": 0.36573725938796997, + "logits/rejected": 0.5834362506866455, + "logps/chosen": -384.49749755859375, + "logps/rejected": -466.30096435546875, + "loss": 0.5528, "rewards/accuracies": 0.71875, - "rewards/chosen": -1.7186723947525024, - "rewards/margins": 0.5559494495391846, - "rewards/rejected": -2.2746217250823975, + "rewards/chosen": -2.5111565589904785, + "rewards/margins": 0.7234699130058289, + "rewards/rejected": -3.234626054763794, "step": 990 }, { - "debug/losses": 0.026850074529647827, - "debug/policy_weights": 0.05034085363149643, - "debug/raw_losses": 0.5474573373794556, - "epoch": 0.7958615200955034, - "grad_norm": 1.3975900876939507, + "epoch": 0.8, "learning_rate": 6.069069506815325e-08, - "logits/chosen": -1.8527164459228516, - "logits/rejected": -1.805755615234375, - "logps/chosen": -324.24761962890625, - "logps/rejected": -378.9568176269531, - "loss": 0.0307, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.7933601140975952, - "rewards/margins": 0.5918117165565491, - "rewards/rejected": -2.385171890258789, + "logits/chosen": 0.45530566573143005, + "logits/rejected": 0.5909157991409302, + "logps/chosen": -379.1433410644531, + "logps/rejected": -468.88458251953125, + "loss": 0.527, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.5361268520355225, + "rewards/margins": 0.7407721281051636, + "rewards/rejected": -3.2768986225128174, "step": 1000 }, { - "epoch": 0.7958615200955034, - "eval_debug/losses": 0.027858305722475052, - "eval_debug/policy_weights": 0.050272345542907715, - "eval_debug/raw_losses": 0.5695627331733704, - "eval_logits/chosen": -1.8391083478927612, - "eval_logits/rejected": -1.8125532865524292, - "eval_logps/chosen": -329.7293701171875, - "eval_logps/rejected": -386.5124816894531, - "eval_loss": 0.030572954565286636, - "eval_rewards/accuracies": 0.6865671873092651, - "eval_rewards/chosen": -1.8548587560653687, - "eval_rewards/margins": 0.5032761096954346, - "eval_rewards/rejected": -2.3581347465515137, - "eval_runtime": 152.9123, - "eval_samples_per_second": 55.927, - "eval_steps_per_second": 0.876, + "epoch": 0.8, + "eval_logits/chosen": 0.3871051073074341, + "eval_logits/rejected": 0.6372014284133911, + "eval_logps/chosen": -394.97113037109375, + "eval_logps/rejected": -471.8453674316406, + "eval_loss": 0.5453863739967346, + "eval_rewards/accuracies": 0.70802241563797, + "eval_rewards/chosen": -2.503976345062256, + "eval_rewards/margins": 0.7031334638595581, + "eval_rewards/rejected": -3.2071101665496826, + "eval_runtime": 183.9898, + "eval_samples_per_second": 46.481, + "eval_steps_per_second": 0.728, "step": 1000 }, { - "debug/losses": 0.031778767704963684, - "debug/policy_weights": 0.05145453289151192, - "debug/raw_losses": 0.6123412251472473, - "epoch": 0.8038201352964585, - "grad_norm": 1.2716238320400024, + "epoch": 0.8, "learning_rate": 5.6224835979863714e-08, - "logits/chosen": -1.8510977029800415, - "logits/rejected": -1.8080089092254639, - "logps/chosen": -336.0456237792969, - "logps/rejected": -369.1909484863281, - "loss": 0.0295, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.8573215007781982, - "rewards/margins": 0.4095400273799896, - "rewards/rejected": -2.2668616771698, + "logits/chosen": 0.31174296140670776, + "logits/rejected": 0.6193565130233765, + "logps/chosen": -390.387451171875, + "logps/rejected": -468.4959411621094, + "loss": 0.5568, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.32766056060791, + "rewards/margins": 0.68747878074646, + "rewards/rejected": -3.015139102935791, "step": 1010 }, { - "debug/losses": 0.023703234270215034, - "debug/policy_weights": 0.046647168695926666, - "debug/raw_losses": 0.5592247247695923, - "epoch": 0.8117787504974134, - "grad_norm": 1.946635889826808, + "epoch": 0.81, "learning_rate": 5.190874281132851e-08, - "logits/chosen": -1.8227264881134033, - "logits/rejected": -1.8052421808242798, - "logps/chosen": -316.83843994140625, - "logps/rejected": -374.4837951660156, - "loss": 0.0304, + "logits/chosen": 0.22277125716209412, + "logits/rejected": 0.6487134099006653, + "logps/chosen": -402.0958557128906, + "logps/rejected": -448.5992736816406, + "loss": 0.5408, "rewards/accuracies": 0.71875, - "rewards/chosen": -1.771314263343811, - "rewards/margins": 0.5482533574104309, - "rewards/rejected": -2.319567918777466, + "rewards/chosen": -2.359062671661377, + "rewards/margins": 0.6533006429672241, + "rewards/rejected": -3.0123631954193115, "step": 1020 }, { - "debug/losses": 0.02848326787352562, - "debug/policy_weights": 0.05005430057644844, - "debug/raw_losses": 0.5512452721595764, - "epoch": 0.8197373656983685, - "grad_norm": 1.5331091718163976, + "epoch": 0.82, "learning_rate": 4.774575140626316e-08, - "logits/chosen": -1.8013317584991455, - "logits/rejected": -1.7473710775375366, - "logps/chosen": -313.5793151855469, - "logps/rejected": -363.24713134765625, - "loss": 0.0314, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.7351596355438232, - "rewards/margins": 0.5731841921806335, - "rewards/rejected": -2.3083438873291016, + "logits/chosen": 0.23170511424541473, + "logits/rejected": 0.47184085845947266, + "logps/chosen": -363.46917724609375, + "logps/rejected": -442.47918701171875, + "loss": 0.5309, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.142770290374756, + "rewards/margins": 0.7513145208358765, + "rewards/rejected": -2.894084930419922, "step": 1030 }, { - "debug/losses": 0.033693552017211914, - "debug/policy_weights": 0.05701867491006851, - "debug/raw_losses": 0.6127216815948486, - "epoch": 0.8276959808993235, - "grad_norm": 1.8494763538425973, + "epoch": 0.83, "learning_rate": 4.373907927832513e-08, - "logits/chosen": -1.835228681564331, - "logits/rejected": -1.8134419918060303, - "logps/chosen": -295.58941650390625, - "logps/rejected": -349.1560363769531, - "loss": 0.0357, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.6788638830184937, - "rewards/margins": 0.423389732837677, - "rewards/rejected": -2.1022536754608154, + "logits/chosen": 0.07573021948337555, + "logits/rejected": 0.32997313141822815, + "logps/chosen": -381.45599365234375, + "logps/rejected": -443.0684509277344, + "loss": 0.5407, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2033116817474365, + "rewards/margins": 0.710732638835907, + "rewards/rejected": -2.914044141769409, "step": 1040 }, { - "debug/losses": 0.03326871246099472, - "debug/policy_weights": 0.06307505071163177, - "debug/raw_losses": 0.5278674364089966, - "epoch": 0.8356545961002786, - "grad_norm": 1.6774079365802337, + "epoch": 0.84, "learning_rate": 3.9891823124345665e-08, - "logits/chosen": -1.874322533607483, - "logits/rejected": -1.834237813949585, - "logps/chosen": -303.7854919433594, - "logps/rejected": -363.41619873046875, - "loss": 0.0349, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.5276892185211182, - "rewards/margins": 0.6272332072257996, - "rewards/rejected": -2.1549224853515625, + "logits/chosen": 0.23884686827659607, + "logits/rejected": 0.6128005385398865, + "logps/chosen": -364.00567626953125, + "logps/rejected": -433.3273010253906, + "loss": 0.5471, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.2081527709960938, + "rewards/margins": 0.7639263868331909, + "rewards/rejected": -2.972079038619995, "step": 1050 }, { - "debug/losses": 0.03040960431098938, - "debug/policy_weights": 0.057738013565540314, - "debug/raw_losses": 0.5673826932907104, - "epoch": 0.8436132113012336, - "grad_norm": 1.2896520617276703, + "epoch": 0.84, "learning_rate": 3.620695643093924e-08, - "logits/chosen": -1.873903512954712, - "logits/rejected": -1.8679378032684326, - "logps/chosen": -292.12353515625, - "logps/rejected": -355.63787841796875, - "loss": 0.0327, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.5533664226531982, - "rewards/margins": 0.4991677701473236, - "rewards/rejected": -2.0525341033935547, + "logits/chosen": 0.21963253617286682, + "logits/rejected": 0.6894062757492065, + "logps/chosen": -399.5767517089844, + "logps/rejected": -452.88909912109375, + "loss": 0.5154, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.3384335041046143, + "rewards/margins": 0.7010769844055176, + "rewards/rejected": -3.0395102500915527, "step": 1060 }, { - "debug/losses": 0.03549133986234665, - "debug/policy_weights": 0.058590926229953766, - "debug/raw_losses": 0.5505832433700562, - "epoch": 0.8515718265021887, - "grad_norm": 1.6406733477612518, + "epoch": 0.85, "learning_rate": 3.268732717634032e-08, - "logits/chosen": -1.8545564413070679, - "logits/rejected": -1.8187520503997803, - "logps/chosen": -293.1727600097656, - "logps/rejected": -343.296142578125, - "loss": 0.0346, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.5677841901779175, - "rewards/margins": 0.5242434740066528, - "rewards/rejected": -2.0920276641845703, + "logits/chosen": 0.3474286198616028, + "logits/rejected": 0.695271372795105, + "logps/chosen": -368.0654602050781, + "logps/rejected": -431.47222900390625, + "loss": 0.5499, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1910276412963867, + "rewards/margins": 0.7267633080482483, + "rewards/rejected": -2.9177908897399902, "step": 1070 }, { - "debug/losses": 0.03740120679140091, - "debug/policy_weights": 0.06145843118429184, - "debug/raw_losses": 0.6025527715682983, - "epoch": 0.8595304417031436, - "grad_norm": 1.603935612215664, + "epoch": 0.86, "learning_rate": 2.9335655629243645e-08, - "logits/chosen": -1.8837159872055054, - "logits/rejected": -1.8695827722549438, - "logps/chosen": -321.89208984375, - "logps/rejected": -373.8929443359375, - "loss": 0.0332, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.7150942087173462, - "rewards/margins": 0.40743288397789, - "rewards/rejected": -2.1225271224975586, + "logits/chosen": 0.2347393035888672, + "logits/rejected": 0.5894696712493896, + "logps/chosen": -388.94757080078125, + "logps/rejected": -447.3855895996094, + "loss": 0.525, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.254683017730713, + "rewards/margins": 0.7334609031677246, + "rewards/rejected": -2.9881439208984375, "step": 1080 }, { - "debug/losses": 0.026631182059645653, - "debug/policy_weights": 0.04555311053991318, - "debug/raw_losses": 0.5978758335113525, - "epoch": 0.8674890569040987, - "grad_norm": 1.3056333450149697, + "epoch": 0.87, "learning_rate": 2.6154532246349476e-08, - "logits/chosen": -1.8537206649780273, - "logits/rejected": -1.791595220565796, - "logps/chosen": -319.4361572265625, - "logps/rejected": -342.8656311035156, - "loss": 0.0333, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.778660535812378, - "rewards/margins": 0.41385021805763245, - "rewards/rejected": -2.1925110816955566, + "logits/chosen": 0.25378522276878357, + "logits/rejected": 0.5771256685256958, + "logps/chosen": -358.50640869140625, + "logps/rejected": -431.145751953125, + "loss": 0.5462, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1666626930236816, + "rewards/margins": 0.756801426410675, + "rewards/rejected": -2.923464059829712, "step": 1090 }, { - "debug/losses": 0.03684794157743454, - "debug/policy_weights": 0.061137206852436066, - "debug/raw_losses": 0.6424818634986877, - "epoch": 0.8754476721050537, - "grad_norm": 1.8597954368327896, + "epoch": 0.88, "learning_rate": 2.31464156702382e-08, - "logits/chosen": -1.856898546218872, - "logits/rejected": -1.8056995868682861, - "logps/chosen": -335.59161376953125, - "logps/rejected": -364.3418884277344, - "loss": 0.0313, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.7900125980377197, - "rewards/margins": 0.3661695420742035, - "rewards/rejected": -2.156182050704956, + "logits/chosen": 0.35370689630508423, + "logits/rejected": 0.5671936273574829, + "logps/chosen": -363.0, + "logps/rejected": -438.209228515625, + "loss": 0.5487, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2130119800567627, + "rewards/margins": 0.7499077916145325, + "rewards/rejected": -2.9629194736480713, "step": 1100 }, { - "epoch": 0.8754476721050537, - "eval_debug/losses": 0.029432397335767746, - "eval_debug/policy_weights": 0.052661605179309845, - "eval_debug/raw_losses": 0.5672317147254944, - "eval_logits/chosen": -1.8722801208496094, - "eval_logits/rejected": -1.8470919132232666, - "eval_logps/chosen": -319.0127258300781, - "eval_logps/rejected": -375.38702392578125, - "eval_loss": 0.031895771622657776, - "eval_rewards/accuracies": 0.6921641826629639, - "eval_rewards/chosen": -1.7476924657821655, - "eval_rewards/margins": 0.4991871416568756, - "eval_rewards/rejected": -2.2468795776367188, - "eval_runtime": 152.9853, - "eval_samples_per_second": 55.901, - "eval_steps_per_second": 0.876, + "epoch": 0.88, + "eval_logits/chosen": 0.1857856959104538, + "eval_logits/rejected": 0.43363669514656067, + "eval_logps/chosen": -373.08306884765625, + "eval_logps/rejected": -450.7598876953125, + "eval_loss": 0.5444055199623108, + "eval_rewards/accuracies": 0.7089552283287048, + "eval_rewards/chosen": -2.285095453262329, + "eval_rewards/margins": 0.711159884929657, + "eval_rewards/rejected": -2.996255397796631, + "eval_runtime": 183.9924, + "eval_samples_per_second": 46.48, + "eval_steps_per_second": 0.728, "step": 1100 }, { - "debug/losses": 0.027387287467718124, - "debug/policy_weights": 0.05117439478635788, - "debug/raw_losses": 0.5718464255332947, - "epoch": 0.8834062873060088, - "grad_norm": 1.236953286612134, + "epoch": 0.88, "learning_rate": 2.031363082912252e-08, - "logits/chosen": -1.8504539728164673, - "logits/rejected": -1.8379875421524048, - "logps/chosen": -304.44122314453125, - "logps/rejected": -359.46954345703125, - "loss": 0.0316, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.7290661334991455, - "rewards/margins": 0.4590110182762146, - "rewards/rejected": -2.188077211380005, + "logits/chosen": 0.070524200797081, + "logits/rejected": 0.4635602533817291, + "logps/chosen": -373.29327392578125, + "logps/rejected": -426.85552978515625, + "loss": 0.5513, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.2541089057922363, + "rewards/margins": 0.6198171973228455, + "rewards/rejected": -2.8739261627197266, "step": 1110 }, { - "debug/losses": 0.029294824227690697, - "debug/policy_weights": 0.05012016370892525, - "debug/raw_losses": 0.5968630313873291, - "epoch": 0.8913649025069638, - "grad_norm": 1.5652814842138667, + "epoch": 0.89, "learning_rate": 1.7658367139945228e-08, - "logits/chosen": -1.8701921701431274, - "logits/rejected": -1.8391071557998657, - "logps/chosen": -316.2760009765625, - "logps/rejected": -361.0913391113281, - "loss": 0.0295, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.7196756601333618, - "rewards/margins": 0.4586181044578552, - "rewards/rejected": -2.1782937049865723, + "logits/chosen": 0.2600646913051605, + "logits/rejected": 0.5517584681510925, + "logps/chosen": -390.8568115234375, + "logps/rejected": -462.80828857421875, + "loss": 0.5471, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.377396821975708, + "rewards/margins": 0.6719989776611328, + "rewards/rejected": -3.049395799636841, "step": 1120 }, { - "debug/losses": 0.029790222644805908, - "debug/policy_weights": 0.05559501796960831, - "debug/raw_losses": 0.5522772073745728, - "epoch": 0.8993235177079189, - "grad_norm": 1.4128987302177969, + "epoch": 0.9, "learning_rate": 1.5182676816211632e-08, - "logits/chosen": -1.8724491596221924, - "logits/rejected": -1.8452503681182861, - "logps/chosen": -319.95953369140625, - "logps/rejected": -386.1808166503906, - "loss": 0.0308, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.7754627466201782, - "rewards/margins": 0.524121880531311, - "rewards/rejected": -2.2995846271514893, + "logits/chosen": 0.04413030296564102, + "logits/rejected": 0.30151715874671936, + "logps/chosen": -382.0662536621094, + "logps/rejected": -447.08673095703125, + "loss": 0.5431, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.228654384613037, + "rewards/margins": 0.6926024556159973, + "rewards/rejected": -2.9212570190429688, "step": 1130 }, { - "debug/losses": 0.04000743851065636, - "debug/policy_weights": 0.06320012360811234, - "debug/raw_losses": 0.6184248328208923, - "epoch": 0.9072821329088738, - "grad_norm": 1.3375871919682243, + "epoch": 0.91, "learning_rate": 1.2888473281864597e-08, - "logits/chosen": -1.8942562341690063, - "logits/rejected": -1.8619248867034912, - "logps/chosen": -329.1838684082031, - "logps/rejected": -361.75543212890625, - "loss": 0.0321, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.7136873006820679, - "rewards/margins": 0.3745079040527344, - "rewards/rejected": -2.0881950855255127, + "logits/chosen": 0.14212054014205933, + "logits/rejected": 0.47429710626602173, + "logps/chosen": -367.8409729003906, + "logps/rejected": -435.02764892578125, + "loss": 0.5369, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.2534115314483643, + "rewards/margins": 0.7273036241531372, + "rewards/rejected": -2.980715274810791, "step": 1140 }, { - "debug/losses": 0.0304880253970623, - "debug/policy_weights": 0.05398009344935417, - "debug/raw_losses": 0.5579678416252136, - "epoch": 0.9152407481098289, - "grad_norm": 1.164265065566766, + "epoch": 0.92, "learning_rate": 1.0777529692427679e-08, - "logits/chosen": -1.8475040197372437, - "logits/rejected": -1.8063409328460693, - "logps/chosen": -312.86041259765625, - "logps/rejected": -353.0240173339844, - "loss": 0.0309, + "logits/chosen": 0.04115242511034012, + "logits/rejected": 0.28970104455947876, + "logps/chosen": -372.7949523925781, + "logps/rejected": -456.10675048828125, + "loss": 0.5265, "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.6208263635635376, - "rewards/margins": 0.5101144909858704, - "rewards/rejected": -2.1309409141540527, + "rewards/chosen": -2.300356388092041, + "rewards/margins": 0.8059718012809753, + "rewards/rejected": -3.106328248977661, "step": 1150 }, { - "debug/losses": 0.03380966559052467, - "debug/policy_weights": 0.054825879633426666, - "debug/raw_losses": 0.5997228622436523, - "epoch": 0.9231993633107839, - "grad_norm": 1.3666307813144103, + "epoch": 0.92, "learning_rate": 8.851477564560061e-09, - "logits/chosen": -1.831713318824768, - "logits/rejected": -1.7982580661773682, - "logps/chosen": -308.39483642578125, - "logps/rejected": -368.4901428222656, - "loss": 0.0311, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.6622002124786377, - "rewards/margins": 0.4915587306022644, - "rewards/rejected": -2.153759002685547, + "logits/chosen": 0.0867738351225853, + "logits/rejected": 0.4068300127983093, + "logps/chosen": -372.08636474609375, + "logps/rejected": -426.42388916015625, + "loss": 0.5342, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.331385850906372, + "rewards/margins": 0.6490964293479919, + "rewards/rejected": -2.9804821014404297, "step": 1160 }, { - "debug/losses": 0.030011435970664024, - "debug/policy_weights": 0.05814291164278984, - "debug/raw_losses": 0.5802719593048096, - "epoch": 0.931157978511739, - "grad_norm": 1.436341263317072, + "epoch": 0.93, "learning_rate": 7.111805515081531e-09, - "logits/chosen": -1.8729360103607178, - "logits/rejected": -1.8116334676742554, - "logps/chosen": -339.6989440917969, - "logps/rejected": -385.967041015625, - "loss": 0.0309, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.8521687984466553, - "rewards/margins": 0.4960354268550873, - "rewards/rejected": -2.3482041358947754, + "logits/chosen": 0.02022993005812168, + "logits/rejected": 0.41968393325805664, + "logps/chosen": -363.818603515625, + "logps/rejected": -447.7919006347656, + "loss": 0.5312, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.2372307777404785, + "rewards/margins": 0.8540315628051758, + "rewards/rejected": -3.0912623405456543, "step": 1170 }, { - "debug/losses": 0.03459464758634567, - "debug/policy_weights": 0.06046764925122261, - "debug/raw_losses": 0.5656360387802124, - "epoch": 0.939116593712694, - "grad_norm": 1.3207132160045205, + "epoch": 0.94, "learning_rate": 5.559858110443016e-09, - "logits/chosen": -1.8872379064559937, - "logits/rejected": -1.8523366451263428, - "logps/chosen": -322.80029296875, - "logps/rejected": -378.6051025390625, - "loss": 0.0299, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.6689224243164062, - "rewards/margins": 0.5369892716407776, - "rewards/rejected": -2.205911636352539, + "logits/chosen": 0.29695388674736023, + "logits/rejected": 0.714096188545227, + "logps/chosen": -372.5519714355469, + "logps/rejected": -442.5354919433594, + "loss": 0.5383, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.3107995986938477, + "rewards/margins": 0.8070123791694641, + "rewards/rejected": -3.117811918258667, "step": 1180 }, { - "debug/losses": 0.023186931386590004, - "debug/policy_weights": 0.0461835041642189, - "debug/raw_losses": 0.5589593052864075, - "epoch": 0.947075208913649, - "grad_norm": 1.3641453732624562, + "epoch": 0.95, "learning_rate": 4.196834827531276e-09, - "logits/chosen": -1.8458702564239502, - "logits/rejected": -1.8175357580184937, - "logps/chosen": -324.30230712890625, - "logps/rejected": -383.04608154296875, - "loss": 0.0296, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.8120216131210327, - "rewards/margins": 0.5915289521217346, - "rewards/rejected": -2.403550624847412, + "logits/chosen": 0.140055850148201, + "logits/rejected": 0.3409932255744934, + "logps/chosen": -355.64324951171875, + "logps/rejected": -447.585693359375, + "loss": 0.5152, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.149151563644409, + "rewards/margins": 0.7904965877532959, + "rewards/rejected": -2.939648151397705, "step": 1190 }, { - "debug/losses": 0.024858497083187103, - "debug/policy_weights": 0.04600748419761658, - "debug/raw_losses": 0.5523272752761841, - "epoch": 0.955033824114604, - "grad_norm": 1.4362856575591538, + "epoch": 0.96, "learning_rate": 3.023789126611137e-09, - "logits/chosen": -1.8598756790161133, - "logits/rejected": -1.8078199625015259, - "logps/chosen": -317.81170654296875, - "logps/rejected": -364.5005798339844, - "loss": 0.031, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.704816222190857, - "rewards/margins": 0.5389941930770874, - "rewards/rejected": -2.2438104152679443, + "logits/chosen": 0.03294936567544937, + "logits/rejected": 0.2933207154273987, + "logps/chosen": -363.29290771484375, + "logps/rejected": -435.640380859375, + "loss": 0.5483, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.281057357788086, + "rewards/margins": 0.7091296911239624, + "rewards/rejected": -2.990186929702759, "step": 1200 }, { - "epoch": 0.955033824114604, - "eval_debug/losses": 0.029611436650156975, - "eval_debug/policy_weights": 0.05299457162618637, - "eval_debug/raw_losses": 0.5668274760246277, - "eval_logits/chosen": -1.8642234802246094, - "eval_logits/rejected": -1.8383311033248901, - "eval_logps/chosen": -319.97027587890625, - "eval_logps/rejected": -377.42315673828125, - "eval_loss": 0.03207932412624359, - "eval_rewards/accuracies": 0.6958954930305481, - "eval_rewards/chosen": -1.757267713546753, - "eval_rewards/margins": 0.5099742412567139, - "eval_rewards/rejected": -2.2672417163848877, - "eval_runtime": 152.7819, - "eval_samples_per_second": 55.975, - "eval_steps_per_second": 0.877, + "epoch": 0.96, + "eval_logits/chosen": 0.07418080419301987, + "eval_logits/rejected": 0.32435521483421326, + "eval_logps/chosen": -373.978515625, + "eval_logps/rejected": -451.6764831542969, + "eval_loss": 0.5440130829811096, + "eval_rewards/accuracies": 0.7089552283287048, + "eval_rewards/chosen": -2.2940499782562256, + "eval_rewards/margins": 0.7113713622093201, + "eval_rewards/rejected": -3.0054211616516113, + "eval_runtime": 183.9013, + "eval_samples_per_second": 46.503, + "eval_steps_per_second": 0.729, "step": 1200 }, { - "debug/losses": 0.02852838858962059, - "debug/policy_weights": 0.051258690655231476, - "debug/raw_losses": 0.5769435167312622, - "epoch": 0.9629924393155591, - "grad_norm": 1.3807763771834016, + "epoch": 0.96, "learning_rate": 2.041627637121929e-09, - "logits/chosen": -1.8483145236968994, - "logits/rejected": -1.8244221210479736, - "logps/chosen": -316.3843994140625, - "logps/rejected": -385.6697692871094, - "loss": 0.0315, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.7493064403533936, - "rewards/margins": 0.5202833414077759, - "rewards/rejected": -2.269589900970459, + "logits/chosen": 0.10010697692632675, + "logits/rejected": 0.3795483410358429, + "logps/chosen": -348.8675231933594, + "logps/rejected": -437.20361328125, + "loss": 0.5398, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.087364673614502, + "rewards/margins": 0.828387439250946, + "rewards/rejected": -2.9157521724700928, "step": 1210 }, { - "debug/losses": 0.029417548328638077, - "debug/policy_weights": 0.051362644881010056, - "debug/raw_losses": 0.5895043611526489, - "epoch": 0.9709510545165141, - "grad_norm": 1.6916257941886172, + "epoch": 0.97, "learning_rate": 1.2511094569571668e-09, - "logits/chosen": -1.8148953914642334, - "logits/rejected": -1.7570714950561523, - "logps/chosen": -321.575927734375, - "logps/rejected": -342.35205078125, - "loss": 0.0315, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.7315263748168945, - "rewards/margins": 0.4397401809692383, - "rewards/rejected": -2.171266794204712, + "logits/chosen": 0.09991980344057083, + "logits/rejected": 0.4467397630214691, + "logps/chosen": -380.14520263671875, + "logps/rejected": -440.24658203125, + "loss": 0.5345, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.253425121307373, + "rewards/margins": 0.702509880065918, + "rewards/rejected": -2.955935001373291, "step": 1220 }, { - "debug/losses": 0.028057556599378586, - "debug/policy_weights": 0.047226615250110626, - "debug/raw_losses": 0.5787237882614136, - "epoch": 0.9789096697174692, - "grad_norm": 1.2061064281394842, + "epoch": 0.98, "learning_rate": 6.528455657691112e-10, - "logits/chosen": -1.817239761352539, - "logits/rejected": -1.8122575283050537, - "logps/chosen": -320.651611328125, - "logps/rejected": -379.8753967285156, - "loss": 0.0294, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.8191404342651367, - "rewards/margins": 0.4797174036502838, - "rewards/rejected": -2.2988579273223877, + "logits/chosen": 0.11626466363668442, + "logits/rejected": 0.41348797082901, + "logps/chosen": -372.7298889160156, + "logps/rejected": -427.22576904296875, + "loss": 0.549, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2800345420837402, + "rewards/margins": 0.6291176080703735, + "rewards/rejected": -2.909151792526245, "step": 1230 }, { - "debug/losses": 0.03001987561583519, - "debug/policy_weights": 0.052023641765117645, - "debug/raw_losses": 0.5478265881538391, - "epoch": 0.9868682849184242, - "grad_norm": 1.4769747929733708, + "epoch": 0.99, "learning_rate": 2.4729835275189016e-10, - "logits/chosen": -1.8267762660980225, - "logits/rejected": -1.7973419427871704, - "logps/chosen": -319.90771484375, - "logps/rejected": -387.49957275390625, - "loss": 0.0306, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.7920163869857788, - "rewards/margins": 0.612352728843689, - "rewards/rejected": -2.4043688774108887, + "logits/chosen": 0.06715863198041916, + "logits/rejected": 0.29241910576820374, + "logps/chosen": -393.8903503417969, + "logps/rejected": -477.9420471191406, + "loss": 0.5462, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.323488473892212, + "rewards/margins": 0.8067766427993774, + "rewards/rejected": -3.1302647590637207, "step": 1240 }, { - "debug/losses": 0.027796531096100807, - "debug/policy_weights": 0.051054131239652634, - "debug/raw_losses": 0.5395588278770447, - "epoch": 0.9948269001193792, - "grad_norm": 1.7339598026750096, + "epoch": 0.99, "learning_rate": 3.478125926756337e-11, - "logits/chosen": -1.826674222946167, - "logits/rejected": -1.8084548711776733, - "logps/chosen": -319.61077880859375, - "logps/rejected": -389.81427001953125, - "loss": 0.0294, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.7789347171783447, - "rewards/margins": 0.5856661200523376, - "rewards/rejected": -2.364600896835327, + "logits/chosen": 0.25983649492263794, + "logits/rejected": 0.4905417561531067, + "logps/chosen": -364.73431396484375, + "logps/rejected": -443.79296875, + "loss": 0.5474, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2353272438049316, + "rewards/margins": 0.771331787109375, + "rewards/rejected": -3.0066590309143066, "step": 1250 }, { - "epoch": 0.9996020692399522, + "epoch": 1.0, "step": 1256, "total_flos": 0.0, - "train_loss": 0.05414291525817221, - "train_runtime": 10529.1077, - "train_samples_per_second": 15.272, - "train_steps_per_second": 0.119 + "train_loss": 0.5712926928784438, + "train_runtime": 11525.4961, + "train_samples_per_second": 13.952, + "train_steps_per_second": 0.109 } ], "logging_steps": 10, "max_steps": 1256, - "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, "total_flos": 0.0, - "train_batch_size": 8, "trial_name": null, "trial_params": null }