diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -9,17 +9,13 @@ "is_world_process_zero": true, "log_history": [ { - "debug/losses": 0.23031963407993317, - "debug/policy_weights": 0.3322809934616089, - "debug/raw_losses": 0.6931471824645996, - "epoch": 0.0007958615200955034, - "grad_norm": 1.6289096206321385, + "epoch": 0.0, "learning_rate": 3.968253968253968e-09, - "logits/chosen": -2.735659122467041, - "logits/rejected": -2.7581238746643066, - "logps/chosen": -124.62968444824219, - "logps/rejected": -168.09475708007812, - "loss": 0.2239, + "logits/chosen": -2.7193620204925537, + "logits/rejected": -2.698728084564209, + "logps/chosen": -182.0961456298828, + "logps/rejected": -172.47128295898438, + "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -27,2512 +23,1962 @@ "step": 1 }, { - "debug/losses": 0.23617929220199585, - "debug/policy_weights": 0.3407440781593323, - "debug/raw_losses": 0.6931117177009583, - "epoch": 0.007958615200955034, - "grad_norm": 1.6861129588975887, + "epoch": 0.01, "learning_rate": 3.968253968253968e-08, - "logits/chosen": -2.7388463020324707, - "logits/rejected": -2.727876901626587, - "logps/chosen": -146.6982879638672, - "logps/rejected": -131.2141571044922, - "loss": 0.2295, - "rewards/accuracies": 0.4513888955116272, - "rewards/chosen": 3.8997877709334716e-05, - "rewards/margins": 7.3335635534022e-05, - "rewards/rejected": -3.4337710530962795e-05, + "logits/chosen": -2.7041964530944824, + "logits/rejected": -2.6794540882110596, + "logps/chosen": -162.45831298828125, + "logps/rejected": -140.5693359375, + "loss": 0.6931, + "rewards/accuracies": 0.5486111044883728, + "rewards/chosen": 0.00032037965138442814, + "rewards/margins": 0.0004935775068588555, + "rewards/rejected": -0.00017319784092251211, "step": 10 }, { - "debug/losses": 0.22658827900886536, - "debug/policy_weights": 0.32686564326286316, - "debug/raw_losses": 0.6932188272476196, - "epoch": 0.01591723040191007, - "grad_norm": 1.5613292890727226, + "epoch": 0.02, "learning_rate": 7.936507936507936e-08, - "logits/chosen": -2.706713914871216, - "logits/rejected": -2.7037758827209473, - "logps/chosen": -129.48782348632812, - "logps/rejected": -130.2589874267578, - "loss": 0.2239, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.00019513315055519342, - "rewards/margins": -0.00014081124390941113, - "rewards/rejected": -5.432188117993064e-05, + "logits/chosen": -2.7177577018737793, + "logits/rejected": -2.7136425971984863, + "logps/chosen": -134.47242736816406, + "logps/rejected": -143.55604553222656, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 8.780837379163131e-05, + "rewards/margins": 0.00010721785656642169, + "rewards/rejected": -1.940951551659964e-05, "step": 20 }, { - "debug/losses": 0.21325843036174774, - "debug/policy_weights": 0.307701975107193, - "debug/raw_losses": 0.6930276155471802, - "epoch": 0.0238758456028651, - "grad_norm": 1.5386121917266484, + "epoch": 0.02, "learning_rate": 1.1904761904761903e-07, - "logits/chosen": -2.6837477684020996, - "logits/rejected": -2.6807758808135986, - "logps/chosen": -141.8106231689453, - "logps/rejected": -155.6637420654297, - "loss": 0.2264, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": 0.00021137837029527873, - "rewards/margins": 0.00024208621471188962, - "rewards/rejected": -3.070788807235658e-05, + "logits/chosen": -2.6898293495178223, + "logits/rejected": -2.676154613494873, + "logps/chosen": -140.94692993164062, + "logps/rejected": -136.50369262695312, + "loss": 0.6931, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0005466601578518748, + "rewards/margins": -0.00021456097601912916, + "rewards/rejected": 0.0007612211629748344, "step": 30 }, { - "debug/losses": 0.2177859991788864, - "debug/policy_weights": 0.3142939507961273, - "debug/raw_losses": 0.6929190158843994, - "epoch": 0.03183446080382014, - "grad_norm": 1.637054860498854, + "epoch": 0.03, "learning_rate": 1.5873015873015872e-07, - "logits/chosen": -2.691988468170166, - "logits/rejected": -2.6842727661132812, - "logps/chosen": -154.9478302001953, - "logps/rejected": -164.12692260742188, - "loss": 0.221, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -0.0012577458983287215, - "rewards/margins": 0.00046192569425329566, - "rewards/rejected": -0.001719671650789678, + "logits/chosen": -2.6958394050598145, + "logits/rejected": -2.686532974243164, + "logps/chosen": -134.98963928222656, + "logps/rejected": -144.46652221679688, + "loss": 0.6928, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.0015748919686302543, + "rewards/margins": 0.0009769219905138016, + "rewards/rejected": 0.0005979698617011309, "step": 40 }, { - "debug/losses": 0.22946766018867493, - "debug/policy_weights": 0.33167093992233276, - "debug/raw_losses": 0.6918202042579651, - "epoch": 0.03979307600477517, - "grad_norm": 1.480789109824279, + "epoch": 0.04, "learning_rate": 1.984126984126984e-07, - "logits/chosen": -2.706690788269043, - "logits/rejected": -2.6879990100860596, - "logps/chosen": -144.00408935546875, - "logps/rejected": -137.75918579101562, - "loss": 0.2233, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.0028049442917108536, - "rewards/margins": 0.0026701870374381542, - "rewards/rejected": -0.005475131794810295, + "logits/chosen": -2.7042899131774902, + "logits/rejected": -2.6861345767974854, + "logps/chosen": -149.71768188476562, + "logps/rejected": -145.0757293701172, + "loss": 0.6921, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.005199921317398548, + "rewards/margins": 0.0022330707870423794, + "rewards/rejected": 0.0029668500646948814, "step": 50 }, { - "debug/losses": 0.22743281722068787, - "debug/policy_weights": 0.32873186469078064, - "debug/raw_losses": 0.6915570497512817, - "epoch": 0.0477516912057302, - "grad_norm": 1.4857805665702917, + "epoch": 0.05, "learning_rate": 2.3809523809523806e-07, - "logits/chosen": -2.7156319618225098, - "logits/rejected": -2.7164268493652344, - "logps/chosen": -145.94308471679688, - "logps/rejected": -159.51734924316406, - "loss": 0.2192, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.0031640075612813234, - "rewards/margins": 0.003222744446247816, - "rewards/rejected": -0.006386751774698496, + "logits/chosen": -2.705153703689575, + "logits/rejected": -2.685439348220825, + "logps/chosen": -154.3783416748047, + "logps/rejected": -151.54519653320312, + "loss": 0.6912, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.00569504126906395, + "rewards/margins": 0.0022000311873853207, + "rewards/rejected": 0.003495010081678629, "step": 60 }, { - "debug/losses": 0.2175430804491043, - "debug/policy_weights": 0.3152545094490051, - "debug/raw_losses": 0.6902174353599548, - "epoch": 0.055710306406685235, - "grad_norm": 1.5369085878162871, + "epoch": 0.06, "learning_rate": 2.7777777777777776e-07, - "logits/chosen": -2.736572265625, - "logits/rejected": -2.7276124954223633, - "logps/chosen": -149.30255126953125, - "logps/rejected": -143.44100952148438, - "loss": 0.2114, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.016642453148961067, - "rewards/margins": 0.006107243709266186, - "rewards/rejected": -0.02274969592690468, + "logits/chosen": -2.7017154693603516, + "logits/rejected": -2.6924962997436523, + "logps/chosen": -146.3284149169922, + "logps/rejected": -138.79405212402344, + "loss": 0.6885, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.010588793084025383, + "rewards/margins": 0.010192448273301125, + "rewards/rejected": 0.00039634370477870107, "step": 70 }, { - "debug/losses": 0.18839044868946075, - "debug/policy_weights": 0.2725599408149719, - "debug/raw_losses": 0.6915421485900879, - "epoch": 0.06366892160764027, - "grad_norm": 1.5333558257053381, + "epoch": 0.06, "learning_rate": 3.1746031746031743e-07, - "logits/chosen": -2.7109673023223877, - "logits/rejected": -2.6925058364868164, - "logps/chosen": -157.96241760253906, - "logps/rejected": -149.37074279785156, - "loss": 0.2012, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -0.04828154668211937, - "rewards/margins": 0.0038675833493471146, - "rewards/rejected": -0.05214913561940193, + "logits/chosen": -2.7155232429504395, + "logits/rejected": -2.696071147918701, + "logps/chosen": -141.80067443847656, + "logps/rejected": -147.0068817138672, + "loss": 0.6867, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.0049073463305830956, + "rewards/margins": 0.013599385507404804, + "rewards/rejected": -0.008692039176821709, "step": 80 }, { - "debug/losses": 0.18670453131198883, - "debug/policy_weights": 0.273386150598526, - "debug/raw_losses": 0.6827707290649414, - "epoch": 0.07162753680859531, - "grad_norm": 1.6092945600702757, + "epoch": 0.07, "learning_rate": 3.5714285714285716e-07, - "logits/chosen": -2.7211790084838867, - "logits/rejected": -2.728004217147827, - "logps/chosen": -152.60055541992188, - "logps/rejected": -173.4809112548828, - "loss": 0.1852, + "logits/chosen": -2.7175304889678955, + "logits/rejected": -2.7080624103546143, + "logps/chosen": -153.12509155273438, + "logps/rejected": -146.53590393066406, + "loss": 0.6847, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.0675431415438652, - "rewards/margins": 0.02248724177479744, - "rewards/rejected": -0.09003038704395294, + "rewards/chosen": -0.028871387243270874, + "rewards/margins": 0.017175236716866493, + "rewards/rejected": -0.046046625822782516, "step": 90 }, { - "debug/losses": 0.16796275973320007, - "debug/policy_weights": 0.24775293469429016, - "debug/raw_losses": 0.6785470843315125, - "epoch": 0.07958615200955034, - "grad_norm": 1.5168173744160047, + "epoch": 0.08, "learning_rate": 3.968253968253968e-07, - "logits/chosen": -2.6893362998962402, - "logits/rejected": -2.672621250152588, - "logps/chosen": -149.28890991210938, - "logps/rejected": -143.28530883789062, - "loss": 0.1734, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.11476944386959076, - "rewards/margins": 0.0327371247112751, - "rewards/rejected": -0.14750656485557556, + "logits/chosen": -2.7524733543395996, + "logits/rejected": -2.7452526092529297, + "logps/chosen": -163.88070678710938, + "logps/rejected": -163.61032104492188, + "loss": 0.6789, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0662173330783844, + "rewards/margins": 0.02977912127971649, + "rewards/rejected": -0.09599645435810089, "step": 100 }, { - "epoch": 0.07958615200955034, - "eval_debug/losses": 0.16163869202136993, - "eval_debug/policy_weights": 0.23812691867351532, - "eval_debug/raw_losses": 0.6786960959434509, - "eval_logits/chosen": -2.7123541831970215, - "eval_logits/rejected": -2.70428729057312, - "eval_logps/chosen": -158.4906768798828, - "eval_logps/rejected": -168.3474884033203, - "eval_loss": 0.1630723923444748, - "eval_rewards/accuracies": 0.5923507213592529, - "eval_rewards/chosen": -0.14247193932533264, - "eval_rewards/margins": 0.03401322290301323, - "eval_rewards/rejected": -0.17648516595363617, - "eval_runtime": 153.0792, - "eval_samples_per_second": 55.867, - "eval_steps_per_second": 0.875, + "epoch": 0.08, + "eval_logits/chosen": -2.7336502075195312, + "eval_logits/rejected": -2.7255024909973145, + "eval_logps/chosen": -155.19271850585938, + "eval_logps/rejected": -165.35523986816406, + "eval_loss": 0.6769910454750061, + "eval_rewards/accuracies": 0.5914179086685181, + "eval_rewards/chosen": -0.10619194805622101, + "eval_rewards/margins": 0.03601696714758873, + "eval_rewards/rejected": -0.14220890402793884, + "eval_runtime": 184.1564, + "eval_samples_per_second": 46.439, + "eval_steps_per_second": 0.728, "step": 100 }, { - "debug/losses": 0.15102894604206085, - "debug/policy_weights": 0.21911868453025818, - "debug/raw_losses": 0.6911253333091736, - "epoch": 0.08754476721050537, - "grad_norm": 1.50091397853791, + "epoch": 0.09, "learning_rate": 4.365079365079365e-07, - "logits/chosen": -2.690058946609497, - "logits/rejected": -2.6713109016418457, - "logps/chosen": -179.52647399902344, - "logps/rejected": -165.1460418701172, - "loss": 0.1473, - "rewards/accuracies": 0.53125, - "rewards/chosen": -0.2123573273420334, - "rewards/margins": 0.010995535179972649, - "rewards/rejected": -0.22335286438465118, + "logits/chosen": -2.738532543182373, + "logits/rejected": -2.7273170948028564, + "logps/chosen": -164.2928009033203, + "logps/rejected": -160.19398498535156, + "loss": 0.6738, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.16211798787117004, + "rewards/margins": 0.03163355216383934, + "rewards/rejected": -0.19375154376029968, "step": 110 }, { - "debug/losses": 0.13807068765163422, - "debug/policy_weights": 0.2061845362186432, - "debug/raw_losses": 0.6786133646965027, - "epoch": 0.0955033824114604, - "grad_norm": 1.3930389291929042, + "epoch": 0.1, "learning_rate": 4.761904761904761e-07, - "logits/chosen": -2.663013219833374, - "logits/rejected": -2.650160312652588, - "logps/chosen": -168.0050506591797, - "logps/rejected": -173.11129760742188, - "loss": 0.1337, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.2887328267097473, - "rewards/margins": 0.042750097811222076, - "rewards/rejected": -0.3314829468727112, + "logits/chosen": -2.7289297580718994, + "logits/rejected": -2.705962657928467, + "logps/chosen": -196.69662475585938, + "logps/rejected": -197.2833251953125, + "loss": 0.661, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.2917623221874237, + "rewards/margins": 0.08966299891471863, + "rewards/rejected": -0.38142532110214233, "step": 120 }, { - "debug/losses": 0.1164780706167221, - "debug/policy_weights": 0.17811226844787598, - "debug/raw_losses": 0.6562212705612183, - "epoch": 0.10346199761241544, - "grad_norm": 2.4007915699235807, + "epoch": 0.1, "learning_rate": 4.999845414634076e-07, - "logits/chosen": -2.674560546875, - "logits/rejected": -2.6469898223876953, - "logps/chosen": -188.45101928710938, - "logps/rejected": -178.65158081054688, - "loss": 0.1182, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.33541449904441833, - "rewards/margins": 0.09415977448225021, - "rewards/rejected": -0.42957431077957153, + "logits/chosen": -2.658005475997925, + "logits/rejected": -2.6317684650421143, + "logps/chosen": -187.4532928466797, + "logps/rejected": -188.37689208984375, + "loss": 0.6542, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3956056833267212, + "rewards/margins": 0.12092368304729462, + "rewards/rejected": -0.5165294408798218, "step": 130 }, { - "debug/losses": 0.09183915704488754, - "debug/policy_weights": 0.14337728917598724, - "debug/raw_losses": 0.65064537525177, - "epoch": 0.11142061281337047, - "grad_norm": 1.3456206004862283, + "epoch": 0.11, "learning_rate": 4.998106548810311e-07, - "logits/chosen": -2.628385543823242, - "logits/rejected": -2.6050915718078613, - "logps/chosen": -198.1656951904297, - "logps/rejected": -188.9591064453125, - "loss": 0.1034, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.5256301760673523, - "rewards/margins": 0.11118575185537338, - "rewards/rejected": -0.6368159055709839, + "logits/chosen": -2.6906683444976807, + "logits/rejected": -2.6913747787475586, + "logps/chosen": -199.67568969726562, + "logps/rejected": -253.02487182617188, + "loss": 0.6171, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.4227059781551361, + "rewards/margins": 0.27536457777023315, + "rewards/rejected": -0.6980706453323364, "step": 140 }, { - "debug/losses": 0.07921925187110901, - "debug/policy_weights": 0.1330244094133377, - "debug/raw_losses": 0.6098042130470276, - "epoch": 0.1193792280143255, - "grad_norm": 2.0760765044039147, + "epoch": 0.12, "learning_rate": 4.994436933879359e-07, - "logits/chosen": -2.606598138809204, - "logits/rejected": -2.600273847579956, - "logps/chosen": -192.59347534179688, - "logps/rejected": -219.58139038085938, - "loss": 0.0924, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.5305424332618713, - "rewards/margins": 0.20539744198322296, - "rewards/rejected": -0.7359398603439331, + "logits/chosen": -2.6662166118621826, + "logits/rejected": -2.644784927368164, + "logps/chosen": -197.07180786132812, + "logps/rejected": -198.4012908935547, + "loss": 0.6395, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.3597154915332794, + "rewards/margins": 0.13716872036457062, + "rewards/rejected": -0.49688419699668884, "step": 150 }, { - "debug/losses": 0.07018786668777466, - "debug/policy_weights": 0.11792769283056259, - "debug/raw_losses": 0.6061297655105591, - "epoch": 0.12733784321528055, - "grad_norm": 1.6935156833490121, + "epoch": 0.13, "learning_rate": 4.988839406031596e-07, - "logits/chosen": -2.5744402408599854, - "logits/rejected": -2.5837349891662598, - "logps/chosen": -186.702880859375, - "logps/rejected": -239.1992645263672, - "loss": 0.0757, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.660643458366394, - "rewards/margins": 0.2579377293586731, - "rewards/rejected": -0.9185811877250671, + "logits/chosen": -2.647681474685669, + "logits/rejected": -2.6395888328552246, + "logps/chosen": -182.04420471191406, + "logps/rejected": -206.59780883789062, + "loss": 0.629, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3573477864265442, + "rewards/margins": 0.2222837507724762, + "rewards/rejected": -0.579631507396698, "step": 160 }, { - "debug/losses": 0.04878082871437073, - "debug/policy_weights": 0.08209049701690674, - "debug/raw_losses": 0.6344524621963501, - "epoch": 0.13529645841623558, - "grad_norm": 2.6486930748428503, + "epoch": 0.14, "learning_rate": 4.981318291512395e-07, - "logits/chosen": -2.4930522441864014, - "logits/rejected": -2.486619472503662, - "logps/chosen": -233.8449249267578, - "logps/rejected": -264.4144592285156, - "loss": 0.0581, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.974342942237854, - "rewards/margins": 0.22036199271678925, - "rewards/rejected": -1.1947048902511597, + "logits/chosen": -2.619232654571533, + "logits/rejected": -2.598362684249878, + "logps/chosen": -227.0933380126953, + "logps/rejected": -230.9747772216797, + "loss": 0.6242, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7434185743331909, + "rewards/margins": 0.21749505400657654, + "rewards/rejected": -0.9609137773513794, "step": 170 }, { - "debug/losses": 0.0591508224606514, - "debug/policy_weights": 0.08082972466945648, - "debug/raw_losses": 0.6892833113670349, - "epoch": 0.14325507361719061, - "grad_norm": 2.7902190886457094, + "epoch": 0.14, "learning_rate": 4.971879403278432e-07, - "logits/chosen": -2.502582550048828, - "logits/rejected": -2.485318660736084, - "logps/chosen": -251.0359344482422, - "logps/rejected": -252.9623565673828, - "loss": 0.0563, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.0250937938690186, - "rewards/margins": 0.06508847326040268, - "rewards/rejected": -1.0901821851730347, + "logits/chosen": -2.5654754638671875, + "logits/rejected": -2.5364232063293457, + "logps/chosen": -241.6617431640625, + "logps/rejected": -245.66268920898438, + "loss": 0.6151, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7342535257339478, + "rewards/margins": 0.23685339093208313, + "rewards/rejected": -0.9711068868637085, "step": 180 }, { - "debug/losses": 0.08667734265327454, - "debug/policy_weights": 0.12972070276737213, - "debug/raw_losses": 0.6562903523445129, - "epoch": 0.15121368881814565, - "grad_norm": 3.107961610097759, + "epoch": 0.15, "learning_rate": 4.960530036504941e-07, - "logits/chosen": -2.49674654006958, - "logits/rejected": -2.4783453941345215, - "logps/chosen": -218.3079833984375, - "logps/rejected": -227.2297821044922, - "loss": 0.0838, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.7395851612091064, - "rewards/margins": 0.15448498725891113, - "rewards/rejected": -0.8940702676773071, + "logits/chosen": -2.5271048545837402, + "logits/rejected": -2.486818790435791, + "logps/chosen": -235.6089630126953, + "logps/rejected": -251.17758178710938, + "loss": 0.6215, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.807177722454071, + "rewards/margins": 0.28561535477638245, + "rewards/rejected": -1.0927931070327759, "step": 190 }, { - "debug/losses": 0.07691031694412231, - "debug/policy_weights": 0.12212906777858734, - "debug/raw_losses": 0.6350489854812622, - "epoch": 0.15917230401910068, - "grad_norm": 1.9719680635837602, + "epoch": 0.16, "learning_rate": 4.947278962947386e-07, - "logits/chosen": -2.455076217651367, - "logits/rejected": -2.459906578063965, - "logps/chosen": -222.7344970703125, - "logps/rejected": -255.70956420898438, - "loss": 0.0795, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.7358509302139282, - "rewards/margins": 0.1919489949941635, - "rewards/rejected": -0.9277998805046082, + "logits/chosen": -2.4217896461486816, + "logits/rejected": -2.413295269012451, + "logps/chosen": -251.0736083984375, + "logps/rejected": -268.6098937988281, + "loss": 0.6062, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.086307406425476, + "rewards/margins": 0.24874301254749298, + "rewards/rejected": -1.3350504636764526, "step": 200 }, { - "epoch": 0.15917230401910068, - "eval_debug/losses": 0.08000103384256363, - "eval_debug/policy_weights": 0.12660016119480133, - "eval_debug/raw_losses": 0.6296245455741882, - "eval_logits/chosen": -2.4997141361236572, - "eval_logits/rejected": -2.4878616333007812, - "eval_logps/chosen": -215.84107971191406, - "eval_logps/rejected": -243.79220581054688, - "eval_loss": 0.08260933309793472, - "eval_rewards/accuracies": 0.6483209133148193, - "eval_rewards/chosen": -0.7159760594367981, - "eval_rewards/margins": 0.2149561196565628, - "eval_rewards/rejected": -0.9309321045875549, - "eval_runtime": 153.0714, - "eval_samples_per_second": 55.869, - "eval_steps_per_second": 0.875, + "epoch": 0.16, + "eval_logits/chosen": -2.3855514526367188, + "eval_logits/rejected": -2.369593858718872, + "eval_logps/chosen": -246.6970672607422, + "eval_logps/rejected": -289.8621826171875, + "eval_loss": 0.6079375743865967, + "eval_rewards/accuracies": 0.66697758436203, + "eval_rewards/chosen": -1.021235704421997, + "eval_rewards/margins": 0.3660426437854767, + "eval_rewards/rejected": -1.3872781991958618, + "eval_runtime": 183.9816, + "eval_samples_per_second": 46.483, + "eval_steps_per_second": 0.728, "step": 200 }, { - "debug/losses": 0.07473501563072205, - "debug/policy_weights": 0.12107981741428375, - "debug/raw_losses": 0.6079034805297852, - "epoch": 0.1671309192200557, - "grad_norm": 1.4064853346615585, + "epoch": 0.17, "learning_rate": 4.932136424161899e-07, - "logits/chosen": -2.458627223968506, - "logits/rejected": -2.446324586868286, - "logps/chosen": -200.37945556640625, - "logps/rejected": -234.1180419921875, - "loss": 0.0798, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.7478693127632141, - "rewards/margins": 0.2762502133846283, - "rewards/rejected": -1.0241196155548096, + "logits/chosen": -2.3366785049438477, + "logits/rejected": -2.3228511810302734, + "logps/chosen": -266.292236328125, + "logps/rejected": -300.22894287109375, + "loss": 0.5893, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2130026817321777, + "rewards/margins": 0.3487839996814728, + "rewards/rejected": -1.5617868900299072, "step": 210 }, { - "debug/losses": 0.05417264252901077, - "debug/policy_weights": 0.09399188309907913, - "debug/raw_losses": 0.5990099310874939, - "epoch": 0.17508953442101075, - "grad_norm": 1.9293056998121305, + "epoch": 0.18, "learning_rate": 4.915114123589732e-07, - "logits/chosen": -2.487759590148926, - "logits/rejected": -2.465144395828247, - "logps/chosen": -230.71896362304688, - "logps/rejected": -261.5802917480469, - "loss": 0.0643, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.9381935000419617, - "rewards/margins": 0.3212595582008362, - "rewards/rejected": -1.2594529390335083, + "logits/chosen": -2.321228504180908, + "logits/rejected": -2.3033699989318848, + "logps/chosen": -336.34161376953125, + "logps/rejected": -373.39935302734375, + "loss": 0.612, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.9529145956039429, + "rewards/margins": 0.2863468527793884, + "rewards/rejected": -2.2392613887786865, "step": 220 }, { - "debug/losses": 0.07093538343906403, - "debug/policy_weights": 0.11489018052816391, - "debug/raw_losses": 0.5926896929740906, - "epoch": 0.18304814962196578, - "grad_norm": 1.6411550414623024, + "epoch": 0.18, "learning_rate": 4.896225217511849e-07, - "logits/chosen": -2.5063443183898926, - "logits/rejected": -2.502354860305786, - "logps/chosen": -224.992919921875, - "logps/rejected": -266.00543212890625, - "loss": 0.0686, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.8569623231887817, - "rewards/margins": 0.3157426714897156, - "rewards/rejected": -1.172705054283142, + "logits/chosen": -2.4310107231140137, + "logits/rejected": -2.422048568725586, + "logps/chosen": -291.1025695800781, + "logps/rejected": -328.18963623046875, + "loss": 0.6079, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4314143657684326, + "rewards/margins": 0.3364001214504242, + "rewards/rejected": -1.7678143978118896, "step": 230 }, { - "debug/losses": 0.06816870719194412, - "debug/policy_weights": 0.10983666032552719, - "debug/raw_losses": 0.6124148964881897, - "epoch": 0.1910067648229208, - "grad_norm": 1.6733877297747892, + "epoch": 0.19, "learning_rate": 4.875484304880629e-07, - "logits/chosen": -2.512686252593994, - "logits/rejected": -2.4914631843566895, - "logps/chosen": -242.66162109375, - "logps/rejected": -266.0993957519531, - "loss": 0.0716, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.8090359568595886, - "rewards/margins": 0.2927786111831665, - "rewards/rejected": -1.1018145084381104, + "logits/chosen": -2.3412394523620605, + "logits/rejected": -2.309183120727539, + "logps/chosen": -280.8785705566406, + "logps/rejected": -308.54132080078125, + "loss": 0.613, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.309309482574463, + "rewards/margins": 0.3731766939163208, + "rewards/rejected": -1.6824861764907837, "step": 240 }, { - "debug/losses": 0.06458164751529694, - "debug/policy_weights": 0.10547280311584473, - "debug/raw_losses": 0.6321852803230286, - "epoch": 0.19896538002387584, - "grad_norm": 1.766771502743095, + "epoch": 0.2, "learning_rate": 4.852907416036558e-07, - "logits/chosen": -2.43923020362854, - "logits/rejected": -2.432621479034424, - "logps/chosen": -224.22988891601562, - "logps/rejected": -259.5379943847656, - "loss": 0.0732, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.8493305444717407, - "rewards/margins": 0.22976212203502655, - "rewards/rejected": -1.0790926218032837, + "logits/chosen": -2.415271282196045, + "logits/rejected": -2.4072234630584717, + "logps/chosen": -243.56332397460938, + "logps/rejected": -298.7532043457031, + "loss": 0.591, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.967076301574707, + "rewards/margins": 0.4581146240234375, + "rewards/rejected": -1.4251911640167236, "step": 250 }, { - "debug/losses": 0.06774205714464188, - "debug/policy_weights": 0.11494859308004379, - "debug/raw_losses": 0.6017103791236877, - "epoch": 0.20692399522483088, - "grad_norm": 2.1704551991420646, + "epoch": 0.21, "learning_rate": 4.828512000318616e-07, - "logits/chosen": -2.4806602001190186, - "logits/rejected": -2.4344286918640137, - "logps/chosen": -252.88290405273438, - "logps/rejected": -272.6732482910156, - "loss": 0.0658, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.8818208575248718, - "rewards/margins": 0.29337772727012634, - "rewards/rejected": -1.1751985549926758, + "logits/chosen": -2.3924427032470703, + "logits/rejected": -2.3613152503967285, + "logps/chosen": -266.86572265625, + "logps/rejected": -304.2983093261719, + "loss": 0.5986, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2214807271957397, + "rewards/margins": 0.4553411602973938, + "rewards/rejected": -1.6768219470977783, "step": 260 }, { - "debug/losses": 0.06392187625169754, - "debug/policy_weights": 0.10442598909139633, - "debug/raw_losses": 0.6145210266113281, - "epoch": 0.2148826104257859, - "grad_norm": 1.6273411526384032, + "epoch": 0.21, "learning_rate": 4.802316912577946e-07, - "logits/chosen": -2.4324231147766113, - "logits/rejected": -2.39255690574646, - "logps/chosen": -231.31167602539062, - "logps/rejected": -244.3099365234375, - "loss": 0.0687, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.8366304636001587, - "rewards/margins": 0.2642030119895935, - "rewards/rejected": -1.100833535194397, + "logits/chosen": -2.4108529090881348, + "logits/rejected": -2.3902478218078613, + "logps/chosen": -252.7959442138672, + "logps/rejected": -295.266357421875, + "loss": 0.5917, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0824626684188843, + "rewards/margins": 0.39643940329551697, + "rewards/rejected": -1.4789022207260132, "step": 270 }, { - "debug/losses": 0.05992782115936279, - "debug/policy_weights": 0.0925414115190506, - "debug/raw_losses": 0.6348901987075806, - "epoch": 0.22284122562674094, - "grad_norm": 1.6517986020806328, + "epoch": 0.22, "learning_rate": 4.774342398605221e-07, - "logits/chosen": -2.416997194290161, - "logits/rejected": -2.3983612060546875, - "logps/chosen": -245.29135131835938, - "logps/rejected": -261.55218505859375, - "loss": 0.0566, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.9968692660331726, - "rewards/margins": 0.22003936767578125, - "rewards/rejected": -1.2169086933135986, + "logits/chosen": -2.3505263328552246, + "logits/rejected": -2.2942967414855957, + "logps/chosen": -279.871337890625, + "logps/rejected": -300.4220886230469, + "loss": 0.5979, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1787078380584717, + "rewards/margins": 0.430286169052124, + "rewards/rejected": -1.6089938879013062, "step": 280 }, { - "debug/losses": 0.06707664579153061, - "debug/policy_weights": 0.10901203006505966, - "debug/raw_losses": 0.6151145100593567, - "epoch": 0.23079984082769597, - "grad_norm": 1.9541190672933495, + "epoch": 0.23, "learning_rate": 4.744610079482978e-07, - "logits/chosen": -2.4758219718933105, - "logits/rejected": -2.4370126724243164, - "logps/chosen": -264.2386474609375, - "logps/rejected": -282.10357666015625, - "loss": 0.0577, + "logits/chosen": -2.3269264698028564, + "logits/rejected": -2.2910802364349365, + "logps/chosen": -255.27706909179688, + "logps/rejected": -281.60137939453125, + "loss": 0.5853, "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9856462478637695, - "rewards/margins": 0.2682034969329834, - "rewards/rejected": -1.253849744796753, + "rewards/chosen": -1.1753785610198975, + "rewards/margins": 0.3495523929595947, + "rewards/rejected": -1.5249310731887817, "step": 290 }, { - "debug/losses": 0.04470010846853256, - "debug/policy_weights": 0.07902451604604721, - "debug/raw_losses": 0.6110645532608032, - "epoch": 0.238758456028651, - "grad_norm": 2.239630938666137, + "epoch": 0.24, "learning_rate": 4.713142934875005e-07, - "logits/chosen": -2.4241604804992676, - "logits/rejected": -2.382441997528076, - "logps/chosen": -255.75048828125, - "logps/rejected": -272.4277038574219, - "loss": 0.0545, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.0819556713104248, - "rewards/margins": 0.33363839983940125, - "rewards/rejected": -1.4155938625335693, + "logits/chosen": -2.2868428230285645, + "logits/rejected": -2.2631592750549316, + "logps/chosen": -284.2200012207031, + "logps/rejected": -322.45269775390625, + "loss": 0.5965, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.336501955986023, + "rewards/margins": 0.3968800902366638, + "rewards/rejected": -1.733382225036621, "step": 300 }, { - "epoch": 0.238758456028651, - "eval_debug/losses": 0.05503418296575546, - "eval_debug/policy_weights": 0.08935731649398804, - "eval_debug/raw_losses": 0.6165817379951477, - "eval_logits/chosen": -2.4302186965942383, - "eval_logits/rejected": -2.416015386581421, - "eval_logps/chosen": -253.9807891845703, - "eval_logps/rejected": -292.5660705566406, - "eval_loss": 0.05720232427120209, - "eval_rewards/accuracies": 0.6641790866851807, - "eval_rewards/chosen": -1.0973730087280273, - "eval_rewards/margins": 0.32129794359207153, - "eval_rewards/rejected": -1.418670892715454, - "eval_runtime": 153.1278, - "eval_samples_per_second": 55.849, - "eval_steps_per_second": 0.875, + "epoch": 0.24, + "eval_logits/chosen": -2.265592098236084, + "eval_logits/rejected": -2.244987964630127, + "eval_logps/chosen": -282.3620910644531, + "eval_logps/rejected": -331.2099609375, + "eval_loss": 0.5907339453697205, + "eval_rewards/accuracies": 0.6623134613037109, + "eval_rewards/chosen": -1.3778856992721558, + "eval_rewards/margins": 0.42287060618400574, + "eval_rewards/rejected": -1.8007562160491943, + "eval_runtime": 183.9593, + "eval_samples_per_second": 46.489, + "eval_steps_per_second": 0.728, "step": 300 }, { - "debug/losses": 0.04352787882089615, - "debug/policy_weights": 0.07053101807832718, - "debug/raw_losses": 0.5888990163803101, - "epoch": 0.24671707122960604, - "grad_norm": 1.9538707292393864, + "epoch": 0.25, "learning_rate": 4.679965285265706e-07, - "logits/chosen": -2.3698525428771973, - "logits/rejected": -2.360182285308838, - "logps/chosen": -235.0277099609375, - "logps/rejected": -285.10247802734375, - "loss": 0.0545, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.1747382879257202, - "rewards/margins": 0.38018131256103516, - "rewards/rejected": -1.5549194812774658, + "logits/chosen": -2.2354235649108887, + "logits/rejected": -2.23685884475708, + "logps/chosen": -277.09283447265625, + "logps/rejected": -347.7145080566406, + "loss": 0.5612, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3514426946640015, + "rewards/margins": 0.4907970428466797, + "rewards/rejected": -1.8422397375106812, "step": 310 }, { - "debug/losses": 0.036569319665431976, - "debug/policy_weights": 0.06743182241916656, - "debug/raw_losses": 0.5270095467567444, - "epoch": 0.2546756864305611, - "grad_norm": 1.4572562934726576, + "epoch": 0.25, "learning_rate": 4.64510277316316e-07, - "logits/chosen": -2.2776880264282227, - "logits/rejected": -2.233410358428955, - "logps/chosen": -288.60418701171875, - "logps/rejected": -347.5012512207031, - "loss": 0.0369, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.4297213554382324, - "rewards/margins": 0.6530755162239075, - "rewards/rejected": -2.082796573638916, + "logits/chosen": -2.2262344360351562, + "logits/rejected": -2.226029634475708, + "logps/chosen": -271.74212646484375, + "logps/rejected": -332.5010986328125, + "loss": 0.5903, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3337775468826294, + "rewards/margins": 0.39512914419174194, + "rewards/rejected": -1.7289068698883057, "step": 320 }, { - "debug/losses": 0.02330527827143669, - "debug/policy_weights": 0.034696005284786224, - "debug/raw_losses": 0.6584650874137878, - "epoch": 0.26263430163151613, - "grad_norm": 1.0590322815828623, + "epoch": 0.26, "learning_rate": 4.6085823432804137e-07, - "logits/chosen": -2.21286678314209, - "logits/rejected": -2.183129072189331, - "logps/chosen": -389.7990417480469, - "logps/rejected": -418.78045654296875, - "loss": 0.0215, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -2.4603824615478516, - "rewards/margins": 0.2922320067882538, - "rewards/rejected": -2.7526144981384277, + "logits/chosen": -2.2451891899108887, + "logits/rejected": -2.2502384185791016, + "logps/chosen": -250.6347198486328, + "logps/rejected": -333.8939208984375, + "loss": 0.5722, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1326004266738892, + "rewards/margins": 0.5066065192222595, + "rewards/rejected": -1.639206886291504, "step": 330 }, { - "debug/losses": 0.020010516047477722, - "debug/policy_weights": 0.033290598541498184, - "debug/raw_losses": 0.615290641784668, - "epoch": 0.27059291683247116, - "grad_norm": 0.6231939002180164, + "epoch": 0.27, "learning_rate": 4.570432221710314e-07, - "logits/chosen": -2.2122933864593506, - "logits/rejected": -2.194932460784912, - "logps/chosen": -384.0797119140625, - "logps/rejected": -438.1153259277344, - "loss": 0.018, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -2.3789823055267334, - "rewards/margins": 0.44676661491394043, - "rewards/rejected": -2.825748920440674, + "logits/chosen": -2.0656931400299072, + "logits/rejected": -2.0213730335235596, + "logps/chosen": -318.232177734375, + "logps/rejected": -369.13311767578125, + "loss": 0.5766, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.584176778793335, + "rewards/margins": 0.5901076197624207, + "rewards/rejected": -2.1742844581604004, "step": 340 }, { - "debug/losses": 0.027714502066373825, - "debug/policy_weights": 0.04697718471288681, - "debug/raw_losses": 0.6301008462905884, - "epoch": 0.2785515320334262, - "grad_norm": 1.3624865342914951, + "epoch": 0.28, "learning_rate": 4.5306818941099866e-07, - "logits/chosen": -2.262915849685669, - "logits/rejected": -2.2063891887664795, - "logps/chosen": -356.70513916015625, - "logps/rejected": -376.57891845703125, - "loss": 0.0251, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -2.0648093223571777, - "rewards/margins": 0.3629462420940399, - "rewards/rejected": -2.427755832672119, + "logits/chosen": -1.9084612131118774, + "logits/rejected": -1.8514792919158936, + "logps/chosen": -316.9821472167969, + "logps/rejected": -352.9412841796875, + "loss": 0.5825, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5627154111862183, + "rewards/margins": 0.5152220726013184, + "rewards/rejected": -2.077937364578247, "step": 350 }, { - "debug/losses": 0.03147003799676895, - "debug/policy_weights": 0.049594730138778687, - "debug/raw_losses": 0.5973548889160156, - "epoch": 0.28651014723438123, - "grad_norm": 1.0467766810317458, + "epoch": 0.29, "learning_rate": 4.4893620829118124e-07, - "logits/chosen": -2.250816822052002, - "logits/rejected": -2.222325563430786, - "logps/chosen": -329.90728759765625, - "logps/rejected": -360.37164306640625, - "loss": 0.0335, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.781701683998108, - "rewards/margins": 0.3680455684661865, - "rewards/rejected": -2.149747371673584, + "logits/chosen": -1.8860156536102295, + "logits/rejected": -1.8301204442977905, + "logps/chosen": -309.8200378417969, + "logps/rejected": -362.0408935546875, + "loss": 0.5755, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5869390964508057, + "rewards/margins": 0.49348369240760803, + "rewards/rejected": -2.080422878265381, "step": 360 }, { - "debug/losses": 0.029663532972335815, - "debug/policy_weights": 0.049563243985176086, - "debug/raw_losses": 0.655312180519104, - "epoch": 0.29446876243533626, - "grad_norm": 1.209879585308472, + "epoch": 0.29, "learning_rate": 4.4465047235785185e-07, - "logits/chosen": -2.2697219848632812, - "logits/rejected": -2.2369260787963867, - "logps/chosen": -354.7026062011719, - "logps/rejected": -364.2904968261719, - "loss": 0.0316, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -2.0985820293426514, - "rewards/margins": 0.2616536021232605, - "rewards/rejected": -2.3602359294891357, + "logits/chosen": -1.6610889434814453, + "logits/rejected": -1.585129737854004, + "logps/chosen": -321.8608703613281, + "logps/rejected": -380.31036376953125, + "loss": 0.5697, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.774713158607483, + "rewards/margins": 0.6593302488327026, + "rewards/rejected": -2.4340434074401855, "step": 370 }, { - "debug/losses": 0.02233636938035488, - "debug/policy_weights": 0.03632068261504173, - "debug/raw_losses": 0.6318480968475342, - "epoch": 0.3024273776362913, - "grad_norm": 0.8855115172661371, + "epoch": 0.3, "learning_rate": 4.40214293992074e-07, - "logits/chosen": -2.1508800983428955, - "logits/rejected": -2.1226308345794678, - "logps/chosen": -370.634765625, - "logps/rejected": -402.4145202636719, - "loss": 0.025, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.332416534423828, - "rewards/margins": 0.34561586380004883, - "rewards/rejected": -2.678032398223877, + "logits/chosen": -1.385825753211975, + "logits/rejected": -1.31913161277771, + "logps/chosen": -377.07269287109375, + "logps/rejected": -459.5557556152344, + "loss": 0.5818, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1890993118286133, + "rewards/margins": 0.7521292567253113, + "rewards/rejected": -2.9412286281585693, "step": 380 }, { - "debug/losses": 0.03705073148012161, - "debug/policy_weights": 0.058359406888484955, - "debug/raw_losses": 0.6300365328788757, - "epoch": 0.3103859928372463, - "grad_norm": 1.213922080492998, + "epoch": 0.31, "learning_rate": 4.3563110184961234e-07, - "logits/chosen": -2.2061028480529785, - "logits/rejected": -2.182528018951416, - "logps/chosen": -346.3789978027344, - "logps/rejected": -381.57293701171875, - "loss": 0.0306, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.9717462062835693, - "rewards/margins": 0.32251009345054626, - "rewards/rejected": -2.2942566871643066, + "logits/chosen": -1.5089499950408936, + "logits/rejected": -1.4075387716293335, + "logps/chosen": -338.3626708984375, + "logps/rejected": -396.67578125, + "loss": 0.5584, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9042552709579468, + "rewards/margins": 0.5932050347328186, + "rewards/rejected": -2.49746036529541, "step": 390 }, { - "debug/losses": 0.02749418281018734, - "debug/policy_weights": 0.050195060670375824, - "debug/raw_losses": 0.5436308979988098, - "epoch": 0.31834460803820136, - "grad_norm": 1.1171579871093724, + "epoch": 0.32, "learning_rate": 4.3090443821097566e-07, - "logits/chosen": -2.2397525310516357, - "logits/rejected": -2.213344097137451, - "logps/chosen": -328.1468200683594, - "logps/rejected": -400.07891845703125, - "loss": 0.0288, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.9207608699798584, - "rewards/margins": 0.5777884721755981, - "rewards/rejected": -2.498549222946167, + "logits/chosen": -1.2587625980377197, + "logits/rejected": -1.2017955780029297, + "logps/chosen": -309.43377685546875, + "logps/rejected": -372.00531005859375, + "loss": 0.5729, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.9031288623809814, + "rewards/margins": 0.5602144598960876, + "rewards/rejected": -2.4633431434631348, "step": 400 }, { - "epoch": 0.31834460803820136, - "eval_debug/losses": 0.0286563690751791, - "eval_debug/policy_weights": 0.04770776256918907, - "eval_debug/raw_losses": 0.6043540835380554, - "eval_logits/chosen": -2.257312536239624, - "eval_logits/rejected": -2.237586736679077, - "eval_logps/chosen": -339.86920166015625, - "eval_logps/rejected": -388.4184265136719, - "eval_loss": 0.030234459787607193, - "eval_rewards/accuracies": 0.6697761416435242, - "eval_rewards/chosen": -1.9562571048736572, - "eval_rewards/margins": 0.4209369122982025, - "eval_rewards/rejected": -2.37719464302063, - "eval_runtime": 153.0016, - "eval_samples_per_second": 55.895, - "eval_steps_per_second": 0.876, + "epoch": 0.32, + "eval_logits/chosen": -1.3760210275650024, + "eval_logits/rejected": -1.2920024394989014, + "eval_logps/chosen": -312.20635986328125, + "eval_logps/rejected": -375.1720275878906, + "eval_loss": 0.5711147785186768, + "eval_rewards/accuracies": 0.6828358173370361, + "eval_rewards/chosen": -1.676328182220459, + "eval_rewards/margins": 0.5640482306480408, + "eval_rewards/rejected": -2.2403764724731445, + "eval_runtime": 183.9941, + "eval_samples_per_second": 46.48, + "eval_steps_per_second": 0.728, "step": 400 }, { - "debug/losses": 0.026865383610129356, - "debug/policy_weights": 0.04847482591867447, - "debug/raw_losses": 0.5621613264083862, - "epoch": 0.3263032232391564, - "grad_norm": 1.2502124213306463, + "epoch": 0.33, "learning_rate": 4.2603795624364195e-07, - "logits/chosen": -2.2415857315063477, - "logits/rejected": -2.195265293121338, - "logps/chosen": -318.52496337890625, - "logps/rejected": -356.39923095703125, - "loss": 0.0289, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.8215112686157227, - "rewards/margins": 0.4676322042942047, - "rewards/rejected": -2.2891438007354736, + "logits/chosen": -1.2894772291183472, + "logits/rejected": -1.23129141330719, + "logps/chosen": -299.457275390625, + "logps/rejected": -370.8555908203125, + "loss": 0.5666, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6054102182388306, + "rewards/margins": 0.5984233021736145, + "rewards/rejected": -2.203833818435669, "step": 410 }, { - "debug/losses": 0.03042503260076046, - "debug/policy_weights": 0.05211452394723892, - "debug/raw_losses": 0.5721229314804077, - "epoch": 0.3342618384401114, - "grad_norm": 0.917628025882299, + "epoch": 0.33, "learning_rate": 4.210354171785795e-07, - "logits/chosen": -2.2705883979797363, - "logits/rejected": -2.2648684978485107, - "logps/chosen": -333.748046875, - "logps/rejected": -389.78167724609375, - "loss": 0.0285, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.9158363342285156, - "rewards/margins": 0.4352361559867859, - "rewards/rejected": -2.3510725498199463, + "logits/chosen": -1.022984266281128, + "logits/rejected": -0.9285897016525269, + "logps/chosen": -324.4284973144531, + "logps/rejected": -385.0074157714844, + "loss": 0.5596, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.921677589416504, + "rewards/margins": 0.5404387712478638, + "rewards/rejected": -2.4621164798736572, "step": 420 }, { - "debug/losses": 0.028450261801481247, - "debug/policy_weights": 0.049022845923900604, - "debug/raw_losses": 0.6050612330436707, - "epoch": 0.34222045364106646, - "grad_norm": 1.3942371078185885, + "epoch": 0.34, "learning_rate": 4.15900687403248e-07, - "logits/chosen": -2.260845184326172, - "logits/rejected": -2.2445521354675293, - "logps/chosen": -337.31494140625, - "logps/rejected": -379.3194580078125, - "loss": 0.0306, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.9938061237335205, - "rewards/margins": 0.39662283658981323, - "rewards/rejected": -2.3904290199279785, + "logits/chosen": -0.8059805631637573, + "logits/rejected": -0.7196700572967529, + "logps/chosen": -353.788330078125, + "logps/rejected": -411.4853515625, + "loss": 0.5865, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.1321234703063965, + "rewards/margins": 0.463266521692276, + "rewards/rejected": -2.5953898429870605, "step": 430 }, { - "debug/losses": 0.031036963686347008, - "debug/policy_weights": 0.05087602883577347, - "debug/raw_losses": 0.6116689443588257, - "epoch": 0.3501790688420215, - "grad_norm": 1.2091608750974971, + "epoch": 0.35, "learning_rate": 4.1063773547332584e-07, - "logits/chosen": -2.271272659301758, - "logits/rejected": -2.2465357780456543, - "logps/chosen": -337.552490234375, - "logps/rejected": -381.76239013671875, - "loss": 0.0313, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.9061357975006104, - "rewards/margins": 0.43205636739730835, - "rewards/rejected": -2.3381924629211426, + "logits/chosen": -0.9645301699638367, + "logits/rejected": -0.7601315975189209, + "logps/chosen": -346.8272705078125, + "logps/rejected": -392.2935791015625, + "loss": 0.5591, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9153356552124023, + "rewards/margins": 0.5891679525375366, + "rewards/rejected": -2.5045037269592285, "step": 440 }, { - "debug/losses": 0.02100428007543087, - "debug/policy_weights": 0.040780894458293915, - "debug/raw_losses": 0.5640828609466553, - "epoch": 0.3581376840429765, - "grad_norm": 1.2368184816859698, + "epoch": 0.36, "learning_rate": 4.0525062904547276e-07, - "logits/chosen": -2.2050936222076416, - "logits/rejected": -2.1659345626831055, - "logps/chosen": -341.0467834472656, - "logps/rejected": -385.0848693847656, - "loss": 0.029, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.084838390350342, - "rewards/margins": 0.5166509747505188, - "rewards/rejected": -2.601489543914795, + "logits/chosen": -0.608537495136261, + "logits/rejected": -0.47767123579978943, + "logps/chosen": -341.55364990234375, + "logps/rejected": -434.1073303222656, + "loss": 0.5687, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.105318069458008, + "rewards/margins": 0.6994394659996033, + "rewards/rejected": -2.8047571182250977, "step": 450 }, { - "debug/losses": 0.03121250309050083, - "debug/policy_weights": 0.05547971650958061, - "debug/raw_losses": 0.5400241613388062, - "epoch": 0.36609629924393156, - "grad_norm": 1.2892249196934267, + "epoch": 0.37, "learning_rate": 3.997435317334988e-07, - "logits/chosen": -2.279585361480713, - "logits/rejected": -2.251013994216919, - "logps/chosen": -340.62738037109375, - "logps/rejected": -401.3656005859375, - "loss": 0.03, + "logits/chosen": -0.6356207132339478, + "logits/rejected": -0.25634175539016724, + "logps/chosen": -384.43780517578125, + "logps/rejected": -419.24176025390625, + "loss": 0.5608, "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.936126708984375, - "rewards/margins": 0.5458568334579468, - "rewards/rejected": -2.4819834232330322, + "rewards/chosen": -2.2970900535583496, + "rewards/margins": 0.6535100340843201, + "rewards/rejected": -2.9506001472473145, "step": 460 }, { - "debug/losses": 0.0340055450797081, - "debug/policy_weights": 0.05620163679122925, - "debug/raw_losses": 0.5971667170524597, - "epoch": 0.3740549144448866, - "grad_norm": 1.0883622256614363, + "epoch": 0.37, "learning_rate": 3.941206998903701e-07, - "logits/chosen": -2.304421901702881, - "logits/rejected": -2.2663235664367676, - "logps/chosen": -359.36468505859375, - "logps/rejected": -393.1944885253906, - "loss": 0.0283, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -2.045952320098877, - "rewards/margins": 0.3646782636642456, - "rewards/rejected": -2.410630702972412, + "logits/chosen": -1.0318920612335205, + "logits/rejected": -0.7451022267341614, + "logps/chosen": -338.9430236816406, + "logps/rejected": -384.64111328125, + "loss": 0.5678, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9643396139144897, + "rewards/margins": 0.5402536392211914, + "rewards/rejected": -2.5045928955078125, "step": 470 }, { - "debug/losses": 0.02767511084675789, - "debug/policy_weights": 0.042119450867176056, - "debug/raw_losses": 0.5988866090774536, - "epoch": 0.3820135296458416, - "grad_norm": 1.153171503132909, + "epoch": 0.38, "learning_rate": 3.8838647931853684e-07, - "logits/chosen": -2.15468430519104, - "logits/rejected": -2.124429702758789, - "logps/chosen": -333.17987060546875, - "logps/rejected": -377.18231201171875, - "loss": 0.0293, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.112797260284424, - "rewards/margins": 0.4070938527584076, - "rewards/rejected": -2.519890785217285, + "logits/chosen": -0.6847028732299805, + "logits/rejected": -0.5548251867294312, + "logps/chosen": -339.61456298828125, + "logps/rejected": -435.32061767578125, + "loss": 0.5814, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1349122524261475, + "rewards/margins": 0.7573872804641724, + "rewards/rejected": -2.8923001289367676, "step": 480 }, { - "debug/losses": 0.031500790268182755, - "debug/policy_weights": 0.05641796067357063, - "debug/raw_losses": 0.5616481900215149, - "epoch": 0.38997214484679665, - "grad_norm": 1.584575140641085, + "epoch": 0.39, "learning_rate": 3.825453019111281e-07, - "logits/chosen": -2.1593868732452393, - "logits/rejected": -2.1285595893859863, - "logps/chosen": -337.25396728515625, - "logps/rejected": -398.2967529296875, - "loss": 0.0325, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.9278494119644165, - "rewards/margins": 0.5101264119148254, - "rewards/rejected": -2.4379756450653076, + "logits/chosen": -0.5378957986831665, + "logits/rejected": -0.28533270955085754, + "logps/chosen": -363.78570556640625, + "logps/rejected": -430.11749267578125, + "loss": 0.5327, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.134934425354004, + "rewards/margins": 0.6089809536933899, + "rewards/rejected": -2.743915319442749, "step": 490 }, { - "debug/losses": 0.042373210191726685, - "debug/policy_weights": 0.0751892626285553, - "debug/raw_losses": 0.5488675832748413, - "epoch": 0.3979307600477517, - "grad_norm": 1.6646537625502156, + "epoch": 0.4, "learning_rate": 3.7660168222660824e-07, - "logits/chosen": -2.291050910949707, - "logits/rejected": -2.2317252159118652, - "logps/chosen": -337.90264892578125, - "logps/rejected": -370.1439514160156, - "loss": 0.0358, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.6945345401763916, - "rewards/margins": 0.4844774305820465, - "rewards/rejected": -2.179011821746826, + "logits/chosen": -0.6318235397338867, + "logits/rejected": -0.5071814656257629, + "logps/chosen": -350.5252380371094, + "logps/rejected": -421.93353271484375, + "loss": 0.5645, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.062009572982788, + "rewards/margins": 0.5333147048950195, + "rewards/rejected": -2.5953242778778076, "step": 500 }, { - "epoch": 0.3979307600477517, - "eval_debug/losses": 0.039384085685014725, - "eval_debug/policy_weights": 0.06587707996368408, - "eval_debug/raw_losses": 0.6064032912254333, - "eval_logits/chosen": -2.2540104389190674, - "eval_logits/rejected": -2.226517677307129, - "eval_logps/chosen": -315.93218994140625, - "eval_logps/rejected": -366.12408447265625, - "eval_loss": 0.040718384087085724, - "eval_rewards/accuracies": 0.6697761416435242, - "eval_rewards/chosen": -1.71688711643219, - "eval_rewards/margins": 0.43736401200294495, - "eval_rewards/rejected": -2.1542508602142334, - "eval_runtime": 153.0429, - "eval_samples_per_second": 55.88, - "eval_steps_per_second": 0.876, + "epoch": 0.4, + "eval_logits/chosen": -0.7860146760940552, + "eval_logits/rejected": -0.6090859770774841, + "eval_logps/chosen": -351.7882995605469, + "eval_logps/rejected": -419.81939697265625, + "eval_loss": 0.5639454126358032, + "eval_rewards/accuracies": 0.6986940503120422, + "eval_rewards/chosen": -2.0721471309661865, + "eval_rewards/margins": 0.6147031188011169, + "eval_rewards/rejected": -2.6868505477905273, + "eval_runtime": 183.9958, + "eval_samples_per_second": 46.479, + "eval_steps_per_second": 0.728, "step": 500 }, { - "debug/losses": 0.044026248157024384, - "debug/policy_weights": 0.07014697045087814, - "debug/raw_losses": 0.6108037829399109, - "epoch": 0.4058893752487067, - "grad_norm": 1.7093431457694426, + "epoch": 0.41, "learning_rate": 3.705602139995416e-07, - "logits/chosen": -2.22845721244812, - "logits/rejected": -2.192497491836548, - "logps/chosen": -311.43267822265625, - "logps/rejected": -353.4495544433594, - "loss": 0.0503, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.591703176498413, - "rewards/margins": 0.43914633989334106, - "rewards/rejected": -2.0308496952056885, + "logits/chosen": -0.7258490920066833, + "logits/rejected": -0.4828409254550934, + "logps/chosen": -388.1371154785156, + "logps/rejected": -422.11181640625, + "loss": 0.574, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.290266513824463, + "rewards/margins": 0.4104091227054596, + "rewards/rejected": -2.7006754875183105, "step": 510 }, { - "debug/losses": 0.04649205133318901, - "debug/policy_weights": 0.07462817430496216, - "debug/raw_losses": 0.6169490814208984, - "epoch": 0.41384799044966175, - "grad_norm": 1.3610692184849214, + "epoch": 0.41, "learning_rate": 3.6442556659016475e-07, - "logits/chosen": -2.287468671798706, - "logits/rejected": -2.2674317359924316, - "logps/chosen": -294.1647033691406, - "logps/rejected": -330.3625183105469, - "loss": 0.0468, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.4665789604187012, - "rewards/margins": 0.33825773000717163, - "rewards/rejected": -1.804836630821228, + "logits/chosen": -0.5335447192192078, + "logits/rejected": -0.33706527948379517, + "logps/chosen": -378.86492919921875, + "logps/rejected": -429.67724609375, + "loss": 0.5608, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.236337423324585, + "rewards/margins": 0.556148886680603, + "rewards/rejected": -2.7924864292144775, "step": 520 }, { - "debug/losses": 0.04447519779205322, - "debug/policy_weights": 0.06717316806316376, - "debug/raw_losses": 0.6419554948806763, - "epoch": 0.4218066056506168, - "grad_norm": 1.196394399121714, + "epoch": 0.42, "learning_rate": 3.582024813755076e-07, - "logits/chosen": -2.258765697479248, - "logits/rejected": -2.2340471744537354, - "logps/chosen": -329.6542053222656, - "logps/rejected": -353.29425048828125, - "loss": 0.0341, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.7902171611785889, - "rewards/margins": 0.2765839397907257, - "rewards/rejected": -2.0668013095855713, + "logits/chosen": -0.39548322558403015, + "logits/rejected": -0.10662730038166046, + "logps/chosen": -368.8847961425781, + "logps/rejected": -473.3500061035156, + "loss": 0.5485, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.3263449668884277, + "rewards/margins": 0.8236624598503113, + "rewards/rejected": -3.150007724761963, "step": 530 }, { - "debug/losses": 0.024901706725358963, - "debug/policy_weights": 0.04227976128458977, - "debug/raw_losses": 0.586767315864563, - "epoch": 0.4297652208515718, - "grad_norm": 1.0501870660898793, + "epoch": 0.43, "learning_rate": 3.5189576808485404e-07, - "logits/chosen": -2.2085609436035156, - "logits/rejected": -2.182343006134033, - "logps/chosen": -358.0467529296875, - "logps/rejected": -406.2567443847656, - "loss": 0.0256, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.0745999813079834, - "rewards/margins": 0.43312233686447144, - "rewards/rejected": -2.5077223777770996, + "logits/chosen": 0.15742243826389313, + "logits/rejected": 0.31491726636886597, + "logps/chosen": -394.34930419921875, + "logps/rejected": -492.82232666015625, + "loss": 0.5478, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6109700202941895, + "rewards/margins": 0.8250136375427246, + "rewards/rejected": -3.435983657836914, "step": 540 }, { - "debug/losses": 0.027683740481734276, - "debug/policy_weights": 0.0427556186914444, - "debug/raw_losses": 0.6474384069442749, - "epoch": 0.43772383605252685, - "grad_norm": 1.6499830455402607, + "epoch": 0.44, "learning_rate": 3.4551030108237433e-07, - "logits/chosen": -2.1517820358276367, - "logits/rejected": -2.0897772312164307, - "logps/chosen": -370.8080749511719, - "logps/rejected": -387.0035400390625, - "loss": 0.0258, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -2.1823668479919434, - "rewards/margins": 0.329748272895813, - "rewards/rejected": -2.512115001678467, + "logits/chosen": -0.2550584375858307, + "logits/rejected": -0.06936412304639816, + "logps/chosen": -406.5508728027344, + "logps/rejected": -448.47576904296875, + "loss": 0.5562, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.5152666568756104, + "rewards/margins": 0.4819938540458679, + "rewards/rejected": -2.997260332107544, "step": 550 }, { - "debug/losses": 0.026466000825166702, - "debug/policy_weights": 0.05167583376169205, - "debug/raw_losses": 0.5545183420181274, - "epoch": 0.4456824512534819, - "grad_norm": 1.3487018057051514, + "epoch": 0.45, "learning_rate": 3.390510155998023e-07, - "logits/chosen": -2.20346736907959, - "logits/rejected": -2.157214879989624, - "logps/chosen": -348.9996643066406, - "logps/rejected": -407.4266357421875, - "loss": 0.0282, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.9474118947982788, - "rewards/margins": 0.5647018551826477, - "rewards/rejected": -2.5121140480041504, + "logits/chosen": -0.5292027592658997, + "logits/rejected": -0.2619571387767792, + "logps/chosen": -371.6798095703125, + "logps/rejected": -420.7915954589844, + "loss": 0.5492, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1147050857543945, + "rewards/margins": 0.6524336338043213, + "rewards/rejected": -2.7671384811401367, "step": 560 }, { - "debug/losses": 0.03509330004453659, - "debug/policy_weights": 0.05596591904759407, - "debug/raw_losses": 0.5913228988647461, - "epoch": 0.4536410664544369, - "grad_norm": 1.489783171592465, + "epoch": 0.45, "learning_rate": 3.325229039220684e-07, - "logits/chosen": -2.111301898956299, - "logits/rejected": -2.084190607070923, - "logps/chosen": -355.8735656738281, - "logps/rejected": -399.67095947265625, - "loss": 0.0317, + "logits/chosen": -0.5881962776184082, + "logits/rejected": -0.4658876061439514, + "logps/chosen": -343.7039794921875, + "logps/rejected": -406.14178466796875, + "loss": 0.57, "rewards/accuracies": 0.6875, - "rewards/chosen": -2.0182948112487793, - "rewards/margins": 0.457012414932251, - "rewards/rejected": -2.4753072261810303, + "rewards/chosen": -2.0860273838043213, + "rewards/margins": 0.498068630695343, + "rewards/rejected": -2.5840957164764404, "step": 570 }, { - "debug/losses": 0.03357679396867752, - "debug/policy_weights": 0.05124642699956894, - "debug/raw_losses": 0.6098935008049011, - "epoch": 0.46159968165539195, - "grad_norm": 1.242702956848461, + "epoch": 0.46, "learning_rate": 3.2593101152883795e-07, - "logits/chosen": -2.1088345050811768, - "logits/rejected": -2.067966938018799, - "logps/chosen": -359.7041320800781, - "logps/rejected": -397.36822509765625, - "loss": 0.0233, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -2.107734203338623, - "rewards/margins": 0.3990080952644348, - "rewards/rejected": -2.506742238998413, + "logits/chosen": -0.6565806269645691, + "logits/rejected": -0.2549567222595215, + "logps/chosen": -374.8047180175781, + "logps/rejected": -430.33221435546875, + "loss": 0.5512, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.2211391925811768, + "rewards/margins": 0.6813799142837524, + "rewards/rejected": -2.9025187492370605, "step": 580 }, { - "debug/losses": 0.03141509369015694, - "debug/policy_weights": 0.05401759222149849, - "debug/raw_losses": 0.5844415426254272, - "epoch": 0.469558296856347, - "grad_norm": 1.1114396949980998, + "epoch": 0.47, "learning_rate": 3.192804331949349e-07, - "logits/chosen": -2.114969253540039, - "logits/rejected": -2.0797171592712402, - "logps/chosen": -340.1515197753906, - "logps/rejected": -380.93218994140625, - "loss": 0.0275, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.975218415260315, - "rewards/margins": 0.44081950187683105, - "rewards/rejected": -2.4160375595092773, + "logits/chosen": -0.07184700667858124, + "logits/rejected": 0.1699156016111374, + "logps/chosen": -422.27081298828125, + "logps/rejected": -490.69134521484375, + "loss": 0.535, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.730973720550537, + "rewards/margins": 0.7726518511772156, + "rewards/rejected": -3.5036251544952393, "step": 590 }, { - "debug/losses": 0.028428133577108383, - "debug/policy_weights": 0.04971148818731308, - "debug/raw_losses": 0.574429452419281, - "epoch": 0.477516912057302, - "grad_norm": 1.8581586725148402, + "epoch": 0.48, "learning_rate": 3.125763090526674e-07, - "logits/chosen": -2.1486268043518066, - "logits/rejected": -2.094489574432373, - "logps/chosen": -346.2554016113281, - "logps/rejected": -392.0444030761719, - "loss": 0.0309, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.914086937904358, - "rewards/margins": 0.5169549584388733, - "rewards/rejected": -2.431041717529297, + "logits/chosen": -0.029465889558196068, + "logits/rejected": 0.15842057764530182, + "logps/chosen": -417.373046875, + "logps/rejected": -478.73291015625, + "loss": 0.5513, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.8006317615509033, + "rewards/margins": 0.6451797485351562, + "rewards/rejected": -3.4458115100860596, "step": 600 }, { - "epoch": 0.477516912057302, - "eval_debug/losses": 0.02868218533694744, - "eval_debug/policy_weights": 0.048851560801267624, - "eval_debug/raw_losses": 0.589902400970459, - "eval_logits/chosen": -2.115903377532959, - "eval_logits/rejected": -2.0849289894104004, - "eval_logps/chosen": -339.2856750488281, - "eval_logps/rejected": -391.61474609375, - "eval_loss": 0.030176514759659767, - "eval_rewards/accuracies": 0.6660447716712952, - "eval_rewards/chosen": -1.9504221677780151, - "eval_rewards/margins": 0.4587351679801941, - "eval_rewards/rejected": -2.4091572761535645, - "eval_runtime": 153.0228, - "eval_samples_per_second": 55.887, - "eval_steps_per_second": 0.876, + "epoch": 0.48, + "eval_logits/chosen": -0.10542195290327072, + "eval_logits/rejected": 0.12242482602596283, + "eval_logps/chosen": -436.9386291503906, + "eval_logps/rejected": -505.02227783203125, + "eval_loss": 0.5582411885261536, + "eval_rewards/accuracies": 0.7108209133148193, + "eval_rewards/chosen": -2.9236514568328857, + "eval_rewards/margins": 0.6152271032333374, + "eval_rewards/rejected": -3.5388784408569336, + "eval_runtime": 183.9757, + "eval_samples_per_second": 46.484, + "eval_steps_per_second": 0.728, "step": 600 }, { - "debug/losses": 0.0295234564691782, - "debug/policy_weights": 0.047347038984298706, - "debug/raw_losses": 0.6249476671218872, - "epoch": 0.48547552725825704, - "grad_norm": 1.005872549018641, + "epoch": 0.49, "learning_rate": 3.0582382061909623e-07, - "logits/chosen": -2.1071560382843018, - "logits/rejected": -2.065136432647705, - "logps/chosen": -339.8302917480469, - "logps/rejected": -375.41748046875, - "loss": 0.0304, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.9870408773422241, - "rewards/margins": 0.33729949593544006, - "rewards/rejected": -2.324340343475342, + "logits/chosen": -0.2445104569196701, + "logits/rejected": -0.018268002197146416, + "logps/chosen": -441.7857971191406, + "logps/rejected": -502.60791015625, + "loss": 0.5594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.858261823654175, + "rewards/margins": 0.5510683655738831, + "rewards/rejected": -3.409330368041992, "step": 610 }, { - "debug/losses": 0.025071118026971817, - "debug/policy_weights": 0.040535397827625275, - "debug/raw_losses": 0.600980818271637, - "epoch": 0.4934341424592121, - "grad_norm": 1.3090829877164378, + "epoch": 0.49, "learning_rate": 2.9902818679131775e-07, - "logits/chosen": -2.0073862075805664, - "logits/rejected": -1.9497063159942627, - "logps/chosen": -363.63079833984375, - "logps/rejected": -403.33758544921875, - "loss": 0.0267, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.0890800952911377, - "rewards/margins": 0.452686071395874, - "rewards/rejected": -2.5417656898498535, + "logits/chosen": -0.4190225601196289, + "logits/rejected": -0.22823679447174072, + "logps/chosen": -399.03924560546875, + "logps/rejected": -498.6724548339844, + "loss": 0.5499, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.650449275970459, + "rewards/margins": 0.7673205137252808, + "rewards/rejected": -3.4177703857421875, "step": 620 }, { - "debug/losses": 0.02215055376291275, - "debug/policy_weights": 0.03932579606771469, - "debug/raw_losses": 0.5642175078392029, - "epoch": 0.5013927576601671, - "grad_norm": 1.5282212337262977, + "epoch": 0.5, "learning_rate": 2.921946598128571e-07, - "logits/chosen": -1.9308440685272217, - "logits/rejected": -1.8768609762191772, - "logps/chosen": -357.49041748046875, - "logps/rejected": -387.71112060546875, - "loss": 0.0273, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.0471129417419434, - "rewards/margins": 0.5093515515327454, - "rewards/rejected": -2.556464433670044, + "logits/chosen": -0.43653860688209534, + "logits/rejected": -0.20837187767028809, + "logps/chosen": -402.82781982421875, + "logps/rejected": -485.4117736816406, + "loss": 0.5739, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.573000192642212, + "rewards/margins": 0.7478531002998352, + "rewards/rejected": -3.3208529949188232, "step": 630 }, { - "debug/losses": 0.02335355058312416, - "debug/policy_weights": 0.03469157591462135, - "debug/raw_losses": 0.6207915544509888, - "epoch": 0.5093513728611222, - "grad_norm": 1.444583233150152, + "epoch": 0.51, "learning_rate": 2.8532852121428733e-07, - "logits/chosen": -1.7392104864120483, - "logits/rejected": -1.655212640762329, - "logps/chosen": -359.82733154296875, - "logps/rejected": -401.420654296875, - "loss": 0.0232, - "rewards/accuracies": 0.625, - "rewards/chosen": -2.243035316467285, - "rewards/margins": 0.43869590759277344, - "rewards/rejected": -2.6817312240600586, + "logits/chosen": -0.43430274724960327, + "logits/rejected": -0.13240045309066772, + "logps/chosen": -397.2491149902344, + "logps/rejected": -442.12384033203125, + "loss": 0.5462, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4125733375549316, + "rewards/margins": 0.5821372866630554, + "rewards/rejected": -2.9947104454040527, "step": 640 }, { - "debug/losses": 0.02076483704149723, - "debug/policy_weights": 0.034380484372377396, - "debug/raw_losses": 0.55111163854599, - "epoch": 0.5173099880620772, - "grad_norm": 1.169956473894214, + "epoch": 0.52, "learning_rate": 2.7843507773121414e-07, - "logits/chosen": -1.711505651473999, - "logits/rejected": -1.6203527450561523, - "logps/chosen": -366.01141357421875, - "logps/rejected": -429.87237548828125, - "loss": 0.0239, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.2879557609558105, - "rewards/margins": 0.5667728185653687, - "rewards/rejected": -2.8547282218933105, + "logits/chosen": -0.4247920513153076, + "logits/rejected": -0.21372787654399872, + "logps/chosen": -389.4237976074219, + "logps/rejected": -458.3169860839844, + "loss": 0.5373, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.444688320159912, + "rewards/margins": 0.7236617207527161, + "rewards/rejected": -3.1683506965637207, "step": 650 }, { - "debug/losses": 0.017944971099495888, - "debug/policy_weights": 0.03764867037534714, - "debug/raw_losses": 0.526678740978241, - "epoch": 0.5252686032630323, - "grad_norm": 1.1259040599406782, + "epoch": 0.53, "learning_rate": 2.715196572027789e-07, - "logits/chosen": -1.6266266107559204, - "logits/rejected": -1.4919464588165283, - "logps/chosen": -371.72772216796875, - "logps/rejected": -443.4164123535156, - "loss": 0.0222, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.272620677947998, - "rewards/margins": 0.6203452944755554, - "rewards/rejected": -2.892965793609619, + "logits/chosen": -0.6697942614555359, + "logits/rejected": -0.4933086931705475, + "logps/chosen": -387.529296875, + "logps/rejected": -472.73944091796875, + "loss": 0.5685, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.3496451377868652, + "rewards/margins": 0.7728831171989441, + "rewards/rejected": -3.122528314590454, "step": 660 }, { - "debug/losses": 0.01783212646842003, - "debug/policy_weights": 0.03255900740623474, - "debug/raw_losses": 0.5750246047973633, - "epoch": 0.5332272184639872, - "grad_norm": 1.0656793710017185, + "epoch": 0.53, "learning_rate": 2.645876044538521e-07, - "logits/chosen": -1.7439796924591064, - "logits/rejected": -1.6551119089126587, - "logps/chosen": -379.49322509765625, - "logps/rejected": -420.49029541015625, - "loss": 0.0206, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.303612232208252, - "rewards/margins": 0.4928767681121826, - "rewards/rejected": -2.7964890003204346, + "logits/chosen": -1.0338900089263916, + "logits/rejected": -0.8813627362251282, + "logps/chosen": -372.53118896484375, + "logps/rejected": -426.54241943359375, + "loss": 0.5725, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.201908588409424, + "rewards/margins": 0.5010865926742554, + "rewards/rejected": -2.7029950618743896, "step": 670 }, { - "debug/losses": 0.020845994353294373, - "debug/policy_weights": 0.03576134517788887, - "debug/raw_losses": 0.5663331151008606, - "epoch": 0.5411858336649423, - "grad_norm": 1.0909834179777187, + "epoch": 0.54, "learning_rate": 2.5764427716409815e-07, - "logits/chosen": -1.7359968423843384, - "logits/rejected": -1.6398273706436157, - "logps/chosen": -392.9457702636719, - "logps/rejected": -438.53436279296875, - "loss": 0.0186, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.3817615509033203, - "rewards/margins": 0.4670361578464508, - "rewards/rejected": -2.848797559738159, + "logits/chosen": -0.9278701543807983, + "logits/rejected": -0.7282145023345947, + "logps/chosen": -347.2828674316406, + "logps/rejected": -416.9349060058594, + "loss": 0.5479, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0276436805725098, + "rewards/margins": 0.743033230304718, + "rewards/rejected": -2.770677089691162, "step": 680 }, { - "debug/losses": 0.02242189273238182, - "debug/policy_weights": 0.03661586716771126, - "debug/raw_losses": 0.6055338978767395, - "epoch": 0.5491444488658973, - "grad_norm": 1.2304923828991894, + "epoch": 0.55, "learning_rate": 2.5069504172710494e-07, - "logits/chosen": -1.7902625799179077, - "logits/rejected": -1.747271180152893, - "logps/chosen": -390.880859375, - "logps/rejected": -448.6715393066406, - "loss": 0.0201, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -2.4254586696624756, - "rewards/margins": 0.39989569783210754, - "rewards/rejected": -2.8253543376922607, + "logits/chosen": -0.5008482336997986, + "logits/rejected": -0.34875133633613586, + "logps/chosen": -373.7621154785156, + "logps/rejected": -485.12884521484375, + "loss": 0.5217, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.347053050994873, + "rewards/margins": 0.9024646878242493, + "rewards/rejected": -3.2495174407958984, "step": 690 }, { - "debug/losses": 0.02342490293085575, - "debug/policy_weights": 0.03432609513401985, - "debug/raw_losses": 0.6200428605079651, - "epoch": 0.5571030640668524, - "grad_norm": 0.9871754800391283, + "epoch": 0.56, "learning_rate": 2.4374526910277886e-07, - "logits/chosen": -1.8094078302383423, - "logits/rejected": -1.6969894170761108, - "logps/chosen": -372.88385009765625, - "logps/rejected": -403.9977722167969, - "loss": 0.0203, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.29471492767334, - "rewards/margins": 0.35542750358581543, - "rewards/rejected": -2.6501424312591553, + "logits/chosen": 0.06850005686283112, + "logits/rejected": 0.41385045647621155, + "logps/chosen": -411.46246337890625, + "logps/rejected": -476.6162109375, + "loss": 0.5571, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.6757898330688477, + "rewards/margins": 0.8085702657699585, + "rewards/rejected": -3.4843602180480957, "step": 700 }, { - "epoch": 0.5571030640668524, - "eval_debug/losses": 0.018499089404940605, - "eval_debug/policy_weights": 0.031654853373765945, - "eval_debug/raw_losses": 0.580752432346344, - "eval_logits/chosen": -1.7383748292922974, - "eval_logits/rejected": -1.6612528562545776, - "eval_logps/chosen": -377.3936767578125, - "eval_logps/rejected": -427.1260681152344, - "eval_loss": 0.01980224810540676, - "eval_rewards/accuracies": 0.6856343150138855, - "eval_rewards/chosen": -2.3315021991729736, - "eval_rewards/margins": 0.43276873230934143, - "eval_rewards/rejected": -2.764270544052124, - "eval_runtime": 153.0401, - "eval_samples_per_second": 55.881, - "eval_steps_per_second": 0.876, + "epoch": 0.56, + "eval_logits/chosen": 0.035554468631744385, + "eval_logits/rejected": 0.2980235815048218, + "eval_logps/chosen": -424.2823486328125, + "eval_logps/rejected": -505.6960754394531, + "eval_loss": 0.5558871626853943, + "eval_rewards/accuracies": 0.704291045665741, + "eval_rewards/chosen": -2.797088146209717, + "eval_rewards/margins": 0.748529314994812, + "eval_rewards/rejected": -3.5456173419952393, + "eval_runtime": 184.0145, + "eval_samples_per_second": 46.475, + "eval_steps_per_second": 0.728, "step": 700 }, { - "debug/losses": 0.018895355984568596, - "debug/policy_weights": 0.03160635381937027, - "debug/raw_losses": 0.607495903968811, - "epoch": 0.5650616792678074, - "grad_norm": 1.3171395438648685, + "epoch": 0.57, "learning_rate": 2.368003306662104e-07, - "logits/chosen": -1.6481819152832031, - "logits/rejected": -1.5332415103912354, - "logps/chosen": -404.7178955078125, - "logps/rejected": -433.3427734375, - "loss": 0.0188, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.4912447929382324, - "rewards/margins": 0.39817237854003906, - "rewards/rejected": -2.8894169330596924, + "logits/chosen": 0.07857178151607513, + "logits/rejected": 0.3302653729915619, + "logps/chosen": -413.8836975097656, + "logps/rejected": -535.0875244140625, + "loss": 0.5287, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7193782329559326, + "rewards/margins": 1.0089346170425415, + "rewards/rejected": -3.7283127307891846, "step": 710 }, { - "debug/losses": 0.019927415996789932, - "debug/policy_weights": 0.03931970149278641, - "debug/raw_losses": 0.5693992376327515, - "epoch": 0.5730202944687625, - "grad_norm": 1.252735847781797, + "epoch": 0.57, "learning_rate": 2.2986559405621886e-07, - "logits/chosen": -1.5497167110443115, - "logits/rejected": -1.4008510112762451, - "logps/chosen": -398.9753723144531, - "logps/rejected": -445.42926025390625, - "loss": 0.0199, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.365363121032715, - "rewards/margins": 0.5205937623977661, - "rewards/rejected": -2.8859567642211914, + "logits/chosen": 0.2789291739463806, + "logits/rejected": 0.4242584705352783, + "logps/chosen": -422.7801818847656, + "logps/rejected": -522.7840576171875, + "loss": 0.5551, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.925621509552002, + "rewards/margins": 0.8043605089187622, + "rewards/rejected": -3.729982376098633, "step": 720 }, { - "debug/losses": 0.017138421535491943, - "debug/policy_weights": 0.029266973957419395, - "debug/raw_losses": 0.5414391756057739, - "epoch": 0.5809789096697174, - "grad_norm": 0.9922063489426967, + "epoch": 0.58, "learning_rate": 2.2294641902678443e-07, - "logits/chosen": -1.5957987308502197, - "logits/rejected": -1.4495995044708252, - "logps/chosen": -367.09942626953125, - "logps/rejected": -425.4873046875, - "loss": 0.0177, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.3193514347076416, - "rewards/margins": 0.6168329119682312, - "rewards/rejected": -2.9361839294433594, + "logits/chosen": -0.19327735900878906, + "logits/rejected": 0.043265581130981445, + "logps/chosen": -363.1488342285156, + "logps/rejected": -470.94970703125, + "loss": 0.5284, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.39530873298645, + "rewards/margins": 0.908363938331604, + "rewards/rejected": -3.3036727905273438, "step": 730 }, { - "debug/losses": 0.013756600208580494, - "debug/policy_weights": 0.029798978939652443, - "debug/raw_losses": 0.537264347076416, - "epoch": 0.5889375248706725, - "grad_norm": 1.6691124809021063, + "epoch": 0.59, "learning_rate": 2.160481533045751e-07, - "logits/chosen": -1.561067819595337, - "logits/rejected": -1.3976190090179443, - "logps/chosen": -395.62347412109375, - "logps/rejected": -438.599609375, - "loss": 0.0184, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.454890012741089, - "rewards/margins": 0.5239530801773071, - "rewards/rejected": -2.9788429737091064, + "logits/chosen": -0.37412697076797485, + "logits/rejected": -0.17320053279399872, + "logps/chosen": -390.2896423339844, + "logps/rejected": -428.08099365234375, + "loss": 0.5572, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.3005330562591553, + "rewards/margins": 0.48462891578674316, + "rewards/rejected": -2.7851624488830566, "step": 740 }, { - "debug/losses": 0.01619603857398033, - "debug/policy_weights": 0.031267426908016205, - "debug/raw_losses": 0.5780462622642517, - "epoch": 0.5968961400716275, - "grad_norm": 1.077635653848249, + "epoch": 0.6, "learning_rate": 2.0917612845576882e-07, - "logits/chosen": -1.6572452783584595, - "logits/rejected": -1.4474142789840698, - "logps/chosen": -395.1420593261719, - "logps/rejected": -431.25732421875, - "loss": 0.0189, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.3865952491760254, - "rewards/margins": 0.5750153064727783, - "rewards/rejected": -2.9616103172302246, + "logits/chosen": -0.26352375745773315, + "logits/rejected": -0.0010178961092606187, + "logps/chosen": -373.3875427246094, + "logps/rejected": -440.09442138671875, + "loss": 0.5534, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3315823078155518, + "rewards/margins": 0.6843063235282898, + "rewards/rejected": -3.0158886909484863, "step": 750 }, { - "debug/losses": 0.0169072188436985, - "debug/policy_weights": 0.028911063447594643, - "debug/raw_losses": 0.6073988080024719, - "epoch": 0.6048547552725826, - "grad_norm": 1.2161871750266422, + "epoch": 0.6, "learning_rate": 2.0233565576536564e-07, - "logits/chosen": -1.533752202987671, - "logits/rejected": -1.4571056365966797, - "logps/chosen": -383.4120788574219, - "logps/rejected": -433.785888671875, - "loss": 0.0207, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -2.414424419403076, - "rewards/margins": 0.4532749652862549, - "rewards/rejected": -2.867699384689331, + "logits/chosen": -0.3354080021381378, + "logits/rejected": -0.006600166670978069, + "logps/chosen": -360.56463623046875, + "logps/rejected": -440.66961669921875, + "loss": 0.5328, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.1626803874969482, + "rewards/margins": 0.8473829030990601, + "rewards/rejected": -3.010063409805298, "step": 760 }, { - "debug/losses": 0.018340473994612694, - "debug/policy_weights": 0.03365606069564819, - "debug/raw_losses": 0.5463643670082092, - "epoch": 0.6128133704735376, - "grad_norm": 1.1430708218569445, + "epoch": 0.61, "learning_rate": 1.9553202213217537e-07, - "logits/chosen": -1.549505352973938, - "logits/rejected": -1.4244543313980103, - "logps/chosen": -367.46600341796875, - "logps/rejected": -431.8912048339844, - "loss": 0.018, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.3706705570220947, - "rewards/margins": 0.5944007039070129, - "rewards/rejected": -2.965071201324463, + "logits/chosen": -0.021420275792479515, + "logits/rejected": 0.19946305453777313, + "logps/chosen": -389.1043395996094, + "logps/rejected": -448.04998779296875, + "loss": 0.5523, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.391838788986206, + "rewards/margins": 0.6678962707519531, + "rewards/rejected": -3.059735059738159, "step": 770 }, { - "debug/losses": 0.019901562482118607, - "debug/policy_weights": 0.03774517774581909, - "debug/raw_losses": 0.5596235990524292, - "epoch": 0.6207719856744927, - "grad_norm": 1.1044700038610857, + "epoch": 0.62, "learning_rate": 1.887704859826528e-07, - "logits/chosen": -1.5840781927108765, - "logits/rejected": -1.435772180557251, - "logps/chosen": -411.3043518066406, - "logps/rejected": -474.3914489746094, - "loss": 0.0186, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.5005176067352295, - "rewards/margins": 0.6052481532096863, - "rewards/rejected": -3.1057658195495605, + "logits/chosen": -0.15253478288650513, + "logits/rejected": -0.00011998042464256287, + "logps/chosen": -394.9501953125, + "logps/rejected": -462.32843017578125, + "loss": 0.5443, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.476644992828369, + "rewards/margins": 0.566824734210968, + "rewards/rejected": -3.0434699058532715, "step": 780 }, { - "debug/losses": 0.020346064120531082, - "debug/policy_weights": 0.03269239515066147, - "debug/raw_losses": 0.5674707889556885, - "epoch": 0.6287306008754476, - "grad_norm": 0.9859185287093285, + "epoch": 0.63, "learning_rate": 1.8205627320673836e-07, - "logits/chosen": -1.475979208946228, - "logits/rejected": -1.2797057628631592, - "logps/chosen": -401.9871826171875, - "logps/rejected": -459.9845275878906, - "loss": 0.018, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.484294891357422, - "rewards/margins": 0.6510842442512512, - "rewards/rejected": -3.135378837585449, + "logits/chosen": -0.17955633997917175, + "logits/rejected": 0.18167546391487122, + "logps/chosen": -390.32244873046875, + "logps/rejected": -444.895263671875, + "loss": 0.5566, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4376220703125, + "rewards/margins": 0.7008293271064758, + "rewards/rejected": -3.138451099395752, "step": 790 }, { - "debug/losses": 0.017856208607554436, - "debug/policy_weights": 0.033618152141571045, - "debug/raw_losses": 0.5967596769332886, - "epoch": 0.6366892160764027, - "grad_norm": 0.9736061211829494, + "epoch": 0.64, "learning_rate": 1.7539457311884675e-07, - "logits/chosen": -1.3345762491226196, - "logits/rejected": -1.1230970621109009, - "logps/chosen": -407.16778564453125, - "logps/rejected": -441.09149169921875, - "loss": 0.0192, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -2.494783878326416, - "rewards/margins": 0.4510740339756012, - "rewards/rejected": -2.9458580017089844, + "logits/chosen": -0.09838727861642838, + "logits/rejected": 0.11829495429992676, + "logps/chosen": -402.4017333984375, + "logps/rejected": -451.49346923828125, + "loss": 0.5609, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4441986083984375, + "rewards/margins": 0.5067789554595947, + "rewards/rejected": -2.9509775638580322, "step": 800 }, { - "epoch": 0.6366892160764027, - "eval_debug/losses": 0.01690017618238926, - "eval_debug/policy_weights": 0.02897428721189499, - "eval_debug/raw_losses": 0.5789195895195007, - "eval_logits/chosen": -1.2121543884277344, - "eval_logits/rejected": -1.0483382940292358, - "eval_logps/chosen": -403.5320739746094, - "eval_logps/rejected": -462.95257568359375, - "eval_loss": 0.018223632127046585, - "eval_rewards/accuracies": 0.6865671873092651, - "eval_rewards/chosen": -2.592885971069336, - "eval_rewards/margins": 0.5296501517295837, - "eval_rewards/rejected": -3.1225357055664062, - "eval_runtime": 153.0023, - "eval_samples_per_second": 55.895, - "eval_steps_per_second": 0.876, + "epoch": 0.64, + "eval_logits/chosen": -0.03116540051996708, + "eval_logits/rejected": 0.1922437697649002, + "eval_logps/chosen": -387.7091979980469, + "eval_logps/rejected": -459.44390869140625, + "eval_loss": 0.5468714833259583, + "eval_rewards/accuracies": 0.7108209133148193, + "eval_rewards/chosen": -2.431356430053711, + "eval_rewards/margins": 0.6517390012741089, + "eval_rewards/rejected": -3.0830955505371094, + "eval_runtime": 183.9634, + "eval_samples_per_second": 46.488, + "eval_steps_per_second": 0.728, "step": 800 }, { - "debug/losses": 0.0213015079498291, - "debug/policy_weights": 0.034373920410871506, - "debug/raw_losses": 0.6232832670211792, - "epoch": 0.6446478312773577, - "grad_norm": 1.2970621410520329, + "epoch": 0.64, "learning_rate": 1.687905344471226e-07, - "logits/chosen": -1.2676408290863037, - "logits/rejected": -1.1279184818267822, - "logps/chosen": -419.06475830078125, - "logps/rejected": -464.020263671875, - "loss": 0.0194, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -2.55409836769104, - "rewards/margins": 0.43357712030410767, - "rewards/rejected": -2.987675905227661, + "logits/chosen": 0.07735608518123627, + "logits/rejected": 0.3973601460456848, + "logps/chosen": -408.05999755859375, + "logps/rejected": -459.011474609375, + "loss": 0.5384, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.5008435249328613, + "rewards/margins": 0.6535352468490601, + "rewards/rejected": -3.154379367828369, "step": 810 }, { - "debug/losses": 0.02103157714009285, - "debug/policy_weights": 0.029956122860312462, - "debug/raw_losses": 0.6147562265396118, - "epoch": 0.6526064464783128, - "grad_norm": 0.7803955054344923, + "epoch": 0.65, "learning_rate": 1.6224926135406693e-07, - "logits/chosen": -1.371311902999878, - "logits/rejected": -1.1925647258758545, - "logps/chosen": -411.35577392578125, - "logps/rejected": -442.8028259277344, - "loss": 0.0162, - "rewards/accuracies": 0.65625, - "rewards/chosen": -2.6678333282470703, - "rewards/margins": 0.4167654514312744, - "rewards/rejected": -3.084599018096924, + "logits/chosen": 0.1125444769859314, + "logits/rejected": 0.3865428566932678, + "logps/chosen": -404.16058349609375, + "logps/rejected": -484.68621826171875, + "loss": 0.5448, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.4405789375305176, + "rewards/margins": 0.718208909034729, + "rewards/rejected": -3.158787727355957, "step": 820 }, { - "debug/losses": 0.017109088599681854, - "debug/policy_weights": 0.03144029527902603, - "debug/raw_losses": 0.5839846730232239, - "epoch": 0.6605650616792678, - "grad_norm": 0.9253818549329441, + "epoch": 0.66, "learning_rate": 1.557758094916053e-07, - "logits/chosen": -1.5016579627990723, - "logits/rejected": -1.3439366817474365, - "logps/chosen": -427.08551025390625, - "logps/rejected": -477.54718017578125, - "loss": 0.0164, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.701488494873047, - "rewards/margins": 0.5098705887794495, - "rewards/rejected": -3.2113590240478516, + "logits/chosen": 0.11989516019821167, + "logits/rejected": 0.30926594138145447, + "logps/chosen": -370.29876708984375, + "logps/rejected": -452.27911376953125, + "loss": 0.5418, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.3860089778900146, + "rewards/margins": 0.7260924577713013, + "rewards/rejected": -3.1121015548706055, "step": 830 }, { - "debug/losses": 0.01964627578854561, - "debug/policy_weights": 0.03611503541469574, - "debug/raw_losses": 0.5748011469841003, - "epoch": 0.6685236768802229, - "grad_norm": 0.8808070127931911, + "epoch": 0.67, "learning_rate": 1.4937518209365108e-07, - "logits/chosen": -1.5935519933700562, - "logits/rejected": -1.422109603881836, - "logps/chosen": -435.3828125, - "logps/rejected": -461.01763916015625, - "loss": 0.0207, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.5694196224212646, - "rewards/margins": 0.5077784061431885, - "rewards/rejected": -3.077198028564453, + "logits/chosen": -0.14239154756069183, + "logits/rejected": 0.14250756800174713, + "logps/chosen": -395.55755615234375, + "logps/rejected": -447.6368713378906, + "loss": 0.5573, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.378154754638672, + "rewards/margins": 0.6160937547683716, + "rewards/rejected": -2.994248390197754, "step": 840 }, { - "debug/losses": 0.017254317179322243, - "debug/policy_weights": 0.030938278883695602, - "debug/raw_losses": 0.5966542959213257, - "epoch": 0.6764822920811778, - "grad_norm": 0.9343951943413922, + "epoch": 0.68, "learning_rate": 1.4305232610918045e-07, - "logits/chosen": -1.5590174198150635, - "logits/rejected": -1.4611246585845947, - "logps/chosen": -413.28875732421875, - "logps/rejected": -451.7987365722656, - "loss": 0.0186, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.672389507293701, - "rewards/margins": 0.4185652732849121, - "rewards/rejected": -3.090954542160034, + "logits/chosen": -0.16526366770267487, + "logits/rejected": 0.16432161629199982, + "logps/chosen": -373.45330810546875, + "logps/rejected": -436.6773376464844, + "loss": 0.5415, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.3134028911590576, + "rewards/margins": 0.774810791015625, + "rewards/rejected": -3.0882136821746826, "step": 850 }, { - "debug/losses": 0.01943446323275566, - "debug/policy_weights": 0.031098250299692154, - "debug/raw_losses": 0.6142188906669617, - "epoch": 0.6844409072821329, - "grad_norm": 0.9375815515878454, + "epoch": 0.68, "learning_rate": 1.3681212837880977e-07, - "logits/chosen": -1.6131696701049805, - "logits/rejected": -1.5382283926010132, - "logps/chosen": -378.0366516113281, - "logps/rejected": -439.3740234375, - "loss": 0.0193, - "rewards/accuracies": 0.65625, - "rewards/chosen": -2.4397616386413574, - "rewards/margins": 0.4309278428554535, - "rewards/rejected": -2.870689868927002, + "logits/chosen": -0.1321481615304947, + "logits/rejected": 0.23287932574748993, + "logps/chosen": -364.96990966796875, + "logps/rejected": -447.7923278808594, + "loss": 0.5396, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.176964282989502, + "rewards/margins": 0.8955341577529907, + "rewards/rejected": -3.0724985599517822, "step": 860 }, { - "debug/losses": 0.018429789692163467, - "debug/policy_weights": 0.03325992077589035, - "debug/raw_losses": 0.5925087928771973, - "epoch": 0.6923995224830879, - "grad_norm": 1.232238112884586, + "epoch": 0.69, "learning_rate": 1.3065941185782977e-07, - "logits/chosen": -1.5463390350341797, - "logits/rejected": -1.3958414793014526, - "logps/chosen": -399.37152099609375, - "logps/rejected": -427.0814514160156, - "loss": 0.0211, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -2.4714484214782715, - "rewards/margins": 0.43900877237319946, - "rewards/rejected": -2.910456895828247, + "logits/chosen": 0.05437428876757622, + "logits/rejected": 0.2819867432117462, + "logps/chosen": -383.08599853515625, + "logps/rejected": -439.3629455566406, + "loss": 0.5505, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.405247211456299, + "rewards/margins": 0.5403125882148743, + "rewards/rejected": -2.9455599784851074, "step": 870 }, { - "debug/losses": 0.01962456852197647, - "debug/policy_weights": 0.046004533767700195, - "debug/raw_losses": 0.4855673313140869, - "epoch": 0.700358137684043, - "grad_norm": 1.2553897674721153, + "epoch": 0.7, "learning_rate": 1.2459893188861613e-07, - "logits/chosen": -1.717850685119629, - "logits/rejected": -1.556694746017456, - "logps/chosen": -361.3697204589844, - "logps/rejected": -447.73858642578125, - "loss": 0.0229, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.1288363933563232, - "rewards/margins": 0.7239478826522827, - "rewards/rejected": -2.8527846336364746, + "logits/chosen": -0.12052659690380096, + "logits/rejected": 0.12284734100103378, + "logps/chosen": -367.1181640625, + "logps/rejected": -468.1044921875, + "loss": 0.5185, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.39152455329895, + "rewards/margins": 0.9137696027755737, + "rewards/rejected": -3.3052947521209717, "step": 880 }, { - "debug/losses": 0.022167332470417023, - "debug/policy_weights": 0.04253797605633736, - "debug/raw_losses": 0.5294119119644165, - "epoch": 0.708316752884998, - "grad_norm": 1.099703402287045, + "epoch": 0.71, "learning_rate": 1.1863537252529548e-07, - "logits/chosen": -1.6016219854354858, - "logits/rejected": -1.3953754901885986, - "logps/chosen": -387.30731201171875, - "logps/rejected": -434.49169921875, - "loss": 0.0214, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.2904038429260254, - "rewards/margins": 0.5490304231643677, - "rewards/rejected": -2.8394341468811035, + "logits/chosen": 0.14598000049591064, + "logits/rejected": 0.38815659284591675, + "logps/chosen": -397.891357421875, + "logps/rejected": -472.38677978515625, + "loss": 0.5323, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.512676239013672, + "rewards/margins": 0.7713057994842529, + "rewards/rejected": -3.2839818000793457, "step": 890 }, { - "debug/losses": 0.020326469093561172, - "debug/policy_weights": 0.03977964445948601, - "debug/raw_losses": 0.5732988119125366, - "epoch": 0.716275368085953, - "grad_norm": 1.090687554043691, + "epoch": 0.72, "learning_rate": 1.1277334291351145e-07, - "logits/chosen": -1.4579260349273682, - "logits/rejected": -1.2984815835952759, - "logps/chosen": -366.3522033691406, - "logps/rejected": -429.67950439453125, - "loss": 0.0233, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -2.290104866027832, - "rewards/margins": 0.5741912722587585, - "rewards/rejected": -2.8642961978912354, + "logits/chosen": 0.15319526195526123, + "logits/rejected": 0.35974830389022827, + "logps/chosen": -380.77783203125, + "logps/rejected": -449.54315185546875, + "loss": 0.5514, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.3706305027008057, + "rewards/margins": 0.6724039912223816, + "rewards/rejected": -3.043034076690674, "step": 900 }, { - "epoch": 0.716275368085953, - "eval_debug/losses": 0.02210032381117344, - "eval_debug/policy_weights": 0.03874586522579193, - "eval_debug/raw_losses": 0.5725884437561035, - "eval_logits/chosen": -1.4492710828781128, - "eval_logits/rejected": -1.3096469640731812, - "eval_logps/chosen": -377.3470153808594, - "eval_logps/rejected": -440.0111389160156, - "eval_loss": 0.023749953135848045, - "eval_rewards/accuracies": 0.6809701323509216, - "eval_rewards/chosen": -2.3310351371765137, - "eval_rewards/margins": 0.5620867609977722, - "eval_rewards/rejected": -2.8931214809417725, - "eval_runtime": 152.9735, - "eval_samples_per_second": 55.905, - "eval_steps_per_second": 0.876, + "epoch": 0.72, + "eval_logits/chosen": 0.28598034381866455, + "eval_logits/rejected": 0.5382024645805359, + "eval_logps/chosen": -392.3096008300781, + "eval_logps/rejected": -471.95330810546875, + "eval_loss": 0.5473664402961731, + "eval_rewards/accuracies": 0.6996268630027771, + "eval_rewards/chosen": -2.4773612022399902, + "eval_rewards/margins": 0.7308279275894165, + "eval_rewards/rejected": -3.2081892490386963, + "eval_runtime": 183.9871, + "eval_samples_per_second": 46.482, + "eval_steps_per_second": 0.728, "step": 900 }, { - "debug/losses": 0.02190154604613781, - "debug/policy_weights": 0.04413367062807083, - "debug/raw_losses": 0.4940849244594574, - "epoch": 0.724233983286908, - "grad_norm": 1.0479997589103751, + "epoch": 0.72, "learning_rate": 1.0701737372808431e-07, - "logits/chosen": -1.4712375402450562, - "logits/rejected": -1.3293416500091553, - "logps/chosen": -357.8360290527344, - "logps/rejected": -447.7298889160156, - "loss": 0.0218, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.14089298248291, - "rewards/margins": 0.7697595357894897, - "rewards/rejected": -2.9106526374816895, + "logits/chosen": 0.15951867401599884, + "logits/rejected": 0.46630391478538513, + "logps/chosen": -383.52850341796875, + "logps/rejected": -467.2303771972656, + "loss": 0.5362, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2782187461853027, + "rewards/margins": 0.8473943471908569, + "rewards/rejected": -3.125612735748291, "step": 910 }, { - "debug/losses": 0.01954295113682747, - "debug/policy_weights": 0.036859314888715744, - "debug/raw_losses": 0.5885938405990601, - "epoch": 0.7321925984878631, - "grad_norm": 1.129536901422531, + "epoch": 0.73, "learning_rate": 1.0137191367132078e-07, - "logits/chosen": -1.4527212381362915, - "logits/rejected": -1.2773245573043823, - "logps/chosen": -412.63983154296875, - "logps/rejected": -471.3998107910156, - "loss": 0.0189, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.5654666423797607, - "rewards/margins": 0.5423773527145386, - "rewards/rejected": -3.107844114303589, + "logits/chosen": 0.2791319191455841, + "logits/rejected": 0.45174160599708557, + "logps/chosen": -372.1945495605469, + "logps/rejected": -446.6507263183594, + "loss": 0.5458, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.3160648345947266, + "rewards/margins": 0.68004310131073, + "rewards/rejected": -2.996107816696167, "step": 920 }, { - "debug/losses": 0.02070821449160576, - "debug/policy_weights": 0.033360544592142105, - "debug/raw_losses": 0.6200900673866272, - "epoch": 0.7401512136888182, - "grad_norm": 0.8545282722936337, + "epoch": 0.74, "learning_rate": 9.584132603467827e-08, - "logits/chosen": -1.449439287185669, - "logits/rejected": -1.263599157333374, - "logps/chosen": -435.151123046875, - "logps/rejected": -468.1375427246094, - "loss": 0.0189, - "rewards/accuracies": 0.65625, - "rewards/chosen": -2.67981219291687, - "rewards/margins": 0.4587249755859375, - "rewards/rejected": -3.1385371685028076, + "logits/chosen": -0.12192128598690033, + "logits/rejected": 0.1477951854467392, + "logps/chosen": -366.48321533203125, + "logps/rejected": -453.130126953125, + "loss": 0.5467, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.200005531311035, + "rewards/margins": 0.7978888750076294, + "rewards/rejected": -2.997894287109375, "step": 930 }, { - "debug/losses": 0.020046690478920937, - "debug/policy_weights": 0.03236168995499611, - "debug/raw_losses": 0.5747401118278503, - "epoch": 0.7481098288897732, - "grad_norm": 0.9918593494773874, + "epoch": 0.75, "learning_rate": 9.042988532644249e-08, - "logits/chosen": -1.5396702289581299, - "logits/rejected": -1.397327184677124, - "logps/chosen": -402.87237548828125, - "logps/rejected": -470.3892517089844, - "loss": 0.0195, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.59519362449646, - "rewards/margins": 0.5613120794296265, - "rewards/rejected": -3.156505823135376, + "logits/chosen": -0.03106372058391571, + "logits/rejected": 0.07721444219350815, + "logps/chosen": -344.21270751953125, + "logps/rejected": -438.11077880859375, + "loss": 0.5161, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.149094581604004, + "rewards/margins": 0.7353444695472717, + "rewards/rejected": -2.884438991546631, "step": 940 }, { - "debug/losses": 0.01932508684694767, - "debug/policy_weights": 0.03236791491508484, - "debug/raw_losses": 0.5765026211738586, - "epoch": 0.7560684440907283, - "grad_norm": 0.6875663367151513, + "epoch": 0.76, "learning_rate": 8.514177396802428e-08, - "logits/chosen": -1.6237666606903076, - "logits/rejected": -1.5048038959503174, - "logps/chosen": -407.4132385253906, - "logps/rejected": -465.19927978515625, - "loss": 0.0181, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.675424814224243, - "rewards/margins": 0.5076571106910706, - "rewards/rejected": -3.183081865310669, + "logits/chosen": 0.006801058538258076, + "logits/rejected": 0.20282092690467834, + "logps/chosen": -358.15167236328125, + "logps/rejected": -436.4964294433594, + "loss": 0.5385, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.2220425605773926, + "rewards/margins": 0.7004804611206055, + "rewards/rejected": -2.922523260116577, "step": 950 }, { - "debug/losses": 0.017698202282190323, - "debug/policy_weights": 0.031711481511592865, - "debug/raw_losses": 0.5835244059562683, - "epoch": 0.7640270592916832, - "grad_norm": 0.7868483840339101, + "epoch": 0.76, "learning_rate": 7.998107906142839e-08, - "logits/chosen": -1.6007171869277954, - "logits/rejected": -1.478947401046753, - "logps/chosen": -396.6483459472656, - "logps/rejected": -433.88458251953125, - "loss": 0.0184, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.5362672805786133, - "rewards/margins": 0.4365662932395935, - "rewards/rejected": -2.9728338718414307, + "logits/chosen": 0.41448846459388733, + "logits/rejected": 0.705254852771759, + "logps/chosen": -371.27801513671875, + "logps/rejected": -434.56866455078125, + "loss": 0.5236, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.2166616916656494, + "rewards/margins": 0.6714047193527222, + "rewards/rejected": -2.888066530227661, "step": 960 }, { - "debug/losses": 0.016292816027998924, - "debug/policy_weights": 0.02861318551003933, - "debug/raw_losses": 0.5646917819976807, - "epoch": 0.7719856744926383, - "grad_norm": 1.0427659029787784, + "epoch": 0.77, "learning_rate": 7.495178923039396e-08, - "logits/chosen": -1.6051594018936157, - "logits/rejected": -1.5850203037261963, - "logps/chosen": -380.40289306640625, - "logps/rejected": -462.0849609375, - "loss": 0.0195, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.517289638519287, - "rewards/margins": 0.546326220035553, - "rewards/rejected": -3.0636162757873535, + "logits/chosen": 0.23847150802612305, + "logits/rejected": 0.48661884665489197, + "logps/chosen": -366.28179931640625, + "logps/rejected": -462.679443359375, + "loss": 0.5459, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.1916985511779785, + "rewards/margins": 0.8472123146057129, + "rewards/rejected": -3.038910388946533, "step": 970 }, { - "debug/losses": 0.017953380942344666, - "debug/policy_weights": 0.030950292944908142, - "debug/raw_losses": 0.5466476678848267, - "epoch": 0.7799442896935933, - "grad_norm": 1.3967641035945062, + "epoch": 0.78, "learning_rate": 7.005779153764682e-08, - "logits/chosen": -1.622676134109497, - "logits/rejected": -1.4608395099639893, - "logps/chosen": -385.9210205078125, - "logps/rejected": -437.5826721191406, - "loss": 0.0211, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.431375741958618, - "rewards/margins": 0.5695705413818359, - "rewards/rejected": -3.000946283340454, + "logits/chosen": 0.41438961029052734, + "logits/rejected": 0.6912784576416016, + "logps/chosen": -382.70123291015625, + "logps/rejected": -461.8614807128906, + "loss": 0.5453, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.4304287433624268, + "rewards/margins": 0.7116767764091492, + "rewards/rejected": -3.1421055793762207, "step": 980 }, { - "debug/losses": 0.020693689584732056, - "debug/policy_weights": 0.03592243418097496, - "debug/raw_losses": 0.5423937439918518, - "epoch": 0.7879029048945484, - "grad_norm": 1.5010217405708293, + "epoch": 0.79, "learning_rate": 6.530286848064698e-08, - "logits/chosen": -1.5891830921173096, - "logits/rejected": -1.4476853609085083, - "logps/chosen": -388.16107177734375, - "logps/rejected": -452.994384765625, - "loss": 0.0194, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.35011625289917, - "rewards/margins": 0.6105698347091675, - "rewards/rejected": -2.960686206817627, + "logits/chosen": 0.36573725938796997, + "logits/rejected": 0.5834362506866455, + "logps/chosen": -384.49749755859375, + "logps/rejected": -466.30096435546875, + "loss": 0.5528, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.5111565589904785, + "rewards/margins": 0.7234699130058289, + "rewards/rejected": -3.234626054763794, "step": 990 }, { - "debug/losses": 0.01797301694750786, - "debug/policy_weights": 0.035282202064991, - "debug/raw_losses": 0.5305525064468384, - "epoch": 0.7958615200955034, - "grad_norm": 1.0175708862425297, + "epoch": 0.8, "learning_rate": 6.069069506815325e-08, - "logits/chosen": -1.6233867406845093, - "logits/rejected": -1.3854453563690186, - "logps/chosen": -379.2464294433594, - "logps/rejected": -441.9747009277344, - "loss": 0.0213, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.343348503112793, - "rewards/margins": 0.6720021963119507, - "rewards/rejected": -3.015350818634033, + "logits/chosen": 0.45530566573143005, + "logits/rejected": 0.5909157991409302, + "logps/chosen": -379.1433410644531, + "logps/rejected": -468.88458251953125, + "loss": 0.527, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.5361268520355225, + "rewards/margins": 0.7407721281051636, + "rewards/rejected": -3.2768986225128174, "step": 1000 }, { - "epoch": 0.7958615200955034, - "eval_debug/losses": 0.02031446062028408, - "eval_debug/policy_weights": 0.035683903843164444, - "eval_debug/raw_losses": 0.569391131401062, - "eval_logits/chosen": -1.604931116104126, - "eval_logits/rejected": -1.4880036115646362, - "eval_logps/chosen": -386.5316467285156, - "eval_logps/rejected": -446.7563781738281, - "eval_loss": 0.021877959370613098, - "eval_rewards/accuracies": 0.6930969953536987, - "eval_rewards/chosen": -2.422881603240967, - "eval_rewards/margins": 0.5376923680305481, - "eval_rewards/rejected": -2.960574150085449, - "eval_runtime": 152.9371, - "eval_samples_per_second": 55.918, - "eval_steps_per_second": 0.876, + "epoch": 0.8, + "eval_logits/chosen": 0.3871051073074341, + "eval_logits/rejected": 0.6372014284133911, + "eval_logps/chosen": -394.97113037109375, + "eval_logps/rejected": -471.8453674316406, + "eval_loss": 0.5453863739967346, + "eval_rewards/accuracies": 0.70802241563797, + "eval_rewards/chosen": -2.503976345062256, + "eval_rewards/margins": 0.7031334638595581, + "eval_rewards/rejected": -3.2071101665496826, + "eval_runtime": 183.9533, + "eval_samples_per_second": 46.49, + "eval_steps_per_second": 0.728, "step": 1000 }, { - "debug/losses": 0.023462774232029915, - "debug/policy_weights": 0.035995837301015854, - "debug/raw_losses": 0.6167944669723511, - "epoch": 0.8038201352964585, - "grad_norm": 1.1729268102061192, + "epoch": 0.8, "learning_rate": 5.6224835979863714e-08, - "logits/chosen": -1.646054983139038, - "logits/rejected": -1.4877814054489136, - "logps/chosen": -394.0400085449219, - "logps/rejected": -429.5767517089844, - "loss": 0.0209, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -2.4372658729553223, - "rewards/margins": 0.43345385789871216, - "rewards/rejected": -2.8707196712493896, + "logits/chosen": 0.31174296140670776, + "logits/rejected": 0.6193565130233765, + "logps/chosen": -390.387451171875, + "logps/rejected": -468.4959411621094, + "loss": 0.5568, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.32766056060791, + "rewards/margins": 0.68747878074646, + "rewards/rejected": -3.015139102935791, "step": 1010 }, { - "debug/losses": 0.01666095480322838, - "debug/policy_weights": 0.03231491893529892, - "debug/raw_losses": 0.5554049015045166, - "epoch": 0.8117787504974134, - "grad_norm": 1.767673975921384, + "epoch": 0.81, "learning_rate": 5.190874281132851e-08, - "logits/chosen": -1.6193885803222656, - "logits/rejected": -1.4846832752227783, - "logps/chosen": -377.4434509277344, - "logps/rejected": -439.32159423828125, - "loss": 0.0215, + "logits/chosen": 0.22277125716209412, + "logits/rejected": 0.6487134099006653, + "logps/chosen": -402.0958557128906, + "logps/rejected": -448.5992736816406, + "loss": 0.5408, "rewards/accuracies": 0.71875, - "rewards/chosen": -2.3773646354675293, - "rewards/margins": 0.5905806422233582, - "rewards/rejected": -2.9679455757141113, + "rewards/chosen": -2.359062671661377, + "rewards/margins": 0.6533006429672241, + "rewards/rejected": -3.0123631954193115, "step": 1020 }, { - "debug/losses": 0.018973354250192642, - "debug/policy_weights": 0.033219628036022186, - "debug/raw_losses": 0.5266579389572144, - "epoch": 0.8197373656983685, - "grad_norm": 1.193592806014732, + "epoch": 0.82, "learning_rate": 4.774575140626316e-08, - "logits/chosen": -1.5476404428482056, - "logits/rejected": -1.3509008884429932, - "logps/chosen": -378.56463623046875, - "logps/rejected": -437.563720703125, - "loss": 0.0206, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.3850128650665283, - "rewards/margins": 0.6664969325065613, - "rewards/rejected": -3.051509380340576, + "logits/chosen": 0.23170511424541473, + "logits/rejected": 0.47184085845947266, + "logps/chosen": -363.46917724609375, + "logps/rejected": -442.47918701171875, + "loss": 0.5309, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.142770290374756, + "rewards/margins": 0.7513145208358765, + "rewards/rejected": -2.894084930419922, "step": 1030 }, { - "debug/losses": 0.021493710577487946, - "debug/policy_weights": 0.035152681171894073, - "debug/raw_losses": 0.6217910051345825, - "epoch": 0.8276959808993235, - "grad_norm": 1.4851010788677348, + "epoch": 0.83, "learning_rate": 4.373907927832513e-08, - "logits/chosen": -1.5497913360595703, - "logits/rejected": -1.4415110349655151, - "logps/chosen": -370.7920837402344, - "logps/rejected": -425.83636474609375, - "loss": 0.0226, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -2.4308905601501465, - "rewards/margins": 0.4381667971611023, - "rewards/rejected": -2.8690574169158936, + "logits/chosen": 0.07573021948337555, + "logits/rejected": 0.32997313141822815, + "logps/chosen": -381.45599365234375, + "logps/rejected": -443.0684509277344, + "loss": 0.5407, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2033116817474365, + "rewards/margins": 0.710732638835907, + "rewards/rejected": -2.914044141769409, "step": 1040 }, { - "debug/losses": 0.019884012639522552, - "debug/policy_weights": 0.03903160244226456, - "debug/raw_losses": 0.5188706517219543, - "epoch": 0.8356545961002786, - "grad_norm": 1.1564325896000056, + "epoch": 0.84, "learning_rate": 3.9891823124345665e-08, - "logits/chosen": -1.6113007068634033, - "logits/rejected": -1.4773445129394531, - "logps/chosen": -376.01031494140625, - "logps/rejected": -441.5926208496094, - "loss": 0.0227, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.249936819076538, - "rewards/margins": 0.6867498159408569, - "rewards/rejected": -2.9366867542266846, + "logits/chosen": 0.23884686827659607, + "logits/rejected": 0.6128005385398865, + "logps/chosen": -364.00567626953125, + "logps/rejected": -433.3273010253906, + "loss": 0.5471, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.2081527709960938, + "rewards/margins": 0.7639263868331909, + "rewards/rejected": -2.972079038619995, "step": 1050 }, { - "debug/losses": 0.018301142379641533, - "debug/policy_weights": 0.03422557935118675, - "debug/raw_losses": 0.5713909864425659, - "epoch": 0.8436132113012336, - "grad_norm": 1.0553462531854274, + "epoch": 0.84, "learning_rate": 3.620695643093924e-08, - "logits/chosen": -1.6144059896469116, - "logits/rejected": -1.5364993810653687, - "logps/chosen": -362.0438537597656, - "logps/rejected": -426.15283203125, - "loss": 0.021, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.2525696754455566, - "rewards/margins": 0.5051141977310181, - "rewards/rejected": -2.7576839923858643, + "logits/chosen": 0.21963253617286682, + "logits/rejected": 0.6894062757492065, + "logps/chosen": -399.5767517089844, + "logps/rejected": -452.88909912109375, + "loss": 0.5154, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.3384335041046143, + "rewards/margins": 0.7010769844055176, + "rewards/rejected": -3.0395102500915527, "step": 1060 }, { - "debug/losses": 0.023652352392673492, - "debug/policy_weights": 0.038290224969387054, - "debug/raw_losses": 0.5432866811752319, - "epoch": 0.8515718265021887, - "grad_norm": 1.6604665806944348, + "epoch": 0.85, "learning_rate": 3.268732717634032e-08, - "logits/chosen": -1.5845705270767212, - "logits/rejected": -1.4731186628341675, - "logps/chosen": -363.3177795410156, - "logps/rejected": -417.6956481933594, - "loss": 0.0233, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.2692339420318604, - "rewards/margins": 0.5667887926101685, - "rewards/rejected": -2.8360228538513184, + "logits/chosen": 0.3474286198616028, + "logits/rejected": 0.695271372795105, + "logps/chosen": -368.0654602050781, + "logps/rejected": -431.47222900390625, + "loss": 0.5499, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1910276412963867, + "rewards/margins": 0.7267633080482483, + "rewards/rejected": -2.9177908897399902, "step": 1070 }, { - "debug/losses": 0.024808544665575027, - "debug/policy_weights": 0.04168625548481941, - "debug/raw_losses": 0.598394513130188, - "epoch": 0.8595304417031436, - "grad_norm": 1.1200756250900803, + "epoch": 0.86, "learning_rate": 2.9335655629243645e-08, - "logits/chosen": -1.6236356496810913, - "logits/rejected": -1.505225658416748, - "logps/chosen": -384.00665283203125, - "logps/rejected": -440.53741455078125, - "loss": 0.023, - "rewards/accuracies": 0.65625, - "rewards/chosen": -2.3362393379211426, - "rewards/margins": 0.45273178815841675, - "rewards/rejected": -2.788971424102783, + "logits/chosen": 0.2347393035888672, + "logits/rejected": 0.5894696712493896, + "logps/chosen": -388.94757080078125, + "logps/rejected": -447.3855895996094, + "loss": 0.525, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.254683017730713, + "rewards/margins": 0.7334609031677246, + "rewards/rejected": -2.9881439208984375, "step": 1080 }, { - "debug/losses": 0.019663607701659203, - "debug/policy_weights": 0.03420232608914375, - "debug/raw_losses": 0.6034265160560608, - "epoch": 0.8674890569040987, - "grad_norm": 0.9926669013809155, + "epoch": 0.87, "learning_rate": 2.6154532246349476e-08, - "logits/chosen": -1.6022891998291016, - "logits/rejected": -1.448965311050415, - "logps/chosen": -376.11602783203125, - "logps/rejected": -401.70013427734375, - "loss": 0.0236, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.345458984375, - "rewards/margins": 0.435396671295166, - "rewards/rejected": -2.780856132507324, + "logits/chosen": 0.25378522276878357, + "logits/rejected": 0.5771256685256958, + "logps/chosen": -358.50640869140625, + "logps/rejected": -431.145751953125, + "loss": 0.5462, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1666626930236816, + "rewards/margins": 0.756801426410675, + "rewards/rejected": -2.923464059829712, "step": 1090 }, { - "debug/losses": 0.027548635378479958, - "debug/policy_weights": 0.04641988128423691, - "debug/raw_losses": 0.6471782922744751, - "epoch": 0.8754476721050537, - "grad_norm": 1.6941121596085795, + "epoch": 0.88, "learning_rate": 2.31464156702382e-08, - "logits/chosen": -1.6227413415908813, - "logits/rejected": -1.4754379987716675, - "logps/chosen": -386.9624328613281, - "logps/rejected": -417.848876953125, - "loss": 0.0229, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -2.3037209510803223, - "rewards/margins": 0.3875313103199005, - "rewards/rejected": -2.691251754760742, + "logits/chosen": 0.35370689630508423, + "logits/rejected": 0.5671936273574829, + "logps/chosen": -363.0, + "logps/rejected": -438.209228515625, + "loss": 0.5487, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2130119800567627, + "rewards/margins": 0.7499077916145325, + "rewards/rejected": -2.9629194736480713, "step": 1100 }, { - "epoch": 0.8754476721050537, - "eval_debug/losses": 0.021541133522987366, - "eval_debug/policy_weights": 0.037866536527872086, - "eval_debug/raw_losses": 0.5694783926010132, - "eval_logits/chosen": -1.6574220657348633, - "eval_logits/rejected": -1.5526961088180542, - "eval_logps/chosen": -371.6010437011719, - "eval_logps/rejected": -429.42828369140625, - "eval_loss": 0.023058408871293068, - "eval_rewards/accuracies": 0.6949626803398132, - "eval_rewards/chosen": -2.2735750675201416, - "eval_rewards/margins": 0.5137178301811218, - "eval_rewards/rejected": -2.787292718887329, - "eval_runtime": 153.1149, - "eval_samples_per_second": 55.853, - "eval_steps_per_second": 0.875, + "epoch": 0.88, + "eval_logits/chosen": 0.1857856959104538, + "eval_logits/rejected": 0.43363669514656067, + "eval_logps/chosen": -373.08306884765625, + "eval_logps/rejected": -450.7598876953125, + "eval_loss": 0.5444055199623108, + "eval_rewards/accuracies": 0.7089552283287048, + "eval_rewards/chosen": -2.285095453262329, + "eval_rewards/margins": 0.711159884929657, + "eval_rewards/rejected": -2.996255397796631, + "eval_runtime": 184.0455, + "eval_samples_per_second": 46.467, + "eval_steps_per_second": 0.728, "step": 1100 }, { - "debug/losses": 0.01961921714246273, - "debug/policy_weights": 0.0378187857568264, - "debug/raw_losses": 0.5709212422370911, - "epoch": 0.8834062873060088, - "grad_norm": 1.0232523554179078, + "epoch": 0.88, "learning_rate": 2.031363082912252e-08, - "logits/chosen": -1.6505826711654663, - "logits/rejected": -1.548697829246521, - "logps/chosen": -354.1360168457031, - "logps/rejected": -411.10906982421875, - "loss": 0.0236, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.2260138988494873, - "rewards/margins": 0.478458970785141, - "rewards/rejected": -2.7044730186462402, + "logits/chosen": 0.070524200797081, + "logits/rejected": 0.4635602533817291, + "logps/chosen": -373.29327392578125, + "logps/rejected": -426.85552978515625, + "loss": 0.5513, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.2541089057922363, + "rewards/margins": 0.6198171973228455, + "rewards/rejected": -2.8739261627197266, "step": 1110 }, { - "debug/losses": 0.021199991926550865, - "debug/policy_weights": 0.03719371557235718, - "debug/raw_losses": 0.6032904386520386, - "epoch": 0.8913649025069638, - "grad_norm": 1.0591369289157555, + "epoch": 0.89, "learning_rate": 1.7658367139945228e-08, - "logits/chosen": -1.6456480026245117, - "logits/rejected": -1.5404677391052246, - "logps/chosen": -370.068603515625, - "logps/rejected": -416.41162109375, - "loss": 0.0221, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.257601737976074, - "rewards/margins": 0.47389450669288635, - "rewards/rejected": -2.7314963340759277, + "logits/chosen": 0.2600646913051605, + "logits/rejected": 0.5517584681510925, + "logps/chosen": -390.8568115234375, + "logps/rejected": -462.80828857421875, + "loss": 0.5471, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.377396821975708, + "rewards/margins": 0.6719989776611328, + "rewards/rejected": -3.049395799636841, "step": 1120 }, { - "debug/losses": 0.022228095680475235, - "debug/policy_weights": 0.04186805337667465, - "debug/raw_losses": 0.5478729009628296, - "epoch": 0.8993235177079189, - "grad_norm": 1.5518095173727842, + "epoch": 0.9, "learning_rate": 1.5182676816211632e-08, - "logits/chosen": -1.6441676616668701, - "logits/rejected": -1.5106528997421265, - "logps/chosen": -369.6057434082031, - "logps/rejected": -442.1670837402344, - "loss": 0.0236, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.2719249725341797, - "rewards/margins": 0.5875225067138672, - "rewards/rejected": -2.859447717666626, + "logits/chosen": 0.04413030296564102, + "logits/rejected": 0.30151715874671936, + "logps/chosen": -382.0662536621094, + "logps/rejected": -447.08673095703125, + "loss": 0.5431, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.228654384613037, + "rewards/margins": 0.6926024556159973, + "rewards/rejected": -2.9212570190429688, "step": 1130 }, { - "debug/losses": 0.02779330313205719, - "debug/policy_weights": 0.04451151564717293, - "debug/raw_losses": 0.6167975068092346, - "epoch": 0.9072821329088738, - "grad_norm": 1.1093999376056516, + "epoch": 0.91, "learning_rate": 1.2888473281864597e-08, - "logits/chosen": -1.670432686805725, - "logits/rejected": -1.5831201076507568, - "logps/chosen": -379.52020263671875, - "logps/rejected": -415.2666931152344, - "loss": 0.0234, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -2.217050075531006, - "rewards/margins": 0.4062577188014984, - "rewards/rejected": -2.623307704925537, + "logits/chosen": 0.14212054014205933, + "logits/rejected": 0.47429710626602173, + "logps/chosen": -367.8409729003906, + "logps/rejected": -435.02764892578125, + "loss": 0.5369, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.2534115314483643, + "rewards/margins": 0.7273036241531372, + "rewards/rejected": -2.980715274810791, "step": 1140 }, { - "debug/losses": 0.02286931499838829, - "debug/policy_weights": 0.039380840957164764, - "debug/raw_losses": 0.5585586428642273, - "epoch": 0.9152407481098289, - "grad_norm": 1.024708050069526, + "epoch": 0.92, "learning_rate": 1.0777529692427679e-08, - "logits/chosen": -1.5818754434585571, - "logits/rejected": -1.4253770112991333, - "logps/chosen": -367.00750732421875, - "logps/rejected": -408.79974365234375, - "loss": 0.0232, + "logits/chosen": 0.04115242511034012, + "logits/rejected": 0.28970104455947876, + "logps/chosen": -372.7949523925781, + "logps/rejected": -456.10675048828125, + "loss": 0.5265, "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.1622977256774902, - "rewards/margins": 0.5264004468917847, - "rewards/rejected": -2.6886980533599854, + "rewards/chosen": -2.300356388092041, + "rewards/margins": 0.8059718012809753, + "rewards/rejected": -3.106328248977661, "step": 1150 }, { - "debug/losses": 0.025199243798851967, - "debug/policy_weights": 0.03711647912859917, - "debug/raw_losses": 0.5976914167404175, - "epoch": 0.9231993633107839, - "grad_norm": 1.1752665323906102, + "epoch": 0.92, "learning_rate": 8.851477564560061e-09, - "logits/chosen": -1.5626524686813354, - "logits/rejected": -1.385860800743103, - "logps/chosen": -362.3895263671875, - "logps/rejected": -427.11468505859375, - "loss": 0.023, + "logits/chosen": 0.0867738351225853, + "logits/rejected": 0.4068300127983093, + "logps/chosen": -372.08636474609375, + "logps/rejected": -426.42388916015625, + "loss": 0.5342, "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.2021470069885254, - "rewards/margins": 0.5378574728965759, - "rewards/rejected": -2.740004539489746, + "rewards/chosen": -2.331385850906372, + "rewards/margins": 0.6490964293479919, + "rewards/rejected": -2.9804821014404297, "step": 1160 }, { - "debug/losses": 0.023396309465169907, - "debug/policy_weights": 0.045775819569826126, - "debug/raw_losses": 0.56801837682724, - "epoch": 0.931157978511739, - "grad_norm": 1.0873844543068034, + "epoch": 0.93, "learning_rate": 7.111805515081531e-09, - "logits/chosen": -1.593715786933899, - "logits/rejected": -1.4320154190063477, - "logps/chosen": -387.57379150390625, - "logps/rejected": -439.60333251953125, - "loss": 0.0227, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.3309168815612793, - "rewards/margins": 0.5536500215530396, - "rewards/rejected": -2.8845667839050293, + "logits/chosen": 0.02022993005812168, + "logits/rejected": 0.41968393325805664, + "logps/chosen": -363.818603515625, + "logps/rejected": -447.7919006347656, + "loss": 0.5312, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.2372307777404785, + "rewards/margins": 0.8540315628051758, + "rewards/rejected": -3.0912623405456543, "step": 1170 }, { - "debug/losses": 0.026339393109083176, - "debug/policy_weights": 0.043764419853687286, - "debug/raw_losses": 0.5899370908737183, - "epoch": 0.939116593712694, - "grad_norm": 1.026780874391343, + "epoch": 0.94, "learning_rate": 5.559858110443016e-09, - "logits/chosen": -1.6738373041152954, - "logits/rejected": -1.5380009412765503, - "logps/chosen": -377.5504150390625, - "logps/rejected": -429.8271484375, - "loss": 0.0218, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.216423749923706, - "rewards/margins": 0.5017086267471313, - "rewards/rejected": -2.7181320190429688, + "logits/chosen": 0.29695388674736023, + "logits/rejected": 0.714096188545227, + "logps/chosen": -372.5519714355469, + "logps/rejected": -442.5354919433594, + "loss": 0.5383, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.3107995986938477, + "rewards/margins": 0.8070123791694641, + "rewards/rejected": -3.117811918258667, "step": 1180 }, { - "debug/losses": 0.01717524789273739, - "debug/policy_weights": 0.033865705132484436, - "debug/raw_losses": 0.5577677488327026, - "epoch": 0.947075208913649, - "grad_norm": 1.2386891264420188, + "epoch": 0.95, "learning_rate": 4.196834827531276e-09, - "logits/chosen": -1.5762865543365479, - "logits/rejected": -1.4528791904449463, - "logps/chosen": -376.5120544433594, - "logps/rejected": -440.51055908203125, - "loss": 0.0207, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.3341188430786133, - "rewards/margins": 0.6440759897232056, - "rewards/rejected": -2.9781947135925293, + "logits/chosen": 0.140055850148201, + "logits/rejected": 0.3409932255744934, + "logps/chosen": -355.64324951171875, + "logps/rejected": -447.585693359375, + "loss": 0.5152, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.149151563644409, + "rewards/margins": 0.7904965877532959, + "rewards/rejected": -2.939648151397705, "step": 1190 }, { - "debug/losses": 0.018522335216403008, - "debug/policy_weights": 0.03287139907479286, - "debug/raw_losses": 0.5465400218963623, - "epoch": 0.955033824114604, - "grad_norm": 1.0907629974938928, + "epoch": 0.96, "learning_rate": 3.023789126611137e-09, - "logits/chosen": -1.6226059198379517, - "logits/rejected": -1.4353219270706177, - "logps/chosen": -370.7825927734375, - "logps/rejected": -422.1617126464844, - "loss": 0.0216, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.234524965286255, - "rewards/margins": 0.5858966112136841, - "rewards/rejected": -2.8204216957092285, + "logits/chosen": 0.03294936567544937, + "logits/rejected": 0.2933207154273987, + "logps/chosen": -363.29290771484375, + "logps/rejected": -435.640380859375, + "loss": 0.5483, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.281057357788086, + "rewards/margins": 0.7091296911239624, + "rewards/rejected": -2.990186929702759, "step": 1200 }, { - "epoch": 0.955033824114604, - "eval_debug/losses": 0.021226363256573677, - "eval_debug/policy_weights": 0.03739428520202637, - "eval_debug/raw_losses": 0.5681630373001099, - "eval_logits/chosen": -1.583396553993225, - "eval_logits/rejected": -1.4622374773025513, - "eval_logps/chosen": -375.3782043457031, - "eval_logps/rejected": -435.4866638183594, - "eval_loss": 0.022734906524419785, - "eval_rewards/accuracies": 0.6930969953536987, - "eval_rewards/chosen": -2.311347484588623, - "eval_rewards/margins": 0.536529541015625, - "eval_rewards/rejected": -2.847877264022827, - "eval_runtime": 152.9233, - "eval_samples_per_second": 55.923, - "eval_steps_per_second": 0.876, + "epoch": 0.96, + "eval_logits/chosen": 0.07418080419301987, + "eval_logits/rejected": 0.32435521483421326, + "eval_logps/chosen": -373.978515625, + "eval_logps/rejected": -451.6764831542969, + "eval_loss": 0.5440130829811096, + "eval_rewards/accuracies": 0.7089552283287048, + "eval_rewards/chosen": -2.2940499782562256, + "eval_rewards/margins": 0.7113713622093201, + "eval_rewards/rejected": -3.0054211616516113, + "eval_runtime": 183.9633, + "eval_samples_per_second": 46.488, + "eval_steps_per_second": 0.728, "step": 1200 }, { - "debug/losses": 0.01947428658604622, - "debug/policy_weights": 0.03627028688788414, - "debug/raw_losses": 0.5707622766494751, - "epoch": 0.9629924393155591, - "grad_norm": 1.0632313642647104, + "epoch": 0.96, "learning_rate": 2.041627637121929e-09, - "logits/chosen": -1.5518733263015747, - "logits/rejected": -1.381583333015442, - "logps/chosen": -376.3943786621094, - "logps/rejected": -449.06536865234375, - "loss": 0.0218, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.3494064807891846, - "rewards/margins": 0.5541393756866455, - "rewards/rejected": -2.90354585647583, + "logits/chosen": 0.10010697692632675, + "logits/rejected": 0.3795483410358429, + "logps/chosen": -348.8675231933594, + "logps/rejected": -437.20361328125, + "loss": 0.5398, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.087364673614502, + "rewards/margins": 0.828387439250946, + "rewards/rejected": -2.9157521724700928, "step": 1210 }, { - "debug/losses": 0.02166592888534069, - "debug/policy_weights": 0.03883267566561699, - "debug/raw_losses": 0.6050316095352173, - "epoch": 0.9709510545165141, - "grad_norm": 1.2818450205185703, + "epoch": 0.97, "learning_rate": 1.2511094569571668e-09, - "logits/chosen": -1.5095674991607666, - "logits/rejected": -1.303008794784546, - "logps/chosen": -378.7860412597656, - "logps/rejected": -401.0227966308594, - "loss": 0.0226, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -2.3036277294158936, - "rewards/margins": 0.4543466567993164, - "rewards/rejected": -2.757974147796631, + "logits/chosen": 0.09991980344057083, + "logits/rejected": 0.4467397630214691, + "logps/chosen": -380.14520263671875, + "logps/rejected": -440.24658203125, + "loss": 0.5345, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.253425121307373, + "rewards/margins": 0.702509880065918, + "rewards/rejected": -2.955935001373291, "step": 1220 }, { - "debug/losses": 0.019198119640350342, - "debug/policy_weights": 0.03157269209623337, - "debug/raw_losses": 0.5717985033988953, - "epoch": 0.9789096697174692, - "grad_norm": 1.048649146210472, + "epoch": 0.98, "learning_rate": 6.528455657691112e-10, - "logits/chosen": -1.4732084274291992, - "logits/rejected": -1.4151369333267212, - "logps/chosen": -380.28582763671875, - "logps/rejected": -445.0985412597656, - "loss": 0.0206, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.415482759475708, - "rewards/margins": 0.5356068015098572, - "rewards/rejected": -2.951089382171631, + "logits/chosen": 0.11626466363668442, + "logits/rejected": 0.41348797082901, + "logps/chosen": -372.7298889160156, + "logps/rejected": -427.22576904296875, + "loss": 0.549, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2800345420837402, + "rewards/margins": 0.6291176080703735, + "rewards/rejected": -2.909151792526245, "step": 1230 }, { - "debug/losses": 0.022800814360380173, - "debug/policy_weights": 0.03683259338140488, - "debug/raw_losses": 0.5676442980766296, - "epoch": 0.9868682849184242, - "grad_norm": 1.3029480412499796, + "epoch": 0.99, "learning_rate": 2.4729835275189016e-10, - "logits/chosen": -1.5280263423919678, - "logits/rejected": -1.38931405544281, - "logps/chosen": -378.03851318359375, - "logps/rejected": -444.1370544433594, - "loss": 0.0213, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -2.373324394226074, - "rewards/margins": 0.5974202156066895, - "rewards/rejected": -2.9707443714141846, + "logits/chosen": 0.06715863198041916, + "logits/rejected": 0.29241910576820374, + "logps/chosen": -393.8903503417969, + "logps/rejected": -477.9420471191406, + "loss": 0.5462, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.323488473892212, + "rewards/margins": 0.8067766427993774, + "rewards/rejected": -3.1302647590637207, "step": 1240 }, { - "debug/losses": 0.018399138003587723, - "debug/policy_weights": 0.034402213990688324, - "debug/raw_losses": 0.5340577363967896, - "epoch": 0.9948269001193792, - "grad_norm": 1.3789162938036494, + "epoch": 0.99, "learning_rate": 3.478125926756337e-11, - "logits/chosen": -1.4897087812423706, - "logits/rejected": -1.3781123161315918, - "logps/chosen": -379.60504150390625, - "logps/rejected": -452.0511169433594, - "loss": 0.0207, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.3788774013519287, - "rewards/margins": 0.6080917716026306, - "rewards/rejected": -2.986969470977783, + "logits/chosen": 0.25983649492263794, + "logits/rejected": 0.4905417561531067, + "logps/chosen": -364.73431396484375, + "logps/rejected": -443.79296875, + "loss": 0.5474, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2353272438049316, + "rewards/margins": 0.771331787109375, + "rewards/rejected": -3.0066590309143066, "step": 1250 }, { - "epoch": 0.9996020692399522, + "epoch": 1.0, "step": 1256, "total_flos": 0.0, - "train_loss": 0.048100706314442646, - "train_runtime": 10605.3952, - "train_samples_per_second": 15.162, - "train_steps_per_second": 0.118 + "train_loss": 0.5712926928784438, + "train_runtime": 11605.7995, + "train_samples_per_second": 13.855, + "train_steps_per_second": 0.108 } ], "logging_steps": 10, "max_steps": 1256, - "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, "total_flos": 0.0, - "train_batch_size": 8, "trial_name": null, "trial_params": null }