{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996190476190476, "eval_steps": 500, "global_step": 656, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 7.575757575757576e-08, "logits/chosen": -0.06230628490447998, "logits/rejected": 0.387611985206604, "logps/chosen": -299.96368408203125, "logps/rejected": -309.7692565917969, "loss": 0.3457, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 7.575757575757576e-07, "logits/chosen": 0.10603617876768112, "logits/rejected": 0.24432696402072906, "logps/chosen": -349.7105712890625, "logps/rejected": -287.6678466796875, "loss": 0.3317, "rewards/accuracies": 0.3611111044883728, "rewards/chosen": -0.00022038168390281498, "rewards/margins": -0.0001302291639149189, "rewards/rejected": -9.01525272638537e-05, "step": 10 }, { "epoch": 0.03, "learning_rate": 1.5151515151515152e-06, "logits/chosen": 0.09682648628950119, "logits/rejected": 0.25353121757507324, "logps/chosen": -334.0283203125, "logps/rejected": -270.40179443359375, "loss": 0.3365, "rewards/accuracies": 0.4375, "rewards/chosen": 8.945484069045051e-07, "rewards/margins": 2.460894847899908e-06, "rewards/rejected": -1.5663565591239603e-06, "step": 20 }, { "epoch": 0.05, "learning_rate": 2.2727272727272728e-06, "logits/chosen": 0.12519071996212006, "logits/rejected": 0.20205554366111755, "logps/chosen": -287.5645446777344, "logps/rejected": -266.9909362792969, "loss": 0.3417, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0003230299334973097, "rewards/margins": 0.0002776235924102366, "rewards/rejected": 4.540634108707309e-05, "step": 30 }, { "epoch": 0.06, "learning_rate": 3.0303030303030305e-06, "logits/chosen": 0.04107561707496643, "logits/rejected": 0.20281007885932922, "logps/chosen": -319.4472351074219, "logps/rejected": -262.85418701171875, "loss": 0.3615, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0008775835740379989, "rewards/margins": 0.0006280745146796107, "rewards/rejected": 0.0002495090593583882, "step": 40 }, { "epoch": 0.08, "learning_rate": 3.7878787878787882e-06, "logits/chosen": 0.12531228363513947, "logits/rejected": 0.27179789543151855, "logps/chosen": -359.4334411621094, "logps/rejected": -278.2669982910156, "loss": 0.3412, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.0017220426816493273, "rewards/margins": 0.001711520948447287, "rewards/rejected": 1.0521779586269986e-05, "step": 50 }, { "epoch": 0.09, "learning_rate": 4.5454545454545455e-06, "logits/chosen": 0.07350507378578186, "logits/rejected": 0.22942480444908142, "logps/chosen": -346.56927490234375, "logps/rejected": -306.72686767578125, "loss": 0.3553, "rewards/accuracies": 0.78125, "rewards/chosen": 0.0025817144196480513, "rewards/margins": 0.003724290756508708, "rewards/rejected": -0.001142576104030013, "step": 60 }, { "epoch": 0.11, "learning_rate": 4.999432965739786e-06, "logits/chosen": 0.09025775641202927, "logits/rejected": 0.31351155042648315, "logps/chosen": -381.5528869628906, "logps/rejected": -286.2391662597656, "loss": 0.3633, "rewards/accuracies": 0.625, "rewards/chosen": 0.0029081101529300213, "rewards/margins": 0.0062509761191904545, "rewards/rejected": -0.003342865500599146, "step": 70 }, { "epoch": 0.12, "learning_rate": 4.9930567839810125e-06, "logits/chosen": 0.0820033922791481, "logits/rejected": 0.2647712826728821, "logps/chosen": -350.4310607910156, "logps/rejected": -280.1022644042969, "loss": 0.3175, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.0049164495430886745, "rewards/margins": 0.010990149341523647, "rewards/rejected": -0.006073700729757547, "step": 80 }, { "epoch": 0.14, "learning_rate": 4.979613761906212e-06, "logits/chosen": 0.11785046756267548, "logits/rejected": 0.3276062607765198, "logps/chosen": -370.00860595703125, "logps/rejected": -322.6431579589844, "loss": 0.3244, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.004425252787768841, "rewards/margins": 0.019340159371495247, "rewards/rejected": -0.014914905652403831, "step": 90 }, { "epoch": 0.15, "learning_rate": 4.959142005221991e-06, "logits/chosen": 0.19294002652168274, "logits/rejected": 0.3200022578239441, "logps/chosen": -369.6689147949219, "logps/rejected": -329.09893798828125, "loss": 0.2978, "rewards/accuracies": 0.75, "rewards/chosen": -0.018663963302969933, "rewards/margins": 0.04274021461606026, "rewards/rejected": -0.06140417978167534, "step": 100 }, { "epoch": 0.17, "learning_rate": 4.931699543346854e-06, "logits/chosen": 0.23536458611488342, "logits/rejected": 0.35565608739852905, "logps/chosen": -432.9308166503906, "logps/rejected": -432.2447204589844, "loss": 0.2948, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.07575322687625885, "rewards/margins": 0.06633279472589493, "rewards/rejected": -0.14208602905273438, "step": 110 }, { "epoch": 0.18, "learning_rate": 4.897364164920515e-06, "logits/chosen": 0.24238090217113495, "logits/rejected": 0.2669451832771301, "logps/chosen": -502.5521545410156, "logps/rejected": -526.5723876953125, "loss": 0.286, "rewards/accuracies": 0.65625, "rewards/chosen": -0.16139724850654602, "rewards/margins": 0.09947613626718521, "rewards/rejected": -0.26087337732315063, "step": 120 }, { "epoch": 0.2, "learning_rate": 4.8562331973035396e-06, "logits/chosen": 0.24233976006507874, "logits/rejected": 0.37175804376602173, "logps/chosen": -561.4464721679688, "logps/rejected": -616.5276489257812, "loss": 0.2701, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1863406002521515, "rewards/margins": 0.11555895954370499, "rewards/rejected": -0.30189958214759827, "step": 130 }, { "epoch": 0.21, "learning_rate": 4.808423230692374e-06, "logits/chosen": 0.2752463221549988, "logits/rejected": 0.41067615151405334, "logps/chosen": -544.420654296875, "logps/rejected": -609.9964599609375, "loss": 0.2437, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.16954469680786133, "rewards/margins": 0.15670140087604523, "rewards/rejected": -0.32624611258506775, "step": 140 }, { "epoch": 0.23, "learning_rate": 4.754069787631761e-06, "logits/chosen": 0.22061054408550262, "logits/rejected": 0.38979893922805786, "logps/chosen": -499.3285217285156, "logps/rejected": -558.7503051757812, "loss": 0.2709, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1981675624847412, "rewards/margins": 0.09831173717975616, "rewards/rejected": -0.2964792847633362, "step": 150 }, { "epoch": 0.24, "learning_rate": 4.693326938861367e-06, "logits/chosen": 0.2467188537120819, "logits/rejected": 0.35028940439224243, "logps/chosen": -526.26513671875, "logps/rejected": -599.4227294921875, "loss": 0.2605, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.1936112642288208, "rewards/margins": 0.1178494542837143, "rewards/rejected": -0.3114607036113739, "step": 160 }, { "epoch": 0.26, "learning_rate": 4.626366866585528e-06, "logits/chosen": 0.22420647740364075, "logits/rejected": 0.4213325083255768, "logps/chosen": -622.0628662109375, "logps/rejected": -708.9386596679688, "loss": 0.2402, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.2440744936466217, "rewards/margins": 0.15784001350402832, "rewards/rejected": -0.40191444754600525, "step": 170 }, { "epoch": 0.27, "learning_rate": 4.553379376404085e-06, "logits/chosen": 0.27953073382377625, "logits/rejected": 0.3603507876396179, "logps/chosen": -601.0642700195312, "logps/rejected": -663.9993896484375, "loss": 0.2735, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2422056943178177, "rewards/margins": 0.1294884979724884, "rewards/rejected": -0.3716942071914673, "step": 180 }, { "epoch": 0.29, "learning_rate": 4.474571359287791e-06, "logits/chosen": 0.23259811103343964, "logits/rejected": 0.4210383892059326, "logps/chosen": -572.4783935546875, "logps/rejected": -687.89306640625, "loss": 0.2247, "rewards/accuracies": 0.75, "rewards/chosen": -0.20176899433135986, "rewards/margins": 0.18673528730869293, "rewards/rejected": -0.3885042667388916, "step": 190 }, { "epoch": 0.3, "learning_rate": 4.3901662051233755e-06, "logits/chosen": 0.2610084116458893, "logits/rejected": 0.3713424801826477, "logps/chosen": -539.9381103515625, "logps/rejected": -680.5265502929688, "loss": 0.2243, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.20527365803718567, "rewards/margins": 0.1885356605052948, "rewards/rejected": -0.3938092887401581, "step": 200 }, { "epoch": 0.32, "learning_rate": 4.30040316949064e-06, "logits/chosen": 0.2715567946434021, "logits/rejected": 0.3666667640209198, "logps/chosen": -536.0437622070312, "logps/rejected": -629.3165283203125, "loss": 0.248, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.18539923429489136, "rewards/margins": 0.16864952445030212, "rewards/rejected": -0.35404878854751587, "step": 210 }, { "epoch": 0.34, "learning_rate": 4.205536695466524e-06, "logits/chosen": 0.23633995652198792, "logits/rejected": 0.3769295811653137, "logps/chosen": -584.9187622070312, "logps/rejected": -705.7189331054688, "loss": 0.2185, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.2217777669429779, "rewards/margins": 0.20478694140911102, "rewards/rejected": -0.4265647530555725, "step": 220 }, { "epoch": 0.35, "learning_rate": 4.105835692378557e-06, "logits/chosen": 0.29004430770874023, "logits/rejected": 0.4667590260505676, "logps/chosen": -701.4776611328125, "logps/rejected": -798.4501953125, "loss": 0.2323, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.316620796918869, "rewards/margins": 0.18385566771030426, "rewards/rejected": -0.5004764795303345, "step": 230 }, { "epoch": 0.37, "learning_rate": 4.001582773552153e-06, "logits/chosen": 0.27633368968963623, "logits/rejected": 0.42621952295303345, "logps/chosen": -621.6236572265625, "logps/rejected": -752.4913940429688, "loss": 0.2466, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.2713666260242462, "rewards/margins": 0.17953212559223175, "rewards/rejected": -0.4508987367153168, "step": 240 }, { "epoch": 0.38, "learning_rate": 3.893073455212438e-06, "logits/chosen": 0.2567233443260193, "logits/rejected": 0.3767459988594055, "logps/chosen": -553.5460205078125, "logps/rejected": -691.7283935546875, "loss": 0.2438, "rewards/accuracies": 0.75, "rewards/chosen": -0.23452623188495636, "rewards/margins": 0.17362844944000244, "rewards/rejected": -0.4081546664237976, "step": 250 }, { "epoch": 0.4, "learning_rate": 3.7806153188114027e-06, "logits/chosen": 0.24300554394721985, "logits/rejected": 0.30316272377967834, "logps/chosen": -578.8638916015625, "logps/rejected": -715.2518310546875, "loss": 0.2468, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.2516656219959259, "rewards/margins": 0.16586793959140778, "rewards/rejected": -0.4175335466861725, "step": 260 }, { "epoch": 0.41, "learning_rate": 3.6645271391548542e-06, "logits/chosen": 0.23739242553710938, "logits/rejected": 0.426413357257843, "logps/chosen": -634.893798828125, "logps/rejected": -719.962890625, "loss": 0.2531, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.25505417585372925, "rewards/margins": 0.18401430547237396, "rewards/rejected": -0.4390684962272644, "step": 270 }, { "epoch": 0.43, "learning_rate": 3.5451379808006014e-06, "logits/chosen": 0.2224760502576828, "logits/rejected": 0.3537066578865051, "logps/chosen": -589.7808227539062, "logps/rejected": -747.9609375, "loss": 0.2283, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2638700306415558, "rewards/margins": 0.1954762488603592, "rewards/rejected": -0.45934629440307617, "step": 280 }, { "epoch": 0.44, "learning_rate": 3.4227862652892106e-06, "logits/chosen": 0.23644611239433289, "logits/rejected": 0.4177249073982239, "logps/chosen": -645.470703125, "logps/rejected": -779.50927734375, "loss": 0.2437, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.27952614426612854, "rewards/margins": 0.19378896057605743, "rewards/rejected": -0.4733150601387024, "step": 290 }, { "epoch": 0.46, "learning_rate": 3.2978188118513814e-06, "logits/chosen": 0.2835056483745575, "logits/rejected": 0.3992118239402771, "logps/chosen": -620.6287841796875, "logps/rejected": -733.6190185546875, "loss": 0.2377, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.2810095250606537, "rewards/margins": 0.17397195100784302, "rewards/rejected": -0.4549815058708191, "step": 300 }, { "epoch": 0.47, "learning_rate": 3.1705898543111576e-06, "logits/chosen": 0.2648586630821228, "logits/rejected": 0.36566048860549927, "logps/chosen": -600.49853515625, "logps/rejected": -694.8892822265625, "loss": 0.2408, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2665795385837555, "rewards/margins": 0.14690735936164856, "rewards/rejected": -0.41348689794540405, "step": 310 }, { "epoch": 0.49, "learning_rate": 3.041460036971664e-06, "logits/chosen": 0.2214949131011963, "logits/rejected": 0.38505813479423523, "logps/chosen": -669.3818969726562, "logps/rejected": -743.3441162109375, "loss": 0.2197, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.27733737230300903, "rewards/margins": 0.16739198565483093, "rewards/rejected": -0.4447293281555176, "step": 320 }, { "epoch": 0.5, "learning_rate": 2.910795392329649e-06, "logits/chosen": 0.19933928549289703, "logits/rejected": 0.3905678391456604, "logps/chosen": -631.4853515625, "logps/rejected": -736.2476806640625, "loss": 0.2262, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2883307933807373, "rewards/margins": 0.17361871898174286, "rewards/rejected": -0.46194949746131897, "step": 330 }, { "epoch": 0.52, "learning_rate": 2.7789663035166035e-06, "logits/chosen": 0.19208988547325134, "logits/rejected": 0.3280327022075653, "logps/chosen": -637.7025146484375, "logps/rejected": -742.5934448242188, "loss": 0.229, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2803892493247986, "rewards/margins": 0.18691286444664001, "rewards/rejected": -0.46730202436447144, "step": 340 }, { "epoch": 0.53, "learning_rate": 2.6463464544075344e-06, "logits/chosen": 0.2192709892988205, "logits/rejected": 0.354716956615448, "logps/chosen": -666.2679443359375, "logps/rejected": -781.5311889648438, "loss": 0.2365, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.313618928194046, "rewards/margins": 0.2000730335712433, "rewards/rejected": -0.5136920213699341, "step": 350 }, { "epoch": 0.55, "learning_rate": 2.513311770373421e-06, "logits/chosen": 0.24396447837352753, "logits/rejected": 0.3740311563014984, "logps/chosen": -632.1403198242188, "logps/rejected": -780.8329467773438, "loss": 0.2137, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.29758018255233765, "rewards/margins": 0.20380613207817078, "rewards/rejected": -0.501386284828186, "step": 360 }, { "epoch": 0.56, "learning_rate": 2.380239352679908e-06, "logits/chosen": 0.2605392634868622, "logits/rejected": 0.3702816367149353, "logps/chosen": -659.3692626953125, "logps/rejected": -771.1994018554688, "loss": 0.2134, "rewards/accuracies": 0.75, "rewards/chosen": -0.2961762547492981, "rewards/margins": 0.19712337851524353, "rewards/rejected": -0.49329957365989685, "step": 370 }, { "epoch": 0.58, "learning_rate": 2.247506409552795e-06, "logits/chosen": 0.2626807987689972, "logits/rejected": 0.4523330330848694, "logps/chosen": -723.3246459960938, "logps/rejected": -843.2516479492188, "loss": 0.2222, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.33206573128700256, "rewards/margins": 0.2155541181564331, "rewards/rejected": -0.5476198792457581, "step": 380 }, { "epoch": 0.59, "learning_rate": 2.1154891869403436e-06, "logits/chosen": 0.2673259675502777, "logits/rejected": 0.3809369206428528, "logps/chosen": -627.7708740234375, "logps/rejected": -776.5219116210938, "loss": 0.2345, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2913658320903778, "rewards/margins": 0.19852493703365326, "rewards/rejected": -0.48989081382751465, "step": 390 }, { "epoch": 0.61, "learning_rate": 1.9845619020032552e-06, "logits/chosen": 0.22264230251312256, "logits/rejected": 0.36639919877052307, "logps/chosen": -656.4054565429688, "logps/rejected": -789.1956787109375, "loss": 0.2189, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.30770599842071533, "rewards/margins": 0.20198026299476624, "rewards/rejected": -0.5096862316131592, "step": 400 }, { "epoch": 0.62, "learning_rate": 1.8550956823554708e-06, "logits/chosen": 0.25498437881469727, "logits/rejected": 0.39235639572143555, "logps/chosen": -597.4380493164062, "logps/rejected": -743.4146728515625, "loss": 0.2504, "rewards/accuracies": 0.75, "rewards/chosen": -0.2715555429458618, "rewards/margins": 0.1882806420326233, "rewards/rejected": -0.4598361849784851, "step": 410 }, { "epoch": 0.64, "learning_rate": 1.7274575140626318e-06, "logits/chosen": 0.2268383502960205, "logits/rejected": 0.3757990598678589, "logps/chosen": -628.30517578125, "logps/rejected": -744.66455078125, "loss": 0.2356, "rewards/accuracies": 0.75, "rewards/chosen": -0.2700210213661194, "rewards/margins": 0.19132563471794128, "rewards/rejected": -0.4613465666770935, "step": 420 }, { "epoch": 0.66, "learning_rate": 1.6020092013802002e-06, "logits/chosen": 0.26629549264907837, "logits/rejected": 0.38806790113449097, "logps/chosen": -643.1737060546875, "logps/rejected": -752.5285034179688, "loss": 0.2192, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.28404101729393005, "rewards/margins": 0.18914134800434113, "rewards/rejected": -0.4731822907924652, "step": 430 }, { "epoch": 0.67, "learning_rate": 1.4791063411799938e-06, "logits/chosen": 0.17316266894340515, "logits/rejected": 0.3880611062049866, "logps/chosen": -670.3546142578125, "logps/rejected": -764.1770629882812, "loss": 0.2366, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3103855550289154, "rewards/margins": 0.17021675407886505, "rewards/rejected": -0.48060232400894165, "step": 440 }, { "epoch": 0.69, "learning_rate": 1.3590973149722103e-06, "logits/chosen": 0.24187886714935303, "logits/rejected": 0.37198665738105774, "logps/chosen": -628.3953857421875, "logps/rejected": -773.7864990234375, "loss": 0.2157, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.3083949089050293, "rewards/margins": 0.18738897144794464, "rewards/rejected": -0.49578380584716797, "step": 450 }, { "epoch": 0.7, "learning_rate": 1.2423223013801946e-06, "logits/chosen": 0.19144193828105927, "logits/rejected": 0.3342723846435547, "logps/chosen": -679.7613525390625, "logps/rejected": -820.7508544921875, "loss": 0.2274, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3411591649055481, "rewards/margins": 0.18641141057014465, "rewards/rejected": -0.5275705456733704, "step": 460 }, { "epoch": 0.72, "learning_rate": 1.1291123118671665e-06, "logits/chosen": 0.21762657165527344, "logits/rejected": 0.307900607585907, "logps/chosen": -651.0516967773438, "logps/rejected": -805.5900268554688, "loss": 0.2194, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.309937983751297, "rewards/margins": 0.20782515406608582, "rewards/rejected": -0.5177631378173828, "step": 470 }, { "epoch": 0.73, "learning_rate": 1.019788252448267e-06, "logits/chosen": 0.184129536151886, "logits/rejected": 0.3611859083175659, "logps/chosen": -619.1923828125, "logps/rejected": -762.8614501953125, "loss": 0.2191, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.30183008313179016, "rewards/margins": 0.18515999615192413, "rewards/rejected": -0.4869900643825531, "step": 480 }, { "epoch": 0.75, "learning_rate": 9.146600140475945e-07, "logits/chosen": 0.20372629165649414, "logits/rejected": 0.33465132117271423, "logps/chosen": -636.4930419921875, "logps/rejected": -756.65576171875, "loss": 0.2029, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2839277386665344, "rewards/margins": 0.197633758187294, "rewards/rejected": -0.4815615117549896, "step": 490 }, { "epoch": 0.76, "learning_rate": 8.140255940787059e-07, "logits/chosen": 0.18791750073432922, "logits/rejected": 0.4121164381504059, "logps/chosen": -636.9640502929688, "logps/rejected": -798.8194580078125, "loss": 0.2107, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.27898019552230835, "rewards/margins": 0.21688583493232727, "rewards/rejected": -0.49586600065231323, "step": 500 }, { "epoch": 0.78, "learning_rate": 7.181702517385789e-07, "logits/chosen": 0.19490660727024078, "logits/rejected": 0.3919000029563904, "logps/chosen": -693.1792602539062, "logps/rejected": -796.6826171875, "loss": 0.2212, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.30116355419158936, "rewards/margins": 0.22377757728099823, "rewards/rejected": -0.524941086769104, "step": 510 }, { "epoch": 0.79, "learning_rate": 6.273656994094232e-07, "logits/chosen": 0.21901503205299377, "logits/rejected": 0.33780670166015625, "logps/chosen": -620.2666015625, "logps/rejected": -768.0850830078125, "loss": 0.2174, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.29874101281166077, "rewards/margins": 0.19197914004325867, "rewards/rejected": -0.49072012305259705, "step": 520 }, { "epoch": 0.81, "learning_rate": 5.418693324604082e-07, "logits/chosen": 0.16905078291893005, "logits/rejected": 0.3485548198223114, "logps/chosen": -657.9064331054688, "logps/rejected": -817.0252075195312, "loss": 0.22, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3066551983356476, "rewards/margins": 0.22827258706092834, "rewards/rejected": -0.5349277853965759, "step": 530 }, { "epoch": 0.82, "learning_rate": 4.619234996325314e-07, "logits/chosen": 0.2440187931060791, "logits/rejected": 0.34625181555747986, "logps/chosen": -640.2761840820312, "logps/rejected": -762.24609375, "loss": 0.2044, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2988249659538269, "rewards/margins": 0.19742272794246674, "rewards/rejected": -0.49624767899513245, "step": 540 }, { "epoch": 0.84, "learning_rate": 3.877548160747768e-07, "logits/chosen": 0.20869365334510803, "logits/rejected": 0.31174546480178833, "logps/chosen": -660.8912353515625, "logps/rejected": -801.5670166015625, "loss": 0.2247, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3307650685310364, "rewards/margins": 0.18925593793392181, "rewards/rejected": -0.5200210213661194, "step": 550 }, { "epoch": 0.85, "learning_rate": 3.195735209788528e-07, "logits/chosen": 0.20122747123241425, "logits/rejected": 0.35053348541259766, "logps/chosen": -679.0135498046875, "logps/rejected": -827.4410400390625, "loss": 0.2419, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.3423236012458801, "rewards/margins": 0.19195982813835144, "rewards/rejected": -0.534283459186554, "step": 560 }, { "epoch": 0.87, "learning_rate": 2.5757288163336806e-07, "logits/chosen": 0.26682180166244507, "logits/rejected": 0.3439808785915375, "logps/chosen": -672.2916259765625, "logps/rejected": -777.9793701171875, "loss": 0.2386, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.312061071395874, "rewards/margins": 0.18959736824035645, "rewards/rejected": -0.5016584396362305, "step": 570 }, { "epoch": 0.88, "learning_rate": 2.019286455866981e-07, "logits/chosen": 0.23167696595191956, "logits/rejected": 0.3346760869026184, "logps/chosen": -632.02587890625, "logps/rejected": -782.4437255859375, "loss": 0.224, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.29868921637535095, "rewards/margins": 0.1968630850315094, "rewards/rejected": -0.49555230140686035, "step": 580 }, { "epoch": 0.9, "learning_rate": 1.5279854247146703e-07, "logits/chosen": 0.19181525707244873, "logits/rejected": 0.3878282904624939, "logps/chosen": -682.7297973632812, "logps/rejected": -799.3133544921875, "loss": 0.221, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.3136585056781769, "rewards/margins": 0.19802789390087128, "rewards/rejected": -0.5116864442825317, "step": 590 }, { "epoch": 0.91, "learning_rate": 1.1032183690276754e-07, "logits/chosen": 0.20669761300086975, "logits/rejected": 0.3533198833465576, "logps/chosen": -694.8823852539062, "logps/rejected": -831.9597778320312, "loss": 0.1987, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3144986033439636, "rewards/margins": 0.2219509333372116, "rewards/rejected": -0.5364495515823364, "step": 600 }, { "epoch": 0.93, "learning_rate": 7.46189337174788e-08, "logits/chosen": 0.2405429631471634, "logits/rejected": 0.3472765386104584, "logps/chosen": -657.0903930664062, "logps/rejected": -797.1058959960938, "loss": 0.2369, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.310764878988266, "rewards/margins": 0.20058993995189667, "rewards/rejected": -0.5113548040390015, "step": 610 }, { "epoch": 0.94, "learning_rate": 4.579103667367385e-08, "logits/chosen": 0.2070445716381073, "logits/rejected": 0.4006822109222412, "logps/chosen": -655.3494873046875, "logps/rejected": -780.321533203125, "loss": 0.222, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.32885631918907166, "rewards/margins": 0.17823253571987152, "rewards/rejected": -0.507088840007782, "step": 620 }, { "epoch": 0.96, "learning_rate": 2.3919861577572924e-08, "logits/chosen": 0.23205919563770294, "logits/rejected": 0.3889433741569519, "logps/chosen": -668.5777587890625, "logps/rejected": -799.8650512695312, "loss": 0.235, "rewards/accuracies": 0.75, "rewards/chosen": -0.3039870858192444, "rewards/margins": 0.20831787586212158, "rewards/rejected": -0.5123049020767212, "step": 630 }, { "epoch": 0.98, "learning_rate": 9.067404651211808e-09, "logits/chosen": 0.1636372208595276, "logits/rejected": 0.3309451639652252, "logps/chosen": -615.0830688476562, "logps/rejected": -778.7999267578125, "loss": 0.2582, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3035675883293152, "rewards/margins": 0.19927778840065002, "rewards/rejected": -0.5028454065322876, "step": 640 }, { "epoch": 0.99, "learning_rate": 1.2757667974155896e-09, "logits/chosen": 0.23857417702674866, "logits/rejected": 0.35767877101898193, "logps/chosen": -685.1931762695312, "logps/rejected": -803.7652587890625, "loss": 0.2486, "rewards/accuracies": 0.75, "rewards/chosen": -0.3033232092857361, "rewards/margins": 0.2285349816083908, "rewards/rejected": -0.5318582057952881, "step": 650 }, { "epoch": 1.0, "step": 656, "total_flos": 0.0, "train_loss": 0.2497515143236009, "train_runtime": 7980.5775, "train_samples_per_second": 2.631, "train_steps_per_second": 0.082 } ], "logging_steps": 10, "max_steps": 656, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }