{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996190476190476, "eval_steps": 500, "global_step": 1312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 3.787878787878788e-08, "logits/chosen": 0.17224127054214478, "logits/rejected": 0.18124699592590332, "logps/chosen": -379.32623291015625, "logps/rejected": -349.5926208496094, "loss": 0.2902, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 3.787878787878788e-07, "logits/chosen": 0.07856506109237671, "logits/rejected": 0.2510358691215515, "logps/chosen": -334.9958190917969, "logps/rejected": -283.54034423828125, "loss": 0.3745, "rewards/accuracies": 0.4375, "rewards/chosen": 1.0113296411873307e-05, "rewards/margins": 5.3244151786202565e-05, "rewards/rejected": -4.313084718887694e-05, "step": 10 }, { "epoch": 0.02, "learning_rate": 7.575757575757576e-07, "logits/chosen": 0.07835443317890167, "logits/rejected": 0.23690445721149445, "logps/chosen": -343.5411682128906, "logps/rejected": -300.49774169921875, "loss": 0.345, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 9.991687193178223e-07, "rewards/margins": 5.578011041507125e-05, "rewards/rejected": -5.478093953570351e-05, "step": 20 }, { "epoch": 0.02, "learning_rate": 1.1363636363636364e-06, "logits/chosen": 0.09362699836492538, "logits/rejected": 0.24969105422496796, "logps/chosen": -384.11199951171875, "logps/rejected": -290.873779296875, "loss": 0.3667, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00015662855003029108, "rewards/margins": 9.987165867642034e-06, "rewards/rejected": 0.0001466413668822497, "step": 30 }, { "epoch": 0.03, "learning_rate": 1.5151515151515152e-06, "logits/chosen": 0.08275317400693893, "logits/rejected": 0.20892572402954102, "logps/chosen": -361.24462890625, "logps/rejected": -296.53094482421875, "loss": 0.3012, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.00019404105842113495, "rewards/margins": 0.000132537359604612, "rewards/rejected": 6.150371336843818e-05, "step": 40 }, { "epoch": 0.04, "learning_rate": 1.8939393939393941e-06, "logits/chosen": 0.14996236562728882, "logits/rejected": 0.2148081511259079, "logps/chosen": -339.32916259765625, "logps/rejected": -292.900146484375, "loss": 0.3391, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0002820778754539788, "rewards/margins": 0.00035016084439121187, "rewards/rejected": -6.80829762131907e-05, "step": 50 }, { "epoch": 0.05, "learning_rate": 2.2727272727272728e-06, "logits/chosen": 0.09292325377464294, "logits/rejected": 0.25342074036598206, "logps/chosen": -357.79498291015625, "logps/rejected": -279.56475830078125, "loss": 0.3523, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.0005591081571765244, "rewards/margins": 0.0007476316532120109, "rewards/rejected": -0.00018852358334697783, "step": 60 }, { "epoch": 0.05, "learning_rate": 2.6515151515151514e-06, "logits/chosen": 0.1520170122385025, "logits/rejected": 0.23838527500629425, "logps/chosen": -344.97381591796875, "logps/rejected": -272.6285400390625, "loss": 0.333, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.000808493874501437, "rewards/margins": 0.0009522804175503552, "rewards/rejected": -0.0001437865139450878, "step": 70 }, { "epoch": 0.06, "learning_rate": 3.0303030303030305e-06, "logits/chosen": 0.10541319847106934, "logits/rejected": 0.2519112229347229, "logps/chosen": -337.99932861328125, "logps/rejected": -285.3942565917969, "loss": 0.3276, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.0017709163948893547, "rewards/margins": 0.0019623935222625732, "rewards/rejected": -0.00019147712737321854, "step": 80 }, { "epoch": 0.07, "learning_rate": 3.409090909090909e-06, "logits/chosen": 0.08055099099874496, "logits/rejected": 0.23055055737495422, "logps/chosen": -358.501220703125, "logps/rejected": -301.07373046875, "loss": 0.325, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0031805038452148438, "rewards/margins": 0.0035764030180871487, "rewards/rejected": -0.0003958995803259313, "step": 90 }, { "epoch": 0.08, "learning_rate": 3.7878787878787882e-06, "logits/chosen": 0.1381727159023285, "logits/rejected": 0.36128586530685425, "logps/chosen": -408.2177734375, "logps/rejected": -295.70391845703125, "loss": 0.3551, "rewards/accuracies": 0.6875, "rewards/chosen": 0.005324442870914936, "rewards/margins": 0.006567128002643585, "rewards/rejected": -0.00124268583022058, "step": 100 }, { "epoch": 0.08, "learning_rate": 4.166666666666667e-06, "logits/chosen": 0.11063258349895477, "logits/rejected": 0.3397473692893982, "logps/chosen": -365.5294494628906, "logps/rejected": -296.68212890625, "loss": 0.3176, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0065561323426663876, "rewards/margins": 0.009804257191717625, "rewards/rejected": -0.0032481239177286625, "step": 110 }, { "epoch": 0.09, "learning_rate": 4.5454545454545455e-06, "logits/chosen": 0.10760724544525146, "logits/rejected": 0.3315739035606384, "logps/chosen": -369.8674011230469, "logps/rejected": -290.553955078125, "loss": 0.2862, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.009247648529708385, "rewards/margins": 0.017721759155392647, "rewards/rejected": -0.008474110625684261, "step": 120 }, { "epoch": 0.1, "learning_rate": 4.924242424242425e-06, "logits/chosen": 0.19570864737033844, "logits/rejected": 0.29995661973953247, "logps/chosen": -357.7752685546875, "logps/rejected": -298.80865478515625, "loss": 0.3091, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.0016540288925170898, "rewards/margins": 0.030770784243941307, "rewards/rejected": -0.029116755351424217, "step": 130 }, { "epoch": 0.11, "learning_rate": 4.999432965739786e-06, "logits/chosen": 0.18279646337032318, "logits/rejected": 0.24193449318408966, "logps/chosen": -326.20623779296875, "logps/rejected": -340.66204833984375, "loss": 0.3146, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02705274522304535, "rewards/margins": 0.038378529250621796, "rewards/rejected": -0.06543128192424774, "step": 140 }, { "epoch": 0.11, "learning_rate": 4.997129829895409e-06, "logits/chosen": 0.14213022589683533, "logits/rejected": 0.2512076199054718, "logps/chosen": -408.73541259765625, "logps/rejected": -432.90771484375, "loss": 0.2979, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0731552243232727, "rewards/margins": 0.08651129901409149, "rewards/rejected": -0.159666508436203, "step": 150 }, { "epoch": 0.12, "learning_rate": 4.9930567839810125e-06, "logits/chosen": 0.17261534929275513, "logits/rejected": 0.27892106771469116, "logps/chosen": -483.6229553222656, "logps/rejected": -527.606201171875, "loss": 0.2727, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.121568962931633, "rewards/margins": 0.10641299188137054, "rewards/rejected": -0.22798196971416473, "step": 160 }, { "epoch": 0.13, "learning_rate": 4.987216714880929e-06, "logits/chosen": 0.19240622222423553, "logits/rejected": 0.22908082604408264, "logps/chosen": -516.4906005859375, "logps/rejected": -531.3421020507812, "loss": 0.278, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.17584487795829773, "rewards/margins": 0.10210248082876205, "rewards/rejected": -0.27794739603996277, "step": 170 }, { "epoch": 0.14, "learning_rate": 4.979613761906212e-06, "logits/chosen": 0.12470928579568863, "logits/rejected": 0.2596343755722046, "logps/chosen": -551.5357666015625, "logps/rejected": -654.6400756835938, "loss": 0.2664, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.21089038252830505, "rewards/margins": 0.15304332971572876, "rewards/rejected": -0.3639337420463562, "step": 180 }, { "epoch": 0.14, "learning_rate": 4.970253313860788e-06, "logits/chosen": 0.1833106130361557, "logits/rejected": 0.2698153257369995, "logps/chosen": -562.19677734375, "logps/rejected": -634.8347778320312, "loss": 0.2832, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2445872575044632, "rewards/margins": 0.11998845636844635, "rewards/rejected": -0.36457571387290955, "step": 190 }, { "epoch": 0.15, "learning_rate": 4.959142005221991e-06, "logits/chosen": 0.13080090284347534, "logits/rejected": 0.21319513022899628, "logps/chosen": -603.390380859375, "logps/rejected": -718.3425903320312, "loss": 0.2595, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.25654521584510803, "rewards/margins": 0.16353540122509003, "rewards/rejected": -0.42008060216903687, "step": 200 }, { "epoch": 0.16, "learning_rate": 4.94628771143819e-06, "logits/chosen": 0.17947080731391907, "logits/rejected": 0.2654314935207367, "logps/chosen": -653.5524291992188, "logps/rejected": -726.80126953125, "loss": 0.3026, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2890639305114746, "rewards/margins": 0.12797169387340546, "rewards/rejected": -0.41703566908836365, "step": 210 }, { "epoch": 0.17, "learning_rate": 4.931699543346854e-06, "logits/chosen": 0.1152615174651146, "logits/rejected": 0.25124651193618774, "logps/chosen": -588.23046875, "logps/rejected": -704.0632934570312, "loss": 0.2673, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2535329759120941, "rewards/margins": 0.17576026916503906, "rewards/rejected": -0.4292932152748108, "step": 220 }, { "epoch": 0.18, "learning_rate": 4.9153878407169815e-06, "logits/chosen": 0.13138779997825623, "logits/rejected": 0.1749398410320282, "logps/chosen": -536.669921875, "logps/rejected": -619.0616455078125, "loss": 0.2609, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.2183808982372284, "rewards/margins": 0.12902414798736572, "rewards/rejected": -0.3474050462245941, "step": 230 }, { "epoch": 0.18, "learning_rate": 4.897364164920515e-06, "logits/chosen": 0.14722837507724762, "logits/rejected": 0.2979207932949066, "logps/chosen": -644.4813232421875, "logps/rejected": -710.7215576171875, "loss": 0.2364, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2495686262845993, "rewards/margins": 0.14611390233039856, "rewards/rejected": -0.39568251371383667, "step": 240 }, { "epoch": 0.19, "learning_rate": 4.8776412907378845e-06, "logits/chosen": 0.14406076073646545, "logits/rejected": 0.24922068417072296, "logps/chosen": -602.2944946289062, "logps/rejected": -689.9060668945312, "loss": 0.2803, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.25432199239730835, "rewards/margins": 0.14366620779037476, "rewards/rejected": -0.3979881703853607, "step": 250 }, { "epoch": 0.2, "learning_rate": 4.8562331973035396e-06, "logits/chosen": 0.14575393497943878, "logits/rejected": 0.28581660985946655, "logps/chosen": -567.1799926757812, "logps/rejected": -649.4906005859375, "loss": 0.2541, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.21804265677928925, "rewards/margins": 0.13621987402439117, "rewards/rejected": -0.3542625308036804, "step": 260 }, { "epoch": 0.21, "learning_rate": 4.833155058197842e-06, "logits/chosen": 0.20403075218200684, "logits/rejected": 0.32390326261520386, "logps/chosen": -611.0584106445312, "logps/rejected": -643.7525634765625, "loss": 0.2827, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.23078171908855438, "rewards/margins": 0.12669074535369873, "rewards/rejected": -0.3574724495410919, "step": 270 }, { "epoch": 0.21, "learning_rate": 4.808423230692374e-06, "logits/chosen": 0.16620513796806335, "logits/rejected": 0.3157016634941101, "logps/chosen": -557.9627075195312, "logps/rejected": -638.0758666992188, "loss": 0.2304, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.1864185631275177, "rewards/margins": 0.15307244658470154, "rewards/rejected": -0.33949097990989685, "step": 280 }, { "epoch": 0.22, "learning_rate": 4.7820552441562625e-06, "logits/chosen": 0.18919572234153748, "logits/rejected": 0.24673417210578918, "logps/chosen": -532.9969482421875, "logps/rejected": -591.529296875, "loss": 0.2764, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.19002854824066162, "rewards/margins": 0.12101318687200546, "rewards/rejected": -0.3110417425632477, "step": 290 }, { "epoch": 0.23, "learning_rate": 4.754069787631761e-06, "logits/chosen": 0.15965518355369568, "logits/rejected": 0.3048693537712097, "logps/chosen": -550.864990234375, "logps/rejected": -679.41357421875, "loss": 0.25, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.21647436916828156, "rewards/margins": 0.16393651068210602, "rewards/rejected": -0.3804108500480652, "step": 300 }, { "epoch": 0.24, "learning_rate": 4.724486696587862e-06, "logits/chosen": 0.17140202224254608, "logits/rejected": 0.21409063041210175, "logps/chosen": -635.85791015625, "logps/rejected": -747.188232421875, "loss": 0.258, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.29053109884262085, "rewards/margins": 0.15401865541934967, "rewards/rejected": -0.4445497393608093, "step": 310 }, { "epoch": 0.24, "learning_rate": 4.693326938861367e-06, "logits/chosen": 0.19796046614646912, "logits/rejected": 0.25672799348831177, "logps/chosen": -643.25537109375, "logps/rejected": -801.0643310546875, "loss": 0.2427, "rewards/accuracies": 0.75, "rewards/chosen": -0.31196296215057373, "rewards/margins": 0.19956240057945251, "rewards/rejected": -0.5115253925323486, "step": 320 }, { "epoch": 0.25, "learning_rate": 4.660612599795343e-06, "logits/chosen": 0.11408114433288574, "logits/rejected": 0.22174029052257538, "logps/chosen": -647.49658203125, "logps/rejected": -756.5814208984375, "loss": 0.2472, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.29143720865249634, "rewards/margins": 0.1790235936641693, "rewards/rejected": -0.47046083211898804, "step": 330 }, { "epoch": 0.26, "learning_rate": 4.626366866585528e-06, "logits/chosen": 0.08651003241539001, "logits/rejected": 0.29140934348106384, "logps/chosen": -647.8692626953125, "logps/rejected": -736.3836669921875, "loss": 0.2449, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.28671354055404663, "rewards/margins": 0.16735221445560455, "rewards/rejected": -0.45406574010849, "step": 340 }, { "epoch": 0.27, "learning_rate": 4.590614011845758e-06, "logits/chosen": 0.14404548704624176, "logits/rejected": 0.26181453466415405, "logps/chosen": -638.6263427734375, "logps/rejected": -730.7149658203125, "loss": 0.2148, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2571524977684021, "rewards/margins": 0.1780269742012024, "rewards/rejected": -0.4351794719696045, "step": 350 }, { "epoch": 0.27, "learning_rate": 4.553379376404085e-06, "logits/chosen": 0.17719906568527222, "logits/rejected": 0.17130860686302185, "logps/chosen": -565.2915649414062, "logps/rejected": -659.3453369140625, "loss": 0.252, "rewards/accuracies": 0.71875, "rewards/chosen": -0.23078814148902893, "rewards/margins": 0.16517826914787292, "rewards/rejected": -0.39596638083457947, "step": 360 }, { "epoch": 0.28, "learning_rate": 4.514689351341751e-06, "logits/chosen": 0.18946215510368347, "logits/rejected": 0.2520686089992523, "logps/chosen": -707.220458984375, "logps/rejected": -799.5474853515625, "loss": 0.2296, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.29390206933021545, "rewards/margins": 0.1939956247806549, "rewards/rejected": -0.48789769411087036, "step": 370 }, { "epoch": 0.29, "learning_rate": 4.474571359287791e-06, "logits/chosen": 0.1794353723526001, "logits/rejected": 0.20889122784137726, "logps/chosen": -659.88720703125, "logps/rejected": -792.5078125, "loss": 0.2767, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.32972732186317444, "rewards/margins": 0.16797736287117004, "rewards/rejected": -0.4977046847343445, "step": 380 }, { "epoch": 0.3, "learning_rate": 4.4330538349824684e-06, "logits/chosen": 0.15636876225471497, "logits/rejected": 0.2390608787536621, "logps/chosen": -675.450927734375, "logps/rejected": -797.5559692382812, "loss": 0.2478, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.3252645432949066, "rewards/margins": 0.17893439531326294, "rewards/rejected": -0.5041989684104919, "step": 390 }, { "epoch": 0.3, "learning_rate": 4.3901662051233755e-06, "logits/chosen": 0.1320812702178955, "logits/rejected": 0.25596413016319275, "logps/chosen": -722.6912841796875, "logps/rejected": -806.4666748046875, "loss": 0.2426, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.33014723658561707, "rewards/margins": 0.17146170139312744, "rewards/rejected": -0.5016089677810669, "step": 400 }, { "epoch": 0.31, "learning_rate": 4.345938867508439e-06, "logits/chosen": 0.15700757503509521, "logits/rejected": 0.2542612552642822, "logps/chosen": -715.8280029296875, "logps/rejected": -812.6696166992188, "loss": 0.2483, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.34785327315330505, "rewards/margins": 0.17259207367897034, "rewards/rejected": -0.5204453468322754, "step": 410 }, { "epoch": 0.32, "learning_rate": 4.30040316949064e-06, "logits/chosen": 0.12117477506399155, "logits/rejected": 0.21865728497505188, "logps/chosen": -640.8743896484375, "logps/rejected": -720.25439453125, "loss": 0.2724, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.30353325605392456, "rewards/margins": 0.136855810880661, "rewards/rejected": -0.44038906693458557, "step": 420 }, { "epoch": 0.33, "learning_rate": 4.253591385759705e-06, "logits/chosen": 0.1048569455742836, "logits/rejected": 0.2262450009584427, "logps/chosen": -654.50634765625, "logps/rejected": -762.971435546875, "loss": 0.2437, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.30736270546913147, "rewards/margins": 0.18053661286830902, "rewards/rejected": -0.48789939284324646, "step": 430 }, { "epoch": 0.34, "learning_rate": 4.205536695466524e-06, "logits/chosen": 0.1525915265083313, "logits/rejected": 0.27372902631759644, "logps/chosen": -654.9791259765625, "logps/rejected": -811.6696166992188, "loss": 0.2447, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.319768488407135, "rewards/margins": 0.19004443287849426, "rewards/rejected": -0.5098129510879517, "step": 440 }, { "epoch": 0.34, "learning_rate": 4.15627315870651e-06, "logits/chosen": 0.136866956949234, "logits/rejected": 0.21996262669563293, "logps/chosen": -676.5054931640625, "logps/rejected": -751.3809204101562, "loss": 0.2517, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3240048289299011, "rewards/margins": 0.1569635421037674, "rewards/rejected": -0.48096832633018494, "step": 450 }, { "epoch": 0.35, "learning_rate": 4.105835692378557e-06, "logits/chosen": 0.17371432483196259, "logits/rejected": 0.2416466474533081, "logps/chosen": -675.8497314453125, "logps/rejected": -758.9337158203125, "loss": 0.2008, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.28624922037124634, "rewards/margins": 0.17619088292121887, "rewards/rejected": -0.4624401032924652, "step": 460 }, { "epoch": 0.36, "learning_rate": 4.05426004543672e-06, "logits/chosen": 0.12746404111385345, "logits/rejected": 0.2692243754863739, "logps/chosen": -697.2105712890625, "logps/rejected": -788.6505126953125, "loss": 0.2271, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.3282613158226013, "rewards/margins": 0.18242862820625305, "rewards/rejected": -0.510689914226532, "step": 470 }, { "epoch": 0.37, "learning_rate": 4.001582773552153e-06, "logits/chosen": 0.19310589134693146, "logits/rejected": 0.24649909138679504, "logps/chosen": -676.9974365234375, "logps/rejected": -746.8987426757812, "loss": 0.2236, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.31384724378585815, "rewards/margins": 0.16469937562942505, "rewards/rejected": -0.478546679019928, "step": 480 }, { "epoch": 0.37, "learning_rate": 3.947841213203262e-06, "logits/chosen": 0.15690350532531738, "logits/rejected": 0.300692081451416, "logps/chosen": -735.6814575195312, "logps/rejected": -848.1085815429688, "loss": 0.2125, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.34843936562538147, "rewards/margins": 0.20286861062049866, "rewards/rejected": -0.5513080358505249, "step": 490 }, { "epoch": 0.38, "learning_rate": 3.893073455212438e-06, "logits/chosen": 0.18581806123256683, "logits/rejected": 0.28662142157554626, "logps/chosen": -699.9567260742188, "logps/rejected": -875.0812377929688, "loss": 0.2029, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.35302621126174927, "rewards/margins": 0.2367512434720993, "rewards/rejected": -0.589777410030365, "step": 500 }, { "epoch": 0.39, "learning_rate": 3.837318317748134e-06, "logits/chosen": 0.2250034064054489, "logits/rejected": 0.25657814741134644, "logps/chosen": -629.2000732421875, "logps/rejected": -729.8438720703125, "loss": 0.2881, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2759498357772827, "rewards/margins": 0.16293799877166748, "rewards/rejected": -0.43888789415359497, "step": 510 }, { "epoch": 0.4, "learning_rate": 3.7806153188114027e-06, "logits/chosen": 0.15463793277740479, "logits/rejected": 0.26033270359039307, "logps/chosen": -536.32861328125, "logps/rejected": -633.5609130859375, "loss": 0.2673, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.219068244099617, "rewards/margins": 0.13708043098449707, "rewards/rejected": -0.3561486601829529, "step": 520 }, { "epoch": 0.4, "learning_rate": 3.7230046482264256e-06, "logits/chosen": 0.1772613823413849, "logits/rejected": 0.3070564270019531, "logps/chosen": -638.0252685546875, "logps/rejected": -699.7626953125, "loss": 0.2509, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.24965958297252655, "rewards/margins": 0.15243235230445862, "rewards/rejected": -0.402091920375824, "step": 530 }, { "epoch": 0.41, "learning_rate": 3.6645271391548542e-06, "logits/chosen": 0.1249006986618042, "logits/rejected": 0.20080497860908508, "logps/chosen": -620.0784912109375, "logps/rejected": -716.3678588867188, "loss": 0.2308, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.2955653667449951, "rewards/margins": 0.15287812054157257, "rewards/rejected": -0.4484435021877289, "step": 540 }, { "epoch": 0.42, "learning_rate": 3.6052242391541746e-06, "logits/chosen": 0.1353389322757721, "logits/rejected": 0.22389094531536102, "logps/chosen": -614.9147338867188, "logps/rejected": -771.8558349609375, "loss": 0.2102, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2764993906021118, "rewards/margins": 0.18298561871051788, "rewards/rejected": -0.4594849944114685, "step": 550 }, { "epoch": 0.43, "learning_rate": 3.5451379808006014e-06, "logits/chosen": 0.17011868953704834, "logits/rejected": 0.27556750178337097, "logps/chosen": -658.0120849609375, "logps/rejected": -755.5858764648438, "loss": 0.2345, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.29901498556137085, "rewards/margins": 0.18377789855003357, "rewards/rejected": -0.48279285430908203, "step": 560 }, { "epoch": 0.43, "learning_rate": 3.484310951897323e-06, "logits/chosen": 0.1761491745710373, "logits/rejected": 0.30945947766304016, "logps/chosen": -717.3760986328125, "logps/rejected": -787.5486450195312, "loss": 0.2483, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.32107868790626526, "rewards/margins": 0.1693280041217804, "rewards/rejected": -0.49040669202804565, "step": 570 }, { "epoch": 0.44, "learning_rate": 3.4227862652892106e-06, "logits/chosen": 0.20576810836791992, "logits/rejected": 0.26615187525749207, "logps/chosen": -668.9432983398438, "logps/rejected": -773.5794067382812, "loss": 0.2502, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.31081390380859375, "rewards/margins": 0.15438678860664368, "rewards/rejected": -0.46520066261291504, "step": 580 }, { "epoch": 0.45, "learning_rate": 3.3606075283054005e-06, "logits/chosen": 0.18598072230815887, "logits/rejected": 0.2837832570075989, "logps/chosen": -603.8995361328125, "logps/rejected": -735.7412719726562, "loss": 0.208, "rewards/accuracies": 0.75, "rewards/chosen": -0.2771075367927551, "rewards/margins": 0.1824244260787964, "rewards/rejected": -0.4595320224761963, "step": 590 }, { "epoch": 0.46, "learning_rate": 3.2978188118513814e-06, "logits/chosen": 0.1718396097421646, "logits/rejected": 0.3636724054813385, "logps/chosen": -653.9473266601562, "logps/rejected": -759.1709594726562, "loss": 0.2263, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.28921762108802795, "rewards/margins": 0.19078542292118073, "rewards/rejected": -0.4800030589103699, "step": 600 }, { "epoch": 0.46, "learning_rate": 3.234464619172522e-06, "logits/chosen": 0.1497826725244522, "logits/rejected": 0.2777649164199829, "logps/chosen": -638.6043090820312, "logps/rejected": -779.666748046875, "loss": 0.223, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.28172770142555237, "rewards/margins": 0.22779683768749237, "rewards/rejected": -0.5095245242118835, "step": 610 }, { "epoch": 0.47, "learning_rate": 3.1705898543111576e-06, "logits/chosen": 0.13684847950935364, "logits/rejected": 0.24817728996276855, "logps/chosen": -634.4373779296875, "logps/rejected": -796.2052612304688, "loss": 0.2257, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.2933855652809143, "rewards/margins": 0.20933008193969727, "rewards/rejected": -0.5027156472206116, "step": 620 }, { "epoch": 0.48, "learning_rate": 3.106239790279606e-06, "logits/chosen": 0.16658630967140198, "logits/rejected": 0.2954631447792053, "logps/chosen": -654.8861083984375, "logps/rejected": -780.149169921875, "loss": 0.2097, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3087119460105896, "rewards/margins": 0.1822930872440338, "rewards/rejected": -0.4910050928592682, "step": 630 }, { "epoch": 0.49, "learning_rate": 3.041460036971664e-06, "logits/chosen": 0.13499195873737335, "logits/rejected": 0.27665549516677856, "logps/chosen": -655.3276977539062, "logps/rejected": -844.3450927734375, "loss": 0.2251, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3263702094554901, "rewards/margins": 0.22413411736488342, "rewards/rejected": -0.5505043268203735, "step": 640 }, { "epoch": 0.5, "learning_rate": 2.976296508835326e-06, "logits/chosen": 0.16133855283260345, "logits/rejected": 0.256599485874176, "logps/chosen": -660.0172119140625, "logps/rejected": -769.0188598632812, "loss": 0.2228, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3122148811817169, "rewards/margins": 0.18627096712589264, "rewards/rejected": -0.49848586320877075, "step": 650 }, { "epoch": 0.5, "learning_rate": 2.910795392329649e-06, "logits/chosen": 0.1639477163553238, "logits/rejected": 0.24110262095928192, "logps/chosen": -739.8809204101562, "logps/rejected": -895.7755126953125, "loss": 0.2189, "rewards/accuracies": 0.78125, "rewards/chosen": -0.33884376287460327, "rewards/margins": 0.23572292923927307, "rewards/rejected": -0.5745667219161987, "step": 660 }, { "epoch": 0.51, "learning_rate": 2.8450031131888147e-06, "logits/chosen": 0.13471753895282745, "logits/rejected": 0.27453264594078064, "logps/chosen": -660.6944580078125, "logps/rejected": -760.9178466796875, "loss": 0.2316, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.32461482286453247, "rewards/margins": 0.18133333325386047, "rewards/rejected": -0.5059481859207153, "step": 670 }, { "epoch": 0.52, "learning_rate": 2.7789663035166035e-06, "logits/chosen": 0.07190994918346405, "logits/rejected": 0.22703304886817932, "logps/chosen": -670.5394287109375, "logps/rejected": -823.9978637695312, "loss": 0.2036, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.3125961720943451, "rewards/margins": 0.2316323220729828, "rewards/rejected": -0.5442285537719727, "step": 680 }, { "epoch": 0.53, "learning_rate": 2.7127317687345973e-06, "logits/chosen": 0.07092462480068207, "logits/rejected": 0.26120471954345703, "logps/chosen": -671.69189453125, "logps/rejected": -811.6243286132812, "loss": 0.2018, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.30282872915267944, "rewards/margins": 0.21179743111133575, "rewards/rejected": -0.514626145362854, "step": 690 }, { "epoch": 0.53, "learning_rate": 2.6463464544075344e-06, "logits/chosen": 0.14573340117931366, "logits/rejected": 0.23916473984718323, "logps/chosen": -684.8336181640625, "logps/rejected": -777.0270385742188, "loss": 0.1987, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.3061198592185974, "rewards/margins": 0.19952335953712463, "rewards/rejected": -0.5056431889533997, "step": 700 }, { "epoch": 0.54, "learning_rate": 2.579857412969345e-06, "logits/chosen": 0.11962984502315521, "logits/rejected": 0.23462708294391632, "logps/chosen": -730.8359375, "logps/rejected": -856.9847412109375, "loss": 0.237, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.37815654277801514, "rewards/margins": 0.18362250924110413, "rewards/rejected": -0.5617790818214417, "step": 710 }, { "epoch": 0.55, "learning_rate": 2.513311770373421e-06, "logits/chosen": 0.11839403957128525, "logits/rejected": 0.16228660941123962, "logps/chosen": -644.6649780273438, "logps/rejected": -824.513671875, "loss": 0.2276, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.32936322689056396, "rewards/margins": 0.2137618511915207, "rewards/rejected": -0.5431250929832458, "step": 720 }, { "epoch": 0.56, "learning_rate": 2.446756692690804e-06, "logits/chosen": 0.13511911034584045, "logits/rejected": 0.2399536371231079, "logps/chosen": -717.3846435546875, "logps/rejected": -858.6282958984375, "loss": 0.2365, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3688200116157532, "rewards/margins": 0.2047378122806549, "rewards/rejected": -0.5735577940940857, "step": 730 }, { "epoch": 0.56, "learning_rate": 2.380239352679908e-06, "logits/chosen": 0.14800789952278137, "logits/rejected": 0.2702687382698059, "logps/chosen": -674.1021728515625, "logps/rejected": -830.0198974609375, "loss": 0.2284, "rewards/accuracies": 0.71875, "rewards/chosen": -0.31319013237953186, "rewards/margins": 0.2306814193725586, "rewards/rejected": -0.5438715219497681, "step": 740 }, { "epoch": 0.57, "learning_rate": 2.313806896351529e-06, "logits/chosen": 0.07657043635845184, "logits/rejected": 0.2402833253145218, "logps/chosen": -611.2683715820312, "logps/rejected": -762.6300048828125, "loss": 0.2543, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3043960928916931, "rewards/margins": 0.188668355345726, "rewards/rejected": -0.49306440353393555, "step": 750 }, { "epoch": 0.58, "learning_rate": 2.247506409552795e-06, "logits/chosen": 0.11987291276454926, "logits/rejected": 0.1521257907152176, "logps/chosen": -654.2996826171875, "logps/rejected": -776.6150512695312, "loss": 0.248, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.2802131772041321, "rewards/margins": 0.1828833967447281, "rewards/rejected": -0.463096559047699, "step": 760 }, { "epoch": 0.59, "learning_rate": 2.1813848845937695e-06, "logits/chosen": 0.07216247916221619, "logits/rejected": 0.16098852455615997, "logps/chosen": -612.78759765625, "logps/rejected": -761.1871948242188, "loss": 0.2486, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.31463655829429626, "rewards/margins": 0.16312028467655182, "rewards/rejected": -0.4777568280696869, "step": 770 }, { "epoch": 0.59, "learning_rate": 2.1154891869403436e-06, "logits/chosen": 0.056468479335308075, "logits/rejected": 0.21281781792640686, "logps/chosen": -676.2210693359375, "logps/rejected": -826.8810424804688, "loss": 0.202, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.31608521938323975, "rewards/margins": 0.22052684426307678, "rewards/rejected": -0.5366120934486389, "step": 780 }, { "epoch": 0.6, "learning_rate": 2.0498660219970395e-06, "logits/chosen": 0.14304831624031067, "logits/rejected": 0.2865908741950989, "logps/chosen": -720.0322875976562, "logps/rejected": -834.4865112304688, "loss": 0.2143, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.3310641646385193, "rewards/margins": 0.21627302467823029, "rewards/rejected": -0.5473372340202332, "step": 790 }, { "epoch": 0.61, "learning_rate": 1.9845619020032552e-06, "logits/chosen": 0.0913710817694664, "logits/rejected": 0.2621229588985443, "logps/chosen": -681.4385986328125, "logps/rejected": -793.8607788085938, "loss": 0.2232, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.33381250500679016, "rewards/margins": 0.18326430022716522, "rewards/rejected": -0.5170767903327942, "step": 800 }, { "epoch": 0.62, "learning_rate": 1.9196231130664282e-06, "logits/chosen": 0.12347328662872314, "logits/rejected": 0.22142863273620605, "logps/chosen": -656.6223754882812, "logps/rejected": -849.30908203125, "loss": 0.2207, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.32625263929367065, "rewards/margins": 0.2373245507478714, "rewards/rejected": -0.5635771751403809, "step": 810 }, { "epoch": 0.62, "learning_rate": 1.8550956823554708e-06, "logits/chosen": 0.12958547472953796, "logits/rejected": 0.20247094333171844, "logps/chosen": -676.3146362304688, "logps/rejected": -852.1516723632812, "loss": 0.2231, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.3258567452430725, "rewards/margins": 0.22926822304725647, "rewards/rejected": -0.5551249384880066, "step": 820 }, { "epoch": 0.63, "learning_rate": 1.7910253454777346e-06, "logits/chosen": 0.1100487932562828, "logits/rejected": 0.178089901804924, "logps/chosen": -628.2322387695312, "logps/rejected": -760.2506103515625, "loss": 0.2364, "rewards/accuracies": 0.75, "rewards/chosen": -0.3000151813030243, "rewards/margins": 0.20657257735729218, "rewards/rejected": -0.5065878033638, "step": 830 }, { "epoch": 0.64, "learning_rate": 1.7274575140626318e-06, "logits/chosen": 0.06159307807683945, "logits/rejected": 0.21589604020118713, "logps/chosen": -658.613525390625, "logps/rejected": -839.2703247070312, "loss": 0.2066, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.3004804253578186, "rewards/margins": 0.23831859230995178, "rewards/rejected": -0.5387989282608032, "step": 840 }, { "epoch": 0.65, "learning_rate": 1.6644372435748823e-06, "logits/chosen": 0.13025906682014465, "logits/rejected": 0.25792089104652405, "logps/chosen": -657.7048950195312, "logps/rejected": -741.805908203125, "loss": 0.2056, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3026004731655121, "rewards/margins": 0.18632087111473083, "rewards/rejected": -0.4889214038848877, "step": 850 }, { "epoch": 0.66, "learning_rate": 1.6020092013802002e-06, "logits/chosen": 0.13131192326545715, "logits/rejected": 0.2267313450574875, "logps/chosen": -609.1980590820312, "logps/rejected": -762.2578735351562, "loss": 0.2173, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2911318838596344, "rewards/margins": 0.19050107896327972, "rewards/rejected": -0.48163294792175293, "step": 860 }, { "epoch": 0.66, "learning_rate": 1.5402176350860653e-06, "logits/chosen": 0.1238790899515152, "logits/rejected": 0.215298131108284, "logps/chosen": -654.3424072265625, "logps/rejected": -766.5152587890625, "loss": 0.2276, "rewards/accuracies": 0.6875, "rewards/chosen": -0.311137855052948, "rewards/margins": 0.18526899814605713, "rewards/rejected": -0.4964068531990051, "step": 870 }, { "epoch": 0.67, "learning_rate": 1.4791063411799938e-06, "logits/chosen": 0.07573570311069489, "logits/rejected": 0.16694195568561554, "logps/chosen": -611.1826782226562, "logps/rejected": -747.8428955078125, "loss": 0.225, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.28111016750335693, "rewards/margins": 0.19318345189094543, "rewards/rejected": -0.47429361939430237, "step": 880 }, { "epoch": 0.68, "learning_rate": 1.4187186339875697e-06, "logits/chosen": 0.14150698482990265, "logits/rejected": 0.19218984246253967, "logps/chosen": -653.740478515625, "logps/rejected": -790.3258666992188, "loss": 0.1959, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.2978840172290802, "rewards/margins": 0.19249173998832703, "rewards/rejected": -0.4903757572174072, "step": 890 }, { "epoch": 0.69, "learning_rate": 1.3590973149722103e-06, "logits/chosen": 0.07968850433826447, "logits/rejected": 0.24394066631793976, "logps/chosen": -660.4170532226562, "logps/rejected": -787.5786743164062, "loss": 0.2189, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.3150777518749237, "rewards/margins": 0.19490757584571838, "rewards/rejected": -0.5099853277206421, "step": 900 }, { "epoch": 0.69, "learning_rate": 1.300284642398445e-06, "logits/chosen": 0.1339327096939087, "logits/rejected": 0.19529958069324493, "logps/chosen": -637.8862915039062, "logps/rejected": -804.20263671875, "loss": 0.2179, "rewards/accuracies": 0.75, "rewards/chosen": -0.30193617939949036, "rewards/margins": 0.2071986198425293, "rewards/rejected": -0.5091347694396973, "step": 910 }, { "epoch": 0.7, "learning_rate": 1.2423223013801946e-06, "logits/chosen": 0.09140697866678238, "logits/rejected": 0.23906917870044708, "logps/chosen": -615.2650146484375, "logps/rejected": -794.8267211914062, "loss": 0.2345, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.290291965007782, "rewards/margins": 0.21678297221660614, "rewards/rejected": -0.5070749521255493, "step": 920 }, { "epoch": 0.71, "learning_rate": 1.1852513743352886e-06, "logits/chosen": 0.09063401818275452, "logits/rejected": 0.18860659003257751, "logps/chosen": -603.1500854492188, "logps/rejected": -759.2166748046875, "loss": 0.1973, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.26861757040023804, "rewards/margins": 0.2037418633699417, "rewards/rejected": -0.47235947847366333, "step": 930 }, { "epoch": 0.72, "learning_rate": 1.1291123118671665e-06, "logits/chosen": 0.082930788397789, "logits/rejected": 0.177327498793602, "logps/chosen": -685.0097045898438, "logps/rejected": -753.899658203125, "loss": 0.2698, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.31101205945014954, "rewards/margins": 0.16297343373298645, "rewards/rejected": -0.473985493183136, "step": 940 }, { "epoch": 0.72, "learning_rate": 1.073944904094385e-06, "logits/chosen": 0.12176795303821564, "logits/rejected": 0.24055452644824982, "logps/chosen": -727.9547729492188, "logps/rejected": -807.3270874023438, "loss": 0.2284, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.3418019115924835, "rewards/margins": 0.1853446215391159, "rewards/rejected": -0.5271465182304382, "step": 950 }, { "epoch": 0.73, "learning_rate": 1.019788252448267e-06, "logits/chosen": 0.1278046816587448, "logits/rejected": 0.18657834827899933, "logps/chosen": -629.8650512695312, "logps/rejected": -775.0892333984375, "loss": 0.2073, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.31090590357780457, "rewards/margins": 0.19522252678871155, "rewards/rejected": -0.5061284899711609, "step": 960 }, { "epoch": 0.74, "learning_rate": 9.66680741958685e-07, "logits/chosen": 0.06174594908952713, "logits/rejected": 0.21714496612548828, "logps/chosen": -706.654052734375, "logps/rejected": -845.1712646484375, "loss": 0.2177, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.33396342396736145, "rewards/margins": 0.21265073120594025, "rewards/rejected": -0.5466141700744629, "step": 970 }, { "epoch": 0.75, "learning_rate": 9.146600140475945e-07, "logits/chosen": 0.09592024236917496, "logits/rejected": 0.1995583027601242, "logps/chosen": -628.862548828125, "logps/rejected": -779.5205688476562, "loss": 0.2233, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3146194815635681, "rewards/margins": 0.17806106805801392, "rewards/rejected": -0.49268054962158203, "step": 980 }, { "epoch": 0.75, "learning_rate": 8.637629398496378e-07, "logits/chosen": 0.06787069141864777, "logits/rejected": 0.19926394522190094, "logps/chosen": -675.4822387695312, "logps/rejected": -792.4046630859375, "loss": 0.2406, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.31674814224243164, "rewards/margins": 0.19142326712608337, "rewards/rejected": -0.5081714391708374, "step": 990 }, { "epoch": 0.76, "learning_rate": 8.140255940787059e-07, "logits/chosen": 0.14530155062675476, "logits/rejected": 0.2198048084974289, "logps/chosen": -646.3609008789062, "logps/rejected": -774.7366333007812, "loss": 0.2234, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.30126315355300903, "rewards/margins": 0.1865108758211136, "rewards/rejected": -0.4877740442752838, "step": 1000 }, { "epoch": 0.77, "learning_rate": 7.654832294589776e-07, "logits/chosen": 0.14776812493801117, "logits/rejected": 0.18824756145477295, "logps/chosen": -676.3685302734375, "logps/rejected": -820.2717895507812, "loss": 0.242, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3261755406856537, "rewards/margins": 0.19091561436653137, "rewards/rejected": -0.5170911550521851, "step": 1010 }, { "epoch": 0.78, "learning_rate": 7.181702517385789e-07, "logits/chosen": 0.15526942908763885, "logits/rejected": 0.20596864819526672, "logps/chosen": -720.7452392578125, "logps/rejected": -820.1512451171875, "loss": 0.2423, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.33318132162094116, "rewards/margins": 0.173573836684227, "rewards/rejected": -0.5067551732063293, "step": 1020 }, { "epoch": 0.78, "learning_rate": 6.721201953035511e-07, "logits/chosen": 0.09918368607759476, "logits/rejected": 0.23762516677379608, "logps/chosen": -690.7906494140625, "logps/rejected": -799.6129150390625, "loss": 0.222, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.31717661023139954, "rewards/margins": 0.19311536848545074, "rewards/rejected": -0.5102919340133667, "step": 1030 }, { "epoch": 0.79, "learning_rate": 6.273656994094232e-07, "logits/chosen": 0.08452818542718887, "logits/rejected": 0.15428626537322998, "logps/chosen": -674.3331298828125, "logps/rejected": -852.9098510742188, "loss": 0.212, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.3417903482913971, "rewards/margins": 0.20718708634376526, "rewards/rejected": -0.5489774942398071, "step": 1040 }, { "epoch": 0.8, "learning_rate": 5.839384850472359e-07, "logits/chosen": 0.11137109994888306, "logits/rejected": 0.23986658453941345, "logps/chosen": -681.7805786132812, "logps/rejected": -840.8401489257812, "loss": 0.2013, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.33553558588027954, "rewards/margins": 0.22832973301410675, "rewards/rejected": -0.5638653039932251, "step": 1050 }, { "epoch": 0.81, "learning_rate": 5.418693324604082e-07, "logits/chosen": 0.054320525377988815, "logits/rejected": 0.2136838734149933, "logps/chosen": -713.266357421875, "logps/rejected": -872.5158081054688, "loss": 0.1902, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.3621819019317627, "rewards/margins": 0.21344491839408875, "rewards/rejected": -0.5756268501281738, "step": 1060 }, { "epoch": 0.82, "learning_rate": 5.01188059328386e-07, "logits/chosen": 0.10655899345874786, "logits/rejected": 0.20958073437213898, "logps/chosen": -671.4863891601562, "logps/rejected": -796.0863647460938, "loss": 0.1935, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.31292790174484253, "rewards/margins": 0.21162652969360352, "rewards/rejected": -0.5245543718338013, "step": 1070 }, { "epoch": 0.82, "learning_rate": 4.619234996325314e-07, "logits/chosen": 0.1319715678691864, "logits/rejected": 0.21460673213005066, "logps/chosen": -709.2652587890625, "logps/rejected": -860.7151489257812, "loss": 0.2324, "rewards/accuracies": 0.75, "rewards/chosen": -0.3620796799659729, "rewards/margins": 0.1911153942346573, "rewards/rejected": -0.5531951189041138, "step": 1080 }, { "epoch": 0.83, "learning_rate": 4.241034832192434e-07, "logits/chosen": 0.10131983458995819, "logits/rejected": 0.1885637789964676, "logps/chosen": -677.9981079101562, "logps/rejected": -864.3063354492188, "loss": 0.2159, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.3420425057411194, "rewards/margins": 0.23228974640369415, "rewards/rejected": -0.5743322372436523, "step": 1090 }, { "epoch": 0.84, "learning_rate": 3.877548160747768e-07, "logits/chosen": 0.11466535180807114, "logits/rejected": 0.24565303325653076, "logps/chosen": -680.638671875, "logps/rejected": -798.11376953125, "loss": 0.1963, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.30832600593566895, "rewards/margins": 0.23151281476020813, "rewards/rejected": -0.5398387908935547, "step": 1100 }, { "epoch": 0.85, "learning_rate": 3.529032613257574e-07, "logits/chosen": 0.10837472975254059, "logits/rejected": 0.17577466368675232, "logps/chosen": -682.72216796875, "logps/rejected": -828.9464111328125, "loss": 0.2376, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3374750316143036, "rewards/margins": 0.20901791751384735, "rewards/rejected": -0.5464929342269897, "step": 1110 }, { "epoch": 0.85, "learning_rate": 3.195735209788528e-07, "logits/chosen": 0.1015692800283432, "logits/rejected": 0.1785646677017212, "logps/chosen": -639.4000244140625, "logps/rejected": -781.9521484375, "loss": 0.2455, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.33476072549819946, "rewards/margins": 0.18203067779541016, "rewards/rejected": -0.5167914032936096, "step": 1120 }, { "epoch": 0.86, "learning_rate": 2.8778921841253774e-07, "logits/chosen": 0.06415721774101257, "logits/rejected": 0.2145998477935791, "logps/chosen": -690.4867553710938, "logps/rejected": -865.8243408203125, "loss": 0.1726, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3243727684020996, "rewards/margins": 0.251103937625885, "rewards/rejected": -0.5754767656326294, "step": 1130 }, { "epoch": 0.87, "learning_rate": 2.5757288163336806e-07, "logits/chosen": 0.08277393132448196, "logits/rejected": 0.20556513965129852, "logps/chosen": -701.6412353515625, "logps/rejected": -884.2093505859375, "loss": 0.2016, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.34318509697914124, "rewards/margins": 0.23703384399414062, "rewards/rejected": -0.5802189707756042, "step": 1140 }, { "epoch": 0.88, "learning_rate": 2.2894592730863336e-07, "logits/chosen": 0.08898656070232391, "logits/rejected": 0.19494621455669403, "logps/chosen": -663.3425903320312, "logps/rejected": -838.2100830078125, "loss": 0.2081, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.3233157992362976, "rewards/margins": 0.22612051665782928, "rewards/rejected": -0.5494363903999329, "step": 1150 }, { "epoch": 0.88, "learning_rate": 2.019286455866981e-07, "logits/chosen": 0.09930244833230972, "logits/rejected": 0.25030988454818726, "logps/chosen": -665.492919921875, "logps/rejected": -791.5203857421875, "loss": 0.2154, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.3297829031944275, "rewards/margins": 0.19049373269081116, "rewards/rejected": -0.520276665687561, "step": 1160 }, { "epoch": 0.89, "learning_rate": 1.7654018571579557e-07, "logits/chosen": 0.07714973390102386, "logits/rejected": 0.1515274941921234, "logps/chosen": -710.2567749023438, "logps/rejected": -828.8133544921875, "loss": 0.2217, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3351573944091797, "rewards/margins": 0.20143434405326843, "rewards/rejected": -0.5365917086601257, "step": 1170 }, { "epoch": 0.9, "learning_rate": 1.5279854247146703e-07, "logits/chosen": 0.08655952662229538, "logits/rejected": 0.18316319584846497, "logps/chosen": -664.911376953125, "logps/rejected": -804.0281982421875, "loss": 0.2363, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3494827151298523, "rewards/margins": 0.19858424365520477, "rewards/rejected": -0.5480669140815735, "step": 1180 }, { "epoch": 0.91, "learning_rate": 1.307205434022671e-07, "logits/chosen": 0.0854581817984581, "logits/rejected": 0.21030446887016296, "logps/chosen": -680.3010864257812, "logps/rejected": -877.068359375, "loss": 0.1977, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.33833056688308716, "rewards/margins": 0.23077313601970673, "rewards/rejected": -0.5691036581993103, "step": 1190 }, { "epoch": 0.91, "learning_rate": 1.1032183690276754e-07, "logits/chosen": 0.07660888135433197, "logits/rejected": 0.1921808272600174, "logps/chosen": -673.0133056640625, "logps/rejected": -855.6375732421875, "loss": 0.2076, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.32487696409225464, "rewards/margins": 0.2425091564655304, "rewards/rejected": -0.5673861503601074, "step": 1200 }, { "epoch": 0.92, "learning_rate": 9.161688112232836e-08, "logits/chosen": 0.10119873285293579, "logits/rejected": 0.22447247803211212, "logps/chosen": -711.1212158203125, "logps/rejected": -857.6790771484375, "loss": 0.2041, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.35159099102020264, "rewards/margins": 0.22090163826942444, "rewards/rejected": -0.5724925994873047, "step": 1210 }, { "epoch": 0.93, "learning_rate": 7.46189337174788e-08, "logits/chosen": 0.0487164705991745, "logits/rejected": 0.1770685911178589, "logps/chosen": -655.6456298828125, "logps/rejected": -825.86572265625, "loss": 0.1868, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3402259647846222, "rewards/margins": 0.2170572578907013, "rewards/rejected": -0.5572832822799683, "step": 1220 }, { "epoch": 0.94, "learning_rate": 5.934004245518793e-08, "logits/chosen": 0.09085109829902649, "logits/rejected": 0.22920957207679749, "logps/chosen": -656.9745483398438, "logps/rejected": -807.7351684570312, "loss": 0.2086, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.3091570734977722, "rewards/margins": 0.2053556740283966, "rewards/rejected": -0.5145127177238464, "step": 1230 }, { "epoch": 0.94, "learning_rate": 4.579103667367385e-08, "logits/chosen": 0.10511846840381622, "logits/rejected": 0.22097325325012207, "logps/chosen": -677.5391235351562, "logps/rejected": -815.5825805664062, "loss": 0.2188, "rewards/accuracies": 0.75, "rewards/chosen": -0.3394906222820282, "rewards/margins": 0.2081030309200287, "rewards/rejected": -0.5475937128067017, "step": 1240 }, { "epoch": 0.95, "learning_rate": 3.398151960681162e-08, "logits/chosen": 0.04612383991479874, "logits/rejected": 0.16620075702667236, "logps/chosen": -679.2454833984375, "logps/rejected": -800.4119262695312, "loss": 0.2539, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.35038450360298157, "rewards/margins": 0.1713865101337433, "rewards/rejected": -0.5217710733413696, "step": 1250 }, { "epoch": 0.96, "learning_rate": 2.3919861577572924e-08, "logits/chosen": 0.11022261530160904, "logits/rejected": 0.166605606675148, "logps/chosen": -690.1370239257812, "logps/rejected": -797.578369140625, "loss": 0.2181, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.32749998569488525, "rewards/margins": 0.20777150988578796, "rewards/rejected": -0.5352715253829956, "step": 1260 }, { "epoch": 0.97, "learning_rate": 1.5613194065327854e-08, "logits/chosen": 0.052184127271175385, "logits/rejected": 0.22973528504371643, "logps/chosen": -629.1040649414062, "logps/rejected": -766.4088134765625, "loss": 0.1872, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.2912130355834961, "rewards/margins": 0.22654838860034943, "rewards/rejected": -0.5177614092826843, "step": 1270 }, { "epoch": 0.98, "learning_rate": 9.067404651211808e-09, "logits/chosen": 0.11986621469259262, "logits/rejected": 0.21088480949401855, "logps/chosen": -687.1105346679688, "logps/rejected": -877.3173828125, "loss": 0.2028, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3289037048816681, "rewards/margins": 0.23036351799964905, "rewards/rejected": -0.5592672228813171, "step": 1280 }, { "epoch": 0.98, "learning_rate": 4.287132845137709e-09, "logits/chosen": 0.08242613077163696, "logits/rejected": 0.20786967873573303, "logps/chosen": -682.5579223632812, "logps/rejected": -803.6607666015625, "loss": 0.2108, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.32676413655281067, "rewards/margins": 0.19439134001731873, "rewards/rejected": -0.5211554765701294, "step": 1290 }, { "epoch": 0.99, "learning_rate": 1.2757667974155896e-09, "logits/chosen": 0.03548216074705124, "logits/rejected": 0.12074669450521469, "logps/chosen": -665.2953491210938, "logps/rejected": -846.1324462890625, "loss": 0.2049, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3495555818080902, "rewards/margins": 0.1981724500656128, "rewards/rejected": -0.5477280020713806, "step": 1300 }, { "epoch": 1.0, "learning_rate": 3.544089730633804e-11, "logits/chosen": 0.10499806702136993, "logits/rejected": 0.2223556488752365, "logps/chosen": -656.1048583984375, "logps/rejected": -808.8086547851562, "loss": 0.1796, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3139723837375641, "rewards/margins": 0.22497034072875977, "rewards/rejected": -0.5389427542686462, "step": 1310 }, { "epoch": 1.0, "step": 1312, "total_flos": 0.0, "train_loss": 0.24134081795175627, "train_runtime": 12108.13, "train_samples_per_second": 1.734, "train_steps_per_second": 0.108 } ], "logging_steps": 10, "max_steps": 1312, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }