{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9973828840617638, "eval_steps": 10000, "global_step": 954, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 1.0416666666666667e-07, "logits/chosen": 0.17704486846923828, "logits/rejected": 0.25409135222435, "logps/chosen": -354.4068603515625, "logps/rejected": -305.2366638183594, "loss": 0.1821, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -6.60312725813128e-05, "rewards/margins": 0.00012125837383791804, "rewards/rejected": -0.00018728969735093415, "step": 10 }, { "epoch": 0.04, "learning_rate": 2.0833333333333333e-07, "logits/chosen": 0.07091161608695984, "logits/rejected": 0.1985362321138382, "logps/chosen": -316.65069580078125, "logps/rejected": -276.1200866699219, "loss": 0.182, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0008458361262455583, "rewards/margins": 0.0016920112539082766, "rewards/rejected": -0.0008461751276627183, "step": 20 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": 0.17787829041481018, "logits/rejected": 0.2488478720188141, "logps/chosen": -294.9706115722656, "logps/rejected": -298.59521484375, "loss": 0.1822, "rewards/accuracies": 0.625, "rewards/chosen": -3.700423985719681e-05, "rewards/margins": 0.0029355171136558056, "rewards/rejected": -0.0029725211206823587, "step": 30 }, { "epoch": 0.08, "learning_rate": 4.1666666666666667e-07, "logits/chosen": 0.09609868377447128, "logits/rejected": 0.21795693039894104, "logps/chosen": -347.44097900390625, "logps/rejected": -320.9972839355469, "loss": 0.1877, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.0013125470140948892, "rewards/margins": 0.00661453977227211, "rewards/rejected": -0.005301993805915117, "step": 40 }, { "epoch": 0.1, "learning_rate": 5.208333333333334e-07, "logits/chosen": 0.1497882902622223, "logits/rejected": 0.240590900182724, "logps/chosen": -311.1229553222656, "logps/rejected": -286.51702880859375, "loss": 0.1814, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.005703258328139782, "rewards/margins": 0.022644545882940292, "rewards/rejected": -0.02834780514240265, "step": 50 }, { "epoch": 0.13, "learning_rate": 6.249999999999999e-07, "logits/chosen": 0.13869214057922363, "logits/rejected": 0.28307411074638367, "logps/chosen": -295.9754638671875, "logps/rejected": -281.43798828125, "loss": 0.1766, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.03096725046634674, "rewards/margins": 0.028959080576896667, "rewards/rejected": -0.059926338493824005, "step": 60 }, { "epoch": 0.15, "learning_rate": 7.291666666666666e-07, "logits/chosen": 0.18460798263549805, "logits/rejected": 0.2718513607978821, "logps/chosen": -335.46148681640625, "logps/rejected": -330.33404541015625, "loss": 0.174, "rewards/accuracies": 0.59375, "rewards/chosen": -0.057377688586711884, "rewards/margins": 0.05648452043533325, "rewards/rejected": -0.11386220157146454, "step": 70 }, { "epoch": 0.17, "learning_rate": 8.333333333333333e-07, "logits/chosen": 0.29816848039627075, "logits/rejected": 0.4011983871459961, "logps/chosen": -330.4580383300781, "logps/rejected": -311.96490478515625, "loss": 0.159, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.11794394254684448, "rewards/margins": 0.13102997839450836, "rewards/rejected": -0.24897389113903046, "step": 80 }, { "epoch": 0.19, "learning_rate": 9.374999999999999e-07, "logits/chosen": 0.2283201515674591, "logits/rejected": 0.37335914373397827, "logps/chosen": -358.6737365722656, "logps/rejected": -304.0804138183594, "loss": 0.1421, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.21732211112976074, "rewards/margins": 0.15273679792881012, "rewards/rejected": -0.37005892395973206, "step": 90 }, { "epoch": 0.21, "learning_rate": 9.999463737538052e-07, "logits/chosen": 0.2938156723976135, "logits/rejected": 0.46553492546081543, "logps/chosen": -361.78338623046875, "logps/rejected": -343.25750732421875, "loss": 0.1217, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.27221935987472534, "rewards/margins": 0.23653486371040344, "rewards/rejected": -0.5087541937828064, "step": 100 }, { "epoch": 0.23, "learning_rate": 9.993432105822034e-07, "logits/chosen": 0.31155580282211304, "logits/rejected": 0.3508353531360626, "logps/chosen": -353.184814453125, "logps/rejected": -366.32720947265625, "loss": 0.106, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.40565404295921326, "rewards/margins": 0.2631165683269501, "rewards/rejected": -0.6687706708908081, "step": 110 }, { "epoch": 0.25, "learning_rate": 9.980706626858607e-07, "logits/chosen": 0.26659709215164185, "logits/rejected": 0.3288796842098236, "logps/chosen": -374.50274658203125, "logps/rejected": -403.8424377441406, "loss": 0.0951, "rewards/accuracies": 0.625, "rewards/chosen": -0.5394914150238037, "rewards/margins": 0.28696924448013306, "rewards/rejected": -0.8264607191085815, "step": 120 }, { "epoch": 0.27, "learning_rate": 9.961304359538434e-07, "logits/chosen": 0.1616436094045639, "logits/rejected": 0.2970871031284332, "logps/chosen": -396.555419921875, "logps/rejected": -362.3848876953125, "loss": 0.0934, "rewards/accuracies": 0.625, "rewards/chosen": -0.5805934071540833, "rewards/margins": 0.19475166499614716, "rewards/rejected": -0.775344967842102, "step": 130 }, { "epoch": 0.29, "learning_rate": 9.935251313189563e-07, "logits/chosen": 0.1485656201839447, "logits/rejected": 0.2714545428752899, "logps/chosen": -384.0659484863281, "logps/rejected": -346.6048278808594, "loss": 0.0933, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5202253460884094, "rewards/margins": 0.24675369262695312, "rewards/rejected": -0.766978919506073, "step": 140 }, { "epoch": 0.31, "learning_rate": 9.902582412711118e-07, "logits/chosen": 0.12988325953483582, "logits/rejected": 0.1523539423942566, "logps/chosen": -379.16839599609375, "logps/rejected": -395.9466552734375, "loss": 0.1019, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4386775493621826, "rewards/margins": 0.37129276990890503, "rewards/rejected": -0.8099702596664429, "step": 150 }, { "epoch": 0.33, "learning_rate": 9.86334145175542e-07, "logits/chosen": 0.06655962765216827, "logits/rejected": 0.09024105966091156, "logps/chosen": -341.7105407714844, "logps/rejected": -360.19805908203125, "loss": 0.0937, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3964901566505432, "rewards/margins": 0.3985019028186798, "rewards/rejected": -0.7949920892715454, "step": 160 }, { "epoch": 0.36, "learning_rate": 9.817581034021272e-07, "logits/chosen": 0.16973164677619934, "logits/rejected": 0.21836213767528534, "logps/chosen": -398.22369384765625, "logps/rejected": -417.8206481933594, "loss": 0.081, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6185532808303833, "rewards/margins": 0.4811604917049408, "rewards/rejected": -1.0997138023376465, "step": 170 }, { "epoch": 0.38, "learning_rate": 9.765362502737097e-07, "logits/chosen": 0.09212584793567657, "logits/rejected": 0.23974208533763885, "logps/chosen": -388.64910888671875, "logps/rejected": -411.5782775878906, "loss": 0.0713, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6261709928512573, "rewards/margins": 0.4908596873283386, "rewards/rejected": -1.1170307397842407, "step": 180 }, { "epoch": 0.4, "learning_rate": 9.706755858428485e-07, "logits/chosen": 0.1811675727367401, "logits/rejected": 0.27236208319664, "logps/chosen": -419.11376953125, "logps/rejected": -437.33843994140625, "loss": 0.0681, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8445426225662231, "rewards/margins": 0.4015916883945465, "rewards/rejected": -1.2461342811584473, "step": 190 }, { "epoch": 0.42, "learning_rate": 9.641839665080363e-07, "logits/chosen": 0.14256766438484192, "logits/rejected": 0.2711044251918793, "logps/chosen": -414.55975341796875, "logps/rejected": -416.9037170410156, "loss": 0.0675, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7406997680664062, "rewards/margins": 0.48706990480422974, "rewards/rejected": -1.2277696132659912, "step": 200 }, { "epoch": 0.44, "learning_rate": 9.570700944819582e-07, "logits/chosen": 0.23208096623420715, "logits/rejected": 0.35697174072265625, "logps/chosen": -382.19970703125, "logps/rejected": -386.50701904296875, "loss": 0.0708, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6804240942001343, "rewards/margins": 0.48590850830078125, "rewards/rejected": -1.166332721710205, "step": 210 }, { "epoch": 0.46, "learning_rate": 9.493435061259129e-07, "logits/chosen": 0.13639363646507263, "logits/rejected": 0.23731064796447754, "logps/chosen": -382.42022705078125, "logps/rejected": -369.6554870605469, "loss": 0.0763, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6574115753173828, "rewards/margins": 0.40243881940841675, "rewards/rejected": -1.0598504543304443, "step": 220 }, { "epoch": 0.48, "learning_rate": 9.4101455916603e-07, "logits/chosen": 0.1799091249704361, "logits/rejected": 0.2304597645998001, "logps/chosen": -416.672607421875, "logps/rejected": -420.39862060546875, "loss": 0.0668, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9061130285263062, "rewards/margins": 0.46666598320007324, "rewards/rejected": -1.3727790117263794, "step": 230 }, { "epoch": 0.5, "learning_rate": 9.320944188084241e-07, "logits/chosen": 0.08318189531564713, "logits/rejected": 0.13486048579216003, "logps/chosen": -408.77545166015625, "logps/rejected": -427.9566345214844, "loss": 0.0639, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.968237042427063, "rewards/margins": 0.2922549843788147, "rewards/rejected": -1.260491967201233, "step": 240 }, { "epoch": 0.52, "learning_rate": 9.225950427718974e-07, "logits/chosen": 0.051157813519239426, "logits/rejected": 0.1319509893655777, "logps/chosen": -385.2474670410156, "logps/rejected": -402.11126708984375, "loss": 0.0631, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7319932579994202, "rewards/margins": 0.468679815530777, "rewards/rejected": -1.2006731033325195, "step": 250 }, { "epoch": 0.54, "learning_rate": 9.125291652582547e-07, "logits/chosen": 0.013853952288627625, "logits/rejected": 0.10071275383234024, "logps/chosen": -445.53607177734375, "logps/rejected": -434.2711486816406, "loss": 0.0641, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9089228510856628, "rewards/margins": 0.4331666827201843, "rewards/rejected": -1.3420894145965576, "step": 260 }, { "epoch": 0.57, "learning_rate": 9.019102798817195e-07, "logits/chosen": 0.1297096163034439, "logits/rejected": 0.1613592505455017, "logps/chosen": -403.47393798828125, "logps/rejected": -446.1951599121094, "loss": 0.0685, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7434005737304688, "rewards/margins": 0.6140644550323486, "rewards/rejected": -1.357465147972107, "step": 270 }, { "epoch": 0.59, "learning_rate": 8.90752621580335e-07, "logits/chosen": 0.16231071949005127, "logits/rejected": 0.1873283088207245, "logps/chosen": -362.4006652832031, "logps/rejected": -398.279296875, "loss": 0.0751, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6591774225234985, "rewards/margins": 0.41294485330581665, "rewards/rejected": -1.07212233543396, "step": 280 }, { "epoch": 0.61, "learning_rate": 8.79071147533597e-07, "logits/chosen": 0.14204099774360657, "logits/rejected": 0.20997166633605957, "logps/chosen": -424.5856018066406, "logps/rejected": -456.9698181152344, "loss": 0.0642, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7815448045730591, "rewards/margins": 0.5602203011512756, "rewards/rejected": -1.34176504611969, "step": 290 }, { "epoch": 0.63, "learning_rate": 8.668815171119019e-07, "logits/chosen": 0.2026984989643097, "logits/rejected": 0.23374077677726746, "logps/chosen": -380.8060607910156, "logps/rejected": -468.7802734375, "loss": 0.0554, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8905105590820312, "rewards/margins": 0.5638677477836609, "rewards/rejected": -1.454378366470337, "step": 300 }, { "epoch": 0.65, "learning_rate": 8.54200070884685e-07, "logits/chosen": 0.23336808383464813, "logits/rejected": 0.25176650285720825, "logps/chosen": -385.24676513671875, "logps/rejected": -462.87322998046875, "loss": 0.0565, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8951492309570312, "rewards/margins": 0.6165014505386353, "rewards/rejected": -1.5116506814956665, "step": 310 }, { "epoch": 0.67, "learning_rate": 8.410438087153911e-07, "logits/chosen": 0.22913236916065216, "logits/rejected": 0.3360585570335388, "logps/chosen": -383.767578125, "logps/rejected": -424.25067138671875, "loss": 0.0641, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6813658475875854, "rewards/margins": 0.6591276526451111, "rewards/rejected": -1.3404934406280518, "step": 320 }, { "epoch": 0.69, "learning_rate": 8.274303669726426e-07, "logits/chosen": 0.22990348935127258, "logits/rejected": 0.3006184697151184, "logps/chosen": -366.43499755859375, "logps/rejected": -444.06536865234375, "loss": 0.0636, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6766657829284668, "rewards/margins": 0.6564770936965942, "rewards/rejected": -1.333142876625061, "step": 330 }, { "epoch": 0.71, "learning_rate": 8.133779948881513e-07, "logits/chosen": 0.22257550060749054, "logits/rejected": 0.3241097033023834, "logps/chosen": -360.141845703125, "logps/rejected": -405.85711669921875, "loss": 0.0662, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7344536781311035, "rewards/margins": 0.7157880067825317, "rewards/rejected": -1.4502416849136353, "step": 340 }, { "epoch": 0.73, "learning_rate": 7.989055300930704e-07, "logits/chosen": 0.1499968320131302, "logits/rejected": 0.15372925996780396, "logps/chosen": -388.67559814453125, "logps/rejected": -462.0445251464844, "loss": 0.0644, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8717344403266907, "rewards/margins": 0.6429644227027893, "rewards/rejected": -1.51469886302948, "step": 350 }, { "epoch": 0.75, "learning_rate": 7.840323733655778e-07, "logits/chosen": 0.08885981142520905, "logits/rejected": 0.19541098177433014, "logps/chosen": -407.87286376953125, "logps/rejected": -420.4515686035156, "loss": 0.0583, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.797155499458313, "rewards/margins": 0.5855330228805542, "rewards/rejected": -1.3826884031295776, "step": 360 }, { "epoch": 0.77, "learning_rate": 7.687784626235447e-07, "logits/chosen": 0.05912008136510849, "logits/rejected": 0.17702099680900574, "logps/chosen": -428.82354736328125, "logps/rejected": -466.0895080566406, "loss": 0.0599, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.803920567035675, "rewards/margins": 0.7507921457290649, "rewards/rejected": -1.5547125339508057, "step": 370 }, { "epoch": 0.8, "learning_rate": 7.531642461971514e-07, "logits/chosen": 0.11388075351715088, "logits/rejected": 0.1931450068950653, "logps/chosen": -388.9282531738281, "logps/rejected": -427.1614685058594, "loss": 0.0578, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9585503339767456, "rewards/margins": 0.5912213325500488, "rewards/rejected": -1.5497716665267944, "step": 380 }, { "epoch": 0.82, "learning_rate": 7.372106554172801e-07, "logits/chosen": -0.049389470368623734, "logits/rejected": 0.10218650102615356, "logps/chosen": -443.7737731933594, "logps/rejected": -484.5735778808594, "loss": 0.0446, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0208237171173096, "rewards/margins": 0.8150562047958374, "rewards/rejected": -1.835879921913147, "step": 390 }, { "epoch": 0.84, "learning_rate": 7.209390765564318e-07, "logits/chosen": 0.07526848465204239, "logits/rejected": 0.1457681804895401, "logps/chosen": -430.77130126953125, "logps/rejected": -478.53118896484375, "loss": 0.0488, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.137662410736084, "rewards/margins": 0.6997725963592529, "rewards/rejected": -1.837435007095337, "step": 400 }, { "epoch": 0.86, "learning_rate": 7.043713221597773e-07, "logits/chosen": -0.014962440356612206, "logits/rejected": 0.049673158675432205, "logps/chosen": -394.35980224609375, "logps/rejected": -455.79168701171875, "loss": 0.0469, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.0516221523284912, "rewards/margins": 0.6002627015113831, "rewards/rejected": -1.65188467502594, "step": 410 }, { "epoch": 0.88, "learning_rate": 6.875296018047809e-07, "logits/chosen": 0.1113734096288681, "logits/rejected": 0.17297616600990295, "logps/chosen": -371.1769104003906, "logps/rejected": -433.82763671875, "loss": 0.057, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.7784308791160583, "rewards/margins": 0.7032991647720337, "rewards/rejected": -1.4817302227020264, "step": 420 }, { "epoch": 0.9, "learning_rate": 6.704364923285857e-07, "logits/chosen": 0.08021976053714752, "logits/rejected": 0.09611347317695618, "logps/chosen": -433.26898193359375, "logps/rejected": -482.2544860839844, "loss": 0.0623, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9919212460517883, "rewards/margins": 0.5928072333335876, "rewards/rejected": -1.584728479385376, "step": 430 }, { "epoch": 0.92, "learning_rate": 6.531149075630796e-07, "logits/chosen": 0.06492827087640762, "logits/rejected": 0.09372309595346451, "logps/chosen": -369.0657958984375, "logps/rejected": -427.1637268066406, "loss": 0.0602, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8450859785079956, "rewards/margins": 0.6487796902656555, "rewards/rejected": -1.4938656091690063, "step": 440 }, { "epoch": 0.94, "learning_rate": 6.355880676182085e-07, "logits/chosen": 0.015085640363395214, "logits/rejected": 0.1697283238172531, "logps/chosen": -454.42071533203125, "logps/rejected": -461.6656799316406, "loss": 0.0537, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0446925163269043, "rewards/margins": 0.7324589490890503, "rewards/rejected": -1.7771514654159546, "step": 450 }, { "epoch": 0.96, "learning_rate": 6.178794677547137e-07, "logits/chosen": 0.052903078496456146, "logits/rejected": 0.21909013390541077, "logps/chosen": -389.771728515625, "logps/rejected": -432.63311767578125, "loss": 0.0475, "rewards/accuracies": 0.71875, "rewards/chosen": -0.918341338634491, "rewards/margins": 0.7504295706748962, "rewards/rejected": -1.6687707901000977, "step": 460 }, { "epoch": 0.98, "learning_rate": 6.000128468880222e-07, "logits/chosen": 0.0020152360666543245, "logits/rejected": 0.10528425872325897, "logps/chosen": -439.73016357421875, "logps/rejected": -486.3055114746094, "loss": 0.0531, "rewards/accuracies": 0.75, "rewards/chosen": -1.0058103799819946, "rewards/margins": 0.8824182748794556, "rewards/rejected": -1.8882286548614502, "step": 470 }, { "epoch": 1.0, "learning_rate": 5.820121557655108e-07, "logits/chosen": 0.03267590329051018, "logits/rejected": 0.10403893887996674, "logps/chosen": -426.3312072753906, "logps/rejected": -521.575439453125, "loss": 0.0497, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.897496223449707, "rewards/margins": 1.0473217964172363, "rewards/rejected": -1.9448179006576538, "step": 480 }, { "epoch": 1.03, "learning_rate": 5.639015248598023e-07, "logits/chosen": -0.05066138505935669, "logits/rejected": 0.0016520231729373336, "logps/chosen": -459.2066955566406, "logps/rejected": -572.3805541992188, "loss": 0.0254, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.404326319694519, "rewards/margins": 1.2682745456695557, "rewards/rejected": -2.6726012229919434, "step": 490 }, { "epoch": 1.05, "learning_rate": 5.457052320211339e-07, "logits/chosen": 0.10663177818059921, "logits/rejected": 0.143524631857872, "logps/chosen": -454.5547790527344, "logps/rejected": -574.3235473632812, "loss": 0.0198, "rewards/accuracies": 0.71875, "rewards/chosen": -1.592284083366394, "rewards/margins": 1.2184875011444092, "rewards/rejected": -2.8107717037200928, "step": 500 }, { "epoch": 1.07, "learning_rate": 5.274476699321637e-07, "logits/chosen": -0.019788045436143875, "logits/rejected": 0.12656378746032715, "logps/chosen": -488.24627685546875, "logps/rejected": -596.00537109375, "loss": 0.015, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8213142156600952, "rewards/margins": 1.3653538227081299, "rewards/rejected": -3.1866683959960938, "step": 510 }, { "epoch": 1.09, "learning_rate": 5.091533134088387e-07, "logits/chosen": -0.0814504474401474, "logits/rejected": 0.05524957925081253, "logps/chosen": -552.7730712890625, "logps/rejected": -634.5548095703125, "loss": 0.0147, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0995850563049316, "rewards/margins": 1.1655638217926025, "rewards/rejected": -3.2651493549346924, "step": 520 }, { "epoch": 1.11, "learning_rate": 4.908466865911614e-07, "logits/chosen": 0.03363295644521713, "logits/rejected": 0.043015364557504654, "logps/chosen": -468.89593505859375, "logps/rejected": -560.2864990234375, "loss": 0.0174, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5512639284133911, "rewards/margins": 1.2513355016708374, "rewards/rejected": -2.8025994300842285, "step": 530 }, { "epoch": 1.13, "learning_rate": 4.7255233006783624e-07, "logits/chosen": -0.03754299506545067, "logits/rejected": 0.08725563436746597, "logps/chosen": -456.68243408203125, "logps/rejected": -549.9105224609375, "loss": 0.0178, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.657478928565979, "rewards/margins": 1.0530353784561157, "rewards/rejected": -2.7105140686035156, "step": 540 }, { "epoch": 1.15, "learning_rate": 4.5429476797886617e-07, "logits/chosen": 0.0340617299079895, "logits/rejected": 0.1264275759458542, "logps/chosen": -469.5687561035156, "logps/rejected": -592.4705810546875, "loss": 0.0185, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.408406138420105, "rewards/margins": 1.4667712450027466, "rewards/rejected": -2.8751769065856934, "step": 550 }, { "epoch": 1.17, "learning_rate": 4.3609847514019763e-07, "logits/chosen": 0.0167356226593256, "logits/rejected": 0.032135289162397385, "logps/chosen": -480.41278076171875, "logps/rejected": -577.2174072265625, "loss": 0.0165, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5578255653381348, "rewards/margins": 1.0947318077087402, "rewards/rejected": -2.652557611465454, "step": 560 }, { "epoch": 1.19, "learning_rate": 4.179878442344892e-07, "logits/chosen": 0.10041844844818115, "logits/rejected": 0.16732005774974823, "logps/chosen": -453.9161071777344, "logps/rejected": -615.6796875, "loss": 0.0153, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7070415019989014, "rewards/margins": 1.4755295515060425, "rewards/rejected": -3.1825711727142334, "step": 570 }, { "epoch": 1.21, "learning_rate": 3.9998715311197783e-07, "logits/chosen": 0.1310591995716095, "logits/rejected": 0.20585906505584717, "logps/chosen": -493.8118591308594, "logps/rejected": -631.4963989257812, "loss": 0.015, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7850983142852783, "rewards/margins": 1.443263292312622, "rewards/rejected": -3.228361129760742, "step": 580 }, { "epoch": 1.24, "learning_rate": 3.821205322452863e-07, "logits/chosen": 0.22954685986042023, "logits/rejected": 0.2483092099428177, "logps/chosen": -473.4378967285156, "logps/rejected": -605.134033203125, "loss": 0.0149, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.700280785560608, "rewards/margins": 1.460669755935669, "rewards/rejected": -3.1609506607055664, "step": 590 }, { "epoch": 1.26, "learning_rate": 3.6441193238179146e-07, "logits/chosen": 0.13607949018478394, "logits/rejected": 0.1680508852005005, "logps/chosen": -451.55340576171875, "logps/rejected": -627.7686157226562, "loss": 0.0147, "rewards/accuracies": 0.75, "rewards/chosen": -1.6148862838745117, "rewards/margins": 1.678989052772522, "rewards/rejected": -3.2938759326934814, "step": 600 }, { "epoch": 1.28, "learning_rate": 3.4688509243692034e-07, "logits/chosen": 0.04345204681158066, "logits/rejected": 0.13040025532245636, "logps/chosen": -461.54095458984375, "logps/rejected": -684.9581909179688, "loss": 0.0153, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6480602025985718, "rewards/margins": 1.6946277618408203, "rewards/rejected": -3.3426880836486816, "step": 610 }, { "epoch": 1.3, "learning_rate": 3.295635076714144e-07, "logits/chosen": 0.18233785033226013, "logits/rejected": 0.19972297549247742, "logps/chosen": -408.9209899902344, "logps/rejected": -547.9658813476562, "loss": 0.0143, "rewards/accuracies": 0.75, "rewards/chosen": -1.6356074810028076, "rewards/margins": 1.3703811168670654, "rewards/rejected": -3.005988597869873, "step": 620 }, { "epoch": 1.32, "learning_rate": 3.12470398195219e-07, "logits/chosen": 0.15017299354076385, "logits/rejected": 0.07167269289493561, "logps/chosen": -474.58172607421875, "logps/rejected": -649.4796142578125, "loss": 0.0129, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6831333637237549, "rewards/margins": 1.4837870597839355, "rewards/rejected": -3.1669201850891113, "step": 630 }, { "epoch": 1.34, "learning_rate": 2.956286778402226e-07, "logits/chosen": 0.03866753727197647, "logits/rejected": 0.20129835605621338, "logps/chosen": -546.3468017578125, "logps/rejected": -608.462646484375, "loss": 0.0126, "rewards/accuracies": 0.75, "rewards/chosen": -1.7091865539550781, "rewards/margins": 1.3178246021270752, "rewards/rejected": -3.0270111560821533, "step": 640 }, { "epoch": 1.36, "learning_rate": 2.7906092344356826e-07, "logits/chosen": 0.2127591073513031, "logits/rejected": 0.24179625511169434, "logps/chosen": -462.47412109375, "logps/rejected": -581.084228515625, "loss": 0.014, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.751960039138794, "rewards/margins": 1.4448457956314087, "rewards/rejected": -3.196805953979492, "step": 650 }, { "epoch": 1.38, "learning_rate": 2.6278934458271996e-07, "logits/chosen": 0.09269841015338898, "logits/rejected": 0.2964209318161011, "logps/chosen": -479.434326171875, "logps/rejected": -605.9524536132812, "loss": 0.0123, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8022867441177368, "rewards/margins": 1.3753817081451416, "rewards/rejected": -3.177668333053589, "step": 660 }, { "epoch": 1.4, "learning_rate": 2.468357538028487e-07, "logits/chosen": 0.16141146421432495, "logits/rejected": 0.18542757630348206, "logps/chosen": -487.90277099609375, "logps/rejected": -652.5034790039062, "loss": 0.0107, "rewards/accuracies": 0.75, "rewards/chosen": -1.9332258701324463, "rewards/margins": 1.736053705215454, "rewards/rejected": -3.6692795753479004, "step": 670 }, { "epoch": 1.42, "learning_rate": 2.312215373764551e-07, "logits/chosen": 0.07799498736858368, "logits/rejected": 0.17718131840229034, "logps/chosen": -603.2567138671875, "logps/rejected": -699.2156372070312, "loss": 0.0101, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1482930183410645, "rewards/margins": 1.3787685632705688, "rewards/rejected": -3.5270614624023438, "step": 680 }, { "epoch": 1.44, "learning_rate": 2.1596762663442213e-07, "logits/chosen": 0.2014874666929245, "logits/rejected": 0.3246391713619232, "logps/chosen": -489.08349609375, "logps/rejected": -607.5847778320312, "loss": 0.0096, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.083740711212158, "rewards/margins": 1.446257472038269, "rewards/rejected": -3.5299980640411377, "step": 690 }, { "epoch": 1.47, "learning_rate": 2.0109446990692963e-07, "logits/chosen": 0.09734896570444107, "logits/rejected": 0.16283641755580902, "logps/chosen": -540.1688232421875, "logps/rejected": -701.462890625, "loss": 0.0094, "rewards/accuracies": 0.8125, "rewards/chosen": -2.07643985748291, "rewards/margins": 1.7090556621551514, "rewards/rejected": -3.7854957580566406, "step": 700 }, { "epoch": 1.49, "learning_rate": 1.8662200511184872e-07, "logits/chosen": 0.07912759482860565, "logits/rejected": 0.19963078200817108, "logps/chosen": -491.30426025390625, "logps/rejected": -630.0563354492188, "loss": 0.0099, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9977525472640991, "rewards/margins": 1.5802443027496338, "rewards/rejected": -3.5779967308044434, "step": 710 }, { "epoch": 1.51, "learning_rate": 1.725696330273575e-07, "logits/chosen": 0.14783975481987, "logits/rejected": 0.27563345432281494, "logps/chosen": -530.8796997070312, "logps/rejected": -640.3440551757812, "loss": 0.0107, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.036653518676758, "rewards/margins": 1.323557734489441, "rewards/rejected": -3.3602116107940674, "step": 720 }, { "epoch": 1.53, "learning_rate": 1.589561912846089e-07, "logits/chosen": 0.16717246174812317, "logits/rejected": 0.2920343279838562, "logps/chosen": -499.3802795410156, "logps/rejected": -612.64892578125, "loss": 0.012, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0618550777435303, "rewards/margins": 1.435462236404419, "rewards/rejected": -3.4973175525665283, "step": 730 }, { "epoch": 1.55, "learning_rate": 1.4579992911531496e-07, "logits/chosen": 0.1249130517244339, "logits/rejected": 0.23616066575050354, "logps/chosen": -575.0750732421875, "logps/rejected": -649.9669189453125, "loss": 0.0106, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.2815146446228027, "rewards/margins": 1.226216197013855, "rewards/rejected": -3.5077309608459473, "step": 740 }, { "epoch": 1.57, "learning_rate": 1.3311848288809813e-07, "logits/chosen": 0.21837782859802246, "logits/rejected": 0.31546956300735474, "logps/chosen": -510.7059020996094, "logps/rejected": -609.2933959960938, "loss": 0.0119, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.790372610092163, "rewards/margins": 1.2426694631576538, "rewards/rejected": -3.0330421924591064, "step": 750 }, { "epoch": 1.59, "learning_rate": 1.209288524664029e-07, "logits/chosen": 0.14562873542308807, "logits/rejected": 0.3084864318370819, "logps/chosen": -622.6912841796875, "logps/rejected": -749.8731689453125, "loss": 0.0131, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2252538204193115, "rewards/margins": 1.5818650722503662, "rewards/rejected": -3.8071188926696777, "step": 760 }, { "epoch": 1.61, "learning_rate": 1.0924737841966497e-07, "logits/chosen": 0.1799144446849823, "logits/rejected": 0.354133278131485, "logps/chosen": -585.0472412109375, "logps/rejected": -712.3133544921875, "loss": 0.0107, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1570990085601807, "rewards/margins": 1.6586040258407593, "rewards/rejected": -3.8157036304473877, "step": 770 }, { "epoch": 1.63, "learning_rate": 9.808972011828054e-08, "logits/chosen": 0.20896565914154053, "logits/rejected": 0.1832619458436966, "logps/chosen": -474.9366149902344, "logps/rejected": -665.3892822265625, "loss": 0.0099, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9308887720108032, "rewards/margins": 1.5281493663787842, "rewards/rejected": -3.459038257598877, "step": 780 }, { "epoch": 1.65, "learning_rate": 8.747083474174527e-08, "logits/chosen": 0.25221484899520874, "logits/rejected": 0.3025228679180145, "logps/chosen": -486.76678466796875, "logps/rejected": -610.9810791015625, "loss": 0.01, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9139289855957031, "rewards/margins": 1.4173685312271118, "rewards/rejected": -3.3312973976135254, "step": 790 }, { "epoch": 1.67, "learning_rate": 7.740495722810269e-08, "logits/chosen": 0.12703558802604675, "logits/rejected": 0.25433093309402466, "logps/chosen": -528.8013916015625, "logps/rejected": -645.4374389648438, "loss": 0.01, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.129984140396118, "rewards/margins": 1.322923183441162, "rewards/rejected": -3.452907085418701, "step": 800 }, { "epoch": 1.7, "learning_rate": 6.790558119157597e-08, "logits/chosen": 0.1941952407360077, "logits/rejected": 0.36538344621658325, "logps/chosen": -536.0458374023438, "logps/rejected": -630.6697387695312, "loss": 0.0111, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9618316888809204, "rewards/margins": 1.3840124607086182, "rewards/rejected": -3.34584379196167, "step": 810 }, { "epoch": 1.72, "learning_rate": 5.898544083397e-08, "logits/chosen": 0.1936766654253006, "logits/rejected": 0.22626741230487823, "logps/chosen": -482.18902587890625, "logps/rejected": -640.9258422851562, "loss": 0.0113, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.822951078414917, "rewards/margins": 1.679091215133667, "rewards/rejected": -3.502042055130005, "step": 820 }, { "epoch": 1.74, "learning_rate": 5.065649387408705e-08, "logits/chosen": 0.16037659347057343, "logits/rejected": 0.23867423832416534, "logps/chosen": -536.796630859375, "logps/rejected": -645.6795654296875, "loss": 0.0119, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.09273624420166, "rewards/margins": 1.3475998640060425, "rewards/rejected": -3.440336227416992, "step": 830 }, { "epoch": 1.76, "learning_rate": 4.292990551804171e-08, "logits/chosen": 0.11955185234546661, "logits/rejected": 0.2987907826900482, "logps/chosen": -521.8675537109375, "logps/rejected": -622.3560791015625, "loss": 0.0115, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9727070331573486, "rewards/margins": 1.207002878189087, "rewards/rejected": -3.1797099113464355, "step": 840 }, { "epoch": 1.78, "learning_rate": 3.581603349196371e-08, "logits/chosen": 0.12183141708374023, "logits/rejected": 0.24950018525123596, "logps/chosen": -529.2427978515625, "logps/rejected": -662.9299926757812, "loss": 0.0112, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.024509906768799, "rewards/margins": 1.5907318592071533, "rewards/rejected": -3.615241289138794, "step": 850 }, { "epoch": 1.8, "learning_rate": 2.9324414157151367e-08, "logits/chosen": 0.11247365176677704, "logits/rejected": 0.28803473711013794, "logps/chosen": -538.6015625, "logps/rejected": -616.6097412109375, "loss": 0.0105, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.005286693572998, "rewards/margins": 1.320533037185669, "rewards/rejected": -3.325819492340088, "step": 860 }, { "epoch": 1.82, "learning_rate": 2.3463749726290284e-08, "logits/chosen": 0.09726160764694214, "logits/rejected": 0.3085189759731293, "logps/chosen": -527.7420043945312, "logps/rejected": -666.7064208984375, "loss": 0.0114, "rewards/accuracies": 0.78125, "rewards/chosen": -1.963595986366272, "rewards/margins": 1.6061077117919922, "rewards/rejected": -3.5697035789489746, "step": 870 }, { "epoch": 1.84, "learning_rate": 1.824189659787284e-08, "logits/chosen": 0.19652321934700012, "logits/rejected": 0.2885872423648834, "logps/chosen": -515.560546875, "logps/rejected": -641.10791015625, "loss": 0.0111, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9605176448822021, "rewards/margins": 1.3721264600753784, "rewards/rejected": -3.33264422416687, "step": 880 }, { "epoch": 1.86, "learning_rate": 1.3665854824458035e-08, "logits/chosen": 0.16733339428901672, "logits/rejected": 0.3634529113769531, "logps/chosen": -542.18505859375, "logps/rejected": -629.7310791015625, "loss": 0.0115, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.0391831398010254, "rewards/margins": 1.1835925579071045, "rewards/rejected": -3.2227752208709717, "step": 890 }, { "epoch": 1.88, "learning_rate": 9.741758728888217e-09, "logits/chosen": 0.08950433880090714, "logits/rejected": 0.2665843069553375, "logps/chosen": -533.1641845703125, "logps/rejected": -621.0523681640625, "loss": 0.0113, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9605424404144287, "rewards/margins": 1.1125773191452026, "rewards/rejected": -3.073119640350342, "step": 900 }, { "epoch": 1.91, "learning_rate": 6.474868681043577e-09, "logits/chosen": 0.13345034420490265, "logits/rejected": 0.2458508014678955, "logps/chosen": -523.0572509765625, "logps/rejected": -666.5548706054688, "loss": 0.0107, "rewards/accuracies": 0.75, "rewards/chosen": -2.094968557357788, "rewards/margins": 1.4136923551559448, "rewards/rejected": -3.5086607933044434, "step": 910 }, { "epoch": 1.93, "learning_rate": 3.869564046156459e-09, "logits/chosen": 0.17636564373970032, "logits/rejected": 0.24904970824718475, "logps/chosen": -521.7586669921875, "logps/rejected": -661.547119140625, "loss": 0.0115, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0953400135040283, "rewards/margins": 1.3953152894973755, "rewards/rejected": -3.4906551837921143, "step": 920 }, { "epoch": 1.95, "learning_rate": 1.929337314139412e-09, "logits/chosen": 0.1708141714334488, "logits/rejected": 0.2874212861061096, "logps/chosen": -481.3929138183594, "logps/rejected": -591.492431640625, "loss": 0.0107, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8482071161270142, "rewards/margins": 1.3176212310791016, "rewards/rejected": -3.165828227996826, "step": 930 }, { "epoch": 1.97, "learning_rate": 6.567894177967325e-10, "logits/chosen": 0.1810809224843979, "logits/rejected": 0.3499010503292084, "logps/chosen": -509.21966552734375, "logps/rejected": -619.0591430664062, "loss": 0.0119, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7878868579864502, "rewards/margins": 1.3797376155853271, "rewards/rejected": -3.1676242351531982, "step": 940 }, { "epoch": 1.99, "learning_rate": 5.3626246194704575e-11, "logits/chosen": 0.12432925403118134, "logits/rejected": 0.1847553700208664, "logps/chosen": -471.4737854003906, "logps/rejected": -620.7115478515625, "loss": 0.0121, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8229620456695557, "rewards/margins": 1.5415856838226318, "rewards/rejected": -3.3645477294921875, "step": 950 }, { "epoch": 2.0, "step": 954, "total_flos": 0.0, "train_loss": 0.050850671487596796, "train_runtime": 12712.7589, "train_samples_per_second": 9.618, "train_steps_per_second": 0.075 } ], "logging_steps": 10, "max_steps": 954, "num_train_epochs": 2, "save_steps": 10000, "total_flos": 0.0, "trial_name": null, "trial_params": null }