{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 391, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 210.34713052784278, "learning_rate": 2.5e-09, "logits/chosen": -4.623842239379883, "logits/rejected": -4.85917854309082, "logps/chosen": -239.31422424316406, "logps/rejected": -207.56365966796875, "loss": 0.6927, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 198.95172630432864, "learning_rate": 2.5e-08, "logits/chosen": -4.3338446617126465, "logits/rejected": -4.64424991607666, "logps/chosen": -265.20184326171875, "logps/rejected": -215.72174072265625, "loss": 0.6928, "rewards/accuracies": 0.4236111044883728, "rewards/chosen": -0.004745930898934603, "rewards/margins": -0.004067909903824329, "rewards/rejected": -0.0006780209369026124, "step": 10 }, { "epoch": 0.05, "grad_norm": 204.7891876677461, "learning_rate": 5e-08, "logits/chosen": -4.509727478027344, "logits/rejected": -4.74410343170166, "logps/chosen": -267.73052978515625, "logps/rejected": -216.7478485107422, "loss": 0.6872, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.010470375418663025, "rewards/margins": 0.01739482954144478, "rewards/rejected": -0.006924452725797892, "step": 20 }, { "epoch": 0.08, "grad_norm": 204.94575488992174, "learning_rate": 7.5e-08, "logits/chosen": -4.5970940589904785, "logits/rejected": -4.777865409851074, "logps/chosen": -257.5598449707031, "logps/rejected": -215.4015350341797, "loss": 0.6544, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.04864828661084175, "rewards/margins": 0.09208732843399048, "rewards/rejected": -0.04343904182314873, "step": 30 }, { "epoch": 0.1, "grad_norm": 163.67699084811588, "learning_rate": 1e-07, "logits/chosen": -4.643096923828125, "logits/rejected": -4.7387237548828125, "logps/chosen": -249.96743774414062, "logps/rejected": -223.3234405517578, "loss": 0.5584, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.18158790469169617, "rewards/margins": 0.36420467495918274, "rewards/rejected": -0.18261677026748657, "step": 40 }, { "epoch": 0.13, "grad_norm": 125.1152304775479, "learning_rate": 9.979985922607475e-08, "logits/chosen": -4.558148384094238, "logits/rejected": -4.785082817077637, "logps/chosen": -265.6357727050781, "logps/rejected": -234.0360107421875, "loss": 0.45, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3166799247264862, "rewards/margins": 0.7249041795730591, "rewards/rejected": -0.4082241952419281, "step": 50 }, { "epoch": 0.15, "grad_norm": 110.5697848266263, "learning_rate": 9.92010391574745e-08, "logits/chosen": -4.701218605041504, "logits/rejected": -4.855440139770508, "logps/chosen": -232.1560821533203, "logps/rejected": -235.8180389404297, "loss": 0.3379, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.44831886887550354, "rewards/margins": 1.4881489276885986, "rewards/rejected": -1.039829969406128, "step": 60 }, { "epoch": 0.18, "grad_norm": 103.09926490168155, "learning_rate": 9.820833372667812e-08, "logits/chosen": -4.597586631774902, "logits/rejected": -4.846543312072754, "logps/chosen": -243.5035858154297, "logps/rejected": -245.3424072265625, "loss": 0.3085, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.4819186329841614, "rewards/margins": 1.8609161376953125, "rewards/rejected": -1.378997564315796, "step": 70 }, { "epoch": 0.2, "grad_norm": 89.87848352821936, "learning_rate": 9.682969016701356e-08, "logits/chosen": -4.592278957366943, "logits/rejected": -4.840281963348389, "logps/chosen": -249.3519744873047, "logps/rejected": -261.445068359375, "loss": 0.2624, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.593399703502655, "rewards/margins": 2.1497161388397217, "rewards/rejected": -1.5563163757324219, "step": 80 }, { "epoch": 0.23, "grad_norm": 98.45898295424381, "learning_rate": 9.507614539004081e-08, "logits/chosen": -4.667254447937012, "logits/rejected": -4.913816928863525, "logps/chosen": -235.763427734375, "logps/rejected": -244.2578582763672, "loss": 0.2462, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.7724655866622925, "rewards/margins": 2.8438591957092285, "rewards/rejected": -2.0713934898376465, "step": 90 }, { "epoch": 0.26, "grad_norm": 87.96881533227138, "learning_rate": 9.296173762811083e-08, "logits/chosen": -4.5116472244262695, "logits/rejected": -4.829812049865723, "logps/chosen": -238.08468627929688, "logps/rejected": -269.5484619140625, "loss": 0.2472, "rewards/accuracies": 0.875, "rewards/chosen": 0.931675910949707, "rewards/margins": 3.0536458492279053, "rewards/rejected": -2.1219699382781982, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -4.58513879776001, "eval_logits/rejected": -4.80186128616333, "eval_logps/chosen": -394.6981201171875, "eval_logps/rejected": -515.9166259765625, "eval_loss": 0.9610964059829712, "eval_rewards/accuracies": 0.390625, "eval_rewards/chosen": -0.21118265390396118, "eval_rewards/margins": -0.3347358703613281, "eval_rewards/rejected": 0.12355318665504456, "eval_runtime": 97.8315, "eval_samples_per_second": 20.443, "eval_steps_per_second": 0.327, "step": 100 }, { "epoch": 0.28, "grad_norm": 84.98735748868098, "learning_rate": 9.050339404945832e-08, "logits/chosen": -4.55401611328125, "logits/rejected": -4.845933437347412, "logps/chosen": -229.4434356689453, "logps/rejected": -257.52984619140625, "loss": 0.2226, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9871166348457336, "rewards/margins": 3.0293149948120117, "rewards/rejected": -2.042198419570923, "step": 110 }, { "epoch": 0.31, "grad_norm": 75.78122724506682, "learning_rate": 8.77207952455395e-08, "logits/chosen": -4.49249792098999, "logits/rejected": -4.787415981292725, "logps/chosen": -252.7578125, "logps/rejected": -273.38555908203125, "loss": 0.2215, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.1615877151489258, "rewards/margins": 3.400435209274292, "rewards/rejected": -2.238847255706787, "step": 120 }, { "epoch": 0.33, "grad_norm": 109.1136183108071, "learning_rate": 8.463621767547997e-08, "logits/chosen": -4.589264869689941, "logits/rejected": -4.87318229675293, "logps/chosen": -239.29531860351562, "logps/rejected": -265.04693603515625, "loss": 0.2169, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.9574294090270996, "rewards/margins": 3.4433422088623047, "rewards/rejected": -2.485912799835205, "step": 130 }, { "epoch": 0.36, "grad_norm": 79.72525878658313, "learning_rate": 8.127435532896387e-08, "logits/chosen": -4.636221885681152, "logits/rejected": -4.9098310470581055, "logps/chosen": -267.59625244140625, "logps/rejected": -288.02349853515625, "loss": 0.2063, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.8299416303634644, "rewards/margins": 3.622443675994873, "rewards/rejected": -2.792501926422119, "step": 140 }, { "epoch": 0.38, "grad_norm": 94.45112212404622, "learning_rate": 7.766212203526569e-08, "logits/chosen": -4.643942832946777, "logits/rejected": -4.911728382110596, "logps/chosen": -233.4263153076172, "logps/rejected": -277.07818603515625, "loss": 0.2098, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.9495984315872192, "rewards/margins": 3.8475449085235596, "rewards/rejected": -2.89794659614563, "step": 150 }, { "epoch": 0.41, "grad_norm": 85.35291313866578, "learning_rate": 7.382843600106538e-08, "logits/chosen": -4.690377235412598, "logits/rejected": -4.9024457931518555, "logps/chosen": -233.21981811523438, "logps/rejected": -271.2682189941406, "loss": 0.1861, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9118326306343079, "rewards/margins": 3.6947906017303467, "rewards/rejected": -2.7829582691192627, "step": 160 }, { "epoch": 0.43, "grad_norm": 75.89103255157417, "learning_rate": 6.980398830195784e-08, "logits/chosen": -4.554282188415527, "logits/rejected": -4.874223232269287, "logps/chosen": -236.4412078857422, "logps/rejected": -279.4911804199219, "loss": 0.1833, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.2316501140594482, "rewards/margins": 4.21605920791626, "rewards/rejected": -2.9844090938568115, "step": 170 }, { "epoch": 0.46, "grad_norm": 77.81291773020575, "learning_rate": 6.562099718102787e-08, "logits/chosen": -4.651320934295654, "logits/rejected": -4.9173784255981445, "logps/chosen": -215.70126342773438, "logps/rejected": -251.5159149169922, "loss": 0.2065, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 1.0581190586090088, "rewards/margins": 3.832904815673828, "rewards/rejected": -2.7747855186462402, "step": 180 }, { "epoch": 0.49, "grad_norm": 86.461876717381, "learning_rate": 6.131295012148612e-08, "logits/chosen": -4.617634296417236, "logits/rejected": -4.793360233306885, "logps/chosen": -239.32681274414062, "logps/rejected": -286.96124267578125, "loss": 0.2013, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 1.0783030986785889, "rewards/margins": 3.6680614948272705, "rewards/rejected": -2.5897579193115234, "step": 190 }, { "epoch": 0.51, "grad_norm": 84.24320751887706, "learning_rate": 5.691433575823665e-08, "logits/chosen": -4.624228477478027, "logits/rejected": -4.830000877380371, "logps/chosen": -233.09713745117188, "logps/rejected": -271.84051513671875, "loss": 0.2112, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.9521042108535767, "rewards/margins": 3.7768027782440186, "rewards/rejected": -2.8246986865997314, "step": 200 }, { "epoch": 0.51, "eval_logits/chosen": -4.606511116027832, "eval_logits/rejected": -4.8388166427612305, "eval_logps/chosen": -405.0722351074219, "eval_logps/rejected": -524.7885131835938, "eval_loss": 1.102483868598938, "eval_rewards/accuracies": 0.375, "eval_rewards/chosen": -0.729888916015625, "eval_rewards/margins": -0.40984660387039185, "eval_rewards/rejected": -0.32004231214523315, "eval_runtime": 97.8012, "eval_samples_per_second": 20.45, "eval_steps_per_second": 0.327, "step": 200 }, { "epoch": 0.54, "grad_norm": 70.97621814359026, "learning_rate": 5.2460367774593905e-08, "logits/chosen": -4.6944451332092285, "logits/rejected": -4.962179183959961, "logps/chosen": -243.93307495117188, "logps/rejected": -297.62066650390625, "loss": 0.1723, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0976004600524902, "rewards/margins": 4.546332836151123, "rewards/rejected": -3.448732376098633, "step": 210 }, { "epoch": 0.56, "grad_norm": 71.88477654183092, "learning_rate": 4.798670299452925e-08, "logits/chosen": -4.529160499572754, "logits/rejected": -4.8643479347229, "logps/chosen": -241.5579833984375, "logps/rejected": -293.224365234375, "loss": 0.1923, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 1.0626676082611084, "rewards/margins": 4.490227699279785, "rewards/rejected": -3.4275600910186768, "step": 220 }, { "epoch": 0.59, "grad_norm": 83.09100453064212, "learning_rate": 4.3529155927297226e-08, "logits/chosen": -4.6047258377075195, "logits/rejected": -4.93651008605957, "logps/chosen": -241.11477661132812, "logps/rejected": -293.9808044433594, "loss": 0.2012, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9927155375480652, "rewards/margins": 4.5062031745910645, "rewards/rejected": -3.5134873390197754, "step": 230 }, { "epoch": 0.61, "grad_norm": 70.11336436391163, "learning_rate": 3.9123412049691636e-08, "logits/chosen": -4.588685035705566, "logits/rejected": -4.866146087646484, "logps/chosen": -252.31533813476562, "logps/rejected": -294.6343688964844, "loss": 0.1875, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 1.0253931283950806, "rewards/margins": 4.710432529449463, "rewards/rejected": -3.6850390434265137, "step": 240 }, { "epoch": 0.64, "grad_norm": 77.75874575792918, "learning_rate": 3.480474212128766e-08, "logits/chosen": -4.716187000274658, "logits/rejected": -4.966707229614258, "logps/chosen": -231.89279174804688, "logps/rejected": -266.51666259765625, "loss": 0.1825, "rewards/accuracies": 0.9375, "rewards/chosen": 0.840434193611145, "rewards/margins": 3.7858078479766846, "rewards/rejected": -2.94537353515625, "step": 250 }, { "epoch": 0.66, "grad_norm": 97.12524424809816, "learning_rate": 3.060771981975726e-08, "logits/chosen": -4.585513114929199, "logits/rejected": -4.878482341766357, "logps/chosen": -234.92617797851562, "logps/rejected": -297.1214904785156, "loss": 0.1837, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.9561206102371216, "rewards/margins": 4.824769973754883, "rewards/rejected": -3.86864972114563, "step": 260 }, { "epoch": 0.69, "grad_norm": 92.49874438996748, "learning_rate": 2.6565944956764818e-08, "logits/chosen": -4.684746742248535, "logits/rejected": -4.911890983581543, "logps/chosen": -243.29568481445312, "logps/rejected": -288.39111328125, "loss": 0.1961, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8868792653083801, "rewards/margins": 4.555182456970215, "rewards/rejected": -3.6683037281036377, "step": 270 }, { "epoch": 0.72, "grad_norm": 73.7028241699641, "learning_rate": 2.2711774490274766e-08, "logits/chosen": -4.634344577789307, "logits/rejected": -4.873081207275391, "logps/chosen": -245.1703338623047, "logps/rejected": -317.2539978027344, "loss": 0.1644, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9668266177177429, "rewards/margins": 4.682557582855225, "rewards/rejected": -3.715731143951416, "step": 280 }, { "epoch": 0.74, "grad_norm": 142.25337407808868, "learning_rate": 1.9076063486687256e-08, "logits/chosen": -4.503401756286621, "logits/rejected": -4.866554260253906, "logps/chosen": -250.9346160888672, "logps/rejected": -283.41046142578125, "loss": 0.1799, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 1.083687424659729, "rewards/margins": 4.472739219665527, "rewards/rejected": -3.389052152633667, "step": 290 }, { "epoch": 0.77, "grad_norm": 88.66793876665662, "learning_rate": 1.5687918106563324e-08, "logits/chosen": -4.625166416168213, "logits/rejected": -4.831929683685303, "logps/chosen": -232.6981658935547, "logps/rejected": -288.00457763671875, "loss": 0.195, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 1.0354994535446167, "rewards/margins": 4.637454509735107, "rewards/rejected": -3.6019554138183594, "step": 300 }, { "epoch": 0.77, "eval_logits/chosen": -4.624210357666016, "eval_logits/rejected": -4.856749057769775, "eval_logps/chosen": -411.3396911621094, "eval_logps/rejected": -531.6535034179688, "eval_loss": 1.1301820278167725, "eval_rewards/accuracies": 0.41015625, "eval_rewards/chosen": -1.0432608127593994, "eval_rewards/margins": -0.3799673318862915, "eval_rewards/rejected": -0.6632934212684631, "eval_runtime": 97.9609, "eval_samples_per_second": 20.416, "eval_steps_per_second": 0.327, "step": 300 }, { "epoch": 0.79, "grad_norm": 90.90394303193246, "learning_rate": 1.257446259144494e-08, "logits/chosen": -4.541079044342041, "logits/rejected": -4.873132228851318, "logps/chosen": -239.60592651367188, "logps/rejected": -298.159423828125, "loss": 0.1847, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.076683759689331, "rewards/margins": 4.874758243560791, "rewards/rejected": -3.7980740070343018, "step": 310 }, { "epoch": 0.82, "grad_norm": 87.85310576006609, "learning_rate": 9.760622117187234e-09, "logits/chosen": -4.597599029541016, "logits/rejected": -4.9500837326049805, "logps/chosen": -227.94247436523438, "logps/rejected": -279.3809814453125, "loss": 0.1934, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.8610901832580566, "rewards/margins": 4.507565498352051, "rewards/rejected": -3.6464743614196777, "step": 320 }, { "epoch": 0.84, "grad_norm": 85.81889719468313, "learning_rate": 7.2689232521989885e-09, "logits/chosen": -4.554391860961914, "logits/rejected": -4.864416599273682, "logps/chosen": -249.89169311523438, "logps/rejected": -304.54913330078125, "loss": 0.1773, "rewards/accuracies": 0.90625, "rewards/chosen": 0.9128581881523132, "rewards/margins": 4.5053324699401855, "rewards/rejected": -3.5924744606018066, "step": 330 }, { "epoch": 0.87, "grad_norm": 77.2990699180903, "learning_rate": 5.119313618049309e-09, "logits/chosen": -4.570425987243652, "logits/rejected": -4.913475513458252, "logps/chosen": -250.6792449951172, "logps/rejected": -277.26556396484375, "loss": 0.1723, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 1.1733391284942627, "rewards/margins": 4.697513580322266, "rewards/rejected": -3.524174451828003, "step": 340 }, { "epoch": 0.9, "grad_norm": 74.61892537865367, "learning_rate": 3.3290021961708158e-09, "logits/chosen": -4.588479995727539, "logits/rejected": -4.761317253112793, "logps/chosen": -238.91921997070312, "logps/rejected": -291.2458190917969, "loss": 0.1937, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8275976181030273, "rewards/margins": 4.001389026641846, "rewards/rejected": -3.1737911701202393, "step": 350 }, { "epoch": 0.92, "grad_norm": 81.22407668854541, "learning_rate": 1.9123215591052013e-09, "logits/chosen": -4.583038806915283, "logits/rejected": -4.805889129638672, "logps/chosen": -244.8368682861328, "logps/rejected": -294.9869079589844, "loss": 0.1907, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.8399018049240112, "rewards/margins": 4.216121673583984, "rewards/rejected": -3.3762192726135254, "step": 360 }, { "epoch": 0.95, "grad_norm": 75.17805842008224, "learning_rate": 8.806131292167618e-10, "logits/chosen": -4.595518112182617, "logits/rejected": -4.752079010009766, "logps/chosen": -239.1554412841797, "logps/rejected": -302.4869079589844, "loss": 0.1904, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.8832891583442688, "rewards/margins": 4.165283679962158, "rewards/rejected": -3.281994581222534, "step": 370 }, { "epoch": 0.97, "grad_norm": 91.00267878372446, "learning_rate": 2.4213638345040867e-10, "logits/chosen": -4.70483922958374, "logits/rejected": -4.97845983505249, "logps/chosen": -242.5469207763672, "logps/rejected": -292.7474670410156, "loss": 0.1788, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9767888188362122, "rewards/margins": 4.587931156158447, "rewards/rejected": -3.61114239692688, "step": 380 }, { "epoch": 1.0, "grad_norm": 100.5241948062632, "learning_rate": 2.0027310073833516e-12, "logits/chosen": -4.696263313293457, "logits/rejected": -4.96966028213501, "logps/chosen": -238.3385772705078, "logps/rejected": -292.5868835449219, "loss": 0.1773, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.066699743270874, "rewards/margins": 4.670289516448975, "rewards/rejected": -3.6035892963409424, "step": 390 }, { "epoch": 1.0, "step": 391, "total_flos": 0.0, "train_loss": 0.256967593336959, "train_runtime": 6146.1986, "train_samples_per_second": 8.135, "train_steps_per_second": 0.064 } ], "logging_steps": 10, "max_steps": 391, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }