{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1316.2617480695828, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -1.689455509185791, "logits/rejected": -1.4794573783874512, "logps/chosen": -126.21005249023438, "logps/rejected": -98.13133239746094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 1084.7724692148897, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -1.707624912261963, "logits/rejected": -1.6101186275482178, "logps/chosen": -139.66224670410156, "logps/rejected": -91.32621002197266, "loss": 0.686, "rewards/accuracies": 0.5, "rewards/chosen": 0.01818913221359253, "rewards/margins": 0.027222516015172005, "rewards/rejected": -0.009033381938934326, "step": 10 }, { "epoch": 0.04, "grad_norm": 372.5890585979663, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -1.6384038925170898, "logits/rejected": -1.6487312316894531, "logps/chosen": -130.37515258789062, "logps/rejected": -93.99095153808594, "loss": 0.4495, "rewards/accuracies": 0.84375, "rewards/chosen": 0.6898423433303833, "rewards/margins": 0.9020320177078247, "rewards/rejected": -0.21218962967395782, "step": 20 }, { "epoch": 0.06, "grad_norm": 363.4016752055454, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -1.6896642446517944, "logits/rejected": -1.6273959875106812, "logps/chosen": -130.80935668945312, "logps/rejected": -106.37054443359375, "loss": 0.2556, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 2.216484785079956, "rewards/margins": 3.602745771408081, "rewards/rejected": -1.386260986328125, "step": 30 }, { "epoch": 0.08, "grad_norm": 254.07955635631413, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -1.6251739263534546, "logits/rejected": -1.5512189865112305, "logps/chosen": -142.7510528564453, "logps/rejected": -113.31219482421875, "loss": 0.1925, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.6909842491149902, "rewards/margins": 7.113295078277588, "rewards/rejected": -4.422310829162598, "step": 40 }, { "epoch": 0.1, "grad_norm": 450.1083647442556, "learning_rate": 4.999733114418725e-07, "logits/chosen": -1.5827839374542236, "logits/rejected": -1.6095731258392334, "logps/chosen": -127.47169494628906, "logps/rejected": -124.7472152709961, "loss": 0.187, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.2808213233947754, "rewards/margins": 9.9701509475708, "rewards/rejected": -7.6893310546875, "step": 50 }, { "epoch": 0.13, "grad_norm": 256.12553340364644, "learning_rate": 4.990398100856366e-07, "logits/chosen": -1.6854372024536133, "logits/rejected": -1.6347172260284424, "logps/chosen": -143.6377410888672, "logps/rejected": -138.22506713867188, "loss": 0.1745, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 2.208740711212158, "rewards/margins": 13.148699760437012, "rewards/rejected": -10.939959526062012, "step": 60 }, { "epoch": 0.15, "grad_norm": 266.6931182384945, "learning_rate": 4.967775735898179e-07, "logits/chosen": -1.6533622741699219, "logits/rejected": -1.6828343868255615, "logps/chosen": -136.77774047851562, "logps/rejected": -139.76748657226562, "loss": 0.1719, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.496314525604248, "rewards/margins": 15.036949157714844, "rewards/rejected": -12.540634155273438, "step": 70 }, { "epoch": 0.17, "grad_norm": 311.4093156505733, "learning_rate": 4.931986719649298e-07, "logits/chosen": -1.6512008905410767, "logits/rejected": -1.642218828201294, "logps/chosen": -129.7277374267578, "logps/rejected": -130.1470489501953, "loss": 0.1638, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 3.2196338176727295, "rewards/margins": 14.145663261413574, "rewards/rejected": -10.926031112670898, "step": 80 }, { "epoch": 0.19, "grad_norm": 353.08827705610673, "learning_rate": 4.883222001996351e-07, "logits/chosen": -1.7057621479034424, "logits/rejected": -1.7253615856170654, "logps/chosen": -136.57843017578125, "logps/rejected": -141.9429931640625, "loss": 0.1501, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 3.7895398139953613, "rewards/margins": 15.804595947265625, "rewards/rejected": -12.015056610107422, "step": 90 }, { "epoch": 0.21, "grad_norm": 173.53718581228586, "learning_rate": 4.821741763807186e-07, "logits/chosen": -1.7514231204986572, "logits/rejected": -1.7408148050308228, "logps/chosen": -116.7274169921875, "logps/rejected": -125.6600570678711, "loss": 0.1496, "rewards/accuracies": 0.90625, "rewards/chosen": 3.375108242034912, "rewards/margins": 13.876774787902832, "rewards/rejected": -10.501666069030762, "step": 100 }, { "epoch": 0.21, "eval_logits/chosen": -1.8009788990020752, "eval_logits/rejected": -1.790999174118042, "eval_logps/chosen": -123.99901580810547, "eval_logps/rejected": -130.8439178466797, "eval_loss": 0.13556738197803497, "eval_rewards/accuracies": 0.94140625, "eval_rewards/chosen": 4.131972789764404, "eval_rewards/margins": 15.41292953491211, "eval_rewards/rejected": -11.280956268310547, "eval_runtime": 97.6442, "eval_samples_per_second": 20.483, "eval_steps_per_second": 0.328, "step": 100 }, { "epoch": 0.23, "grad_norm": 220.3790872760113, "learning_rate": 4.747874028753375e-07, "logits/chosen": -1.6608690023422241, "logits/rejected": -1.7199954986572266, "logps/chosen": -120.6917724609375, "logps/rejected": -133.25762939453125, "loss": 0.1546, "rewards/accuracies": 0.9375, "rewards/chosen": 3.4064812660217285, "rewards/margins": 12.731483459472656, "rewards/rejected": -9.325002670288086, "step": 110 }, { "epoch": 0.25, "grad_norm": 179.12119430014053, "learning_rate": 4.662012913161997e-07, "logits/chosen": -1.7208821773529053, "logits/rejected": -1.7148889303207397, "logps/chosen": -118.548583984375, "logps/rejected": -133.46463012695312, "loss": 0.1456, "rewards/accuracies": 0.875, "rewards/chosen": 4.286909580230713, "rewards/margins": 13.92347240447998, "rewards/rejected": -9.636563301086426, "step": 120 }, { "epoch": 0.27, "grad_norm": 119.5811123757762, "learning_rate": 4.5646165232345103e-07, "logits/chosen": -1.688997507095337, "logits/rejected": -1.7153244018554688, "logps/chosen": -124.4625244140625, "logps/rejected": -129.4587860107422, "loss": 0.1477, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 4.8756022453308105, "rewards/margins": 15.727206230163574, "rewards/rejected": -10.851605415344238, "step": 130 }, { "epoch": 0.29, "grad_norm": 293.67492501764275, "learning_rate": 4.456204510851956e-07, "logits/chosen": -1.6008217334747314, "logits/rejected": -1.5764399766921997, "logps/chosen": -121.84040832519531, "logps/rejected": -125.70499420166016, "loss": 0.1534, "rewards/accuracies": 0.90625, "rewards/chosen": 4.718628406524658, "rewards/margins": 14.510149955749512, "rewards/rejected": -9.791521072387695, "step": 140 }, { "epoch": 0.31, "grad_norm": 250.47040288737202, "learning_rate": 4.337355301007335e-07, "logits/chosen": -1.7394781112670898, "logits/rejected": -1.7636123895645142, "logps/chosen": -119.86863708496094, "logps/rejected": -122.49459075927734, "loss": 0.1403, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.755049705505371, "rewards/margins": 14.378946304321289, "rewards/rejected": -9.623896598815918, "step": 150 }, { "epoch": 0.33, "grad_norm": 394.54185792168863, "learning_rate": 4.2087030056579986e-07, "logits/chosen": -1.6095482110977173, "logits/rejected": -1.5893223285675049, "logps/chosen": -127.0114517211914, "logps/rejected": -128.70870971679688, "loss": 0.1577, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 4.486311435699463, "rewards/margins": 13.415287971496582, "rewards/rejected": -8.928976058959961, "step": 160 }, { "epoch": 0.36, "grad_norm": 336.28954028277553, "learning_rate": 4.070934040463998e-07, "logits/chosen": -1.7846260070800781, "logits/rejected": -1.7679131031036377, "logps/chosen": -121.9030990600586, "logps/rejected": -130.14794921875, "loss": 0.1516, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 5.171528339385986, "rewards/margins": 15.514431953430176, "rewards/rejected": -10.342904090881348, "step": 170 }, { "epoch": 0.38, "grad_norm": 266.7294156199543, "learning_rate": 3.9247834624635404e-07, "logits/chosen": -1.7311102151870728, "logits/rejected": -1.725064992904663, "logps/chosen": -125.84059143066406, "logps/rejected": -125.44111633300781, "loss": 0.1616, "rewards/accuracies": 0.90625, "rewards/chosen": 5.19378137588501, "rewards/margins": 15.565200805664062, "rewards/rejected": -10.371419906616211, "step": 180 }, { "epoch": 0.4, "grad_norm": 243.44642169675112, "learning_rate": 3.7710310482256523e-07, "logits/chosen": -1.764722466468811, "logits/rejected": -1.7476263046264648, "logps/chosen": -114.77116394042969, "logps/rejected": -139.33917236328125, "loss": 0.1684, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 5.108515739440918, "rewards/margins": 16.28169822692871, "rewards/rejected": -11.173181533813477, "step": 190 }, { "epoch": 0.42, "grad_norm": 258.83830985573707, "learning_rate": 3.610497133404795e-07, "logits/chosen": -1.7498699426651, "logits/rejected": -1.7545902729034424, "logps/chosen": -120.42120361328125, "logps/rejected": -123.0103988647461, "loss": 0.1795, "rewards/accuracies": 0.9375, "rewards/chosen": 5.614555835723877, "rewards/margins": 16.115243911743164, "rewards/rejected": -10.500688552856445, "step": 200 }, { "epoch": 0.42, "eval_logits/chosen": -1.86138117313385, "eval_logits/rejected": -1.8606913089752197, "eval_logps/chosen": -120.21395111083984, "eval_logps/rejected": -130.0475616455078, "eval_loss": 0.13641399145126343, "eval_rewards/accuracies": 0.93359375, "eval_rewards/chosen": 5.267488479614258, "eval_rewards/margins": 16.30953025817871, "eval_rewards/rejected": -11.042043685913086, "eval_runtime": 97.5652, "eval_samples_per_second": 20.499, "eval_steps_per_second": 0.328, "step": 200 }, { "epoch": 0.44, "grad_norm": 258.76531038458563, "learning_rate": 3.4440382358952115e-07, "logits/chosen": -1.689432144165039, "logits/rejected": -1.632364273071289, "logps/chosen": -117.7408447265625, "logps/rejected": -117.53926086425781, "loss": 0.1494, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 4.45497989654541, "rewards/margins": 12.836206436157227, "rewards/rejected": -8.381224632263184, "step": 210 }, { "epoch": 0.46, "grad_norm": 163.97448768931915, "learning_rate": 3.272542485937368e-07, "logits/chosen": -1.8162240982055664, "logits/rejected": -1.8604834079742432, "logps/chosen": -118.64457702636719, "logps/rejected": -118.34297180175781, "loss": 0.1524, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.34921407699585, "rewards/margins": 14.593510627746582, "rewards/rejected": -9.244296073913574, "step": 220 }, { "epoch": 0.48, "grad_norm": 122.96583033580265, "learning_rate": 3.096924887558854e-07, "logits/chosen": -1.7572132349014282, "logits/rejected": -1.708581566810608, "logps/chosen": -131.16842651367188, "logps/rejected": -137.84829711914062, "loss": 0.1815, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 4.962122440338135, "rewards/margins": 16.372663497924805, "rewards/rejected": -11.410540580749512, "step": 230 }, { "epoch": 0.5, "grad_norm": 300.9054437339817, "learning_rate": 2.9181224366319943e-07, "logits/chosen": -1.768376111984253, "logits/rejected": -1.7853620052337646, "logps/chosen": -118.08064270019531, "logps/rejected": -129.52337646484375, "loss": 0.1638, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.15334415435791, "rewards/margins": 13.721521377563477, "rewards/rejected": -9.568175315856934, "step": 240 }, { "epoch": 0.52, "grad_norm": 279.7087004166346, "learning_rate": 2.7370891215954565e-07, "logits/chosen": -1.7569881677627563, "logits/rejected": -1.7871322631835938, "logps/chosen": -121.43367767333984, "logps/rejected": -130.93099975585938, "loss": 0.1737, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 4.7944416999816895, "rewards/margins": 14.978727340698242, "rewards/rejected": -10.184286117553711, "step": 250 }, { "epoch": 0.54, "grad_norm": 163.84527157544522, "learning_rate": 2.55479083351317e-07, "logits/chosen": -1.7968614101409912, "logits/rejected": -1.8222767114639282, "logps/chosen": -125.540283203125, "logps/rejected": -119.97703552246094, "loss": 0.1484, "rewards/accuracies": 0.90625, "rewards/chosen": 5.486014366149902, "rewards/margins": 16.016239166259766, "rewards/rejected": -10.530224800109863, "step": 260 }, { "epoch": 0.56, "grad_norm": 150.71761898213634, "learning_rate": 2.3722002126275822e-07, "logits/chosen": -1.787398099899292, "logits/rejected": -1.723350167274475, "logps/chosen": -115.22535705566406, "logps/rejected": -123.31414794921875, "loss": 0.139, "rewards/accuracies": 0.90625, "rewards/chosen": 5.049933433532715, "rewards/margins": 14.674034118652344, "rewards/rejected": -9.624099731445312, "step": 270 }, { "epoch": 0.59, "grad_norm": 267.05255152770263, "learning_rate": 2.19029145890313e-07, "logits/chosen": -1.6403396129608154, "logits/rejected": -1.728877067565918, "logps/chosen": -121.6760482788086, "logps/rejected": -137.2602996826172, "loss": 0.1837, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 4.614184379577637, "rewards/margins": 15.387414932250977, "rewards/rejected": -10.773229598999023, "step": 280 }, { "epoch": 0.61, "grad_norm": 355.7773679008215, "learning_rate": 2.0100351342479216e-07, "logits/chosen": -1.7196556329727173, "logits/rejected": -1.7061948776245117, "logps/chosen": -112.95915222167969, "logps/rejected": -124.29833984375, "loss": 0.1648, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 4.5611090660095215, "rewards/margins": 15.115681648254395, "rewards/rejected": -10.554571151733398, "step": 290 }, { "epoch": 0.63, "grad_norm": 387.9028451533635, "learning_rate": 1.8323929841460178e-07, "logits/chosen": -1.6907150745391846, "logits/rejected": -1.6335220336914062, "logps/chosen": -130.05313110351562, "logps/rejected": -141.36476135253906, "loss": 0.1585, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 4.916111946105957, "rewards/margins": 15.453518867492676, "rewards/rejected": -10.537405967712402, "step": 300 }, { "epoch": 0.63, "eval_logits/chosen": -1.7980220317840576, "eval_logits/rejected": -1.7959610223770142, "eval_logps/chosen": -120.6431655883789, "eval_logps/rejected": -132.25042724609375, "eval_loss": 0.14247241616249084, "eval_rewards/accuracies": 0.92578125, "eval_rewards/chosen": 5.138728141784668, "eval_rewards/margins": 16.841632843017578, "eval_rewards/rejected": -11.702906608581543, "eval_runtime": 97.7011, "eval_samples_per_second": 20.471, "eval_steps_per_second": 0.328, "step": 300 }, { "epoch": 0.65, "grad_norm": 308.7683125090126, "learning_rate": 1.6583128063291573e-07, "logits/chosen": -1.6941922903060913, "logits/rejected": -1.7201515436172485, "logps/chosen": -121.898193359375, "logps/rejected": -132.00962829589844, "loss": 0.1619, "rewards/accuracies": 0.9375, "rewards/chosen": 5.231846809387207, "rewards/margins": 15.2730131149292, "rewards/rejected": -10.041168212890625, "step": 310 }, { "epoch": 0.67, "grad_norm": 208.88097429038612, "learning_rate": 1.488723393865766e-07, "logits/chosen": -1.7199640274047852, "logits/rejected": -1.6923316717147827, "logps/chosen": -111.40040588378906, "logps/rejected": -131.6649627685547, "loss": 0.154, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 5.818942070007324, "rewards/margins": 16.441822052001953, "rewards/rejected": -10.62287712097168, "step": 320 }, { "epoch": 0.69, "grad_norm": 167.39144709959334, "learning_rate": 1.3245295796480788e-07, "logits/chosen": -1.65103018283844, "logits/rejected": -1.7277615070343018, "logps/chosen": -121.6876449584961, "logps/rejected": -127.0235366821289, "loss": 0.1507, "rewards/accuracies": 0.90625, "rewards/chosen": 4.948590278625488, "rewards/margins": 14.931228637695312, "rewards/rejected": -9.982640266418457, "step": 330 }, { "epoch": 0.71, "grad_norm": 156.90849982717606, "learning_rate": 1.1666074087171627e-07, "logits/chosen": -1.602086067199707, "logits/rejected": -1.6703577041625977, "logps/chosen": -115.71211242675781, "logps/rejected": -116.56599426269531, "loss": 0.1908, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 4.433460235595703, "rewards/margins": 13.112295150756836, "rewards/rejected": -8.678834915161133, "step": 340 }, { "epoch": 0.73, "grad_norm": 295.768622683131, "learning_rate": 1.0157994641835734e-07, "logits/chosen": -1.6467043161392212, "logits/rejected": -1.659854531288147, "logps/chosen": -115.77754974365234, "logps/rejected": -137.26531982421875, "loss": 0.1326, "rewards/accuracies": 0.9375, "rewards/chosen": 4.292226314544678, "rewards/margins": 15.004185676574707, "rewards/rejected": -10.711957931518555, "step": 350 }, { "epoch": 0.75, "grad_norm": 201.50715466732544, "learning_rate": 8.729103716819111e-08, "logits/chosen": -1.6420570611953735, "logits/rejected": -1.663745641708374, "logps/chosen": -122.47066497802734, "logps/rejected": -131.9073486328125, "loss": 0.1543, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 4.703906059265137, "rewards/margins": 14.480855941772461, "rewards/rejected": -9.776951789855957, "step": 360 }, { "epoch": 0.77, "grad_norm": 398.4959635020424, "learning_rate": 7.387025063449081e-08, "logits/chosen": -1.7505977153778076, "logits/rejected": -1.6930338144302368, "logps/chosen": -112.33251953125, "logps/rejected": -121.57230377197266, "loss": 0.1443, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 4.217165470123291, "rewards/margins": 12.93463134765625, "rewards/rejected": -8.7174654006958, "step": 370 }, { "epoch": 0.79, "grad_norm": 239.63863129657855, "learning_rate": 6.138919252022435e-08, "logits/chosen": -1.7603282928466797, "logits/rejected": -1.7833961248397827, "logps/chosen": -119.14958190917969, "logps/rejected": -136.85977172851562, "loss": 0.1576, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 5.251246452331543, "rewards/margins": 16.546478271484375, "rewards/rejected": -11.295232772827148, "step": 380 }, { "epoch": 0.82, "grad_norm": 197.58750581294214, "learning_rate": 4.991445467064689e-08, "logits/chosen": -1.656961441040039, "logits/rejected": -1.6608669757843018, "logps/chosen": -115.59519958496094, "logps/rejected": -124.886474609375, "loss": 0.1242, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 5.070186614990234, "rewards/margins": 15.444877624511719, "rewards/rejected": -10.3746919631958, "step": 390 }, { "epoch": 0.84, "grad_norm": 266.2162184394133, "learning_rate": 3.9507259776993954e-08, "logits/chosen": -1.636885643005371, "logits/rejected": -1.713200330734253, "logps/chosen": -117.88565826416016, "logps/rejected": -124.86863708496094, "loss": 0.2005, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 4.46274471282959, "rewards/margins": 14.449310302734375, "rewards/rejected": -9.986566543579102, "step": 400 }, { "epoch": 0.84, "eval_logits/chosen": -1.8133598566055298, "eval_logits/rejected": -1.8130455017089844, "eval_logps/chosen": -119.97606658935547, "eval_logps/rejected": -131.71229553222656, "eval_loss": 0.14202240109443665, "eval_rewards/accuracies": 0.92578125, "eval_rewards/chosen": 5.338851451873779, "eval_rewards/margins": 16.880319595336914, "eval_rewards/rejected": -11.541468620300293, "eval_runtime": 97.6019, "eval_samples_per_second": 20.491, "eval_steps_per_second": 0.328, "step": 400 }, { "epoch": 0.86, "grad_norm": 300.6588760835356, "learning_rate": 3.022313472693447e-08, "logits/chosen": -1.735870599746704, "logits/rejected": -1.7638452053070068, "logps/chosen": -128.670166015625, "logps/rejected": -129.5255126953125, "loss": 0.1508, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.593460559844971, "rewards/margins": 16.819358825683594, "rewards/rejected": -11.225897789001465, "step": 410 }, { "epoch": 0.88, "grad_norm": 184.26802651594184, "learning_rate": 2.2111614344599684e-08, "logits/chosen": -1.7033697366714478, "logits/rejected": -1.704993486404419, "logps/chosen": -113.72819519042969, "logps/rejected": -124.34715270996094, "loss": 0.1419, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 4.3744401931762695, "rewards/margins": 14.479223251342773, "rewards/rejected": -10.104782104492188, "step": 420 }, { "epoch": 0.9, "grad_norm": 269.33105520831003, "learning_rate": 1.521597710086439e-08, "logits/chosen": -1.6633046865463257, "logits/rejected": -1.6779390573501587, "logps/chosen": -129.4523162841797, "logps/rejected": -124.72953796386719, "loss": 0.154, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 4.997079372406006, "rewards/margins": 14.370283126831055, "rewards/rejected": -9.373201370239258, "step": 430 }, { "epoch": 0.92, "grad_norm": 181.5022378731684, "learning_rate": 9.57301420397924e-09, "logits/chosen": -1.7851531505584717, "logits/rejected": -1.790157675743103, "logps/chosen": -120.8554458618164, "logps/rejected": -131.34410095214844, "loss": 0.144, "rewards/accuracies": 0.96875, "rewards/chosen": 5.435299873352051, "rewards/margins": 16.81759262084961, "rewards/rejected": -11.382290840148926, "step": 440 }, { "epoch": 0.94, "grad_norm": 227.09412853715097, "learning_rate": 5.212833302556258e-09, "logits/chosen": -1.7998348474502563, "logits/rejected": -1.7618439197540283, "logps/chosen": -116.46590423583984, "logps/rejected": -127.42796325683594, "loss": 0.159, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.652300834655762, "rewards/margins": 17.369140625, "rewards/rejected": -11.716839790344238, "step": 450 }, { "epoch": 0.96, "grad_norm": 225.69247261851913, "learning_rate": 2.158697848236607e-09, "logits/chosen": -1.6827681064605713, "logits/rejected": -1.7046699523925781, "logps/chosen": -120.79095458984375, "logps/rejected": -130.98043823242188, "loss": 0.1335, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 5.568859100341797, "rewards/margins": 16.174407958984375, "rewards/rejected": -10.605547904968262, "step": 460 }, { "epoch": 0.98, "grad_norm": 199.48903785359678, "learning_rate": 4.269029751107489e-10, "logits/chosen": -1.6656444072723389, "logits/rejected": -1.6796722412109375, "logps/chosen": -114.7574462890625, "logps/rejected": -139.92117309570312, "loss": 0.1514, "rewards/accuracies": 0.90625, "rewards/chosen": 4.486257553100586, "rewards/margins": 15.330111503601074, "rewards/rejected": -10.843853950500488, "step": 470 }, { "epoch": 1.0, "step": 478, "total_flos": 0.0, "train_loss": 0.17749474911510196, "train_runtime": 7645.2484, "train_samples_per_second": 7.996, "train_steps_per_second": 0.063 } ], "logging_steps": 10, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }