{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994242947610823, "eval_steps": 100, "global_step": 868, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 146.8957421194674, "learning_rate": 5.747126436781609e-09, "logits/chosen": -1.8686045408248901, "logits/rejected": -1.7644572257995605, "logps/chosen": -235.48362731933594, "logps/rejected": -183.77415466308594, "loss": 0.6922, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "grad_norm": 237.04909600464902, "learning_rate": 5.747126436781609e-08, "logits/chosen": -1.9218311309814453, "logits/rejected": -1.8686226606369019, "logps/chosen": -240.50628662109375, "logps/rejected": -216.8230438232422, "loss": 0.6941, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": -0.002868607407435775, "rewards/margins": -0.0005126786418259144, "rewards/rejected": -0.002355928998440504, "step": 10 }, { "epoch": 0.02, "grad_norm": 211.82620683349913, "learning_rate": 1.1494252873563217e-07, "logits/chosen": -2.010253429412842, "logits/rejected": -1.8642921447753906, "logps/chosen": -283.1783752441406, "logps/rejected": -215.68887329101562, "loss": 0.6864, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.008166970685124397, "rewards/margins": 0.013039084151387215, "rewards/rejected": -0.004872114397585392, "step": 20 }, { "epoch": 0.03, "grad_norm": 112.97267570781544, "learning_rate": 1.7241379310344828e-07, "logits/chosen": -1.9509646892547607, "logits/rejected": -1.8835735321044922, "logps/chosen": -240.29074096679688, "logps/rejected": -221.15274047851562, "loss": 0.666, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.06300461292266846, "rewards/margins": 0.06308884918689728, "rewards/rejected": -8.423496183240786e-05, "step": 30 }, { "epoch": 0.05, "grad_norm": 95.74241597637995, "learning_rate": 2.2988505747126435e-07, "logits/chosen": -1.9107061624526978, "logits/rejected": -1.8901317119598389, "logps/chosen": -237.59036254882812, "logps/rejected": -216.1823272705078, "loss": 0.6254, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1618741899728775, "rewards/margins": 0.1895591914653778, "rewards/rejected": -0.027684981003403664, "step": 40 }, { "epoch": 0.06, "grad_norm": 99.41131976874209, "learning_rate": 2.873563218390804e-07, "logits/chosen": -1.9803855419158936, "logits/rejected": -1.9117343425750732, "logps/chosen": -222.8354949951172, "logps/rejected": -207.8356170654297, "loss": 0.587, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.23140163719654083, "rewards/margins": 0.3971676230430603, "rewards/rejected": -0.16576598584651947, "step": 50 }, { "epoch": 0.07, "grad_norm": 85.607755330573, "learning_rate": 3.4482758620689656e-07, "logits/chosen": -2.0026185512542725, "logits/rejected": -1.9393552541732788, "logps/chosen": -283.61199951171875, "logps/rejected": -245.7741241455078, "loss": 0.5351, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.2226170301437378, "rewards/margins": 0.6314491629600525, "rewards/rejected": -0.4088321626186371, "step": 60 }, { "epoch": 0.08, "grad_norm": 105.09596276450131, "learning_rate": 4.0229885057471266e-07, "logits/chosen": -1.9401233196258545, "logits/rejected": -1.9097837209701538, "logps/chosen": -223.41983032226562, "logps/rejected": -222.15283203125, "loss": 0.5181, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10973484814167023, "rewards/margins": 0.7763268351554871, "rewards/rejected": -0.8860616683959961, "step": 70 }, { "epoch": 0.09, "grad_norm": 103.35591967882027, "learning_rate": 4.597701149425287e-07, "logits/chosen": -1.8168106079101562, "logits/rejected": -1.7877649068832397, "logps/chosen": -227.1862030029297, "logps/rejected": -217.2149200439453, "loss": 0.487, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.1610104888677597, "rewards/margins": 0.8937110900878906, "rewards/rejected": -1.0547215938568115, "step": 80 }, { "epoch": 0.1, "grad_norm": 96.75574881093117, "learning_rate": 4.999817969178237e-07, "logits/chosen": -1.8949458599090576, "logits/rejected": -1.8469880819320679, "logps/chosen": -249.9053192138672, "logps/rejected": -233.94631958007812, "loss": 0.4415, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.24459132552146912, "rewards/margins": 1.046515703201294, "rewards/rejected": -1.291106939315796, "step": 90 }, { "epoch": 0.12, "grad_norm": 92.71145741103783, "learning_rate": 4.996582603056428e-07, "logits/chosen": -1.8467813730239868, "logits/rejected": -1.7593021392822266, "logps/chosen": -251.5709686279297, "logps/rejected": -255.50814819335938, "loss": 0.4898, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3932832181453705, "rewards/margins": 1.1354423761367798, "rewards/rejected": -1.5287256240844727, "step": 100 }, { "epoch": 0.12, "eval_logits/chosen": -1.8469043970108032, "eval_logits/rejected": -1.7659016847610474, "eval_logps/chosen": -339.4445495605469, "eval_logps/rejected": -353.2087707519531, "eval_loss": 0.5505225658416748, "eval_rewards/accuracies": 0.6875, "eval_rewards/chosen": -0.19665074348449707, "eval_rewards/margins": 0.8084924817085266, "eval_rewards/rejected": -1.005143165588379, "eval_runtime": 98.3147, "eval_samples_per_second": 20.343, "eval_steps_per_second": 0.325, "step": 100 }, { "epoch": 0.13, "grad_norm": 108.17707201404545, "learning_rate": 4.989308132738126e-07, "logits/chosen": -1.8153671026229858, "logits/rejected": -1.6921818256378174, "logps/chosen": -244.46237182617188, "logps/rejected": -222.83145141601562, "loss": 0.4368, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.10259479284286499, "rewards/margins": 1.188932180404663, "rewards/rejected": -1.2915267944335938, "step": 110 }, { "epoch": 0.14, "grad_norm": 100.40422750465044, "learning_rate": 4.978006327248536e-07, "logits/chosen": -1.7503843307495117, "logits/rejected": -1.6768817901611328, "logps/chosen": -247.3853759765625, "logps/rejected": -240.05496215820312, "loss": 0.4237, "rewards/accuracies": 0.8125, "rewards/chosen": -0.34009289741516113, "rewards/margins": 1.1859456300735474, "rewards/rejected": -1.526038408279419, "step": 120 }, { "epoch": 0.15, "grad_norm": 91.1070676572417, "learning_rate": 4.962695471250032e-07, "logits/chosen": -1.654911756515503, "logits/rejected": -1.62875235080719, "logps/chosen": -254.4684600830078, "logps/rejected": -245.8209686279297, "loss": 0.4249, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5070122480392456, "rewards/margins": 1.3276703357696533, "rewards/rejected": -1.8346824645996094, "step": 130 }, { "epoch": 0.16, "grad_norm": 97.14051346245, "learning_rate": 4.94340033546025e-07, "logits/chosen": -1.6183685064315796, "logits/rejected": -1.6242666244506836, "logps/chosen": -220.20852661132812, "logps/rejected": -242.1243133544922, "loss": 0.4218, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.6856725811958313, "rewards/margins": 1.2277655601501465, "rewards/rejected": -1.913438081741333, "step": 140 }, { "epoch": 0.17, "grad_norm": 94.8408241800107, "learning_rate": 4.920152136576705e-07, "logits/chosen": -1.4978981018066406, "logits/rejected": -1.5163871049880981, "logps/chosen": -251.6667938232422, "logps/rejected": -254.0281219482422, "loss": 0.4261, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6180266737937927, "rewards/margins": 1.381075143814087, "rewards/rejected": -1.9991016387939453, "step": 150 }, { "epoch": 0.18, "grad_norm": 108.14894780962342, "learning_rate": 4.892988486772756e-07, "logits/chosen": -1.579886794090271, "logits/rejected": -1.5718797445297241, "logps/chosen": -250.04495239257812, "logps/rejected": -262.3863830566406, "loss": 0.4246, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6856819987297058, "rewards/margins": 1.5999362468719482, "rewards/rejected": -2.285618305206299, "step": 160 }, { "epoch": 0.2, "grad_norm": 70.9507948612333, "learning_rate": 4.861953332846629e-07, "logits/chosen": -1.7066303491592407, "logits/rejected": -1.6715686321258545, "logps/chosen": -270.17132568359375, "logps/rejected": -260.03350830078125, "loss": 0.4316, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8634172677993774, "rewards/margins": 1.3165854215621948, "rewards/rejected": -2.1800026893615723, "step": 170 }, { "epoch": 0.21, "grad_norm": 114.85174445864124, "learning_rate": 4.827096885121953e-07, "logits/chosen": -1.817678689956665, "logits/rejected": -1.7374699115753174, "logps/chosen": -286.0107116699219, "logps/rejected": -279.7050476074219, "loss": 0.4309, "rewards/accuracies": 0.75, "rewards/chosen": -1.0613857507705688, "rewards/margins": 1.1405597925186157, "rewards/rejected": -2.2019453048706055, "step": 180 }, { "epoch": 0.22, "grad_norm": 90.50728576001431, "learning_rate": 4.788475536214821e-07, "logits/chosen": -1.7829034328460693, "logits/rejected": -1.7400600910186768, "logps/chosen": -236.38650512695312, "logps/rejected": -245.64291381835938, "loss": 0.3873, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8258365392684937, "rewards/margins": 1.4504865407943726, "rewards/rejected": -2.276322841644287, "step": 190 }, { "epoch": 0.23, "grad_norm": 100.41476084088063, "learning_rate": 4.746151769798818e-07, "logits/chosen": -1.7743650674819946, "logits/rejected": -1.7203725576400757, "logps/chosen": -279.8271789550781, "logps/rejected": -269.1463317871094, "loss": 0.4277, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8149574398994446, "rewards/margins": 1.5317128896713257, "rewards/rejected": -2.346670389175415, "step": 200 }, { "epoch": 0.23, "eval_logits/chosen": -1.8008995056152344, "eval_logits/rejected": -1.7248116731643677, "eval_logps/chosen": -345.1794738769531, "eval_logps/rejected": -370.77880859375, "eval_loss": 0.46549829840660095, "eval_rewards/accuracies": 0.73828125, "eval_rewards/chosen": -0.4833980202674866, "eval_rewards/margins": 1.4002480506896973, "eval_rewards/rejected": -1.8836462497711182, "eval_runtime": 98.0211, "eval_samples_per_second": 20.404, "eval_steps_per_second": 0.326, "step": 200 }, { "epoch": 0.24, "grad_norm": 99.20354803224151, "learning_rate": 4.7001940595156055e-07, "logits/chosen": -1.7606436014175415, "logits/rejected": -1.6915203332901, "logps/chosen": -232.93832397460938, "logps/rejected": -246.24072265625, "loss": 0.441, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.0429778099060059, "rewards/margins": 1.6135696172714233, "rewards/rejected": -2.6565470695495605, "step": 210 }, { "epoch": 0.25, "grad_norm": 80.49166768353011, "learning_rate": 4.650676758194623e-07, "logits/chosen": -1.820059061050415, "logits/rejected": -1.741408348083496, "logps/chosen": -265.69769287109375, "logps/rejected": -259.860595703125, "loss": 0.4003, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.915117621421814, "rewards/margins": 1.692671537399292, "rewards/rejected": -2.6077892780303955, "step": 220 }, { "epoch": 0.26, "grad_norm": 103.96889607439884, "learning_rate": 4.5976799775611215e-07, "logits/chosen": -1.84983229637146, "logits/rejected": -1.7714653015136719, "logps/chosen": -260.39678955078125, "logps/rejected": -254.94235229492188, "loss": 0.4118, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.0704491138458252, "rewards/margins": 1.982627511024475, "rewards/rejected": -3.05307674407959, "step": 230 }, { "epoch": 0.28, "grad_norm": 87.59607388393054, "learning_rate": 4.5412894586271543e-07, "logits/chosen": -1.8541374206542969, "logits/rejected": -1.7867300510406494, "logps/chosen": -259.6327819824219, "logps/rejected": -233.4642333984375, "loss": 0.3961, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.8504000902175903, "rewards/margins": 1.6259987354278564, "rewards/rejected": -2.4763987064361572, "step": 240 }, { "epoch": 0.29, "grad_norm": 87.76098428515671, "learning_rate": 4.481596432975201e-07, "logits/chosen": -1.8068411350250244, "logits/rejected": -1.7720015048980713, "logps/chosen": -218.69546508789062, "logps/rejected": -231.09292602539062, "loss": 0.4236, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8816181421279907, "rewards/margins": 1.4680168628692627, "rewards/rejected": -2.349634885787964, "step": 250 }, { "epoch": 0.3, "grad_norm": 99.04429789846779, "learning_rate": 4.41869747515886e-07, "logits/chosen": -1.7596534490585327, "logits/rejected": -1.7420539855957031, "logps/chosen": -272.49273681640625, "logps/rejected": -301.8647766113281, "loss": 0.3824, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0995886325836182, "rewards/margins": 1.7163059711456299, "rewards/rejected": -2.815894603729248, "step": 260 }, { "epoch": 0.31, "grad_norm": 91.17380836071179, "learning_rate": 4.352694346459396e-07, "logits/chosen": -1.7119308710098267, "logits/rejected": -1.713330626487732, "logps/chosen": -245.8417205810547, "logps/rejected": -257.9246826171875, "loss": 0.3928, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.6780133843421936, "rewards/margins": 1.5433403253555298, "rewards/rejected": -2.221353769302368, "step": 270 }, { "epoch": 0.32, "grad_norm": 85.89753055245001, "learning_rate": 4.2836938302509256e-07, "logits/chosen": -1.7531852722167969, "logits/rejected": -1.6354246139526367, "logps/chosen": -243.12454223632812, "logps/rejected": -249.15576171875, "loss": 0.4123, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8161875605583191, "rewards/margins": 1.92093825340271, "rewards/rejected": -2.737125873565674, "step": 280 }, { "epoch": 0.33, "grad_norm": 93.29457537220665, "learning_rate": 4.2118075592405874e-07, "logits/chosen": -1.6761655807495117, "logits/rejected": -1.668341040611267, "logps/chosen": -263.28924560546875, "logps/rejected": -276.89190673828125, "loss": 0.3899, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.7706686854362488, "rewards/margins": 1.6867666244506836, "rewards/rejected": -2.457435131072998, "step": 290 }, { "epoch": 0.35, "grad_norm": 104.22663391151875, "learning_rate": 4.137151834863213e-07, "logits/chosen": -1.5810635089874268, "logits/rejected": -1.56969153881073, "logps/chosen": -215.5782012939453, "logps/rejected": -242.8097381591797, "loss": 0.4188, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0694911479949951, "rewards/margins": 1.6282224655151367, "rewards/rejected": -2.6977133750915527, "step": 300 }, { "epoch": 0.35, "eval_logits/chosen": -1.6898695230484009, "eval_logits/rejected": -1.6142553091049194, "eval_logps/chosen": -336.9513244628906, "eval_logps/rejected": -373.6328430175781, "eval_loss": 0.39220622181892395, "eval_rewards/accuracies": 0.796875, "eval_rewards/chosen": -0.07198946177959442, "eval_rewards/margins": 1.9543578624725342, "eval_rewards/rejected": -2.0263473987579346, "eval_runtime": 98.0591, "eval_samples_per_second": 20.396, "eval_steps_per_second": 0.326, "step": 300 }, { "epoch": 0.36, "grad_norm": 83.12181184953562, "learning_rate": 4.059847439122671e-07, "logits/chosen": -1.7229493856430054, "logits/rejected": -1.590850591659546, "logps/chosen": -260.96539306640625, "logps/rejected": -262.43865966796875, "loss": 0.4106, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0447856187820435, "rewards/margins": 1.545778512954712, "rewards/rejected": -2.590564250946045, "step": 310 }, { "epoch": 0.37, "grad_norm": 101.57242075352875, "learning_rate": 3.98001943918432e-07, "logits/chosen": -1.6267013549804688, "logits/rejected": -1.5495989322662354, "logps/chosen": -246.096923828125, "logps/rejected": -268.87274169921875, "loss": 0.3937, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1101688146591187, "rewards/margins": 1.6341445446014404, "rewards/rejected": -2.7443130016326904, "step": 320 }, { "epoch": 0.38, "grad_norm": 95.29775137930592, "learning_rate": 3.8977969850346866e-07, "logits/chosen": -1.5687649250030518, "logits/rejected": -1.5476423501968384, "logps/chosen": -262.67156982421875, "logps/rejected": -260.74725341796875, "loss": 0.3887, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.9716947674751282, "rewards/margins": 1.6578747034072876, "rewards/rejected": -2.6295692920684814, "step": 330 }, { "epoch": 0.39, "grad_norm": 105.61150036189296, "learning_rate": 3.8133131005357465e-07, "logits/chosen": -1.662001371383667, "logits/rejected": -1.5881233215332031, "logps/chosen": -249.8246612548828, "logps/rejected": -272.1719970703125, "loss": 0.38, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.9675191044807434, "rewards/margins": 2.143432140350342, "rewards/rejected": -3.1109511852264404, "step": 340 }, { "epoch": 0.4, "grad_norm": 90.41304599394526, "learning_rate": 3.7267044682118435e-07, "logits/chosen": -1.6518446207046509, "logits/rejected": -1.6180702447891235, "logps/chosen": -237.4104461669922, "logps/rejected": -247.13473510742188, "loss": 0.3749, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1348434686660767, "rewards/margins": 1.6873624324798584, "rewards/rejected": -2.8222060203552246, "step": 350 }, { "epoch": 0.41, "grad_norm": 95.16076600849088, "learning_rate": 3.638111208117425e-07, "logits/chosen": -1.7075703144073486, "logits/rejected": -1.6682260036468506, "logps/chosen": -247.7568817138672, "logps/rejected": -263.2861633300781, "loss": 0.3914, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0359086990356445, "rewards/margins": 1.2954949140548706, "rewards/rejected": -2.3314034938812256, "step": 360 }, { "epoch": 0.43, "grad_norm": 96.37188736705814, "learning_rate": 3.5476766511433605e-07, "logits/chosen": -1.7550392150878906, "logits/rejected": -1.6907631158828735, "logps/chosen": -281.13836669921875, "logps/rejected": -266.0453796386719, "loss": 0.4097, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0189838409423828, "rewards/margins": 1.6438014507293701, "rewards/rejected": -2.662785291671753, "step": 370 }, { "epoch": 0.44, "grad_norm": 101.71059089383448, "learning_rate": 3.455547107128602e-07, "logits/chosen": -1.7938178777694702, "logits/rejected": -1.787418007850647, "logps/chosen": -298.5751647949219, "logps/rejected": -281.03118896484375, "loss": 0.3611, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.0122178792953491, "rewards/margins": 2.0528714656829834, "rewards/rejected": -3.065088987350464, "step": 380 }, { "epoch": 0.45, "grad_norm": 109.30188938122785, "learning_rate": 3.361871628152338e-07, "logits/chosen": -1.8471622467041016, "logits/rejected": -1.820481300354004, "logps/chosen": -251.8716278076172, "logps/rejected": -283.156982421875, "loss": 0.4006, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0065103769302368, "rewards/margins": 1.774526834487915, "rewards/rejected": -2.7810370922088623, "step": 390 }, { "epoch": 0.46, "grad_norm": 89.15033820155391, "learning_rate": 3.2668017673896077e-07, "logits/chosen": -1.8275989294052124, "logits/rejected": -1.8090099096298218, "logps/chosen": -244.0500946044922, "logps/rejected": -245.302490234375, "loss": 0.3506, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.62747722864151, "rewards/margins": 2.005857467651367, "rewards/rejected": -2.6333346366882324, "step": 400 }, { "epoch": 0.46, "eval_logits/chosen": -2.029554843902588, "eval_logits/rejected": -1.9793704748153687, "eval_logps/chosen": -331.169189453125, "eval_logps/rejected": -374.0495300292969, "eval_loss": 0.3456653654575348, "eval_rewards/accuracies": 0.8203125, "eval_rewards/chosen": 0.217118039727211, "eval_rewards/margins": 2.2643015384674072, "eval_rewards/rejected": -2.0471832752227783, "eval_runtime": 97.9966, "eval_samples_per_second": 20.409, "eval_steps_per_second": 0.327, "step": 400 }, { "epoch": 0.47, "grad_norm": 95.54284470696203, "learning_rate": 3.1704913339205103e-07, "logits/chosen": -1.9682222604751587, "logits/rejected": -1.9323310852050781, "logps/chosen": -251.9891357421875, "logps/rejected": -276.3076477050781, "loss": 0.3841, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.7251254320144653, "rewards/margins": 1.8693323135375977, "rewards/rejected": -2.5944573879241943, "step": 410 }, { "epoch": 0.48, "grad_norm": 84.89194807368709, "learning_rate": 3.0730961438896885e-07, "logits/chosen": -1.973587989807129, "logits/rejected": -1.9758260250091553, "logps/chosen": -326.0084533691406, "logps/rejected": -316.46380615234375, "loss": 0.3676, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.2638908624649048, "rewards/margins": 1.681646704673767, "rewards/rejected": -2.945537805557251, "step": 420 }, { "epoch": 0.5, "grad_norm": 79.45330669787697, "learning_rate": 2.9747737684186795e-07, "logits/chosen": -1.9824804067611694, "logits/rejected": -2.0262296199798584, "logps/chosen": -253.5684814453125, "logps/rejected": -258.55352783203125, "loss": 0.3861, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8729881048202515, "rewards/margins": 1.7805559635162354, "rewards/rejected": -2.6535439491271973, "step": 430 }, { "epoch": 0.51, "grad_norm": 92.0142521630137, "learning_rate": 2.8756832786789663e-07, "logits/chosen": -2.008882761001587, "logits/rejected": -1.9781932830810547, "logps/chosen": -269.37042236328125, "logps/rejected": -264.4306335449219, "loss": 0.382, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.5206736326217651, "rewards/margins": 1.916638970375061, "rewards/rejected": -2.437312364578247, "step": 440 }, { "epoch": 0.52, "grad_norm": 81.53843237959488, "learning_rate": 2.7759849885381747e-07, "logits/chosen": -1.951061487197876, "logits/rejected": -1.9269872903823853, "logps/chosen": -282.7742614746094, "logps/rejected": -261.77130126953125, "loss": 0.3613, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7152455449104309, "rewards/margins": 2.0386595726013184, "rewards/rejected": -2.7539050579071045, "step": 450 }, { "epoch": 0.53, "grad_norm": 81.29719600574327, "learning_rate": 2.675840195195762e-07, "logits/chosen": -1.935346007347107, "logits/rejected": -1.8713289499282837, "logps/chosen": -237.9385528564453, "logps/rejected": -262.90496826171875, "loss": 0.3768, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7526956796646118, "rewards/margins": 2.036323070526123, "rewards/rejected": -2.7890188694000244, "step": 460 }, { "epoch": 0.54, "grad_norm": 79.41069847067702, "learning_rate": 2.575410918227829e-07, "logits/chosen": -1.9017337560653687, "logits/rejected": -1.8612467050552368, "logps/chosen": -286.1165771484375, "logps/rejected": -286.47576904296875, "loss": 0.3801, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7328876256942749, "rewards/margins": 1.7589473724365234, "rewards/rejected": -2.491835117340088, "step": 470 }, { "epoch": 0.55, "grad_norm": 84.31202005756423, "learning_rate": 2.474859637463226e-07, "logits/chosen": -1.7930266857147217, "logits/rejected": -1.6920406818389893, "logps/chosen": -261.0568542480469, "logps/rejected": -249.1828155517578, "loss": 0.373, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6825451254844666, "rewards/margins": 2.1065242290496826, "rewards/rejected": -2.789069414138794, "step": 480 }, { "epoch": 0.56, "grad_norm": 85.77456711320595, "learning_rate": 2.3743490301150355e-07, "logits/chosen": -1.8167390823364258, "logits/rejected": -1.6751165390014648, "logps/chosen": -254.97879028320312, "logps/rejected": -258.82794189453125, "loss": 0.3847, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.421345055103302, "rewards/margins": 2.164630174636841, "rewards/rejected": -2.585975408554077, "step": 490 }, { "epoch": 0.58, "grad_norm": 80.63419673971578, "learning_rate": 2.274041707592724e-07, "logits/chosen": -1.7511818408966064, "logits/rejected": -1.744807481765747, "logps/chosen": -238.7528839111328, "logps/rejected": -277.8020935058594, "loss": 0.3611, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.161329984664917, "rewards/margins": 2.127523422241211, "rewards/rejected": -3.288853406906128, "step": 500 }, { "epoch": 0.58, "eval_logits/chosen": -1.8592296838760376, "eval_logits/rejected": -1.818342685699463, "eval_logps/chosen": -330.5163879394531, "eval_logps/rejected": -381.7996520996094, "eval_loss": 0.29585352540016174, "eval_rewards/accuracies": 0.8515625, "eval_rewards/chosen": 0.24975742399692535, "eval_rewards/margins": 2.68444561958313, "eval_rewards/rejected": -2.4346883296966553, "eval_runtime": 97.917, "eval_samples_per_second": 20.425, "eval_steps_per_second": 0.327, "step": 500 }, { "epoch": 0.59, "grad_norm": 93.97823403064545, "learning_rate": 2.17409995242075e-07, "logits/chosen": -1.8619616031646729, "logits/rejected": -1.8197988271713257, "logps/chosen": -246.95120239257812, "logps/rejected": -251.1341552734375, "loss": 0.3787, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.9560788869857788, "rewards/margins": 2.1973986625671387, "rewards/rejected": -3.153477668762207, "step": 510 }, { "epoch": 0.6, "grad_norm": 81.29931903371055, "learning_rate": 2.0746854556892544e-07, "logits/chosen": -1.8456933498382568, "logits/rejected": -1.8626302480697632, "logps/chosen": -228.947265625, "logps/rejected": -255.99887084960938, "loss": 0.383, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.850503146648407, "rewards/margins": 1.7254650592803955, "rewards/rejected": -2.5759682655334473, "step": 520 }, { "epoch": 0.61, "grad_norm": 82.34962963997694, "learning_rate": 1.9759590554616173e-07, "logits/chosen": -1.9397910833358765, "logits/rejected": -1.9602453708648682, "logps/chosen": -259.236328125, "logps/rejected": -268.33233642578125, "loss": 0.3812, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5990003347396851, "rewards/margins": 1.7735519409179688, "rewards/rejected": -2.3725523948669434, "step": 530 }, { "epoch": 0.62, "grad_norm": 106.09742448074628, "learning_rate": 1.8780804765620746e-07, "logits/chosen": -1.8515198230743408, "logits/rejected": -1.8524954319000244, "logps/chosen": -268.37164306640625, "logps/rejected": -304.6097106933594, "loss": 0.3726, "rewards/accuracies": 0.78125, "rewards/chosen": -0.32395535707473755, "rewards/margins": 1.7947556972503662, "rewards/rejected": -2.118710994720459, "step": 540 }, { "epoch": 0.63, "grad_norm": 89.06639468347461, "learning_rate": 1.7812080721643973e-07, "logits/chosen": -1.8575937747955322, "logits/rejected": -1.761614441871643, "logps/chosen": -263.5863037109375, "logps/rejected": -246.20455932617188, "loss": 0.3972, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.7359617352485657, "rewards/margins": 1.8330894708633423, "rewards/rejected": -2.5690512657165527, "step": 550 }, { "epoch": 0.64, "grad_norm": 91.51640548835455, "learning_rate": 1.6854985675997063e-07, "logits/chosen": -1.8521220684051514, "logits/rejected": -1.8151109218597412, "logps/chosen": -261.0481872558594, "logps/rejected": -272.9272155761719, "loss": 0.3641, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9162181615829468, "rewards/margins": 1.6873579025268555, "rewards/rejected": -2.603576183319092, "step": 560 }, { "epoch": 0.66, "grad_norm": 83.89007997657542, "learning_rate": 1.5911068067978818e-07, "logits/chosen": -1.7626062631607056, "logits/rejected": -1.7946131229400635, "logps/chosen": -240.85189819335938, "logps/rejected": -273.8741149902344, "loss": 0.3514, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.9277374148368835, "rewards/margins": 1.9961919784545898, "rewards/rejected": -2.923929214477539, "step": 570 }, { "epoch": 0.67, "grad_norm": 103.64429212789244, "learning_rate": 1.4981855017728197e-07, "logits/chosen": -1.8487884998321533, "logits/rejected": -1.8991634845733643, "logps/chosen": -267.7337341308594, "logps/rejected": -304.248291015625, "loss": 0.37, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1558853387832642, "rewards/margins": 1.9654242992401123, "rewards/rejected": -3.121309757232666, "step": 580 }, { "epoch": 0.68, "grad_norm": 87.97513457883551, "learning_rate": 1.406884985556804e-07, "logits/chosen": -1.8694698810577393, "logits/rejected": -1.844276785850525, "logps/chosen": -257.92413330078125, "logps/rejected": -268.24462890625, "loss": 0.3586, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2031538486480713, "rewards/margins": 2.1714892387390137, "rewards/rejected": -3.374642848968506, "step": 590 }, { "epoch": 0.69, "grad_norm": 85.79715679551722, "learning_rate": 1.3173529689837354e-07, "logits/chosen": -1.8827781677246094, "logits/rejected": -1.8290605545043945, "logps/chosen": -256.6871032714844, "logps/rejected": -256.0238952636719, "loss": 0.3562, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1103084087371826, "rewards/margins": 1.9665857553482056, "rewards/rejected": -3.0768942832946777, "step": 600 }, { "epoch": 0.69, "eval_logits/chosen": -1.9736415147781372, "eval_logits/rejected": -1.9216868877410889, "eval_logps/chosen": -327.7752685546875, "eval_logps/rejected": -382.5696105957031, "eval_loss": 0.2513369023799896, "eval_rewards/accuracies": 0.87109375, "eval_rewards/chosen": 0.386812299489975, "eval_rewards/margins": 2.8599982261657715, "eval_rewards/rejected": -2.4731857776641846, "eval_runtime": 97.9427, "eval_samples_per_second": 20.42, "eval_steps_per_second": 0.327, "step": 600 }, { "epoch": 0.7, "grad_norm": 91.70310566548932, "learning_rate": 1.2297343017146726e-07, "logits/chosen": -1.8489186763763428, "logits/rejected": -1.8288800716400146, "logps/chosen": -259.6686096191406, "logps/rejected": -267.1070861816406, "loss": 0.3693, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.0914928913116455, "rewards/margins": 1.885284662246704, "rewards/rejected": -2.9767773151397705, "step": 610 }, { "epoch": 0.71, "grad_norm": 83.02991328479807, "learning_rate": 1.1441707378923474e-07, "logits/chosen": -1.950595498085022, "logits/rejected": -1.8974205255508423, "logps/chosen": -237.6828155517578, "logps/rejected": -257.06005859375, "loss": 0.3625, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.4481216371059418, "rewards/margins": 2.152703046798706, "rewards/rejected": -2.600825071334839, "step": 620 }, { "epoch": 0.73, "grad_norm": 97.69736464926112, "learning_rate": 1.06080070680377e-07, "logits/chosen": -1.874447226524353, "logits/rejected": -1.8658256530761719, "logps/chosen": -269.42034912109375, "logps/rejected": -275.75946044921875, "loss": 0.3679, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.6408880949020386, "rewards/margins": 2.110172986984253, "rewards/rejected": -2.751060962677002, "step": 630 }, { "epoch": 0.74, "grad_norm": 89.55100518815718, "learning_rate": 9.797590889219587e-08, "logits/chosen": -1.927781343460083, "logits/rejected": -1.915203332901001, "logps/chosen": -262.8731384277344, "logps/rejected": -265.70404052734375, "loss": 0.3725, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.5387133359909058, "rewards/margins": 2.1157500743865967, "rewards/rejected": -2.654463529586792, "step": 640 }, { "epoch": 0.75, "grad_norm": 88.47283263403602, "learning_rate": 9.011769976891367e-08, "logits/chosen": -1.9129142761230469, "logits/rejected": -1.9037758111953735, "logps/chosen": -253.7337188720703, "logps/rejected": -273.1281433105469, "loss": 0.3654, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8051580190658569, "rewards/margins": 1.8512630462646484, "rewards/rejected": -2.656421184539795, "step": 650 }, { "epoch": 0.76, "grad_norm": 101.63563998101947, "learning_rate": 8.251815673944218e-08, "logits/chosen": -1.8455785512924194, "logits/rejected": -1.862540602684021, "logps/chosen": -269.90655517578125, "logps/rejected": -266.5269470214844, "loss": 0.3636, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.0431994199752808, "rewards/margins": 1.969745397567749, "rewards/rejected": -3.0129449367523193, "step": 660 }, { "epoch": 0.77, "grad_norm": 101.36570058059344, "learning_rate": 7.518957474892148e-08, "logits/chosen": -1.8844772577285767, "logits/rejected": -1.8293778896331787, "logps/chosen": -262.8396301269531, "logps/rejected": -270.7688293457031, "loss": 0.3566, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0009379386901855, "rewards/margins": 2.288020133972168, "rewards/rejected": -3.2889580726623535, "step": 670 }, { "epoch": 0.78, "grad_norm": 108.29800746794625, "learning_rate": 6.814381036730274e-08, "logits/chosen": -1.9113209247589111, "logits/rejected": -1.8669430017471313, "logps/chosen": -248.9401092529297, "logps/rejected": -263.89459228515625, "loss": 0.3789, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.7384175062179565, "rewards/margins": 2.038583993911743, "rewards/rejected": -2.77700138092041, "step": 680 }, { "epoch": 0.79, "grad_norm": 81.30832233061167, "learning_rate": 6.139226260715872e-08, "logits/chosen": -1.936092734336853, "logits/rejected": -1.950042724609375, "logps/chosen": -261.84283447265625, "logps/rejected": -281.1523132324219, "loss": 0.3622, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8801490068435669, "rewards/margins": 2.052794933319092, "rewards/rejected": -2.932943820953369, "step": 690 }, { "epoch": 0.81, "grad_norm": 81.87543968348014, "learning_rate": 5.4945854481754734e-08, "logits/chosen": -1.8324077129364014, "logits/rejected": -1.7888282537460327, "logps/chosen": -245.5817413330078, "logps/rejected": -259.8585510253906, "loss": 0.3624, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9253555536270142, "rewards/margins": 2.031710624694824, "rewards/rejected": -2.957066059112549, "step": 700 }, { "epoch": 0.81, "eval_logits/chosen": -1.9716989994049072, "eval_logits/rejected": -1.930106282234192, "eval_logps/chosen": -322.60308837890625, "eval_logps/rejected": -380.21783447265625, "eval_loss": 0.21937939524650574, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": 0.6454216241836548, "eval_rewards/margins": 3.001021385192871, "eval_rewards/rejected": -2.3555996417999268, "eval_runtime": 98.0586, "eval_samples_per_second": 20.396, "eval_steps_per_second": 0.326, "step": 700 }, { "epoch": 0.82, "grad_norm": 104.29775989655523, "learning_rate": 4.881501533321605e-08, "logits/chosen": -1.7864516973495483, "logits/rejected": -1.7958072423934937, "logps/chosen": -228.2520294189453, "logps/rejected": -255.38577270507812, "loss": 0.3399, "rewards/accuracies": 0.8125, "rewards/chosen": -0.952102780342102, "rewards/margins": 2.169015884399414, "rewards/rejected": -3.1211180686950684, "step": 710 }, { "epoch": 0.83, "grad_norm": 87.51056493700892, "learning_rate": 4.300966395938377e-08, "logits/chosen": -1.91278076171875, "logits/rejected": -1.887738823890686, "logps/chosen": -269.040283203125, "logps/rejected": -277.3271484375, "loss": 0.3737, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.9400026202201843, "rewards/margins": 2.1387667655944824, "rewards/rejected": -3.0787696838378906, "step": 720 }, { "epoch": 0.84, "grad_norm": 82.85641235575972, "learning_rate": 3.7539192566655246e-08, "logits/chosen": -1.8706934452056885, "logits/rejected": -1.8632476329803467, "logps/chosen": -258.32257080078125, "logps/rejected": -262.11151123046875, "loss": 0.3615, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.5498217344284058, "rewards/margins": 1.930101752281189, "rewards/rejected": -2.479923725128174, "step": 730 }, { "epoch": 0.85, "grad_norm": 86.13162915966026, "learning_rate": 3.24124515747731e-08, "logits/chosen": -1.827125906944275, "logits/rejected": -1.8333660364151, "logps/chosen": -245.1622314453125, "logps/rejected": -269.59857177734375, "loss": 0.376, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0357568264007568, "rewards/margins": 1.9602575302124023, "rewards/rejected": -2.996014356613159, "step": 740 }, { "epoch": 0.86, "grad_norm": 86.92913330390785, "learning_rate": 2.763773529814506e-08, "logits/chosen": -1.940159559249878, "logits/rejected": -1.9158337116241455, "logps/chosen": -282.51397705078125, "logps/rejected": -276.21331787109375, "loss": 0.372, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7603832483291626, "rewards/margins": 2.119266986846924, "rewards/rejected": -2.879650354385376, "step": 750 }, { "epoch": 0.88, "grad_norm": 95.56287400260709, "learning_rate": 2.3222768526860698e-08, "logits/chosen": -1.863567590713501, "logits/rejected": -1.788558006286621, "logps/chosen": -249.87838745117188, "logps/rejected": -264.7607727050781, "loss": 0.3676, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6970678567886353, "rewards/margins": 2.2210423946380615, "rewards/rejected": -2.9181103706359863, "step": 760 }, { "epoch": 0.89, "grad_norm": 82.6081133840389, "learning_rate": 1.9174694029115146e-08, "logits/chosen": -1.9369175434112549, "logits/rejected": -1.9106756448745728, "logps/chosen": -287.55975341796875, "logps/rejected": -268.37493896484375, "loss": 0.3621, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7068324685096741, "rewards/margins": 1.9774510860443115, "rewards/rejected": -2.684283494949341, "step": 770 }, { "epoch": 0.9, "grad_norm": 84.83905388303022, "learning_rate": 1.5500060995258134e-08, "logits/chosen": -1.8579909801483154, "logits/rejected": -1.8068830966949463, "logps/chosen": -258.0039367675781, "logps/rejected": -259.26202392578125, "loss": 0.3456, "rewards/accuracies": 0.875, "rewards/chosen": -0.8524407148361206, "rewards/margins": 2.122145414352417, "rewards/rejected": -2.974586248397827, "step": 780 }, { "epoch": 0.91, "grad_norm": 94.72717106858794, "learning_rate": 1.2204814442165812e-08, "logits/chosen": -1.8840267658233643, "logits/rejected": -1.8291581869125366, "logps/chosen": -252.4437713623047, "logps/rejected": -252.0703582763672, "loss": 0.3741, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8425480127334595, "rewards/margins": 2.2477831840515137, "rewards/rejected": -3.0903308391571045, "step": 790 }, { "epoch": 0.92, "grad_norm": 102.52909854244074, "learning_rate": 9.294285595075669e-09, "logits/chosen": -1.9619266986846924, "logits/rejected": -1.904088020324707, "logps/chosen": -277.6653747558594, "logps/rejected": -272.3161315917969, "loss": 0.4069, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8278266787528992, "rewards/margins": 2.0403804779052734, "rewards/rejected": -2.868206739425659, "step": 800 }, { "epoch": 0.92, "eval_logits/chosen": -1.9591288566589355, "eval_logits/rejected": -1.9204463958740234, "eval_logps/chosen": -322.0539245605469, "eval_logps/rejected": -380.2658386230469, "eval_loss": 0.20271854102611542, "eval_rewards/accuracies": 0.9140625, "eval_rewards/chosen": 0.6728801131248474, "eval_rewards/margins": 3.0308780670166016, "eval_rewards/rejected": -2.3579981327056885, "eval_runtime": 97.802, "eval_samples_per_second": 20.449, "eval_steps_per_second": 0.327, "step": 800 }, { "epoch": 0.93, "grad_norm": 85.0418535767566, "learning_rate": 6.773183262446914e-09, "logits/chosen": -1.8587850332260132, "logits/rejected": -1.7795337438583374, "logps/chosen": -248.3004913330078, "logps/rejected": -261.2222595214844, "loss": 0.3824, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7576299905776978, "rewards/margins": 2.073674440383911, "rewards/rejected": -2.8313040733337402, "step": 810 }, { "epoch": 0.94, "grad_norm": 88.3275931540908, "learning_rate": 4.645586217799452e-09, "logits/chosen": -1.8911056518554688, "logits/rejected": -1.940203309059143, "logps/chosen": -265.9781799316406, "logps/rejected": -290.08770751953125, "loss": 0.3902, "rewards/accuracies": 0.8125, "rewards/chosen": -0.839026927947998, "rewards/margins": 2.1166014671325684, "rewards/rejected": -2.9556286334991455, "step": 820 }, { "epoch": 0.96, "grad_norm": 108.33767167817186, "learning_rate": 2.9149366008568987e-09, "logits/chosen": -1.8930469751358032, "logits/rejected": -1.9079952239990234, "logps/chosen": -263.65264892578125, "logps/rejected": -278.3803405761719, "loss": 0.3825, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.6759049296379089, "rewards/margins": 2.146210193634033, "rewards/rejected": -2.822114944458008, "step": 830 }, { "epoch": 0.97, "grad_norm": 86.51713108380295, "learning_rate": 1.5840343486700215e-09, "logits/chosen": -1.9469448328018188, "logits/rejected": -1.939854383468628, "logps/chosen": -281.2010498046875, "logps/rejected": -275.5614318847656, "loss": 0.367, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.5235612392425537, "rewards/margins": 2.1837284564971924, "rewards/rejected": -2.707289457321167, "step": 840 }, { "epoch": 0.98, "grad_norm": 82.94960696998083, "learning_rate": 6.550326657293881e-10, "logits/chosen": -1.9341312646865845, "logits/rejected": -1.901523232460022, "logps/chosen": -257.5312805175781, "logps/rejected": -268.84619140625, "loss": 0.346, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.7536182999610901, "rewards/margins": 2.499413251876831, "rewards/rejected": -3.2530312538146973, "step": 850 }, { "epoch": 0.99, "grad_norm": 100.59142277641878, "learning_rate": 1.2943454039654467e-10, "logits/chosen": -1.821735143661499, "logits/rejected": -1.835100531578064, "logps/chosen": -244.0499267578125, "logps/rejected": -261.69805908203125, "loss": 0.3624, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0147624015808105, "rewards/margins": 1.7580705881118774, "rewards/rejected": -2.7728328704833984, "step": 860 }, { "epoch": 1.0, "step": 868, "total_flos": 0.0, "train_loss": 0.40559157427005504, "train_runtime": 13777.3263, "train_samples_per_second": 8.066, "train_steps_per_second": 0.063 } ], "logging_steps": 10, "max_steps": 868, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }