diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -3,20 +3,20 @@ "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, - "global_step": 478, + "global_step": 1346, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "grad_norm": 579.7744748341399, - "learning_rate": 1.0416666666666666e-08, - "logits/chosen": -2.408252239227295, - "logits/rejected": -2.408294677734375, - "logps/chosen": -208.4792022705078, - "logps/rejected": -178.0951690673828, - "loss": 0.69, + "grad_norm": 19.997434679340365, + "learning_rate": 3.7037037037037036e-09, + "logits/chosen": -1.5453667640686035, + "logits/rejected": -1.4501094818115234, + "logps/chosen": -154.90496826171875, + "logps/rejected": -163.46749877929688, + "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -24,786 +24,2235 @@ "step": 1 }, { - "epoch": 0.02, - "grad_norm": 535.1486260819535, - "learning_rate": 1.0416666666666667e-07, - "logits/chosen": -2.544614553451538, - "logits/rejected": -2.538756847381592, - "logps/chosen": -261.5119323730469, - "logps/rejected": -166.40280151367188, - "loss": 0.7009, - "rewards/accuracies": 0.4652777910232544, - "rewards/chosen": 0.008291814476251602, - "rewards/margins": 0.011442840099334717, - "rewards/rejected": -0.003151026088744402, + "epoch": 0.01, + "grad_norm": 23.34805628576002, + "learning_rate": 3.7037037037037036e-08, + "logits/chosen": -1.726180076599121, + "logits/rejected": -1.6590251922607422, + "logps/chosen": -212.55889892578125, + "logps/rejected": -207.66229248046875, + "loss": 0.693, + "rewards/accuracies": 0.4236111044883728, + "rewards/chosen": -0.0009580536279827356, + "rewards/margins": -0.0008072062046267092, + "rewards/rejected": -0.00015084730694070458, "step": 10 }, { - "epoch": 0.04, - "grad_norm": 384.83945719674614, - "learning_rate": 2.0833333333333333e-07, - "logits/chosen": -2.5195038318634033, - "logits/rejected": -2.5265002250671387, - "logps/chosen": -252.65768432617188, - "logps/rejected": -178.087158203125, - "loss": 0.5876, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.10371126234531403, - "rewards/margins": 0.296107679605484, - "rewards/rejected": -0.1923964023590088, + "epoch": 0.01, + "grad_norm": 25.078221894482837, + "learning_rate": 7.407407407407407e-08, + "logits/chosen": -1.7819910049438477, + "logits/rejected": -1.7246497869491577, + "logps/chosen": -215.92410278320312, + "logps/rejected": -204.28173828125, + "loss": 0.6928, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0006128093227744102, + "rewards/margins": 0.0009098866721615195, + "rewards/rejected": -0.00029707717476412654, "step": 20 }, { - "epoch": 0.06, - "grad_norm": 202.13874393663576, - "learning_rate": 3.1249999999999997e-07, - "logits/chosen": -2.4858596324920654, - "logits/rejected": -2.47269606590271, - "logps/chosen": -240.64242553710938, - "logps/rejected": -181.0841522216797, - "loss": 0.3862, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.3985957205295563, - "rewards/margins": 1.4220483303070068, - "rewards/rejected": -1.023452639579773, + "epoch": 0.02, + "grad_norm": 22.11850294931131, + "learning_rate": 1.111111111111111e-07, + "logits/chosen": -1.7216441631317139, + "logits/rejected": -1.6432338953018188, + "logps/chosen": -225.88699340820312, + "logps/rejected": -185.396240234375, + "loss": 0.6914, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.004545359406620264, + "rewards/margins": 0.0039664083160459995, + "rewards/rejected": 0.0005789512651972473, "step": 30 }, { - "epoch": 0.08, - "grad_norm": 142.19955934070376, - "learning_rate": 4.1666666666666667e-07, - "logits/chosen": -2.4594264030456543, - "logits/rejected": -2.4308407306671143, - "logps/chosen": -283.8061828613281, - "logps/rejected": -208.5924835205078, - "loss": 0.3332, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": 1.9889726638793945, - "rewards/margins": 3.029081344604492, - "rewards/rejected": -1.0401084423065186, + "epoch": 0.03, + "grad_norm": 21.112061167479588, + "learning_rate": 1.4814814814814815e-07, + "logits/chosen": -1.8239158391952515, + "logits/rejected": -1.7619836330413818, + "logps/chosen": -256.3050231933594, + "logps/rejected": -245.38198852539062, + "loss": 0.6877, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.019687145948410034, + "rewards/margins": 0.009989428333938122, + "rewards/rejected": 0.009697715751826763, "step": 40 }, { - "epoch": 0.1, - "grad_norm": 713.6549875009309, - "learning_rate": 4.999733114418725e-07, - "logits/chosen": -2.442108631134033, - "logits/rejected": -2.46040678024292, - "logps/chosen": -246.4988555908203, - "logps/rejected": -187.14138793945312, - "loss": 0.4454, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": 2.2771620750427246, - "rewards/margins": 4.51132345199585, - "rewards/rejected": -2.234161853790283, + "epoch": 0.04, + "grad_norm": 20.90083454488397, + "learning_rate": 1.8518518518518516e-07, + "logits/chosen": -1.7489999532699585, + "logits/rejected": -1.7239185571670532, + "logps/chosen": -217.8907928466797, + "logps/rejected": -211.642333984375, + "loss": 0.68, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.03657875210046768, + "rewards/margins": 0.0253153033554554, + "rewards/rejected": 0.011263446882367134, "step": 50 }, { - "epoch": 0.13, - "grad_norm": 206.28301377298783, - "learning_rate": 4.990398100856366e-07, - "logits/chosen": -2.4876925945281982, - "logits/rejected": -2.449016571044922, - "logps/chosen": -271.1671142578125, - "logps/rejected": -196.48915100097656, - "loss": 0.282, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 1.714887261390686, - "rewards/margins": 5.19891881942749, - "rewards/rejected": -3.4840316772460938, + "epoch": 0.04, + "grad_norm": 21.77197780821128, + "learning_rate": 2.222222222222222e-07, + "logits/chosen": -1.8024189472198486, + "logits/rejected": -1.6662009954452515, + "logps/chosen": -271.22235107421875, + "logps/rejected": -233.28823852539062, + "loss": 0.6713, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.06959772109985352, + "rewards/margins": 0.04836159199476242, + "rewards/rejected": 0.021236125379800797, "step": 60 }, { - "epoch": 0.15, - "grad_norm": 133.99852927659347, - "learning_rate": 4.967775735898179e-07, - "logits/chosen": -2.4287192821502686, - "logits/rejected": -2.4621310234069824, - "logps/chosen": -274.3385925292969, - "logps/rejected": -198.64663696289062, - "loss": 0.3118, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": 3.02872371673584, - "rewards/margins": 6.160218238830566, - "rewards/rejected": -3.1314942836761475, + "epoch": 0.05, + "grad_norm": 21.554118928262394, + "learning_rate": 2.5925925925925923e-07, + "logits/chosen": -1.7849029302597046, + "logits/rejected": -1.718541145324707, + "logps/chosen": -224.2913055419922, + "logps/rejected": -203.49624633789062, + "loss": 0.6608, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.05916479229927063, + "rewards/margins": 0.06906653940677643, + "rewards/rejected": -0.009901740588247776, "step": 70 }, { - "epoch": 0.17, - "grad_norm": 274.89332519675594, - "learning_rate": 4.931986719649298e-07, - "logits/chosen": -2.42030668258667, - "logits/rejected": -2.423424243927002, - "logps/chosen": -263.48272705078125, - "logps/rejected": -186.29991149902344, - "loss": 0.3017, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": 3.6749210357666016, - "rewards/margins": 6.233966827392578, - "rewards/rejected": -2.559046506881714, + "epoch": 0.06, + "grad_norm": 20.717805914821145, + "learning_rate": 2.962962962962963e-07, + "logits/chosen": -1.866686463356018, + "logits/rejected": -1.7345689535140991, + "logps/chosen": -252.84555053710938, + "logps/rejected": -228.15359497070312, + "loss": 0.6487, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.05975146219134331, + "rewards/margins": 0.1380428969860077, + "rewards/rejected": -0.07829144597053528, "step": 80 }, { - "epoch": 0.19, - "grad_norm": 202.07196994453352, - "learning_rate": 4.883222001996351e-07, - "logits/chosen": -2.4479732513427734, - "logits/rejected": -2.4088387489318848, - "logps/chosen": -267.5174255371094, - "logps/rejected": -193.828125, - "loss": 0.3603, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": 3.906665802001953, - "rewards/margins": 7.1938323974609375, - "rewards/rejected": -3.2871665954589844, + "epoch": 0.07, + "grad_norm": 24.15752327353423, + "learning_rate": 3.333333333333333e-07, + "logits/chosen": -1.7628377676010132, + "logits/rejected": -1.696528673171997, + "logps/chosen": -256.96954345703125, + "logps/rejected": -260.56292724609375, + "loss": 0.6244, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.009766384959220886, + "rewards/margins": 0.20868687331676483, + "rewards/rejected": -0.19892050325870514, "step": 90 }, { - "epoch": 0.21, - "grad_norm": 260.799637557766, - "learning_rate": 4.821741763807186e-07, - "logits/chosen": -2.4610652923583984, - "logits/rejected": -2.454207181930542, - "logps/chosen": -231.1355743408203, - "logps/rejected": -183.55123901367188, - "loss": 0.3315, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 2.3220086097717285, - "rewards/margins": 5.911337852478027, - "rewards/rejected": -3.589329242706299, + "epoch": 0.07, + "grad_norm": 22.043263558957296, + "learning_rate": 3.703703703703703e-07, + "logits/chosen": -1.758519172668457, + "logits/rejected": -1.6800458431243896, + "logps/chosen": -239.4019775390625, + "logps/rejected": -226.4539794921875, + "loss": 0.6142, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.1279800534248352, + "rewards/margins": 0.24427437782287598, + "rewards/rejected": -0.3722544014453888, "step": 100 }, { - "epoch": 0.21, - "eval_logits/chosen": -2.5300614833831787, - "eval_logits/rejected": -2.515779495239258, - "eval_logps/chosen": -250.55865478515625, - "eval_logps/rejected": -187.6100616455078, - "eval_loss": 0.297485888004303, - "eval_rewards/accuracies": 0.90234375, - "eval_rewards/chosen": 3.437309980392456, - "eval_rewards/margins": 7.265252113342285, - "eval_rewards/rejected": -3.82794189453125, - "eval_runtime": 97.5532, - "eval_samples_per_second": 20.502, - "eval_steps_per_second": 0.328, + "epoch": 0.07, + "eval_logits/chosen": -1.7870845794677734, + "eval_logits/rejected": -1.752089500427246, + "eval_logps/chosen": -302.7545166015625, + "eval_logps/rejected": -330.3116455078125, + "eval_loss": 0.6372222304344177, + "eval_rewards/accuracies": 0.69921875, + "eval_rewards/chosen": -0.21120953559875488, + "eval_rewards/margins": 0.2142910659313202, + "eval_rewards/rejected": -0.42550066113471985, + "eval_runtime": 98.1168, + "eval_samples_per_second": 20.384, + "eval_steps_per_second": 0.326, "step": 100 }, { - "epoch": 0.23, - "grad_norm": 164.61060726491735, - "learning_rate": 4.747874028753375e-07, - "logits/chosen": -2.36721134185791, - "logits/rejected": -2.398892879486084, - "logps/chosen": -238.1865692138672, - "logps/rejected": -193.205810546875, - "loss": 0.3602, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": 2.924015522003174, - "rewards/margins": 6.160517692565918, - "rewards/rejected": -3.2365028858184814, + "epoch": 0.08, + "grad_norm": 27.76371677719842, + "learning_rate": 4.0740740740740737e-07, + "logits/chosen": -1.7010570764541626, + "logits/rejected": -1.6076711416244507, + "logps/chosen": -263.64080810546875, + "logps/rejected": -244.4661407470703, + "loss": 0.5944, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.26566341519355774, + "rewards/margins": 0.27579429745674133, + "rewards/rejected": -0.5414577722549438, "step": 110 }, { - "epoch": 0.25, - "grad_norm": 394.19842542709944, - "learning_rate": 4.662012913161997e-07, - "logits/chosen": -2.4796204566955566, - "logits/rejected": -2.462428331375122, - "logps/chosen": -237.8124237060547, - "logps/rejected": -191.6023712158203, - "loss": 0.2323, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": 3.661634922027588, - "rewards/margins": 6.902056694030762, - "rewards/rejected": -3.240421772003174, + "epoch": 0.09, + "grad_norm": 31.731787834651325, + "learning_rate": 4.444444444444444e-07, + "logits/chosen": -1.7044484615325928, + "logits/rejected": -1.6412681341171265, + "logps/chosen": -269.258544921875, + "logps/rejected": -282.24713134765625, + "loss": 0.5696, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.39111006259918213, + "rewards/margins": 0.36286839842796326, + "rewards/rejected": -0.7539784908294678, "step": 120 }, { - "epoch": 0.27, - "grad_norm": 409.3218341794219, - "learning_rate": 4.5646165232345103e-07, - "logits/chosen": -2.4416961669921875, - "logits/rejected": -2.473330497741699, - "logps/chosen": -259.2460632324219, - "logps/rejected": -185.64683532714844, - "loss": 0.3435, - "rewards/accuracies": 0.90625, - "rewards/chosen": 4.6567606925964355, - "rewards/margins": 7.721456050872803, - "rewards/rejected": -3.064694881439209, + "epoch": 0.1, + "grad_norm": 37.69053452768137, + "learning_rate": 4.814814814814814e-07, + "logits/chosen": -1.860668420791626, + "logits/rejected": -1.8003613948822021, + "logps/chosen": -286.83123779296875, + "logps/rejected": -299.9700927734375, + "loss": 0.553, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.3989219665527344, + "rewards/margins": 0.4949173927307129, + "rewards/rejected": -0.8938392400741577, "step": 130 }, { - "epoch": 0.29, - "grad_norm": 328.4389039905761, - "learning_rate": 4.456204510851956e-07, - "logits/chosen": -2.326636791229248, - "logits/rejected": -2.3331127166748047, - "logps/chosen": -261.88006591796875, - "logps/rejected": -197.84036254882812, - "loss": 0.2942, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 3.5752804279327393, - "rewards/margins": 6.8619065284729, - "rewards/rejected": -3.2866263389587402, + "epoch": 0.1, + "grad_norm": 45.43263624776604, + "learning_rate": 4.999789692194508e-07, + "logits/chosen": -1.7893892526626587, + "logits/rejected": -1.7304092645645142, + "logps/chosen": -324.6953430175781, + "logps/rejected": -354.299072265625, + "loss": 0.5254, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7778688073158264, + "rewards/margins": 0.5480495095252991, + "rewards/rejected": -1.325918436050415, "step": 140 }, { - "epoch": 0.31, - "grad_norm": 664.6717326413732, - "learning_rate": 4.337355301007335e-07, - "logits/chosen": -2.4599616527557373, - "logits/rejected": -2.4934587478637695, - "logps/chosen": -239.1993865966797, - "logps/rejected": -179.16249084472656, - "loss": 0.4201, - "rewards/accuracies": 0.875, - "rewards/chosen": 3.849210262298584, - "rewards/margins": 7.315499782562256, - "rewards/rejected": -3.46628999710083, + "epoch": 0.11, + "grad_norm": 38.76316746995152, + "learning_rate": 4.998107442045616e-07, + "logits/chosen": -1.7821582555770874, + "logits/rejected": -1.7230908870697021, + "logps/chosen": -304.8173828125, + "logps/rejected": -349.8572082519531, + "loss": 0.5181, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.702239990234375, + "rewards/margins": 0.6602993011474609, + "rewards/rejected": -1.362539291381836, "step": 150 }, { - "epoch": 0.33, - "grad_norm": 389.31781863733255, - "learning_rate": 4.2087030056579986e-07, - "logits/chosen": -2.4196219444274902, - "logits/rejected": -2.3700897693634033, - "logps/chosen": -260.79736328125, - "logps/rejected": -198.62042236328125, - "loss": 0.4699, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": 2.786912441253662, - "rewards/margins": 6.555578708648682, - "rewards/rejected": -3.768665313720703, + "epoch": 0.12, + "grad_norm": 38.48791118895226, + "learning_rate": 4.994744073829293e-07, + "logits/chosen": -1.9425595998764038, + "logits/rejected": -1.8510286808013916, + "logps/chosen": -315.58038330078125, + "logps/rejected": -354.3575744628906, + "loss": 0.5104, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.928961455821991, + "rewards/margins": 0.6893821954727173, + "rewards/rejected": -1.618343710899353, "step": 160 }, { - "epoch": 0.36, - "grad_norm": 450.98983026990686, - "learning_rate": 4.070934040463998e-07, - "logits/chosen": -2.5323286056518555, - "logits/rejected": -2.534219741821289, - "logps/chosen": -251.9270782470703, - "logps/rejected": -188.5828094482422, - "loss": 0.3003, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": 2.2955946922302246, - "rewards/margins": 8.251626968383789, - "rewards/rejected": -5.956032752990723, + "epoch": 0.13, + "grad_norm": 34.98469211427676, + "learning_rate": 4.989701850946613e-07, + "logits/chosen": -2.1142868995666504, + "logits/rejected": -2.0348961353302, + "logps/chosen": -336.83929443359375, + "logps/rejected": -374.04046630859375, + "loss": 0.5009, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7460821866989136, + "rewards/margins": 0.756350040435791, + "rewards/rejected": -1.5024322271347046, "step": 170 }, { - "epoch": 0.38, - "grad_norm": 222.8025881040315, - "learning_rate": 3.9247834624635404e-07, - "logits/chosen": -2.4943268299102783, - "logits/rejected": -2.4797425270080566, - "logps/chosen": -261.6493835449219, - "logps/rejected": -186.13827514648438, - "loss": 0.3893, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": 3.1764323711395264, - "rewards/margins": 8.824117660522461, - "rewards/rejected": -5.6476850509643555, + "epoch": 0.13, + "grad_norm": 45.707896850254954, + "learning_rate": 4.982984166595104e-07, + "logits/chosen": -2.1524124145507812, + "logits/rejected": -2.127004861831665, + "logps/chosen": -349.15081787109375, + "logps/rejected": -417.07452392578125, + "loss": 0.4877, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.093063473701477, + "rewards/margins": 0.8320043683052063, + "rewards/rejected": -1.9250679016113281, "step": 180 }, { - "epoch": 0.4, - "grad_norm": 170.38903025294675, - "learning_rate": 3.7710310482256523e-07, - "logits/chosen": -2.4573581218719482, - "logits/rejected": -2.4471302032470703, - "logps/chosen": -242.3362274169922, - "logps/rejected": -206.1191864013672, - "loss": 0.2873, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": 2.354017496109009, - "rewards/margins": 8.386995315551758, - "rewards/rejected": -6.0329766273498535, + "epoch": 0.14, + "grad_norm": 41.90753016315753, + "learning_rate": 4.974595541485259e-07, + "logits/chosen": -2.118150234222412, + "logits/rejected": -2.0182414054870605, + "logps/chosen": -376.4796447753906, + "logps/rejected": -435.715087890625, + "loss": 0.4957, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2654850482940674, + "rewards/margins": 0.9563212394714355, + "rewards/rejected": -2.221806049346924, "step": 190 }, { - "epoch": 0.42, - "grad_norm": 233.638489796621, - "learning_rate": 3.610497133404795e-07, - "logits/chosen": -2.4837393760681152, - "logits/rejected": -2.4798505306243896, - "logps/chosen": -252.32040405273438, - "logps/rejected": -173.71713256835938, - "loss": 0.2909, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 3.7421042919158936, - "rewards/margins": 8.136415481567383, - "rewards/rejected": -4.394310474395752, + "epoch": 0.15, + "grad_norm": 43.92066412227129, + "learning_rate": 4.964541620798307e-07, + "logits/chosen": -2.0093464851379395, + "logits/rejected": -1.843632698059082, + "logps/chosen": -358.4123840332031, + "logps/rejected": -408.1987609863281, + "loss": 0.4726, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.1928468942642212, + "rewards/margins": 0.9090213775634766, + "rewards/rejected": -2.101868152618408, "step": 200 }, { - "epoch": 0.42, - "eval_logits/chosen": -2.5459139347076416, - "eval_logits/rejected": -2.5309653282165527, - "eval_logps/chosen": -247.70968627929688, - "eval_logps/rejected": -188.19419860839844, - "eval_loss": 0.2753676474094391, - "eval_rewards/accuracies": 0.91796875, - "eval_rewards/chosen": 4.861801624298096, - "eval_rewards/margins": 8.981813430786133, - "eval_rewards/rejected": -4.120011806488037, - "eval_runtime": 97.2888, - "eval_samples_per_second": 20.557, - "eval_steps_per_second": 0.329, + "epoch": 0.15, + "eval_logits/chosen": -2.047096014022827, + "eval_logits/rejected": -2.0018393993377686, + "eval_logps/chosen": -416.041015625, + "eval_logps/rejected": -498.22076416015625, + "eval_loss": 0.5515660047531128, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -1.3440749645233154, + "eval_rewards/margins": 0.7605167627334595, + "eval_rewards/rejected": -2.1045916080474854, + "eval_runtime": 98.1193, + "eval_samples_per_second": 20.383, + "eval_steps_per_second": 0.326, "step": 200 }, { - "epoch": 0.44, - "grad_norm": 337.30781066112155, - "learning_rate": 3.4440382358952115e-07, - "logits/chosen": -2.4054043292999268, - "logits/rejected": -2.334240436553955, - "logps/chosen": -229.5664825439453, - "logps/rejected": -177.37989807128906, - "loss": 0.3982, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 3.1289913654327393, - "rewards/margins": 6.229460716247559, - "rewards/rejected": -3.100468873977661, + "epoch": 0.16, + "grad_norm": 38.95795396663457, + "learning_rate": 4.952829170387241e-07, + "logits/chosen": -1.9518171548843384, + "logits/rejected": -1.8534870147705078, + "logps/chosen": -342.5758361816406, + "logps/rejected": -430.5548400878906, + "loss": 0.4601, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.3162614107131958, + "rewards/margins": 1.062984824180603, + "rewards/rejected": -2.3792459964752197, "step": 210 }, { - "epoch": 0.46, - "grad_norm": 196.23824528381684, - "learning_rate": 3.272542485937368e-07, - "logits/chosen": -2.4668684005737305, - "logits/rejected": -2.4853501319885254, - "logps/chosen": -247.2740478515625, - "logps/rejected": -182.8220977783203, - "loss": 0.564, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 4.549849510192871, - "rewards/margins": 9.18285083770752, - "rewards/rejected": -4.633000373840332, + "epoch": 0.16, + "grad_norm": 43.63837784340095, + "learning_rate": 4.939466072223697e-07, + "logits/chosen": -1.9940170049667358, + "logits/rejected": -1.8776836395263672, + "logps/chosen": -409.5476989746094, + "logps/rejected": -429.2942810058594, + "loss": 0.4812, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.562101125717163, + "rewards/margins": 0.7047168612480164, + "rewards/rejected": -2.266817808151245, "step": 220 }, { - "epoch": 0.48, - "grad_norm": 188.59878410714853, - "learning_rate": 3.096924887558854e-07, - "logits/chosen": -2.4467105865478516, - "logits/rejected": -2.394397735595703, - "logps/chosen": -269.22100830078125, - "logps/rejected": -207.54287719726562, - "loss": 0.61, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 3.9587960243225098, - "rewards/margins": 9.096400260925293, - "rewards/rejected": -5.137604713439941, + "epoch": 0.17, + "grad_norm": 48.17484694517937, + "learning_rate": 4.924461319093725e-07, + "logits/chosen": -1.911233901977539, + "logits/rejected": -1.8419215679168701, + "logps/chosen": -366.72210693359375, + "logps/rejected": -425.0821838378906, + "loss": 0.4424, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3060904741287231, + "rewards/margins": 0.9145557284355164, + "rewards/rejected": -2.220646381378174, "step": 230 }, { - "epoch": 0.5, - "grad_norm": 248.56418253631352, - "learning_rate": 2.9181224366319943e-07, - "logits/chosen": -2.438028335571289, - "logits/rejected": -2.4430088996887207, - "logps/chosen": -228.5868377685547, - "logps/rejected": -184.39010620117188, - "loss": 0.2929, - "rewards/accuracies": 0.90625, - "rewards/chosen": 2.860379934310913, - "rewards/margins": 8.014602661132812, - "rewards/rejected": -5.15422248840332, + "epoch": 0.18, + "grad_norm": 42.0731451177421, + "learning_rate": 4.907825008546038e-07, + "logits/chosen": -1.7983890771865845, + "logits/rejected": -1.6776669025421143, + "logps/chosen": -404.84942626953125, + "logps/rejected": -461.9938049316406, + "loss": 0.452, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4010937213897705, + "rewards/margins": 0.9703173637390137, + "rewards/rejected": -2.371410846710205, "step": 240 }, { - "epoch": 0.52, - "grad_norm": 236.10656708079867, - "learning_rate": 2.7370891215954565e-07, - "logits/chosen": -2.468085527420044, - "logits/rejected": -2.471389055252075, - "logps/chosen": -245.74221801757812, - "logps/rejected": -189.75096130371094, - "loss": 0.2793, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": 4.629996299743652, - "rewards/margins": 8.119891166687012, - "rewards/rejected": -3.489894151687622, + "epoch": 0.19, + "grad_norm": 45.788391518578905, + "learning_rate": 4.889568336096795e-07, + "logits/chosen": -1.7662121057510376, + "logits/rejected": -1.6878684759140015, + "logps/chosen": -368.25567626953125, + "logps/rejected": -454.2179260253906, + "loss": 0.4514, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.3891403675079346, + "rewards/margins": 1.1145490407943726, + "rewards/rejected": -2.5036895275115967, "step": 250 }, { - "epoch": 0.54, - "grad_norm": 536.510359732396, - "learning_rate": 2.55479083351317e-07, - "logits/chosen": -2.4861550331115723, - "logits/rejected": -2.4899330139160156, - "logps/chosen": -254.806396484375, - "logps/rejected": -172.7798614501953, - "loss": 0.343, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 4.790152549743652, - "rewards/margins": 9.49361515045166, - "rewards/rejected": -4.703463077545166, + "epoch": 0.19, + "grad_norm": 53.99920452912666, + "learning_rate": 4.869703587695508e-07, + "logits/chosen": -1.7349084615707397, + "logits/rejected": -1.6830803155899048, + "logps/chosen": -421.4908752441406, + "logps/rejected": -481.27752685546875, + "loss": 0.4519, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3783433437347412, + "rewards/margins": 1.064690351486206, + "rewards/rejected": -2.4430336952209473, "step": 260 }, + { + "epoch": 0.2, + "grad_norm": 48.505757522556536, + "learning_rate": 4.848244131457127e-07, + "logits/chosen": -1.601601243019104, + "logits/rejected": -1.4739550352096558, + "logps/chosen": -386.08428955078125, + "logps/rejected": -498.78558349609375, + "loss": 0.4367, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4528039693832397, + "rewards/margins": 1.1855452060699463, + "rewards/rejected": -2.6383490562438965, + "step": 270 + }, + { + "epoch": 0.21, + "grad_norm": 49.31169957835279, + "learning_rate": 4.825204408665877e-07, + "logits/chosen": -1.697139024734497, + "logits/rejected": -1.5427892208099365, + "logps/chosen": -399.53729248046875, + "logps/rejected": -492.3568420410156, + "loss": 0.4205, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.5937035083770752, + "rewards/margins": 1.3382676839828491, + "rewards/rejected": -2.931971311569214, + "step": 280 + }, + { + "epoch": 0.22, + "grad_norm": 38.828455566499606, + "learning_rate": 4.800599924056907e-07, + "logits/chosen": -1.7314732074737549, + "logits/rejected": -1.5896819829940796, + "logps/chosen": -407.6255798339844, + "logps/rejected": -502.23016357421875, + "loss": 0.4189, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.5207576751708984, + "rewards/margins": 1.350722074508667, + "rewards/rejected": -2.8714797496795654, + "step": 290 + }, + { + "epoch": 0.22, + "grad_norm": 44.43194576384378, + "learning_rate": 4.774447235382259e-07, + "logits/chosen": -1.7066457271575928, + "logits/rejected": -1.5852010250091553, + "logps/chosen": -425.8912658691406, + "logps/rejected": -526.4098510742188, + "loss": 0.4421, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5712147951126099, + "rewards/margins": 1.0973128080368042, + "rewards/rejected": -2.668527841567993, + "step": 300 + }, + { + "epoch": 0.22, + "eval_logits/chosen": -1.8325203657150269, + "eval_logits/rejected": -1.7522265911102295, + "eval_logps/chosen": -396.3378601074219, + "eval_logps/rejected": -492.3901062011719, + "eval_loss": 0.5335086584091187, + "eval_rewards/accuracies": 0.75390625, + "eval_rewards/chosen": -1.147042989730835, + "eval_rewards/margins": 0.8992425799369812, + "eval_rewards/rejected": -2.046285629272461, + "eval_runtime": 98.1593, + "eval_samples_per_second": 20.375, + "eval_steps_per_second": 0.326, + "step": 300 + }, + { + "epoch": 0.23, + "grad_norm": 44.68676000662293, + "learning_rate": 4.7467639422682426e-07, + "logits/chosen": -1.5684497356414795, + "logits/rejected": -1.5880094766616821, + "logps/chosen": -429.06072998046875, + "logps/rejected": -549.975830078125, + "loss": 0.431, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6676822900772095, + "rewards/margins": 1.3817517757415771, + "rewards/rejected": -3.049434185028076, + "step": 310 + }, + { + "epoch": 0.24, + "grad_norm": 49.129673207510045, + "learning_rate": 4.7175686743716223e-07, + "logits/chosen": -1.6045185327529907, + "logits/rejected": -1.5781729221343994, + "logps/chosen": -407.8392639160156, + "logps/rejected": -560.4733276367188, + "loss": 0.4164, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.750781774520874, + "rewards/margins": 1.4461250305175781, + "rewards/rejected": -3.196906566619873, + "step": 320 + }, + { + "epoch": 0.25, + "grad_norm": 44.13714858626996, + "learning_rate": 4.686881078842688e-07, + "logits/chosen": -1.65378737449646, + "logits/rejected": -1.5229465961456299, + "logps/chosen": -369.09075927734375, + "logps/rejected": -465.15411376953125, + "loss": 0.4162, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5404636859893799, + "rewards/margins": 1.1851208209991455, + "rewards/rejected": -2.7255845069885254, + "step": 330 + }, + { + "epoch": 0.25, + "grad_norm": 67.75141126041814, + "learning_rate": 4.654721807103558e-07, + "logits/chosen": -1.668287992477417, + "logits/rejected": -1.4913742542266846, + "logps/chosen": -451.5079040527344, + "logps/rejected": -548.6111450195312, + "loss": 0.4124, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.996059775352478, + "rewards/margins": 1.3178468942642212, + "rewards/rejected": -3.3139069080352783, + "step": 340 + }, + { + "epoch": 0.26, + "grad_norm": 51.63470007658604, + "learning_rate": 4.621112500950678e-07, + "logits/chosen": -1.4794889688491821, + "logits/rejected": -1.3635644912719727, + "logps/chosen": -392.33319091796875, + "logps/rejected": -509.9889221191406, + "loss": 0.3917, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6687583923339844, + "rewards/margins": 1.3096448183059692, + "rewards/rejected": -2.9784035682678223, + "step": 350 + }, + { + "epoch": 0.27, + "grad_norm": 49.136281330589576, + "learning_rate": 4.5860757779908225e-07, + "logits/chosen": -1.3447411060333252, + "logits/rejected": -1.1584227085113525, + "logps/chosen": -392.1809997558594, + "logps/rejected": -517.8204345703125, + "loss": 0.4184, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.6987504959106445, + "rewards/margins": 1.4527620077133179, + "rewards/rejected": -3.151512384414673, + "step": 360 + }, + { + "epoch": 0.27, + "grad_norm": 54.14810763202607, + "learning_rate": 4.5496352164204304e-07, + "logits/chosen": -1.0756736993789673, + "logits/rejected": -0.9471368789672852, + "logps/chosen": -381.1976013183594, + "logps/rejected": -491.38787841796875, + "loss": 0.4015, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6106783151626587, + "rewards/margins": 1.3218671083450317, + "rewards/rejected": -2.9325456619262695, + "step": 370 + }, + { + "epoch": 0.28, + "grad_norm": 61.80294176300012, + "learning_rate": 4.5118153391584966e-07, + "logits/chosen": -0.9351271390914917, + "logits/rejected": -0.686697244644165, + "logps/chosen": -431.38116455078125, + "logps/rejected": -523.0051879882812, + "loss": 0.3966, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.933549165725708, + "rewards/margins": 1.2298634052276611, + "rewards/rejected": -3.1634128093719482, + "step": 380 + }, + { + "epoch": 0.29, + "grad_norm": 49.95605603054853, + "learning_rate": 4.472641597343713e-07, + "logits/chosen": -0.7354714274406433, + "logits/rejected": -0.6010663509368896, + "logps/chosen": -441.1346740722656, + "logps/rejected": -589.41064453125, + "loss": 0.4152, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.0311450958251953, + "rewards/margins": 1.5603392124176025, + "rewards/rejected": -3.5914840698242188, + "step": 390 + }, + { + "epoch": 0.3, + "grad_norm": 51.04567852493441, + "learning_rate": 4.4321403532069523e-07, + "logits/chosen": -0.6377015113830566, + "logits/rejected": -0.3965984880924225, + "logps/chosen": -387.339111328125, + "logps/rejected": -515.1150512695312, + "loss": 0.3828, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7281858921051025, + "rewards/margins": 1.4771182537078857, + "rewards/rejected": -3.205303907394409, + "step": 400 + }, + { + "epoch": 0.3, + "eval_logits/chosen": -1.1203975677490234, + "eval_logits/rejected": -0.9529741406440735, + "eval_logps/chosen": -448.1488037109375, + "eval_logps/rejected": -563.927978515625, + "eval_loss": 0.523780882358551, + "eval_rewards/accuracies": 0.76953125, + "eval_rewards/chosen": -1.6651523113250732, + "eval_rewards/margins": 1.096511721611023, + "eval_rewards/rejected": -2.7616641521453857, + "eval_runtime": 98.0838, + "eval_samples_per_second": 20.391, + "eval_steps_per_second": 0.326, + "step": 400 + }, + { + "epoch": 0.3, + "grad_norm": 48.086291234203195, + "learning_rate": 4.390338862330631e-07, + "logits/chosen": -0.7085340619087219, + "logits/rejected": -0.5028501749038696, + "logps/chosen": -433.546142578125, + "logps/rejected": -573.782958984375, + "loss": 0.3738, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.8388134241104126, + "rewards/margins": 1.763929009437561, + "rewards/rejected": -3.6027424335479736, + "step": 410 + }, + { + "epoch": 0.31, + "grad_norm": 64.03484534800386, + "learning_rate": 4.3472652553068835e-07, + "logits/chosen": -0.47518905997276306, + "logits/rejected": -0.23270010948181152, + "logps/chosen": -484.5774841308594, + "logps/rejected": -611.3958740234375, + "loss": 0.3739, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.291532039642334, + "rewards/margins": 1.4785693883895874, + "rewards/rejected": -3.770102024078369, + "step": 420 + }, + { + "epoch": 0.32, + "grad_norm": 52.3682956987419, + "learning_rate": 4.3029485188068895e-07, + "logits/chosen": -0.6198094487190247, + "logits/rejected": -0.28696033358573914, + "logps/chosen": -403.52703857421875, + "logps/rejected": -581.9299926757812, + "loss": 0.3654, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8076034784317017, + "rewards/margins": 1.710561990737915, + "rewards/rejected": -3.5181660652160645, + "step": 430 + }, + { + "epoch": 0.33, + "grad_norm": 52.85823705565832, + "learning_rate": 4.257418476074103e-07, + "logits/chosen": -0.6958103179931641, + "logits/rejected": -0.40623918175697327, + "logps/chosen": -442.66473388671875, + "logps/rejected": -562.0119018554688, + "loss": 0.3841, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.8445909023284912, + "rewards/margins": 1.4965617656707764, + "rewards/rejected": -3.3411526679992676, + "step": 440 + }, + { + "epoch": 0.33, + "grad_norm": 58.890892905451516, + "learning_rate": 4.210705766854504e-07, + "logits/chosen": -0.8186171650886536, + "logits/rejected": -0.4163144528865814, + "logps/chosen": -412.9081115722656, + "logps/rejected": -538.0160522460938, + "loss": 0.3763, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5352731943130493, + "rewards/margins": 1.580110788345337, + "rewards/rejected": -3.1153836250305176, + "step": 450 + }, + { + "epoch": 0.34, + "grad_norm": 54.82348692428555, + "learning_rate": 4.16284182677737e-07, + "logits/chosen": -0.5104752779006958, + "logits/rejected": -0.022202759981155396, + "logps/chosen": -504.6559143066406, + "logps/rejected": -638.7310180664062, + "loss": 0.3747, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.382429599761963, + "rewards/margins": 1.8315938711166382, + "rewards/rejected": -4.214023590087891, + "step": 460 + }, + { + "epoch": 0.35, + "grad_norm": 54.979690542314074, + "learning_rate": 4.113858866200466e-07, + "logits/chosen": -0.7018298506736755, + "logits/rejected": -0.30557408928871155, + "logps/chosen": -490.76947021484375, + "logps/rejected": -648.2581787109375, + "loss": 0.3667, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.0751655101776123, + "rewards/margins": 1.8448684215545654, + "rewards/rejected": -3.9200336933135986, + "step": 470 + }, + { + "epoch": 0.36, + "grad_norm": 52.384860047470646, + "learning_rate": 4.063789848533865e-07, + "logits/chosen": -0.5765314698219299, + "logits/rejected": -0.1146412268280983, + "logps/chosen": -431.3514709472656, + "logps/rejected": -595.962646484375, + "loss": 0.364, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.9231284856796265, + "rewards/margins": 1.8943427801132202, + "rewards/rejected": -3.8174710273742676, + "step": 480 + }, + { + "epoch": 0.36, + "grad_norm": 48.24790366086697, + "learning_rate": 4.0126684680570074e-07, + "logits/chosen": -0.8232128024101257, + "logits/rejected": -0.3200731873512268, + "logps/chosen": -487.1802673339844, + "logps/rejected": -620.1703491210938, + "loss": 0.3632, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.8669496774673462, + "rewards/margins": 1.7825498580932617, + "rewards/rejected": -3.6494994163513184, + "step": 490 + }, + { + "epoch": 0.37, + "grad_norm": 53.47941403954316, + "learning_rate": 3.960529127243902e-07, + "logits/chosen": -0.6839994192123413, + "logits/rejected": -0.3794109523296356, + "logps/chosen": -454.06494140625, + "logps/rejected": -560.31396484375, + "loss": 0.3576, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.671570062637329, + "rewards/margins": 1.5296937227249146, + "rewards/rejected": -3.201263904571533, + "step": 500 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -1.1202046871185303, + "eval_logits/rejected": -0.8921781182289124, + "eval_logps/chosen": -444.0173034667969, + "eval_logps/rejected": -560.5327758789062, + "eval_loss": 0.5183658599853516, + "eval_rewards/accuracies": 0.76953125, + "eval_rewards/chosen": -1.6238377094268799, + "eval_rewards/margins": 1.103874921798706, + "eval_rewards/rejected": -2.727712631225586, + "eval_runtime": 98.0942, + "eval_samples_per_second": 20.389, + "eval_steps_per_second": 0.326, + "step": 500 + }, + { + "epoch": 0.38, + "grad_norm": 47.67862520127771, + "learning_rate": 3.9074069136117594e-07, + "logits/chosen": -0.6645992994308472, + "logits/rejected": -0.23552604019641876, + "logps/chosen": -437.04168701171875, + "logps/rejected": -573.43017578125, + "loss": 0.3616, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.357367753982544, + "rewards/margins": 1.7008861303329468, + "rewards/rejected": -4.058254241943359, + "step": 510 + }, + { + "epoch": 0.39, + "grad_norm": 55.20432992821677, + "learning_rate": 3.8533375761086094e-07, + "logits/chosen": -0.6538324356079102, + "logits/rejected": -0.28878337144851685, + "logps/chosen": -424.3060607910156, + "logps/rejected": -554.125732421875, + "loss": 0.3651, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6924982070922852, + "rewards/margins": 1.4187054634094238, + "rewards/rejected": -3.111203670501709, + "step": 520 + }, + { + "epoch": 0.39, + "grad_norm": 48.99836031219826, + "learning_rate": 3.79835750105581e-07, + "logits/chosen": -0.5227429270744324, + "logits/rejected": -0.26728391647338867, + "logps/chosen": -419.548095703125, + "logps/rejected": -586.7620849609375, + "loss": 0.3642, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.778322458267212, + "rewards/margins": 1.765242338180542, + "rewards/rejected": -3.543564558029175, + "step": 530 + }, + { + "epoch": 0.4, + "grad_norm": 58.08597953496209, + "learning_rate": 3.742503687661627e-07, + "logits/chosen": -0.20574072003364563, + "logits/rejected": 0.2035825252532959, + "logps/chosen": -465.63763427734375, + "logps/rejected": -606.3990478515625, + "loss": 0.3525, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.341270923614502, + "rewards/margins": 1.657379388809204, + "rewards/rejected": -3.998650312423706, + "step": 540 + }, + { + "epoch": 0.41, + "grad_norm": 45.74418223365167, + "learning_rate": 3.685813723122372e-07, + "logits/chosen": -0.06602563709020615, + "logits/rejected": 0.41205328702926636, + "logps/chosen": -458.8877868652344, + "logps/rejected": -588.2848510742188, + "loss": 0.374, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.2084052562713623, + "rewards/margins": 1.5406155586242676, + "rewards/rejected": -3.74902081489563, + "step": 550 + }, + { + "epoch": 0.42, + "grad_norm": 49.47898611306175, + "learning_rate": 3.6283257573278466e-07, + "logits/chosen": -0.08754973113536835, + "logits/rejected": 0.23806925117969513, + "logps/chosen": -423.28985595703125, + "logps/rejected": -584.5783081054688, + "loss": 0.3585, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.8439204692840576, + "rewards/margins": 1.726400375366211, + "rewards/rejected": -3.5703208446502686, + "step": 560 + }, + { + "epoch": 0.42, + "grad_norm": 59.10150187997885, + "learning_rate": 3.5700784771881224e-07, + "logits/chosen": 0.15894003212451935, + "logits/rejected": 0.4934872090816498, + "logps/chosen": -443.24053955078125, + "logps/rejected": -601.0978393554688, + "loss": 0.3442, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.163815975189209, + "rewards/margins": 1.6688892841339111, + "rewards/rejected": -3.83270525932312, + "step": 570 + }, + { + "epoch": 0.43, + "grad_norm": 54.44170466261096, + "learning_rate": 3.511111080598925e-07, + "logits/chosen": 0.004490786697715521, + "logits/rejected": 0.34507110714912415, + "logps/chosen": -441.2269592285156, + "logps/rejected": -602.7559814453125, + "loss": 0.3492, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.914200782775879, + "rewards/margins": 1.9196977615356445, + "rewards/rejected": -3.8338985443115234, + "step": 580 + }, + { + "epoch": 0.44, + "grad_norm": 53.89900166712047, + "learning_rate": 3.451463250063146e-07, + "logits/chosen": -0.27837514877319336, + "logits/rejected": 0.13625089824199677, + "logps/chosen": -448.6805725097656, + "logps/rejected": -645.89013671875, + "loss": 0.324, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.93107008934021, + "rewards/margins": 2.10721755027771, + "rewards/rejected": -4.038287162780762, + "step": 590 + }, + { + "epoch": 0.45, + "grad_norm": 51.7074718432086, + "learning_rate": 3.3911751259862403e-07, + "logits/chosen": -0.041231829673051834, + "logits/rejected": 0.34056779742240906, + "logps/chosen": -443.47412109375, + "logps/rejected": -629.924560546875, + "loss": 0.3328, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.894299864768982, + "rewards/margins": 2.1391186714172363, + "rewards/rejected": -4.033418655395508, + "step": 600 + }, + { + "epoch": 0.45, + "eval_logits/chosen": -0.06941252946853638, + "eval_logits/rejected": 0.24228118360042572, + "eval_logps/chosen": -493.6551513671875, + "eval_logps/rejected": -628.6859130859375, + "eval_loss": 0.5151087641716003, + "eval_rewards/accuracies": 0.765625, + "eval_rewards/chosen": -2.1202151775360107, + "eval_rewards/margins": 1.2890279293060303, + "eval_rewards/rejected": -3.409243106842041, + "eval_runtime": 98.1083, + "eval_samples_per_second": 20.386, + "eval_steps_per_second": 0.326, + "step": 600 + }, + { + "epoch": 0.45, + "grad_norm": 55.66397888256945, + "learning_rate": 3.3302872796634754e-07, + "logits/chosen": 0.49535948038101196, + "logits/rejected": 1.0743920803070068, + "logps/chosen": -468.02978515625, + "logps/rejected": -642.6104736328125, + "loss": 0.3096, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.297302722930908, + "rewards/margins": 2.0772812366485596, + "rewards/rejected": -4.374583721160889, + "step": 610 + }, + { + "epoch": 0.46, + "grad_norm": 51.7383697331199, + "learning_rate": 3.2688406859772035e-07, + "logits/chosen": 0.2597588300704956, + "logits/rejected": 0.673038125038147, + "logps/chosen": -444.8384704589844, + "logps/rejected": -659.6806030273438, + "loss": 0.3346, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2546586990356445, + "rewards/margins": 2.226531744003296, + "rewards/rejected": -4.4811906814575195, + "step": 620 + }, + { + "epoch": 0.47, + "grad_norm": 54.19919430150042, + "learning_rate": 3.206876695822541e-07, + "logits/chosen": -0.12093131244182587, + "logits/rejected": 0.4730301797389984, + "logps/chosen": -452.8343811035156, + "logps/rejected": -602.112060546875, + "loss": 0.3051, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.8909651041030884, + "rewards/margins": 1.9853994846343994, + "rewards/rejected": -3.876364231109619, + "step": 630 + }, + { + "epoch": 0.48, + "grad_norm": 50.389338383353866, + "learning_rate": 3.144437008280012e-07, + "logits/chosen": -0.09394478052854538, + "logits/rejected": 0.33072829246520996, + "logps/chosen": -455.43731689453125, + "logps/rejected": -609.87744140625, + "loss": 0.3263, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.1607372760772705, + "rewards/margins": 1.885525107383728, + "rewards/rejected": -4.046262264251709, + "step": 640 + }, + { + "epoch": 0.48, + "grad_norm": 61.479087062458596, + "learning_rate": 3.0815636425538665e-07, + "logits/chosen": 0.20981228351593018, + "logits/rejected": 0.55927574634552, + "logps/chosen": -474.67803955078125, + "logps/rejected": -664.0802001953125, + "loss": 0.3204, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3460288047790527, + "rewards/margins": 2.071723222732544, + "rewards/rejected": -4.417751789093018, + "step": 650 + }, + { + "epoch": 0.49, + "grad_norm": 69.05844503540243, + "learning_rate": 3.018298909694986e-07, + "logits/chosen": 0.25689777731895447, + "logits/rejected": 0.7616270184516907, + "logps/chosen": -470.198486328125, + "logps/rejected": -662.539306640625, + "loss": 0.3289, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.4795608520507812, + "rewards/margins": 1.899802565574646, + "rewards/rejected": -4.379363059997559, + "step": 660 + }, + { + "epoch": 0.5, + "grad_norm": 45.7911918343519, + "learning_rate": 2.954685384127371e-07, + "logits/chosen": 0.062225330621004105, + "logits/rejected": 0.41303902864456177, + "logps/chosen": -428.16387939453125, + "logps/rejected": -597.1376953125, + "loss": 0.306, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.1429684162139893, + "rewards/margins": 1.8161710500717163, + "rewards/rejected": -3.959139347076416, + "step": 670 + }, + { + "epoch": 0.51, + "grad_norm": 57.36445697043156, + "learning_rate": 2.8907658749974054e-07, + "logits/chosen": -0.09550069272518158, + "logits/rejected": 0.35600176453590393, + "logps/chosen": -455.721923828125, + "logps/rejected": -670.6884765625, + "loss": 0.2969, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.200000762939453, + "rewards/margins": 2.278864622116089, + "rewards/rejected": -4.478865146636963, + "step": 680 + }, + { + "epoch": 0.51, + "grad_norm": 57.63859566762506, + "learning_rate": 2.8265833973651503e-07, + "logits/chosen": 0.4362770915031433, + "logits/rejected": 1.019630789756775, + "logps/chosen": -481.36944580078125, + "logps/rejected": -682.4991455078125, + "loss": 0.3037, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.728414297103882, + "rewards/margins": 2.1998586654663086, + "rewards/rejected": -4.928272724151611, + "step": 690 + }, + { + "epoch": 0.52, + "grad_norm": 38.04349040823844, + "learning_rate": 2.7621811432570736e-07, + "logits/chosen": 0.0020432949531823397, + "logits/rejected": 0.678579568862915, + "logps/chosen": -452.239990234375, + "logps/rejected": -615.9015502929688, + "loss": 0.3131, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9651463031768799, + "rewards/margins": 2.032357931137085, + "rewards/rejected": -3.997504472732544, + "step": 700 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -0.1655656397342682, + "eval_logits/rejected": 0.172866553068161, + "eval_logps/chosen": -451.9695739746094, + "eval_logps/rejected": -578.1397705078125, + "eval_loss": 0.5152586698532104, + "eval_rewards/accuracies": 0.765625, + "eval_rewards/chosen": -1.7033603191375732, + "eval_rewards/margins": 1.2004213333129883, + "eval_rewards/rejected": -2.9037814140319824, + "eval_runtime": 98.1096, + "eval_samples_per_second": 20.385, + "eval_steps_per_second": 0.326, + "step": 700 + }, + { + "epoch": 0.53, + "grad_norm": 60.9670305167988, + "learning_rate": 2.6976024525996917e-07, + "logits/chosen": 0.702204167842865, + "logits/rejected": 0.9482040405273438, + "logps/chosen": -400.1419372558594, + "logps/rejected": -616.5157470703125, + "loss": 0.2933, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.058786630630493, + "rewards/margins": 2.1224417686462402, + "rewards/rejected": -4.1812286376953125, + "step": 710 + }, + { + "epoch": 0.53, + "grad_norm": 57.54299069526284, + "learning_rate": 2.6328907840536706e-07, + "logits/chosen": 0.6100508570671082, + "logits/rejected": 0.9714565277099609, + "logps/chosen": -446.7682189941406, + "logps/rejected": -678.0450439453125, + "loss": 0.312, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.1031315326690674, + "rewards/margins": 2.523500680923462, + "rewards/rejected": -4.626631736755371, + "step": 720 + }, + { + "epoch": 0.54, + "grad_norm": 55.010003089665325, + "learning_rate": 2.568089685768038e-07, + "logits/chosen": 0.33207422494888306, + "logits/rejected": 0.8213400840759277, + "logps/chosen": -466.47967529296875, + "logps/rejected": -616.5966796875, + "loss": 0.3037, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.145937442779541, + "rewards/margins": 1.8470547199249268, + "rewards/rejected": -3.992992401123047, + "step": 730 + }, + { + "epoch": 0.55, + "grad_norm": 56.034615355832116, + "learning_rate": 2.503242766074156e-07, + "logits/chosen": 1.0638476610183716, + "logits/rejected": 1.7732141017913818, + "logps/chosen": -467.27789306640625, + "logps/rejected": -673.653564453125, + "loss": 0.3135, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.2686350345611572, + "rewards/margins": 2.4400157928466797, + "rewards/rejected": -4.708651542663574, + "step": 740 + }, + { + "epoch": 0.56, + "grad_norm": 55.33213405677105, + "learning_rate": 2.4383936641392136e-07, + "logits/chosen": 0.8964494466781616, + "logits/rejected": 1.4589967727661133, + "logps/chosen": -475.98529052734375, + "logps/rejected": -655.1957397460938, + "loss": 0.3094, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.501713514328003, + "rewards/margins": 2.241748094558716, + "rewards/rejected": -4.743461608886719, + "step": 750 + }, { "epoch": 0.56, - "grad_norm": 203.6794572887938, - "learning_rate": 2.3722002126275822e-07, - "logits/chosen": -2.456101179122925, - "logits/rejected": -2.407947063446045, - "logps/chosen": -236.4749755859375, - "logps/rejected": -187.09112548828125, - "loss": 0.2939, + "grad_norm": 56.701790611883894, + "learning_rate": 2.3735860205989493e-07, + "logits/chosen": 0.8617841005325317, + "logits/rejected": 1.3072969913482666, + "logps/chosen": -474.41583251953125, + "logps/rejected": -685.8687744140625, + "loss": 0.2749, "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": 3.9466958045959473, - "rewards/margins": 8.208832740783691, - "rewards/rejected": -4.262135982513428, - "step": 270 + "rewards/chosen": -2.408444881439209, + "rewards/margins": 2.277350425720215, + "rewards/rejected": -4.685795307159424, + "step": 760 + }, + { + "epoch": 0.57, + "grad_norm": 53.32774327458172, + "learning_rate": 2.308863448189402e-07, + "logits/chosen": 0.5902019739151001, + "logits/rejected": 0.8791207075119019, + "logps/chosen": -452.8211364746094, + "logps/rejected": -647.130126953125, + "loss": 0.3028, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.1281752586364746, + "rewards/margins": 2.103821277618408, + "rewards/rejected": -4.231996536254883, + "step": 770 + }, + { + "epoch": 0.58, + "grad_norm": 66.84990640273234, + "learning_rate": 2.2442695023974246e-07, + "logits/chosen": 0.7262977361679077, + "logits/rejected": 1.3522447347640991, + "logps/chosen": -426.29339599609375, + "logps/rejected": -622.6487426757812, + "loss": 0.289, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.120136022567749, + "rewards/margins": 2.228008985519409, + "rewards/rejected": -4.348145008087158, + "step": 780 }, { "epoch": 0.59, - "grad_norm": 394.1475095150393, - "learning_rate": 2.19029145890313e-07, - "logits/chosen": -2.343445301055908, - "logits/rejected": -2.3995354175567627, - "logps/chosen": -257.370361328125, - "logps/rejected": -201.5979461669922, - "loss": 0.3501, - "rewards/accuracies": 0.875, - "rewards/chosen": 3.717083692550659, - "rewards/margins": 8.61104965209961, - "rewards/rejected": -4.893965721130371, - "step": 280 + "grad_norm": 58.09209476264956, + "learning_rate": 2.179847652149729e-07, + "logits/chosen": 0.5433401465415955, + "logits/rejected": 1.0540757179260254, + "logps/chosen": -516.3043212890625, + "logps/rejected": -711.8958740234375, + "loss": 0.2875, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.7564001083374023, + "rewards/margins": 2.2523038387298584, + "rewards/rejected": -5.00870418548584, + "step": 790 + }, + { + "epoch": 0.59, + "grad_norm": 51.81489549587344, + "learning_rate": 2.115641250560183e-07, + "logits/chosen": 0.46662241220474243, + "logits/rejected": 0.9530113339424133, + "logps/chosen": -531.3187866210938, + "logps/rejected": -718.2420043945312, + "loss": 0.2547, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.6745731830596924, + "rewards/margins": 2.1868960857391357, + "rewards/rejected": -4.86146879196167, + "step": 800 + }, + { + "epoch": 0.59, + "eval_logits/chosen": 0.12700016796588898, + "eval_logits/rejected": 0.44755151867866516, + "eval_logps/chosen": -535.29150390625, + "eval_logps/rejected": -673.45654296875, + "eval_loss": 0.5256171822547913, + "eval_rewards/accuracies": 0.76171875, + "eval_rewards/chosen": -2.5365793704986572, + "eval_rewards/margins": 1.3203709125518799, + "eval_rewards/rejected": -3.856950521469116, + "eval_runtime": 98.0354, + "eval_samples_per_second": 20.401, + "eval_steps_per_second": 0.326, + "step": 800 + }, + { + "epoch": 0.6, + "grad_norm": 54.66675021265227, + "learning_rate": 2.051693505755042e-07, + "logits/chosen": 0.33729100227355957, + "logits/rejected": 0.9711275100708008, + "logps/chosen": -483.88861083984375, + "logps/rejected": -701.2562255859375, + "loss": 0.2491, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.316730260848999, + "rewards/margins": 2.6504831314086914, + "rewards/rejected": -4.9672136306762695, + "step": 810 }, { "epoch": 0.61, - "grad_norm": 206.90087071186883, - "learning_rate": 2.0100351342479216e-07, - "logits/chosen": -2.4418747425079346, - "logits/rejected": -2.428316593170166, - "logps/chosen": -234.2547607421875, - "logps/rejected": -182.3361053466797, - "loss": 0.3955, + "grad_norm": 57.83387891787277, + "learning_rate": 1.9880474517957542e-07, + "logits/chosen": 0.38913315534591675, + "logits/rejected": 0.8929936289787292, + "logps/chosen": -511.2947692871094, + "logps/rejected": -719.1864013671875, + "loss": 0.2861, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.5449912548065186, + "rewards/margins": 2.4298605918884277, + "rewards/rejected": -4.974852085113525, + "step": 820 + }, + { + "epoch": 0.62, + "grad_norm": 54.573496843465676, + "learning_rate": 1.9247459197189e-07, + "logits/chosen": 0.2409726083278656, + "logits/rejected": 0.4742881655693054, + "logps/chosen": -487.2377014160156, + "logps/rejected": -707.2930908203125, + "loss": 0.2709, "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": 3.4869391918182373, - "rewards/margins": 8.724508285522461, - "rewards/rejected": -5.2375688552856445, - "step": 290 + "rewards/chosen": -2.423832416534424, + "rewards/margins": 2.337522029876709, + "rewards/rejected": -4.761354446411133, + "step": 830 }, { - "epoch": 0.63, - "grad_norm": 232.39029956497873, - "learning_rate": 1.8323929841460178e-07, - "logits/chosen": -2.434248208999634, - "logits/rejected": -2.357980251312256, - "logps/chosen": -265.85308837890625, - "logps/rejected": -211.96859741210938, - "loss": 0.6445, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 3.5436806678771973, - "rewards/margins": 8.844223976135254, - "rewards/rejected": -5.300544738769531, - "step": 300 + "epoch": 0.62, + "grad_norm": 57.59285615682952, + "learning_rate": 1.8618315087127602e-07, + "logits/chosen": -0.03551248461008072, + "logits/rejected": 0.5289381146430969, + "logps/chosen": -485.76348876953125, + "logps/rejected": -648.2017822265625, + "loss": 0.2908, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2680952548980713, + "rewards/margins": 2.0456957817077637, + "rewards/rejected": -4.313790321350098, + "step": 840 }, { "epoch": 0.63, - "eval_logits/chosen": -2.5059330463409424, - "eval_logits/rejected": -2.491495132446289, - "eval_logps/chosen": -249.0003662109375, - "eval_logps/rejected": -190.69680786132812, - "eval_loss": 0.22447091341018677, - "eval_rewards/accuracies": 0.91015625, - "eval_rewards/chosen": 4.216455936431885, - "eval_rewards/margins": 9.587770462036133, - "eval_rewards/rejected": -5.371314525604248, - "eval_runtime": 97.2649, - "eval_samples_per_second": 20.562, - "eval_steps_per_second": 0.329, - "step": 300 + "grad_norm": 73.30688641890157, + "learning_rate": 1.7993465574499102e-07, + "logits/chosen": -0.13685956597328186, + "logits/rejected": 0.5006009340286255, + "logps/chosen": -509.0750427246094, + "logps/rejected": -739.2849731445312, + "loss": 0.2572, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.391108512878418, + "rewards/margins": 2.590216875076294, + "rewards/rejected": -4.981324672698975, + "step": 850 + }, + { + "epoch": 0.64, + "grad_norm": 60.15832089933168, + "learning_rate": 1.7373331155951233e-07, + "logits/chosen": 0.16482150554656982, + "logits/rejected": 0.5440123677253723, + "logps/chosen": -488.3111877441406, + "logps/rejected": -694.9370727539062, + "loss": 0.2517, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.627448558807373, + "rewards/margins": 2.2580535411834717, + "rewards/rejected": -4.885501861572266, + "step": 860 }, { "epoch": 0.65, - "grad_norm": 216.5700858394941, - "learning_rate": 1.6583128063291573e-07, - "logits/chosen": -2.4202470779418945, - "logits/rejected": -2.42097806930542, - "logps/chosen": -255.98904418945312, - "logps/rejected": -196.6630401611328, - "loss": 0.3789, + "grad_norm": 48.979507346317426, + "learning_rate": 1.6758329155077743e-07, + "logits/chosen": 0.2300490438938141, + "logits/rejected": 0.6597995758056641, + "logps/chosen": -471.4276428222656, + "logps/rejected": -696.9656372070312, + "loss": 0.2572, "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": 3.9936203956604004, - "rewards/margins": 8.884687423706055, - "rewards/rejected": -4.891066551208496, - "step": 310 + "rewards/chosen": -2.564178943634033, + "rewards/margins": 2.4548990726470947, + "rewards/rejected": -5.019078254699707, + "step": 870 + }, + { + "epoch": 0.65, + "grad_norm": 54.56547566856159, + "learning_rate": 1.6148873441577662e-07, + "logits/chosen": 0.0745859146118164, + "logits/rejected": 0.5892910361289978, + "logps/chosen": -490.7748107910156, + "logps/rejected": -704.1431884765625, + "loss": 0.2334, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.500243663787842, + "rewards/margins": 2.4989378452301025, + "rewards/rejected": -4.999181270599365, + "step": 880 + }, + { + "epoch": 0.66, + "grad_norm": 55.545059461777754, + "learning_rate": 1.5545374152738934e-07, + "logits/chosen": 0.016130054369568825, + "logits/rejected": 0.48376768827438354, + "logps/chosen": -458.7793884277344, + "logps/rejected": -710.6851196289062, + "loss": 0.25, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.172322988510132, + "rewards/margins": 2.6098039150238037, + "rewards/rejected": -4.782127380371094, + "step": 890 }, { "epoch": 0.67, - "grad_norm": 232.75719945601446, - "learning_rate": 1.488723393865766e-07, - "logits/chosen": -2.493706464767456, - "logits/rejected": -2.461465358734131, - "logps/chosen": -238.67599487304688, - "logps/rejected": -194.2203826904297, - "loss": 0.3515, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": 4.206465244293213, - "rewards/margins": 9.311426162719727, - "rewards/rejected": -5.104961395263672, - "step": 320 + "grad_norm": 58.8472459822313, + "learning_rate": 1.4948237417433775e-07, + "logits/chosen": -0.05456262826919556, + "logits/rejected": 0.4301723539829254, + "logps/chosen": -505.43719482421875, + "logps/rejected": -729.1197509765625, + "loss": 0.2764, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -2.475316286087036, + "rewards/margins": 2.53151535987854, + "rewards/rejected": -5.006831169128418, + "step": 900 + }, + { + "epoch": 0.67, + "eval_logits/chosen": -0.2431109994649887, + "eval_logits/rejected": 0.051950518041849136, + "eval_logps/chosen": -538.3812866210938, + "eval_logps/rejected": -682.334228515625, + "eval_loss": 0.5221121907234192, + "eval_rewards/accuracies": 0.77734375, + "eval_rewards/chosen": -2.567476749420166, + "eval_rewards/margins": 1.3782495260238647, + "eval_rewards/rejected": -3.9457266330718994, + "eval_runtime": 98.1143, + "eval_samples_per_second": 20.384, + "eval_steps_per_second": 0.326, + "step": 900 + }, + { + "epoch": 0.68, + "grad_norm": 76.04810070991684, + "learning_rate": 1.435786508281158e-07, + "logits/chosen": 0.3129107356071472, + "logits/rejected": 0.6085219383239746, + "logps/chosen": -462.0315856933594, + "logps/rejected": -686.1160888671875, + "loss": 0.2458, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.3812341690063477, + "rewards/margins": 2.453599452972412, + "rewards/rejected": -4.83483362197876, + "step": 910 + }, + { + "epoch": 0.68, + "grad_norm": 58.24143150255672, + "learning_rate": 1.3774654443873174e-07, + "logits/chosen": -0.17105606198310852, + "logits/rejected": 0.10635167360305786, + "logps/chosen": -476.74884033203125, + "logps/rejected": -704.75537109375, + "loss": 0.2662, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.2945311069488525, + "rewards/margins": 2.386672258377075, + "rewards/rejected": -4.681203365325928, + "step": 920 }, { "epoch": 0.69, - "grad_norm": 281.239048485679, - "learning_rate": 1.3245295796480788e-07, - "logits/chosen": -2.4040043354034424, - "logits/rejected": -2.4380762577056885, - "logps/chosen": -247.7274169921875, - "logps/rejected": -179.15911865234375, - "loss": 0.4043, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": 5.208529472351074, - "rewards/margins": 9.102964401245117, - "rewards/rejected": -3.8944339752197266, - "step": 330 + "grad_norm": 61.580815446047275, + "learning_rate": 1.31989979761085e-07, + "logits/chosen": 0.19714362919330597, + "logits/rejected": 0.6800551414489746, + "logps/chosen": -470.64697265625, + "logps/rejected": -731.5118408203125, + "loss": 0.2524, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.3182313442230225, + "rewards/margins": 2.8053975105285645, + "rewards/rejected": -5.123629093170166, + "step": 930 + }, + { + "epoch": 0.7, + "grad_norm": 66.23348721239962, + "learning_rate": 1.2631283071377618e-07, + "logits/chosen": 0.15574759244918823, + "logits/rejected": 0.7999376058578491, + "logps/chosen": -489.5287170410156, + "logps/rejected": -739.4246215820312, + "loss": 0.2535, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4655532836914062, + "rewards/margins": 2.7967965602874756, + "rewards/rejected": -5.2623491287231445, + "step": 940 }, { "epoch": 0.71, - "grad_norm": 183.00963279480595, - "learning_rate": 1.1666074087171627e-07, - "logits/chosen": -2.3740360736846924, - "logits/rejected": -2.4260330200195312, - "logps/chosen": -235.00357055664062, - "logps/rejected": -166.9297637939453, - "loss": 0.2751, - "rewards/accuracies": 0.8687499761581421, - "rewards/chosen": 4.350711822509766, - "rewards/margins": 7.54253625869751, - "rewards/rejected": -3.1918249130249023, - "step": 340 + "grad_norm": 55.370463362822846, + "learning_rate": 1.2071891777212744e-07, + "logits/chosen": 0.4210018217563629, + "logits/rejected": 1.0494170188903809, + "logps/chosen": -442.0162658691406, + "logps/rejected": -666.9036865234375, + "loss": 0.238, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.3151867389678955, + "rewards/margins": 2.4912314414978027, + "rewards/rejected": -4.806417942047119, + "step": 950 + }, + { + "epoch": 0.71, + "grad_norm": 54.37619099014209, + "learning_rate": 1.1521200539716874e-07, + "logits/chosen": 0.04333481192588806, + "logits/rejected": 0.6315183639526367, + "logps/chosen": -440.3460998535156, + "logps/rejected": -668.5031127929688, + "loss": 0.2538, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.369847297668457, + "rewards/margins": 2.58512282371521, + "rewards/rejected": -4.954970359802246, + "step": 960 + }, + { + "epoch": 0.72, + "grad_norm": 59.50016505068154, + "learning_rate": 1.0979579950231821e-07, + "logits/chosen": -0.0721534788608551, + "logits/rejected": 0.5839505195617676, + "logps/chosen": -496.072021484375, + "logps/rejected": -720.0166015625, + "loss": 0.2331, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.456573963165283, + "rewards/margins": 2.6809329986572266, + "rewards/rejected": -5.13750696182251, + "step": 970 }, { "epoch": 0.73, - "grad_norm": 258.2328231454923, - "learning_rate": 1.0157994641835734e-07, - "logits/chosen": -2.4078361988067627, - "logits/rejected": -2.4156928062438965, - "logps/chosen": -234.7695770263672, - "logps/rejected": -214.1270294189453, - "loss": 0.3005, + "grad_norm": 63.83570606232123, + "learning_rate": 1.0447394495946291e-07, + "logits/chosen": 0.007242382504045963, + "logits/rejected": 0.5322221517562866, + "logps/chosen": -520.4622192382812, + "logps/rejected": -716.791015625, + "loss": 0.2726, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6428182125091553, + "rewards/margins": 2.505241870880127, + "rewards/rejected": -5.148059844970703, + "step": 980 + }, + { + "epoch": 0.74, + "grad_norm": 76.60392151494439, + "learning_rate": 9.925002314611841e-08, + "logits/chosen": -0.2113700807094574, + "logits/rejected": 0.4387185573577881, + "logps/chosen": -535.4547119140625, + "logps/rejected": -757.9600830078125, + "loss": 0.2356, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.6833412647247314, + "rewards/margins": 2.5519299507141113, + "rewards/rejected": -5.235270977020264, + "step": 990 + }, + { + "epoch": 0.74, + "grad_norm": 55.208058181749024, + "learning_rate": 9.412754953531663e-08, + "logits/chosen": 0.000503048300743103, + "logits/rejected": 0.7104528546333313, + "logps/chosen": -494.1576232910156, + "logps/rejected": -719.8685913085938, + "loss": 0.2261, "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 4.440161228179932, - "rewards/margins": 9.19445514678955, - "rewards/rejected": -4.754293918609619, - "step": 350 + "rewards/chosen": -2.6024506092071533, + "rewards/margins": 2.5579113960266113, + "rewards/rejected": -5.160361289978027, + "step": 1000 + }, + { + "epoch": 0.74, + "eval_logits/chosen": -0.1104351058602333, + "eval_logits/rejected": 0.20232482254505157, + "eval_logps/chosen": -558.2006225585938, + "eval_logps/rejected": -712.748291015625, + "eval_loss": 0.5298039317131042, + "eval_rewards/accuracies": 0.76953125, + "eval_rewards/chosen": -2.7656705379486084, + "eval_rewards/margins": 1.4841958284378052, + "eval_rewards/rejected": -4.249866485595703, + "eval_runtime": 98.1214, + "eval_samples_per_second": 20.383, + "eval_steps_per_second": 0.326, + "step": 1000 }, { "epoch": 0.75, - "grad_norm": 260.2428729882813, - "learning_rate": 8.729103716819111e-08, - "logits/chosen": -2.372528076171875, - "logits/rejected": -2.3916471004486084, - "logps/chosen": -241.23934936523438, - "logps/rejected": -187.92953491210938, - "loss": 0.2667, - "rewards/accuracies": 0.875, - "rewards/chosen": 4.416121482849121, - "rewards/margins": 8.337173461914062, - "rewards/rejected": -3.921051502227783, - "step": 360 + "grad_norm": 57.330660352565694, + "learning_rate": 8.910997132984479e-08, + "logits/chosen": 0.19682852923870087, + "logits/rejected": 0.5228155255317688, + "logps/chosen": -529.1737060546875, + "logps/rejected": -791.169921875, + "loss": 0.2475, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.5559616088867188, + "rewards/margins": 2.631944179534912, + "rewards/rejected": -5.187905311584473, + "step": 1010 + }, + { + "epoch": 0.76, + "grad_norm": 55.457376945821615, + "learning_rate": 8.42006651424274e-08, + "logits/chosen": -0.11488042026758194, + "logits/rejected": 0.43977147340774536, + "logps/chosen": -529.4292602539062, + "logps/rejected": -779.668212890625, + "loss": 0.2315, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.6139321327209473, + "rewards/margins": 2.7150697708129883, + "rewards/rejected": -5.3290019035339355, + "step": 1020 }, { "epoch": 0.77, - "grad_norm": 316.09399625633, - "learning_rate": 7.387025063449081e-08, - "logits/chosen": -2.4834017753601074, - "logits/rejected": -2.428774118423462, - "logps/chosen": -223.4984893798828, - "logps/rejected": -176.99790954589844, - "loss": 0.3594, + "grad_norm": 57.42354594975122, + "learning_rate": 7.940293472341217e-08, + "logits/chosen": 0.25594162940979004, + "logits/rejected": 0.661308228969574, + "logps/chosen": -465.62347412109375, + "logps/rejected": -748.9591064453125, + "loss": 0.2417, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.5969550609588623, + "rewards/margins": 2.9456839561462402, + "rewards/rejected": -5.542638778686523, + "step": 1030 + }, + { + "epoch": 0.77, + "grad_norm": 44.59552618611705, + "learning_rate": 7.472000873748918e-08, + "logits/chosen": 0.21030166745185852, + "logits/rejected": 0.7939587831497192, + "logps/chosen": -488.8642578125, + "logps/rejected": -709.2621459960938, + "loss": 0.2323, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.7148165702819824, + "rewards/margins": 2.5755038261413574, + "rewards/rejected": -5.29032039642334, + "step": 1040 + }, + { + "epoch": 0.78, + "grad_norm": 64.60445708831573, + "learning_rate": 7.015503859093927e-08, + "logits/chosen": 0.2117297202348709, + "logits/rejected": 0.6777232885360718, + "logps/chosen": -483.72283935546875, + "logps/rejected": -733.4895629882812, + "loss": 0.2268, "rewards/accuracies": 0.90625, - "rewards/chosen": 3.544004440307617, - "rewards/margins": 8.096233367919922, - "rewards/rejected": -4.552228927612305, - "step": 370 + "rewards/chosen": -2.7278270721435547, + "rewards/margins": 2.6605074405670166, + "rewards/rejected": -5.38833475112915, + "step": 1050 }, { "epoch": 0.79, - "grad_norm": 229.93737644129538, - "learning_rate": 6.138919252022435e-08, - "logits/chosen": -2.4690134525299072, - "logits/rejected": -2.4799323081970215, - "logps/chosen": -249.5747528076172, - "logps/rejected": -198.73275756835938, - "loss": 0.3121, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 3.8754029273986816, - "rewards/margins": 9.315340042114258, - "rewards/rejected": -5.439937591552734, - "step": 380 + "grad_norm": 55.01169146748481, + "learning_rate": 6.571109631087451e-08, + "logits/chosen": 0.0924178883433342, + "logits/rejected": 0.8385933041572571, + "logps/chosen": -520.66064453125, + "logps/rejected": -789.1815185546875, + "loss": 0.2073, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -2.671236515045166, + "rewards/margins": 3.076190710067749, + "rewards/rejected": -5.747427463531494, + "step": 1060 + }, + { + "epoch": 0.79, + "grad_norm": 57.63704573643993, + "learning_rate": 6.139117247789687e-08, + "logits/chosen": 0.09710516035556793, + "logits/rejected": 0.5915436148643494, + "logps/chosen": -536.4342651367188, + "logps/rejected": -775.2525634765625, + "loss": 0.209, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.6840100288391113, + "rewards/margins": 2.6324105262756348, + "rewards/rejected": -5.316420555114746, + "step": 1070 + }, + { + "epoch": 0.8, + "grad_norm": 65.7023707481576, + "learning_rate": 5.719817421356685e-08, + "logits/chosen": -0.050966888666152954, + "logits/rejected": 0.3987392485141754, + "logps/chosen": -558.064453125, + "logps/rejected": -772.4810791015625, + "loss": 0.229, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.901360034942627, + "rewards/margins": 2.482283115386963, + "rewards/rejected": -5.383642673492432, + "step": 1080 + }, + { + "epoch": 0.81, + "grad_norm": 67.44035324151042, + "learning_rate": 5.313492322403701e-08, + "logits/chosen": 0.33903616666793823, + "logits/rejected": 0.7725549936294556, + "logps/chosen": -484.7267150878906, + "logps/rejected": -763.6196899414062, + "loss": 0.2302, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.789487838745117, + "rewards/margins": 2.8590240478515625, + "rewards/rejected": -5.6485114097595215, + "step": 1090 }, { "epoch": 0.82, - "grad_norm": 178.09073107787978, - "learning_rate": 4.991445467064689e-08, - "logits/chosen": -2.463972568511963, - "logits/rejected": -2.4395322799682617, - "logps/chosen": -239.1776123046875, - "logps/rejected": -189.71803283691406, - "loss": 0.222, - "rewards/accuracies": 0.9375, - "rewards/chosen": 3.557317018508911, - "rewards/margins": 8.469549179077148, - "rewards/rejected": -4.912230968475342, - "step": 390 + "grad_norm": 49.24699961134825, + "learning_rate": 4.9204153901165805e-08, + "logits/chosen": 0.14409589767456055, + "logits/rejected": 0.7108417749404907, + "logps/chosen": -508.53668212890625, + "logps/rejected": -749.4099731445312, + "loss": 0.2219, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.825862407684326, + "rewards/margins": 2.6767187118530273, + "rewards/rejected": -5.502581596374512, + "step": 1100 }, { - "epoch": 0.84, - "grad_norm": 211.56309254204024, - "learning_rate": 3.9507259776993954e-08, - "logits/chosen": -2.3866400718688965, - "logits/rejected": -2.4389476776123047, - "logps/chosen": -239.34860229492188, - "logps/rejected": -181.77676391601562, - "loss": 0.2653, - "rewards/accuracies": 0.8812500238418579, - "rewards/chosen": 3.949272632598877, - "rewards/margins": 8.648920059204102, - "rewards/rejected": -4.699648857116699, - "step": 400 + "epoch": 0.82, + "eval_logits/chosen": -0.006662141531705856, + "eval_logits/rejected": 0.307841420173645, + "eval_logps/chosen": -591.4903564453125, + "eval_logps/rejected": -754.2210693359375, + "eval_loss": 0.5380497574806213, + "eval_rewards/accuracies": 0.76953125, + "eval_rewards/chosen": -3.0985679626464844, + "eval_rewards/margins": 1.566027045249939, + "eval_rewards/rejected": -4.664595127105713, + "eval_runtime": 98.1364, + "eval_samples_per_second": 20.38, + "eval_steps_per_second": 0.326, + "step": 1100 + }, + { + "epoch": 0.82, + "grad_norm": 53.77081112472508, + "learning_rate": 4.540851148239036e-08, + "logits/chosen": 0.05285036563873291, + "logits/rejected": 0.5418864488601685, + "logps/chosen": -480.589599609375, + "logps/rejected": -721.0247192382812, + "loss": 0.2259, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.6466457843780518, + "rewards/margins": 2.60426664352417, + "rewards/rejected": -5.250911712646484, + "step": 1110 + }, + { + "epoch": 0.83, + "grad_norm": 61.18183448166774, + "learning_rate": 4.1750550270596206e-08, + "logits/chosen": 0.007505857851356268, + "logits/rejected": 0.7265870571136475, + "logps/chosen": -525.0111694335938, + "logps/rejected": -770.3839111328125, + "loss": 0.2062, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.770293712615967, + "rewards/margins": 2.8563060760498047, + "rewards/rejected": -5.6265997886657715, + "step": 1120 }, { "epoch": 0.84, - "eval_logits/chosen": -2.512333631515503, - "eval_logits/rejected": -2.4965250492095947, - "eval_logps/chosen": -248.40835571289062, - "eval_logps/rejected": -190.5157470703125, - "eval_loss": 0.21030600368976593, - "eval_rewards/accuracies": 0.92578125, - "eval_rewards/chosen": 4.512471675872803, - "eval_rewards/margins": 9.793259620666504, - "eval_rewards/rejected": -5.280787944793701, - "eval_runtime": 97.2069, - "eval_samples_per_second": 20.575, - "eval_steps_per_second": 0.329, - "step": 400 + "grad_norm": 68.7383025915832, + "learning_rate": 3.823273191518234e-08, + "logits/chosen": 0.1847713440656662, + "logits/rejected": 0.7002714276313782, + "logps/chosen": -485.5477600097656, + "logps/rejected": -744.1683959960938, + "loss": 0.2208, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.557469606399536, + "rewards/margins": 2.7433865070343018, + "rewards/rejected": -5.300856113433838, + "step": 1130 + }, + { + "epoch": 0.85, + "grad_norm": 52.91705405093606, + "learning_rate": 3.485742375547745e-08, + "logits/chosen": 0.2934524416923523, + "logits/rejected": 0.7874400019645691, + "logps/chosen": -540.8467407226562, + "logps/rejected": -773.0992431640625, + "loss": 0.2157, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.8220906257629395, + "rewards/margins": 2.7147064208984375, + "rewards/rejected": -5.536796569824219, + "step": 1140 + }, + { + "epoch": 0.85, + "grad_norm": 71.88387361950765, + "learning_rate": 3.162689722762365e-08, + "logits/chosen": -0.06768418103456497, + "logits/rejected": 0.4680994153022766, + "logps/chosen": -544.25439453125, + "logps/rejected": -792.7861328125, + "loss": 0.219, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.854830026626587, + "rewards/margins": 2.878235340118408, + "rewards/rejected": -5.733065128326416, + "step": 1150 }, { "epoch": 0.86, - "grad_norm": 337.4045735200527, - "learning_rate": 3.022313472693447e-08, - "logits/chosen": -2.431990146636963, - "logits/rejected": -2.4296875, - "logps/chosen": -269.318603515625, - "logps/rejected": -194.96946716308594, - "loss": 0.6111, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 3.5342013835906982, - "rewards/margins": 8.450251579284668, - "rewards/rejected": -4.916050910949707, - "step": 410 + "grad_norm": 55.10045985903261, + "learning_rate": 2.8543326335997904e-08, + "logits/chosen": 0.22700032591819763, + "logits/rejected": 0.5510324239730835, + "logps/chosen": -525.5604248046875, + "logps/rejected": -764.0299072265625, + "loss": 0.2111, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.728689193725586, + "rewards/margins": 2.7138609886169434, + "rewards/rejected": -5.442550182342529, + "step": 1160 + }, + { + "epoch": 0.87, + "grad_norm": 69.28988155945117, + "learning_rate": 2.560878619020157e-08, + "logits/chosen": 0.41892099380493164, + "logits/rejected": 0.961225152015686, + "logps/chosen": -475.5420837402344, + "logps/rejected": -717.2962036132812, + "loss": 0.2244, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.7839531898498535, + "rewards/margins": 2.69148588180542, + "rewards/rejected": -5.475439071655273, + "step": 1170 }, { "epoch": 0.88, - "grad_norm": 403.24647604289254, - "learning_rate": 2.2111614344599684e-08, - "logits/chosen": -2.39760684967041, - "logits/rejected": -2.376661777496338, - "logps/chosen": -237.7588653564453, - "logps/rejected": -190.60279846191406, - "loss": 0.3801, - "rewards/accuracies": 0.875, - "rewards/chosen": 3.0914576053619385, - "rewards/margins": 7.803999423980713, - "rewards/rejected": -4.712540626525879, - "step": 420 + "grad_norm": 64.73986275241352, + "learning_rate": 2.2825251608601466e-08, + "logits/chosen": 0.1623622179031372, + "logits/rejected": 0.6435664892196655, + "logps/chosen": -536.0408935546875, + "logps/rejected": -744.7503662109375, + "loss": 0.2261, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.80417799949646, + "rewards/margins": 2.5097708702087402, + "rewards/rejected": -5.313948631286621, + "step": 1180 + }, + { + "epoch": 0.88, + "grad_norm": 46.21048147117716, + "learning_rate": 2.0194595789362474e-08, + "logits/chosen": -0.05237337946891785, + "logits/rejected": 0.4507738947868347, + "logps/chosen": -519.0675048828125, + "logps/rejected": -777.24072265625, + "loss": 0.2245, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.453911542892456, + "rewards/margins": 3.05096435546875, + "rewards/rejected": -5.504876136779785, + "step": 1190 + }, + { + "epoch": 0.89, + "grad_norm": 55.74231137792324, + "learning_rate": 1.7718589049866728e-08, + "logits/chosen": 0.32885435223579407, + "logits/rejected": 0.45243850350379944, + "logps/chosen": -502.6609802246094, + "logps/rejected": -755.8111572265625, + "loss": 0.2165, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.85229754447937, + "rewards/margins": 2.4965226650238037, + "rewards/rejected": -5.348820209503174, + "step": 1200 + }, + { + "epoch": 0.89, + "eval_logits/chosen": -0.09797076135873795, + "eval_logits/rejected": 0.20152458548545837, + "eval_logps/chosen": -580.1854858398438, + "eval_logps/rejected": -738.0179443359375, + "eval_loss": 0.5336408615112305, + "eval_rewards/accuracies": 0.76171875, + "eval_rewards/chosen": -2.9855194091796875, + "eval_rewards/margins": 1.5170438289642334, + "eval_rewards/rejected": -4.5025634765625, + "eval_runtime": 98.1382, + "eval_samples_per_second": 20.379, + "eval_steps_per_second": 0.326, + "step": 1200 }, { "epoch": 0.9, - "grad_norm": 208.90618387532385, - "learning_rate": 1.521597710086439e-08, - "logits/chosen": -2.4121103286743164, - "logits/rejected": -2.3859496116638184, - "logps/chosen": -263.43292236328125, - "logps/rejected": -191.71656799316406, - "loss": 0.3449, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 3.8668575286865234, - "rewards/margins": 8.32303524017334, - "rewards/rejected": -4.456177234649658, - "step": 430 + "grad_norm": 59.87247578419851, + "learning_rate": 1.539889763536645e-08, + "logits/chosen": 0.1656707227230072, + "logits/rejected": 0.5841933488845825, + "logps/chosen": -501.6282653808594, + "logps/rejected": -756.2093505859375, + "loss": 0.2316, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.5563437938690186, + "rewards/margins": 2.784249782562256, + "rewards/rejected": -5.3405938148498535, + "step": 1210 + }, + { + "epoch": 0.91, + "grad_norm": 60.03113730852447, + "learning_rate": 1.3237082597673172e-08, + "logits/chosen": 0.1520698368549347, + "logits/rejected": 0.7296702861785889, + "logps/chosen": -479.1258850097656, + "logps/rejected": -756.1190185546875, + "loss": 0.1913, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.538914680480957, + "rewards/margins": 2.866367816925049, + "rewards/rejected": -5.405282974243164, + "step": 1220 + }, + { + "epoch": 0.91, + "grad_norm": 50.74907680854386, + "learning_rate": 1.1234598744637502e-08, + "logits/chosen": 0.06600452959537506, + "logits/rejected": 0.34231507778167725, + "logps/chosen": -536.6253051757812, + "logps/rejected": -802.2919311523438, + "loss": 0.2021, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.8277297019958496, + "rewards/margins": 2.8031983375549316, + "rewards/rejected": -5.630928039550781, + "step": 1230 }, { "epoch": 0.92, - "grad_norm": 221.6697110810539, - "learning_rate": 9.57301420397924e-09, - "logits/chosen": -2.5127618312835693, - "logits/rejected": -2.4956202507019043, - "logps/chosen": -254.6200408935547, - "logps/rejected": -190.78817749023438, - "loss": 0.3624, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": 4.447186470031738, - "rewards/margins": 9.914505004882812, - "rewards/rejected": -5.467317581176758, - "step": 440 + "grad_norm": 53.04165278516392, + "learning_rate": 9.392793661126414e-09, + "logits/chosen": -0.1451832503080368, + "logits/rejected": 0.4121910631656647, + "logps/chosen": -569.254150390625, + "logps/rejected": -809.6979370117188, + "loss": 0.196, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7587039470672607, + "rewards/margins": 2.8314685821533203, + "rewards/rejected": -5.590172290802002, + "step": 1240 + }, + { + "epoch": 0.93, + "grad_norm": 64.9259915789077, + "learning_rate": 7.71290680215711e-09, + "logits/chosen": 0.08149626106023788, + "logits/rejected": 0.6067185401916504, + "logps/chosen": -518.8594970703125, + "logps/rejected": -728.6957397460938, + "loss": 0.227, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.8665976524353027, + "rewards/margins": 2.4750990867614746, + "rewards/rejected": -5.3416972160339355, + "step": 1250 }, { "epoch": 0.94, - "grad_norm": 304.27875154546507, - "learning_rate": 5.212833302556258e-09, - "logits/chosen": -2.5326547622680664, - "logits/rejected": -2.4754910469055176, - "logps/chosen": -242.94784545898438, - "logps/rejected": -187.10147094726562, - "loss": 0.2887, - "rewards/accuracies": 0.918749988079071, - "rewards/chosen": 4.133586406707764, - "rewards/margins": 9.706035614013672, - "rewards/rejected": -5.572449207305908, - "step": 450 + "grad_norm": 56.48468871383734, + "learning_rate": 6.196068658797543e-09, + "logits/chosen": 0.31367558240890503, + "logits/rejected": 0.856947124004364, + "logps/chosen": -480.42205810546875, + "logps/rejected": -714.3417358398438, + "loss": 0.2094, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.750648021697998, + "rewards/margins": 2.5990233421325684, + "rewards/rejected": -5.349670886993408, + "step": 1260 + }, + { + "epoch": 0.94, + "grad_norm": 55.81763854062536, + "learning_rate": 4.843299997394717e-09, + "logits/chosen": 0.29153138399124146, + "logits/rejected": 0.6525137424468994, + "logps/chosen": -505.7523498535156, + "logps/rejected": -773.7969970703125, + "loss": 0.199, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -2.656862735748291, + "rewards/margins": 2.929708957672119, + "rewards/rejected": -5.58657169342041, + "step": 1270 + }, + { + "epoch": 0.95, + "grad_norm": 45.256008770336216, + "learning_rate": 3.655511172643372e-09, + "logits/chosen": -0.10818688571453094, + "logits/rejected": 0.48161396384239197, + "logps/chosen": -536.8485107421875, + "logps/rejected": -788.7921142578125, + "loss": 0.1927, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.5987658500671387, + "rewards/margins": 2.9826765060424805, + "rewards/rejected": -5.581442832946777, + "step": 1280 }, { "epoch": 0.96, - "grad_norm": 480.62571484354635, - "learning_rate": 2.158697848236607e-09, - "logits/chosen": -2.419916868209839, - "logits/rejected": -2.4316189289093018, - "logps/chosen": -255.69723510742188, - "logps/rejected": -203.59130859375, - "loss": 0.4414, - "rewards/accuracies": 0.893750011920929, - "rewards/chosen": 3.5629382133483887, - "rewards/margins": 8.139005661010742, - "rewards/rejected": -4.576067924499512, - "step": 460 + "grad_norm": 75.93254873129665, + "learning_rate": 2.633501514956532e-09, + "logits/chosen": -0.07302987575531006, + "logits/rejected": 0.7443720102310181, + "logps/chosen": -507.47894287109375, + "logps/rejected": -765.6873168945312, + "loss": 0.2005, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -2.673506259918213, + "rewards/margins": 2.9605891704559326, + "rewards/rejected": -5.634096145629883, + "step": 1290 + }, + { + "epoch": 0.97, + "grad_norm": 55.65736054179639, + "learning_rate": 1.777958792550993e-09, + "logits/chosen": 0.05213532596826553, + "logits/rejected": 0.4943299889564514, + "logps/chosen": -537.5960693359375, + "logps/rejected": -807.1434326171875, + "loss": 0.1728, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9276278018951416, + "rewards/margins": 2.9666807651519775, + "rewards/rejected": -5.894307613372803, + "step": 1300 + }, + { + "epoch": 0.97, + "eval_logits/chosen": -0.06342385709285736, + "eval_logits/rejected": 0.23894435167312622, + "eval_logps/chosen": -598.8973999023438, + "eval_logps/rejected": -761.6608276367188, + "eval_loss": 0.5417826175689697, + "eval_rewards/accuracies": 0.75390625, + "eval_rewards/chosen": -3.172638177871704, + "eval_rewards/margins": 1.5663537979125977, + "eval_rewards/rejected": -4.738992214202881, + "eval_runtime": 98.1336, + "eval_samples_per_second": 20.38, + "eval_steps_per_second": 0.326, + "step": 1300 + }, + { + "epoch": 0.97, + "grad_norm": 50.04724298737362, + "learning_rate": 1.0894587486089125e-09, + "logits/chosen": 0.30416375398635864, + "logits/rejected": 0.5567177534103394, + "logps/chosen": -561.5665893554688, + "logps/rejected": -784.9238891601562, + "loss": 0.2226, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.119717836380005, + "rewards/margins": 2.4494900703430176, + "rewards/rejected": -5.569207668304443, + "step": 1310 }, { "epoch": 0.98, - "grad_norm": 258.3454560672873, - "learning_rate": 4.269029751107489e-10, - "logits/chosen": -2.416884660720825, - "logits/rejected": -2.3896336555480957, - "logps/chosen": -230.7972412109375, - "logps/rejected": -200.25830078125, - "loss": 0.2781, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 3.390946865081787, - "rewards/margins": 9.046823501586914, - "rewards/rejected": -5.655877113342285, - "step": 470 + "grad_norm": 58.64068052496794, + "learning_rate": 5.684647138277098e-10, + "logits/chosen": 0.06530937552452087, + "logits/rejected": 0.6911398768424988, + "logps/chosen": -524.456298828125, + "logps/rejected": -789.5941162109375, + "loss": 0.2211, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.7515296936035156, + "rewards/margins": 3.075373411178589, + "rewards/rejected": -5.826903343200684, + "step": 1320 + }, + { + "epoch": 0.99, + "grad_norm": 51.7145291726634, + "learning_rate": 2.153272946184559e-10, + "logits/chosen": 0.033466748893260956, + "logits/rejected": 0.613431453704834, + "logps/chosen": -560.579833984375, + "logps/rejected": -819.1729736328125, + "loss": 0.2092, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -2.932191848754883, + "rewards/margins": 2.935133695602417, + "rewards/rejected": -5.867325782775879, + "step": 1330 + }, + { + "epoch": 1.0, + "grad_norm": 45.056558756064035, + "learning_rate": 3.0284137163189004e-11, + "logits/chosen": -0.001438182545825839, + "logits/rejected": 0.446831613779068, + "logps/chosen": -533.5986938476562, + "logps/rejected": -776.7227783203125, + "loss": 0.1891, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -2.894265651702881, + "rewards/margins": 2.6473288536071777, + "rewards/rejected": -5.541594505310059, + "step": 1340 }, { "epoch": 1.0, - "step": 478, + "step": 1346, "total_flos": 0.0, - "train_loss": 0.4018711235732713, - "train_runtime": 7633.33, - "train_samples_per_second": 8.009, - "train_steps_per_second": 0.063 + "train_loss": 0.3438705863959722, + "train_runtime": 21850.8794, + "train_samples_per_second": 7.884, + "train_steps_per_second": 0.062 } ], "logging_steps": 10, - "max_steps": 478, + "max_steps": 1346, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100,