diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,22 +1,21 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.9996020692399522, + "epoch": 1.0, "eval_steps": 100, - "global_step": 1256, + "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0007958615200955034, - "grad_norm": 4.986090946069212, - "learning_rate": 3.968253968253968e-09, - "logits/chosen": -2.735659122467041, - "logits/rejected": -2.7581238746643066, - "logps/chosen": -124.62968444824219, - "logps/rejected": -168.09475708007812, - "loss": 0.6931, + "epoch": 0.0, + "learning_rate": 1.0416666666666666e-08, + "logits/chosen": -2.847970962524414, + "logits/rejected": -2.79160213470459, + "logps/chosen": -284.9612731933594, + "logps/rejected": -276.45928955078125, + "loss": 0.0866, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -24,2101 +23,742 @@ "step": 1 }, { - "epoch": 0.007958615200955034, - "grad_norm": 5.160120887614425, - "learning_rate": 3.968253968253968e-08, - "logits/chosen": -2.738856315612793, - "logits/rejected": -2.7277917861938477, - "logps/chosen": -146.72731018066406, - "logps/rejected": -131.20956420898438, - "loss": 0.6931, - "rewards/accuracies": 0.4444444477558136, - "rewards/chosen": -0.00025126771652139723, - "rewards/margins": -0.0002628751390147954, - "rewards/rejected": 1.1607427040871698e-05, + "epoch": 0.02, + "learning_rate": 1.0416666666666667e-07, + "logits/chosen": -2.754711151123047, + "logits/rejected": -2.7527668476104736, + "logps/chosen": -249.9619140625, + "logps/rejected": -223.05694580078125, + "loss": 0.0866, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.00014172698138281703, + "rewards/margins": -5.154276004759595e-05, + "rewards/rejected": -9.018417040351778e-05, "step": 10 }, { - "epoch": 0.01591723040191007, - "grad_norm": 4.941410796258004, - "learning_rate": 7.936507936507936e-08, - "logits/chosen": -2.7067627906799316, - "logits/rejected": -2.7038016319274902, - "logps/chosen": -129.4619598388672, - "logps/rejected": -130.26687622070312, - "loss": 0.6932, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 6.337546074064448e-05, - "rewards/margins": 0.00019685756706167012, - "rewards/rejected": -0.00013348212814889848, + "epoch": 0.04, + "learning_rate": 2.0833333333333333e-07, + "logits/chosen": -2.7448391914367676, + "logits/rejected": -2.7453367710113525, + "logps/chosen": -257.4018249511719, + "logps/rejected": -247.55197143554688, + "loss": 0.0866, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0005366196855902672, + "rewards/margins": 0.0017497055232524872, + "rewards/rejected": -0.0012130856048315763, "step": 20 }, { - "epoch": 0.0238758456028651, - "grad_norm": 4.798328591529667, - "learning_rate": 1.1904761904761903e-07, - "logits/chosen": -2.684114694595337, - "logits/rejected": -2.6811366081237793, - "logps/chosen": -141.76788330078125, - "logps/rejected": -155.64646911621094, - "loss": 0.693, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": 0.0006387188332155347, - "rewards/margins": 0.0004968013381585479, - "rewards/rejected": 0.00014191746595315635, + "epoch": 0.06, + "learning_rate": 3.1249999999999997e-07, + "logits/chosen": -2.8008646965026855, + "logits/rejected": -2.753528356552124, + "logps/chosen": -300.37725830078125, + "logps/rejected": -261.90411376953125, + "loss": 0.086, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.0019977777265012264, + "rewards/margins": 0.010120971128344536, + "rewards/rejected": -0.008123192936182022, "step": 30 }, { - "epoch": 0.03183446080382014, - "grad_norm": 5.077340975164852, - "learning_rate": 1.5873015873015872e-07, - "logits/chosen": -2.69206166267395, - "logits/rejected": -2.684312343597412, - "logps/chosen": -154.81820678710938, - "logps/rejected": -164.00318908691406, - "loss": 0.6928, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": 3.8339756429195404e-05, - "rewards/margins": 0.0005206236382946372, - "rewards/rejected": -0.00048228385276161134, + "epoch": 0.08, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": -2.7638278007507324, + "logits/rejected": -2.7516701221466064, + "logps/chosen": -256.6244201660156, + "logps/rejected": -274.86651611328125, + "loss": 0.0851, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.00190549751278013, + "rewards/margins": 0.025926370173692703, + "rewards/rejected": -0.027831867337226868, "step": 40 }, { - "epoch": 0.03979307600477517, - "grad_norm": 4.879872886513438, - "learning_rate": 1.984126984126984e-07, - "logits/chosen": -2.7071824073791504, - "logits/rejected": -2.688455820083618, - "logps/chosen": -143.71075439453125, - "logps/rejected": -137.49859619140625, - "loss": 0.6923, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.00012845727906096727, - "rewards/margins": 0.002997648436576128, - "rewards/rejected": -0.0028691913466900587, + "epoch": 0.1, + "learning_rate": 4.999733114418725e-07, + "logits/chosen": -2.767770767211914, + "logits/rejected": -2.7401928901672363, + "logps/chosen": -284.50543212890625, + "logps/rejected": -256.618408203125, + "loss": 0.0834, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.02425984852015972, + "rewards/margins": 0.06489187479019165, + "rewards/rejected": -0.08915172517299652, "step": 50 }, { - "epoch": 0.0477516912057302, - "grad_norm": 4.800891825988376, - "learning_rate": 2.3809523809523806e-07, - "logits/chosen": -2.715952157974243, - "logits/rejected": -2.7166202068328857, - "logps/chosen": -145.3311309814453, - "logps/rejected": -158.9158935546875, - "loss": 0.6912, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": 0.002955437172204256, - "rewards/margins": 0.003327813697978854, - "rewards/rejected": -0.00037237658398225904, + "epoch": 0.13, + "learning_rate": 4.990398100856366e-07, + "logits/chosen": -2.7361197471618652, + "logits/rejected": -2.724640130996704, + "logps/chosen": -282.0168151855469, + "logps/rejected": -256.8324279785156, + "loss": 0.0805, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.06132340431213379, + "rewards/margins": 0.10652206093072891, + "rewards/rejected": -0.1678454577922821, "step": 60 }, { - "epoch": 0.055710306406685235, - "grad_norm": 5.073043436012192, - "learning_rate": 2.7777777777777776e-07, - "logits/chosen": -2.737873077392578, - "logits/rejected": -2.728829860687256, - "logps/chosen": -148.14971923828125, - "logps/rejected": -142.3303985595703, - "loss": 0.6892, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.005114335101097822, - "rewards/margins": 0.0065292841754853725, - "rewards/rejected": -0.01164362020790577, + "epoch": 0.15, + "learning_rate": 4.967775735898179e-07, + "logits/chosen": -2.7822699546813965, + "logits/rejected": -2.739877223968506, + "logps/chosen": -291.2421875, + "logps/rejected": -274.023193359375, + "loss": 0.0781, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.24106796085834503, + "rewards/margins": 0.17975760996341705, + "rewards/rejected": -0.4208255410194397, "step": 70 }, { - "epoch": 0.06366892160764027, - "grad_norm": 5.019983039590536, - "learning_rate": 3.1746031746031743e-07, - "logits/chosen": -2.7146174907684326, - "logits/rejected": -2.6961400508880615, - "logps/chosen": -155.6132049560547, - "logps/rejected": -147.08509826660156, - "loss": 0.6878, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -0.024789467453956604, - "rewards/margins": 0.004503136035054922, - "rewards/rejected": -0.029292598366737366, + "epoch": 0.17, + "learning_rate": 4.931986719649298e-07, + "logits/chosen": -2.7829222679138184, + "logits/rejected": -2.7552578449249268, + "logps/chosen": -290.30157470703125, + "logps/rejected": -332.9333801269531, + "loss": 0.0744, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.29948586225509644, + "rewards/margins": 0.3068729043006897, + "rewards/rejected": -0.6063587665557861, "step": 80 }, { - "epoch": 0.07162753680859531, - "grad_norm": 5.340063734876701, - "learning_rate": 3.5714285714285716e-07, - "logits/chosen": -2.7231202125549316, - "logits/rejected": -2.729904890060425, - "logps/chosen": -149.95083618164062, - "logps/rejected": -170.76242065429688, - "loss": 0.683, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.04104585200548172, - "rewards/margins": 0.021799778565764427, - "rewards/rejected": -0.0628456324338913, + "epoch": 0.19, + "learning_rate": 4.883222001996351e-07, + "logits/chosen": -2.8093197345733643, + "logits/rejected": -2.7847156524658203, + "logps/chosen": -308.68927001953125, + "logps/rejected": -327.748046875, + "loss": 0.0733, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.3332245349884033, + "rewards/margins": 0.4322693943977356, + "rewards/rejected": -0.7654938697814941, "step": 90 }, { - "epoch": 0.07958615200955034, - "grad_norm": 5.627078193282986, - "learning_rate": 3.968253968253968e-07, - "logits/chosen": -2.679619312286377, - "logits/rejected": -2.6625077724456787, - "logps/chosen": -147.9279022216797, - "logps/rejected": -142.6844482421875, - "loss": 0.679, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.10115940868854523, - "rewards/margins": 0.04033854603767395, - "rewards/rejected": -0.14149793982505798, + "epoch": 0.21, + "learning_rate": 4.821741763807186e-07, + "logits/chosen": -2.776167392730713, + "logits/rejected": -2.743318557739258, + "logps/chosen": -356.5120544433594, + "logps/rejected": -373.7995910644531, + "loss": 0.0717, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6754709482192993, + "rewards/margins": 0.440376341342926, + "rewards/rejected": -1.1158472299575806, "step": 100 }, { - "epoch": 0.07958615200955034, - "eval_logits/chosen": -2.6942806243896484, - "eval_logits/rejected": -2.686227798461914, - "eval_logps/chosen": -158.6035614013672, - "eval_logps/rejected": -168.8750457763672, - "eval_loss": 0.6759105324745178, - "eval_rewards/accuracies": 0.5998134613037109, - "eval_rewards/chosen": -0.1436006873846054, - "eval_rewards/margins": 0.03815995156764984, - "eval_rewards/rejected": -0.18176063895225525, - "eval_runtime": 153.1197, - "eval_samples_per_second": 55.852, - "eval_steps_per_second": 0.875, + "epoch": 0.21, + "eval_logits/chosen": -2.728982448577881, + "eval_logits/rejected": -2.712980031967163, + "eval_logps/chosen": -316.240478515625, + "eval_logps/rejected": -367.9753112792969, + "eval_loss": 0.073273666203022, + "eval_rewards/accuracies": 0.7265625, + "eval_rewards/chosen": -0.5920084714889526, + "eval_rewards/margins": 0.5142118334770203, + "eval_rewards/rejected": -1.1062203645706177, + "eval_runtime": 53.5143, + "eval_samples_per_second": 37.373, + "eval_steps_per_second": 0.598, "step": 100 }, { - "epoch": 0.08754476721050537, - "grad_norm": 6.93010288991277, - "learning_rate": 4.365079365079365e-07, - "logits/chosen": -2.6722376346588135, - "logits/rejected": -2.652876138687134, - "logps/chosen": -177.9219970703125, - "logps/rejected": -164.42129516601562, - "loss": 0.6754, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.19631250202655792, - "rewards/margins": 0.019793253391981125, - "rewards/rejected": -0.21610574424266815, + "epoch": 0.23, + "learning_rate": 4.747874028753375e-07, + "logits/chosen": -2.6736016273498535, + "logits/rejected": -2.6673741340637207, + "logps/chosen": -319.99285888671875, + "logps/rejected": -394.5204772949219, + "loss": 0.0675, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.557295560836792, + "rewards/margins": 0.660796582698822, + "rewards/rejected": -1.2180922031402588, "step": 110 }, { - "epoch": 0.0955033824114604, - "grad_norm": 9.118856419956929, - "learning_rate": 4.761904761904761e-07, - "logits/chosen": -2.6485350131988525, - "logits/rejected": -2.6367721557617188, - "logps/chosen": -167.44517517089844, - "logps/rejected": -174.68959045410156, - "loss": 0.6673, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.2831340432167053, - "rewards/margins": 0.0641321912407875, - "rewards/rejected": -0.34726622700691223, + "epoch": 0.25, + "learning_rate": 4.662012913161997e-07, + "logits/chosen": -2.6141879558563232, + "logits/rejected": -2.5749917030334473, + "logps/chosen": -359.4350280761719, + "logps/rejected": -392.105712890625, + "loss": 0.0709, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9189525842666626, + "rewards/margins": 0.5283113718032837, + "rewards/rejected": -1.4472639560699463, "step": 120 }, { - "epoch": 0.10346199761241544, - "grad_norm": 14.535361780051513, - "learning_rate": 4.999845414634076e-07, - "logits/chosen": -2.6754541397094727, - "logits/rejected": -2.646840810775757, - "logps/chosen": -180.4161376953125, - "logps/rejected": -173.7825164794922, - "loss": 0.647, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.2550658583641052, - "rewards/margins": 0.12581779062747955, - "rewards/rejected": -0.38088366389274597, + "epoch": 0.27, + "learning_rate": 4.5646165232345103e-07, + "logits/chosen": -2.5792384147644043, + "logits/rejected": -2.5439562797546387, + "logps/chosen": -372.8755798339844, + "logps/rejected": -363.8757629394531, + "loss": 0.0701, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.8074409365653992, + "rewards/margins": 0.42888420820236206, + "rewards/rejected": -1.2363251447677612, "step": 130 }, { - "epoch": 0.11142061281337047, - "grad_norm": 9.275457035342084, - "learning_rate": 4.998106548810311e-07, - "logits/chosen": -2.5903968811035156, - "logits/rejected": -2.5657451152801514, - "logps/chosen": -212.9225311279297, - "logps/rejected": -203.63796997070312, - "loss": 0.6494, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.6731986999511719, - "rewards/margins": 0.11040596663951874, - "rewards/rejected": -0.783604621887207, + "epoch": 0.29, + "learning_rate": 4.456204510851956e-07, + "logits/chosen": -2.4518723487854004, + "logits/rejected": -2.429705858230591, + "logps/chosen": -335.4825744628906, + "logps/rejected": -358.73992919921875, + "loss": 0.0687, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8712352514266968, + "rewards/margins": 0.5185465812683105, + "rewards/rejected": -1.3897819519042969, "step": 140 }, { - "epoch": 0.1193792280143255, - "grad_norm": 14.605167530290739, - "learning_rate": 4.994436933879359e-07, - "logits/chosen": -2.510530948638916, - "logits/rejected": -2.503571033477783, - "logps/chosen": -217.52725219726562, - "logps/rejected": -246.1698455810547, - "loss": 0.6321, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.7798799276351929, - "rewards/margins": 0.22194448113441467, - "rewards/rejected": -1.0018242597579956, + "epoch": 0.31, + "learning_rate": 4.337355301007335e-07, + "logits/chosen": -2.3888745307922363, + "logits/rejected": -2.3675789833068848, + "logps/chosen": -345.4061584472656, + "logps/rejected": -378.5643005371094, + "loss": 0.0674, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8017293810844421, + "rewards/margins": 0.5842651128768921, + "rewards/rejected": -1.385994553565979, "step": 150 }, { - "epoch": 0.12733784321528055, - "grad_norm": 13.975831761557039, - "learning_rate": 4.988839406031596e-07, - "logits/chosen": -2.507648468017578, - "logits/rejected": -2.5182366371154785, - "logps/chosen": -202.53158569335938, - "logps/rejected": -266.3342590332031, - "loss": 0.6275, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.8189308047294617, - "rewards/margins": 0.371000200510025, - "rewards/rejected": -1.189931035041809, + "epoch": 0.33, + "learning_rate": 4.2087030056579986e-07, + "logits/chosen": -2.339261531829834, + "logits/rejected": -2.3114781379699707, + "logps/chosen": -338.3687744140625, + "logps/rejected": -375.3154296875, + "loss": 0.0684, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7738892436027527, + "rewards/margins": 0.6639295816421509, + "rewards/rejected": -1.4378188848495483, "step": 160 }, { - "epoch": 0.13529645841623558, - "grad_norm": 19.875830962562063, - "learning_rate": 4.981318291512395e-07, - "logits/chosen": -2.433882474899292, - "logits/rejected": -2.430387020111084, - "logps/chosen": -218.954833984375, - "logps/rejected": -256.6148681640625, - "loss": 0.6212, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.8254417181015015, - "rewards/margins": 0.2912672758102417, - "rewards/rejected": -1.1167091131210327, + "epoch": 0.36, + "learning_rate": 4.070934040463998e-07, + "logits/chosen": -2.2538902759552, + "logits/rejected": -2.2150468826293945, + "logps/chosen": -380.0686950683594, + "logps/rejected": -427.46514892578125, + "loss": 0.0679, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1141223907470703, + "rewards/margins": 0.6542497873306274, + "rewards/rejected": -1.7683721780776978, "step": 170 }, { - "epoch": 0.14325507361719061, - "grad_norm": 24.36373306347673, - "learning_rate": 4.971879403278432e-07, - "logits/chosen": -2.3362064361572266, - "logits/rejected": -2.3115732669830322, - "logps/chosen": -261.9034423828125, - "logps/rejected": -281.554443359375, - "loss": 0.6153, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.1337685585021973, - "rewards/margins": 0.24233467876911163, - "rewards/rejected": -1.376103401184082, + "epoch": 0.38, + "learning_rate": 3.9247834624635404e-07, + "logits/chosen": -2.331003427505493, + "logits/rejected": -2.2696776390075684, + "logps/chosen": -374.4627380371094, + "logps/rejected": -422.1844177246094, + "loss": 0.066, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.980134129524231, + "rewards/margins": 0.7622623443603516, + "rewards/rejected": -1.742396593093872, "step": 180 }, { - "epoch": 0.15121368881814565, - "grad_norm": 24.478766866690933, - "learning_rate": 4.960530036504941e-07, - "logits/chosen": -2.0850882530212402, - "logits/rejected": -2.040325403213501, - "logps/chosen": -273.06854248046875, - "logps/rejected": -312.0998229980469, - "loss": 0.6106, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2871907949447632, - "rewards/margins": 0.4555794596672058, - "rewards/rejected": -1.7427704334259033, + "epoch": 0.4, + "learning_rate": 3.7710310482256523e-07, + "logits/chosen": -2.2278285026550293, + "logits/rejected": -2.187769651412964, + "logps/chosen": -367.53619384765625, + "logps/rejected": -409.8011169433594, + "loss": 0.0653, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1032383441925049, + "rewards/margins": 0.6265621781349182, + "rewards/rejected": -1.7298005819320679, "step": 190 }, { - "epoch": 0.15917230401910068, - "grad_norm": 19.670570237645645, - "learning_rate": 4.947278962947386e-07, - "logits/chosen": -1.907619833946228, - "logits/rejected": -1.9094089269638062, - "logps/chosen": -261.22698974609375, - "logps/rejected": -314.9784851074219, - "loss": 0.5947, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.1207753419876099, - "rewards/margins": 0.399713933467865, - "rewards/rejected": -1.5204893350601196, + "epoch": 0.42, + "learning_rate": 3.610497133404795e-07, + "logits/chosen": -2.2965497970581055, + "logits/rejected": -2.2077629566192627, + "logps/chosen": -437.07275390625, + "logps/rejected": -473.34228515625, + "loss": 0.0672, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.024962067604065, + "rewards/margins": 0.9094398617744446, + "rewards/rejected": -1.9344017505645752, "step": 200 }, { - "epoch": 0.15917230401910068, - "eval_logits/chosen": -1.661999225616455, - "eval_logits/rejected": -1.6082700490951538, - "eval_logps/chosen": -295.57269287109375, - "eval_logps/rejected": -351.9329528808594, - "eval_loss": 0.6026676893234253, - "eval_rewards/accuracies": 0.6679104566574097, - "eval_rewards/chosen": -1.5132923126220703, - "eval_rewards/margins": 0.49904683232307434, - "eval_rewards/rejected": -2.0123391151428223, - "eval_runtime": 153.0416, - "eval_samples_per_second": 55.88, - "eval_steps_per_second": 0.876, + "epoch": 0.42, + "eval_logits/chosen": -2.23770809173584, + "eval_logits/rejected": -2.204430103302002, + "eval_logps/chosen": -350.147216796875, + "eval_logps/rejected": -429.3445129394531, + "eval_loss": 0.06620719283819199, + "eval_rewards/accuracies": 0.7421875, + "eval_rewards/chosen": -0.9310759902000427, + "eval_rewards/margins": 0.7888363599777222, + "eval_rewards/rejected": -1.7199124097824097, + "eval_runtime": 53.5039, + "eval_samples_per_second": 37.38, + "eval_steps_per_second": 0.598, "step": 200 }, { - "epoch": 0.1671309192200557, - "grad_norm": 16.167356453307296, - "learning_rate": 4.932136424161899e-07, - "logits/chosen": -1.4656827449798584, - "logits/rejected": -1.4232580661773682, - "logps/chosen": -280.2381591796875, - "logps/rejected": -332.170654296875, - "loss": 0.6021, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.546456217765808, - "rewards/margins": 0.4581894874572754, - "rewards/rejected": -2.004645824432373, + "epoch": 0.44, + "learning_rate": 3.4440382358952115e-07, + "logits/chosen": -2.2368922233581543, + "logits/rejected": -2.164578437805176, + "logps/chosen": -367.9010009765625, + "logps/rejected": -403.06689453125, + "loss": 0.0688, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.0069431066513062, + "rewards/margins": 0.6194397211074829, + "rewards/rejected": -1.6263830661773682, "step": 210 }, { - "epoch": 0.17508953442101075, - "grad_norm": 14.356845739543413, - "learning_rate": 4.915114123589732e-07, - "logits/chosen": -1.5527619123458862, - "logits/rejected": -1.5004384517669678, - "logps/chosen": -306.37225341796875, - "logps/rejected": -345.39385986328125, - "loss": 0.6093, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.6947263479232788, - "rewards/margins": 0.40286189317703247, - "rewards/rejected": -2.097588300704956, + "epoch": 0.46, + "learning_rate": 3.272542485937368e-07, + "logits/chosen": -2.223553419113159, + "logits/rejected": -2.1863625049591064, + "logps/chosen": -383.9325256347656, + "logps/rejected": -408.54156494140625, + "loss": 0.066, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.893720805644989, + "rewards/margins": 0.7813506126403809, + "rewards/rejected": -1.6750714778900146, "step": 220 }, { - "epoch": 0.18304814962196578, - "grad_norm": 18.794378620034212, - "learning_rate": 4.896225217511849e-07, - "logits/chosen": -1.7889591455459595, - "logits/rejected": -1.7687686681747437, - "logps/chosen": -236.9343719482422, - "logps/rejected": -293.1578674316406, - "loss": 0.58, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.9763768911361694, - "rewards/margins": 0.46785277128219604, - "rewards/rejected": -1.4442296028137207, + "epoch": 0.48, + "learning_rate": 3.096924887558854e-07, + "logits/chosen": -2.1857941150665283, + "logits/rejected": -2.1605677604675293, + "logps/chosen": -385.0683288574219, + "logps/rejected": -465.76214599609375, + "loss": 0.065, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0325820446014404, + "rewards/margins": 0.7163792848587036, + "rewards/rejected": -1.7489612102508545, "step": 230 }, { - "epoch": 0.1910067648229208, - "grad_norm": 16.085123205817453, - "learning_rate": 4.875484304880629e-07, - "logits/chosen": -1.8684545755386353, - "logits/rejected": -1.8107630014419556, - "logps/chosen": -275.6334533691406, - "logps/rejected": -318.3482360839844, - "loss": 0.5724, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.13875412940979, - "rewards/margins": 0.4855495095252991, - "rewards/rejected": -1.6243034601211548, + "epoch": 0.5, + "learning_rate": 2.9181224366319943e-07, + "logits/chosen": -2.1450023651123047, + "logits/rejected": -2.1016509532928467, + "logps/chosen": -375.78448486328125, + "logps/rejected": -456.2076110839844, + "loss": 0.0663, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2208166122436523, + "rewards/margins": 0.755986213684082, + "rewards/rejected": -1.9768028259277344, "step": 240 }, { - "epoch": 0.19896538002387584, - "grad_norm": 18.207419558759465, - "learning_rate": 4.852907416036558e-07, - "logits/chosen": -1.8272058963775635, - "logits/rejected": -1.8065818548202515, - "logps/chosen": -253.82308959960938, - "logps/rejected": -313.4346618652344, - "loss": 0.5908, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.1452624797821045, - "rewards/margins": 0.4727972149848938, - "rewards/rejected": -1.618059754371643, + "epoch": 0.52, + "learning_rate": 2.7370891215954565e-07, + "logits/chosen": -2.084066390991211, + "logits/rejected": -2.000739574432373, + "logps/chosen": -408.6034240722656, + "logps/rejected": -460.944091796875, + "loss": 0.0656, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.496781587600708, + "rewards/margins": 0.7584124207496643, + "rewards/rejected": -2.2551939487457275, "step": 250 }, { - "epoch": 0.20692399522483088, - "grad_norm": 17.79415319185038, - "learning_rate": 4.828512000318616e-07, - "logits/chosen": -1.2992960214614868, - "logits/rejected": -1.1269283294677734, - "logps/chosen": -330.8904113769531, - "logps/rejected": -382.5238952636719, - "loss": 0.5689, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.661895990371704, - "rewards/margins": 0.6118096709251404, - "rewards/rejected": -2.2737057209014893, + "epoch": 0.54, + "learning_rate": 2.55479083351317e-07, + "logits/chosen": -2.1052582263946533, + "logits/rejected": -2.0781798362731934, + "logps/chosen": -385.717529296875, + "logps/rejected": -427.49053955078125, + "loss": 0.0675, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.171087622642517, + "rewards/margins": 0.6191247701644897, + "rewards/rejected": -1.7902123928070068, "step": 260 }, { - "epoch": 0.2148826104257859, - "grad_norm": 14.406997827267608, - "learning_rate": 4.802316912577946e-07, - "logits/chosen": -1.2646944522857666, - "logits/rejected": -1.1142375469207764, - "logps/chosen": -307.0154724121094, - "logps/rejected": -348.8962707519531, - "loss": 0.5872, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.5936686992645264, - "rewards/margins": 0.5530277490615845, - "rewards/rejected": -2.1466963291168213, + "epoch": 0.56, + "learning_rate": 2.3722002126275822e-07, + "logits/chosen": -2.060842514038086, + "logits/rejected": -2.032809257507324, + "logps/chosen": -411.20904541015625, + "logps/rejected": -466.87945556640625, + "loss": 0.0644, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3635554313659668, + "rewards/margins": 0.7496285438537598, + "rewards/rejected": -2.1131839752197266, "step": 270 }, { - "epoch": 0.22284122562674094, - "grad_norm": 20.387615258054584, - "learning_rate": 4.774342398605221e-07, - "logits/chosen": -1.1495741605758667, - "logits/rejected": -1.0322356224060059, - "logps/chosen": -358.7090759277344, - "logps/rejected": -401.15283203125, - "loss": 0.5778, - "rewards/accuracies": 0.65625, - "rewards/chosen": -2.131046772003174, - "rewards/margins": 0.4818685054779053, - "rewards/rejected": -2.6129150390625, + "epoch": 0.59, + "learning_rate": 2.19029145890313e-07, + "logits/chosen": -2.0691192150115967, + "logits/rejected": -2.014458417892456, + "logps/chosen": -368.3045349121094, + "logps/rejected": -434.04840087890625, + "loss": 0.0659, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2689889669418335, + "rewards/margins": 0.7694240808486938, + "rewards/rejected": -2.0384132862091064, "step": 280 }, { - "epoch": 0.23079984082769597, - "grad_norm": 17.33296023667363, - "learning_rate": 4.744610079482978e-07, - "logits/chosen": -1.5665136575698853, - "logits/rejected": -1.3881865739822388, - "logps/chosen": -337.09918212890625, - "logps/rejected": -372.98944091796875, - "loss": 0.5975, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.7142517566680908, - "rewards/margins": 0.448456346988678, - "rewards/rejected": -2.162708282470703, + "epoch": 0.61, + "learning_rate": 2.0100351342479216e-07, + "logits/chosen": -2.0620741844177246, + "logits/rejected": -2.0015225410461426, + "logps/chosen": -380.07489013671875, + "logps/rejected": -437.5985412597656, + "loss": 0.0641, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3346366882324219, + "rewards/margins": 0.6989216208457947, + "rewards/rejected": -2.0335583686828613, "step": 290 }, { - "epoch": 0.238758456028651, - "grad_norm": 14.305368392746832, - "learning_rate": 4.713142934875005e-07, - "logits/chosen": -1.339380145072937, - "logits/rejected": -1.121058702468872, - "logps/chosen": -292.33392333984375, - "logps/rejected": -326.50115966796875, - "loss": 0.578, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.4477901458740234, - "rewards/margins": 0.5085382461547852, - "rewards/rejected": -1.9563281536102295, + "epoch": 0.63, + "learning_rate": 1.8323929841460178e-07, + "logits/chosen": -2.0831615924835205, + "logits/rejected": -2.0331509113311768, + "logps/chosen": -385.72332763671875, + "logps/rejected": -458.3067932128906, + "loss": 0.0648, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.294547200202942, + "rewards/margins": 0.7150087356567383, + "rewards/rejected": -2.0095558166503906, "step": 300 }, { - "epoch": 0.238758456028651, - "eval_logits/chosen": -1.5127747058868408, - "eval_logits/rejected": -1.3925303220748901, - "eval_logps/chosen": -271.07684326171875, - "eval_logps/rejected": -322.1283874511719, - "eval_loss": 0.575111448764801, - "eval_rewards/accuracies": 0.6893656849861145, - "eval_rewards/chosen": -1.2683334350585938, - "eval_rewards/margins": 0.4459605813026428, - "eval_rewards/rejected": -1.714294195175171, - "eval_runtime": 153.0554, - "eval_samples_per_second": 55.875, - "eval_steps_per_second": 0.876, + "epoch": 0.63, + "eval_logits/chosen": -2.1098170280456543, + "eval_logits/rejected": -2.072664976119995, + "eval_logps/chosen": -382.6705322265625, + "eval_logps/rejected": -471.1217041015625, + "eval_loss": 0.06434247642755508, + "eval_rewards/accuracies": 0.7734375, + "eval_rewards/chosen": -1.256308674812317, + "eval_rewards/margins": 0.8813759088516235, + "eval_rewards/rejected": -2.1376843452453613, + "eval_runtime": 53.4979, + "eval_samples_per_second": 37.385, + "eval_steps_per_second": 0.598, "step": 300 }, { - "epoch": 0.24671707122960604, - "grad_norm": 16.44125545365527, - "learning_rate": 4.679965285265706e-07, - "logits/chosen": -1.284905195236206, - "logits/rejected": -1.2035940885543823, - "logps/chosen": -255.46212768554688, - "logps/rejected": -315.3814697265625, - "loss": 0.5717, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3790826797485352, - "rewards/margins": 0.47862672805786133, - "rewards/rejected": -1.857709288597107, + "epoch": 0.65, + "learning_rate": 1.6583128063291573e-07, + "logits/chosen": -2.038897752761841, + "logits/rejected": -1.9522100687026978, + "logps/chosen": -395.3385009765625, + "logps/rejected": -463.807861328125, + "loss": 0.0619, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.2215960025787354, + "rewards/margins": 0.9447793960571289, + "rewards/rejected": -2.1663756370544434, "step": 310 }, { - "epoch": 0.2546756864305611, - "grad_norm": 18.14884709080922, - "learning_rate": 4.64510277316316e-07, - "logits/chosen": -0.8231368064880371, - "logits/rejected": -0.5619879961013794, - "logps/chosen": -297.4189453125, - "logps/rejected": -376.95050048828125, - "loss": 0.5411, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.5178688764572144, - "rewards/margins": 0.8594205975532532, - "rewards/rejected": -2.3772895336151123, + "epoch": 0.67, + "learning_rate": 1.488723393865766e-07, + "logits/chosen": -2.0476739406585693, + "logits/rejected": -2.003208875656128, + "logps/chosen": -440.6886291503906, + "logps/rejected": -456.0061950683594, + "loss": 0.0649, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.443062663078308, + "rewards/margins": 0.7629345655441284, + "rewards/rejected": -2.2059974670410156, "step": 320 }, { - "epoch": 0.26263430163151613, - "grad_norm": 16.049521884862, - "learning_rate": 4.6085823432804137e-07, - "logits/chosen": -0.8478446006774902, - "logits/rejected": -0.6845074892044067, - "logps/chosen": -326.3879699707031, - "logps/rejected": -379.32586669921875, - "loss": 0.5815, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.8262717723846436, - "rewards/margins": 0.5317971110343933, - "rewards/rejected": -2.3580689430236816, + "epoch": 0.69, + "learning_rate": 1.3245295796480788e-07, + "logits/chosen": -2.0648202896118164, + "logits/rejected": -2.0035393238067627, + "logps/chosen": -399.42724609375, + "logps/rejected": -452.73468017578125, + "loss": 0.0643, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.472140908241272, + "rewards/margins": 0.7188968062400818, + "rewards/rejected": -2.191037654876709, "step": 330 }, { - "epoch": 0.27059291683247116, - "grad_norm": 17.252563575103384, - "learning_rate": 4.570432221710314e-07, - "logits/chosen": -0.7273017764091492, - "logits/rejected": -0.5780073404312134, - "logps/chosen": -302.949462890625, - "logps/rejected": -364.41619873046875, - "loss": 0.5814, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.567679762840271, - "rewards/margins": 0.5210777521133423, - "rewards/rejected": -2.0887577533721924, + "epoch": 0.71, + "learning_rate": 1.1666074087171627e-07, + "logits/chosen": -2.010235548019409, + "logits/rejected": -1.9665842056274414, + "logps/chosen": -401.20391845703125, + "logps/rejected": -479.63140869140625, + "loss": 0.0655, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4996058940887451, + "rewards/margins": 0.8774793744087219, + "rewards/rejected": -2.3770852088928223, "step": 340 }, { - "epoch": 0.2785515320334262, - "grad_norm": 15.400974157549417, - "learning_rate": 4.5306818941099866e-07, - "logits/chosen": -0.4381464123725891, - "logits/rejected": -0.08063732087612152, - "logps/chosen": -313.7085876464844, - "logps/rejected": -357.1277770996094, - "loss": 0.5942, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.6348435878753662, - "rewards/margins": 0.5984010100364685, - "rewards/rejected": -2.2332444190979004, + "epoch": 0.73, + "learning_rate": 1.0157994641835734e-07, + "logits/chosen": -1.9906257390975952, + "logits/rejected": -1.9489715099334717, + "logps/chosen": -377.80291748046875, + "logps/rejected": -473.23565673828125, + "loss": 0.0621, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5185272693634033, + "rewards/margins": 0.806858241558075, + "rewards/rejected": -2.325385570526123, "step": 350 }, { - "epoch": 0.28651014723438123, - "grad_norm": 14.504561357614989, - "learning_rate": 4.4893620829118124e-07, - "logits/chosen": -0.4463214874267578, - "logits/rejected": -0.24701222777366638, - "logps/chosen": -351.8408203125, - "logps/rejected": -392.67095947265625, - "loss": 0.5536, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.0010366439819336, - "rewards/margins": 0.4717040956020355, - "rewards/rejected": -2.472740888595581, + "epoch": 0.75, + "learning_rate": 8.729103716819111e-08, + "logits/chosen": -2.0000805854797363, + "logits/rejected": -1.9243812561035156, + "logps/chosen": -406.80120849609375, + "logps/rejected": -481.9264221191406, + "loss": 0.0596, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3953304290771484, + "rewards/margins": 1.0305477380752563, + "rewards/rejected": -2.4258780479431152, "step": 360 }, { - "epoch": 0.29446876243533626, - "grad_norm": 17.52031908845256, - "learning_rate": 4.4465047235785185e-07, - "logits/chosen": -0.9400293231010437, - "logits/rejected": -0.6988920569419861, - "logps/chosen": -370.46893310546875, - "logps/rejected": -406.6575927734375, - "loss": 0.5671, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -2.2562451362609863, - "rewards/margins": 0.5276610851287842, - "rewards/rejected": -2.7839062213897705, + "epoch": 0.77, + "learning_rate": 7.387025063449081e-08, + "logits/chosen": -1.9636814594268799, + "logits/rejected": -1.941389799118042, + "logps/chosen": -379.83538818359375, + "logps/rejected": -478.5880432128906, + "loss": 0.0611, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.2458791732788086, + "rewards/margins": 0.9973210096359253, + "rewards/rejected": -2.2432003021240234, "step": 370 }, { - "epoch": 0.3024273776362913, - "grad_norm": 16.26258053172861, - "learning_rate": 4.40214293992074e-07, - "logits/chosen": -1.0389044284820557, - "logits/rejected": -0.8688551187515259, - "logps/chosen": -315.2569580078125, - "logps/rejected": -383.2552795410156, - "loss": 0.5533, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.7786381244659424, - "rewards/margins": 0.7078015208244324, - "rewards/rejected": -2.4864397048950195, + "epoch": 0.79, + "learning_rate": 6.138919252022435e-08, + "logits/chosen": -2.004906415939331, + "logits/rejected": -1.9661741256713867, + "logps/chosen": -411.92626953125, + "logps/rejected": -458.4734802246094, + "loss": 0.0625, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.46096932888031, + "rewards/margins": 0.755499005317688, + "rewards/rejected": -2.216468334197998, "step": 380 }, { - "epoch": 0.3103859928372463, - "grad_norm": 18.389510644608478, - "learning_rate": 4.3563110184961234e-07, - "logits/chosen": -0.9544135928153992, - "logits/rejected": -0.8071931004524231, - "logps/chosen": -319.52386474609375, - "logps/rejected": -378.70819091796875, - "loss": 0.5412, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.7031948566436768, - "rewards/margins": 0.5624147653579712, - "rewards/rejected": -2.2656095027923584, + "epoch": 0.82, + "learning_rate": 4.991445467064689e-08, + "logits/chosen": -1.9821668863296509, + "logits/rejected": -1.9519681930541992, + "logps/chosen": -385.4888000488281, + "logps/rejected": -463.5560607910156, + "loss": 0.0612, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.282893419265747, + "rewards/margins": 0.933008074760437, + "rewards/rejected": -2.2159013748168945, "step": 390 }, { - "epoch": 0.31834460803820136, - "grad_norm": 20.77179730886419, - "learning_rate": 4.3090443821097566e-07, - "logits/chosen": -0.5997304320335388, - "logits/rejected": -0.3994545638561249, - "logps/chosen": -297.0989074707031, - "logps/rejected": -398.7206726074219, - "loss": 0.5575, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.6102817058563232, - "rewards/margins": 0.8746851086616516, - "rewards/rejected": -2.48496675491333, + "epoch": 0.84, + "learning_rate": 3.9507259776993954e-08, + "logits/chosen": -2.0385870933532715, + "logits/rejected": -2.0015995502471924, + "logps/chosen": -436.51800537109375, + "logps/rejected": -499.8250427246094, + "loss": 0.0636, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3635799884796143, + "rewards/margins": 0.9055550694465637, + "rewards/rejected": -2.269134998321533, "step": 400 }, { - "epoch": 0.31834460803820136, - "eval_logits/chosen": -0.42633184790611267, - "eval_logits/rejected": -0.25109803676605225, - "eval_logps/chosen": -322.9848327636719, - "eval_logps/rejected": -395.5074157714844, - "eval_loss": 0.5613003373146057, - "eval_rewards/accuracies": 0.7052238583564758, - "eval_rewards/chosen": -1.7874133586883545, - "eval_rewards/margins": 0.6606705784797668, - "eval_rewards/rejected": -2.4480838775634766, - "eval_runtime": 153.0612, - "eval_samples_per_second": 55.873, - "eval_steps_per_second": 0.875, + "epoch": 0.84, + "eval_logits/chosen": -2.047151565551758, + "eval_logits/rejected": -2.0084266662597656, + "eval_logps/chosen": -391.10418701171875, + "eval_logps/rejected": -488.8221740722656, + "eval_loss": 0.06317802518606186, + "eval_rewards/accuracies": 0.7734375, + "eval_rewards/chosen": -1.3406453132629395, + "eval_rewards/margins": 0.9740438461303711, + "eval_rewards/rejected": -2.3146889209747314, + "eval_runtime": 53.4405, + "eval_samples_per_second": 37.425, + "eval_steps_per_second": 0.599, "step": 400 }, { - "epoch": 0.3263032232391564, - "grad_norm": 14.6184384920338, - "learning_rate": 4.2603795624364195e-07, - "logits/chosen": -0.3563244640827179, - "logits/rejected": -0.04272305592894554, - "logps/chosen": -324.42291259765625, - "logps/rejected": -374.9647521972656, - "loss": 0.5646, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.8804908990859985, - "rewards/margins": 0.5943077802658081, - "rewards/rejected": -2.4747986793518066, + "epoch": 0.86, + "learning_rate": 3.022313472693447e-08, + "logits/chosen": -2.0089824199676514, + "logits/rejected": -1.990161657333374, + "logps/chosen": -397.4513854980469, + "logps/rejected": -437.2802734375, + "loss": 0.064, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4276202917099, + "rewards/margins": 0.6205279231071472, + "rewards/rejected": -2.0481481552124023, "step": 410 }, { - "epoch": 0.3342618384401114, - "grad_norm": 16.33524211106954, - "learning_rate": 4.210354171785795e-07, - "logits/chosen": -0.3055838942527771, - "logits/rejected": -0.16813552379608154, - "logps/chosen": -361.65093994140625, - "logps/rejected": -439.7191467285156, - "loss": 0.5423, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.1948649883270264, - "rewards/margins": 0.6555823087692261, - "rewards/rejected": -2.850447177886963, + "epoch": 0.88, + "learning_rate": 2.2111614344599684e-08, + "logits/chosen": -2.084846019744873, + "logits/rejected": -2.0091195106506348, + "logps/chosen": -399.70892333984375, + "logps/rejected": -477.39215087890625, + "loss": 0.061, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2201029062271118, + "rewards/margins": 1.0571900606155396, + "rewards/rejected": -2.2772929668426514, "step": 420 }, { - "epoch": 0.34222045364106646, - "grad_norm": 16.700884894882467, - "learning_rate": 4.15900687403248e-07, - "logits/chosen": -0.16022978723049164, - "logits/rejected": 0.04306970164179802, - "logps/chosen": -349.4731750488281, - "logps/rejected": -418.4617614746094, - "loss": 0.5537, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.1153881549835205, - "rewards/margins": 0.6664639711380005, - "rewards/rejected": -2.7818517684936523, + "epoch": 0.9, + "learning_rate": 1.521597710086439e-08, + "logits/chosen": -2.076000928878784, + "logits/rejected": -2.0238547325134277, + "logps/chosen": -406.68243408203125, + "logps/rejected": -459.352783203125, + "loss": 0.0611, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3172928094863892, + "rewards/margins": 0.819710910320282, + "rewards/rejected": -2.1370038986206055, "step": 430 }, { - "epoch": 0.3501790688420215, - "grad_norm": 14.476246245258272, - "learning_rate": 4.1063773547332584e-07, - "logits/chosen": 0.17018680274486542, - "logits/rejected": 0.42503976821899414, - "logps/chosen": -332.9987487792969, - "logps/rejected": -387.84613037109375, - "loss": 0.5584, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.8605985641479492, - "rewards/margins": 0.5384317636489868, - "rewards/rejected": -2.3990302085876465, + "epoch": 0.92, + "learning_rate": 9.57301420397924e-09, + "logits/chosen": -1.9988025426864624, + "logits/rejected": -1.979654312133789, + "logps/chosen": -415.54779052734375, + "logps/rejected": -451.9610900878906, + "loss": 0.0628, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.372821569442749, + "rewards/margins": 0.7440255880355835, + "rewards/rejected": -2.116847276687622, "step": 440 }, { - "epoch": 0.3581376840429765, - "grad_norm": 23.02311576539631, - "learning_rate": 4.0525062904547276e-07, - "logits/chosen": 0.4983510971069336, - "logits/rejected": 0.8844894170761108, - "logps/chosen": -356.9858703613281, - "logps/rejected": -418.490234375, - "loss": 0.5431, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.2442288398742676, - "rewards/margins": 0.6913145184516907, - "rewards/rejected": -2.9355432987213135, + "epoch": 0.94, + "learning_rate": 5.212833302556258e-09, + "logits/chosen": -2.028916120529175, + "logits/rejected": -1.9799381494522095, + "logps/chosen": -409.3075866699219, + "logps/rejected": -491.86444091796875, + "loss": 0.0605, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.326899766921997, + "rewards/margins": 0.9430659413337708, + "rewards/rejected": -2.269965648651123, "step": 450 }, { - "epoch": 0.36609629924393156, - "grad_norm": 18.11929521024602, - "learning_rate": 3.997435317334988e-07, - "logits/chosen": 0.0664445012807846, - "logits/rejected": 0.3327707350254059, - "logps/chosen": -370.38568115234375, - "logps/rejected": -447.3545837402344, - "loss": 0.5506, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.2337095737457275, - "rewards/margins": 0.7081635594367981, - "rewards/rejected": -2.941873073577881, + "epoch": 0.96, + "learning_rate": 2.158697848236607e-09, + "logits/chosen": -1.9905271530151367, + "logits/rejected": -1.9591131210327148, + "logps/chosen": -391.362060546875, + "logps/rejected": -461.91204833984375, + "loss": 0.0604, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.2606008052825928, + "rewards/margins": 0.8890584111213684, + "rewards/rejected": -2.1496591567993164, "step": 460 }, { - "epoch": 0.3740549144448866, - "grad_norm": 15.617407519466495, - "learning_rate": 3.941206998903701e-07, - "logits/chosen": -0.06704016029834747, - "logits/rejected": 0.2724049985408783, - "logps/chosen": -378.48236083984375, - "logps/rejected": -437.5526428222656, - "loss": 0.5621, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.237128973007202, - "rewards/margins": 0.6170836687088013, - "rewards/rejected": -2.854212760925293, - "step": 470 - }, - { - "epoch": 0.3820135296458416, - "grad_norm": 20.78463516779003, - "learning_rate": 3.8838647931853684e-07, - "logits/chosen": -0.20896323025226593, - "logits/rejected": 0.013983624055981636, - "logps/chosen": -309.01287841796875, - "logps/rejected": -382.2357177734375, - "loss": 0.5527, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.8711271286010742, - "rewards/margins": 0.6992978453636169, - "rewards/rejected": -2.570424795150757, - "step": 480 - }, - { - "epoch": 0.38997214484679665, - "grad_norm": 17.40353233817741, - "learning_rate": 3.825453019111281e-07, - "logits/chosen": -0.6753722429275513, - "logits/rejected": -0.3529302477836609, - "logps/chosen": -310.6385192871094, - "logps/rejected": -395.55609130859375, - "loss": 0.5438, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.6616952419281006, - "rewards/margins": 0.7488740682601929, - "rewards/rejected": -2.410569190979004, - "step": 490 - }, - { - "epoch": 0.3979307600477517, - "grad_norm": 19.06650086284646, - "learning_rate": 3.7660168222660824e-07, - "logits/chosen": -0.7103143930435181, - "logits/rejected": -0.3697218894958496, - "logps/chosen": -343.75140380859375, - "logps/rejected": -391.75445556640625, - "loss": 0.5311, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.753021478652954, - "rewards/margins": 0.6420953869819641, - "rewards/rejected": -2.3951168060302734, - "step": 500 - }, - { - "epoch": 0.3979307600477517, - "eval_logits/chosen": -0.14436845481395721, - "eval_logits/rejected": 0.1320793181657791, - "eval_logps/chosen": -351.6741027832031, - "eval_logps/rejected": -428.5196228027344, - "eval_loss": 0.5600787997245789, - "eval_rewards/accuracies": 0.7248134613037109, - "eval_rewards/chosen": -2.074305534362793, - "eval_rewards/margins": 0.7039004564285278, - "eval_rewards/rejected": -2.7782061100006104, - "eval_runtime": 153.2793, - "eval_samples_per_second": 55.794, - "eval_steps_per_second": 0.874, - "step": 500 - }, - { - "epoch": 0.4058893752487067, - "grad_norm": 20.933791990106656, - "learning_rate": 3.705602139995416e-07, - "logits/chosen": -0.009120747447013855, - "logits/rejected": 0.2614721655845642, - "logps/chosen": -390.1700744628906, - "logps/rejected": -449.2955627441406, - "loss": 0.5792, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.3790764808654785, - "rewards/margins": 0.6102325916290283, - "rewards/rejected": -2.989309549331665, - "step": 510 - }, - { - "epoch": 0.41384799044966175, - "grad_norm": 15.961444394222205, - "learning_rate": 3.6442556659016475e-07, - "logits/chosen": 0.2502554953098297, - "logits/rejected": 0.46849527955055237, - "logps/chosen": -383.04693603515625, - "logps/rejected": -437.341796875, - "loss": 0.5409, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.355401039123535, - "rewards/margins": 0.519227921962738, - "rewards/rejected": -2.874629259109497, - "step": 520 - }, - { - "epoch": 0.4218066056506168, - "grad_norm": 16.827064617105826, - "learning_rate": 3.582024813755076e-07, - "logits/chosen": 0.6198503971099854, - "logits/rejected": 0.8526325225830078, - "logps/chosen": -389.84356689453125, - "logps/rejected": -434.67999267578125, - "loss": 0.552, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.392110824584961, - "rewards/margins": 0.48854774236679077, - "rewards/rejected": -2.8806586265563965, - "step": 530 - }, - { - "epoch": 0.4297652208515718, - "grad_norm": 18.188037065315324, - "learning_rate": 3.5189576808485404e-07, - "logits/chosen": 0.3544410765171051, - "logits/rejected": 0.4798678755760193, - "logps/chosen": -369.5892028808594, - "logps/rejected": -434.74053955078125, - "loss": 0.5622, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.1900246143341064, - "rewards/margins": 0.6025354862213135, - "rewards/rejected": -2.79256010055542, - "step": 540 - }, - { - "epoch": 0.43772383605252685, - "grad_norm": 20.54659926983385, - "learning_rate": 3.4551030108237433e-07, - "logits/chosen": -0.033793479204177856, - "logits/rejected": 0.37864160537719727, - "logps/chosen": -356.50653076171875, - "logps/rejected": -409.03839111328125, - "loss": 0.5558, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.03935170173645, - "rewards/margins": 0.6931120157241821, - "rewards/rejected": -2.7324633598327637, - "step": 550 - }, - { - "epoch": 0.4456824512534819, - "grad_norm": 17.320248907302155, - "learning_rate": 3.390510155998023e-07, - "logits/chosen": -0.5626561045646667, - "logits/rejected": -0.217830628156662, - "logps/chosen": -339.8533630371094, - "logps/rejected": -406.0281066894531, - "loss": 0.5374, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.8559491634368896, - "rewards/margins": 0.6421788334846497, - "rewards/rejected": -2.4981281757354736, - "step": 560 - }, - { - "epoch": 0.4536410664544369, - "grad_norm": 18.454225254996054, - "learning_rate": 3.325229039220684e-07, - "logits/chosen": -0.4510825574398041, - "logits/rejected": -0.2019493579864502, - "logps/chosen": -339.4858093261719, - "logps/rejected": -399.65234375, - "loss": 0.5633, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.8544174432754517, - "rewards/margins": 0.6207035779953003, - "rewards/rejected": -2.475121021270752, - "step": 570 - }, - { - "epoch": 0.46159968165539195, - "grad_norm": 15.265836542622413, - "learning_rate": 3.2593101152883795e-07, - "logits/chosen": -0.5197261571884155, - "logits/rejected": -0.3228117823600769, - "logps/chosen": -316.60516357421875, - "logps/rejected": -366.8912658691406, - "loss": 0.5638, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.6767441034317017, - "rewards/margins": 0.5252284407615662, - "rewards/rejected": -2.201972484588623, - "step": 580 - }, - { - "epoch": 0.469558296856347, - "grad_norm": 15.206190457344837, - "learning_rate": 3.192804331949349e-07, - "logits/chosen": 0.21528498828411102, - "logits/rejected": 0.4312516152858734, - "logps/chosen": -316.5174865722656, - "logps/rejected": -376.46002197265625, - "loss": 0.5289, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.7388776540756226, - "rewards/margins": 0.6324383020401001, - "rewards/rejected": -2.3713161945343018, - "step": 590 - }, - { - "epoch": 0.477516912057302, - "grad_norm": 20.282458926057863, - "learning_rate": 3.125763090526674e-07, - "logits/chosen": 0.4832405149936676, - "logits/rejected": 0.88775235414505, - "logps/chosen": -342.80096435546875, - "logps/rejected": -407.1078796386719, - "loss": 0.5658, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.8795427083969116, - "rewards/margins": 0.7021334171295166, - "rewards/rejected": -2.5816760063171387, - "step": 600 - }, - { - "epoch": 0.477516912057302, - "eval_logits/chosen": 0.6660908460617065, - "eval_logits/rejected": 0.9124837517738342, - "eval_logps/chosen": -340.0068664550781, - "eval_logps/rejected": -416.9898986816406, - "eval_loss": 0.5562008619308472, - "eval_rewards/accuracies": 0.7192164063453674, - "eval_rewards/chosen": -1.9576338529586792, - "eval_rewards/margins": 0.7052750587463379, - "eval_rewards/rejected": -2.6629090309143066, - "eval_runtime": 153.1635, - "eval_samples_per_second": 55.836, - "eval_steps_per_second": 0.875, - "step": 600 - }, - { - "epoch": 0.48547552725825704, - "grad_norm": 14.608157782099068, - "learning_rate": 3.0582382061909623e-07, - "logits/chosen": 0.36511674523353577, - "logits/rejected": 0.6286307573318481, - "logps/chosen": -337.5827941894531, - "logps/rejected": -395.5306701660156, - "loss": 0.5395, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.9645658731460571, - "rewards/margins": 0.5609062910079956, - "rewards/rejected": -2.5254719257354736, - "step": 610 - }, - { - "epoch": 0.4934341424592121, - "grad_norm": 14.933588886711542, - "learning_rate": 2.9902818679131775e-07, - "logits/chosen": 0.3990648686885834, - "logits/rejected": 0.5564724206924438, - "logps/chosen": -339.22735595703125, - "logps/rejected": -399.9192810058594, - "loss": 0.5578, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.8450448513031006, - "rewards/margins": 0.6625381708145142, - "rewards/rejected": -2.507582902908325, - "step": 620 - }, - { - "epoch": 0.5013927576601671, - "grad_norm": 18.611562829139118, - "learning_rate": 2.921946598128571e-07, - "logits/chosen": 0.7237969636917114, - "logits/rejected": 1.000910997390747, - "logps/chosen": -346.236572265625, - "logps/rejected": -393.0661315917969, - "loss": 0.5371, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.9345744848251343, - "rewards/margins": 0.6754398941993713, - "rewards/rejected": -2.6100144386291504, - "step": 630 - }, - { - "epoch": 0.5093513728611222, - "grad_norm": 18.159841838504743, - "learning_rate": 2.8532852121428733e-07, - "logits/chosen": 1.1098816394805908, - "logits/rejected": 1.3760236501693726, - "logps/chosen": -353.3837890625, - "logps/rejected": -409.08062744140625, - "loss": 0.5375, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.1785995960235596, - "rewards/margins": 0.5797316431999207, - "rewards/rejected": -2.758331537246704, - "step": 640 - }, - { - "epoch": 0.5173099880620772, - "grad_norm": 17.95563717396888, - "learning_rate": 2.7843507773121414e-07, - "logits/chosen": 0.8091678619384766, - "logits/rejected": 1.021480679512024, - "logps/chosen": -350.10638427734375, - "logps/rejected": -437.02569580078125, - "loss": 0.511, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.128905773162842, - "rewards/margins": 0.7973555326461792, - "rewards/rejected": -2.9262614250183105, - "step": 650 - }, - { - "epoch": 0.5252686032630323, - "grad_norm": 16.8332185564681, - "learning_rate": 2.715196572027789e-07, - "logits/chosen": 1.244178056716919, - "logits/rejected": 1.4877557754516602, - "logps/chosen": -353.20233154296875, - "logps/rejected": -446.7106018066406, - "loss": 0.5551, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.087367057800293, - "rewards/margins": 0.838540256023407, - "rewards/rejected": -2.9259073734283447, - "step": 660 - }, - { - "epoch": 0.5332272184639872, - "grad_norm": 14.658814068318406, - "learning_rate": 2.645876044538521e-07, - "logits/chosen": 0.5786877274513245, - "logits/rejected": 0.8311547040939331, - "logps/chosen": -339.4533386230469, - "logps/rejected": -391.8295593261719, - "loss": 0.5523, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.9032132625579834, - "rewards/margins": 0.6066684126853943, - "rewards/rejected": -2.5098819732666016, - "step": 670 - }, - { - "epoch": 0.5411858336649423, - "grad_norm": 16.282915430254512, - "learning_rate": 2.5764427716409815e-07, - "logits/chosen": 0.6706225275993347, - "logits/rejected": 0.9435638189315796, - "logps/chosen": -339.28765869140625, - "logps/rejected": -400.4329528808594, - "loss": 0.551, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.8451801538467407, - "rewards/margins": 0.6226030588150024, - "rewards/rejected": -2.4677834510803223, - "step": 680 - }, - { - "epoch": 0.5491444488658973, - "grad_norm": 18.972868571281783, - "learning_rate": 2.5069504172710494e-07, - "logits/chosen": 0.5228421092033386, - "logits/rejected": 0.6484982967376709, - "logps/chosen": -349.12841796875, - "logps/rejected": -433.3041076660156, - "loss": 0.5433, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.007934093475342, - "rewards/margins": 0.6637459993362427, - "rewards/rejected": -2.671679735183716, - "step": 690 - }, - { - "epoch": 0.5571030640668524, - "grad_norm": 20.904681899906066, - "learning_rate": 2.4374526910277886e-07, - "logits/chosen": 0.6581841111183167, - "logits/rejected": 0.9438881874084473, - "logps/chosen": -350.12841796875, - "logps/rejected": -397.9573974609375, - "loss": 0.556, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.0671610832214355, - "rewards/margins": 0.5225776433944702, - "rewards/rejected": -2.589738368988037, - "step": 700 - }, - { - "epoch": 0.5571030640668524, - "eval_logits/chosen": 0.7301986217498779, - "eval_logits/rejected": 0.9968724250793457, - "eval_logps/chosen": -355.7083740234375, - "eval_logps/rejected": -428.9443359375, - "eval_loss": 0.5501761436462402, - "eval_rewards/accuracies": 0.7201492786407471, - "eval_rewards/chosen": -2.1146485805511475, - "eval_rewards/margins": 0.6678044199943542, - "eval_rewards/rejected": -2.7824532985687256, - "eval_runtime": 153.1053, - "eval_samples_per_second": 55.857, - "eval_steps_per_second": 0.875, - "step": 700 - }, - { - "epoch": 0.5650616792678074, - "grad_norm": 16.070119739343724, - "learning_rate": 2.368003306662104e-07, - "logits/chosen": 0.8950408101081848, - "logits/rejected": 1.2581216096878052, - "logps/chosen": -378.45245361328125, - "logps/rejected": -437.49542236328125, - "loss": 0.5443, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.228590488433838, - "rewards/margins": 0.7023531198501587, - "rewards/rejected": -2.930943489074707, - "step": 710 - }, - { - "epoch": 0.5730202944687625, - "grad_norm": 23.561044168016466, - "learning_rate": 2.2986559405621886e-07, - "logits/chosen": 0.8009999990463257, - "logits/rejected": 1.2362650632858276, - "logps/chosen": -351.4358215332031, - "logps/rejected": -408.4342346191406, - "loss": 0.5543, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.889967679977417, - "rewards/margins": 0.6260396242141724, - "rewards/rejected": -2.5160071849823, - "step": 720 - }, - { - "epoch": 0.5809789096697174, - "grad_norm": 17.715188324255074, - "learning_rate": 2.2294641902678443e-07, - "logits/chosen": 0.6837292909622192, - "logits/rejected": 1.051099181175232, - "logps/chosen": -342.70269775390625, - "logps/rejected": -421.7893981933594, - "loss": 0.5134, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.075383424758911, - "rewards/margins": 0.8238208889961243, - "rewards/rejected": -2.8992042541503906, - "step": 730 - }, - { - "epoch": 0.5889375248706725, - "grad_norm": 19.531113452828787, - "learning_rate": 2.160481533045751e-07, - "logits/chosen": 0.7316833734512329, - "logits/rejected": 1.1876373291015625, - "logps/chosen": -371.8680725097656, - "logps/rejected": -437.3309020996094, - "loss": 0.5553, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.2173359394073486, - "rewards/margins": 0.7488197088241577, - "rewards/rejected": -2.966156005859375, - "step": 740 - }, - { - "epoch": 0.5968961400716275, - "grad_norm": 17.826971988142052, - "learning_rate": 2.0917612845576882e-07, - "logits/chosen": 0.47909075021743774, - "logits/rejected": 0.9067865610122681, - "logps/chosen": -358.71551513671875, - "logps/rejected": -414.9165954589844, - "loss": 0.5267, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.022329807281494, - "rewards/margins": 0.7758738994598389, - "rewards/rejected": -2.798203945159912, - "step": 750 - }, - { - "epoch": 0.6048547552725826, - "grad_norm": 16.01258522652455, - "learning_rate": 2.0233565576536564e-07, - "logits/chosen": 0.6064720153808594, - "logits/rejected": 0.7591885328292847, - "logps/chosen": -335.24798583984375, - "logps/rejected": -399.1866760253906, - "loss": 0.5594, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.9327834844589233, - "rewards/margins": 0.5889240503311157, - "rewards/rejected": -2.521707773208618, - "step": 760 - }, - { - "epoch": 0.6128133704735376, - "grad_norm": 15.946070991890718, - "learning_rate": 1.9553202213217537e-07, - "logits/chosen": 0.1371159851551056, - "logits/rejected": 0.3473878800868988, - "logps/chosen": -303.07025146484375, - "logps/rejected": -386.58203125, - "loss": 0.5281, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.7267131805419922, - "rewards/margins": 0.7852660417556763, - "rewards/rejected": -2.5119788646698, - "step": 770 - }, - { - "epoch": 0.6207719856744927, - "grad_norm": 21.594187073993353, - "learning_rate": 1.887704859826528e-07, - "logits/chosen": 0.050722457468509674, - "logits/rejected": 0.4147067666053772, - "logps/chosen": -368.37640380859375, - "logps/rejected": -446.398681640625, - "loss": 0.5397, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.0712389945983887, - "rewards/margins": 0.7545989155769348, - "rewards/rejected": -2.8258378505706787, - "step": 780 - }, - { - "epoch": 0.6287306008754476, - "grad_norm": 20.596254559457844, - "learning_rate": 1.8205627320673836e-07, - "logits/chosen": 0.6064023375511169, - "logits/rejected": 1.075620174407959, - "logps/chosen": -376.5938720703125, - "logps/rejected": -465.5481872558594, - "loss": 0.5334, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.2303614616394043, - "rewards/margins": 0.9606544375419617, - "rewards/rejected": -3.1910159587860107, - "step": 790 - }, - { - "epoch": 0.6366892160764027, - "grad_norm": 15.746309704598048, - "learning_rate": 1.7539457311884675e-07, - "logits/chosen": 0.6997416615486145, - "logits/rejected": 1.1372315883636475, - "logps/chosen": -385.6691589355469, - "logps/rejected": -445.677978515625, - "loss": 0.5285, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.2797977924346924, - "rewards/margins": 0.7119247317314148, - "rewards/rejected": -2.991722583770752, - "step": 800 - }, - { - "epoch": 0.6366892160764027, - "eval_logits/chosen": 0.6029295921325684, - "eval_logits/rejected": 0.85638028383255, - "eval_logps/chosen": -364.04046630859375, - "eval_logps/rejected": -445.2566833496094, - "eval_loss": 0.547686755657196, - "eval_rewards/accuracies": 0.7229477763175964, - "eval_rewards/chosen": -2.197970151901245, - "eval_rewards/margins": 0.7476070523262024, - "eval_rewards/rejected": -2.9455766677856445, - "eval_runtime": 153.0451, - "eval_samples_per_second": 55.879, - "eval_steps_per_second": 0.876, - "step": 800 - }, - { - "epoch": 0.6446478312773577, - "grad_norm": 17.76306380556218, - "learning_rate": 1.687905344471226e-07, - "logits/chosen": 0.5493108034133911, - "logits/rejected": 0.7385646104812622, - "logps/chosen": -376.4003601074219, - "logps/rejected": -441.7693786621094, - "loss": 0.5619, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.1274545192718506, - "rewards/margins": 0.6377121210098267, - "rewards/rejected": -2.765166759490967, - "step": 810 - }, - { - "epoch": 0.6526064464783128, - "grad_norm": 17.319583567554556, - "learning_rate": 1.6224926135406693e-07, - "logits/chosen": 0.29170387983322144, - "logits/rejected": 0.5566233992576599, - "logps/chosen": -364.3298645019531, - "logps/rejected": -414.4306640625, - "loss": 0.5414, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -2.1975741386413574, - "rewards/margins": 0.6033033728599548, - "rewards/rejected": -2.800877332687378, - "step": 820 - }, - { - "epoch": 0.6605650616792678, - "grad_norm": 15.746531845729336, - "learning_rate": 1.557758094916053e-07, - "logits/chosen": -0.04723300039768219, - "logits/rejected": 0.32069122791290283, - "logps/chosen": -368.81829833984375, - "logps/rejected": -438.3575134277344, - "loss": 0.5396, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.1188158988952637, - "rewards/margins": 0.700646162033081, - "rewards/rejected": -2.819462299346924, - "step": 830 - }, - { - "epoch": 0.6685236768802229, - "grad_norm": 16.062030141781605, - "learning_rate": 1.4937518209365108e-07, - "logits/chosen": -0.0310503002256155, - "logits/rejected": 0.4633910059928894, - "logps/chosen": -385.63214111328125, - "logps/rejected": -429.7618103027344, - "loss": 0.556, + "epoch": 0.98, + "learning_rate": 4.269029751107489e-10, + "logits/chosen": -2.0331690311431885, + "logits/rejected": -1.9691358804702759, + "logps/chosen": -417.90203857421875, + "logps/rejected": -465.673828125, + "loss": 0.0625, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.071913003921509, - "rewards/margins": 0.6927269101142883, - "rewards/rejected": -2.7646398544311523, - "step": 840 - }, - { - "epoch": 0.6764822920811778, - "grad_norm": 17.885863472179913, - "learning_rate": 1.4305232610918045e-07, - "logits/chosen": 0.18630388379096985, - "logits/rejected": 0.4484528601169586, - "logps/chosen": -366.92767333984375, - "logps/rejected": -433.56396484375, - "loss": 0.5565, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.2087783813476562, - "rewards/margins": 0.6998289227485657, - "rewards/rejected": -2.9086074829101562, - "step": 850 - }, - { - "epoch": 0.6844409072821329, - "grad_norm": 15.92958193178829, - "learning_rate": 1.3681212837880977e-07, - "logits/chosen": 0.26056399941444397, - "logits/rejected": 0.38261863589286804, - "logps/chosen": -355.76263427734375, - "logps/rejected": -432.66888427734375, - "loss": 0.5366, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -2.217022180557251, - "rewards/margins": 0.5866159796714783, - "rewards/rejected": -2.803637981414795, - "step": 860 - }, - { - "epoch": 0.6923995224830879, - "grad_norm": 15.260112587373007, - "learning_rate": 1.3065941185782977e-07, - "logits/chosen": 0.5801733732223511, - "logits/rejected": 0.8631863594055176, - "logps/chosen": -384.7486877441406, - "logps/rejected": -430.93670654296875, - "loss": 0.5471, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.3252196311950684, - "rewards/margins": 0.6237902641296387, - "rewards/rejected": -2.949010133743286, - "step": 870 - }, - { - "epoch": 0.700358137684043, - "grad_norm": 17.704754563268107, - "learning_rate": 1.2459893188861613e-07, - "logits/chosen": 0.1365218460559845, - "logits/rejected": 0.5395029783248901, - "logps/chosen": -343.422119140625, - "logps/rejected": -448.3148498535156, - "loss": 0.5426, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -1.9493602514266968, - "rewards/margins": 0.9091870188713074, - "rewards/rejected": -2.8585472106933594, - "step": 880 - }, - { - "epoch": 0.708316752884998, - "grad_norm": 15.681417295423357, - "learning_rate": 1.1863537252529548e-07, - "logits/chosen": 0.5491958260536194, - "logits/rejected": 1.0280582904815674, - "logps/chosen": -366.83544921875, - "logps/rejected": -434.9790954589844, - "loss": 0.5335, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.0856850147247314, - "rewards/margins": 0.7586231231689453, - "rewards/rejected": -2.8443081378936768, - "step": 890 - }, - { - "epoch": 0.716275368085953, - "grad_norm": 16.956525191226525, - "learning_rate": 1.1277334291351145e-07, - "logits/chosen": 0.774886429309845, - "logits/rejected": 1.0509330034255981, - "logps/chosen": -348.33258056640625, - "logps/rejected": -434.04931640625, - "loss": 0.5299, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.1099088191986084, - "rewards/margins": 0.79808509349823, - "rewards/rejected": -2.907993793487549, - "step": 900 - }, - { - "epoch": 0.716275368085953, - "eval_logits/chosen": 0.7089307904243469, - "eval_logits/rejected": 0.9831804633140564, - "eval_logps/chosen": -355.4507751464844, - "eval_logps/rejected": -435.81591796875, - "eval_loss": 0.54501873254776, - "eval_rewards/accuracies": 0.7341417670249939, - "eval_rewards/chosen": -2.112072706222534, - "eval_rewards/margins": 0.7390963435173035, - "eval_rewards/rejected": -2.851168632507324, - "eval_runtime": 153.2199, - "eval_samples_per_second": 55.815, - "eval_steps_per_second": 0.875, - "step": 900 - }, - { - "epoch": 0.724233983286908, - "grad_norm": 18.36522083595644, - "learning_rate": 1.0701737372808431e-07, - "logits/chosen": 0.9391189813613892, - "logits/rejected": 1.0841562747955322, - "logps/chosen": -332.28680419921875, - "logps/rejected": -432.437255859375, - "loss": 0.5285, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.8854007720947266, - "rewards/margins": 0.872325599193573, - "rewards/rejected": -2.7577261924743652, - "step": 910 - }, - { - "epoch": 0.7321925984878631, - "grad_norm": 14.771527831877176, - "learning_rate": 1.0137191367132078e-07, - "logits/chosen": 0.9683534502983093, - "logits/rejected": 1.2063651084899902, - "logps/chosen": -393.1899108886719, - "logps/rejected": -460.9879455566406, - "loss": 0.5276, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.370966911315918, - "rewards/margins": 0.6327589154243469, - "rewards/rejected": -3.003725528717041, - "step": 920 - }, - { - "epoch": 0.7401512136888182, - "grad_norm": 15.363649277993106, - "learning_rate": 9.584132603467827e-08, - "logits/chosen": 1.2887399196624756, - "logits/rejected": 1.730613350868225, - "logps/chosen": -417.56500244140625, - "logps/rejected": -460.6282653808594, - "loss": 0.5419, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -2.50395131111145, - "rewards/margins": 0.5594934225082397, - "rewards/rejected": -3.0634446144104004, - "step": 930 - }, - { - "epoch": 0.7481098288897732, - "grad_norm": 15.533558224288868, - "learning_rate": 9.042988532644249e-08, - "logits/chosen": 1.2376580238342285, - "logits/rejected": 1.4853687286376953, - "logps/chosen": -384.9251403808594, - "logps/rejected": -471.8038635253906, - "loss": 0.5375, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.4157216548919678, - "rewards/margins": 0.7549302577972412, - "rewards/rejected": -3.170651912689209, - "step": 940 - }, - { - "epoch": 0.7560684440907283, - "grad_norm": 17.631353509539945, - "learning_rate": 8.514177396802428e-08, - "logits/chosen": 0.7006018757820129, - "logits/rejected": 0.9818047285079956, - "logps/chosen": -390.9950256347656, - "logps/rejected": -467.605224609375, - "loss": 0.5352, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.511242389678955, - "rewards/margins": 0.6958988308906555, - "rewards/rejected": -3.207141160964966, - "step": 950 - }, - { - "epoch": 0.7640270592916832, - "grad_norm": 16.74451609121504, - "learning_rate": 7.998107906142839e-08, - "logits/chosen": 0.650057315826416, - "logits/rejected": 0.9443982243537903, - "logps/chosen": -383.0691223144531, - "logps/rejected": -440.1776428222656, - "loss": 0.5429, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -2.400475025177002, - "rewards/margins": 0.6352895498275757, - "rewards/rejected": -3.035764455795288, - "step": 960 - }, - { - "epoch": 0.7719856744926383, - "grad_norm": 17.31477017879575, - "learning_rate": 7.495178923039396e-08, - "logits/chosen": 0.6368435621261597, - "logits/rejected": 0.6060078144073486, - "logps/chosen": -356.3271484375, - "logps/rejected": -451.2137145996094, - "loss": 0.5412, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.2765326499938965, - "rewards/margins": 0.678371250629425, - "rewards/rejected": -2.9549036026000977, - "step": 970 - }, - { - "epoch": 0.7799442896935933, - "grad_norm": 19.22477593332992, - "learning_rate": 7.005779153764682e-08, - "logits/chosen": 0.31581220030784607, - "logits/rejected": 0.7790960073471069, - "logps/chosen": -355.78216552734375, - "logps/rejected": -430.12115478515625, - "loss": 0.5252, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.1299870014190674, - "rewards/margins": 0.7963441610336304, - "rewards/rejected": -2.926331043243408, - "step": 980 - }, - { - "epoch": 0.7879029048945484, - "grad_norm": 23.587781729505224, - "learning_rate": 6.530286848064698e-08, - "logits/chosen": 0.5126671195030212, - "logits/rejected": 0.7368132472038269, - "logps/chosen": -354.09619140625, - "logps/rejected": -436.5574645996094, - "loss": 0.5206, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.0094680786132812, - "rewards/margins": 0.7868490815162659, - "rewards/rejected": -2.7963171005249023, - "step": 990 - }, - { - "epoch": 0.7958615200955034, - "grad_norm": 23.091191084575406, - "learning_rate": 6.069069506815325e-08, - "logits/chosen": 0.4187610149383545, - "logits/rejected": 0.844292938709259, - "logps/chosen": -353.77496337890625, - "logps/rejected": -425.384521484375, - "loss": 0.5629, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.0886340141296387, - "rewards/margins": 0.760814368724823, - "rewards/rejected": -2.8494486808776855, - "step": 1000 - }, - { - "epoch": 0.7958615200955034, - "eval_logits/chosen": 0.4599636495113373, - "eval_logits/rejected": 0.7032696604728699, - "eval_logps/chosen": -359.07489013671875, - "eval_logps/rejected": -440.1051330566406, - "eval_loss": 0.544038712978363, - "eval_rewards/accuracies": 0.7322761416435242, - "eval_rewards/chosen": -2.148313522338867, - "eval_rewards/margins": 0.7457479238510132, - "eval_rewards/rejected": -2.894061326980591, - "eval_runtime": 152.9965, - "eval_samples_per_second": 55.897, - "eval_steps_per_second": 0.876, - "step": 1000 - }, - { - "epoch": 0.8038201352964585, - "grad_norm": 16.757821822715535, - "learning_rate": 5.6224835979863714e-08, - "logits/chosen": 0.31164878606796265, - "logits/rejected": 0.6217297911643982, - "logps/chosen": -365.158935546875, - "logps/rejected": -421.8297424316406, - "loss": 0.542, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.1484549045562744, - "rewards/margins": 0.6447950601577759, - "rewards/rejected": -2.7932498455047607, - "step": 1010 - }, - { - "epoch": 0.8117787504974134, - "grad_norm": 18.84738392137227, - "learning_rate": 5.190874281132851e-08, - "logits/chosen": 0.4319641590118408, - "logits/rejected": 0.6768335700035095, - "logps/chosen": -342.3192138671875, - "logps/rejected": -426.79150390625, - "loss": 0.5334, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.0261223316192627, - "rewards/margins": 0.8165225982666016, - "rewards/rejected": -2.842644453048706, - "step": 1020 - }, - { - "epoch": 0.8197373656983685, - "grad_norm": 17.342020030463754, - "learning_rate": 4.774575140626316e-08, - "logits/chosen": 0.5502648949623108, - "logits/rejected": 0.9354747533798218, - "logps/chosen": -352.39959716796875, - "logps/rejected": -427.843017578125, - "loss": 0.5377, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.1233630180358887, - "rewards/margins": 0.8309398889541626, - "rewards/rejected": -2.9543025493621826, - "step": 1030 - }, - { - "epoch": 0.8276959808993235, - "grad_norm": 18.89903354492542, - "learning_rate": 4.373907927832513e-08, - "logits/chosen": 0.6125099062919617, - "logits/rejected": 0.7803068161010742, - "logps/chosen": -340.08282470703125, - "logps/rejected": -415.82928466796875, - "loss": 0.5402, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -2.1237986087799072, - "rewards/margins": 0.6451882123947144, - "rewards/rejected": -2.768986701965332, - "step": 1040 - }, - { - "epoch": 0.8356545961002786, - "grad_norm": 15.617092554442154, - "learning_rate": 3.9891823124345665e-08, - "logits/chosen": 0.49313363432884216, - "logits/rejected": 0.6866206526756287, - "logps/chosen": -347.2782287597656, - "logps/rejected": -431.2796325683594, - "loss": 0.54, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.9626163244247437, - "rewards/margins": 0.8709405660629272, - "rewards/rejected": -2.833556890487671, - "step": 1050 - }, - { - "epoch": 0.8436132113012336, - "grad_norm": 16.41345421019679, - "learning_rate": 3.620695643093924e-08, - "logits/chosen": 0.5007290840148926, - "logits/rejected": 0.6147471070289612, - "logps/chosen": -332.5653381347656, - "logps/rejected": -410.90826416015625, - "loss": 0.5381, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.957784652709961, - "rewards/margins": 0.6474533677101135, - "rewards/rejected": -2.6052377223968506, - "step": 1060 - }, - { - "epoch": 0.8515718265021887, - "grad_norm": 17.140432579260622, - "learning_rate": 3.268732717634032e-08, - "logits/chosen": 0.4870120882987976, - "logits/rejected": 0.6993613839149475, - "logps/chosen": -340.8184814453125, - "logps/rejected": -407.65557861328125, - "loss": 0.5422, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.044240951538086, - "rewards/margins": 0.6913812756538391, - "rewards/rejected": -2.7356221675872803, - "step": 1070 - }, - { - "epoch": 0.8595304417031436, - "grad_norm": 16.694484491754185, - "learning_rate": 2.9335655629243645e-08, - "logits/chosen": 0.4208219647407532, - "logits/rejected": 0.6202067136764526, - "logps/chosen": -369.6133728027344, - "logps/rejected": -443.1822204589844, - "loss": 0.5484, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.1923069953918457, - "rewards/margins": 0.6231125593185425, - "rewards/rejected": -2.8154194355010986, - "step": 1080 - }, - { - "epoch": 0.8674890569040987, - "grad_norm": 22.133523730480416, - "learning_rate": 2.6154532246349476e-08, - "logits/chosen": 0.2894337773323059, - "logits/rejected": 0.7369552254676819, - "logps/chosen": -358.96392822265625, - "logps/rejected": -397.40496826171875, - "loss": 0.5698, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -2.173938274383545, - "rewards/margins": 0.5639660358428955, - "rewards/rejected": -2.7379043102264404, - "step": 1090 - }, - { - "epoch": 0.8754476721050537, - "grad_norm": 18.678051085655653, - "learning_rate": 2.31464156702382e-08, - "logits/chosen": 0.37609541416168213, - "logits/rejected": 0.7177656888961792, - "logps/chosen": -366.3799743652344, - "logps/rejected": -418.73046875, - "loss": 0.5351, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.097896099090576, - "rewards/margins": 0.6021716594696045, - "rewards/rejected": -2.7000677585601807, - "step": 1100 - }, - { - "epoch": 0.8754476721050537, - "eval_logits/chosen": 0.2752957046031952, - "eval_logits/rejected": 0.5029404759407043, - "eval_logps/chosen": -359.2066345214844, - "eval_logps/rejected": -436.4062194824219, - "eval_loss": 0.542252242565155, - "eval_rewards/accuracies": 0.7304104566574097, - "eval_rewards/chosen": -2.1496312618255615, - "eval_rewards/margins": 0.7074410319328308, - "eval_rewards/rejected": -2.857072353363037, - "eval_runtime": 153.1272, - "eval_samples_per_second": 55.849, - "eval_steps_per_second": 0.875, - "step": 1100 - }, - { - "epoch": 0.8834062873060088, - "grad_norm": 20.61267635187729, - "learning_rate": 2.031363082912252e-08, - "logits/chosen": 0.4241918623447418, - "logits/rejected": 0.5393252968788147, - "logps/chosen": -339.3815612792969, - "logps/rejected": -413.20794677734375, - "loss": 0.5198, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.0784695148468018, - "rewards/margins": 0.6469917893409729, - "rewards/rejected": -2.72546124458313, - "step": 1110 - }, - { - "epoch": 0.8913649025069638, - "grad_norm": 18.0074651342195, - "learning_rate": 1.7658367139945228e-08, - "logits/chosen": 0.34336820244789124, - "logits/rejected": 0.5900403261184692, - "logps/chosen": -356.97467041015625, - "logps/rejected": -420.053955078125, - "loss": 0.532, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.126662492752075, - "rewards/margins": 0.6412577033042908, - "rewards/rejected": -2.7679200172424316, - "step": 1120 - }, - { - "epoch": 0.8993235177079189, - "grad_norm": 20.08213128535877, - "learning_rate": 1.5182676816211632e-08, - "logits/chosen": 0.3494935631752014, - "logits/rejected": 0.6701606512069702, - "logps/chosen": -356.5093078613281, - "logps/rejected": -447.0511169433594, - "loss": 0.5238, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.140960931777954, - "rewards/margins": 0.7673269510269165, - "rewards/rejected": -2.908287763595581, - "step": 1130 - }, - { - "epoch": 0.9072821329088738, - "grad_norm": 15.635113581490817, - "learning_rate": 1.2888473281864597e-08, - "logits/chosen": 0.17955251038074493, - "logits/rejected": 0.35742440819740295, - "logps/chosen": -364.36993408203125, - "logps/rejected": -419.10186767578125, - "loss": 0.5323, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -2.065547227859497, - "rewards/margins": 0.5961118936538696, - "rewards/rejected": -2.6616594791412354, - "step": 1140 - }, - { - "epoch": 0.9152407481098289, - "grad_norm": 18.17543468321864, - "learning_rate": 1.0777529692427679e-08, - "logits/chosen": 0.5689483880996704, - "logits/rejected": 0.9127931594848633, - "logps/chosen": -358.0685119628906, - "logps/rejected": -420.18426513671875, - "loss": 0.5372, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.072906970977783, - "rewards/margins": 0.7296361327171326, - "rewards/rejected": -2.8025431632995605, - "step": 1150 - }, - { - "epoch": 0.9231993633107839, - "grad_norm": 18.913441395008096, - "learning_rate": 8.851477564560061e-09, - "logits/chosen": 0.6391158103942871, - "logits/rejected": 0.8792598843574524, - "logps/chosen": -345.1988830566406, - "logps/rejected": -423.65643310546875, - "loss": 0.5553, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -2.030240535736084, - "rewards/margins": 0.6751813888549805, - "rewards/rejected": -2.7054216861724854, - "step": 1160 - }, - { - "epoch": 0.931157978511739, - "grad_norm": 17.617100933080067, - "learning_rate": 7.111805515081531e-09, - "logits/chosen": 0.2744918763637543, - "logits/rejected": 0.6285992860794067, - "logps/chosen": -381.2764587402344, - "logps/rejected": -450.02142333984375, - "loss": 0.5253, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.267944097518921, - "rewards/margins": 0.7208031415939331, - "rewards/rejected": -2.9887471199035645, - "step": 1170 - }, - { - "epoch": 0.939116593712694, - "grad_norm": 15.79506235955512, - "learning_rate": 5.559858110443016e-09, - "logits/chosen": 0.21863842010498047, - "logits/rejected": 0.5074991583824158, - "logps/chosen": -360.7405700683594, - "logps/rejected": -429.59130859375, - "loss": 0.5093, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.048325777053833, - "rewards/margins": 0.6674487590789795, - "rewards/rejected": -2.7157740592956543, - "step": 1180 - }, - { - "epoch": 0.947075208913649, - "grad_norm": 15.907477201452677, - "learning_rate": 4.196834827531276e-09, - "logits/chosen": 0.36164388060569763, - "logits/rejected": 0.6227206587791443, - "logps/chosen": -360.2243347167969, - "logps/rejected": -448.7967834472656, - "loss": 0.5248, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.1712417602539062, - "rewards/margins": 0.8898156881332397, - "rewards/rejected": -3.0610575675964355, - "step": 1190 - }, - { - "epoch": 0.955033824114604, - "grad_norm": 16.740269639186682, - "learning_rate": 3.023789126611137e-09, - "logits/chosen": 0.4015735685825348, - "logits/rejected": 0.7451781034469604, - "logps/chosen": -357.9459533691406, - "logps/rejected": -429.60107421875, - "loss": 0.5499, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.10615873336792, - "rewards/margins": 0.7886561751365662, - "rewards/rejected": -2.894814968109131, - "step": 1200 - }, - { - "epoch": 0.955033824114604, - "eval_logits/chosen": 0.35614868998527527, - "eval_logits/rejected": 0.5902336239814758, - "eval_logps/chosen": -359.8675231933594, - "eval_logps/rejected": -438.7700500488281, - "eval_loss": 0.5416554808616638, - "eval_rewards/accuracies": 0.7313432693481445, - "eval_rewards/chosen": -2.1562399864196777, - "eval_rewards/margins": 0.7244706153869629, - "eval_rewards/rejected": -2.8807106018066406, - "eval_runtime": 153.0914, - "eval_samples_per_second": 55.862, - "eval_steps_per_second": 0.875, - "step": 1200 - }, - { - "epoch": 0.9629924393155591, - "grad_norm": 17.694729007133922, - "learning_rate": 2.041627637121929e-09, - "logits/chosen": 0.3815138638019562, - "logits/rejected": 0.6700073480606079, - "logps/chosen": -365.1811218261719, - "logps/rejected": -457.5047302246094, - "loss": 0.5543, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.237273931503296, - "rewards/margins": 0.7506656050682068, - "rewards/rejected": -2.9879393577575684, - "step": 1210 - }, - { - "epoch": 0.9709510545165141, - "grad_norm": 18.90643800869425, - "learning_rate": 1.2511094569571668e-09, - "logits/chosen": 0.5528720021247864, - "logits/rejected": 0.9850804209709167, - "logps/chosen": -364.58245849609375, - "logps/rejected": -405.16729736328125, - "loss": 0.5454, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -2.1615917682647705, - "rewards/margins": 0.6378272771835327, - "rewards/rejected": -2.7994191646575928, - "step": 1220 - }, - { - "epoch": 0.9789096697174692, - "grad_norm": 15.395362721693543, - "learning_rate": 6.528455657691112e-10, - "logits/chosen": 0.5471321940422058, - "logits/rejected": 0.6208599209785461, - "logps/chosen": -363.4918518066406, - "logps/rejected": -439.646240234375, - "loss": 0.5121, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.2475426197052, - "rewards/margins": 0.6490240097045898, - "rewards/rejected": -2.896566867828369, - "step": 1230 - }, - { - "epoch": 0.9868682849184242, - "grad_norm": 18.690557653353416, - "learning_rate": 2.4729835275189016e-10, - "logits/chosen": 0.4783777594566345, - "logits/rejected": 0.6712285876274109, - "logps/chosen": -352.80804443359375, - "logps/rejected": -443.9805603027344, - "loss": 0.5489, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.1210198402404785, - "rewards/margins": 0.8481594324111938, - "rewards/rejected": -2.969179630279541, - "step": 1240 - }, - { - "epoch": 0.9948269001193792, - "grad_norm": 19.13672093750131, - "learning_rate": 3.478125926756337e-11, - "logits/chosen": 0.5186442732810974, - "logits/rejected": 0.6572960615158081, - "logps/chosen": -358.7153625488281, - "logps/rejected": -452.71893310546875, - "loss": 0.5414, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.16998028755188, - "rewards/margins": 0.8236673474311829, - "rewards/rejected": -2.993648052215576, - "step": 1250 + "rewards/chosen": -1.3203939199447632, + "rewards/margins": 0.9791151285171509, + "rewards/rejected": -2.299509048461914, + "step": 470 }, { - "epoch": 0.9996020692399522, - "step": 1256, + "epoch": 1.0, + "step": 478, "total_flos": 0.0, - "train_loss": 0.56636344817034, - "train_runtime": 10031.2749, - "train_samples_per_second": 16.03, - "train_steps_per_second": 0.125 + "train_loss": 0.06801126423490596, + "train_runtime": 3957.1126, + "train_samples_per_second": 15.449, + "train_steps_per_second": 0.121 } ], "logging_steps": 10, - "max_steps": 1256, - "num_input_tokens_seen": 0, + "max_steps": 478, "num_train_epochs": 1, - "save_steps": 100, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, + "save_steps": 1000, "total_flos": 0.0, - "train_batch_size": 8, "trial_name": null, "trial_params": null }