diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,2940 @@ +{ + "best_metric": 1.5154520273208618, + "best_model_checkpoint": "saves/Falcon-7B-Instruct/lora/orpo/checkpoint-1500", + "epoch": 2.997999555456768, + "eval_steps": 500, + "global_step": 1686, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.017781729273171815, + "grad_norm": 0.45023536682128906, + "learning_rate": 4.9995745934141085e-06, + "logits/chosen": -14.31452751159668, + "logits/rejected": -14.272933959960938, + "logps/chosen": -1.777596116065979, + "logps/rejected": -1.814857840538025, + "loss": 1.8528, + "odds_ratio_loss": 0.7518970966339111, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -0.17775960266590118, + "rewards/margins": 0.0037261671386659145, + "rewards/rejected": -0.18148578703403473, + "sft_loss": 1.777596116065979, + "step": 10 + }, + { + "epoch": 0.03556345854634363, + "grad_norm": 0.6821511387825012, + "learning_rate": 4.9982812903243405e-06, + "logits/chosen": -14.213617324829102, + "logits/rejected": -14.412919998168945, + "logps/chosen": -1.9183998107910156, + "logps/rejected": -1.8259010314941406, + "loss": 2.0025, + "odds_ratio_loss": 0.8412569761276245, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.19183996319770813, + "rewards/margins": -0.009249850176274776, + "rewards/rejected": -0.18259011209011078, + "sft_loss": 1.9183998107910156, + "step": 20 + }, + { + "epoch": 0.05334518781951545, + "grad_norm": 0.50360107421875, + "learning_rate": 4.996120496405222e-06, + "logits/chosen": -14.275195121765137, + "logits/rejected": -14.341901779174805, + "logps/chosen": -1.8644087314605713, + "logps/rejected": -2.0387845039367676, + "loss": 1.9359, + "odds_ratio_loss": 0.7153545022010803, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.1864408552646637, + "rewards/margins": 0.01743762008845806, + "rewards/rejected": -0.2038784772157669, + "sft_loss": 1.8644087314605713, + "step": 30 + }, + { + "epoch": 0.07112691709268726, + "grad_norm": 0.6971050500869751, + "learning_rate": 4.99309296196014e-06, + "logits/chosen": -14.182516098022461, + "logits/rejected": -14.20283317565918, + "logps/chosen": -1.9314234256744385, + "logps/rejected": -1.899929404258728, + "loss": 2.0128, + "odds_ratio_loss": 0.8141088485717773, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -0.1931423395872116, + "rewards/margins": -0.003149367868900299, + "rewards/rejected": -0.1899929791688919, + "sft_loss": 1.9314234256744385, + "step": 40 + }, + { + "epoch": 0.08890864636585907, + "grad_norm": 0.5635890960693359, + "learning_rate": 4.989199738255166e-06, + "logits/chosen": -14.374763488769531, + "logits/rejected": -14.155324935913086, + "logps/chosen": -1.9009517431259155, + "logps/rejected": -1.9379255771636963, + "loss": 1.9795, + "odds_ratio_loss": 0.7854829430580139, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.19009515643119812, + "rewards/margins": 0.003697408363223076, + "rewards/rejected": -0.19379255175590515, + "sft_loss": 1.9009517431259155, + "step": 50 + }, + { + "epoch": 0.1066903756390309, + "grad_norm": 0.6436208486557007, + "learning_rate": 4.984442177154031e-06, + "logits/chosen": -14.250883102416992, + "logits/rejected": -14.289509773254395, + "logps/chosen": -1.9730733633041382, + "logps/rejected": -2.040274143218994, + "loss": 2.0526, + "odds_ratio_loss": 0.7955271601676941, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -0.19730734825134277, + "rewards/margins": 0.006720039062201977, + "rewards/rejected": -0.20402738451957703, + "sft_loss": 1.9730733633041382, + "step": 60 + }, + { + "epoch": 0.12447210491220272, + "grad_norm": 0.4920930862426758, + "learning_rate": 4.978821930648704e-06, + "logits/chosen": -14.044062614440918, + "logits/rejected": -14.116564750671387, + "logps/chosen": -1.9218193292617798, + "logps/rejected": -1.7758527994155884, + "loss": 2.0142, + "odds_ratio_loss": 0.9241151809692383, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.19218194484710693, + "rewards/margins": -0.014596650376915932, + "rewards/rejected": -0.17758527398109436, + "sft_loss": 1.9218193292617798, + "step": 70 + }, + { + "epoch": 0.14225383418537452, + "grad_norm": 0.6744620203971863, + "learning_rate": 4.97234095028576e-06, + "logits/chosen": -14.337008476257324, + "logits/rejected": -14.242892265319824, + "logps/chosen": -1.873708963394165, + "logps/rejected": -1.8873860836029053, + "loss": 1.9505, + "odds_ratio_loss": 0.767748236656189, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.1873709261417389, + "rewards/margins": 0.0013677121605724096, + "rewards/rejected": -0.1887386292219162, + "sft_loss": 1.873708963394165, + "step": 80 + }, + { + "epoch": 0.16003556345854633, + "grad_norm": 0.48053959012031555, + "learning_rate": 4.965001486488743e-06, + "logits/chosen": -14.241889953613281, + "logits/rejected": -14.200053215026855, + "logps/chosen": -1.7481162548065186, + "logps/rejected": -1.7742578983306885, + "loss": 1.8239, + "odds_ratio_loss": 0.7575067281723022, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.17481163144111633, + "rewards/margins": 0.0026141509879380465, + "rewards/rejected": -0.17742577195167542, + "sft_loss": 1.7481162548065186, + "step": 90 + }, + { + "epoch": 0.17781729273171815, + "grad_norm": 0.7781735062599182, + "learning_rate": 4.956806087776732e-06, + "logits/chosen": -14.596258163452148, + "logits/rejected": -14.529101371765137, + "logps/chosen": -1.8169043064117432, + "logps/rejected": -1.9338127374649048, + "loss": 1.8891, + "odds_ratio_loss": 0.7220322489738464, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.18169045448303223, + "rewards/margins": 0.01169085968285799, + "rewards/rejected": -0.19338129460811615, + "sft_loss": 1.8169043064117432, + "step": 100 + }, + { + "epoch": 0.19559902200489, + "grad_norm": 0.845944881439209, + "learning_rate": 4.947757599879411e-06, + "logits/chosen": -14.283439636230469, + "logits/rejected": -14.439828872680664, + "logps/chosen": -1.7649319171905518, + "logps/rejected": -1.82681405544281, + "loss": 1.8394, + "odds_ratio_loss": 0.7447811961174011, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.17649319767951965, + "rewards/margins": 0.0061882128939032555, + "rewards/rejected": -0.1826813966035843, + "sft_loss": 1.7649319171905518, + "step": 110 + }, + { + "epoch": 0.2133807512780618, + "grad_norm": 0.6829086542129517, + "learning_rate": 4.937859164748931e-06, + "logits/chosen": -14.171781539916992, + "logits/rejected": -14.2664155960083, + "logps/chosen": -1.65048348903656, + "logps/rejected": -1.6755653619766235, + "loss": 1.7267, + "odds_ratio_loss": 0.7621053457260132, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.16504836082458496, + "rewards/margins": 0.00250818463973701, + "rewards/rejected": -0.1675565242767334, + "sft_loss": 1.65048348903656, + "step": 120 + }, + { + "epoch": 0.23116248055123362, + "grad_norm": 0.6720818877220154, + "learning_rate": 4.92711421946891e-06, + "logits/chosen": -14.323086738586426, + "logits/rejected": -13.87572956085205, + "logps/chosen": -1.6958109140396118, + "logps/rejected": -1.8111820220947266, + "loss": 1.7688, + "odds_ratio_loss": 0.7296444177627563, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.1695810854434967, + "rewards/margins": 0.011537122540175915, + "rewards/rejected": -0.1811182051897049, + "sft_loss": 1.6958109140396118, + "step": 130 + }, + { + "epoch": 0.24894420982440543, + "grad_norm": 1.665626049041748, + "learning_rate": 4.915526495060961e-06, + "logits/chosen": -14.461613655090332, + "logits/rejected": -14.22163200378418, + "logps/chosen": -1.670013666152954, + "logps/rejected": -1.7781444787979126, + "loss": 1.7433, + "odds_ratio_loss": 0.7326729893684387, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.1670013815164566, + "rewards/margins": 0.010813087224960327, + "rewards/rejected": -0.17781445384025574, + "sft_loss": 1.670013666152954, + "step": 140 + }, + { + "epoch": 0.26672593909757725, + "grad_norm": 1.3471460342407227, + "learning_rate": 4.903100015189153e-06, + "logits/chosen": -14.236448287963867, + "logits/rejected": -14.436059951782227, + "logps/chosen": -1.6695849895477295, + "logps/rejected": -1.7570854425430298, + "loss": 1.7435, + "odds_ratio_loss": 0.7395648956298828, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.1669585108757019, + "rewards/margins": 0.008750038221478462, + "rewards/rejected": -0.17570854723453522, + "sft_loss": 1.6695849895477295, + "step": 150 + }, + { + "epoch": 0.28450766837074903, + "grad_norm": 1.0652554035186768, + "learning_rate": 4.889839094762848e-06, + "logits/chosen": -14.326433181762695, + "logits/rejected": -14.2631196975708, + "logps/chosen": -1.6757389307022095, + "logps/rejected": -1.8023579120635986, + "loss": 1.7489, + "odds_ratio_loss": 0.7316839098930359, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.16757391393184662, + "rewards/margins": 0.012661868706345558, + "rewards/rejected": -0.18023577332496643, + "sft_loss": 1.6757389307022095, + "step": 160 + }, + { + "epoch": 0.3022893976439209, + "grad_norm": 1.0970226526260376, + "learning_rate": 4.875748338438416e-06, + "logits/chosen": -14.249468803405762, + "logits/rejected": -14.319056510925293, + "logps/chosen": -1.6604044437408447, + "logps/rejected": -1.6869754791259766, + "loss": 1.7344, + "odds_ratio_loss": 0.7400213479995728, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.16604045033454895, + "rewards/margins": 0.0026570975314825773, + "rewards/rejected": -0.1686975508928299, + "sft_loss": 1.6604044437408447, + "step": 170 + }, + { + "epoch": 0.32007112691709266, + "grad_norm": 0.7075946927070618, + "learning_rate": 4.8608326390203386e-06, + "logits/chosen": -14.197771072387695, + "logits/rejected": -14.143452644348145, + "logps/chosen": -1.6201465129852295, + "logps/rejected": -1.7580602169036865, + "loss": 1.6907, + "odds_ratio_loss": 0.7053945660591125, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1620146632194519, + "rewards/margins": 0.013791357167065144, + "rewards/rejected": -0.17580604553222656, + "sft_loss": 1.6201465129852295, + "step": 180 + }, + { + "epoch": 0.3378528561902645, + "grad_norm": 1.0393530130386353, + "learning_rate": 4.845097175762251e-06, + "logits/chosen": -14.362152099609375, + "logits/rejected": -14.374476432800293, + "logps/chosen": -1.6125462055206299, + "logps/rejected": -1.6287224292755127, + "loss": 1.6892, + "odds_ratio_loss": 0.7666895985603333, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -0.1612546145915985, + "rewards/margins": 0.001617613947018981, + "rewards/rejected": -0.16287222504615784, + "sft_loss": 1.6125462055206299, + "step": 190 + }, + { + "epoch": 0.3556345854634363, + "grad_norm": 1.0453855991363525, + "learning_rate": 4.8285474125685286e-06, + "logits/chosen": -14.311877250671387, + "logits/rejected": -14.24933910369873, + "logps/chosen": -1.6674257516860962, + "logps/rejected": -1.6954532861709595, + "loss": 1.7433, + "odds_ratio_loss": 0.7591363191604614, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.16674259305000305, + "rewards/margins": 0.0028027337975800037, + "rewards/rejected": -0.16954532265663147, + "sft_loss": 1.6674257516860962, + "step": 200 + }, + { + "epoch": 0.37341631473660813, + "grad_norm": 1.7740540504455566, + "learning_rate": 4.811189096097025e-06, + "logits/chosen": -14.10380744934082, + "logits/rejected": -14.06958293914795, + "logps/chosen": -1.6661155223846436, + "logps/rejected": -1.7266288995742798, + "loss": 1.7425, + "odds_ratio_loss": 0.7635276913642883, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.16661155223846436, + "rewards/margins": 0.006051325239241123, + "rewards/rejected": -0.1726628839969635, + "sft_loss": 1.6661155223846436, + "step": 210 + }, + { + "epoch": 0.39119804400978, + "grad_norm": 1.2109721899032593, + "learning_rate": 4.793028253763633e-06, + "logits/chosen": -14.37977123260498, + "logits/rejected": -14.255584716796875, + "logps/chosen": -1.5491920709609985, + "logps/rejected": -1.6557328701019287, + "loss": 1.6259, + "odds_ratio_loss": 0.7670000791549683, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.15491920709609985, + "rewards/margins": 0.010654089972376823, + "rewards/rejected": -0.16557331383228302, + "sft_loss": 1.5491920709609985, + "step": 220 + }, + { + "epoch": 0.40897977328295176, + "grad_norm": 1.2553755044937134, + "learning_rate": 4.774071191649352e-06, + "logits/chosen": -14.052825927734375, + "logits/rejected": -14.052751541137695, + "logps/chosen": -1.540856122970581, + "logps/rejected": -1.742560625076294, + "loss": 1.6084, + "odds_ratio_loss": 0.6751853227615356, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.15408562123775482, + "rewards/margins": 0.0201704278588295, + "rewards/rejected": -0.17425604164600372, + "sft_loss": 1.540856122970581, + "step": 230 + }, + { + "epoch": 0.4267615025561236, + "grad_norm": 1.4113616943359375, + "learning_rate": 4.7543244923105975e-06, + "logits/chosen": -14.20154094696045, + "logits/rejected": -14.3230619430542, + "logps/chosen": -1.6533008813858032, + "logps/rejected": -1.6262308359146118, + "loss": 1.7354, + "odds_ratio_loss": 0.8208959698677063, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.16533009707927704, + "rewards/margins": -0.002707002917304635, + "rewards/rejected": -0.1626230925321579, + "sft_loss": 1.6533008813858032, + "step": 240 + }, + { + "epoch": 0.4445432318292954, + "grad_norm": 0.9963915348052979, + "learning_rate": 4.733795012493506e-06, + "logits/chosen": -14.148083686828613, + "logits/rejected": -14.310300827026367, + "logps/chosen": -1.6552881002426147, + "logps/rejected": -1.6675243377685547, + "loss": 1.7326, + "odds_ratio_loss": 0.7732909321784973, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.165528804063797, + "rewards/margins": 0.0012236315524205565, + "rewards/rejected": -0.16675245761871338, + "sft_loss": 1.6552881002426147, + "step": 250 + }, + { + "epoch": 0.46232496110246724, + "grad_norm": 0.9358872175216675, + "learning_rate": 4.712489880753035e-06, + "logits/chosen": -14.420260429382324, + "logits/rejected": -14.414996147155762, + "logps/chosen": -1.4923756122589111, + "logps/rejected": -1.5968248844146729, + "loss": 1.5619, + "odds_ratio_loss": 0.6949405670166016, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.14923755824565887, + "rewards/margins": 0.010444932617247105, + "rewards/rejected": -0.1596824824810028, + "sft_loss": 1.4923756122589111, + "step": 260 + }, + { + "epoch": 0.480106690375639, + "grad_norm": 2.065678596496582, + "learning_rate": 4.690416494977673e-06, + "logits/chosen": -14.560025215148926, + "logits/rejected": -14.5877103805542, + "logps/chosen": -1.5790516138076782, + "logps/rejected": -1.7484729290008545, + "loss": 1.6496, + "odds_ratio_loss": 0.7055513858795166, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.15790514647960663, + "rewards/margins": 0.016942132264375687, + "rewards/rejected": -0.1748472899198532, + "sft_loss": 1.5790516138076782, + "step": 270 + }, + { + "epoch": 0.49788841964881086, + "grad_norm": 3.234992265701294, + "learning_rate": 4.667582519820639e-06, + "logits/chosen": -14.247453689575195, + "logits/rejected": -14.422063827514648, + "logps/chosen": -1.5711301565170288, + "logps/rejected": -1.6209745407104492, + "loss": 1.6445, + "odds_ratio_loss": 0.7338452339172363, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.1571130007505417, + "rewards/margins": 0.0049844542518258095, + "rewards/rejected": -0.16209746897220612, + "sft_loss": 1.5711301565170288, + "step": 280 + }, + { + "epoch": 0.5156701489219827, + "grad_norm": 1.1059269905090332, + "learning_rate": 4.643995884038443e-06, + "logits/chosen": -14.227750778198242, + "logits/rejected": -14.276082038879395, + "logps/chosen": -1.6161178350448608, + "logps/rejected": -1.7087091207504272, + "loss": 1.6894, + "odds_ratio_loss": 0.7330858111381531, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.16161179542541504, + "rewards/margins": 0.009259124286472797, + "rewards/rejected": -0.17087092995643616, + "sft_loss": 1.6161178350448608, + "step": 290 + }, + { + "epoch": 0.5334518781951545, + "grad_norm": 2.079979658126831, + "learning_rate": 4.6196647777377475e-06, + "logits/chosen": -14.415349960327148, + "logits/rejected": -14.31702995300293, + "logps/chosen": -1.5473922491073608, + "logps/rejected": -1.5794051885604858, + "loss": 1.6226, + "odds_ratio_loss": 0.752013087272644, + "rewards/accuracies": 0.4312500059604645, + "rewards/chosen": -0.15473923087120056, + "rewards/margins": 0.0032012953888624907, + "rewards/rejected": -0.15794052183628082, + "sft_loss": 1.5473922491073608, + "step": 300 + }, + { + "epoch": 0.5512336074683263, + "grad_norm": 0.9750655889511108, + "learning_rate": 4.59459764953147e-06, + "logits/chosen": -14.490264892578125, + "logits/rejected": -14.242512702941895, + "logps/chosen": -1.6246612071990967, + "logps/rejected": -1.6596574783325195, + "loss": 1.6988, + "odds_ratio_loss": 0.7414273619651794, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.1624661237001419, + "rewards/margins": 0.00349963316693902, + "rewards/rejected": -0.1659657508134842, + "sft_loss": 1.6246612071990967, + "step": 310 + }, + { + "epoch": 0.5690153367414981, + "grad_norm": 1.211684226989746, + "learning_rate": 4.568803203605133e-06, + "logits/chosen": -14.534784317016602, + "logits/rejected": -14.392961502075195, + "logps/chosen": -1.6170380115509033, + "logps/rejected": -1.6397918462753296, + "loss": 1.6966, + "odds_ratio_loss": 0.7957952618598938, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16170379519462585, + "rewards/margins": 0.0022753949742764235, + "rewards/rejected": -0.1639791876077652, + "sft_loss": 1.6170380115509033, + "step": 320 + }, + { + "epoch": 0.58679706601467, + "grad_norm": 2.359046459197998, + "learning_rate": 4.542290396694462e-06, + "logits/chosen": -14.300097465515137, + "logits/rejected": -14.317700386047363, + "logps/chosen": -1.5104106664657593, + "logps/rejected": -1.5948156118392944, + "loss": 1.5853, + "odds_ratio_loss": 0.7493588328361511, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -0.15104106068611145, + "rewards/margins": 0.008440487086772919, + "rewards/rejected": -0.15948157012462616, + "sft_loss": 1.5104106664657593, + "step": 330 + }, + { + "epoch": 0.6045787952878418, + "grad_norm": 1.9586892127990723, + "learning_rate": 4.515068434975298e-06, + "logits/chosen": -14.25054931640625, + "logits/rejected": -14.31701946258545, + "logps/chosen": -1.544409990310669, + "logps/rejected": -1.6858068704605103, + "loss": 1.6155, + "odds_ratio_loss": 0.7111681699752808, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1544409841299057, + "rewards/margins": 0.014139704406261444, + "rewards/rejected": -0.16858068108558655, + "sft_loss": 1.544409990310669, + "step": 340 + }, + { + "epoch": 0.6223605245610135, + "grad_norm": 0.8323342800140381, + "learning_rate": 4.487146770866887e-06, + "logits/chosen": -14.46008586883545, + "logits/rejected": -14.50438404083252, + "logps/chosen": -1.6048237085342407, + "logps/rejected": -1.6058448553085327, + "loss": 1.6804, + "odds_ratio_loss": 0.7562613487243652, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.16048237681388855, + "rewards/margins": 0.00010212887718807906, + "rewards/rejected": -0.16058449447155, + "sft_loss": 1.6048237085342407, + "step": 350 + }, + { + "epoch": 0.6401422538341853, + "grad_norm": 1.1129921674728394, + "learning_rate": 4.458535099749666e-06, + "logits/chosen": -14.248858451843262, + "logits/rejected": -14.238241195678711, + "logps/chosen": -1.6117630004882812, + "logps/rejected": -1.6199924945831299, + "loss": 1.6917, + "odds_ratio_loss": 0.799778938293457, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -0.16117632389068604, + "rewards/margins": 0.0008229434606619179, + "rewards/rejected": -0.16199925541877747, + "sft_loss": 1.6117630004882812, + "step": 360 + }, + { + "epoch": 0.6579239831073572, + "grad_norm": 0.7189633250236511, + "learning_rate": 4.429243356598694e-06, + "logits/chosen": -14.405240058898926, + "logits/rejected": -14.393125534057617, + "logps/chosen": -1.485335111618042, + "logps/rejected": -1.6254221200942993, + "loss": 1.5568, + "odds_ratio_loss": 0.7148580551147461, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.14853352308273315, + "rewards/margins": 0.014008693397045135, + "rewards/rejected": -0.1625421941280365, + "sft_loss": 1.485335111618042, + "step": 370 + }, + { + "epoch": 0.675705712380529, + "grad_norm": 1.066667914390564, + "learning_rate": 4.399281712533875e-06, + "logits/chosen": -14.525976181030273, + "logits/rejected": -14.461471557617188, + "logps/chosen": -1.4982891082763672, + "logps/rejected": -1.5271437168121338, + "loss": 1.5745, + "odds_ratio_loss": 0.7622246146202087, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.14982891082763672, + "rewards/margins": 0.00288546085357666, + "rewards/rejected": -0.15271437168121338, + "sft_loss": 1.4982891082763672, + "step": 380 + }, + { + "epoch": 0.6934874416537008, + "grad_norm": 1.138702392578125, + "learning_rate": 4.368660571288192e-06, + "logits/chosen": -14.477781295776367, + "logits/rejected": -14.516647338867188, + "logps/chosen": -1.5524108409881592, + "logps/rejected": -1.5919525623321533, + "loss": 1.6311, + "odds_ratio_loss": 0.7873157262802124, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.15524108707904816, + "rewards/margins": 0.003954165615141392, + "rewards/rejected": -0.15919525921344757, + "sft_loss": 1.5524108409881592, + "step": 390 + }, + { + "epoch": 0.7112691709268726, + "grad_norm": 0.8798184394836426, + "learning_rate": 4.337390565595163e-06, + "logits/chosen": -14.244585037231445, + "logits/rejected": -14.454017639160156, + "logps/chosen": -1.5560581684112549, + "logps/rejected": -1.6040303707122803, + "loss": 1.6304, + "odds_ratio_loss": 0.743826150894165, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.15560582280158997, + "rewards/margins": 0.004797201603651047, + "rewards/rejected": -0.1604030430316925, + "sft_loss": 1.5560581684112549, + "step": 400 + }, + { + "epoch": 0.7290509002000445, + "grad_norm": 1.082557201385498, + "learning_rate": 4.305482553496786e-06, + "logits/chosen": -14.316876411437988, + "logits/rejected": -14.236166000366211, + "logps/chosen": -1.5271369218826294, + "logps/rejected": -1.5710500478744507, + "loss": 1.6033, + "odds_ratio_loss": 0.7612074613571167, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.15271370112895966, + "rewards/margins": 0.004391324706375599, + "rewards/rejected": -0.15710501372814178, + "sft_loss": 1.5271369218826294, + "step": 410 + }, + { + "epoch": 0.7468326294732163, + "grad_norm": 1.7793493270874023, + "learning_rate": 4.272947614573244e-06, + "logits/chosen": -14.312704086303711, + "logits/rejected": -14.391843795776367, + "logps/chosen": -1.6422202587127686, + "logps/rejected": -1.6885766983032227, + "loss": 1.7183, + "odds_ratio_loss": 0.761288046836853, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.16422203183174133, + "rewards/margins": 0.004635652992874384, + "rewards/rejected": -0.16885769367218018, + "sft_loss": 1.6422202587127686, + "step": 420 + }, + { + "epoch": 0.7646143587463881, + "grad_norm": 0.7179546356201172, + "learning_rate": 4.23979704609569e-06, + "logits/chosen": -14.418655395507812, + "logits/rejected": -14.451852798461914, + "logps/chosen": -1.5165865421295166, + "logps/rejected": -1.5794544219970703, + "loss": 1.5868, + "odds_ratio_loss": 0.7019873857498169, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.15165865421295166, + "rewards/margins": 0.006286793854087591, + "rewards/rejected": -0.157945454120636, + "sft_loss": 1.5165865421295166, + "step": 430 + }, + { + "epoch": 0.78239608801956, + "grad_norm": 1.1082905530929565, + "learning_rate": 4.206042359103435e-06, + "logits/chosen": -14.335638046264648, + "logits/rejected": -14.379010200500488, + "logps/chosen": -1.6033432483673096, + "logps/rejected": -1.648553490638733, + "loss": 1.6799, + "odds_ratio_loss": 0.7652724981307983, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.16033433377742767, + "rewards/margins": 0.004520997405052185, + "rewards/rejected": -0.16485533118247986, + "sft_loss": 1.6033432483673096, + "step": 440 + }, + { + "epoch": 0.8001778172927317, + "grad_norm": 1.853089690208435, + "learning_rate": 4.17169527440691e-06, + "logits/chosen": -14.423266410827637, + "logits/rejected": -14.40173625946045, + "logps/chosen": -1.62055242061615, + "logps/rejected": -1.6249053478240967, + "loss": 1.6973, + "odds_ratio_loss": 0.7678921222686768, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.16205522418022156, + "rewards/margins": 0.00043528637615963817, + "rewards/rejected": -0.16249051690101624, + "sft_loss": 1.62055242061615, + "step": 450 + }, + { + "epoch": 0.8179595465659035, + "grad_norm": 1.930782675743103, + "learning_rate": 4.136767718517797e-06, + "logits/chosen": -14.467844009399414, + "logits/rejected": -14.397687911987305, + "logps/chosen": -1.423508882522583, + "logps/rejected": -1.5464013814926147, + "loss": 1.4918, + "odds_ratio_loss": 0.6831967830657959, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.14235089719295502, + "rewards/margins": 0.012289242818951607, + "rewards/rejected": -0.15464013814926147, + "sft_loss": 1.423508882522583, + "step": 460 + }, + { + "epoch": 0.8357412758390753, + "grad_norm": 4.023218631744385, + "learning_rate": 4.1012718195077196e-06, + "logits/chosen": -14.457601547241211, + "logits/rejected": -14.645208358764648, + "logps/chosen": -1.5619157552719116, + "logps/rejected": -1.5855516195297241, + "loss": 1.6365, + "odds_ratio_loss": 0.7463001608848572, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.15619158744812012, + "rewards/margins": 0.0023635749239474535, + "rewards/rejected": -0.15855516493320465, + "sft_loss": 1.5619157552719116, + "step": 470 + }, + { + "epoch": 0.8535230051122472, + "grad_norm": 3.4996113777160645, + "learning_rate": 4.065219902796953e-06, + "logits/chosen": -14.268010139465332, + "logits/rejected": -14.2593994140625, + "logps/chosen": -1.537496566772461, + "logps/rejected": -1.6044343709945679, + "loss": 1.6152, + "odds_ratio_loss": 0.7775283455848694, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.15374964475631714, + "rewards/margins": 0.006693786941468716, + "rewards/rejected": -0.16044344007968903, + "sft_loss": 1.537496566772461, + "step": 480 + }, + { + "epoch": 0.871304734385419, + "grad_norm": 0.903472900390625, + "learning_rate": 4.028624486874608e-06, + "logits/chosen": -14.208300590515137, + "logits/rejected": -14.410181045532227, + "logps/chosen": -1.4613759517669678, + "logps/rejected": -1.5640151500701904, + "loss": 1.5343, + "odds_ratio_loss": 0.728947103023529, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.14613759517669678, + "rewards/margins": 0.01026391051709652, + "rewards/rejected": -0.15640152990818024, + "sft_loss": 1.4613759517669678, + "step": 490 + }, + { + "epoch": 0.8890864636585908, + "grad_norm": 0.9934778809547424, + "learning_rate": 3.99149827895177e-06, + "logits/chosen": -14.429773330688477, + "logits/rejected": -14.360036849975586, + "logps/chosen": -1.5578300952911377, + "logps/rejected": -1.5907199382781982, + "loss": 1.6309, + "odds_ratio_loss": 0.7302489280700684, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.155783012509346, + "rewards/margins": 0.0032889836002141237, + "rewards/rejected": -0.15907198190689087, + "sft_loss": 1.5578300952911377, + "step": 500 + }, + { + "epoch": 0.8890864636585908, + "eval_logits/chosen": -14.421330451965332, + "eval_logits/rejected": -14.49679946899414, + "eval_logps/chosen": -1.5096371173858643, + "eval_logps/rejected": -1.598750114440918, + "eval_loss": 1.5815595388412476, + "eval_odds_ratio_loss": 0.7192248106002808, + "eval_rewards/accuracies": 0.49399998784065247, + "eval_rewards/chosen": -0.150963693857193, + "eval_rewards/margins": 0.008911306038498878, + "eval_rewards/rejected": -0.1598750203847885, + "eval_runtime": 203.6844, + "eval_samples_per_second": 4.91, + "eval_sft_loss": 1.5096371173858643, + "eval_steps_per_second": 2.455, + "step": 500 + }, + { + "epoch": 0.9068681929317626, + "grad_norm": 1.555288314819336, + "learning_rate": 3.953854170549114e-06, + "logits/chosen": -14.514457702636719, + "logits/rejected": -14.522333145141602, + "logps/chosen": -1.5636659860610962, + "logps/rejected": -1.554619550704956, + "loss": 1.6424, + "odds_ratio_loss": 0.7876118421554565, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.15636660158634186, + "rewards/margins": -0.0009046423947438598, + "rewards/rejected": -0.15546198189258575, + "sft_loss": 1.5636659860610962, + "step": 510 + }, + { + "epoch": 0.9246499222049345, + "grad_norm": 1.688568353652954, + "learning_rate": 3.91570523302051e-06, + "logits/chosen": -14.493148803710938, + "logits/rejected": -14.430615425109863, + "logps/chosen": -1.4217069149017334, + "logps/rejected": -1.5474450588226318, + "loss": 1.4923, + "odds_ratio_loss": 0.7059618830680847, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.14217069745063782, + "rewards/margins": 0.012573805637657642, + "rewards/rejected": -0.15474450588226318, + "sft_loss": 1.4217069149017334, + "step": 520 + }, + { + "epoch": 0.9424316514781063, + "grad_norm": 1.1463229656219482, + "learning_rate": 3.8770647130141996e-06, + "logits/chosen": -14.525823593139648, + "logits/rejected": -14.399996757507324, + "logps/chosen": -1.4726465940475464, + "logps/rejected": -1.5658392906188965, + "loss": 1.5466, + "odds_ratio_loss": 0.7396677136421204, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.14726465940475464, + "rewards/margins": 0.009319271892309189, + "rewards/rejected": -0.15658393502235413, + "sft_loss": 1.4726465940475464, + "step": 530 + }, + { + "epoch": 0.960213380751278, + "grad_norm": 2.26594877243042, + "learning_rate": 3.837946027873086e-06, + "logits/chosen": -14.543024063110352, + "logits/rejected": -14.401150703430176, + "logps/chosen": -1.5846188068389893, + "logps/rejected": -1.6536743640899658, + "loss": 1.661, + "odds_ratio_loss": 0.7640754580497742, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.15846188366413116, + "rewards/margins": 0.006905559450387955, + "rewards/rejected": -0.16536743938922882, + "sft_loss": 1.5846188068389893, + "step": 540 + }, + { + "epoch": 0.9779951100244498, + "grad_norm": 1.5690348148345947, + "learning_rate": 3.7983627609757713e-06, + "logits/chosen": -14.507429122924805, + "logits/rejected": -14.452433586120605, + "logps/chosen": -1.5472663640975952, + "logps/rejected": -1.544019103050232, + "loss": 1.6249, + "odds_ratio_loss": 0.7762556076049805, + "rewards/accuracies": 0.4312500059604645, + "rewards/chosen": -0.15472662448883057, + "rewards/margins": -0.0003247327113058418, + "rewards/rejected": -0.15440191328525543, + "sft_loss": 1.5472663640975952, + "step": 550 + }, + { + "epoch": 0.9957768392976217, + "grad_norm": 1.6722140312194824, + "learning_rate": 3.758328657019924e-06, + "logits/chosen": -14.339811325073242, + "logits/rejected": -14.203923225402832, + "logps/chosen": -1.4747674465179443, + "logps/rejected": -1.5587810277938843, + "loss": 1.5469, + "odds_ratio_loss": 0.7211123704910278, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.14747674763202667, + "rewards/margins": 0.008401353843510151, + "rewards/rejected": -0.15587811172008514, + "sft_loss": 1.4747674465179443, + "step": 560 + }, + { + "epoch": 1.0135585685707935, + "grad_norm": 4.022447109222412, + "learning_rate": 3.717857617249642e-06, + "logits/chosen": -14.2714204788208, + "logits/rejected": -14.344494819641113, + "logps/chosen": -1.5280239582061768, + "logps/rejected": -1.6387897729873657, + "loss": 1.6007, + "odds_ratio_loss": 0.72718346118927, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.15280239284038544, + "rewards/margins": 0.011076575145125389, + "rewards/rejected": -0.16387899219989777, + "sft_loss": 1.5280239582061768, + "step": 570 + }, + { + "epoch": 1.0313402978439654, + "grad_norm": 0.8425918817520142, + "learning_rate": 3.6769636946284543e-06, + "logits/chosen": -14.346611022949219, + "logits/rejected": -14.158876419067383, + "logps/chosen": -1.3988748788833618, + "logps/rejected": -1.519100308418274, + "loss": 1.4702, + "odds_ratio_loss": 0.7135868072509766, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.1398874968290329, + "rewards/margins": 0.012022523209452629, + "rewards/rejected": -0.15191002190113068, + "sft_loss": 1.3988748788833618, + "step": 580 + }, + { + "epoch": 1.049122027117137, + "grad_norm": 1.2780736684799194, + "learning_rate": 3.6356610889596355e-06, + "logits/chosen": -14.491503715515137, + "logits/rejected": -14.515363693237305, + "logps/chosen": -1.567704200744629, + "logps/rejected": -1.5939210653305054, + "loss": 1.6438, + "odds_ratio_loss": 0.7608035206794739, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.15677042305469513, + "rewards/margins": 0.002621681662276387, + "rewards/rejected": -0.1593921184539795, + "sft_loss": 1.567704200744629, + "step": 590 + }, + { + "epoch": 1.066903756390309, + "grad_norm": 1.1325277090072632, + "learning_rate": 3.593964141955541e-06, + "logits/chosen": -14.486841201782227, + "logits/rejected": -14.375473022460938, + "logps/chosen": -1.4968254566192627, + "logps/rejected": -1.535290241241455, + "loss": 1.5708, + "odds_ratio_loss": 0.7399110198020935, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14968255162239075, + "rewards/margins": 0.0038464791141450405, + "rewards/rejected": -0.15352903306484222, + "sft_loss": 1.4968254566192627, + "step": 600 + }, + { + "epoch": 1.0846854856634809, + "grad_norm": 1.0757452249526978, + "learning_rate": 3.5518873322576573e-06, + "logits/chosen": -14.257904052734375, + "logits/rejected": -14.527667045593262, + "logps/chosen": -1.5230721235275269, + "logps/rejected": -1.5504690408706665, + "loss": 1.6005, + "odds_ratio_loss": 0.774411678314209, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.1523071974515915, + "rewards/margins": 0.0027397016528993845, + "rewards/rejected": -0.15504691004753113, + "sft_loss": 1.5230721235275269, + "step": 610 + }, + { + "epoch": 1.1024672149366526, + "grad_norm": 0.802010715007782, + "learning_rate": 3.5094452704091143e-06, + "logits/chosen": -14.364301681518555, + "logits/rejected": -14.379095077514648, + "logps/chosen": -1.4637863636016846, + "logps/rejected": -1.551966667175293, + "loss": 1.5353, + "odds_ratio_loss": 0.7155525088310242, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.14637863636016846, + "rewards/margins": 0.008818031288683414, + "rewards/rejected": -0.1551966667175293, + "sft_loss": 1.4637863636016846, + "step": 620 + }, + { + "epoch": 1.1202489442098245, + "grad_norm": 1.9857678413391113, + "learning_rate": 3.46665269378139e-06, + "logits/chosen": -14.460027694702148, + "logits/rejected": -14.280843734741211, + "logps/chosen": -1.57863187789917, + "logps/rejected": -1.5787197351455688, + "loss": 1.6576, + "odds_ratio_loss": 0.7900050282478333, + "rewards/accuracies": 0.4312500059604645, + "rewards/chosen": -0.15786318480968475, + "rewards/margins": 8.772313776717056e-06, + "rewards/rejected": -0.15787196159362793, + "sft_loss": 1.57863187789917, + "step": 630 + }, + { + "epoch": 1.1380306734829961, + "grad_norm": 3.4590837955474854, + "learning_rate": 3.4235244614569794e-06, + "logits/chosen": -14.505206108093262, + "logits/rejected": -14.5104398727417, + "logps/chosen": -1.5605857372283936, + "logps/rejected": -1.5010731220245361, + "loss": 1.6419, + "odds_ratio_loss": 0.8131183385848999, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.15605857968330383, + "rewards/margins": -0.0059512583538889885, + "rewards/rejected": -0.15010732412338257, + "sft_loss": 1.5605857372283936, + "step": 640 + }, + { + "epoch": 1.155812402756168, + "grad_norm": 0.7217416763305664, + "learning_rate": 3.3800755490698008e-06, + "logits/chosen": -14.571185111999512, + "logits/rejected": -14.486068725585938, + "logps/chosen": -1.416325569152832, + "logps/rejected": -1.6571706533432007, + "loss": 1.4832, + "odds_ratio_loss": 0.6685255765914917, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1416325569152832, + "rewards/margins": 0.024084512144327164, + "rewards/rejected": -0.16571708023548126, + "sft_loss": 1.416325569152832, + "step": 650 + }, + { + "epoch": 1.17359413202934, + "grad_norm": 1.5739328861236572, + "learning_rate": 3.3363210436051287e-06, + "logits/chosen": -14.533714294433594, + "logits/rejected": -14.47203540802002, + "logps/chosen": -1.5552722215652466, + "logps/rejected": -1.6581999063491821, + "loss": 1.6292, + "odds_ratio_loss": 0.7395690083503723, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.1555272340774536, + "rewards/margins": 0.010292761959135532, + "rewards/rejected": -0.16581998765468597, + "sft_loss": 1.5552722215652466, + "step": 660 + }, + { + "epoch": 1.1913758613025116, + "grad_norm": 1.1496367454528809, + "learning_rate": 3.292276138160867e-06, + "logits/chosen": -14.616167068481445, + "logits/rejected": -14.592849731445312, + "logps/chosen": -1.4817497730255127, + "logps/rejected": -1.5020571947097778, + "loss": 1.5589, + "odds_ratio_loss": 0.7710050940513611, + "rewards/accuracies": 0.41874998807907104, + "rewards/chosen": -0.1481749713420868, + "rewards/margins": 0.002030743286013603, + "rewards/rejected": -0.15020573139190674, + "sft_loss": 1.4817497730255127, + "step": 670 + }, + { + "epoch": 1.2091575905756835, + "grad_norm": 0.9039623737335205, + "learning_rate": 3.2479561266719694e-06, + "logits/chosen": -14.403009414672852, + "logits/rejected": -14.395463943481445, + "logps/chosen": -1.536474347114563, + "logps/rejected": -1.5868223905563354, + "loss": 1.6131, + "odds_ratio_loss": 0.7661021947860718, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.15364743769168854, + "rewards/margins": 0.005034810863435268, + "rewards/rejected": -0.15868225693702698, + "sft_loss": 1.536474347114563, + "step": 680 + }, + { + "epoch": 1.2269393198488552, + "grad_norm": 1.4711978435516357, + "learning_rate": 3.2033763985998533e-06, + "logits/chosen": -14.493423461914062, + "logits/rejected": -14.387487411499023, + "logps/chosen": -1.4371049404144287, + "logps/rejected": -1.6720597743988037, + "loss": 1.5038, + "odds_ratio_loss": 0.6674162149429321, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.14371049404144287, + "rewards/margins": 0.023495487868785858, + "rewards/rejected": -0.16720597445964813, + "sft_loss": 1.4371049404144287, + "step": 690 + }, + { + "epoch": 1.244721049122027, + "grad_norm": 1.8284207582473755, + "learning_rate": 3.1585524335886335e-06, + "logits/chosen": -14.462885856628418, + "logits/rejected": -14.4248628616333, + "logps/chosen": -1.4241148233413696, + "logps/rejected": -1.5204662084579468, + "loss": 1.4937, + "odds_ratio_loss": 0.695872962474823, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.1424115002155304, + "rewards/margins": 0.00963515229523182, + "rewards/rejected": -0.15204663574695587, + "sft_loss": 1.4241148233413696, + "step": 700 + }, + { + "epoch": 1.262502778395199, + "grad_norm": 1.8534456491470337, + "learning_rate": 3.1134997960900536e-06, + "logits/chosen": -14.414436340332031, + "logits/rejected": -14.282020568847656, + "logps/chosen": -1.4227323532104492, + "logps/rejected": -1.592142105102539, + "loss": 1.4906, + "odds_ratio_loss": 0.678249180316925, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.14227323234081268, + "rewards/margins": 0.01694098487496376, + "rewards/rejected": -0.15921422839164734, + "sft_loss": 1.4227323532104492, + "step": 710 + }, + { + "epoch": 1.2802845076683709, + "grad_norm": 1.5054043531417847, + "learning_rate": 3.0682341299589583e-06, + "logits/chosen": -14.277090072631836, + "logits/rejected": -14.398614883422852, + "logps/chosen": -1.473953127861023, + "logps/rejected": -1.4768346548080444, + "loss": 1.5498, + "odds_ratio_loss": 0.7589610815048218, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1473953276872635, + "rewards/margins": 0.00028814561665058136, + "rewards/rejected": -0.14768347144126892, + "sft_loss": 1.473953127861023, + "step": 720 + }, + { + "epoch": 1.2980662369415426, + "grad_norm": 2.268519878387451, + "learning_rate": 3.022771153021201e-06, + "logits/chosen": -14.384374618530273, + "logits/rejected": -14.452374458312988, + "logps/chosen": -1.4778945446014404, + "logps/rejected": -1.574272871017456, + "loss": 1.5509, + "odds_ratio_loss": 0.7302027940750122, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.14778944849967957, + "rewards/margins": 0.009637835435569286, + "rewards/rejected": -0.15742729604244232, + "sft_loss": 1.4778945446014404, + "step": 730 + }, + { + "epoch": 1.3158479662147144, + "grad_norm": 1.292470097541809, + "learning_rate": 2.9771266516158625e-06, + "logits/chosen": -14.381269454956055, + "logits/rejected": -14.515772819519043, + "logps/chosen": -1.467395544052124, + "logps/rejected": -1.5885987281799316, + "loss": 1.5376, + "odds_ratio_loss": 0.702446699142456, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.14673955738544464, + "rewards/margins": 0.01212033536285162, + "rewards/rejected": -0.15885989367961884, + "sft_loss": 1.467395544052124, + "step": 740 + }, + { + "epoch": 1.3336296954878861, + "grad_norm": 1.0139861106872559, + "learning_rate": 2.9313164751136802e-06, + "logits/chosen": -14.319999694824219, + "logits/rejected": -14.376035690307617, + "logps/chosen": -1.4767507314682007, + "logps/rejected": -1.5156667232513428, + "loss": 1.5505, + "odds_ratio_loss": 0.7372480630874634, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.1476750671863556, + "rewards/margins": 0.0038915909826755524, + "rewards/rejected": -0.15156666934490204, + "sft_loss": 1.4767507314682007, + "step": 750 + }, + { + "epoch": 1.351411424761058, + "grad_norm": 1.8870171308517456, + "learning_rate": 2.8853565304135956e-06, + "logits/chosen": -14.622869491577148, + "logits/rejected": -14.402894973754883, + "logps/chosen": -1.464207410812378, + "logps/rejected": -1.509709358215332, + "loss": 1.5409, + "odds_ratio_loss": 0.7670952081680298, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.14642071723937988, + "rewards/margins": 0.004550204146653414, + "rewards/rejected": -0.1509709358215332, + "sft_loss": 1.464207410812378, + "step": 760 + }, + { + "epoch": 1.36919315403423, + "grad_norm": 2.0388317108154297, + "learning_rate": 2.839262776419313e-06, + "logits/chosen": -14.462852478027344, + "logits/rejected": -14.366838455200195, + "logps/chosen": -1.4720194339752197, + "logps/rejected": -1.659558892250061, + "loss": 1.5416, + "odds_ratio_loss": 0.6956937313079834, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.14720192551612854, + "rewards/margins": 0.01875394582748413, + "rewards/rejected": -0.16595587134361267, + "sft_loss": 1.4720194339752197, + "step": 770 + }, + { + "epoch": 1.3869748833074016, + "grad_norm": 1.2085295915603638, + "learning_rate": 2.793051218497817e-06, + "logits/chosen": -14.64861011505127, + "logits/rejected": -14.640996932983398, + "logps/chosen": -1.4684925079345703, + "logps/rejected": -1.4641777276992798, + "loss": 1.5454, + "odds_ratio_loss": 0.7690992951393127, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.14684924483299255, + "rewards/margins": -0.00043147840187884867, + "rewards/rejected": -0.1464177817106247, + "sft_loss": 1.4684925079345703, + "step": 780 + }, + { + "epoch": 1.4047566125805735, + "grad_norm": 0.9894825220108032, + "learning_rate": 2.7467379029217437e-06, + "logits/chosen": -14.430456161499023, + "logits/rejected": -14.312626838684082, + "logps/chosen": -1.4456686973571777, + "logps/rejected": -1.5765860080718994, + "loss": 1.517, + "odds_ratio_loss": 0.7131879925727844, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.1445668637752533, + "rewards/margins": 0.013091735541820526, + "rewards/rejected": -0.15765860676765442, + "sft_loss": 1.4456686973571777, + "step": 790 + }, + { + "epoch": 1.4225383418537452, + "grad_norm": 0.9283164143562317, + "learning_rate": 2.7003389112975546e-06, + "logits/chosen": -14.672605514526367, + "logits/rejected": -14.80772590637207, + "logps/chosen": -1.5466537475585938, + "logps/rejected": -1.5707231760025024, + "loss": 1.6221, + "odds_ratio_loss": 0.7544839382171631, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -0.15466538071632385, + "rewards/margins": 0.002406922634691, + "rewards/rejected": -0.1570723056793213, + "sft_loss": 1.5466537475585938, + "step": 800 + }, + { + "epoch": 1.440320071126917, + "grad_norm": 2.0001485347747803, + "learning_rate": 2.653870354981437e-06, + "logits/chosen": -14.508201599121094, + "logits/rejected": -14.300331115722656, + "logps/chosen": -1.343572735786438, + "logps/rejected": -1.4564281702041626, + "loss": 1.4137, + "odds_ratio_loss": 0.7009164690971375, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.1343572735786438, + "rewards/margins": 0.011285548098385334, + "rewards/rejected": -0.14564281702041626, + "sft_loss": 1.343572735786438, + "step": 810 + }, + { + "epoch": 1.458101800400089, + "grad_norm": 2.214451313018799, + "learning_rate": 2.6073483694848777e-06, + "logits/chosen": -14.371310234069824, + "logits/rejected": -14.641912460327148, + "logps/chosen": -1.459166169166565, + "logps/rejected": -1.5504926443099976, + "loss": 1.5337, + "odds_ratio_loss": 0.7451664805412292, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.14591661095619202, + "rewards/margins": 0.009132652543485165, + "rewards/rejected": -0.15504927933216095, + "sft_loss": 1.459166169166565, + "step": 820 + }, + { + "epoch": 1.4758835296732609, + "grad_norm": 0.7959926724433899, + "learning_rate": 2.560789108871847e-06, + "logits/chosen": -14.411550521850586, + "logits/rejected": -14.393269538879395, + "logps/chosen": -1.436858892440796, + "logps/rejected": -1.588494896888733, + "loss": 1.5064, + "odds_ratio_loss": 0.6953193545341492, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.14368589222431183, + "rewards/margins": 0.015163603238761425, + "rewards/rejected": -0.15884950757026672, + "sft_loss": 1.436858892440796, + "step": 830 + }, + { + "epoch": 1.4936652589464325, + "grad_norm": 2.3933050632476807, + "learning_rate": 2.514208740149544e-06, + "logits/chosen": -14.369955062866211, + "logits/rejected": -14.41723346710205, + "logps/chosen": -1.4793574810028076, + "logps/rejected": -1.566274881362915, + "loss": 1.5523, + "odds_ratio_loss": 0.729671061038971, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.14793574810028076, + "rewards/margins": 0.008691729977726936, + "rewards/rejected": -0.15662747621536255, + "sft_loss": 1.4793574810028076, + "step": 840 + }, + { + "epoch": 1.5114469882196042, + "grad_norm": 1.3297358751296997, + "learning_rate": 2.46762343765464e-06, + "logits/chosen": -14.52283000946045, + "logits/rejected": -14.487058639526367, + "logps/chosen": -1.4763939380645752, + "logps/rejected": -1.6392600536346436, + "loss": 1.5463, + "odds_ratio_loss": 0.6985622644424438, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.14763939380645752, + "rewards/margins": 0.01628662459552288, + "rewards/rejected": -0.16392602026462555, + "sft_loss": 1.4763939380645752, + "step": 850 + }, + { + "epoch": 1.5292287174927761, + "grad_norm": 1.4351073503494263, + "learning_rate": 2.4210493774369903e-06, + "logits/chosen": -14.422691345214844, + "logits/rejected": -14.469161987304688, + "logps/chosen": -1.5580244064331055, + "logps/rejected": -1.5905354022979736, + "loss": 1.6351, + "odds_ratio_loss": 0.770281970500946, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.1558024138212204, + "rewards/margins": 0.003251106943935156, + "rewards/rejected": -0.15905353426933289, + "sft_loss": 1.5580244064331055, + "step": 860 + }, + { + "epoch": 1.547010446765948, + "grad_norm": 0.898558497428894, + "learning_rate": 2.374502731642732e-06, + "logits/chosen": -14.498028755187988, + "logits/rejected": -14.491365432739258, + "logps/chosen": -1.4510562419891357, + "logps/rejected": -1.4965808391571045, + "loss": 1.5247, + "odds_ratio_loss": 0.7362778782844543, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14510561525821686, + "rewards/margins": 0.004552468657493591, + "rewards/rejected": -0.14965808391571045, + "sft_loss": 1.4510562419891357, + "step": 870 + }, + { + "epoch": 1.56479217603912, + "grad_norm": 1.0833823680877686, + "learning_rate": 2.3279996628987556e-06, + "logits/chosen": -14.427210807800293, + "logits/rejected": -14.47814655303955, + "logps/chosen": -1.4597257375717163, + "logps/rejected": -1.4845510721206665, + "loss": 1.5356, + "odds_ratio_loss": 0.7589424252510071, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.14597256481647491, + "rewards/margins": 0.002482546027749777, + "rewards/rejected": -0.14845511317253113, + "sft_loss": 1.4597257375717163, + "step": 880 + }, + { + "epoch": 1.5825739053122916, + "grad_norm": 1.345390796661377, + "learning_rate": 2.281556318700474e-06, + "logits/chosen": -14.579635620117188, + "logits/rejected": -14.662760734558105, + "logps/chosen": -1.435423493385315, + "logps/rejected": -1.4268932342529297, + "loss": 1.5144, + "odds_ratio_loss": 0.789936900138855, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1435423642396927, + "rewards/margins": -0.0008530290797352791, + "rewards/rejected": -0.14268933236598969, + "sft_loss": 1.435423493385315, + "step": 890 + }, + { + "epoch": 1.6003556345854635, + "grad_norm": 2.053659439086914, + "learning_rate": 2.2351888258048408e-06, + "logits/chosen": -14.430631637573242, + "logits/rejected": -14.57084846496582, + "logps/chosen": -1.4910775423049927, + "logps/rejected": -1.588700532913208, + "loss": 1.5647, + "odds_ratio_loss": 0.7359753251075745, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.14910776913166046, + "rewards/margins": 0.009762286208570004, + "rewards/rejected": -0.15887002646923065, + "sft_loss": 1.4910775423049927, + "step": 900 + }, + { + "epoch": 1.6181373638586352, + "grad_norm": 1.0991109609603882, + "learning_rate": 2.188913284630584e-06, + "logits/chosen": -14.372177124023438, + "logits/rejected": -14.45659065246582, + "logps/chosen": -1.5502886772155762, + "logps/rejected": -1.5803656578063965, + "loss": 1.6264, + "odds_ratio_loss": 0.7611321210861206, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.15502886474132538, + "rewards/margins": 0.0030077022965997458, + "rewards/rejected": -0.15803657472133636, + "sft_loss": 1.5502886772155762, + "step": 910 + }, + { + "epoch": 1.635919093131807, + "grad_norm": 4.411732196807861, + "learning_rate": 2.1427457636675652e-06, + "logits/chosen": -14.42602825164795, + "logits/rejected": -14.588623046875, + "logps/chosen": -1.5284509658813477, + "logps/rejected": -1.539262056350708, + "loss": 1.609, + "odds_ratio_loss": 0.8054282069206238, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -0.152845099568367, + "rewards/margins": 0.0010811155661940575, + "rewards/rejected": -0.15392622351646423, + "sft_loss": 1.5284509658813477, + "step": 920 + }, + { + "epoch": 1.653700822404979, + "grad_norm": 1.1233824491500854, + "learning_rate": 2.096702293897247e-06, + "logits/chosen": -14.348466873168945, + "logits/rejected": -14.271936416625977, + "logps/chosen": -1.4055891036987305, + "logps/rejected": -1.5611364841461182, + "loss": 1.4753, + "odds_ratio_loss": 0.6970332860946655, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.1405588984489441, + "rewards/margins": 0.015554727986454964, + "rewards/rejected": -0.1561136394739151, + "sft_loss": 1.4055891036987305, + "step": 930 + }, + { + "epoch": 1.6714825516781509, + "grad_norm": 1.966296672821045, + "learning_rate": 2.0507988632261672e-06, + "logits/chosen": -14.399055480957031, + "logits/rejected": -14.409767150878906, + "logps/chosen": -1.4748440980911255, + "logps/rejected": -1.4950190782546997, + "loss": 1.5523, + "odds_ratio_loss": 0.7747657895088196, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1474844217300415, + "rewards/margins": 0.002017489168792963, + "rewards/rejected": -0.14950190484523773, + "sft_loss": 1.4748440980911255, + "step": 940 + }, + { + "epoch": 1.6892642809513225, + "grad_norm": 2.740964651107788, + "learning_rate": 2.005051410934382e-06, + "logits/chosen": -14.488139152526855, + "logits/rejected": -14.520662307739258, + "logps/chosen": -1.5530728101730347, + "logps/rejected": -1.6438829898834229, + "loss": 1.6259, + "odds_ratio_loss": 0.7283372282981873, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.15530726313591003, + "rewards/margins": 0.009081020019948483, + "rewards/rejected": -0.16438829898834229, + "sft_loss": 1.5530728101730347, + "step": 950 + }, + { + "epoch": 1.7070460102244942, + "grad_norm": 2.387444496154785, + "learning_rate": 1.9594758221407843e-06, + "logits/chosen": -14.521173477172852, + "logits/rejected": -14.390649795532227, + "logps/chosen": -1.3837093114852905, + "logps/rejected": -1.492570161819458, + "loss": 1.453, + "odds_ratio_loss": 0.6927343606948853, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.13837094604969025, + "rewards/margins": 0.010886076837778091, + "rewards/rejected": -0.14925701916217804, + "sft_loss": 1.3837093114852905, + "step": 960 + }, + { + "epoch": 1.724827739497666, + "grad_norm": 1.347974181175232, + "learning_rate": 1.9140879222872408e-06, + "logits/chosen": -14.385538101196289, + "logits/rejected": -14.44627857208252, + "logps/chosen": -1.4018323421478271, + "logps/rejected": -1.4524750709533691, + "loss": 1.4772, + "odds_ratio_loss": 0.7537996172904968, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.140183225274086, + "rewards/margins": 0.0050642904825508595, + "rewards/rejected": -0.14524750411510468, + "sft_loss": 1.4018323421478271, + "step": 970 + }, + { + "epoch": 1.742609468770838, + "grad_norm": 0.7524885535240173, + "learning_rate": 1.8689034716434346e-06, + "logits/chosen": -14.513898849487305, + "logits/rejected": -14.415435791015625, + "logps/chosen": -1.4909882545471191, + "logps/rejected": -1.4778468608856201, + "loss": 1.5681, + "odds_ratio_loss": 0.771571159362793, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -0.14909882843494415, + "rewards/margins": -0.0013141353847458959, + "rewards/rejected": -0.14778469502925873, + "sft_loss": 1.4909882545471191, + "step": 980 + }, + { + "epoch": 1.76039119804401, + "grad_norm": 1.0324140787124634, + "learning_rate": 1.8239381598343576e-06, + "logits/chosen": -14.462181091308594, + "logits/rejected": -14.475656509399414, + "logps/chosen": -1.411113977432251, + "logps/rejected": -1.4429736137390137, + "loss": 1.4872, + "odds_ratio_loss": 0.7608811259269714, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.14111140370368958, + "rewards/margins": 0.003185967681929469, + "rewards/rejected": -0.14429736137390137, + "sft_loss": 1.411113977432251, + "step": 990 + }, + { + "epoch": 1.7781729273171816, + "grad_norm": 1.5470359325408936, + "learning_rate": 1.779207600392312e-06, + "logits/chosen": -14.624302864074707, + "logits/rejected": -14.673869132995605, + "logps/chosen": -1.4659007787704468, + "logps/rejected": -1.5123026371002197, + "loss": 1.5401, + "odds_ratio_loss": 0.7417243123054504, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.14659008383750916, + "rewards/margins": 0.004640174098312855, + "rewards/rejected": -0.15123026072978973, + "sft_loss": 1.4659007787704468, + "step": 1000 + }, + { + "epoch": 1.7781729273171816, + "eval_logits/chosen": -14.472068786621094, + "eval_logits/rejected": -14.548614501953125, + "eval_logps/chosen": -1.4554640054702759, + "eval_logps/rejected": -1.5491538047790527, + "eval_loss": 1.5269325971603394, + "eval_odds_ratio_loss": 0.7146860361099243, + "eval_rewards/accuracies": 0.5019999742507935, + "eval_rewards/chosen": -0.14554640650749207, + "eval_rewards/margins": 0.00936897937208414, + "eval_rewards/rejected": -0.15491539239883423, + "eval_runtime": 408.1752, + "eval_samples_per_second": 2.45, + "eval_sft_loss": 1.4554640054702759, + "eval_steps_per_second": 1.225, + "step": 1000 + }, + { + "epoch": 1.7959546565903532, + "grad_norm": 2.230626106262207, + "learning_rate": 1.7347273253353552e-06, + "logits/chosen": -14.449197769165039, + "logits/rejected": -14.371235847473145, + "logps/chosen": -1.4797186851501465, + "logps/rejected": -1.5376956462860107, + "loss": 1.5549, + "odds_ratio_loss": 0.7520232796669006, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -0.14797186851501465, + "rewards/margins": 0.005797683726996183, + "rewards/rejected": -0.15376953780651093, + "sft_loss": 1.4797186851501465, + "step": 1010 + }, + { + "epoch": 1.8137363858635251, + "grad_norm": 3.5348658561706543, + "learning_rate": 1.690512779774029e-06, + "logits/chosen": -14.508018493652344, + "logits/rejected": -14.555743217468262, + "logps/chosen": -1.4630930423736572, + "logps/rejected": -1.5662147998809814, + "loss": 1.5345, + "odds_ratio_loss": 0.714568018913269, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.14630930125713348, + "rewards/margins": 0.010312167927622795, + "rewards/rejected": -0.15662148594856262, + "sft_loss": 1.4630930423736572, + "step": 1020 + }, + { + "epoch": 1.831518115136697, + "grad_norm": 1.3148037195205688, + "learning_rate": 1.6465793165482838e-06, + "logits/chosen": -14.601341247558594, + "logits/rejected": -14.604815483093262, + "logps/chosen": -1.3863935470581055, + "logps/rejected": -1.5016463994979858, + "loss": 1.4553, + "odds_ratio_loss": 0.6889203786849976, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.13863936066627502, + "rewards/margins": 0.011525283567607403, + "rewards/rejected": -0.1501646339893341, + "sft_loss": 1.3863935470581055, + "step": 1030 + }, + { + "epoch": 1.849299844409869, + "grad_norm": 1.944263219833374, + "learning_rate": 1.6029421908964305e-06, + "logits/chosen": -14.364709854125977, + "logits/rejected": -14.301609992980957, + "logps/chosen": -1.444108247756958, + "logps/rejected": -1.735345482826233, + "loss": 1.5112, + "odds_ratio_loss": 0.6704747080802917, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.1444108486175537, + "rewards/margins": 0.029123712331056595, + "rewards/rejected": -0.1735345423221588, + "sft_loss": 1.444108247756958, + "step": 1040 + }, + { + "epoch": 1.8670815736830408, + "grad_norm": 3.4116580486297607, + "learning_rate": 1.559616555157985e-06, + "logits/chosen": -14.63707160949707, + "logits/rejected": -14.487701416015625, + "logps/chosen": -1.4554407596588135, + "logps/rejected": -1.532787561416626, + "loss": 1.529, + "odds_ratio_loss": 0.7356154322624207, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.14554408192634583, + "rewards/margins": 0.007734687067568302, + "rewards/rejected": -0.15327878296375275, + "sft_loss": 1.4554407596588135, + "step": 1050 + }, + { + "epoch": 1.8848633029562125, + "grad_norm": 1.2246363162994385, + "learning_rate": 1.516617453512252e-06, + "logits/chosen": -14.424253463745117, + "logits/rejected": -14.5010347366333, + "logps/chosen": -1.5207793712615967, + "logps/rejected": -1.531063437461853, + "loss": 1.5985, + "odds_ratio_loss": 0.7770354151725769, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.15207794308662415, + "rewards/margins": 0.0010283945593982935, + "rewards/rejected": -0.15310634672641754, + "sft_loss": 1.5207793712615967, + "step": 1060 + }, + { + "epoch": 1.9026450322293842, + "grad_norm": 2.65510630607605, + "learning_rate": 1.473959816754449e-06, + "logits/chosen": -14.302286148071289, + "logits/rejected": -14.37928581237793, + "logps/chosen": -1.3779585361480713, + "logps/rejected": -1.444608211517334, + "loss": 1.4506, + "odds_ratio_loss": 0.7259268164634705, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.1377958357334137, + "rewards/margins": 0.00666497927159071, + "rewards/rejected": -0.14446081221103668, + "sft_loss": 1.3779585361480713, + "step": 1070 + }, + { + "epoch": 1.920426761502556, + "grad_norm": 1.5242667198181152, + "learning_rate": 1.4316584571112213e-06, + "logits/chosen": -14.791394233703613, + "logits/rejected": -14.659652709960938, + "logps/chosen": -1.451185941696167, + "logps/rejected": -1.5176652669906616, + "loss": 1.5255, + "odds_ratio_loss": 0.7434892058372498, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.1451185941696167, + "rewards/margins": 0.0066479267552495, + "rewards/rejected": -0.15176650881767273, + "sft_loss": 1.451185941696167, + "step": 1080 + }, + { + "epoch": 1.938208490775728, + "grad_norm": 1.3730899095535278, + "learning_rate": 1.389728063097306e-06, + "logits/chosen": -14.725341796875, + "logits/rejected": -14.647857666015625, + "logps/chosen": -1.4444434642791748, + "logps/rejected": -1.6104562282562256, + "loss": 1.5134, + "odds_ratio_loss": 0.6893559098243713, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.14444434642791748, + "rewards/margins": 0.016601284965872765, + "rewards/rejected": -0.161045640707016, + "sft_loss": 1.4444434642791748, + "step": 1090 + }, + { + "epoch": 1.9559902200488999, + "grad_norm": 0.9227787852287292, + "learning_rate": 1.348183194415179e-06, + "logits/chosen": -14.495756149291992, + "logits/rejected": -14.327432632446289, + "logps/chosen": -1.419993281364441, + "logps/rejected": -1.5734179019927979, + "loss": 1.4884, + "odds_ratio_loss": 0.6841065883636475, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.14199933409690857, + "rewards/margins": 0.0153424721211195, + "rewards/rejected": -0.15734180808067322, + "sft_loss": 1.419993281364441, + "step": 1100 + }, + { + "epoch": 1.9737719493220716, + "grad_norm": 0.9474197626113892, + "learning_rate": 1.3070382768994015e-06, + "logits/chosen": -14.650873184204102, + "logits/rejected": -14.58227825164795, + "logps/chosen": -1.4364961385726929, + "logps/rejected": -1.5456112623214722, + "loss": 1.5057, + "odds_ratio_loss": 0.6923903226852417, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.1436496078968048, + "rewards/margins": 0.010911517776548862, + "rewards/rejected": -0.1545611321926117, + "sft_loss": 1.4364961385726929, + "step": 1110 + }, + { + "epoch": 1.9915536785952432, + "grad_norm": 1.042833924293518, + "learning_rate": 1.2663075975074746e-06, + "logits/chosen": -14.496850967407227, + "logits/rejected": -14.485113143920898, + "logps/chosen": -1.4912959337234497, + "logps/rejected": -1.6449800729751587, + "loss": 1.5613, + "odds_ratio_loss": 0.699638307094574, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.14912959933280945, + "rewards/margins": 0.015368418768048286, + "rewards/rejected": -0.1644980013370514, + "sft_loss": 1.4912959337234497, + "step": 1120 + }, + { + "epoch": 2.009335407868415, + "grad_norm": 2.740922689437866, + "learning_rate": 1.2260052993589034e-06, + "logits/chosen": -14.43859577178955, + "logits/rejected": -14.43925952911377, + "logps/chosen": -1.535914659500122, + "logps/rejected": -1.5225673913955688, + "loss": 1.615, + "odds_ratio_loss": 0.7913249731063843, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.15359148383140564, + "rewards/margins": -0.0013347306521609426, + "rewards/rejected": -0.15225675702095032, + "sft_loss": 1.535914659500122, + "step": 1130 + }, + { + "epoch": 2.027117137141587, + "grad_norm": 0.7980636954307556, + "learning_rate": 1.1861453768242099e-06, + "logits/chosen": -14.416345596313477, + "logits/rejected": -14.40721321105957, + "logps/chosen": -1.4432775974273682, + "logps/rejected": -1.5133426189422607, + "loss": 1.5179, + "odds_ratio_loss": 0.7463361620903015, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.14432775974273682, + "rewards/margins": 0.007006504572927952, + "rewards/rejected": -0.1513342708349228, + "sft_loss": 1.4432775974273682, + "step": 1140 + }, + { + "epoch": 2.044898866414759, + "grad_norm": 5.74677038192749, + "learning_rate": 1.1467416706655982e-06, + "logits/chosen": -14.52302074432373, + "logits/rejected": -14.662378311157227, + "logps/chosen": -1.5389671325683594, + "logps/rejected": -1.645978569984436, + "loss": 1.6125, + "odds_ratio_loss": 0.7352578043937683, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.15389671921730042, + "rewards/margins": 0.010701162740588188, + "rewards/rejected": -0.16459788382053375, + "sft_loss": 1.5389671325683594, + "step": 1150 + }, + { + "epoch": 2.062680595687931, + "grad_norm": 0.986960232257843, + "learning_rate": 1.1078078632309559e-06, + "logits/chosen": -14.417854309082031, + "logits/rejected": -14.478330612182617, + "logps/chosen": -1.4349641799926758, + "logps/rejected": -1.5250955820083618, + "loss": 1.5053, + "odds_ratio_loss": 0.7031277418136597, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.14349642395973206, + "rewards/margins": 0.009013157337903976, + "rewards/rejected": -0.15250957012176514, + "sft_loss": 1.4349641799926758, + "step": 1160 + }, + { + "epoch": 2.0804623249611023, + "grad_norm": 2.110325813293457, + "learning_rate": 1.0693574737028627e-06, + "logits/chosen": -14.4419584274292, + "logits/rejected": -14.437457084655762, + "logps/chosen": -1.5028114318847656, + "logps/rejected": -1.5467736721038818, + "loss": 1.5773, + "odds_ratio_loss": 0.7452652454376221, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.1502811312675476, + "rewards/margins": 0.004396246280521154, + "rewards/rejected": -0.1546773910522461, + "sft_loss": 1.5028114318847656, + "step": 1170 + }, + { + "epoch": 2.098244054234274, + "grad_norm": 1.9610004425048828, + "learning_rate": 1.0314038534042586e-06, + "logits/chosen": -14.581426620483398, + "logits/rejected": -14.440699577331543, + "logps/chosen": -1.3876335620880127, + "logps/rejected": -1.5056065320968628, + "loss": 1.4589, + "odds_ratio_loss": 0.7131034731864929, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.13876333832740784, + "rewards/margins": 0.011797326616942883, + "rewards/rejected": -0.1505606770515442, + "sft_loss": 1.3876335620880127, + "step": 1180 + }, + { + "epoch": 2.116025783507446, + "grad_norm": 1.4545583724975586, + "learning_rate": 9.939601811623946e-07, + "logits/chosen": -14.581433296203613, + "logits/rejected": -14.601740837097168, + "logps/chosen": -1.422616720199585, + "logps/rejected": -1.5079143047332764, + "loss": 1.4943, + "odds_ratio_loss": 0.7167633771896362, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.14226169884204865, + "rewards/margins": 0.00852974783629179, + "rewards/rejected": -0.15079143643379211, + "sft_loss": 1.422616720199585, + "step": 1190 + }, + { + "epoch": 2.133807512780618, + "grad_norm": 1.6818853616714478, + "learning_rate": 9.570394587326825e-07, + "logits/chosen": -14.631085395812988, + "logits/rejected": -14.470014572143555, + "logps/chosen": -1.4276001453399658, + "logps/rejected": -1.5927342176437378, + "loss": 1.4962, + "odds_ratio_loss": 0.6859818696975708, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.1427600234746933, + "rewards/margins": 0.016513412818312645, + "rewards/rejected": -0.1592734307050705, + "sft_loss": 1.4276001453399658, + "step": 1200 + }, + { + "epoch": 2.15158924205379, + "grad_norm": 0.9193028807640076, + "learning_rate": 9.206545062840302e-07, + "logits/chosen": -14.681951522827148, + "logits/rejected": -14.49401569366455, + "logps/chosen": -1.4501049518585205, + "logps/rejected": -1.550287127494812, + "loss": 1.5204, + "odds_ratio_loss": 0.7033563256263733, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.14501050114631653, + "rewards/margins": 0.010018205270171165, + "rewards/rejected": -0.15502868592739105, + "sft_loss": 1.4501049518585205, + "step": 1210 + }, + { + "epoch": 2.1693709713269618, + "grad_norm": 0.9245877265930176, + "learning_rate": 8.848179579472285e-07, + "logits/chosen": -14.563427925109863, + "logits/rejected": -14.575236320495605, + "logps/chosen": -1.3411179780960083, + "logps/rejected": -1.3986274003982544, + "loss": 1.4132, + "odds_ratio_loss": 0.7204734683036804, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.13411179184913635, + "rewards/margins": 0.005750957410782576, + "rewards/rejected": -0.13986274600028992, + "sft_loss": 1.3411179780960083, + "step": 1220 + }, + { + "epoch": 2.1871527006001332, + "grad_norm": 4.244547367095947, + "learning_rate": 8.495422574279403e-07, + "logits/chosen": -14.301058769226074, + "logits/rejected": -14.25413703918457, + "logps/chosen": -1.370896339416504, + "logps/rejected": -1.5229545831680298, + "loss": 1.4426, + "odds_ratio_loss": 0.7167730927467346, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.13708963990211487, + "rewards/margins": 0.015205820091068745, + "rewards/rejected": -0.15229545533657074, + "sft_loss": 1.370896339416504, + "step": 1230 + }, + { + "epoch": 2.204934429873305, + "grad_norm": 1.5995064973831177, + "learning_rate": 8.148396536858063e-07, + "logits/chosen": -14.522483825683594, + "logits/rejected": -14.575413703918457, + "logps/chosen": -1.506316065788269, + "logps/rejected": -1.6317975521087646, + "loss": 1.578, + "odds_ratio_loss": 0.7170661687850952, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.1506316065788269, + "rewards/margins": 0.012548169121146202, + "rewards/rejected": -0.16317978501319885, + "sft_loss": 1.506316065788269, + "step": 1240 + }, + { + "epoch": 2.222716159146477, + "grad_norm": 2.302438497543335, + "learning_rate": 7.807221966811815e-07, + "logits/chosen": -14.514368057250977, + "logits/rejected": -14.467686653137207, + "logps/chosen": -1.3972229957580566, + "logps/rejected": -1.503300666809082, + "loss": 1.4695, + "odds_ratio_loss": 0.7227479219436646, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.1397223025560379, + "rewards/margins": 0.010607777163386345, + "rewards/rejected": -0.1503300666809082, + "sft_loss": 1.3972229957580566, + "step": 1250 + }, + { + "epoch": 2.240497888419649, + "grad_norm": 2.491163492202759, + "learning_rate": 7.47201733190962e-07, + "logits/chosen": -14.408790588378906, + "logits/rejected": -14.414535522460938, + "logps/chosen": -1.4305273294448853, + "logps/rejected": -1.489025354385376, + "loss": 1.5047, + "odds_ratio_loss": 0.7419496774673462, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.14305274188518524, + "rewards/margins": 0.005849803332239389, + "rewards/rejected": -0.1489025503396988, + "sft_loss": 1.4305273294448853, + "step": 1260 + }, + { + "epoch": 2.258279617692821, + "grad_norm": 1.4214234352111816, + "learning_rate": 7.142899026949721e-07, + "logits/chosen": -14.524235725402832, + "logits/rejected": -14.52795696258545, + "logps/chosen": -1.4059574604034424, + "logps/rejected": -1.5294010639190674, + "loss": 1.4755, + "odds_ratio_loss": 0.6952685117721558, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.1405957192182541, + "rewards/margins": 0.012344368733465672, + "rewards/rejected": -0.15294012427330017, + "sft_loss": 1.4059574604034424, + "step": 1270 + }, + { + "epoch": 2.2760613469659923, + "grad_norm": 5.792439937591553, + "learning_rate": 6.819981333343273e-07, + "logits/chosen": -14.282297134399414, + "logits/rejected": -14.3455171585083, + "logps/chosen": -1.3890929222106934, + "logps/rejected": -1.5048874616622925, + "loss": 1.4613, + "odds_ratio_loss": 0.721687912940979, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.13890929520130157, + "rewards/margins": 0.011579466983675957, + "rewards/rejected": -0.1504887491464615, + "sft_loss": 1.3890929222106934, + "step": 1280 + }, + { + "epoch": 2.293843076239164, + "grad_norm": 1.8430945873260498, + "learning_rate": 6.503376379431839e-07, + "logits/chosen": -14.595362663269043, + "logits/rejected": -14.600665092468262, + "logps/chosen": -1.5411078929901123, + "logps/rejected": -1.4967674016952515, + "loss": 1.6191, + "odds_ratio_loss": 0.7797070741653442, + "rewards/accuracies": 0.41874998807907104, + "rewards/chosen": -0.15411078929901123, + "rewards/margins": -0.004434044472873211, + "rewards/rejected": -0.14967674016952515, + "sft_loss": 1.5411078929901123, + "step": 1290 + }, + { + "epoch": 2.311624805512336, + "grad_norm": 1.6006221771240234, + "learning_rate": 6.193194101552502e-07, + "logits/chosen": -14.458605766296387, + "logits/rejected": -14.19080638885498, + "logps/chosen": -1.4712187051773071, + "logps/rejected": -1.5340622663497925, + "loss": 1.5413, + "odds_ratio_loss": 0.7012200951576233, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.147121861577034, + "rewards/margins": 0.006284369621425867, + "rewards/rejected": -0.15340623259544373, + "sft_loss": 1.4712187051773071, + "step": 1300 + }, + { + "epoch": 2.329406534785508, + "grad_norm": 2.382812261581421, + "learning_rate": 5.889542205864083e-07, + "logits/chosen": -14.490499496459961, + "logits/rejected": -14.431979179382324, + "logps/chosen": -1.5115288496017456, + "logps/rejected": -1.508636474609375, + "loss": 1.5893, + "odds_ratio_loss": 0.7778019309043884, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.15115289390087128, + "rewards/margins": -0.0002892397460527718, + "rewards/rejected": -0.1508636474609375, + "sft_loss": 1.5115288496017456, + "step": 1310 + }, + { + "epoch": 2.34718826405868, + "grad_norm": 1.2705732583999634, + "learning_rate": 5.592526130947862e-07, + "logits/chosen": -14.570259094238281, + "logits/rejected": -14.508715629577637, + "logps/chosen": -1.4529051780700684, + "logps/rejected": -1.5590078830718994, + "loss": 1.5284, + "odds_ratio_loss": 0.7552819848060608, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.14529050886631012, + "rewards/margins": 0.01061027031391859, + "rewards/rejected": -0.15590079128742218, + "sft_loss": 1.4529051780700684, + "step": 1320 + }, + { + "epoch": 2.3649699933318518, + "grad_norm": 1.9159420728683472, + "learning_rate": 5.302249011195507e-07, + "logits/chosen": -14.313755989074707, + "logits/rejected": -14.381486892700195, + "logps/chosen": -1.3798234462738037, + "logps/rejected": -1.4174072742462158, + "loss": 1.4519, + "odds_ratio_loss": 0.7206598520278931, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.1379823386669159, + "rewards/margins": 0.0037583820521831512, + "rewards/rejected": -0.14174072444438934, + "sft_loss": 1.3798234462738037, + "step": 1330 + }, + { + "epoch": 2.382751722605023, + "grad_norm": 3.4968583583831787, + "learning_rate": 5.018811640997307e-07, + "logits/chosen": -14.515790939331055, + "logits/rejected": -14.633216857910156, + "logps/chosen": -1.471914529800415, + "logps/rejected": -1.6871318817138672, + "loss": 1.5372, + "odds_ratio_loss": 0.6527343392372131, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.14719144999980927, + "rewards/margins": 0.02152174338698387, + "rewards/rejected": -0.16871318221092224, + "sft_loss": 1.471914529800415, + "step": 1340 + }, + { + "epoch": 2.400533451878195, + "grad_norm": 1.3940002918243408, + "learning_rate": 4.7423124397427105e-07, + "logits/chosen": -14.439651489257812, + "logits/rejected": -14.579530715942383, + "logps/chosen": -1.4648997783660889, + "logps/rejected": -1.4503980875015259, + "loss": 1.5422, + "odds_ratio_loss": 0.773194432258606, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.1464899778366089, + "rewards/margins": -0.0014501826371997595, + "rewards/rejected": -0.14503981173038483, + "sft_loss": 1.4648997783660889, + "step": 1350 + }, + { + "epoch": 2.418315181151367, + "grad_norm": 1.0056315660476685, + "learning_rate": 4.472847417645787e-07, + "logits/chosen": -14.650228500366211, + "logits/rejected": -14.57947826385498, + "logps/chosen": -1.3967421054840088, + "logps/rejected": -1.6349856853485107, + "loss": 1.4623, + "odds_ratio_loss": 0.6557605266571045, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.13967421650886536, + "rewards/margins": 0.023824386298656464, + "rewards/rejected": -0.16349859535694122, + "sft_loss": 1.3967421054840088, + "step": 1360 + }, + { + "epoch": 2.436096910424539, + "grad_norm": 0.7953612804412842, + "learning_rate": 4.210510142406993e-07, + "logits/chosen": -14.647814750671387, + "logits/rejected": -14.501565933227539, + "logps/chosen": -1.4687623977661133, + "logps/rejected": -1.5674188137054443, + "loss": 1.5411, + "odds_ratio_loss": 0.723659873008728, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.1468762457370758, + "rewards/margins": 0.009865625761449337, + "rewards/rejected": -0.15674187242984772, + "sft_loss": 1.4687623977661133, + "step": 1370 + }, + { + "epoch": 2.4538786396977104, + "grad_norm": 1.0436296463012695, + "learning_rate": 3.9553917067232966e-07, + "logits/chosen": -14.456913948059082, + "logits/rejected": -14.33137321472168, + "logps/chosen": -1.451188564300537, + "logps/rejected": -1.5067174434661865, + "loss": 1.5265, + "odds_ratio_loss": 0.7529994249343872, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.1451188623905182, + "rewards/margins": 0.005552901886403561, + "rewards/rejected": -0.15067176520824432, + "sft_loss": 1.451188564300537, + "step": 1380 + }, + { + "epoch": 2.4716603689708823, + "grad_norm": 1.177869200706482, + "learning_rate": 3.707580696657509e-07, + "logits/chosen": -14.569585800170898, + "logits/rejected": -14.398585319519043, + "logps/chosen": -1.4260759353637695, + "logps/rejected": -1.4303407669067383, + "loss": 1.5, + "odds_ratio_loss": 0.7389937043190002, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.14260759949684143, + "rewards/margins": 0.0004264971357770264, + "rewards/rejected": -0.14303407073020935, + "sft_loss": 1.4260759353637695, + "step": 1390 + }, + { + "epoch": 2.489442098244054, + "grad_norm": 1.4175564050674438, + "learning_rate": 3.4671631608781815e-07, + "logits/chosen": -14.617632865905762, + "logits/rejected": -14.494784355163574, + "logps/chosen": -1.4826141595840454, + "logps/rejected": -1.5348364114761353, + "loss": 1.5593, + "odds_ratio_loss": 0.7670896649360657, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.1482614129781723, + "rewards/margins": 0.005222243722528219, + "rewards/rejected": -0.15348365902900696, + "sft_loss": 1.4826141595840454, + "step": 1400 + }, + { + "epoch": 2.507223827517226, + "grad_norm": 1.6863813400268555, + "learning_rate": 3.234222580780405e-07, + "logits/chosen": -14.408329963684082, + "logits/rejected": -14.414642333984375, + "logps/chosen": -1.4218008518218994, + "logps/rejected": -1.4535105228424072, + "loss": 1.4962, + "odds_ratio_loss": 0.7441353797912598, + "rewards/accuracies": 0.4312500059604645, + "rewards/chosen": -0.14218010008335114, + "rewards/margins": 0.0031709708273410797, + "rewards/rejected": -0.1453510820865631, + "sft_loss": 1.4218008518218994, + "step": 1410 + }, + { + "epoch": 2.525005556790398, + "grad_norm": 4.5228376388549805, + "learning_rate": 3.0088398414982375e-07, + "logits/chosen": -14.276860237121582, + "logits/rejected": -14.423869132995605, + "logps/chosen": -1.5565452575683594, + "logps/rejected": -1.6457360982894897, + "loss": 1.6331, + "odds_ratio_loss": 0.7656282782554626, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15565453469753265, + "rewards/margins": 0.008919079788029194, + "rewards/rejected": -0.16457359492778778, + "sft_loss": 1.5565452575683594, + "step": 1420 + }, + { + "epoch": 2.54278728606357, + "grad_norm": 6.204384803771973, + "learning_rate": 2.7910932038184487e-07, + "logits/chosen": -14.392138481140137, + "logits/rejected": -14.114950180053711, + "logps/chosen": -1.4198369979858398, + "logps/rejected": -1.5630581378936768, + "loss": 1.4895, + "odds_ratio_loss": 0.6968662142753601, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.14198370277881622, + "rewards/margins": 0.014322122558951378, + "rewards/rejected": -0.15630581974983215, + "sft_loss": 1.4198369979858398, + "step": 1430 + }, + { + "epoch": 2.5605690153367417, + "grad_norm": 2.048856258392334, + "learning_rate": 2.5810582770057325e-07, + "logits/chosen": -14.456823348999023, + "logits/rejected": -14.554224967956543, + "logps/chosen": -1.400504469871521, + "logps/rejected": -1.4313546419143677, + "loss": 1.4769, + "odds_ratio_loss": 0.764117419719696, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14005044102668762, + "rewards/margins": 0.0030850153416395187, + "rewards/rejected": -0.1431354582309723, + "sft_loss": 1.400504469871521, + "step": 1440 + }, + { + "epoch": 2.578350744609913, + "grad_norm": 1.2424274682998657, + "learning_rate": 2.3788079925484402e-07, + "logits/chosen": -14.654197692871094, + "logits/rejected": -14.54980754852295, + "logps/chosen": -1.4363069534301758, + "logps/rejected": -1.4395811557769775, + "loss": 1.5116, + "odds_ratio_loss": 0.7526046633720398, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -0.14363068342208862, + "rewards/margins": 0.0003274245245847851, + "rewards/rejected": -0.14395812153816223, + "sft_loss": 1.4363069534301758, + "step": 1450 + }, + { + "epoch": 2.596132473883085, + "grad_norm": 1.602452039718628, + "learning_rate": 2.1844125788342661e-07, + "logits/chosen": -14.462437629699707, + "logits/rejected": -14.348231315612793, + "logps/chosen": -1.4353057146072388, + "logps/rejected": -1.6425654888153076, + "loss": 1.5064, + "odds_ratio_loss": 0.711114227771759, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.14353057742118835, + "rewards/margins": 0.020725984126329422, + "rewards/rejected": -0.16425655782222748, + "sft_loss": 1.4353057146072388, + "step": 1460 + }, + { + "epoch": 2.613914203156257, + "grad_norm": 1.1172051429748535, + "learning_rate": 1.9979395367644428e-07, + "logits/chosen": -14.612176895141602, + "logits/rejected": -14.652105331420898, + "logps/chosen": -1.3911590576171875, + "logps/rejected": -1.5478867292404175, + "loss": 1.4586, + "odds_ratio_loss": 0.674477756023407, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.13911592960357666, + "rewards/margins": 0.01567276194691658, + "rewards/rejected": -0.15478867292404175, + "sft_loss": 1.3911590576171875, + "step": 1470 + }, + { + "epoch": 2.631695932429429, + "grad_norm": 1.9138628244400024, + "learning_rate": 1.81945361631512e-07, + "logits/chosen": -14.578519821166992, + "logits/rejected": -14.719879150390625, + "logps/chosen": -1.458441972732544, + "logps/rejected": -1.4730455875396729, + "loss": 1.5353, + "odds_ratio_loss": 0.7685132622718811, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.14584419131278992, + "rewards/margins": 0.0014603782910853624, + "rewards/rejected": -0.14730457961559296, + "sft_loss": 1.458441972732544, + "step": 1480 + }, + { + "epoch": 2.6494776617026004, + "grad_norm": 2.06256103515625, + "learning_rate": 1.6490167940538343e-07, + "logits/chosen": -14.520217895507812, + "logits/rejected": -14.437647819519043, + "logps/chosen": -1.4276152849197388, + "logps/rejected": -1.4981720447540283, + "loss": 1.5003, + "odds_ratio_loss": 0.7272015810012817, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.14276152849197388, + "rewards/margins": 0.007055687252432108, + "rewards/rejected": -0.14981719851493835, + "sft_loss": 1.4276152849197388, + "step": 1490 + }, + { + "epoch": 2.6672593909757722, + "grad_norm": 1.2633237838745117, + "learning_rate": 1.4866882516191339e-07, + "logits/chosen": -14.439382553100586, + "logits/rejected": -14.5440034866333, + "logps/chosen": -1.4182121753692627, + "logps/rejected": -1.5234054327011108, + "loss": 1.4914, + "odds_ratio_loss": 0.7320746779441833, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.1418212354183197, + "rewards/margins": 0.01051931269466877, + "rewards/rejected": -0.15234054625034332, + "sft_loss": 1.4182121753692627, + "step": 1500 + }, + { + "epoch": 2.6672593909757722, + "eval_logits/chosen": -14.466456413269043, + "eval_logits/rejected": -14.543242454528809, + "eval_logps/chosen": -1.44402277469635, + "eval_logps/rejected": -1.5389362573623657, + "eval_loss": 1.5154520273208618, + "eval_odds_ratio_loss": 0.7142924666404724, + "eval_rewards/accuracies": 0.5090000033378601, + "eval_rewards/chosen": -0.14440228044986725, + "eval_rewards/margins": 0.00949135422706604, + "eval_rewards/rejected": -0.15389364957809448, + "eval_runtime": 411.9433, + "eval_samples_per_second": 2.428, + "eval_sft_loss": 1.44402277469635, + "eval_steps_per_second": 1.214, + "step": 1500 + }, + { + "epoch": 2.685041120248944, + "grad_norm": 2.1881158351898193, + "learning_rate": 1.3325243551706057e-07, + "logits/chosen": -14.318315505981445, + "logits/rejected": -14.482050895690918, + "logps/chosen": -1.5127537250518799, + "logps/rejected": -1.706903100013733, + "loss": 1.5833, + "odds_ratio_loss": 0.7053884267807007, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1512753665447235, + "rewards/margins": 0.01941494271159172, + "rewards/rejected": -0.17069032788276672, + "sft_loss": 1.5127537250518799, + "step": 1510 + }, + { + "epoch": 2.702822849522116, + "grad_norm": 1.7814643383026123, + "learning_rate": 1.1865786358165737e-07, + "logits/chosen": -14.374513626098633, + "logits/rejected": -14.638038635253906, + "logps/chosen": -1.4067102670669556, + "logps/rejected": -1.4657062292099, + "loss": 1.4794, + "odds_ratio_loss": 0.727219820022583, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.1406710147857666, + "rewards/margins": 0.005899603012949228, + "rewards/rejected": -0.14657063782215118, + "sft_loss": 1.4067102670669556, + "step": 1520 + }, + { + "epoch": 2.720604578795288, + "grad_norm": 4.55156135559082, + "learning_rate": 1.0489017710262311e-07, + "logits/chosen": -14.426725387573242, + "logits/rejected": -14.426490783691406, + "logps/chosen": -1.4883832931518555, + "logps/rejected": -1.6930698156356812, + "loss": 1.5615, + "odds_ratio_loss": 0.731151282787323, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.1488383412361145, + "rewards/margins": 0.020468655973672867, + "rewards/rejected": -0.16930699348449707, + "sft_loss": 1.4883832931518555, + "step": 1530 + }, + { + "epoch": 2.73838630806846, + "grad_norm": 1.255363941192627, + "learning_rate": 9.195415670326446e-08, + "logits/chosen": -14.457158088684082, + "logits/rejected": -14.496696472167969, + "logps/chosen": -1.4223277568817139, + "logps/rejected": -1.5442826747894287, + "loss": 1.4941, + "odds_ratio_loss": 0.7178906202316284, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.1422327756881714, + "rewards/margins": 0.012195492163300514, + "rewards/rejected": -0.15442825853824615, + "sft_loss": 1.4223277568817139, + "step": 1540 + }, + { + "epoch": 2.7561680373416317, + "grad_norm": 1.5739907026290894, + "learning_rate": 7.985429422327384e-08, + "logits/chosen": -14.502885818481445, + "logits/rejected": -14.488322257995605, + "logps/chosen": -1.3892616033554077, + "logps/rejected": -1.461104154586792, + "loss": 1.4636, + "odds_ratio_loss": 0.743794322013855, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.138926163315773, + "rewards/margins": 0.007184277288615704, + "rewards/rejected": -0.1461104303598404, + "sft_loss": 1.3892616033554077, + "step": 1550 + }, + { + "epoch": 2.773949766614803, + "grad_norm": 1.2010091543197632, + "learning_rate": 6.859479115900818e-08, + "logits/chosen": -14.52282428741455, + "logits/rejected": -14.561497688293457, + "logps/chosen": -1.4343992471694946, + "logps/rejected": -1.5450398921966553, + "loss": 1.5068, + "odds_ratio_loss": 0.7239341735839844, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.14343991875648499, + "rewards/margins": 0.011064080521464348, + "rewards/rejected": -0.15450401604175568, + "sft_loss": 1.4343992471694946, + "step": 1560 + }, + { + "epoch": 2.791731495887975, + "grad_norm": 4.445690631866455, + "learning_rate": 5.817955720457902e-08, + "logits/chosen": -14.513537406921387, + "logits/rejected": -14.568794250488281, + "logps/chosen": -1.4571216106414795, + "logps/rejected": -1.440791130065918, + "loss": 1.5361, + "odds_ratio_loss": 0.7899585962295532, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.14571216702461243, + "rewards/margins": -0.0016330400248989463, + "rewards/rejected": -0.14407911896705627, + "sft_loss": 1.4571216106414795, + "step": 1570 + }, + { + "epoch": 2.809513225161147, + "grad_norm": 3.594475030899048, + "learning_rate": 4.861220889427199e-08, + "logits/chosen": -14.410969734191895, + "logits/rejected": -14.302682876586914, + "logps/chosen": -1.4560983180999756, + "logps/rejected": -1.5099445581436157, + "loss": 1.5329, + "odds_ratio_loss": 0.7683452367782593, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.14560985565185547, + "rewards/margins": 0.005384599789977074, + "rewards/rejected": -0.1509944498538971, + "sft_loss": 1.4560983180999756, + "step": 1580 + }, + { + "epoch": 2.827294954434319, + "grad_norm": 0.9240034818649292, + "learning_rate": 3.9896068346758074e-08, + "logits/chosen": -14.375950813293457, + "logits/rejected": -14.428239822387695, + "logps/chosen": -1.4449069499969482, + "logps/rejected": -1.507812738418579, + "loss": 1.519, + "odds_ratio_loss": 0.7404919266700745, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.14449068903923035, + "rewards/margins": 0.0062905787490308285, + "rewards/rejected": -0.1507812738418579, + "sft_loss": 1.4449069499969482, + "step": 1590 + }, + { + "epoch": 2.8450766837074903, + "grad_norm": 4.377365589141846, + "learning_rate": 3.203416211153832e-08, + "logits/chosen": -14.38147258758545, + "logits/rejected": -14.620170593261719, + "logps/chosen": -1.417458176612854, + "logps/rejected": -1.5342447757720947, + "loss": 1.491, + "odds_ratio_loss": 0.7356201410293579, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.14174583554267883, + "rewards/margins": 0.011678656563162804, + "rewards/rejected": -0.153424471616745, + "sft_loss": 1.417458176612854, + "step": 1600 + }, + { + "epoch": 2.8628584129806622, + "grad_norm": 5.88523006439209, + "learning_rate": 2.5029220118019393e-08, + "logits/chosen": -14.416727066040039, + "logits/rejected": -14.47779655456543, + "logps/chosen": -1.5760120153427124, + "logps/rejected": -1.5707772970199585, + "loss": 1.6535, + "odds_ratio_loss": 0.774849534034729, + "rewards/accuracies": 0.39375001192092896, + "rewards/chosen": -0.15760120749473572, + "rewards/margins": -0.0005234652198851109, + "rewards/rejected": -0.15707774460315704, + "sft_loss": 1.5760120153427124, + "step": 1610 + }, + { + "epoch": 2.880640142253834, + "grad_norm": 2.9929535388946533, + "learning_rate": 1.8883674727586122e-08, + "logits/chosen": -14.4614839553833, + "logits/rejected": -14.47071647644043, + "logps/chosen": -1.3790298700332642, + "logps/rejected": -1.5752503871917725, + "loss": 1.4464, + "odds_ratio_loss": 0.6737684607505798, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.13790300488471985, + "rewards/margins": 0.01962204836308956, + "rewards/rejected": -0.15752503275871277, + "sft_loss": 1.3790298700332642, + "step": 1620 + }, + { + "epoch": 2.898421871527006, + "grad_norm": 0.8867095112800598, + "learning_rate": 1.3599659889000639e-08, + "logits/chosen": -14.817098617553711, + "logits/rejected": -14.705721855163574, + "logps/chosen": -1.4359638690948486, + "logps/rejected": -1.4747111797332764, + "loss": 1.5093, + "odds_ratio_loss": 0.7331098914146423, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.14359638094902039, + "rewards/margins": 0.0038747270591557026, + "rewards/rejected": -0.1474711149930954, + "sft_loss": 1.4359638690948486, + "step": 1630 + }, + { + "epoch": 2.916203600800178, + "grad_norm": 2.0239081382751465, + "learning_rate": 9.179010397421528e-09, + "logits/chosen": -14.512980461120605, + "logits/rejected": -14.645942687988281, + "logps/chosen": -1.4728834629058838, + "logps/rejected": -1.5710668563842773, + "loss": 1.5439, + "odds_ratio_loss": 0.709987998008728, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.14728835225105286, + "rewards/margins": 0.00981833878904581, + "rewards/rejected": -0.1571066826581955, + "sft_loss": 1.4728834629058838, + "step": 1640 + }, + { + "epoch": 2.93398533007335, + "grad_norm": 3.0479774475097656, + "learning_rate": 5.623261257296509e-09, + "logits/chosen": -14.522771835327148, + "logits/rejected": -14.621217727661133, + "logps/chosen": -1.3921829462051392, + "logps/rejected": -1.5190517902374268, + "loss": 1.461, + "odds_ratio_loss": 0.6878436803817749, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.1392182856798172, + "rewards/margins": 0.012686875648796558, + "rewards/rejected": -0.15190517902374268, + "sft_loss": 1.3921829462051392, + "step": 1650 + }, + { + "epoch": 2.9517670593465217, + "grad_norm": 1.4828240871429443, + "learning_rate": 2.933647149357122e-09, + "logits/chosen": -14.36890697479248, + "logits/rejected": -14.4072847366333, + "logps/chosen": -1.3632400035858154, + "logps/rejected": -1.4689347743988037, + "loss": 1.4361, + "odds_ratio_loss": 0.7281399369239807, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.13632400333881378, + "rewards/margins": 0.010569497011601925, + "rewards/rejected": -0.1468934863805771, + "sft_loss": 1.3632400035858154, + "step": 1660 + }, + { + "epoch": 2.969548788619693, + "grad_norm": 1.2580466270446777, + "learning_rate": 1.1111020018930717e-09, + "logits/chosen": -14.664459228515625, + "logits/rejected": -14.4978666305542, + "logps/chosen": -1.4255958795547485, + "logps/rejected": -1.4593638181686401, + "loss": 1.5006, + "odds_ratio_loss": 0.7499974966049194, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.14255960285663605, + "rewards/margins": 0.0033767870627343655, + "rewards/rejected": -0.14593638479709625, + "sft_loss": 1.4255958795547485, + "step": 1670 + }, + { + "epoch": 2.987330517892865, + "grad_norm": 1.022547960281372, + "learning_rate": 1.5625866646051813e-10, + "logits/chosen": -14.368474960327148, + "logits/rejected": -14.423803329467773, + "logps/chosen": -1.413266897201538, + "logps/rejected": -1.573032021522522, + "loss": 1.4799, + "odds_ratio_loss": 0.6666654348373413, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1413266807794571, + "rewards/margins": 0.015976523980498314, + "rewards/rejected": -0.15730319917201996, + "sft_loss": 1.413266897201538, + "step": 1680 + }, + { + "epoch": 2.997999555456768, + "step": 1686, + "total_flos": 1.8091810238164992e+18, + "train_loss": 1.5885293396059446, + "train_runtime": 25020.7826, + "train_samples_per_second": 1.079, + "train_steps_per_second": 0.067 + } + ], + "logging_steps": 10, + "max_steps": 1686, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 1.8091810238164992e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}