Falcon-7B-Instruct-ORPO / trainer_state.json
chchen's picture
End of training
7e8058a verified
raw
history blame contribute delete
No virus
107 kB
{
"best_metric": 1.5154520273208618,
"best_model_checkpoint": "saves/Falcon-7B-Instruct/lora/orpo/checkpoint-1500",
"epoch": 2.997999555456768,
"eval_steps": 500,
"global_step": 1686,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017781729273171815,
"grad_norm": 0.45023536682128906,
"learning_rate": 4.9995745934141085e-06,
"logits/chosen": -14.31452751159668,
"logits/rejected": -14.272933959960938,
"logps/chosen": -1.777596116065979,
"logps/rejected": -1.814857840538025,
"loss": 1.8528,
"odds_ratio_loss": 0.7518970966339111,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.17775960266590118,
"rewards/margins": 0.0037261671386659145,
"rewards/rejected": -0.18148578703403473,
"sft_loss": 1.777596116065979,
"step": 10
},
{
"epoch": 0.03556345854634363,
"grad_norm": 0.6821511387825012,
"learning_rate": 4.9982812903243405e-06,
"logits/chosen": -14.213617324829102,
"logits/rejected": -14.412919998168945,
"logps/chosen": -1.9183998107910156,
"logps/rejected": -1.8259010314941406,
"loss": 2.0025,
"odds_ratio_loss": 0.8412569761276245,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.19183996319770813,
"rewards/margins": -0.009249850176274776,
"rewards/rejected": -0.18259011209011078,
"sft_loss": 1.9183998107910156,
"step": 20
},
{
"epoch": 0.05334518781951545,
"grad_norm": 0.50360107421875,
"learning_rate": 4.996120496405222e-06,
"logits/chosen": -14.275195121765137,
"logits/rejected": -14.341901779174805,
"logps/chosen": -1.8644087314605713,
"logps/rejected": -2.0387845039367676,
"loss": 1.9359,
"odds_ratio_loss": 0.7153545022010803,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1864408552646637,
"rewards/margins": 0.01743762008845806,
"rewards/rejected": -0.2038784772157669,
"sft_loss": 1.8644087314605713,
"step": 30
},
{
"epoch": 0.07112691709268726,
"grad_norm": 0.6971050500869751,
"learning_rate": 4.99309296196014e-06,
"logits/chosen": -14.182516098022461,
"logits/rejected": -14.20283317565918,
"logps/chosen": -1.9314234256744385,
"logps/rejected": -1.899929404258728,
"loss": 2.0128,
"odds_ratio_loss": 0.8141088485717773,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.1931423395872116,
"rewards/margins": -0.003149367868900299,
"rewards/rejected": -0.1899929791688919,
"sft_loss": 1.9314234256744385,
"step": 40
},
{
"epoch": 0.08890864636585907,
"grad_norm": 0.5635890960693359,
"learning_rate": 4.989199738255166e-06,
"logits/chosen": -14.374763488769531,
"logits/rejected": -14.155324935913086,
"logps/chosen": -1.9009517431259155,
"logps/rejected": -1.9379255771636963,
"loss": 1.9795,
"odds_ratio_loss": 0.7854829430580139,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.19009515643119812,
"rewards/margins": 0.003697408363223076,
"rewards/rejected": -0.19379255175590515,
"sft_loss": 1.9009517431259155,
"step": 50
},
{
"epoch": 0.1066903756390309,
"grad_norm": 0.6436208486557007,
"learning_rate": 4.984442177154031e-06,
"logits/chosen": -14.250883102416992,
"logits/rejected": -14.289509773254395,
"logps/chosen": -1.9730733633041382,
"logps/rejected": -2.040274143218994,
"loss": 2.0526,
"odds_ratio_loss": 0.7955271601676941,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.19730734825134277,
"rewards/margins": 0.006720039062201977,
"rewards/rejected": -0.20402738451957703,
"sft_loss": 1.9730733633041382,
"step": 60
},
{
"epoch": 0.12447210491220272,
"grad_norm": 0.4920930862426758,
"learning_rate": 4.978821930648704e-06,
"logits/chosen": -14.044062614440918,
"logits/rejected": -14.116564750671387,
"logps/chosen": -1.9218193292617798,
"logps/rejected": -1.7758527994155884,
"loss": 2.0142,
"odds_ratio_loss": 0.9241151809692383,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.19218194484710693,
"rewards/margins": -0.014596650376915932,
"rewards/rejected": -0.17758527398109436,
"sft_loss": 1.9218193292617798,
"step": 70
},
{
"epoch": 0.14225383418537452,
"grad_norm": 0.6744620203971863,
"learning_rate": 4.97234095028576e-06,
"logits/chosen": -14.337008476257324,
"logits/rejected": -14.242892265319824,
"logps/chosen": -1.873708963394165,
"logps/rejected": -1.8873860836029053,
"loss": 1.9505,
"odds_ratio_loss": 0.767748236656189,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.1873709261417389,
"rewards/margins": 0.0013677121605724096,
"rewards/rejected": -0.1887386292219162,
"sft_loss": 1.873708963394165,
"step": 80
},
{
"epoch": 0.16003556345854633,
"grad_norm": 0.48053959012031555,
"learning_rate": 4.965001486488743e-06,
"logits/chosen": -14.241889953613281,
"logits/rejected": -14.200053215026855,
"logps/chosen": -1.7481162548065186,
"logps/rejected": -1.7742578983306885,
"loss": 1.8239,
"odds_ratio_loss": 0.7575067281723022,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.17481163144111633,
"rewards/margins": 0.0026141509879380465,
"rewards/rejected": -0.17742577195167542,
"sft_loss": 1.7481162548065186,
"step": 90
},
{
"epoch": 0.17781729273171815,
"grad_norm": 0.7781735062599182,
"learning_rate": 4.956806087776732e-06,
"logits/chosen": -14.596258163452148,
"logits/rejected": -14.529101371765137,
"logps/chosen": -1.8169043064117432,
"logps/rejected": -1.9338127374649048,
"loss": 1.8891,
"odds_ratio_loss": 0.7220322489738464,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.18169045448303223,
"rewards/margins": 0.01169085968285799,
"rewards/rejected": -0.19338129460811615,
"sft_loss": 1.8169043064117432,
"step": 100
},
{
"epoch": 0.19559902200489,
"grad_norm": 0.845944881439209,
"learning_rate": 4.947757599879411e-06,
"logits/chosen": -14.283439636230469,
"logits/rejected": -14.439828872680664,
"logps/chosen": -1.7649319171905518,
"logps/rejected": -1.82681405544281,
"loss": 1.8394,
"odds_ratio_loss": 0.7447811961174011,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.17649319767951965,
"rewards/margins": 0.0061882128939032555,
"rewards/rejected": -0.1826813966035843,
"sft_loss": 1.7649319171905518,
"step": 110
},
{
"epoch": 0.2133807512780618,
"grad_norm": 0.6829086542129517,
"learning_rate": 4.937859164748931e-06,
"logits/chosen": -14.171781539916992,
"logits/rejected": -14.2664155960083,
"logps/chosen": -1.65048348903656,
"logps/rejected": -1.6755653619766235,
"loss": 1.7267,
"odds_ratio_loss": 0.7621053457260132,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.16504836082458496,
"rewards/margins": 0.00250818463973701,
"rewards/rejected": -0.1675565242767334,
"sft_loss": 1.65048348903656,
"step": 120
},
{
"epoch": 0.23116248055123362,
"grad_norm": 0.6720818877220154,
"learning_rate": 4.92711421946891e-06,
"logits/chosen": -14.323086738586426,
"logits/rejected": -13.87572956085205,
"logps/chosen": -1.6958109140396118,
"logps/rejected": -1.8111820220947266,
"loss": 1.7688,
"odds_ratio_loss": 0.7296444177627563,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.1695810854434967,
"rewards/margins": 0.011537122540175915,
"rewards/rejected": -0.1811182051897049,
"sft_loss": 1.6958109140396118,
"step": 130
},
{
"epoch": 0.24894420982440543,
"grad_norm": 1.665626049041748,
"learning_rate": 4.915526495060961e-06,
"logits/chosen": -14.461613655090332,
"logits/rejected": -14.22163200378418,
"logps/chosen": -1.670013666152954,
"logps/rejected": -1.7781444787979126,
"loss": 1.7433,
"odds_ratio_loss": 0.7326729893684387,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.1670013815164566,
"rewards/margins": 0.010813087224960327,
"rewards/rejected": -0.17781445384025574,
"sft_loss": 1.670013666152954,
"step": 140
},
{
"epoch": 0.26672593909757725,
"grad_norm": 1.3471460342407227,
"learning_rate": 4.903100015189153e-06,
"logits/chosen": -14.236448287963867,
"logits/rejected": -14.436059951782227,
"logps/chosen": -1.6695849895477295,
"logps/rejected": -1.7570854425430298,
"loss": 1.7435,
"odds_ratio_loss": 0.7395648956298828,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.1669585108757019,
"rewards/margins": 0.008750038221478462,
"rewards/rejected": -0.17570854723453522,
"sft_loss": 1.6695849895477295,
"step": 150
},
{
"epoch": 0.28450766837074903,
"grad_norm": 1.0652554035186768,
"learning_rate": 4.889839094762848e-06,
"logits/chosen": -14.326433181762695,
"logits/rejected": -14.2631196975708,
"logps/chosen": -1.6757389307022095,
"logps/rejected": -1.8023579120635986,
"loss": 1.7489,
"odds_ratio_loss": 0.7316839098930359,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.16757391393184662,
"rewards/margins": 0.012661868706345558,
"rewards/rejected": -0.18023577332496643,
"sft_loss": 1.6757389307022095,
"step": 160
},
{
"epoch": 0.3022893976439209,
"grad_norm": 1.0970226526260376,
"learning_rate": 4.875748338438416e-06,
"logits/chosen": -14.249468803405762,
"logits/rejected": -14.319056510925293,
"logps/chosen": -1.6604044437408447,
"logps/rejected": -1.6869754791259766,
"loss": 1.7344,
"odds_ratio_loss": 0.7400213479995728,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.16604045033454895,
"rewards/margins": 0.0026570975314825773,
"rewards/rejected": -0.1686975508928299,
"sft_loss": 1.6604044437408447,
"step": 170
},
{
"epoch": 0.32007112691709266,
"grad_norm": 0.7075946927070618,
"learning_rate": 4.8608326390203386e-06,
"logits/chosen": -14.197771072387695,
"logits/rejected": -14.143452644348145,
"logps/chosen": -1.6201465129852295,
"logps/rejected": -1.7580602169036865,
"loss": 1.6907,
"odds_ratio_loss": 0.7053945660591125,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.1620146632194519,
"rewards/margins": 0.013791357167065144,
"rewards/rejected": -0.17580604553222656,
"sft_loss": 1.6201465129852295,
"step": 180
},
{
"epoch": 0.3378528561902645,
"grad_norm": 1.0393530130386353,
"learning_rate": 4.845097175762251e-06,
"logits/chosen": -14.362152099609375,
"logits/rejected": -14.374476432800293,
"logps/chosen": -1.6125462055206299,
"logps/rejected": -1.6287224292755127,
"loss": 1.6892,
"odds_ratio_loss": 0.7666895985603333,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.1612546145915985,
"rewards/margins": 0.001617613947018981,
"rewards/rejected": -0.16287222504615784,
"sft_loss": 1.6125462055206299,
"step": 190
},
{
"epoch": 0.3556345854634363,
"grad_norm": 1.0453855991363525,
"learning_rate": 4.8285474125685286e-06,
"logits/chosen": -14.311877250671387,
"logits/rejected": -14.24933910369873,
"logps/chosen": -1.6674257516860962,
"logps/rejected": -1.6954532861709595,
"loss": 1.7433,
"odds_ratio_loss": 0.7591363191604614,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.16674259305000305,
"rewards/margins": 0.0028027337975800037,
"rewards/rejected": -0.16954532265663147,
"sft_loss": 1.6674257516860962,
"step": 200
},
{
"epoch": 0.37341631473660813,
"grad_norm": 1.7740540504455566,
"learning_rate": 4.811189096097025e-06,
"logits/chosen": -14.10380744934082,
"logits/rejected": -14.06958293914795,
"logps/chosen": -1.6661155223846436,
"logps/rejected": -1.7266288995742798,
"loss": 1.7425,
"odds_ratio_loss": 0.7635276913642883,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.16661155223846436,
"rewards/margins": 0.006051325239241123,
"rewards/rejected": -0.1726628839969635,
"sft_loss": 1.6661155223846436,
"step": 210
},
{
"epoch": 0.39119804400978,
"grad_norm": 1.2109721899032593,
"learning_rate": 4.793028253763633e-06,
"logits/chosen": -14.37977123260498,
"logits/rejected": -14.255584716796875,
"logps/chosen": -1.5491920709609985,
"logps/rejected": -1.6557328701019287,
"loss": 1.6259,
"odds_ratio_loss": 0.7670000791549683,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.15491920709609985,
"rewards/margins": 0.010654089972376823,
"rewards/rejected": -0.16557331383228302,
"sft_loss": 1.5491920709609985,
"step": 220
},
{
"epoch": 0.40897977328295176,
"grad_norm": 1.2553755044937134,
"learning_rate": 4.774071191649352e-06,
"logits/chosen": -14.052825927734375,
"logits/rejected": -14.052751541137695,
"logps/chosen": -1.540856122970581,
"logps/rejected": -1.742560625076294,
"loss": 1.6084,
"odds_ratio_loss": 0.6751853227615356,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.15408562123775482,
"rewards/margins": 0.0201704278588295,
"rewards/rejected": -0.17425604164600372,
"sft_loss": 1.540856122970581,
"step": 230
},
{
"epoch": 0.4267615025561236,
"grad_norm": 1.4113616943359375,
"learning_rate": 4.7543244923105975e-06,
"logits/chosen": -14.20154094696045,
"logits/rejected": -14.3230619430542,
"logps/chosen": -1.6533008813858032,
"logps/rejected": -1.6262308359146118,
"loss": 1.7354,
"odds_ratio_loss": 0.8208959698677063,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.16533009707927704,
"rewards/margins": -0.002707002917304635,
"rewards/rejected": -0.1626230925321579,
"sft_loss": 1.6533008813858032,
"step": 240
},
{
"epoch": 0.4445432318292954,
"grad_norm": 0.9963915348052979,
"learning_rate": 4.733795012493506e-06,
"logits/chosen": -14.148083686828613,
"logits/rejected": -14.310300827026367,
"logps/chosen": -1.6552881002426147,
"logps/rejected": -1.6675243377685547,
"loss": 1.7326,
"odds_ratio_loss": 0.7732909321784973,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.165528804063797,
"rewards/margins": 0.0012236315524205565,
"rewards/rejected": -0.16675245761871338,
"sft_loss": 1.6552881002426147,
"step": 250
},
{
"epoch": 0.46232496110246724,
"grad_norm": 0.9358872175216675,
"learning_rate": 4.712489880753035e-06,
"logits/chosen": -14.420260429382324,
"logits/rejected": -14.414996147155762,
"logps/chosen": -1.4923756122589111,
"logps/rejected": -1.5968248844146729,
"loss": 1.5619,
"odds_ratio_loss": 0.6949405670166016,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.14923755824565887,
"rewards/margins": 0.010444932617247105,
"rewards/rejected": -0.1596824824810028,
"sft_loss": 1.4923756122589111,
"step": 260
},
{
"epoch": 0.480106690375639,
"grad_norm": 2.065678596496582,
"learning_rate": 4.690416494977673e-06,
"logits/chosen": -14.560025215148926,
"logits/rejected": -14.5877103805542,
"logps/chosen": -1.5790516138076782,
"logps/rejected": -1.7484729290008545,
"loss": 1.6496,
"odds_ratio_loss": 0.7055513858795166,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.15790514647960663,
"rewards/margins": 0.016942132264375687,
"rewards/rejected": -0.1748472899198532,
"sft_loss": 1.5790516138076782,
"step": 270
},
{
"epoch": 0.49788841964881086,
"grad_norm": 3.234992265701294,
"learning_rate": 4.667582519820639e-06,
"logits/chosen": -14.247453689575195,
"logits/rejected": -14.422063827514648,
"logps/chosen": -1.5711301565170288,
"logps/rejected": -1.6209745407104492,
"loss": 1.6445,
"odds_ratio_loss": 0.7338452339172363,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.1571130007505417,
"rewards/margins": 0.0049844542518258095,
"rewards/rejected": -0.16209746897220612,
"sft_loss": 1.5711301565170288,
"step": 280
},
{
"epoch": 0.5156701489219827,
"grad_norm": 1.1059269905090332,
"learning_rate": 4.643995884038443e-06,
"logits/chosen": -14.227750778198242,
"logits/rejected": -14.276082038879395,
"logps/chosen": -1.6161178350448608,
"logps/rejected": -1.7087091207504272,
"loss": 1.6894,
"odds_ratio_loss": 0.7330858111381531,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.16161179542541504,
"rewards/margins": 0.009259124286472797,
"rewards/rejected": -0.17087092995643616,
"sft_loss": 1.6161178350448608,
"step": 290
},
{
"epoch": 0.5334518781951545,
"grad_norm": 2.079979658126831,
"learning_rate": 4.6196647777377475e-06,
"logits/chosen": -14.415349960327148,
"logits/rejected": -14.31702995300293,
"logps/chosen": -1.5473922491073608,
"logps/rejected": -1.5794051885604858,
"loss": 1.6226,
"odds_ratio_loss": 0.752013087272644,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -0.15473923087120056,
"rewards/margins": 0.0032012953888624907,
"rewards/rejected": -0.15794052183628082,
"sft_loss": 1.5473922491073608,
"step": 300
},
{
"epoch": 0.5512336074683263,
"grad_norm": 0.9750655889511108,
"learning_rate": 4.59459764953147e-06,
"logits/chosen": -14.490264892578125,
"logits/rejected": -14.242512702941895,
"logps/chosen": -1.6246612071990967,
"logps/rejected": -1.6596574783325195,
"loss": 1.6988,
"odds_ratio_loss": 0.7414273619651794,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.1624661237001419,
"rewards/margins": 0.00349963316693902,
"rewards/rejected": -0.1659657508134842,
"sft_loss": 1.6246612071990967,
"step": 310
},
{
"epoch": 0.5690153367414981,
"grad_norm": 1.211684226989746,
"learning_rate": 4.568803203605133e-06,
"logits/chosen": -14.534784317016602,
"logits/rejected": -14.392961502075195,
"logps/chosen": -1.6170380115509033,
"logps/rejected": -1.6397918462753296,
"loss": 1.6966,
"odds_ratio_loss": 0.7957952618598938,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.16170379519462585,
"rewards/margins": 0.0022753949742764235,
"rewards/rejected": -0.1639791876077652,
"sft_loss": 1.6170380115509033,
"step": 320
},
{
"epoch": 0.58679706601467,
"grad_norm": 2.359046459197998,
"learning_rate": 4.542290396694462e-06,
"logits/chosen": -14.300097465515137,
"logits/rejected": -14.317700386047363,
"logps/chosen": -1.5104106664657593,
"logps/rejected": -1.5948156118392944,
"loss": 1.5853,
"odds_ratio_loss": 0.7493588328361511,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.15104106068611145,
"rewards/margins": 0.008440487086772919,
"rewards/rejected": -0.15948157012462616,
"sft_loss": 1.5104106664657593,
"step": 330
},
{
"epoch": 0.6045787952878418,
"grad_norm": 1.9586892127990723,
"learning_rate": 4.515068434975298e-06,
"logits/chosen": -14.25054931640625,
"logits/rejected": -14.31701946258545,
"logps/chosen": -1.544409990310669,
"logps/rejected": -1.6858068704605103,
"loss": 1.6155,
"odds_ratio_loss": 0.7111681699752808,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.1544409841299057,
"rewards/margins": 0.014139704406261444,
"rewards/rejected": -0.16858068108558655,
"sft_loss": 1.544409990310669,
"step": 340
},
{
"epoch": 0.6223605245610135,
"grad_norm": 0.8323342800140381,
"learning_rate": 4.487146770866887e-06,
"logits/chosen": -14.46008586883545,
"logits/rejected": -14.50438404083252,
"logps/chosen": -1.6048237085342407,
"logps/rejected": -1.6058448553085327,
"loss": 1.6804,
"odds_ratio_loss": 0.7562613487243652,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.16048237681388855,
"rewards/margins": 0.00010212887718807906,
"rewards/rejected": -0.16058449447155,
"sft_loss": 1.6048237085342407,
"step": 350
},
{
"epoch": 0.6401422538341853,
"grad_norm": 1.1129921674728394,
"learning_rate": 4.458535099749666e-06,
"logits/chosen": -14.248858451843262,
"logits/rejected": -14.238241195678711,
"logps/chosen": -1.6117630004882812,
"logps/rejected": -1.6199924945831299,
"loss": 1.6917,
"odds_ratio_loss": 0.799778938293457,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.16117632389068604,
"rewards/margins": 0.0008229434606619179,
"rewards/rejected": -0.16199925541877747,
"sft_loss": 1.6117630004882812,
"step": 360
},
{
"epoch": 0.6579239831073572,
"grad_norm": 0.7189633250236511,
"learning_rate": 4.429243356598694e-06,
"logits/chosen": -14.405240058898926,
"logits/rejected": -14.393125534057617,
"logps/chosen": -1.485335111618042,
"logps/rejected": -1.6254221200942993,
"loss": 1.5568,
"odds_ratio_loss": 0.7148580551147461,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.14853352308273315,
"rewards/margins": 0.014008693397045135,
"rewards/rejected": -0.1625421941280365,
"sft_loss": 1.485335111618042,
"step": 370
},
{
"epoch": 0.675705712380529,
"grad_norm": 1.066667914390564,
"learning_rate": 4.399281712533875e-06,
"logits/chosen": -14.525976181030273,
"logits/rejected": -14.461471557617188,
"logps/chosen": -1.4982891082763672,
"logps/rejected": -1.5271437168121338,
"loss": 1.5745,
"odds_ratio_loss": 0.7622246146202087,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.14982891082763672,
"rewards/margins": 0.00288546085357666,
"rewards/rejected": -0.15271437168121338,
"sft_loss": 1.4982891082763672,
"step": 380
},
{
"epoch": 0.6934874416537008,
"grad_norm": 1.138702392578125,
"learning_rate": 4.368660571288192e-06,
"logits/chosen": -14.477781295776367,
"logits/rejected": -14.516647338867188,
"logps/chosen": -1.5524108409881592,
"logps/rejected": -1.5919525623321533,
"loss": 1.6311,
"odds_ratio_loss": 0.7873157262802124,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.15524108707904816,
"rewards/margins": 0.003954165615141392,
"rewards/rejected": -0.15919525921344757,
"sft_loss": 1.5524108409881592,
"step": 390
},
{
"epoch": 0.7112691709268726,
"grad_norm": 0.8798184394836426,
"learning_rate": 4.337390565595163e-06,
"logits/chosen": -14.244585037231445,
"logits/rejected": -14.454017639160156,
"logps/chosen": -1.5560581684112549,
"logps/rejected": -1.6040303707122803,
"loss": 1.6304,
"odds_ratio_loss": 0.743826150894165,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.15560582280158997,
"rewards/margins": 0.004797201603651047,
"rewards/rejected": -0.1604030430316925,
"sft_loss": 1.5560581684112549,
"step": 400
},
{
"epoch": 0.7290509002000445,
"grad_norm": 1.082557201385498,
"learning_rate": 4.305482553496786e-06,
"logits/chosen": -14.316876411437988,
"logits/rejected": -14.236166000366211,
"logps/chosen": -1.5271369218826294,
"logps/rejected": -1.5710500478744507,
"loss": 1.6033,
"odds_ratio_loss": 0.7612074613571167,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.15271370112895966,
"rewards/margins": 0.004391324706375599,
"rewards/rejected": -0.15710501372814178,
"sft_loss": 1.5271369218826294,
"step": 410
},
{
"epoch": 0.7468326294732163,
"grad_norm": 1.7793493270874023,
"learning_rate": 4.272947614573244e-06,
"logits/chosen": -14.312704086303711,
"logits/rejected": -14.391843795776367,
"logps/chosen": -1.6422202587127686,
"logps/rejected": -1.6885766983032227,
"loss": 1.7183,
"odds_ratio_loss": 0.761288046836853,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.16422203183174133,
"rewards/margins": 0.004635652992874384,
"rewards/rejected": -0.16885769367218018,
"sft_loss": 1.6422202587127686,
"step": 420
},
{
"epoch": 0.7646143587463881,
"grad_norm": 0.7179546356201172,
"learning_rate": 4.23979704609569e-06,
"logits/chosen": -14.418655395507812,
"logits/rejected": -14.451852798461914,
"logps/chosen": -1.5165865421295166,
"logps/rejected": -1.5794544219970703,
"loss": 1.5868,
"odds_ratio_loss": 0.7019873857498169,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.15165865421295166,
"rewards/margins": 0.006286793854087591,
"rewards/rejected": -0.157945454120636,
"sft_loss": 1.5165865421295166,
"step": 430
},
{
"epoch": 0.78239608801956,
"grad_norm": 1.1082905530929565,
"learning_rate": 4.206042359103435e-06,
"logits/chosen": -14.335638046264648,
"logits/rejected": -14.379010200500488,
"logps/chosen": -1.6033432483673096,
"logps/rejected": -1.648553490638733,
"loss": 1.6799,
"odds_ratio_loss": 0.7652724981307983,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.16033433377742767,
"rewards/margins": 0.004520997405052185,
"rewards/rejected": -0.16485533118247986,
"sft_loss": 1.6033432483673096,
"step": 440
},
{
"epoch": 0.8001778172927317,
"grad_norm": 1.853089690208435,
"learning_rate": 4.17169527440691e-06,
"logits/chosen": -14.423266410827637,
"logits/rejected": -14.40173625946045,
"logps/chosen": -1.62055242061615,
"logps/rejected": -1.6249053478240967,
"loss": 1.6973,
"odds_ratio_loss": 0.7678921222686768,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.16205522418022156,
"rewards/margins": 0.00043528637615963817,
"rewards/rejected": -0.16249051690101624,
"sft_loss": 1.62055242061615,
"step": 450
},
{
"epoch": 0.8179595465659035,
"grad_norm": 1.930782675743103,
"learning_rate": 4.136767718517797e-06,
"logits/chosen": -14.467844009399414,
"logits/rejected": -14.397687911987305,
"logps/chosen": -1.423508882522583,
"logps/rejected": -1.5464013814926147,
"loss": 1.4918,
"odds_ratio_loss": 0.6831967830657959,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.14235089719295502,
"rewards/margins": 0.012289242818951607,
"rewards/rejected": -0.15464013814926147,
"sft_loss": 1.423508882522583,
"step": 460
},
{
"epoch": 0.8357412758390753,
"grad_norm": 4.023218631744385,
"learning_rate": 4.1012718195077196e-06,
"logits/chosen": -14.457601547241211,
"logits/rejected": -14.645208358764648,
"logps/chosen": -1.5619157552719116,
"logps/rejected": -1.5855516195297241,
"loss": 1.6365,
"odds_ratio_loss": 0.7463001608848572,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.15619158744812012,
"rewards/margins": 0.0023635749239474535,
"rewards/rejected": -0.15855516493320465,
"sft_loss": 1.5619157552719116,
"step": 470
},
{
"epoch": 0.8535230051122472,
"grad_norm": 3.4996113777160645,
"learning_rate": 4.065219902796953e-06,
"logits/chosen": -14.268010139465332,
"logits/rejected": -14.2593994140625,
"logps/chosen": -1.537496566772461,
"logps/rejected": -1.6044343709945679,
"loss": 1.6152,
"odds_ratio_loss": 0.7775283455848694,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.15374964475631714,
"rewards/margins": 0.006693786941468716,
"rewards/rejected": -0.16044344007968903,
"sft_loss": 1.537496566772461,
"step": 480
},
{
"epoch": 0.871304734385419,
"grad_norm": 0.903472900390625,
"learning_rate": 4.028624486874608e-06,
"logits/chosen": -14.208300590515137,
"logits/rejected": -14.410181045532227,
"logps/chosen": -1.4613759517669678,
"logps/rejected": -1.5640151500701904,
"loss": 1.5343,
"odds_ratio_loss": 0.728947103023529,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.14613759517669678,
"rewards/margins": 0.01026391051709652,
"rewards/rejected": -0.15640152990818024,
"sft_loss": 1.4613759517669678,
"step": 490
},
{
"epoch": 0.8890864636585908,
"grad_norm": 0.9934778809547424,
"learning_rate": 3.99149827895177e-06,
"logits/chosen": -14.429773330688477,
"logits/rejected": -14.360036849975586,
"logps/chosen": -1.5578300952911377,
"logps/rejected": -1.5907199382781982,
"loss": 1.6309,
"odds_ratio_loss": 0.7302489280700684,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.155783012509346,
"rewards/margins": 0.0032889836002141237,
"rewards/rejected": -0.15907198190689087,
"sft_loss": 1.5578300952911377,
"step": 500
},
{
"epoch": 0.8890864636585908,
"eval_logits/chosen": -14.421330451965332,
"eval_logits/rejected": -14.49679946899414,
"eval_logps/chosen": -1.5096371173858643,
"eval_logps/rejected": -1.598750114440918,
"eval_loss": 1.5815595388412476,
"eval_odds_ratio_loss": 0.7192248106002808,
"eval_rewards/accuracies": 0.49399998784065247,
"eval_rewards/chosen": -0.150963693857193,
"eval_rewards/margins": 0.008911306038498878,
"eval_rewards/rejected": -0.1598750203847885,
"eval_runtime": 203.6844,
"eval_samples_per_second": 4.91,
"eval_sft_loss": 1.5096371173858643,
"eval_steps_per_second": 2.455,
"step": 500
},
{
"epoch": 0.9068681929317626,
"grad_norm": 1.555288314819336,
"learning_rate": 3.953854170549114e-06,
"logits/chosen": -14.514457702636719,
"logits/rejected": -14.522333145141602,
"logps/chosen": -1.5636659860610962,
"logps/rejected": -1.554619550704956,
"loss": 1.6424,
"odds_ratio_loss": 0.7876118421554565,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.15636660158634186,
"rewards/margins": -0.0009046423947438598,
"rewards/rejected": -0.15546198189258575,
"sft_loss": 1.5636659860610962,
"step": 510
},
{
"epoch": 0.9246499222049345,
"grad_norm": 1.688568353652954,
"learning_rate": 3.91570523302051e-06,
"logits/chosen": -14.493148803710938,
"logits/rejected": -14.430615425109863,
"logps/chosen": -1.4217069149017334,
"logps/rejected": -1.5474450588226318,
"loss": 1.4923,
"odds_ratio_loss": 0.7059618830680847,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.14217069745063782,
"rewards/margins": 0.012573805637657642,
"rewards/rejected": -0.15474450588226318,
"sft_loss": 1.4217069149017334,
"step": 520
},
{
"epoch": 0.9424316514781063,
"grad_norm": 1.1463229656219482,
"learning_rate": 3.8770647130141996e-06,
"logits/chosen": -14.525823593139648,
"logits/rejected": -14.399996757507324,
"logps/chosen": -1.4726465940475464,
"logps/rejected": -1.5658392906188965,
"loss": 1.5466,
"odds_ratio_loss": 0.7396677136421204,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.14726465940475464,
"rewards/margins": 0.009319271892309189,
"rewards/rejected": -0.15658393502235413,
"sft_loss": 1.4726465940475464,
"step": 530
},
{
"epoch": 0.960213380751278,
"grad_norm": 2.26594877243042,
"learning_rate": 3.837946027873086e-06,
"logits/chosen": -14.543024063110352,
"logits/rejected": -14.401150703430176,
"logps/chosen": -1.5846188068389893,
"logps/rejected": -1.6536743640899658,
"loss": 1.661,
"odds_ratio_loss": 0.7640754580497742,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.15846188366413116,
"rewards/margins": 0.006905559450387955,
"rewards/rejected": -0.16536743938922882,
"sft_loss": 1.5846188068389893,
"step": 540
},
{
"epoch": 0.9779951100244498,
"grad_norm": 1.5690348148345947,
"learning_rate": 3.7983627609757713e-06,
"logits/chosen": -14.507429122924805,
"logits/rejected": -14.452433586120605,
"logps/chosen": -1.5472663640975952,
"logps/rejected": -1.544019103050232,
"loss": 1.6249,
"odds_ratio_loss": 0.7762556076049805,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -0.15472662448883057,
"rewards/margins": -0.0003247327113058418,
"rewards/rejected": -0.15440191328525543,
"sft_loss": 1.5472663640975952,
"step": 550
},
{
"epoch": 0.9957768392976217,
"grad_norm": 1.6722140312194824,
"learning_rate": 3.758328657019924e-06,
"logits/chosen": -14.339811325073242,
"logits/rejected": -14.203923225402832,
"logps/chosen": -1.4747674465179443,
"logps/rejected": -1.5587810277938843,
"loss": 1.5469,
"odds_ratio_loss": 0.7211123704910278,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.14747674763202667,
"rewards/margins": 0.008401353843510151,
"rewards/rejected": -0.15587811172008514,
"sft_loss": 1.4747674465179443,
"step": 560
},
{
"epoch": 1.0135585685707935,
"grad_norm": 4.022447109222412,
"learning_rate": 3.717857617249642e-06,
"logits/chosen": -14.2714204788208,
"logits/rejected": -14.344494819641113,
"logps/chosen": -1.5280239582061768,
"logps/rejected": -1.6387897729873657,
"loss": 1.6007,
"odds_ratio_loss": 0.72718346118927,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.15280239284038544,
"rewards/margins": 0.011076575145125389,
"rewards/rejected": -0.16387899219989777,
"sft_loss": 1.5280239582061768,
"step": 570
},
{
"epoch": 1.0313402978439654,
"grad_norm": 0.8425918817520142,
"learning_rate": 3.6769636946284543e-06,
"logits/chosen": -14.346611022949219,
"logits/rejected": -14.158876419067383,
"logps/chosen": -1.3988748788833618,
"logps/rejected": -1.519100308418274,
"loss": 1.4702,
"odds_ratio_loss": 0.7135868072509766,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.1398874968290329,
"rewards/margins": 0.012022523209452629,
"rewards/rejected": -0.15191002190113068,
"sft_loss": 1.3988748788833618,
"step": 580
},
{
"epoch": 1.049122027117137,
"grad_norm": 1.2780736684799194,
"learning_rate": 3.6356610889596355e-06,
"logits/chosen": -14.491503715515137,
"logits/rejected": -14.515363693237305,
"logps/chosen": -1.567704200744629,
"logps/rejected": -1.5939210653305054,
"loss": 1.6438,
"odds_ratio_loss": 0.7608035206794739,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.15677042305469513,
"rewards/margins": 0.002621681662276387,
"rewards/rejected": -0.1593921184539795,
"sft_loss": 1.567704200744629,
"step": 590
},
{
"epoch": 1.066903756390309,
"grad_norm": 1.1325277090072632,
"learning_rate": 3.593964141955541e-06,
"logits/chosen": -14.486841201782227,
"logits/rejected": -14.375473022460938,
"logps/chosen": -1.4968254566192627,
"logps/rejected": -1.535290241241455,
"loss": 1.5708,
"odds_ratio_loss": 0.7399110198020935,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.14968255162239075,
"rewards/margins": 0.0038464791141450405,
"rewards/rejected": -0.15352903306484222,
"sft_loss": 1.4968254566192627,
"step": 600
},
{
"epoch": 1.0846854856634809,
"grad_norm": 1.0757452249526978,
"learning_rate": 3.5518873322576573e-06,
"logits/chosen": -14.257904052734375,
"logits/rejected": -14.527667045593262,
"logps/chosen": -1.5230721235275269,
"logps/rejected": -1.5504690408706665,
"loss": 1.6005,
"odds_ratio_loss": 0.774411678314209,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.1523071974515915,
"rewards/margins": 0.0027397016528993845,
"rewards/rejected": -0.15504691004753113,
"sft_loss": 1.5230721235275269,
"step": 610
},
{
"epoch": 1.1024672149366526,
"grad_norm": 0.802010715007782,
"learning_rate": 3.5094452704091143e-06,
"logits/chosen": -14.364301681518555,
"logits/rejected": -14.379095077514648,
"logps/chosen": -1.4637863636016846,
"logps/rejected": -1.551966667175293,
"loss": 1.5353,
"odds_ratio_loss": 0.7155525088310242,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.14637863636016846,
"rewards/margins": 0.008818031288683414,
"rewards/rejected": -0.1551966667175293,
"sft_loss": 1.4637863636016846,
"step": 620
},
{
"epoch": 1.1202489442098245,
"grad_norm": 1.9857678413391113,
"learning_rate": 3.46665269378139e-06,
"logits/chosen": -14.460027694702148,
"logits/rejected": -14.280843734741211,
"logps/chosen": -1.57863187789917,
"logps/rejected": -1.5787197351455688,
"loss": 1.6576,
"odds_ratio_loss": 0.7900050282478333,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -0.15786318480968475,
"rewards/margins": 8.772313776717056e-06,
"rewards/rejected": -0.15787196159362793,
"sft_loss": 1.57863187789917,
"step": 630
},
{
"epoch": 1.1380306734829961,
"grad_norm": 3.4590837955474854,
"learning_rate": 3.4235244614569794e-06,
"logits/chosen": -14.505206108093262,
"logits/rejected": -14.5104398727417,
"logps/chosen": -1.5605857372283936,
"logps/rejected": -1.5010731220245361,
"loss": 1.6419,
"odds_ratio_loss": 0.8131183385848999,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.15605857968330383,
"rewards/margins": -0.0059512583538889885,
"rewards/rejected": -0.15010732412338257,
"sft_loss": 1.5605857372283936,
"step": 640
},
{
"epoch": 1.155812402756168,
"grad_norm": 0.7217416763305664,
"learning_rate": 3.3800755490698008e-06,
"logits/chosen": -14.571185111999512,
"logits/rejected": -14.486068725585938,
"logps/chosen": -1.416325569152832,
"logps/rejected": -1.6571706533432007,
"loss": 1.4832,
"odds_ratio_loss": 0.6685255765914917,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.1416325569152832,
"rewards/margins": 0.024084512144327164,
"rewards/rejected": -0.16571708023548126,
"sft_loss": 1.416325569152832,
"step": 650
},
{
"epoch": 1.17359413202934,
"grad_norm": 1.5739328861236572,
"learning_rate": 3.3363210436051287e-06,
"logits/chosen": -14.533714294433594,
"logits/rejected": -14.47203540802002,
"logps/chosen": -1.5552722215652466,
"logps/rejected": -1.6581999063491821,
"loss": 1.6292,
"odds_ratio_loss": 0.7395690083503723,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.1555272340774536,
"rewards/margins": 0.010292761959135532,
"rewards/rejected": -0.16581998765468597,
"sft_loss": 1.5552722215652466,
"step": 660
},
{
"epoch": 1.1913758613025116,
"grad_norm": 1.1496367454528809,
"learning_rate": 3.292276138160867e-06,
"logits/chosen": -14.616167068481445,
"logits/rejected": -14.592849731445312,
"logps/chosen": -1.4817497730255127,
"logps/rejected": -1.5020571947097778,
"loss": 1.5589,
"odds_ratio_loss": 0.7710050940513611,
"rewards/accuracies": 0.41874998807907104,
"rewards/chosen": -0.1481749713420868,
"rewards/margins": 0.002030743286013603,
"rewards/rejected": -0.15020573139190674,
"sft_loss": 1.4817497730255127,
"step": 670
},
{
"epoch": 1.2091575905756835,
"grad_norm": 0.9039623737335205,
"learning_rate": 3.2479561266719694e-06,
"logits/chosen": -14.403009414672852,
"logits/rejected": -14.395463943481445,
"logps/chosen": -1.536474347114563,
"logps/rejected": -1.5868223905563354,
"loss": 1.6131,
"odds_ratio_loss": 0.7661021947860718,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.15364743769168854,
"rewards/margins": 0.005034810863435268,
"rewards/rejected": -0.15868225693702698,
"sft_loss": 1.536474347114563,
"step": 680
},
{
"epoch": 1.2269393198488552,
"grad_norm": 1.4711978435516357,
"learning_rate": 3.2033763985998533e-06,
"logits/chosen": -14.493423461914062,
"logits/rejected": -14.387487411499023,
"logps/chosen": -1.4371049404144287,
"logps/rejected": -1.6720597743988037,
"loss": 1.5038,
"odds_ratio_loss": 0.6674162149429321,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.14371049404144287,
"rewards/margins": 0.023495487868785858,
"rewards/rejected": -0.16720597445964813,
"sft_loss": 1.4371049404144287,
"step": 690
},
{
"epoch": 1.244721049122027,
"grad_norm": 1.8284207582473755,
"learning_rate": 3.1585524335886335e-06,
"logits/chosen": -14.462885856628418,
"logits/rejected": -14.4248628616333,
"logps/chosen": -1.4241148233413696,
"logps/rejected": -1.5204662084579468,
"loss": 1.4937,
"odds_ratio_loss": 0.695872962474823,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1424115002155304,
"rewards/margins": 0.00963515229523182,
"rewards/rejected": -0.15204663574695587,
"sft_loss": 1.4241148233413696,
"step": 700
},
{
"epoch": 1.262502778395199,
"grad_norm": 1.8534456491470337,
"learning_rate": 3.1134997960900536e-06,
"logits/chosen": -14.414436340332031,
"logits/rejected": -14.282020568847656,
"logps/chosen": -1.4227323532104492,
"logps/rejected": -1.592142105102539,
"loss": 1.4906,
"odds_ratio_loss": 0.678249180316925,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.14227323234081268,
"rewards/margins": 0.01694098487496376,
"rewards/rejected": -0.15921422839164734,
"sft_loss": 1.4227323532104492,
"step": 710
},
{
"epoch": 1.2802845076683709,
"grad_norm": 1.5054043531417847,
"learning_rate": 3.0682341299589583e-06,
"logits/chosen": -14.277090072631836,
"logits/rejected": -14.398614883422852,
"logps/chosen": -1.473953127861023,
"logps/rejected": -1.4768346548080444,
"loss": 1.5498,
"odds_ratio_loss": 0.7589610815048218,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.1473953276872635,
"rewards/margins": 0.00028814561665058136,
"rewards/rejected": -0.14768347144126892,
"sft_loss": 1.473953127861023,
"step": 720
},
{
"epoch": 1.2980662369415426,
"grad_norm": 2.268519878387451,
"learning_rate": 3.022771153021201e-06,
"logits/chosen": -14.384374618530273,
"logits/rejected": -14.452374458312988,
"logps/chosen": -1.4778945446014404,
"logps/rejected": -1.574272871017456,
"loss": 1.5509,
"odds_ratio_loss": 0.7302027940750122,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.14778944849967957,
"rewards/margins": 0.009637835435569286,
"rewards/rejected": -0.15742729604244232,
"sft_loss": 1.4778945446014404,
"step": 730
},
{
"epoch": 1.3158479662147144,
"grad_norm": 1.292470097541809,
"learning_rate": 2.9771266516158625e-06,
"logits/chosen": -14.381269454956055,
"logits/rejected": -14.515772819519043,
"logps/chosen": -1.467395544052124,
"logps/rejected": -1.5885987281799316,
"loss": 1.5376,
"odds_ratio_loss": 0.702446699142456,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.14673955738544464,
"rewards/margins": 0.01212033536285162,
"rewards/rejected": -0.15885989367961884,
"sft_loss": 1.467395544052124,
"step": 740
},
{
"epoch": 1.3336296954878861,
"grad_norm": 1.0139861106872559,
"learning_rate": 2.9313164751136802e-06,
"logits/chosen": -14.319999694824219,
"logits/rejected": -14.376035690307617,
"logps/chosen": -1.4767507314682007,
"logps/rejected": -1.5156667232513428,
"loss": 1.5505,
"odds_ratio_loss": 0.7372480630874634,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.1476750671863556,
"rewards/margins": 0.0038915909826755524,
"rewards/rejected": -0.15156666934490204,
"sft_loss": 1.4767507314682007,
"step": 750
},
{
"epoch": 1.351411424761058,
"grad_norm": 1.8870171308517456,
"learning_rate": 2.8853565304135956e-06,
"logits/chosen": -14.622869491577148,
"logits/rejected": -14.402894973754883,
"logps/chosen": -1.464207410812378,
"logps/rejected": -1.509709358215332,
"loss": 1.5409,
"odds_ratio_loss": 0.7670952081680298,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.14642071723937988,
"rewards/margins": 0.004550204146653414,
"rewards/rejected": -0.1509709358215332,
"sft_loss": 1.464207410812378,
"step": 760
},
{
"epoch": 1.36919315403423,
"grad_norm": 2.0388317108154297,
"learning_rate": 2.839262776419313e-06,
"logits/chosen": -14.462852478027344,
"logits/rejected": -14.366838455200195,
"logps/chosen": -1.4720194339752197,
"logps/rejected": -1.659558892250061,
"loss": 1.5416,
"odds_ratio_loss": 0.6956937313079834,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.14720192551612854,
"rewards/margins": 0.01875394582748413,
"rewards/rejected": -0.16595587134361267,
"sft_loss": 1.4720194339752197,
"step": 770
},
{
"epoch": 1.3869748833074016,
"grad_norm": 1.2085295915603638,
"learning_rate": 2.793051218497817e-06,
"logits/chosen": -14.64861011505127,
"logits/rejected": -14.640996932983398,
"logps/chosen": -1.4684925079345703,
"logps/rejected": -1.4641777276992798,
"loss": 1.5454,
"odds_ratio_loss": 0.7690992951393127,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.14684924483299255,
"rewards/margins": -0.00043147840187884867,
"rewards/rejected": -0.1464177817106247,
"sft_loss": 1.4684925079345703,
"step": 780
},
{
"epoch": 1.4047566125805735,
"grad_norm": 0.9894825220108032,
"learning_rate": 2.7467379029217437e-06,
"logits/chosen": -14.430456161499023,
"logits/rejected": -14.312626838684082,
"logps/chosen": -1.4456686973571777,
"logps/rejected": -1.5765860080718994,
"loss": 1.517,
"odds_ratio_loss": 0.7131879925727844,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.1445668637752533,
"rewards/margins": 0.013091735541820526,
"rewards/rejected": -0.15765860676765442,
"sft_loss": 1.4456686973571777,
"step": 790
},
{
"epoch": 1.4225383418537452,
"grad_norm": 0.9283164143562317,
"learning_rate": 2.7003389112975546e-06,
"logits/chosen": -14.672605514526367,
"logits/rejected": -14.80772590637207,
"logps/chosen": -1.5466537475585938,
"logps/rejected": -1.5707231760025024,
"loss": 1.6221,
"odds_ratio_loss": 0.7544839382171631,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.15466538071632385,
"rewards/margins": 0.002406922634691,
"rewards/rejected": -0.1570723056793213,
"sft_loss": 1.5466537475585938,
"step": 800
},
{
"epoch": 1.440320071126917,
"grad_norm": 2.0001485347747803,
"learning_rate": 2.653870354981437e-06,
"logits/chosen": -14.508201599121094,
"logits/rejected": -14.300331115722656,
"logps/chosen": -1.343572735786438,
"logps/rejected": -1.4564281702041626,
"loss": 1.4137,
"odds_ratio_loss": 0.7009164690971375,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.1343572735786438,
"rewards/margins": 0.011285548098385334,
"rewards/rejected": -0.14564281702041626,
"sft_loss": 1.343572735786438,
"step": 810
},
{
"epoch": 1.458101800400089,
"grad_norm": 2.214451313018799,
"learning_rate": 2.6073483694848777e-06,
"logits/chosen": -14.371310234069824,
"logits/rejected": -14.641912460327148,
"logps/chosen": -1.459166169166565,
"logps/rejected": -1.5504926443099976,
"loss": 1.5337,
"odds_ratio_loss": 0.7451664805412292,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.14591661095619202,
"rewards/margins": 0.009132652543485165,
"rewards/rejected": -0.15504927933216095,
"sft_loss": 1.459166169166565,
"step": 820
},
{
"epoch": 1.4758835296732609,
"grad_norm": 0.7959926724433899,
"learning_rate": 2.560789108871847e-06,
"logits/chosen": -14.411550521850586,
"logits/rejected": -14.393269538879395,
"logps/chosen": -1.436858892440796,
"logps/rejected": -1.588494896888733,
"loss": 1.5064,
"odds_ratio_loss": 0.6953193545341492,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.14368589222431183,
"rewards/margins": 0.015163603238761425,
"rewards/rejected": -0.15884950757026672,
"sft_loss": 1.436858892440796,
"step": 830
},
{
"epoch": 1.4936652589464325,
"grad_norm": 2.3933050632476807,
"learning_rate": 2.514208740149544e-06,
"logits/chosen": -14.369955062866211,
"logits/rejected": -14.41723346710205,
"logps/chosen": -1.4793574810028076,
"logps/rejected": -1.566274881362915,
"loss": 1.5523,
"odds_ratio_loss": 0.729671061038971,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.14793574810028076,
"rewards/margins": 0.008691729977726936,
"rewards/rejected": -0.15662747621536255,
"sft_loss": 1.4793574810028076,
"step": 840
},
{
"epoch": 1.5114469882196042,
"grad_norm": 1.3297358751296997,
"learning_rate": 2.46762343765464e-06,
"logits/chosen": -14.52283000946045,
"logits/rejected": -14.487058639526367,
"logps/chosen": -1.4763939380645752,
"logps/rejected": -1.6392600536346436,
"loss": 1.5463,
"odds_ratio_loss": 0.6985622644424438,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.14763939380645752,
"rewards/margins": 0.01628662459552288,
"rewards/rejected": -0.16392602026462555,
"sft_loss": 1.4763939380645752,
"step": 850
},
{
"epoch": 1.5292287174927761,
"grad_norm": 1.4351073503494263,
"learning_rate": 2.4210493774369903e-06,
"logits/chosen": -14.422691345214844,
"logits/rejected": -14.469161987304688,
"logps/chosen": -1.5580244064331055,
"logps/rejected": -1.5905354022979736,
"loss": 1.6351,
"odds_ratio_loss": 0.770281970500946,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.1558024138212204,
"rewards/margins": 0.003251106943935156,
"rewards/rejected": -0.15905353426933289,
"sft_loss": 1.5580244064331055,
"step": 860
},
{
"epoch": 1.547010446765948,
"grad_norm": 0.898558497428894,
"learning_rate": 2.374502731642732e-06,
"logits/chosen": -14.498028755187988,
"logits/rejected": -14.491365432739258,
"logps/chosen": -1.4510562419891357,
"logps/rejected": -1.4965808391571045,
"loss": 1.5247,
"odds_ratio_loss": 0.7362778782844543,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.14510561525821686,
"rewards/margins": 0.004552468657493591,
"rewards/rejected": -0.14965808391571045,
"sft_loss": 1.4510562419891357,
"step": 870
},
{
"epoch": 1.56479217603912,
"grad_norm": 1.0833823680877686,
"learning_rate": 2.3279996628987556e-06,
"logits/chosen": -14.427210807800293,
"logits/rejected": -14.47814655303955,
"logps/chosen": -1.4597257375717163,
"logps/rejected": -1.4845510721206665,
"loss": 1.5356,
"odds_ratio_loss": 0.7589424252510071,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.14597256481647491,
"rewards/margins": 0.002482546027749777,
"rewards/rejected": -0.14845511317253113,
"sft_loss": 1.4597257375717163,
"step": 880
},
{
"epoch": 1.5825739053122916,
"grad_norm": 1.345390796661377,
"learning_rate": 2.281556318700474e-06,
"logits/chosen": -14.579635620117188,
"logits/rejected": -14.662760734558105,
"logps/chosen": -1.435423493385315,
"logps/rejected": -1.4268932342529297,
"loss": 1.5144,
"odds_ratio_loss": 0.789936900138855,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.1435423642396927,
"rewards/margins": -0.0008530290797352791,
"rewards/rejected": -0.14268933236598969,
"sft_loss": 1.435423493385315,
"step": 890
},
{
"epoch": 1.6003556345854635,
"grad_norm": 2.053659439086914,
"learning_rate": 2.2351888258048408e-06,
"logits/chosen": -14.430631637573242,
"logits/rejected": -14.57084846496582,
"logps/chosen": -1.4910775423049927,
"logps/rejected": -1.588700532913208,
"loss": 1.5647,
"odds_ratio_loss": 0.7359753251075745,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.14910776913166046,
"rewards/margins": 0.009762286208570004,
"rewards/rejected": -0.15887002646923065,
"sft_loss": 1.4910775423049927,
"step": 900
},
{
"epoch": 1.6181373638586352,
"grad_norm": 1.0991109609603882,
"learning_rate": 2.188913284630584e-06,
"logits/chosen": -14.372177124023438,
"logits/rejected": -14.45659065246582,
"logps/chosen": -1.5502886772155762,
"logps/rejected": -1.5803656578063965,
"loss": 1.6264,
"odds_ratio_loss": 0.7611321210861206,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.15502886474132538,
"rewards/margins": 0.0030077022965997458,
"rewards/rejected": -0.15803657472133636,
"sft_loss": 1.5502886772155762,
"step": 910
},
{
"epoch": 1.635919093131807,
"grad_norm": 4.411732196807861,
"learning_rate": 2.1427457636675652e-06,
"logits/chosen": -14.42602825164795,
"logits/rejected": -14.588623046875,
"logps/chosen": -1.5284509658813477,
"logps/rejected": -1.539262056350708,
"loss": 1.609,
"odds_ratio_loss": 0.8054282069206238,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.152845099568367,
"rewards/margins": 0.0010811155661940575,
"rewards/rejected": -0.15392622351646423,
"sft_loss": 1.5284509658813477,
"step": 920
},
{
"epoch": 1.653700822404979,
"grad_norm": 1.1233824491500854,
"learning_rate": 2.096702293897247e-06,
"logits/chosen": -14.348466873168945,
"logits/rejected": -14.271936416625977,
"logps/chosen": -1.4055891036987305,
"logps/rejected": -1.5611364841461182,
"loss": 1.4753,
"odds_ratio_loss": 0.6970332860946655,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.1405588984489441,
"rewards/margins": 0.015554727986454964,
"rewards/rejected": -0.1561136394739151,
"sft_loss": 1.4055891036987305,
"step": 930
},
{
"epoch": 1.6714825516781509,
"grad_norm": 1.966296672821045,
"learning_rate": 2.0507988632261672e-06,
"logits/chosen": -14.399055480957031,
"logits/rejected": -14.409767150878906,
"logps/chosen": -1.4748440980911255,
"logps/rejected": -1.4950190782546997,
"loss": 1.5523,
"odds_ratio_loss": 0.7747657895088196,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.1474844217300415,
"rewards/margins": 0.002017489168792963,
"rewards/rejected": -0.14950190484523773,
"sft_loss": 1.4748440980911255,
"step": 940
},
{
"epoch": 1.6892642809513225,
"grad_norm": 2.740964651107788,
"learning_rate": 2.005051410934382e-06,
"logits/chosen": -14.488139152526855,
"logits/rejected": -14.520662307739258,
"logps/chosen": -1.5530728101730347,
"logps/rejected": -1.6438829898834229,
"loss": 1.6259,
"odds_ratio_loss": 0.7283372282981873,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.15530726313591003,
"rewards/margins": 0.009081020019948483,
"rewards/rejected": -0.16438829898834229,
"sft_loss": 1.5530728101730347,
"step": 950
},
{
"epoch": 1.7070460102244942,
"grad_norm": 2.387444496154785,
"learning_rate": 1.9594758221407843e-06,
"logits/chosen": -14.521173477172852,
"logits/rejected": -14.390649795532227,
"logps/chosen": -1.3837093114852905,
"logps/rejected": -1.492570161819458,
"loss": 1.453,
"odds_ratio_loss": 0.6927343606948853,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.13837094604969025,
"rewards/margins": 0.010886076837778091,
"rewards/rejected": -0.14925701916217804,
"sft_loss": 1.3837093114852905,
"step": 960
},
{
"epoch": 1.724827739497666,
"grad_norm": 1.347974181175232,
"learning_rate": 1.9140879222872408e-06,
"logits/chosen": -14.385538101196289,
"logits/rejected": -14.44627857208252,
"logps/chosen": -1.4018323421478271,
"logps/rejected": -1.4524750709533691,
"loss": 1.4772,
"odds_ratio_loss": 0.7537996172904968,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.140183225274086,
"rewards/margins": 0.0050642904825508595,
"rewards/rejected": -0.14524750411510468,
"sft_loss": 1.4018323421478271,
"step": 970
},
{
"epoch": 1.742609468770838,
"grad_norm": 0.7524885535240173,
"learning_rate": 1.8689034716434346e-06,
"logits/chosen": -14.513898849487305,
"logits/rejected": -14.415435791015625,
"logps/chosen": -1.4909882545471191,
"logps/rejected": -1.4778468608856201,
"loss": 1.5681,
"odds_ratio_loss": 0.771571159362793,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.14909882843494415,
"rewards/margins": -0.0013141353847458959,
"rewards/rejected": -0.14778469502925873,
"sft_loss": 1.4909882545471191,
"step": 980
},
{
"epoch": 1.76039119804401,
"grad_norm": 1.0324140787124634,
"learning_rate": 1.8239381598343576e-06,
"logits/chosen": -14.462181091308594,
"logits/rejected": -14.475656509399414,
"logps/chosen": -1.411113977432251,
"logps/rejected": -1.4429736137390137,
"loss": 1.4872,
"odds_ratio_loss": 0.7608811259269714,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.14111140370368958,
"rewards/margins": 0.003185967681929469,
"rewards/rejected": -0.14429736137390137,
"sft_loss": 1.411113977432251,
"step": 990
},
{
"epoch": 1.7781729273171816,
"grad_norm": 1.5470359325408936,
"learning_rate": 1.779207600392312e-06,
"logits/chosen": -14.624302864074707,
"logits/rejected": -14.673869132995605,
"logps/chosen": -1.4659007787704468,
"logps/rejected": -1.5123026371002197,
"loss": 1.5401,
"odds_ratio_loss": 0.7417243123054504,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.14659008383750916,
"rewards/margins": 0.004640174098312855,
"rewards/rejected": -0.15123026072978973,
"sft_loss": 1.4659007787704468,
"step": 1000
},
{
"epoch": 1.7781729273171816,
"eval_logits/chosen": -14.472068786621094,
"eval_logits/rejected": -14.548614501953125,
"eval_logps/chosen": -1.4554640054702759,
"eval_logps/rejected": -1.5491538047790527,
"eval_loss": 1.5269325971603394,
"eval_odds_ratio_loss": 0.7146860361099243,
"eval_rewards/accuracies": 0.5019999742507935,
"eval_rewards/chosen": -0.14554640650749207,
"eval_rewards/margins": 0.00936897937208414,
"eval_rewards/rejected": -0.15491539239883423,
"eval_runtime": 408.1752,
"eval_samples_per_second": 2.45,
"eval_sft_loss": 1.4554640054702759,
"eval_steps_per_second": 1.225,
"step": 1000
},
{
"epoch": 1.7959546565903532,
"grad_norm": 2.230626106262207,
"learning_rate": 1.7347273253353552e-06,
"logits/chosen": -14.449197769165039,
"logits/rejected": -14.371235847473145,
"logps/chosen": -1.4797186851501465,
"logps/rejected": -1.5376956462860107,
"loss": 1.5549,
"odds_ratio_loss": 0.7520232796669006,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.14797186851501465,
"rewards/margins": 0.005797683726996183,
"rewards/rejected": -0.15376953780651093,
"sft_loss": 1.4797186851501465,
"step": 1010
},
{
"epoch": 1.8137363858635251,
"grad_norm": 3.5348658561706543,
"learning_rate": 1.690512779774029e-06,
"logits/chosen": -14.508018493652344,
"logits/rejected": -14.555743217468262,
"logps/chosen": -1.4630930423736572,
"logps/rejected": -1.5662147998809814,
"loss": 1.5345,
"odds_ratio_loss": 0.714568018913269,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.14630930125713348,
"rewards/margins": 0.010312167927622795,
"rewards/rejected": -0.15662148594856262,
"sft_loss": 1.4630930423736572,
"step": 1020
},
{
"epoch": 1.831518115136697,
"grad_norm": 1.3148037195205688,
"learning_rate": 1.6465793165482838e-06,
"logits/chosen": -14.601341247558594,
"logits/rejected": -14.604815483093262,
"logps/chosen": -1.3863935470581055,
"logps/rejected": -1.5016463994979858,
"loss": 1.4553,
"odds_ratio_loss": 0.6889203786849976,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.13863936066627502,
"rewards/margins": 0.011525283567607403,
"rewards/rejected": -0.1501646339893341,
"sft_loss": 1.3863935470581055,
"step": 1030
},
{
"epoch": 1.849299844409869,
"grad_norm": 1.944263219833374,
"learning_rate": 1.6029421908964305e-06,
"logits/chosen": -14.364709854125977,
"logits/rejected": -14.301609992980957,
"logps/chosen": -1.444108247756958,
"logps/rejected": -1.735345482826233,
"loss": 1.5112,
"odds_ratio_loss": 0.6704747080802917,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.1444108486175537,
"rewards/margins": 0.029123712331056595,
"rewards/rejected": -0.1735345423221588,
"sft_loss": 1.444108247756958,
"step": 1040
},
{
"epoch": 1.8670815736830408,
"grad_norm": 3.4116580486297607,
"learning_rate": 1.559616555157985e-06,
"logits/chosen": -14.63707160949707,
"logits/rejected": -14.487701416015625,
"logps/chosen": -1.4554407596588135,
"logps/rejected": -1.532787561416626,
"loss": 1.529,
"odds_ratio_loss": 0.7356154322624207,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.14554408192634583,
"rewards/margins": 0.007734687067568302,
"rewards/rejected": -0.15327878296375275,
"sft_loss": 1.4554407596588135,
"step": 1050
},
{
"epoch": 1.8848633029562125,
"grad_norm": 1.2246363162994385,
"learning_rate": 1.516617453512252e-06,
"logits/chosen": -14.424253463745117,
"logits/rejected": -14.5010347366333,
"logps/chosen": -1.5207793712615967,
"logps/rejected": -1.531063437461853,
"loss": 1.5985,
"odds_ratio_loss": 0.7770354151725769,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.15207794308662415,
"rewards/margins": 0.0010283945593982935,
"rewards/rejected": -0.15310634672641754,
"sft_loss": 1.5207793712615967,
"step": 1060
},
{
"epoch": 1.9026450322293842,
"grad_norm": 2.65510630607605,
"learning_rate": 1.473959816754449e-06,
"logits/chosen": -14.302286148071289,
"logits/rejected": -14.37928581237793,
"logps/chosen": -1.3779585361480713,
"logps/rejected": -1.444608211517334,
"loss": 1.4506,
"odds_ratio_loss": 0.7259268164634705,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.1377958357334137,
"rewards/margins": 0.00666497927159071,
"rewards/rejected": -0.14446081221103668,
"sft_loss": 1.3779585361480713,
"step": 1070
},
{
"epoch": 1.920426761502556,
"grad_norm": 1.5242667198181152,
"learning_rate": 1.4316584571112213e-06,
"logits/chosen": -14.791394233703613,
"logits/rejected": -14.659652709960938,
"logps/chosen": -1.451185941696167,
"logps/rejected": -1.5176652669906616,
"loss": 1.5255,
"odds_ratio_loss": 0.7434892058372498,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.1451185941696167,
"rewards/margins": 0.0066479267552495,
"rewards/rejected": -0.15176650881767273,
"sft_loss": 1.451185941696167,
"step": 1080
},
{
"epoch": 1.938208490775728,
"grad_norm": 1.3730899095535278,
"learning_rate": 1.389728063097306e-06,
"logits/chosen": -14.725341796875,
"logits/rejected": -14.647857666015625,
"logps/chosen": -1.4444434642791748,
"logps/rejected": -1.6104562282562256,
"loss": 1.5134,
"odds_ratio_loss": 0.6893559098243713,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.14444434642791748,
"rewards/margins": 0.016601284965872765,
"rewards/rejected": -0.161045640707016,
"sft_loss": 1.4444434642791748,
"step": 1090
},
{
"epoch": 1.9559902200488999,
"grad_norm": 0.9227787852287292,
"learning_rate": 1.348183194415179e-06,
"logits/chosen": -14.495756149291992,
"logits/rejected": -14.327432632446289,
"logps/chosen": -1.419993281364441,
"logps/rejected": -1.5734179019927979,
"loss": 1.4884,
"odds_ratio_loss": 0.6841065883636475,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.14199933409690857,
"rewards/margins": 0.0153424721211195,
"rewards/rejected": -0.15734180808067322,
"sft_loss": 1.419993281364441,
"step": 1100
},
{
"epoch": 1.9737719493220716,
"grad_norm": 0.9474197626113892,
"learning_rate": 1.3070382768994015e-06,
"logits/chosen": -14.650873184204102,
"logits/rejected": -14.58227825164795,
"logps/chosen": -1.4364961385726929,
"logps/rejected": -1.5456112623214722,
"loss": 1.5057,
"odds_ratio_loss": 0.6923903226852417,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1436496078968048,
"rewards/margins": 0.010911517776548862,
"rewards/rejected": -0.1545611321926117,
"sft_loss": 1.4364961385726929,
"step": 1110
},
{
"epoch": 1.9915536785952432,
"grad_norm": 1.042833924293518,
"learning_rate": 1.2663075975074746e-06,
"logits/chosen": -14.496850967407227,
"logits/rejected": -14.485113143920898,
"logps/chosen": -1.4912959337234497,
"logps/rejected": -1.6449800729751587,
"loss": 1.5613,
"odds_ratio_loss": 0.699638307094574,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.14912959933280945,
"rewards/margins": 0.015368418768048286,
"rewards/rejected": -0.1644980013370514,
"sft_loss": 1.4912959337234497,
"step": 1120
},
{
"epoch": 2.009335407868415,
"grad_norm": 2.740922689437866,
"learning_rate": 1.2260052993589034e-06,
"logits/chosen": -14.43859577178955,
"logits/rejected": -14.43925952911377,
"logps/chosen": -1.535914659500122,
"logps/rejected": -1.5225673913955688,
"loss": 1.615,
"odds_ratio_loss": 0.7913249731063843,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.15359148383140564,
"rewards/margins": -0.0013347306521609426,
"rewards/rejected": -0.15225675702095032,
"sft_loss": 1.535914659500122,
"step": 1130
},
{
"epoch": 2.027117137141587,
"grad_norm": 0.7980636954307556,
"learning_rate": 1.1861453768242099e-06,
"logits/chosen": -14.416345596313477,
"logits/rejected": -14.40721321105957,
"logps/chosen": -1.4432775974273682,
"logps/rejected": -1.5133426189422607,
"loss": 1.5179,
"odds_ratio_loss": 0.7463361620903015,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.14432775974273682,
"rewards/margins": 0.007006504572927952,
"rewards/rejected": -0.1513342708349228,
"sft_loss": 1.4432775974273682,
"step": 1140
},
{
"epoch": 2.044898866414759,
"grad_norm": 5.74677038192749,
"learning_rate": 1.1467416706655982e-06,
"logits/chosen": -14.52302074432373,
"logits/rejected": -14.662378311157227,
"logps/chosen": -1.5389671325683594,
"logps/rejected": -1.645978569984436,
"loss": 1.6125,
"odds_ratio_loss": 0.7352578043937683,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.15389671921730042,
"rewards/margins": 0.010701162740588188,
"rewards/rejected": -0.16459788382053375,
"sft_loss": 1.5389671325683594,
"step": 1150
},
{
"epoch": 2.062680595687931,
"grad_norm": 0.986960232257843,
"learning_rate": 1.1078078632309559e-06,
"logits/chosen": -14.417854309082031,
"logits/rejected": -14.478330612182617,
"logps/chosen": -1.4349641799926758,
"logps/rejected": -1.5250955820083618,
"loss": 1.5053,
"odds_ratio_loss": 0.7031277418136597,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.14349642395973206,
"rewards/margins": 0.009013157337903976,
"rewards/rejected": -0.15250957012176514,
"sft_loss": 1.4349641799926758,
"step": 1160
},
{
"epoch": 2.0804623249611023,
"grad_norm": 2.110325813293457,
"learning_rate": 1.0693574737028627e-06,
"logits/chosen": -14.4419584274292,
"logits/rejected": -14.437457084655762,
"logps/chosen": -1.5028114318847656,
"logps/rejected": -1.5467736721038818,
"loss": 1.5773,
"odds_ratio_loss": 0.7452652454376221,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.1502811312675476,
"rewards/margins": 0.004396246280521154,
"rewards/rejected": -0.1546773910522461,
"sft_loss": 1.5028114318847656,
"step": 1170
},
{
"epoch": 2.098244054234274,
"grad_norm": 1.9610004425048828,
"learning_rate": 1.0314038534042586e-06,
"logits/chosen": -14.581426620483398,
"logits/rejected": -14.440699577331543,
"logps/chosen": -1.3876335620880127,
"logps/rejected": -1.5056065320968628,
"loss": 1.4589,
"odds_ratio_loss": 0.7131034731864929,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.13876333832740784,
"rewards/margins": 0.011797326616942883,
"rewards/rejected": -0.1505606770515442,
"sft_loss": 1.3876335620880127,
"step": 1180
},
{
"epoch": 2.116025783507446,
"grad_norm": 1.4545583724975586,
"learning_rate": 9.939601811623946e-07,
"logits/chosen": -14.581433296203613,
"logits/rejected": -14.601740837097168,
"logps/chosen": -1.422616720199585,
"logps/rejected": -1.5079143047332764,
"loss": 1.4943,
"odds_ratio_loss": 0.7167633771896362,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.14226169884204865,
"rewards/margins": 0.00852974783629179,
"rewards/rejected": -0.15079143643379211,
"sft_loss": 1.422616720199585,
"step": 1190
},
{
"epoch": 2.133807512780618,
"grad_norm": 1.6818853616714478,
"learning_rate": 9.570394587326825e-07,
"logits/chosen": -14.631085395812988,
"logits/rejected": -14.470014572143555,
"logps/chosen": -1.4276001453399658,
"logps/rejected": -1.5927342176437378,
"loss": 1.4962,
"odds_ratio_loss": 0.6859818696975708,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.1427600234746933,
"rewards/margins": 0.016513412818312645,
"rewards/rejected": -0.1592734307050705,
"sft_loss": 1.4276001453399658,
"step": 1200
},
{
"epoch": 2.15158924205379,
"grad_norm": 0.9193028807640076,
"learning_rate": 9.206545062840302e-07,
"logits/chosen": -14.681951522827148,
"logits/rejected": -14.49401569366455,
"logps/chosen": -1.4501049518585205,
"logps/rejected": -1.550287127494812,
"loss": 1.5204,
"odds_ratio_loss": 0.7033563256263733,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.14501050114631653,
"rewards/margins": 0.010018205270171165,
"rewards/rejected": -0.15502868592739105,
"sft_loss": 1.4501049518585205,
"step": 1210
},
{
"epoch": 2.1693709713269618,
"grad_norm": 0.9245877265930176,
"learning_rate": 8.848179579472285e-07,
"logits/chosen": -14.563427925109863,
"logits/rejected": -14.575236320495605,
"logps/chosen": -1.3411179780960083,
"logps/rejected": -1.3986274003982544,
"loss": 1.4132,
"odds_ratio_loss": 0.7204734683036804,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.13411179184913635,
"rewards/margins": 0.005750957410782576,
"rewards/rejected": -0.13986274600028992,
"sft_loss": 1.3411179780960083,
"step": 1220
},
{
"epoch": 2.1871527006001332,
"grad_norm": 4.244547367095947,
"learning_rate": 8.495422574279403e-07,
"logits/chosen": -14.301058769226074,
"logits/rejected": -14.25413703918457,
"logps/chosen": -1.370896339416504,
"logps/rejected": -1.5229545831680298,
"loss": 1.4426,
"odds_ratio_loss": 0.7167730927467346,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.13708963990211487,
"rewards/margins": 0.015205820091068745,
"rewards/rejected": -0.15229545533657074,
"sft_loss": 1.370896339416504,
"step": 1230
},
{
"epoch": 2.204934429873305,
"grad_norm": 1.5995064973831177,
"learning_rate": 8.148396536858063e-07,
"logits/chosen": -14.522483825683594,
"logits/rejected": -14.575413703918457,
"logps/chosen": -1.506316065788269,
"logps/rejected": -1.6317975521087646,
"loss": 1.578,
"odds_ratio_loss": 0.7170661687850952,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.1506316065788269,
"rewards/margins": 0.012548169121146202,
"rewards/rejected": -0.16317978501319885,
"sft_loss": 1.506316065788269,
"step": 1240
},
{
"epoch": 2.222716159146477,
"grad_norm": 2.302438497543335,
"learning_rate": 7.807221966811815e-07,
"logits/chosen": -14.514368057250977,
"logits/rejected": -14.467686653137207,
"logps/chosen": -1.3972229957580566,
"logps/rejected": -1.503300666809082,
"loss": 1.4695,
"odds_ratio_loss": 0.7227479219436646,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.1397223025560379,
"rewards/margins": 0.010607777163386345,
"rewards/rejected": -0.1503300666809082,
"sft_loss": 1.3972229957580566,
"step": 1250
},
{
"epoch": 2.240497888419649,
"grad_norm": 2.491163492202759,
"learning_rate": 7.47201733190962e-07,
"logits/chosen": -14.408790588378906,
"logits/rejected": -14.414535522460938,
"logps/chosen": -1.4305273294448853,
"logps/rejected": -1.489025354385376,
"loss": 1.5047,
"odds_ratio_loss": 0.7419496774673462,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.14305274188518524,
"rewards/margins": 0.005849803332239389,
"rewards/rejected": -0.1489025503396988,
"sft_loss": 1.4305273294448853,
"step": 1260
},
{
"epoch": 2.258279617692821,
"grad_norm": 1.4214234352111816,
"learning_rate": 7.142899026949721e-07,
"logits/chosen": -14.524235725402832,
"logits/rejected": -14.52795696258545,
"logps/chosen": -1.4059574604034424,
"logps/rejected": -1.5294010639190674,
"loss": 1.4755,
"odds_ratio_loss": 0.6952685117721558,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.1405957192182541,
"rewards/margins": 0.012344368733465672,
"rewards/rejected": -0.15294012427330017,
"sft_loss": 1.4059574604034424,
"step": 1270
},
{
"epoch": 2.2760613469659923,
"grad_norm": 5.792439937591553,
"learning_rate": 6.819981333343273e-07,
"logits/chosen": -14.282297134399414,
"logits/rejected": -14.3455171585083,
"logps/chosen": -1.3890929222106934,
"logps/rejected": -1.5048874616622925,
"loss": 1.4613,
"odds_ratio_loss": 0.721687912940979,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.13890929520130157,
"rewards/margins": 0.011579466983675957,
"rewards/rejected": -0.1504887491464615,
"sft_loss": 1.3890929222106934,
"step": 1280
},
{
"epoch": 2.293843076239164,
"grad_norm": 1.8430945873260498,
"learning_rate": 6.503376379431839e-07,
"logits/chosen": -14.595362663269043,
"logits/rejected": -14.600665092468262,
"logps/chosen": -1.5411078929901123,
"logps/rejected": -1.4967674016952515,
"loss": 1.6191,
"odds_ratio_loss": 0.7797070741653442,
"rewards/accuracies": 0.41874998807907104,
"rewards/chosen": -0.15411078929901123,
"rewards/margins": -0.004434044472873211,
"rewards/rejected": -0.14967674016952515,
"sft_loss": 1.5411078929901123,
"step": 1290
},
{
"epoch": 2.311624805512336,
"grad_norm": 1.6006221771240234,
"learning_rate": 6.193194101552502e-07,
"logits/chosen": -14.458605766296387,
"logits/rejected": -14.19080638885498,
"logps/chosen": -1.4712187051773071,
"logps/rejected": -1.5340622663497925,
"loss": 1.5413,
"odds_ratio_loss": 0.7012200951576233,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.147121861577034,
"rewards/margins": 0.006284369621425867,
"rewards/rejected": -0.15340623259544373,
"sft_loss": 1.4712187051773071,
"step": 1300
},
{
"epoch": 2.329406534785508,
"grad_norm": 2.382812261581421,
"learning_rate": 5.889542205864083e-07,
"logits/chosen": -14.490499496459961,
"logits/rejected": -14.431979179382324,
"logps/chosen": -1.5115288496017456,
"logps/rejected": -1.508636474609375,
"loss": 1.5893,
"odds_ratio_loss": 0.7778019309043884,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.15115289390087128,
"rewards/margins": -0.0002892397460527718,
"rewards/rejected": -0.1508636474609375,
"sft_loss": 1.5115288496017456,
"step": 1310
},
{
"epoch": 2.34718826405868,
"grad_norm": 1.2705732583999634,
"learning_rate": 5.592526130947862e-07,
"logits/chosen": -14.570259094238281,
"logits/rejected": -14.508715629577637,
"logps/chosen": -1.4529051780700684,
"logps/rejected": -1.5590078830718994,
"loss": 1.5284,
"odds_ratio_loss": 0.7552819848060608,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.14529050886631012,
"rewards/margins": 0.01061027031391859,
"rewards/rejected": -0.15590079128742218,
"sft_loss": 1.4529051780700684,
"step": 1320
},
{
"epoch": 2.3649699933318518,
"grad_norm": 1.9159420728683472,
"learning_rate": 5.302249011195507e-07,
"logits/chosen": -14.313755989074707,
"logits/rejected": -14.381486892700195,
"logps/chosen": -1.3798234462738037,
"logps/rejected": -1.4174072742462158,
"loss": 1.4519,
"odds_ratio_loss": 0.7206598520278931,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.1379823386669159,
"rewards/margins": 0.0037583820521831512,
"rewards/rejected": -0.14174072444438934,
"sft_loss": 1.3798234462738037,
"step": 1330
},
{
"epoch": 2.382751722605023,
"grad_norm": 3.4968583583831787,
"learning_rate": 5.018811640997307e-07,
"logits/chosen": -14.515790939331055,
"logits/rejected": -14.633216857910156,
"logps/chosen": -1.471914529800415,
"logps/rejected": -1.6871318817138672,
"loss": 1.5372,
"odds_ratio_loss": 0.6527343392372131,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.14719144999980927,
"rewards/margins": 0.02152174338698387,
"rewards/rejected": -0.16871318221092224,
"sft_loss": 1.471914529800415,
"step": 1340
},
{
"epoch": 2.400533451878195,
"grad_norm": 1.3940002918243408,
"learning_rate": 4.7423124397427105e-07,
"logits/chosen": -14.439651489257812,
"logits/rejected": -14.579530715942383,
"logps/chosen": -1.4648997783660889,
"logps/rejected": -1.4503980875015259,
"loss": 1.5422,
"odds_ratio_loss": 0.773194432258606,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.1464899778366089,
"rewards/margins": -0.0014501826371997595,
"rewards/rejected": -0.14503981173038483,
"sft_loss": 1.4648997783660889,
"step": 1350
},
{
"epoch": 2.418315181151367,
"grad_norm": 1.0056315660476685,
"learning_rate": 4.472847417645787e-07,
"logits/chosen": -14.650228500366211,
"logits/rejected": -14.57947826385498,
"logps/chosen": -1.3967421054840088,
"logps/rejected": -1.6349856853485107,
"loss": 1.4623,
"odds_ratio_loss": 0.6557605266571045,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.13967421650886536,
"rewards/margins": 0.023824386298656464,
"rewards/rejected": -0.16349859535694122,
"sft_loss": 1.3967421054840088,
"step": 1360
},
{
"epoch": 2.436096910424539,
"grad_norm": 0.7953612804412842,
"learning_rate": 4.210510142406993e-07,
"logits/chosen": -14.647814750671387,
"logits/rejected": -14.501565933227539,
"logps/chosen": -1.4687623977661133,
"logps/rejected": -1.5674188137054443,
"loss": 1.5411,
"odds_ratio_loss": 0.723659873008728,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.1468762457370758,
"rewards/margins": 0.009865625761449337,
"rewards/rejected": -0.15674187242984772,
"sft_loss": 1.4687623977661133,
"step": 1370
},
{
"epoch": 2.4538786396977104,
"grad_norm": 1.0436296463012695,
"learning_rate": 3.9553917067232966e-07,
"logits/chosen": -14.456913948059082,
"logits/rejected": -14.33137321472168,
"logps/chosen": -1.451188564300537,
"logps/rejected": -1.5067174434661865,
"loss": 1.5265,
"odds_ratio_loss": 0.7529994249343872,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.1451188623905182,
"rewards/margins": 0.005552901886403561,
"rewards/rejected": -0.15067176520824432,
"sft_loss": 1.451188564300537,
"step": 1380
},
{
"epoch": 2.4716603689708823,
"grad_norm": 1.177869200706482,
"learning_rate": 3.707580696657509e-07,
"logits/chosen": -14.569585800170898,
"logits/rejected": -14.398585319519043,
"logps/chosen": -1.4260759353637695,
"logps/rejected": -1.4303407669067383,
"loss": 1.5,
"odds_ratio_loss": 0.7389937043190002,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.14260759949684143,
"rewards/margins": 0.0004264971357770264,
"rewards/rejected": -0.14303407073020935,
"sft_loss": 1.4260759353637695,
"step": 1390
},
{
"epoch": 2.489442098244054,
"grad_norm": 1.4175564050674438,
"learning_rate": 3.4671631608781815e-07,
"logits/chosen": -14.617632865905762,
"logits/rejected": -14.494784355163574,
"logps/chosen": -1.4826141595840454,
"logps/rejected": -1.5348364114761353,
"loss": 1.5593,
"odds_ratio_loss": 0.7670896649360657,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.1482614129781723,
"rewards/margins": 0.005222243722528219,
"rewards/rejected": -0.15348365902900696,
"sft_loss": 1.4826141595840454,
"step": 1400
},
{
"epoch": 2.507223827517226,
"grad_norm": 1.6863813400268555,
"learning_rate": 3.234222580780405e-07,
"logits/chosen": -14.408329963684082,
"logits/rejected": -14.414642333984375,
"logps/chosen": -1.4218008518218994,
"logps/rejected": -1.4535105228424072,
"loss": 1.4962,
"odds_ratio_loss": 0.7441353797912598,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -0.14218010008335114,
"rewards/margins": 0.0031709708273410797,
"rewards/rejected": -0.1453510820865631,
"sft_loss": 1.4218008518218994,
"step": 1410
},
{
"epoch": 2.525005556790398,
"grad_norm": 4.5228376388549805,
"learning_rate": 3.0088398414982375e-07,
"logits/chosen": -14.276860237121582,
"logits/rejected": -14.423869132995605,
"logps/chosen": -1.5565452575683594,
"logps/rejected": -1.6457360982894897,
"loss": 1.6331,
"odds_ratio_loss": 0.7656282782554626,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.15565453469753265,
"rewards/margins": 0.008919079788029194,
"rewards/rejected": -0.16457359492778778,
"sft_loss": 1.5565452575683594,
"step": 1420
},
{
"epoch": 2.54278728606357,
"grad_norm": 6.204384803771973,
"learning_rate": 2.7910932038184487e-07,
"logits/chosen": -14.392138481140137,
"logits/rejected": -14.114950180053711,
"logps/chosen": -1.4198369979858398,
"logps/rejected": -1.5630581378936768,
"loss": 1.4895,
"odds_ratio_loss": 0.6968662142753601,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.14198370277881622,
"rewards/margins": 0.014322122558951378,
"rewards/rejected": -0.15630581974983215,
"sft_loss": 1.4198369979858398,
"step": 1430
},
{
"epoch": 2.5605690153367417,
"grad_norm": 2.048856258392334,
"learning_rate": 2.5810582770057325e-07,
"logits/chosen": -14.456823348999023,
"logits/rejected": -14.554224967956543,
"logps/chosen": -1.400504469871521,
"logps/rejected": -1.4313546419143677,
"loss": 1.4769,
"odds_ratio_loss": 0.764117419719696,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.14005044102668762,
"rewards/margins": 0.0030850153416395187,
"rewards/rejected": -0.1431354582309723,
"sft_loss": 1.400504469871521,
"step": 1440
},
{
"epoch": 2.578350744609913,
"grad_norm": 1.2424274682998657,
"learning_rate": 2.3788079925484402e-07,
"logits/chosen": -14.654197692871094,
"logits/rejected": -14.54980754852295,
"logps/chosen": -1.4363069534301758,
"logps/rejected": -1.4395811557769775,
"loss": 1.5116,
"odds_ratio_loss": 0.7526046633720398,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.14363068342208862,
"rewards/margins": 0.0003274245245847851,
"rewards/rejected": -0.14395812153816223,
"sft_loss": 1.4363069534301758,
"step": 1450
},
{
"epoch": 2.596132473883085,
"grad_norm": 1.602452039718628,
"learning_rate": 2.1844125788342661e-07,
"logits/chosen": -14.462437629699707,
"logits/rejected": -14.348231315612793,
"logps/chosen": -1.4353057146072388,
"logps/rejected": -1.6425654888153076,
"loss": 1.5064,
"odds_ratio_loss": 0.711114227771759,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.14353057742118835,
"rewards/margins": 0.020725984126329422,
"rewards/rejected": -0.16425655782222748,
"sft_loss": 1.4353057146072388,
"step": 1460
},
{
"epoch": 2.613914203156257,
"grad_norm": 1.1172051429748535,
"learning_rate": 1.9979395367644428e-07,
"logits/chosen": -14.612176895141602,
"logits/rejected": -14.652105331420898,
"logps/chosen": -1.3911590576171875,
"logps/rejected": -1.5478867292404175,
"loss": 1.4586,
"odds_ratio_loss": 0.674477756023407,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.13911592960357666,
"rewards/margins": 0.01567276194691658,
"rewards/rejected": -0.15478867292404175,
"sft_loss": 1.3911590576171875,
"step": 1470
},
{
"epoch": 2.631695932429429,
"grad_norm": 1.9138628244400024,
"learning_rate": 1.81945361631512e-07,
"logits/chosen": -14.578519821166992,
"logits/rejected": -14.719879150390625,
"logps/chosen": -1.458441972732544,
"logps/rejected": -1.4730455875396729,
"loss": 1.5353,
"odds_ratio_loss": 0.7685132622718811,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.14584419131278992,
"rewards/margins": 0.0014603782910853624,
"rewards/rejected": -0.14730457961559296,
"sft_loss": 1.458441972732544,
"step": 1480
},
{
"epoch": 2.6494776617026004,
"grad_norm": 2.06256103515625,
"learning_rate": 1.6490167940538343e-07,
"logits/chosen": -14.520217895507812,
"logits/rejected": -14.437647819519043,
"logps/chosen": -1.4276152849197388,
"logps/rejected": -1.4981720447540283,
"loss": 1.5003,
"odds_ratio_loss": 0.7272015810012817,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.14276152849197388,
"rewards/margins": 0.007055687252432108,
"rewards/rejected": -0.14981719851493835,
"sft_loss": 1.4276152849197388,
"step": 1490
},
{
"epoch": 2.6672593909757722,
"grad_norm": 1.2633237838745117,
"learning_rate": 1.4866882516191339e-07,
"logits/chosen": -14.439382553100586,
"logits/rejected": -14.5440034866333,
"logps/chosen": -1.4182121753692627,
"logps/rejected": -1.5234054327011108,
"loss": 1.4914,
"odds_ratio_loss": 0.7320746779441833,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.1418212354183197,
"rewards/margins": 0.01051931269466877,
"rewards/rejected": -0.15234054625034332,
"sft_loss": 1.4182121753692627,
"step": 1500
},
{
"epoch": 2.6672593909757722,
"eval_logits/chosen": -14.466456413269043,
"eval_logits/rejected": -14.543242454528809,
"eval_logps/chosen": -1.44402277469635,
"eval_logps/rejected": -1.5389362573623657,
"eval_loss": 1.5154520273208618,
"eval_odds_ratio_loss": 0.7142924666404724,
"eval_rewards/accuracies": 0.5090000033378601,
"eval_rewards/chosen": -0.14440228044986725,
"eval_rewards/margins": 0.00949135422706604,
"eval_rewards/rejected": -0.15389364957809448,
"eval_runtime": 411.9433,
"eval_samples_per_second": 2.428,
"eval_sft_loss": 1.44402277469635,
"eval_steps_per_second": 1.214,
"step": 1500
},
{
"epoch": 2.685041120248944,
"grad_norm": 2.1881158351898193,
"learning_rate": 1.3325243551706057e-07,
"logits/chosen": -14.318315505981445,
"logits/rejected": -14.482050895690918,
"logps/chosen": -1.5127537250518799,
"logps/rejected": -1.706903100013733,
"loss": 1.5833,
"odds_ratio_loss": 0.7053884267807007,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.1512753665447235,
"rewards/margins": 0.01941494271159172,
"rewards/rejected": -0.17069032788276672,
"sft_loss": 1.5127537250518799,
"step": 1510
},
{
"epoch": 2.702822849522116,
"grad_norm": 1.7814643383026123,
"learning_rate": 1.1865786358165737e-07,
"logits/chosen": -14.374513626098633,
"logits/rejected": -14.638038635253906,
"logps/chosen": -1.4067102670669556,
"logps/rejected": -1.4657062292099,
"loss": 1.4794,
"odds_ratio_loss": 0.727219820022583,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.1406710147857666,
"rewards/margins": 0.005899603012949228,
"rewards/rejected": -0.14657063782215118,
"sft_loss": 1.4067102670669556,
"step": 1520
},
{
"epoch": 2.720604578795288,
"grad_norm": 4.55156135559082,
"learning_rate": 1.0489017710262311e-07,
"logits/chosen": -14.426725387573242,
"logits/rejected": -14.426490783691406,
"logps/chosen": -1.4883832931518555,
"logps/rejected": -1.6930698156356812,
"loss": 1.5615,
"odds_ratio_loss": 0.731151282787323,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.1488383412361145,
"rewards/margins": 0.020468655973672867,
"rewards/rejected": -0.16930699348449707,
"sft_loss": 1.4883832931518555,
"step": 1530
},
{
"epoch": 2.73838630806846,
"grad_norm": 1.255363941192627,
"learning_rate": 9.195415670326446e-08,
"logits/chosen": -14.457158088684082,
"logits/rejected": -14.496696472167969,
"logps/chosen": -1.4223277568817139,
"logps/rejected": -1.5442826747894287,
"loss": 1.4941,
"odds_ratio_loss": 0.7178906202316284,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.1422327756881714,
"rewards/margins": 0.012195492163300514,
"rewards/rejected": -0.15442825853824615,
"sft_loss": 1.4223277568817139,
"step": 1540
},
{
"epoch": 2.7561680373416317,
"grad_norm": 1.5739907026290894,
"learning_rate": 7.985429422327384e-08,
"logits/chosen": -14.502885818481445,
"logits/rejected": -14.488322257995605,
"logps/chosen": -1.3892616033554077,
"logps/rejected": -1.461104154586792,
"loss": 1.4636,
"odds_ratio_loss": 0.743794322013855,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.138926163315773,
"rewards/margins": 0.007184277288615704,
"rewards/rejected": -0.1461104303598404,
"sft_loss": 1.3892616033554077,
"step": 1550
},
{
"epoch": 2.773949766614803,
"grad_norm": 1.2010091543197632,
"learning_rate": 6.859479115900818e-08,
"logits/chosen": -14.52282428741455,
"logits/rejected": -14.561497688293457,
"logps/chosen": -1.4343992471694946,
"logps/rejected": -1.5450398921966553,
"loss": 1.5068,
"odds_ratio_loss": 0.7239341735839844,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.14343991875648499,
"rewards/margins": 0.011064080521464348,
"rewards/rejected": -0.15450401604175568,
"sft_loss": 1.4343992471694946,
"step": 1560
},
{
"epoch": 2.791731495887975,
"grad_norm": 4.445690631866455,
"learning_rate": 5.817955720457902e-08,
"logits/chosen": -14.513537406921387,
"logits/rejected": -14.568794250488281,
"logps/chosen": -1.4571216106414795,
"logps/rejected": -1.440791130065918,
"loss": 1.5361,
"odds_ratio_loss": 0.7899585962295532,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.14571216702461243,
"rewards/margins": -0.0016330400248989463,
"rewards/rejected": -0.14407911896705627,
"sft_loss": 1.4571216106414795,
"step": 1570
},
{
"epoch": 2.809513225161147,
"grad_norm": 3.594475030899048,
"learning_rate": 4.861220889427199e-08,
"logits/chosen": -14.410969734191895,
"logits/rejected": -14.302682876586914,
"logps/chosen": -1.4560983180999756,
"logps/rejected": -1.5099445581436157,
"loss": 1.5329,
"odds_ratio_loss": 0.7683452367782593,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.14560985565185547,
"rewards/margins": 0.005384599789977074,
"rewards/rejected": -0.1509944498538971,
"sft_loss": 1.4560983180999756,
"step": 1580
},
{
"epoch": 2.827294954434319,
"grad_norm": 0.9240034818649292,
"learning_rate": 3.9896068346758074e-08,
"logits/chosen": -14.375950813293457,
"logits/rejected": -14.428239822387695,
"logps/chosen": -1.4449069499969482,
"logps/rejected": -1.507812738418579,
"loss": 1.519,
"odds_ratio_loss": 0.7404919266700745,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.14449068903923035,
"rewards/margins": 0.0062905787490308285,
"rewards/rejected": -0.1507812738418579,
"sft_loss": 1.4449069499969482,
"step": 1590
},
{
"epoch": 2.8450766837074903,
"grad_norm": 4.377365589141846,
"learning_rate": 3.203416211153832e-08,
"logits/chosen": -14.38147258758545,
"logits/rejected": -14.620170593261719,
"logps/chosen": -1.417458176612854,
"logps/rejected": -1.5342447757720947,
"loss": 1.491,
"odds_ratio_loss": 0.7356201410293579,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.14174583554267883,
"rewards/margins": 0.011678656563162804,
"rewards/rejected": -0.153424471616745,
"sft_loss": 1.417458176612854,
"step": 1600
},
{
"epoch": 2.8628584129806622,
"grad_norm": 5.88523006439209,
"learning_rate": 2.5029220118019393e-08,
"logits/chosen": -14.416727066040039,
"logits/rejected": -14.47779655456543,
"logps/chosen": -1.5760120153427124,
"logps/rejected": -1.5707772970199585,
"loss": 1.6535,
"odds_ratio_loss": 0.774849534034729,
"rewards/accuracies": 0.39375001192092896,
"rewards/chosen": -0.15760120749473572,
"rewards/margins": -0.0005234652198851109,
"rewards/rejected": -0.15707774460315704,
"sft_loss": 1.5760120153427124,
"step": 1610
},
{
"epoch": 2.880640142253834,
"grad_norm": 2.9929535388946533,
"learning_rate": 1.8883674727586122e-08,
"logits/chosen": -14.4614839553833,
"logits/rejected": -14.47071647644043,
"logps/chosen": -1.3790298700332642,
"logps/rejected": -1.5752503871917725,
"loss": 1.4464,
"odds_ratio_loss": 0.6737684607505798,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.13790300488471985,
"rewards/margins": 0.01962204836308956,
"rewards/rejected": -0.15752503275871277,
"sft_loss": 1.3790298700332642,
"step": 1620
},
{
"epoch": 2.898421871527006,
"grad_norm": 0.8867095112800598,
"learning_rate": 1.3599659889000639e-08,
"logits/chosen": -14.817098617553711,
"logits/rejected": -14.705721855163574,
"logps/chosen": -1.4359638690948486,
"logps/rejected": -1.4747111797332764,
"loss": 1.5093,
"odds_ratio_loss": 0.7331098914146423,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.14359638094902039,
"rewards/margins": 0.0038747270591557026,
"rewards/rejected": -0.1474711149930954,
"sft_loss": 1.4359638690948486,
"step": 1630
},
{
"epoch": 2.916203600800178,
"grad_norm": 2.0239081382751465,
"learning_rate": 9.179010397421528e-09,
"logits/chosen": -14.512980461120605,
"logits/rejected": -14.645942687988281,
"logps/chosen": -1.4728834629058838,
"logps/rejected": -1.5710668563842773,
"loss": 1.5439,
"odds_ratio_loss": 0.709987998008728,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.14728835225105286,
"rewards/margins": 0.00981833878904581,
"rewards/rejected": -0.1571066826581955,
"sft_loss": 1.4728834629058838,
"step": 1640
},
{
"epoch": 2.93398533007335,
"grad_norm": 3.0479774475097656,
"learning_rate": 5.623261257296509e-09,
"logits/chosen": -14.522771835327148,
"logits/rejected": -14.621217727661133,
"logps/chosen": -1.3921829462051392,
"logps/rejected": -1.5190517902374268,
"loss": 1.461,
"odds_ratio_loss": 0.6878436803817749,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.1392182856798172,
"rewards/margins": 0.012686875648796558,
"rewards/rejected": -0.15190517902374268,
"sft_loss": 1.3921829462051392,
"step": 1650
},
{
"epoch": 2.9517670593465217,
"grad_norm": 1.4828240871429443,
"learning_rate": 2.933647149357122e-09,
"logits/chosen": -14.36890697479248,
"logits/rejected": -14.4072847366333,
"logps/chosen": -1.3632400035858154,
"logps/rejected": -1.4689347743988037,
"loss": 1.4361,
"odds_ratio_loss": 0.7281399369239807,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.13632400333881378,
"rewards/margins": 0.010569497011601925,
"rewards/rejected": -0.1468934863805771,
"sft_loss": 1.3632400035858154,
"step": 1660
},
{
"epoch": 2.969548788619693,
"grad_norm": 1.2580466270446777,
"learning_rate": 1.1111020018930717e-09,
"logits/chosen": -14.664459228515625,
"logits/rejected": -14.4978666305542,
"logps/chosen": -1.4255958795547485,
"logps/rejected": -1.4593638181686401,
"loss": 1.5006,
"odds_ratio_loss": 0.7499974966049194,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.14255960285663605,
"rewards/margins": 0.0033767870627343655,
"rewards/rejected": -0.14593638479709625,
"sft_loss": 1.4255958795547485,
"step": 1670
},
{
"epoch": 2.987330517892865,
"grad_norm": 1.022547960281372,
"learning_rate": 1.5625866646051813e-10,
"logits/chosen": -14.368474960327148,
"logits/rejected": -14.423803329467773,
"logps/chosen": -1.413266897201538,
"logps/rejected": -1.573032021522522,
"loss": 1.4799,
"odds_ratio_loss": 0.6666654348373413,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.1413266807794571,
"rewards/margins": 0.015976523980498314,
"rewards/rejected": -0.15730319917201996,
"sft_loss": 1.413266897201538,
"step": 1680
},
{
"epoch": 2.997999555456768,
"step": 1686,
"total_flos": 1.8091810238164992e+18,
"train_loss": 1.5885293396059446,
"train_runtime": 25020.7826,
"train_samples_per_second": 1.079,
"train_steps_per_second": 0.067
}
],
"logging_steps": 10,
"max_steps": 1686,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 1.8091810238164992e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}