Vicuna-7B-v1.5-ORPO / trainer_state.json
chchen's picture
End of training
a21bf1d verified
raw
history blame contribute delete
No virus
107 kB
{
"best_metric": 1.0073015689849854,
"best_model_checkpoint": "saves/Vicuna-7B-v1.5/lora/orpo/checkpoint-1500",
"epoch": 2.997999555456768,
"eval_steps": 500,
"global_step": 1686,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017781729273171815,
"grad_norm": 0.3158996105194092,
"learning_rate": 4.9995745934141085e-06,
"logits/chosen": -0.7898403406143188,
"logits/rejected": -0.7731221914291382,
"logps/chosen": -1.1474043130874634,
"logps/rejected": -1.2031431198120117,
"loss": 1.227,
"odds_ratio_loss": 0.7959282994270325,
"rewards/accuracies": 0.4124999940395355,
"rewards/chosen": -0.11474044620990753,
"rewards/margins": 0.005573858506977558,
"rewards/rejected": -0.12031430006027222,
"sft_loss": 1.1474043130874634,
"step": 10
},
{
"epoch": 0.03556345854634363,
"grad_norm": 0.8646821975708008,
"learning_rate": 4.9982812903243405e-06,
"logits/chosen": -0.7618139982223511,
"logits/rejected": -0.7260042428970337,
"logps/chosen": -0.9931285977363586,
"logps/rejected": -1.050875186920166,
"loss": 1.0707,
"odds_ratio_loss": 0.7757659554481506,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.09931285679340363,
"rewards/margins": 0.005774644669145346,
"rewards/rejected": -0.10508750379085541,
"sft_loss": 0.9931285977363586,
"step": 20
},
{
"epoch": 0.05334518781951545,
"grad_norm": 0.2927573025226593,
"learning_rate": 4.996120496405222e-06,
"logits/chosen": -0.7767494916915894,
"logits/rejected": -0.7559677362442017,
"logps/chosen": -1.040177345275879,
"logps/rejected": -1.2401186227798462,
"loss": 1.1087,
"odds_ratio_loss": 0.6853717565536499,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.10401773452758789,
"rewards/margins": 0.019994117319583893,
"rewards/rejected": -0.12401185184717178,
"sft_loss": 1.040177345275879,
"step": 30
},
{
"epoch": 0.07112691709268726,
"grad_norm": 0.3339848518371582,
"learning_rate": 4.99309296196014e-06,
"logits/chosen": -0.7875353693962097,
"logits/rejected": -0.7857375741004944,
"logps/chosen": -1.0764983892440796,
"logps/rejected": -1.1753004789352417,
"loss": 1.1498,
"odds_ratio_loss": 0.7328984141349792,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.10764984041452408,
"rewards/margins": 0.009880214929580688,
"rewards/rejected": -0.11753007024526596,
"sft_loss": 1.0764983892440796,
"step": 40
},
{
"epoch": 0.08890864636585907,
"grad_norm": 0.3153611719608307,
"learning_rate": 4.989199738255166e-06,
"logits/chosen": -0.7786640524864197,
"logits/rejected": -0.7964621782302856,
"logps/chosen": -1.0476799011230469,
"logps/rejected": -1.1452114582061768,
"loss": 1.1221,
"odds_ratio_loss": 0.7446193099021912,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.10476799309253693,
"rewards/margins": 0.009753172285854816,
"rewards/rejected": -0.11452116817235947,
"sft_loss": 1.0476799011230469,
"step": 50
},
{
"epoch": 0.1066903756390309,
"grad_norm": 2.7500874996185303,
"learning_rate": 4.984442177154031e-06,
"logits/chosen": -0.7653383612632751,
"logits/rejected": -0.7529075741767883,
"logps/chosen": -1.1525957584381104,
"logps/rejected": -1.2310835123062134,
"loss": 1.2305,
"odds_ratio_loss": 0.7788733243942261,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.1152595728635788,
"rewards/margins": 0.007848784327507019,
"rewards/rejected": -0.12310836464166641,
"sft_loss": 1.1525957584381104,
"step": 60
},
{
"epoch": 0.12447210491220272,
"grad_norm": 0.3525276184082031,
"learning_rate": 4.978821930648704e-06,
"logits/chosen": -0.8071187734603882,
"logits/rejected": -0.7696810364723206,
"logps/chosen": -1.0399789810180664,
"logps/rejected": -1.0721027851104736,
"loss": 1.1208,
"odds_ratio_loss": 0.8085241317749023,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.10399790853261948,
"rewards/margins": 0.003212365787476301,
"rewards/rejected": -0.10721027851104736,
"sft_loss": 1.0399789810180664,
"step": 70
},
{
"epoch": 0.14225383418537452,
"grad_norm": 0.6355476379394531,
"learning_rate": 4.97234095028576e-06,
"logits/chosen": -0.738179624080658,
"logits/rejected": -0.7453175783157349,
"logps/chosen": -1.1585901975631714,
"logps/rejected": -1.2273097038269043,
"loss": 1.2343,
"odds_ratio_loss": 0.7569113969802856,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.1158590167760849,
"rewards/margins": 0.0068719410337507725,
"rewards/rejected": -0.12273095548152924,
"sft_loss": 1.1585901975631714,
"step": 80
},
{
"epoch": 0.16003556345854633,
"grad_norm": 0.2942532002925873,
"learning_rate": 4.965001486488743e-06,
"logits/chosen": -0.7591525316238403,
"logits/rejected": -0.7494860887527466,
"logps/chosen": -1.0791616439819336,
"logps/rejected": -1.2336231470108032,
"loss": 1.1471,
"odds_ratio_loss": 0.6791869401931763,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.10791617631912231,
"rewards/margins": 0.015446141362190247,
"rewards/rejected": -0.12336231768131256,
"sft_loss": 1.0791616439819336,
"step": 90
},
{
"epoch": 0.17781729273171815,
"grad_norm": 0.35266247391700745,
"learning_rate": 4.956806087776732e-06,
"logits/chosen": -0.6999791860580444,
"logits/rejected": -0.6948890686035156,
"logps/chosen": -1.0402957201004028,
"logps/rejected": -1.2390520572662354,
"loss": 1.1124,
"odds_ratio_loss": 0.7215061187744141,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.10402955859899521,
"rewards/margins": 0.01987563632428646,
"rewards/rejected": -0.12390519678592682,
"sft_loss": 1.0402957201004028,
"step": 100
},
{
"epoch": 0.19559902200489,
"grad_norm": 0.4545610845088959,
"learning_rate": 4.947757599879411e-06,
"logits/chosen": -0.7189663052558899,
"logits/rejected": -0.6851673126220703,
"logps/chosen": -1.147323489189148,
"logps/rejected": -1.289452314376831,
"loss": 1.2227,
"odds_ratio_loss": 0.7533982396125793,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.11473236232995987,
"rewards/margins": 0.014212870970368385,
"rewards/rejected": -0.1289452314376831,
"sft_loss": 1.147323489189148,
"step": 110
},
{
"epoch": 0.2133807512780618,
"grad_norm": 0.6324980854988098,
"learning_rate": 4.937859164748931e-06,
"logits/chosen": -0.7043695449829102,
"logits/rejected": -0.6795639991760254,
"logps/chosen": -1.0146863460540771,
"logps/rejected": -1.0826324224472046,
"loss": 1.0907,
"odds_ratio_loss": 0.760542094707489,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.10146863758563995,
"rewards/margins": 0.006794607732445002,
"rewards/rejected": -0.10826325416564941,
"sft_loss": 1.0146863460540771,
"step": 120
},
{
"epoch": 0.23116248055123362,
"grad_norm": 0.4255826771259308,
"learning_rate": 4.92711421946891e-06,
"logits/chosen": -0.6701909899711609,
"logits/rejected": -0.7547520995140076,
"logps/chosen": -1.0397005081176758,
"logps/rejected": -1.1938796043395996,
"loss": 1.1117,
"odds_ratio_loss": 0.7198113799095154,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.10397003591060638,
"rewards/margins": 0.015417915768921375,
"rewards/rejected": -0.11938796192407608,
"sft_loss": 1.0397005081176758,
"step": 130
},
{
"epoch": 0.24894420982440543,
"grad_norm": 0.7161264419555664,
"learning_rate": 4.915526495060961e-06,
"logits/chosen": -0.6202753782272339,
"logits/rejected": -0.64984530210495,
"logps/chosen": -1.0066936016082764,
"logps/rejected": -1.1723135709762573,
"loss": 1.0745,
"odds_ratio_loss": 0.6777721643447876,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.10066936165094376,
"rewards/margins": 0.016561999917030334,
"rewards/rejected": -0.1172313541173935,
"sft_loss": 1.0066936016082764,
"step": 140
},
{
"epoch": 0.26672593909757725,
"grad_norm": 0.540038526058197,
"learning_rate": 4.903100015189153e-06,
"logits/chosen": -0.5942473411560059,
"logits/rejected": -0.5408576726913452,
"logps/chosen": -0.9665758013725281,
"logps/rejected": -1.1337311267852783,
"loss": 1.0386,
"odds_ratio_loss": 0.719926118850708,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.09665757417678833,
"rewards/margins": 0.01671554148197174,
"rewards/rejected": -0.11337311565876007,
"sft_loss": 0.9665758013725281,
"step": 150
},
{
"epoch": 0.28450766837074903,
"grad_norm": 2.370271682739258,
"learning_rate": 4.889839094762848e-06,
"logits/chosen": -0.5599099397659302,
"logits/rejected": -0.5666571855545044,
"logps/chosen": -1.0475890636444092,
"logps/rejected": -1.1946136951446533,
"loss": 1.1206,
"odds_ratio_loss": 0.7300440073013306,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.10475890338420868,
"rewards/margins": 0.014702451415359974,
"rewards/rejected": -0.11946137249469757,
"sft_loss": 1.0475890636444092,
"step": 160
},
{
"epoch": 0.3022893976439209,
"grad_norm": 0.37259843945503235,
"learning_rate": 4.875748338438416e-06,
"logits/chosen": -0.5827142000198364,
"logits/rejected": -0.5626250505447388,
"logps/chosen": -0.9911508560180664,
"logps/rejected": -1.0813571214675903,
"loss": 1.0632,
"odds_ratio_loss": 0.720399022102356,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.09911508113145828,
"rewards/margins": 0.009020629338920116,
"rewards/rejected": -0.10813571512699127,
"sft_loss": 0.9911508560180664,
"step": 170
},
{
"epoch": 0.32007112691709266,
"grad_norm": 0.3821701109409332,
"learning_rate": 4.8608326390203386e-06,
"logits/chosen": -0.6059321165084839,
"logits/rejected": -0.5918234586715698,
"logps/chosen": -0.9553475379943848,
"logps/rejected": -1.1111819744110107,
"loss": 1.0245,
"odds_ratio_loss": 0.6911659240722656,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.09553476423025131,
"rewards/margins": 0.01558343879878521,
"rewards/rejected": -0.11111819744110107,
"sft_loss": 0.9553475379943848,
"step": 180
},
{
"epoch": 0.3378528561902645,
"grad_norm": 0.3977317810058594,
"learning_rate": 4.845097175762251e-06,
"logits/chosen": -0.49882182478904724,
"logits/rejected": -0.48370814323425293,
"logps/chosen": -0.989281952381134,
"logps/rejected": -1.0615712404251099,
"loss": 1.0617,
"odds_ratio_loss": 0.7244290113449097,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.09892819821834564,
"rewards/margins": 0.007228921167552471,
"rewards/rejected": -0.10615710914134979,
"sft_loss": 0.989281952381134,
"step": 190
},
{
"epoch": 0.3556345854634363,
"grad_norm": 0.46290695667266846,
"learning_rate": 4.8285474125685286e-06,
"logits/chosen": -0.518696129322052,
"logits/rejected": -0.5193291306495667,
"logps/chosen": -1.1205590963363647,
"logps/rejected": -1.1714627742767334,
"loss": 1.198,
"odds_ratio_loss": 0.7740126252174377,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.11205589771270752,
"rewards/margins": 0.00509037496522069,
"rewards/rejected": -0.11714627593755722,
"sft_loss": 1.1205590963363647,
"step": 200
},
{
"epoch": 0.37341631473660813,
"grad_norm": 0.32425227761268616,
"learning_rate": 4.811189096097025e-06,
"logits/chosen": -0.5530649423599243,
"logits/rejected": -0.5483794808387756,
"logps/chosen": -0.9994535446166992,
"logps/rejected": -1.1620233058929443,
"loss": 1.0712,
"odds_ratio_loss": 0.7175347208976746,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.09994535893201828,
"rewards/margins": 0.01625697687268257,
"rewards/rejected": -0.11620233952999115,
"sft_loss": 0.9994535446166992,
"step": 210
},
{
"epoch": 0.39119804400978,
"grad_norm": 0.5374495387077332,
"learning_rate": 4.793028253763633e-06,
"logits/chosen": -0.46489372849464417,
"logits/rejected": -0.49711689352989197,
"logps/chosen": -0.9644722938537598,
"logps/rejected": -1.098311185836792,
"loss": 1.0422,
"odds_ratio_loss": 0.7768682837486267,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.09644722938537598,
"rewards/margins": 0.013383878394961357,
"rewards/rejected": -0.10983110964298248,
"sft_loss": 0.9644722938537598,
"step": 220
},
{
"epoch": 0.40897977328295176,
"grad_norm": 0.7932880520820618,
"learning_rate": 4.774071191649352e-06,
"logits/chosen": -0.5470231771469116,
"logits/rejected": -0.5435986518859863,
"logps/chosen": -0.9579310417175293,
"logps/rejected": -1.1810802221298218,
"loss": 1.0212,
"odds_ratio_loss": 0.6330138444900513,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.09579310566186905,
"rewards/margins": 0.02231491729617119,
"rewards/rejected": -0.11810803413391113,
"sft_loss": 0.9579310417175293,
"step": 230
},
{
"epoch": 0.4267615025561236,
"grad_norm": 0.618280291557312,
"learning_rate": 4.7543244923105975e-06,
"logits/chosen": -0.5025745630264282,
"logits/rejected": -0.4722610414028168,
"logps/chosen": -1.0212466716766357,
"logps/rejected": -1.0026448965072632,
"loss": 1.1058,
"odds_ratio_loss": 0.8450964093208313,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.1021246686577797,
"rewards/margins": -0.0018601752817630768,
"rewards/rejected": -0.10026448965072632,
"sft_loss": 1.0212466716766357,
"step": 240
},
{
"epoch": 0.4445432318292954,
"grad_norm": 0.39385247230529785,
"learning_rate": 4.733795012493506e-06,
"logits/chosen": -0.5138652324676514,
"logits/rejected": -0.4715350270271301,
"logps/chosen": -1.0123497247695923,
"logps/rejected": -1.13383150100708,
"loss": 1.0857,
"odds_ratio_loss": 0.7335414886474609,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.10123495757579803,
"rewards/margins": 0.012148191221058369,
"rewards/rejected": -0.11338315904140472,
"sft_loss": 1.0123497247695923,
"step": 250
},
{
"epoch": 0.46232496110246724,
"grad_norm": 0.3666248619556427,
"learning_rate": 4.712489880753035e-06,
"logits/chosen": -0.3967147171497345,
"logits/rejected": -0.3805852234363556,
"logps/chosen": -0.946629524230957,
"logps/rejected": -1.0246347188949585,
"loss": 1.0164,
"odds_ratio_loss": 0.6973500847816467,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.09466294944286346,
"rewards/margins": 0.007800529710948467,
"rewards/rejected": -0.1024634838104248,
"sft_loss": 0.946629524230957,
"step": 260
},
{
"epoch": 0.480106690375639,
"grad_norm": 0.6196191906929016,
"learning_rate": 4.690416494977673e-06,
"logits/chosen": -0.3590370714664459,
"logits/rejected": -0.3209628164768219,
"logps/chosen": -0.9477987289428711,
"logps/rejected": -1.1744658946990967,
"loss": 1.0133,
"odds_ratio_loss": 0.654593825340271,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.09477987140417099,
"rewards/margins": 0.02266671508550644,
"rewards/rejected": -0.11744660139083862,
"sft_loss": 0.9477987289428711,
"step": 270
},
{
"epoch": 0.49788841964881086,
"grad_norm": 0.38255006074905396,
"learning_rate": 4.667582519820639e-06,
"logits/chosen": -0.4478569030761719,
"logits/rejected": -0.40335726737976074,
"logps/chosen": -1.0600357055664062,
"logps/rejected": -1.0844862461090088,
"loss": 1.1374,
"odds_ratio_loss": 0.7734627723693848,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.10600356757640839,
"rewards/margins": 0.002445052145048976,
"rewards/rejected": -0.10844862461090088,
"sft_loss": 1.0600357055664062,
"step": 280
},
{
"epoch": 0.5156701489219827,
"grad_norm": 0.6143254637718201,
"learning_rate": 4.643995884038443e-06,
"logits/chosen": -0.42634057998657227,
"logits/rejected": -0.4024909436702728,
"logps/chosen": -1.0625637769699097,
"logps/rejected": -1.2203805446624756,
"loss": 1.1314,
"odds_ratio_loss": 0.6885315179824829,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.1062563806772232,
"rewards/margins": 0.01578168198466301,
"rewards/rejected": -0.12203805148601532,
"sft_loss": 1.0625637769699097,
"step": 290
},
{
"epoch": 0.5334518781951545,
"grad_norm": 0.3366183042526245,
"learning_rate": 4.6196647777377475e-06,
"logits/chosen": -0.37543022632598877,
"logits/rejected": -0.3797139525413513,
"logps/chosen": -0.9299192428588867,
"logps/rejected": -0.9767643213272095,
"loss": 1.0053,
"odds_ratio_loss": 0.7540563344955444,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.09299192577600479,
"rewards/margins": 0.004684499930590391,
"rewards/rejected": -0.09767641872167587,
"sft_loss": 0.9299192428588867,
"step": 300
},
{
"epoch": 0.5512336074683263,
"grad_norm": 0.5256261825561523,
"learning_rate": 4.59459764953147e-06,
"logits/chosen": -0.3965223431587219,
"logits/rejected": -0.4247291684150696,
"logps/chosen": -1.0226197242736816,
"logps/rejected": -1.121930718421936,
"loss": 1.0919,
"odds_ratio_loss": 0.6925050616264343,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.1022619754076004,
"rewards/margins": 0.00993109680712223,
"rewards/rejected": -0.11219307035207748,
"sft_loss": 1.0226197242736816,
"step": 310
},
{
"epoch": 0.5690153367414981,
"grad_norm": 0.5753230452537537,
"learning_rate": 4.568803203605133e-06,
"logits/chosen": -0.38987019658088684,
"logits/rejected": -0.40249496698379517,
"logps/chosen": -1.0238714218139648,
"logps/rejected": -1.191584825515747,
"loss": 1.0951,
"odds_ratio_loss": 0.7120264768600464,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.10238714516162872,
"rewards/margins": 0.016771327704191208,
"rewards/rejected": -0.11915846914052963,
"sft_loss": 1.0238714218139648,
"step": 320
},
{
"epoch": 0.58679706601467,
"grad_norm": 0.40169399976730347,
"learning_rate": 4.542290396694462e-06,
"logits/chosen": -0.4059433043003082,
"logits/rejected": -0.4052697718143463,
"logps/chosen": -0.9671312570571899,
"logps/rejected": -1.0644605159759521,
"loss": 1.0391,
"odds_ratio_loss": 0.7196342349052429,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.09671313315629959,
"rewards/margins": 0.009732924401760101,
"rewards/rejected": -0.1064460501074791,
"sft_loss": 0.9671312570571899,
"step": 330
},
{
"epoch": 0.6045787952878418,
"grad_norm": 0.5619000792503357,
"learning_rate": 4.515068434975298e-06,
"logits/chosen": -0.4578043818473816,
"logits/rejected": -0.4284750819206238,
"logps/chosen": -0.9811161756515503,
"logps/rejected": -1.1456761360168457,
"loss": 1.0484,
"odds_ratio_loss": 0.6727977991104126,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.09811162203550339,
"rewards/margins": 0.016455989331007004,
"rewards/rejected": -0.11456761509180069,
"sft_loss": 0.9811161756515503,
"step": 340
},
{
"epoch": 0.6223605245610135,
"grad_norm": 0.5821824073791504,
"learning_rate": 4.487146770866887e-06,
"logits/chosen": -0.34484004974365234,
"logits/rejected": -0.3222612738609314,
"logps/chosen": -1.0583232641220093,
"logps/rejected": -1.117333173751831,
"loss": 1.1304,
"odds_ratio_loss": 0.7205663919448853,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.10583231598138809,
"rewards/margins": 0.005901001859456301,
"rewards/rejected": -0.1117333322763443,
"sft_loss": 1.0583232641220093,
"step": 350
},
{
"epoch": 0.6401422538341853,
"grad_norm": 0.28447961807250977,
"learning_rate": 4.458535099749666e-06,
"logits/chosen": -0.43229636549949646,
"logits/rejected": -0.40540462732315063,
"logps/chosen": -1.1308929920196533,
"logps/rejected": -1.0958976745605469,
"loss": 1.2174,
"odds_ratio_loss": 0.8652679324150085,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.11308930814266205,
"rewards/margins": -0.0034995335154235363,
"rewards/rejected": -0.10958977788686752,
"sft_loss": 1.1308929920196533,
"step": 360
},
{
"epoch": 0.6579239831073572,
"grad_norm": 0.27178603410720825,
"learning_rate": 4.429243356598694e-06,
"logits/chosen": -0.40932542085647583,
"logits/rejected": -0.3859841227531433,
"logps/chosen": -0.9554696083068848,
"logps/rejected": -1.1517064571380615,
"loss": 1.0243,
"odds_ratio_loss": 0.6880883574485779,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.09554696083068848,
"rewards/margins": 0.019623693078756332,
"rewards/rejected": -0.11517064273357391,
"sft_loss": 0.9554696083068848,
"step": 370
},
{
"epoch": 0.675705712380529,
"grad_norm": 0.34544578194618225,
"learning_rate": 4.399281712533875e-06,
"logits/chosen": -0.32934245467185974,
"logits/rejected": -0.3599315285682678,
"logps/chosen": -0.9367265701293945,
"logps/rejected": -1.0202996730804443,
"loss": 1.0101,
"odds_ratio_loss": 0.7333763837814331,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.09367264807224274,
"rewards/margins": 0.008357317186892033,
"rewards/rejected": -0.1020299643278122,
"sft_loss": 0.9367265701293945,
"step": 380
},
{
"epoch": 0.6934874416537008,
"grad_norm": 0.48474597930908203,
"learning_rate": 4.368660571288192e-06,
"logits/chosen": -0.3377426266670227,
"logits/rejected": -0.32565537095069885,
"logps/chosen": -0.9353078007698059,
"logps/rejected": -1.0242602825164795,
"loss": 1.0071,
"odds_ratio_loss": 0.7176766395568848,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.09353077411651611,
"rewards/margins": 0.008895261213183403,
"rewards/rejected": -0.10242603719234467,
"sft_loss": 0.9353078007698059,
"step": 390
},
{
"epoch": 0.7112691709268726,
"grad_norm": 0.3825822174549103,
"learning_rate": 4.337390565595163e-06,
"logits/chosen": -0.4158423840999603,
"logits/rejected": -0.36646509170532227,
"logps/chosen": -1.0673354864120483,
"logps/rejected": -1.0877690315246582,
"loss": 1.1448,
"odds_ratio_loss": 0.7746785879135132,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.1067335456609726,
"rewards/margins": 0.0020433522295206785,
"rewards/rejected": -0.10877690464258194,
"sft_loss": 1.0673354864120483,
"step": 400
},
{
"epoch": 0.7290509002000445,
"grad_norm": 0.36279189586639404,
"learning_rate": 4.305482553496786e-06,
"logits/chosen": -0.33700472116470337,
"logits/rejected": -0.3831488788127899,
"logps/chosen": -0.9607623815536499,
"logps/rejected": -1.0405422449111938,
"loss": 1.0363,
"odds_ratio_loss": 0.7554237842559814,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.09607623517513275,
"rewards/margins": 0.007977982982993126,
"rewards/rejected": -0.10405422747135162,
"sft_loss": 0.9607623815536499,
"step": 410
},
{
"epoch": 0.7468326294732163,
"grad_norm": 0.457087904214859,
"learning_rate": 4.272947614573244e-06,
"logits/chosen": -0.3999176621437073,
"logits/rejected": -0.3756122291088104,
"logps/chosen": -1.0111384391784668,
"logps/rejected": -1.0757354497909546,
"loss": 1.0826,
"odds_ratio_loss": 0.7144282460212708,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.10111384093761444,
"rewards/margins": 0.006459714379161596,
"rewards/rejected": -0.10757355391979218,
"sft_loss": 1.0111384391784668,
"step": 420
},
{
"epoch": 0.7646143587463881,
"grad_norm": 0.2605019509792328,
"learning_rate": 4.23979704609569e-06,
"logits/chosen": -0.36384835839271545,
"logits/rejected": -0.34030967950820923,
"logps/chosen": -0.9615520238876343,
"logps/rejected": -1.0373448133468628,
"loss": 1.0309,
"odds_ratio_loss": 0.6935026049613953,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.09615520387887955,
"rewards/margins": 0.007579285651445389,
"rewards/rejected": -0.10373447835445404,
"sft_loss": 0.9615520238876343,
"step": 430
},
{
"epoch": 0.78239608801956,
"grad_norm": 0.41911929845809937,
"learning_rate": 4.206042359103435e-06,
"logits/chosen": -0.38596296310424805,
"logits/rejected": -0.37879234552383423,
"logps/chosen": -0.9808257222175598,
"logps/rejected": -1.121048927307129,
"loss": 1.0531,
"odds_ratio_loss": 0.7229377627372742,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.09808257222175598,
"rewards/margins": 0.014022317714989185,
"rewards/rejected": -0.11210489273071289,
"sft_loss": 0.9808257222175598,
"step": 440
},
{
"epoch": 0.8001778172927317,
"grad_norm": 0.7460839748382568,
"learning_rate": 4.17169527440691e-06,
"logits/chosen": -0.39514169096946716,
"logits/rejected": -0.3737938106060028,
"logps/chosen": -0.9438737630844116,
"logps/rejected": -1.0060594081878662,
"loss": 1.0182,
"odds_ratio_loss": 0.7436385154724121,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.09438737481832504,
"rewards/margins": 0.006218560039997101,
"rewards/rejected": -0.10060594230890274,
"sft_loss": 0.9438737630844116,
"step": 450
},
{
"epoch": 0.8179595465659035,
"grad_norm": 0.5300458669662476,
"learning_rate": 4.136767718517797e-06,
"logits/chosen": -0.3699805736541748,
"logits/rejected": -0.3850511312484741,
"logps/chosen": -0.959467887878418,
"logps/rejected": -1.100988507270813,
"loss": 1.0256,
"odds_ratio_loss": 0.6614881753921509,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.0959467813372612,
"rewards/margins": 0.014152060262858868,
"rewards/rejected": -0.11009885370731354,
"sft_loss": 0.959467887878418,
"step": 460
},
{
"epoch": 0.8357412758390753,
"grad_norm": 0.9485012292861938,
"learning_rate": 4.1012718195077196e-06,
"logits/chosen": -0.37103739380836487,
"logits/rejected": -0.3039020895957947,
"logps/chosen": -0.9647709131240845,
"logps/rejected": -1.0279747247695923,
"loss": 1.0376,
"odds_ratio_loss": 0.7286756038665771,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.09647707641124725,
"rewards/margins": 0.006320389453321695,
"rewards/rejected": -0.10279747098684311,
"sft_loss": 0.9647709131240845,
"step": 470
},
{
"epoch": 0.8535230051122472,
"grad_norm": 0.5754956603050232,
"learning_rate": 4.065219902796953e-06,
"logits/chosen": -0.40020495653152466,
"logits/rejected": -0.39535146951675415,
"logps/chosen": -0.9706109166145325,
"logps/rejected": -1.093976378440857,
"loss": 1.0453,
"odds_ratio_loss": 0.7464355230331421,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.09706110507249832,
"rewards/margins": 0.01233654748648405,
"rewards/rejected": -0.10939764976501465,
"sft_loss": 0.9706109166145325,
"step": 480
},
{
"epoch": 0.871304734385419,
"grad_norm": 0.3195387125015259,
"learning_rate": 4.028624486874608e-06,
"logits/chosen": -0.4315417408943176,
"logits/rejected": -0.36453911662101746,
"logps/chosen": -0.9465911984443665,
"logps/rejected": -1.1121985912322998,
"loss": 1.0194,
"odds_ratio_loss": 0.7276239991188049,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.09465911984443665,
"rewards/margins": 0.016560742631554604,
"rewards/rejected": -0.1112198606133461,
"sft_loss": 0.9465911984443665,
"step": 490
},
{
"epoch": 0.8890864636585908,
"grad_norm": 0.6305994391441345,
"learning_rate": 3.99149827895177e-06,
"logits/chosen": -0.38445502519607544,
"logits/rejected": -0.38218945264816284,
"logps/chosen": -1.0171244144439697,
"logps/rejected": -1.0506142377853394,
"loss": 1.0913,
"odds_ratio_loss": 0.7415187358856201,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -0.10171245038509369,
"rewards/margins": 0.0033489768393337727,
"rewards/rejected": -0.10506142675876617,
"sft_loss": 1.0171244144439697,
"step": 500
},
{
"epoch": 0.8890864636585908,
"eval_logits/chosen": -0.34904247522354126,
"eval_logits/rejected": -0.31755369901657104,
"eval_logps/chosen": -0.9676439166069031,
"eval_logps/rejected": -1.1074860095977783,
"eval_loss": 1.0354068279266357,
"eval_odds_ratio_loss": 0.6776295900344849,
"eval_rewards/accuracies": 0.5180000066757202,
"eval_rewards/chosen": -0.09676438570022583,
"eval_rewards/margins": 0.013984210789203644,
"eval_rewards/rejected": -0.11074860394001007,
"eval_runtime": 185.9798,
"eval_samples_per_second": 5.377,
"eval_sft_loss": 0.9676439166069031,
"eval_steps_per_second": 2.688,
"step": 500
},
{
"epoch": 0.9068681929317626,
"grad_norm": 0.33740749955177307,
"learning_rate": 3.953854170549114e-06,
"logits/chosen": -0.3074025809764862,
"logits/rejected": -0.30263853073120117,
"logps/chosen": -0.9824435114860535,
"logps/rejected": -1.0204169750213623,
"loss": 1.0555,
"odds_ratio_loss": 0.7308207750320435,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.09824434667825699,
"rewards/margins": 0.0037973597645759583,
"rewards/rejected": -0.10204169899225235,
"sft_loss": 0.9824435114860535,
"step": 510
},
{
"epoch": 0.9246499222049345,
"grad_norm": 0.4032406210899353,
"learning_rate": 3.91570523302051e-06,
"logits/chosen": -0.3414192199707031,
"logits/rejected": -0.36243736743927,
"logps/chosen": -0.8989545702934265,
"logps/rejected": -1.0376076698303223,
"loss": 0.9695,
"odds_ratio_loss": 0.7055255174636841,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.08989545702934265,
"rewards/margins": 0.013865319080650806,
"rewards/rejected": -0.10376076400279999,
"sft_loss": 0.8989545702934265,
"step": 520
},
{
"epoch": 0.9424316514781063,
"grad_norm": 0.3632182776927948,
"learning_rate": 3.8770647130141996e-06,
"logits/chosen": -0.3258126378059387,
"logits/rejected": -0.33273980021476746,
"logps/chosen": -0.9584708213806152,
"logps/rejected": -1.0552600622177124,
"loss": 1.0316,
"odds_ratio_loss": 0.731722891330719,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.09584707766771317,
"rewards/margins": 0.009678924456238747,
"rewards/rejected": -0.10552600771188736,
"sft_loss": 0.9584708213806152,
"step": 530
},
{
"epoch": 0.960213380751278,
"grad_norm": 0.3121795058250427,
"learning_rate": 3.837946027873086e-06,
"logits/chosen": -0.32046863436698914,
"logits/rejected": -0.3653668463230133,
"logps/chosen": -0.966636061668396,
"logps/rejected": -1.1031057834625244,
"loss": 1.0367,
"odds_ratio_loss": 0.7007311582565308,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.09666360169649124,
"rewards/margins": 0.01364696491509676,
"rewards/rejected": -0.11031056940555573,
"sft_loss": 0.966636061668396,
"step": 540
},
{
"epoch": 0.9779951100244498,
"grad_norm": 0.6487416625022888,
"learning_rate": 3.7983627609757713e-06,
"logits/chosen": -0.34747475385665894,
"logits/rejected": -0.3490690290927887,
"logps/chosen": -0.9615602493286133,
"logps/rejected": -1.0271753072738647,
"loss": 1.0318,
"odds_ratio_loss": 0.702663779258728,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.09615601599216461,
"rewards/margins": 0.0065615237690508366,
"rewards/rejected": -0.10271754115819931,
"sft_loss": 0.9615602493286133,
"step": 550
},
{
"epoch": 0.9957768392976217,
"grad_norm": 0.3890874683856964,
"learning_rate": 3.758328657019924e-06,
"logits/chosen": -0.37014687061309814,
"logits/rejected": -0.4008961319923401,
"logps/chosen": -0.9199098348617554,
"logps/rejected": -1.0562833547592163,
"loss": 0.9886,
"odds_ratio_loss": 0.6868860721588135,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.09199099242687225,
"rewards/margins": 0.013637351803481579,
"rewards/rejected": -0.1056283488869667,
"sft_loss": 0.9199098348617554,
"step": 560
},
{
"epoch": 1.0135585685707935,
"grad_norm": 1.5021965503692627,
"learning_rate": 3.717857617249642e-06,
"logits/chosen": -0.409252405166626,
"logits/rejected": -0.3774147033691406,
"logps/chosen": -1.0592560768127441,
"logps/rejected": -1.1887257099151611,
"loss": 1.135,
"odds_ratio_loss": 0.7577823400497437,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.10592560470104218,
"rewards/margins": 0.012946966104209423,
"rewards/rejected": -0.11887258291244507,
"sft_loss": 1.0592560768127441,
"step": 570
},
{
"epoch": 1.0313402978439654,
"grad_norm": 0.36601969599723816,
"learning_rate": 3.6769636946284543e-06,
"logits/chosen": -0.33855992555618286,
"logits/rejected": -0.38329094648361206,
"logps/chosen": -0.9246651530265808,
"logps/rejected": -1.0259661674499512,
"loss": 0.9949,
"odds_ratio_loss": 0.7019587755203247,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.09246651828289032,
"rewards/margins": 0.01013010274618864,
"rewards/rejected": -0.10259661823511124,
"sft_loss": 0.9246651530265808,
"step": 580
},
{
"epoch": 1.049122027117137,
"grad_norm": 0.3644584119319916,
"learning_rate": 3.6356610889596355e-06,
"logits/chosen": -0.3362785577774048,
"logits/rejected": -0.3195570707321167,
"logps/chosen": -0.9757383465766907,
"logps/rejected": -1.0168259143829346,
"loss": 1.0499,
"odds_ratio_loss": 0.7411800622940063,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.09757383167743683,
"rewards/margins": 0.004108763299882412,
"rewards/rejected": -0.10168258845806122,
"sft_loss": 0.9757383465766907,
"step": 590
},
{
"epoch": 1.066903756390309,
"grad_norm": 0.38790592551231384,
"learning_rate": 3.593964141955541e-06,
"logits/chosen": -0.31955039501190186,
"logits/rejected": -0.3287174701690674,
"logps/chosen": -0.9446002244949341,
"logps/rejected": -0.9857986569404602,
"loss": 1.0183,
"odds_ratio_loss": 0.7368658185005188,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.09446002542972565,
"rewards/margins": 0.004119834862649441,
"rewards/rejected": -0.09857985377311707,
"sft_loss": 0.9446002244949341,
"step": 600
},
{
"epoch": 1.0846854856634809,
"grad_norm": 0.3323744237422943,
"learning_rate": 3.5518873322576573e-06,
"logits/chosen": -0.43425217270851135,
"logits/rejected": -0.3568256199359894,
"logps/chosen": -0.9986424446105957,
"logps/rejected": -1.0531480312347412,
"loss": 1.073,
"odds_ratio_loss": 0.7439261674880981,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.09986423701047897,
"rewards/margins": 0.005450558383017778,
"rewards/rejected": -0.10531480610370636,
"sft_loss": 0.9986424446105957,
"step": 610
},
{
"epoch": 1.1024672149366526,
"grad_norm": 0.45893725752830505,
"learning_rate": 3.5094452704091143e-06,
"logits/chosen": -0.3812747299671173,
"logits/rejected": -0.36471351981163025,
"logps/chosen": -0.9423580169677734,
"logps/rejected": -1.0641114711761475,
"loss": 1.0114,
"odds_ratio_loss": 0.6907029747962952,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.09423580020666122,
"rewards/margins": 0.01217535138130188,
"rewards/rejected": -0.10641114413738251,
"sft_loss": 0.9423580169677734,
"step": 620
},
{
"epoch": 1.1202489442098245,
"grad_norm": 0.5117968916893005,
"learning_rate": 3.46665269378139e-06,
"logits/chosen": -0.3292369842529297,
"logits/rejected": -0.3725055158138275,
"logps/chosen": -0.9826286435127258,
"logps/rejected": -1.0871622562408447,
"loss": 1.0548,
"odds_ratio_loss": 0.7213753461837769,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.09826286137104034,
"rewards/margins": 0.010453373193740845,
"rewards/rejected": -0.10871622711420059,
"sft_loss": 0.9826286435127258,
"step": 630
},
{
"epoch": 1.1380306734829961,
"grad_norm": 0.5622742176055908,
"learning_rate": 3.4235244614569794e-06,
"logits/chosen": -0.3315224051475525,
"logits/rejected": -0.3257826566696167,
"logps/chosen": -1.1072447299957275,
"logps/rejected": -1.0443857908248901,
"loss": 1.1924,
"odds_ratio_loss": 0.8511736989021301,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.11072447150945663,
"rewards/margins": -0.006285896059125662,
"rewards/rejected": -0.10443858057260513,
"sft_loss": 1.1072447299957275,
"step": 640
},
{
"epoch": 1.155812402756168,
"grad_norm": 0.27428311109542847,
"learning_rate": 3.3800755490698008e-06,
"logits/chosen": -0.30900219082832336,
"logits/rejected": -0.33938735723495483,
"logps/chosen": -0.9312244653701782,
"logps/rejected": -1.0983222723007202,
"loss": 0.9964,
"odds_ratio_loss": 0.651997447013855,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0931224375963211,
"rewards/margins": 0.01670977845788002,
"rewards/rejected": -0.10983221232891083,
"sft_loss": 0.9312244653701782,
"step": 650
},
{
"epoch": 1.17359413202934,
"grad_norm": 1.0422977209091187,
"learning_rate": 3.3363210436051287e-06,
"logits/chosen": -0.3527902662754059,
"logits/rejected": -0.3563137948513031,
"logps/chosen": -0.978245735168457,
"logps/rejected": -1.0940849781036377,
"loss": 1.0514,
"odds_ratio_loss": 0.73140949010849,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.0978245884180069,
"rewards/margins": 0.011583918705582619,
"rewards/rejected": -0.10940849781036377,
"sft_loss": 0.978245735168457,
"step": 660
},
{
"epoch": 1.1913758613025116,
"grad_norm": 0.4168451428413391,
"learning_rate": 3.292276138160867e-06,
"logits/chosen": -0.28714054822921753,
"logits/rejected": -0.30155253410339355,
"logps/chosen": -0.934456467628479,
"logps/rejected": -1.0636101961135864,
"loss": 1.0032,
"odds_ratio_loss": 0.6879295110702515,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -0.09344564378261566,
"rewards/margins": 0.012915370985865593,
"rewards/rejected": -0.1063610091805458,
"sft_loss": 0.934456467628479,
"step": 670
},
{
"epoch": 1.2091575905756835,
"grad_norm": 0.34239086508750916,
"learning_rate": 3.2479561266719694e-06,
"logits/chosen": -0.381683886051178,
"logits/rejected": -0.37388402223587036,
"logps/chosen": -0.9762662649154663,
"logps/rejected": -1.0414526462554932,
"loss": 1.0493,
"odds_ratio_loss": 0.7306024432182312,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.09762662649154663,
"rewards/margins": 0.006518647074699402,
"rewards/rejected": -0.10414527356624603,
"sft_loss": 0.9762662649154663,
"step": 680
},
{
"epoch": 1.2269393198488552,
"grad_norm": 0.4666767716407776,
"learning_rate": 3.2033763985998533e-06,
"logits/chosen": -0.3561275601387024,
"logits/rejected": -0.3666972517967224,
"logps/chosen": -0.9278993606567383,
"logps/rejected": -1.172456979751587,
"loss": 0.9924,
"odds_ratio_loss": 0.6447319984436035,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.09278994053602219,
"rewards/margins": 0.024455763399600983,
"rewards/rejected": -0.11724568903446198,
"sft_loss": 0.9278993606567383,
"step": 690
},
{
"epoch": 1.244721049122027,
"grad_norm": 0.4466889202594757,
"learning_rate": 3.1585524335886335e-06,
"logits/chosen": -0.3700794279575348,
"logits/rejected": -0.37532711029052734,
"logps/chosen": -0.893964409828186,
"logps/rejected": -1.0242712497711182,
"loss": 0.9628,
"odds_ratio_loss": 0.6878638863563538,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.08939644694328308,
"rewards/margins": 0.013030675239861012,
"rewards/rejected": -0.10242712497711182,
"sft_loss": 0.893964409828186,
"step": 700
},
{
"epoch": 1.262502778395199,
"grad_norm": 0.6432116031646729,
"learning_rate": 3.1134997960900536e-06,
"logits/chosen": -0.3843459486961365,
"logits/rejected": -0.4183478355407715,
"logps/chosen": -0.8787266612052917,
"logps/rejected": -1.1227346658706665,
"loss": 0.9417,
"odds_ratio_loss": 0.6295467615127563,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.08787266910076141,
"rewards/margins": 0.024400796741247177,
"rewards/rejected": -0.11227346956729889,
"sft_loss": 0.8787266612052917,
"step": 710
},
{
"epoch": 1.2802845076683709,
"grad_norm": 0.47079232335090637,
"learning_rate": 3.0682341299589583e-06,
"logits/chosen": -0.3750189244747162,
"logits/rejected": -0.33040302991867065,
"logps/chosen": -0.9284566640853882,
"logps/rejected": -0.9662970304489136,
"loss": 1.0031,
"odds_ratio_loss": 0.7467560172080994,
"rewards/accuracies": 0.4124999940395355,
"rewards/chosen": -0.09284567832946777,
"rewards/margins": 0.0037840281147509813,
"rewards/rejected": -0.09662970155477524,
"sft_loss": 0.9284566640853882,
"step": 720
},
{
"epoch": 1.2980662369415426,
"grad_norm": 0.4881021976470947,
"learning_rate": 3.022771153021201e-06,
"logits/chosen": -0.3772386610507965,
"logits/rejected": -0.3512099087238312,
"logps/chosen": -0.9160524606704712,
"logps/rejected": -1.0388538837432861,
"loss": 0.986,
"odds_ratio_loss": 0.6990936994552612,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.09160524606704712,
"rewards/margins": 0.012280138209462166,
"rewards/rejected": -0.10388537496328354,
"sft_loss": 0.9160524606704712,
"step": 730
},
{
"epoch": 1.3158479662147144,
"grad_norm": 0.3279300034046173,
"learning_rate": 2.9771266516158625e-06,
"logits/chosen": -0.33211830258369446,
"logits/rejected": -0.3039989471435547,
"logps/chosen": -0.9333264231681824,
"logps/rejected": -1.0419334173202515,
"loss": 1.0054,
"odds_ratio_loss": 0.72088623046875,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.09333264082670212,
"rewards/margins": 0.010860702954232693,
"rewards/rejected": -0.10419335216283798,
"sft_loss": 0.9333264231681824,
"step": 740
},
{
"epoch": 1.3336296954878861,
"grad_norm": 0.311788409948349,
"learning_rate": 2.9313164751136802e-06,
"logits/chosen": -0.3910767436027527,
"logits/rejected": -0.36302170157432556,
"logps/chosen": -0.9149459004402161,
"logps/rejected": -1.0412867069244385,
"loss": 0.9824,
"odds_ratio_loss": 0.6748364567756653,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0914945974946022,
"rewards/margins": 0.01263406127691269,
"rewards/rejected": -0.1041286438703537,
"sft_loss": 0.9149459004402161,
"step": 750
},
{
"epoch": 1.351411424761058,
"grad_norm": 0.5009350180625916,
"learning_rate": 2.8853565304135956e-06,
"logits/chosen": -0.28646108508110046,
"logits/rejected": -0.3241187632083893,
"logps/chosen": -0.988601803779602,
"logps/rejected": -1.0276473760604858,
"loss": 1.0645,
"odds_ratio_loss": 0.759224534034729,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.09886018931865692,
"rewards/margins": 0.0039045563898980618,
"rewards/rejected": -0.10276474803686142,
"sft_loss": 0.988601803779602,
"step": 760
},
{
"epoch": 1.36919315403423,
"grad_norm": 0.5821639895439148,
"learning_rate": 2.839262776419313e-06,
"logits/chosen": -0.345294713973999,
"logits/rejected": -0.34865519404411316,
"logps/chosen": -0.9152688980102539,
"logps/rejected": -1.12654709815979,
"loss": 0.9828,
"odds_ratio_loss": 0.6755408644676208,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.09152691066265106,
"rewards/margins": 0.021127816289663315,
"rewards/rejected": -0.11265470832586288,
"sft_loss": 0.9152688980102539,
"step": 770
},
{
"epoch": 1.3869748833074016,
"grad_norm": 0.39795824885368347,
"learning_rate": 2.793051218497817e-06,
"logits/chosen": -0.27542608976364136,
"logits/rejected": -0.27257028222084045,
"logps/chosen": -0.931863009929657,
"logps/rejected": -0.9498918652534485,
"loss": 1.0074,
"odds_ratio_loss": 0.7550782561302185,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.09318631142377853,
"rewards/margins": 0.0018028710037469864,
"rewards/rejected": -0.09498917311429977,
"sft_loss": 0.931863009929657,
"step": 780
},
{
"epoch": 1.4047566125805735,
"grad_norm": 0.37384262681007385,
"learning_rate": 2.7467379029217437e-06,
"logits/chosen": -0.34524422883987427,
"logits/rejected": -0.36011195182800293,
"logps/chosen": -0.9515836834907532,
"logps/rejected": -1.0694557428359985,
"loss": 1.0211,
"odds_ratio_loss": 0.6952496767044067,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.09515835344791412,
"rewards/margins": 0.011787201277911663,
"rewards/rejected": -0.10694557428359985,
"sft_loss": 0.9515836834907532,
"step": 790
},
{
"epoch": 1.4225383418537452,
"grad_norm": 0.30680692195892334,
"learning_rate": 2.7003389112975546e-06,
"logits/chosen": -0.26400548219680786,
"logits/rejected": -0.20824924111366272,
"logps/chosen": -0.9995955228805542,
"logps/rejected": -1.0734318494796753,
"loss": 1.0721,
"odds_ratio_loss": 0.7255308628082275,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.09995955973863602,
"rewards/margins": 0.007383632007986307,
"rewards/rejected": -0.10734319686889648,
"sft_loss": 0.9995955228805542,
"step": 800
},
{
"epoch": 1.440320071126917,
"grad_norm": 0.7603825926780701,
"learning_rate": 2.653870354981437e-06,
"logits/chosen": -0.36708512902259827,
"logits/rejected": -0.4067977964878082,
"logps/chosen": -0.869776725769043,
"logps/rejected": -0.9957377314567566,
"loss": 0.9397,
"odds_ratio_loss": 0.6991982460021973,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.08697767555713654,
"rewards/margins": 0.012596105225384235,
"rewards/rejected": -0.0995737761259079,
"sft_loss": 0.869776725769043,
"step": 810
},
{
"epoch": 1.458101800400089,
"grad_norm": 0.8572419881820679,
"learning_rate": 2.6073483694848777e-06,
"logits/chosen": -0.3313853442668915,
"logits/rejected": -0.2504517734050751,
"logps/chosen": -0.9180091619491577,
"logps/rejected": -1.0551806688308716,
"loss": 0.9865,
"odds_ratio_loss": 0.6853691339492798,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.09180092811584473,
"rewards/margins": 0.013717141933739185,
"rewards/rejected": -0.10551806539297104,
"sft_loss": 0.9180091619491577,
"step": 820
},
{
"epoch": 1.4758835296732609,
"grad_norm": 0.2907600700855255,
"learning_rate": 2.560789108871847e-06,
"logits/chosen": -0.35712695121765137,
"logits/rejected": -0.34705477952957153,
"logps/chosen": -0.9147292971611023,
"logps/rejected": -1.1361644268035889,
"loss": 0.9806,
"odds_ratio_loss": 0.6587303280830383,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.09147293865680695,
"rewards/margins": 0.02214350923895836,
"rewards/rejected": -0.113616444170475,
"sft_loss": 0.9147292971611023,
"step": 830
},
{
"epoch": 1.4936652589464325,
"grad_norm": 0.9957931637763977,
"learning_rate": 2.514208740149544e-06,
"logits/chosen": -0.38370782136917114,
"logits/rejected": -0.372738778591156,
"logps/chosen": -1.0301647186279297,
"logps/rejected": -1.131388783454895,
"loss": 1.1016,
"odds_ratio_loss": 0.7141064405441284,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.10301647335290909,
"rewards/margins": 0.010122401639819145,
"rewards/rejected": -0.11313886940479279,
"sft_loss": 1.0301647186279297,
"step": 840
},
{
"epoch": 1.5114469882196042,
"grad_norm": 0.3347834050655365,
"learning_rate": 2.46762343765464e-06,
"logits/chosen": -0.33272939920425415,
"logits/rejected": -0.3354397416114807,
"logps/chosen": -0.9821497797966003,
"logps/rejected": -1.1356861591339111,
"loss": 1.0494,
"odds_ratio_loss": 0.672347903251648,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.09821496903896332,
"rewards/margins": 0.015353633090853691,
"rewards/rejected": -0.11356861889362335,
"sft_loss": 0.9821497797966003,
"step": 850
},
{
"epoch": 1.5292287174927761,
"grad_norm": 0.40781450271606445,
"learning_rate": 2.4210493774369903e-06,
"logits/chosen": -0.3659764528274536,
"logits/rejected": -0.34343641996383667,
"logps/chosen": -0.9932387471199036,
"logps/rejected": -1.0735210180282593,
"loss": 1.0663,
"odds_ratio_loss": 0.7305824160575867,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.09932386875152588,
"rewards/margins": 0.008028226904571056,
"rewards/rejected": -0.10735210031270981,
"sft_loss": 0.9932387471199036,
"step": 860
},
{
"epoch": 1.547010446765948,
"grad_norm": 0.33270904421806335,
"learning_rate": 2.374502731642732e-06,
"logits/chosen": -0.33156028389930725,
"logits/rejected": -0.3256151080131531,
"logps/chosen": -0.9762036204338074,
"logps/rejected": -1.0732605457305908,
"loss": 1.0483,
"odds_ratio_loss": 0.7209652662277222,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.09762036800384521,
"rewards/margins": 0.009705697186291218,
"rewards/rejected": -0.10732606798410416,
"sft_loss": 0.9762036204338074,
"step": 870
},
{
"epoch": 1.56479217603912,
"grad_norm": 0.46649253368377686,
"learning_rate": 2.3279996628987556e-06,
"logits/chosen": -0.3505496084690094,
"logits/rejected": -0.3284318149089813,
"logps/chosen": -0.9539216756820679,
"logps/rejected": -1.0178234577178955,
"loss": 1.0269,
"odds_ratio_loss": 0.7295688390731812,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.09539216756820679,
"rewards/margins": 0.006390177644789219,
"rewards/rejected": -0.10178234428167343,
"sft_loss": 0.9539216756820679,
"step": 880
},
{
"epoch": 1.5825739053122916,
"grad_norm": 0.343382865190506,
"learning_rate": 2.281556318700474e-06,
"logits/chosen": -0.2859468460083008,
"logits/rejected": -0.25978535413742065,
"logps/chosen": -0.904071033000946,
"logps/rejected": -0.9673022031784058,
"loss": 0.9788,
"odds_ratio_loss": 0.7473067045211792,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.0904071107506752,
"rewards/margins": 0.00632312148809433,
"rewards/rejected": -0.09673022478818893,
"sft_loss": 0.904071033000946,
"step": 890
},
{
"epoch": 1.6003556345854635,
"grad_norm": 0.6206201314926147,
"learning_rate": 2.2351888258048408e-06,
"logits/chosen": -0.3074144423007965,
"logits/rejected": -0.2645527720451355,
"logps/chosen": -0.8916131854057312,
"logps/rejected": -0.9986615180969238,
"loss": 0.9603,
"odds_ratio_loss": 0.6866299510002136,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.08916132152080536,
"rewards/margins": 0.010704840533435345,
"rewards/rejected": -0.09986615926027298,
"sft_loss": 0.8916131854057312,
"step": 900
},
{
"epoch": 1.6181373638586352,
"grad_norm": 0.3601900339126587,
"learning_rate": 2.188913284630584e-06,
"logits/chosen": -0.33852243423461914,
"logits/rejected": -0.3135743737220764,
"logps/chosen": -0.9911006689071655,
"logps/rejected": -1.016789197921753,
"loss": 1.0679,
"odds_ratio_loss": 0.7680201530456543,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.09911007434129715,
"rewards/margins": 0.0025688547175377607,
"rewards/rejected": -0.10167893022298813,
"sft_loss": 0.9911006689071655,
"step": 910
},
{
"epoch": 1.635919093131807,
"grad_norm": 0.6057630777359009,
"learning_rate": 2.1427457636675652e-06,
"logits/chosen": -0.3320189118385315,
"logits/rejected": -0.28204983472824097,
"logps/chosen": -1.0480351448059082,
"logps/rejected": -1.1421617269515991,
"loss": 1.1202,
"odds_ratio_loss": 0.7219060659408569,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.10480351746082306,
"rewards/margins": 0.00941266119480133,
"rewards/rejected": -0.11421617120504379,
"sft_loss": 1.0480351448059082,
"step": 920
},
{
"epoch": 1.653700822404979,
"grad_norm": 0.27687886357307434,
"learning_rate": 2.096702293897247e-06,
"logits/chosen": -0.3558569550514221,
"logits/rejected": -0.4100232720375061,
"logps/chosen": -0.9075578451156616,
"logps/rejected": -1.1192221641540527,
"loss": 0.9773,
"odds_ratio_loss": 0.6971360445022583,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.09075579047203064,
"rewards/margins": 0.02116643264889717,
"rewards/rejected": -0.11192221939563751,
"sft_loss": 0.9075578451156616,
"step": 930
},
{
"epoch": 1.6714825516781509,
"grad_norm": 0.5104541182518005,
"learning_rate": 2.0507988632261672e-06,
"logits/chosen": -0.37269848585128784,
"logits/rejected": -0.3488038182258606,
"logps/chosen": -0.8780601620674133,
"logps/rejected": -1.035788893699646,
"loss": 0.9453,
"odds_ratio_loss": 0.6724425554275513,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08780601620674133,
"rewards/margins": 0.015772882848978043,
"rewards/rejected": -0.10357888787984848,
"sft_loss": 0.8780601620674133,
"step": 940
},
{
"epoch": 1.6892642809513225,
"grad_norm": 1.108080506324768,
"learning_rate": 2.005051410934382e-06,
"logits/chosen": -0.3843027949333191,
"logits/rejected": -0.36695486307144165,
"logps/chosen": -1.0294411182403564,
"logps/rejected": -1.073974847793579,
"loss": 1.1057,
"odds_ratio_loss": 0.7625271081924438,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.10294412076473236,
"rewards/margins": 0.004453369881957769,
"rewards/rejected": -0.10739749670028687,
"sft_loss": 1.0294411182403564,
"step": 950
},
{
"epoch": 1.7070460102244942,
"grad_norm": 0.6668155789375305,
"learning_rate": 1.9594758221407843e-06,
"logits/chosen": -0.30207034945487976,
"logits/rejected": -0.31365981698036194,
"logps/chosen": -0.8924224972724915,
"logps/rejected": -1.0662165880203247,
"loss": 0.9564,
"odds_ratio_loss": 0.6395965218544006,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.08924224227666855,
"rewards/margins": 0.017379416152834892,
"rewards/rejected": -0.10662166774272919,
"sft_loss": 0.8924224972724915,
"step": 960
},
{
"epoch": 1.724827739497666,
"grad_norm": 0.5297231674194336,
"learning_rate": 1.9140879222872408e-06,
"logits/chosen": -0.3790926933288574,
"logits/rejected": -0.34034663438796997,
"logps/chosen": -0.9109382629394531,
"logps/rejected": -0.9725145101547241,
"loss": 0.9864,
"odds_ratio_loss": 0.7550500631332397,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.09109383821487427,
"rewards/margins": 0.006157620809972286,
"rewards/rejected": -0.09725145250558853,
"sft_loss": 0.9109382629394531,
"step": 970
},
{
"epoch": 1.742609468770838,
"grad_norm": 0.2978646457195282,
"learning_rate": 1.8689034716434346e-06,
"logits/chosen": -0.3594937026500702,
"logits/rejected": -0.3786514699459076,
"logps/chosen": -0.9791936874389648,
"logps/rejected": -1.0208795070648193,
"loss": 1.054,
"odds_ratio_loss": 0.7475694417953491,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.09791935980319977,
"rewards/margins": 0.0041685826145112514,
"rewards/rejected": -0.10208795219659805,
"sft_loss": 0.9791936874389648,
"step": 980
},
{
"epoch": 1.76039119804401,
"grad_norm": 0.3484848439693451,
"learning_rate": 1.8239381598343576e-06,
"logits/chosen": -0.29449883103370667,
"logits/rejected": -0.3054262697696686,
"logps/chosen": -0.9115015864372253,
"logps/rejected": -1.0031999349594116,
"loss": 0.9826,
"odds_ratio_loss": 0.7106297016143799,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.09115016460418701,
"rewards/margins": 0.00916983187198639,
"rewards/rejected": -0.1003199964761734,
"sft_loss": 0.9115015864372253,
"step": 990
},
{
"epoch": 1.7781729273171816,
"grad_norm": 2.2374985218048096,
"learning_rate": 1.779207600392312e-06,
"logits/chosen": -0.2810733914375305,
"logits/rejected": -0.27120235562324524,
"logps/chosen": -0.9607506990432739,
"logps/rejected": -1.0408788919448853,
"loss": 1.0328,
"odds_ratio_loss": 0.7200591564178467,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.09607508033514023,
"rewards/margins": 0.008012807928025723,
"rewards/rejected": -0.10408788919448853,
"sft_loss": 0.9607506990432739,
"step": 1000
},
{
"epoch": 1.7781729273171816,
"eval_logits/chosen": -0.33078742027282715,
"eval_logits/rejected": -0.29791274666786194,
"eval_logps/chosen": -0.9451074004173279,
"eval_logps/rejected": -1.0856181383132935,
"eval_loss": 1.0125839710235596,
"eval_odds_ratio_loss": 0.6747645735740662,
"eval_rewards/accuracies": 0.515999972820282,
"eval_rewards/chosen": -0.0945107489824295,
"eval_rewards/margins": 0.014051074162125587,
"eval_rewards/rejected": -0.10856182873249054,
"eval_runtime": 185.8537,
"eval_samples_per_second": 5.381,
"eval_sft_loss": 0.9451074004173279,
"eval_steps_per_second": 2.69,
"step": 1000
},
{
"epoch": 1.7959546565903532,
"grad_norm": 0.7795166373252869,
"learning_rate": 1.7347273253353552e-06,
"logits/chosen": -0.33356940746307373,
"logits/rejected": -0.3380289077758789,
"logps/chosen": -0.918900191783905,
"logps/rejected": -0.9768841862678528,
"loss": 0.9932,
"odds_ratio_loss": 0.7426038980484009,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.09189002215862274,
"rewards/margins": 0.005798395723104477,
"rewards/rejected": -0.09768841415643692,
"sft_loss": 0.918900191783905,
"step": 1010
},
{
"epoch": 1.8137363858635251,
"grad_norm": 0.8157365322113037,
"learning_rate": 1.690512779774029e-06,
"logits/chosen": -0.3094736635684967,
"logits/rejected": -0.28969138860702515,
"logps/chosen": -0.9715908765792847,
"logps/rejected": -1.1499989032745361,
"loss": 1.037,
"odds_ratio_loss": 0.6542772054672241,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.09715909510850906,
"rewards/margins": 0.017840798944234848,
"rewards/rejected": -0.11499989032745361,
"sft_loss": 0.9715908765792847,
"step": 1020
},
{
"epoch": 1.831518115136697,
"grad_norm": 0.5331993103027344,
"learning_rate": 1.6465793165482838e-06,
"logits/chosen": -0.274508535861969,
"logits/rejected": -0.26048415899276733,
"logps/chosen": -0.9679173231124878,
"logps/rejected": -1.0533314943313599,
"loss": 1.0376,
"odds_ratio_loss": 0.6963869333267212,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.09679173678159714,
"rewards/margins": 0.008541420102119446,
"rewards/rejected": -0.10533314943313599,
"sft_loss": 0.9679173231124878,
"step": 1030
},
{
"epoch": 1.849299844409869,
"grad_norm": 0.4930827021598816,
"learning_rate": 1.6029421908964305e-06,
"logits/chosen": -0.3850288391113281,
"logits/rejected": -0.3791029155254364,
"logps/chosen": -0.8834483027458191,
"logps/rejected": -1.2469079494476318,
"loss": 0.9502,
"odds_ratio_loss": 0.6672720313072205,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08834483474493027,
"rewards/margins": 0.03634597733616829,
"rewards/rejected": -0.12469079345464706,
"sft_loss": 0.8834483027458191,
"step": 1040
},
{
"epoch": 1.8670815736830408,
"grad_norm": 0.7664922475814819,
"learning_rate": 1.559616555157985e-06,
"logits/chosen": -0.30128011107444763,
"logits/rejected": -0.33186617493629456,
"logps/chosen": -0.9356236457824707,
"logps/rejected": -1.047398328781128,
"loss": 1.0066,
"odds_ratio_loss": 0.7096288800239563,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.09356234967708588,
"rewards/margins": 0.01117746438831091,
"rewards/rejected": -0.10473982989788055,
"sft_loss": 0.9356236457824707,
"step": 1050
},
{
"epoch": 1.8848633029562125,
"grad_norm": 0.465348482131958,
"learning_rate": 1.516617453512252e-06,
"logits/chosen": -0.36206910014152527,
"logits/rejected": -0.34239286184310913,
"logps/chosen": -0.9592390060424805,
"logps/rejected": -1.0232237577438354,
"loss": 1.0338,
"odds_ratio_loss": 0.7456762194633484,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.09592391550540924,
"rewards/margins": 0.006398468278348446,
"rewards/rejected": -0.10232237726449966,
"sft_loss": 0.9592390060424805,
"step": 1060
},
{
"epoch": 1.9026450322293842,
"grad_norm": 0.830959677696228,
"learning_rate": 1.473959816754449e-06,
"logits/chosen": -0.39980772137641907,
"logits/rejected": -0.3537663221359253,
"logps/chosen": -0.920127272605896,
"logps/rejected": -0.9525257349014282,
"loss": 0.9942,
"odds_ratio_loss": 0.7409034967422485,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.09201272577047348,
"rewards/margins": 0.0032398372422903776,
"rewards/rejected": -0.09525256603956223,
"sft_loss": 0.920127272605896,
"step": 1070
},
{
"epoch": 1.920426761502556,
"grad_norm": 0.442227303981781,
"learning_rate": 1.4316584571112213e-06,
"logits/chosen": -0.23950842022895813,
"logits/rejected": -0.25979962944984436,
"logps/chosen": -0.9493446350097656,
"logps/rejected": -1.02411687374115,
"loss": 1.022,
"odds_ratio_loss": 0.7267680764198303,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.09493447840213776,
"rewards/margins": 0.007477219216525555,
"rewards/rejected": -0.10241168737411499,
"sft_loss": 0.9493446350097656,
"step": 1080
},
{
"epoch": 1.938208490775728,
"grad_norm": 0.4206017851829529,
"learning_rate": 1.389728063097306e-06,
"logits/chosen": -0.23708462715148926,
"logits/rejected": -0.24299781024456024,
"logps/chosen": -0.9439695477485657,
"logps/rejected": -1.1116364002227783,
"loss": 1.0118,
"odds_ratio_loss": 0.6782708764076233,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.09439694881439209,
"rewards/margins": 0.016766689717769623,
"rewards/rejected": -0.1111636534333229,
"sft_loss": 0.9439695477485657,
"step": 1090
},
{
"epoch": 1.9559902200488999,
"grad_norm": 0.3826051354408264,
"learning_rate": 1.348183194415179e-06,
"logits/chosen": -0.332774817943573,
"logits/rejected": -0.35824882984161377,
"logps/chosen": -0.9340184926986694,
"logps/rejected": -1.110877275466919,
"loss": 1.0005,
"odds_ratio_loss": 0.6648778915405273,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.09340184926986694,
"rewards/margins": 0.01768588088452816,
"rewards/rejected": -0.11108773946762085,
"sft_loss": 0.9340184926986694,
"step": 1100
},
{
"epoch": 1.9737719493220716,
"grad_norm": 0.3005673587322235,
"learning_rate": 1.3070382768994015e-06,
"logits/chosen": -0.30200204253196716,
"logits/rejected": -0.3130107522010803,
"logps/chosen": -0.9192419052124023,
"logps/rejected": -0.9889400601387024,
"loss": 0.9898,
"odds_ratio_loss": 0.7055012583732605,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.09192419052124023,
"rewards/margins": 0.006969820708036423,
"rewards/rejected": -0.09889401495456696,
"sft_loss": 0.9192419052124023,
"step": 1110
},
{
"epoch": 1.9915536785952432,
"grad_norm": 0.4379596710205078,
"learning_rate": 1.2663075975074746e-06,
"logits/chosen": -0.3314594626426697,
"logits/rejected": -0.33315131068229675,
"logps/chosen": -0.9054539799690247,
"logps/rejected": -1.0939247608184814,
"loss": 0.9734,
"odds_ratio_loss": 0.6797955632209778,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.0905454009771347,
"rewards/margins": 0.018847089260816574,
"rewards/rejected": -0.10939247906208038,
"sft_loss": 0.9054539799690247,
"step": 1120
},
{
"epoch": 2.009335407868415,
"grad_norm": 0.6127385497093201,
"learning_rate": 1.2260052993589034e-06,
"logits/chosen": -0.382732093334198,
"logits/rejected": -0.36521822214126587,
"logps/chosen": -1.0369594097137451,
"logps/rejected": -1.0331060886383057,
"loss": 1.1183,
"odds_ratio_loss": 0.8130975961685181,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.10369595140218735,
"rewards/margins": -0.0003853384405374527,
"rewards/rejected": -0.10331060737371445,
"sft_loss": 1.0369594097137451,
"step": 1130
},
{
"epoch": 2.027117137141587,
"grad_norm": 0.3373187780380249,
"learning_rate": 1.1861453768242099e-06,
"logits/chosen": -0.3635232448577881,
"logits/rejected": -0.3613505959510803,
"logps/chosen": -0.9056431651115417,
"logps/rejected": -1.0306495428085327,
"loss": 0.9749,
"odds_ratio_loss": 0.6926708221435547,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.0905643105506897,
"rewards/margins": 0.012500641867518425,
"rewards/rejected": -0.10306496918201447,
"sft_loss": 0.9056431651115417,
"step": 1140
},
{
"epoch": 2.044898866414759,
"grad_norm": 0.9102166891098022,
"learning_rate": 1.1467416706655982e-06,
"logits/chosen": -0.2888937294483185,
"logits/rejected": -0.26064902544021606,
"logps/chosen": -0.9796838760375977,
"logps/rejected": -1.1222679615020752,
"loss": 1.0522,
"odds_ratio_loss": 0.7250452637672424,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.09796838462352753,
"rewards/margins": 0.014258405193686485,
"rewards/rejected": -0.11222679913043976,
"sft_loss": 0.9796838760375977,
"step": 1150
},
{
"epoch": 2.062680595687931,
"grad_norm": 0.3294011652469635,
"learning_rate": 1.1078078632309559e-06,
"logits/chosen": -0.34561508893966675,
"logits/rejected": -0.3147248923778534,
"logps/chosen": -0.9134725332260132,
"logps/rejected": -1.0285111665725708,
"loss": 0.9808,
"odds_ratio_loss": 0.6730369329452515,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.09134725481271744,
"rewards/margins": 0.011503859423100948,
"rewards/rejected": -0.10285113006830215,
"sft_loss": 0.9134725332260132,
"step": 1160
},
{
"epoch": 2.0804623249611023,
"grad_norm": 0.34308087825775146,
"learning_rate": 1.0693574737028627e-06,
"logits/chosen": -0.3372167944908142,
"logits/rejected": -0.33946290612220764,
"logps/chosen": -0.9201191067695618,
"logps/rejected": -1.0031434297561646,
"loss": 0.9946,
"odds_ratio_loss": 0.744364321231842,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.09201192110776901,
"rewards/margins": 0.008302421309053898,
"rewards/rejected": -0.10031434148550034,
"sft_loss": 0.9201191067695618,
"step": 1170
},
{
"epoch": 2.098244054234274,
"grad_norm": 0.5865955948829651,
"learning_rate": 1.0314038534042586e-06,
"logits/chosen": -0.2901017963886261,
"logits/rejected": -0.32853323221206665,
"logps/chosen": -0.9257968068122864,
"logps/rejected": -1.0451035499572754,
"loss": 0.9964,
"odds_ratio_loss": 0.7055808901786804,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0925796777009964,
"rewards/margins": 0.01193068828433752,
"rewards/rejected": -0.10451038181781769,
"sft_loss": 0.9257968068122864,
"step": 1180
},
{
"epoch": 2.116025783507446,
"grad_norm": 0.41964584589004517,
"learning_rate": 9.939601811623946e-07,
"logits/chosen": -0.31542712450027466,
"logits/rejected": -0.30006498098373413,
"logps/chosen": -0.9362471699714661,
"logps/rejected": -1.0245290994644165,
"loss": 1.0084,
"odds_ratio_loss": 0.7219125032424927,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.09362472593784332,
"rewards/margins": 0.008828198537230492,
"rewards/rejected": -0.10245291888713837,
"sft_loss": 0.9362471699714661,
"step": 1190
},
{
"epoch": 2.133807512780618,
"grad_norm": 0.48077794909477234,
"learning_rate": 9.570394587326825e-07,
"logits/chosen": -0.29744619131088257,
"logits/rejected": -0.34743356704711914,
"logps/chosen": -0.9422229528427124,
"logps/rejected": -1.1074718236923218,
"loss": 1.0093,
"odds_ratio_loss": 0.6704057455062866,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.094222292304039,
"rewards/margins": 0.016524888575077057,
"rewards/rejected": -0.11074719578027725,
"sft_loss": 0.9422229528427124,
"step": 1200
},
{
"epoch": 2.15158924205379,
"grad_norm": 0.3064732253551483,
"learning_rate": 9.206545062840302e-07,
"logits/chosen": -0.2666998505592346,
"logits/rejected": -0.3201262652873993,
"logps/chosen": -0.8927067518234253,
"logps/rejected": -1.0634257793426514,
"loss": 0.9575,
"odds_ratio_loss": 0.6478100419044495,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.08927067369222641,
"rewards/margins": 0.017071900889277458,
"rewards/rejected": -0.10634257644414902,
"sft_loss": 0.8927067518234253,
"step": 1210
},
{
"epoch": 2.1693709713269618,
"grad_norm": 0.3534330725669861,
"learning_rate": 8.848179579472285e-07,
"logits/chosen": -0.3102249801158905,
"logits/rejected": -0.2955402433872223,
"logps/chosen": -0.9082851409912109,
"logps/rejected": -0.9553133845329285,
"loss": 0.9795,
"odds_ratio_loss": 0.7121320962905884,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.09082850813865662,
"rewards/margins": 0.00470283068716526,
"rewards/rejected": -0.09553134441375732,
"sft_loss": 0.9082851409912109,
"step": 1220
},
{
"epoch": 2.1871527006001332,
"grad_norm": 0.6444931626319885,
"learning_rate": 8.495422574279403e-07,
"logits/chosen": -0.3936762809753418,
"logits/rejected": -0.42016810178756714,
"logps/chosen": -0.8496967554092407,
"logps/rejected": -1.0362155437469482,
"loss": 0.9135,
"odds_ratio_loss": 0.6377807855606079,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.08496967703104019,
"rewards/margins": 0.018651869148015976,
"rewards/rejected": -0.10362155735492706,
"sft_loss": 0.8496967554092407,
"step": 1230
},
{
"epoch": 2.204934429873305,
"grad_norm": 0.4805600941181183,
"learning_rate": 8.148396536858063e-07,
"logits/chosen": -0.3237206041812897,
"logits/rejected": -0.3143185079097748,
"logps/chosen": -0.9960983991622925,
"logps/rejected": -1.1420572996139526,
"loss": 1.0672,
"odds_ratio_loss": 0.7113397121429443,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.0996098443865776,
"rewards/margins": 0.014595886692404747,
"rewards/rejected": -0.1142057403922081,
"sft_loss": 0.9960983991622925,
"step": 1240
},
{
"epoch": 2.222716159146477,
"grad_norm": 0.676315188407898,
"learning_rate": 7.807221966811815e-07,
"logits/chosen": -0.29545170068740845,
"logits/rejected": -0.31817343831062317,
"logps/chosen": -0.9420124292373657,
"logps/rejected": -1.0276824235916138,
"loss": 1.0181,
"odds_ratio_loss": 0.7609573006629944,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.0942012369632721,
"rewards/margins": 0.008567007258534431,
"rewards/rejected": -0.10276825726032257,
"sft_loss": 0.9420124292373657,
"step": 1250
},
{
"epoch": 2.240497888419649,
"grad_norm": 0.3943430781364441,
"learning_rate": 7.47201733190962e-07,
"logits/chosen": -0.3520922362804413,
"logits/rejected": -0.3318483829498291,
"logps/chosen": -0.8970060348510742,
"logps/rejected": -0.9855879545211792,
"loss": 0.9669,
"odds_ratio_loss": 0.6993352174758911,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0897006094455719,
"rewards/margins": 0.008858194574713707,
"rewards/rejected": -0.09855880588293076,
"sft_loss": 0.8970060348510742,
"step": 1260
},
{
"epoch": 2.258279617692821,
"grad_norm": 0.5184921026229858,
"learning_rate": 7.142899026949721e-07,
"logits/chosen": -0.33211636543273926,
"logits/rejected": -0.3313821256160736,
"logps/chosen": -0.9101552963256836,
"logps/rejected": -0.9938360452651978,
"loss": 0.9798,
"odds_ratio_loss": 0.6968866586685181,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.09101552516222,
"rewards/margins": 0.00836807768791914,
"rewards/rejected": -0.09938360750675201,
"sft_loss": 0.9101552963256836,
"step": 1270
},
{
"epoch": 2.2760613469659923,
"grad_norm": 1.8007909059524536,
"learning_rate": 6.819981333343273e-07,
"logits/chosen": -0.3704894185066223,
"logits/rejected": -0.3426709771156311,
"logps/chosen": -0.9317655563354492,
"logps/rejected": -1.0302845239639282,
"loss": 1.003,
"odds_ratio_loss": 0.7128146886825562,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.09317655861377716,
"rewards/margins": 0.009851890616118908,
"rewards/rejected": -0.10302845388650894,
"sft_loss": 0.9317655563354492,
"step": 1280
},
{
"epoch": 2.293843076239164,
"grad_norm": 0.4554091989994049,
"learning_rate": 6.503376379431839e-07,
"logits/chosen": -0.2947995066642761,
"logits/rejected": -0.279682457447052,
"logps/chosen": -0.9925037622451782,
"logps/rejected": -0.9870964884757996,
"loss": 1.068,
"odds_ratio_loss": 0.7550127506256104,
"rewards/accuracies": 0.41874998807907104,
"rewards/chosen": -0.09925039112567902,
"rewards/margins": -0.0005407325807027519,
"rewards/rejected": -0.09870964288711548,
"sft_loss": 0.9925037622451782,
"step": 1290
},
{
"epoch": 2.311624805512336,
"grad_norm": 1.7697697877883911,
"learning_rate": 6.193194101552502e-07,
"logits/chosen": -0.31604236364364624,
"logits/rejected": -0.35974448919296265,
"logps/chosen": -0.936480700969696,
"logps/rejected": -1.0702247619628906,
"loss": 1.002,
"odds_ratio_loss": 0.655421793460846,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.09364806860685349,
"rewards/margins": 0.013374416157603264,
"rewards/rejected": -0.1070224866271019,
"sft_loss": 0.936480700969696,
"step": 1300
},
{
"epoch": 2.329406534785508,
"grad_norm": 0.6282922625541687,
"learning_rate": 5.889542205864083e-07,
"logits/chosen": -0.3355167806148529,
"logits/rejected": -0.3377595543861389,
"logps/chosen": -0.9515066146850586,
"logps/rejected": -1.0681602954864502,
"loss": 1.0205,
"odds_ratio_loss": 0.6903635859489441,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.0951506644487381,
"rewards/margins": 0.01166537031531334,
"rewards/rejected": -0.10681603848934174,
"sft_loss": 0.9515066146850586,
"step": 1310
},
{
"epoch": 2.34718826405868,
"grad_norm": 0.3864741027355194,
"learning_rate": 5.592526130947862e-07,
"logits/chosen": -0.31521058082580566,
"logits/rejected": -0.3186022937297821,
"logps/chosen": -0.9329264760017395,
"logps/rejected": -1.0726194381713867,
"loss": 1.0056,
"odds_ratio_loss": 0.7264095544815063,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.09329266101121902,
"rewards/margins": 0.013969297520816326,
"rewards/rejected": -0.10726194083690643,
"sft_loss": 0.9329264760017395,
"step": 1320
},
{
"epoch": 2.3649699933318518,
"grad_norm": 0.8674092292785645,
"learning_rate": 5.302249011195507e-07,
"logits/chosen": -0.3717043995857239,
"logits/rejected": -0.3457496166229248,
"logps/chosen": -0.9407739639282227,
"logps/rejected": -0.9671589136123657,
"loss": 1.015,
"odds_ratio_loss": 0.7421091198921204,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.09407740086317062,
"rewards/margins": 0.0026384838856756687,
"rewards/rejected": -0.09671588987112045,
"sft_loss": 0.9407739639282227,
"step": 1330
},
{
"epoch": 2.382751722605023,
"grad_norm": 0.8201255798339844,
"learning_rate": 5.018811640997307e-07,
"logits/chosen": -0.3262820839881897,
"logits/rejected": -0.28208276629447937,
"logps/chosen": -0.9741110801696777,
"logps/rejected": -1.1972548961639404,
"loss": 1.0409,
"odds_ratio_loss": 0.6679055690765381,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.0974111258983612,
"rewards/margins": 0.022314375266432762,
"rewards/rejected": -0.11972548812627792,
"sft_loss": 0.9741110801696777,
"step": 1340
},
{
"epoch": 2.400533451878195,
"grad_norm": 0.3292596638202667,
"learning_rate": 4.7423124397427105e-07,
"logits/chosen": -0.37047189474105835,
"logits/rejected": -0.31794866919517517,
"logps/chosen": -0.9531441926956177,
"logps/rejected": -1.015749216079712,
"loss": 1.0256,
"odds_ratio_loss": 0.7250458002090454,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.09531442075967789,
"rewards/margins": 0.006260508205741644,
"rewards/rejected": -0.10157492011785507,
"sft_loss": 0.9531441926956177,
"step": 1350
},
{
"epoch": 2.418315181151367,
"grad_norm": 0.4776778817176819,
"learning_rate": 4.472847417645787e-07,
"logits/chosen": -0.2806258201599121,
"logits/rejected": -0.3024401366710663,
"logps/chosen": -0.9200853109359741,
"logps/rejected": -1.114600419998169,
"loss": 0.9877,
"odds_ratio_loss": 0.6760807633399963,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.09200852364301682,
"rewards/margins": 0.01945151947438717,
"rewards/rejected": -0.11146005243062973,
"sft_loss": 0.9200853109359741,
"step": 1360
},
{
"epoch": 2.436096910424539,
"grad_norm": 0.3043542802333832,
"learning_rate": 4.210510142406993e-07,
"logits/chosen": -0.32727354764938354,
"logits/rejected": -0.3754233717918396,
"logps/chosen": -0.9101996421813965,
"logps/rejected": -1.0942609310150146,
"loss": 0.977,
"odds_ratio_loss": 0.6675896644592285,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.09101996570825577,
"rewards/margins": 0.018406113609671593,
"rewards/rejected": -0.10942608118057251,
"sft_loss": 0.9101996421813965,
"step": 1370
},
{
"epoch": 2.4538786396977104,
"grad_norm": 0.4151700437068939,
"learning_rate": 3.9553917067232966e-07,
"logits/chosen": -0.33969706296920776,
"logits/rejected": -0.36881956458091736,
"logps/chosen": -0.9399350881576538,
"logps/rejected": -1.071777105331421,
"loss": 1.0133,
"odds_ratio_loss": 0.7333552241325378,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.09399349987506866,
"rewards/margins": 0.013184216804802418,
"rewards/rejected": -0.10717771202325821,
"sft_loss": 0.9399350881576538,
"step": 1380
},
{
"epoch": 2.4716603689708823,
"grad_norm": 0.4568045437335968,
"learning_rate": 3.707580696657509e-07,
"logits/chosen": -0.2799975275993347,
"logits/rejected": -0.30841827392578125,
"logps/chosen": -0.9116710424423218,
"logps/rejected": -0.9513956308364868,
"loss": 0.9844,
"odds_ratio_loss": 0.7269908785820007,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.09116710722446442,
"rewards/margins": 0.003972449339926243,
"rewards/rejected": -0.09513955563306808,
"sft_loss": 0.9116710424423218,
"step": 1390
},
{
"epoch": 2.489442098244054,
"grad_norm": 0.425468772649765,
"learning_rate": 3.4671631608781815e-07,
"logits/chosen": -0.3139536380767822,
"logits/rejected": -0.32965949177742004,
"logps/chosen": -0.9703924059867859,
"logps/rejected": -1.079158067703247,
"loss": 1.0439,
"odds_ratio_loss": 0.7353022694587708,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.09703925251960754,
"rewards/margins": 0.010876556858420372,
"rewards/rejected": -0.10791579633951187,
"sft_loss": 0.9703924059867859,
"step": 1400
},
{
"epoch": 2.507223827517226,
"grad_norm": 0.6458228826522827,
"learning_rate": 3.234222580780405e-07,
"logits/chosen": -0.3632466197013855,
"logits/rejected": -0.3340745270252228,
"logps/chosen": -0.942143440246582,
"logps/rejected": -0.9809234738349915,
"loss": 1.0153,
"odds_ratio_loss": 0.7311049103736877,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.09421434998512268,
"rewards/margins": 0.0038779997266829014,
"rewards/rejected": -0.09809235483407974,
"sft_loss": 0.942143440246582,
"step": 1410
},
{
"epoch": 2.525005556790398,
"grad_norm": 0.7571399211883545,
"learning_rate": 3.0088398414982375e-07,
"logits/chosen": -0.40216293931007385,
"logits/rejected": -0.3554636836051941,
"logps/chosen": -0.9506216049194336,
"logps/rejected": -1.1040918827056885,
"loss": 1.0238,
"odds_ratio_loss": 0.7313109636306763,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.09506215900182724,
"rewards/margins": 0.015347021631896496,
"rewards/rejected": -0.11040918529033661,
"sft_loss": 0.9506216049194336,
"step": 1420
},
{
"epoch": 2.54278728606357,
"grad_norm": 0.41928017139434814,
"learning_rate": 2.7910932038184487e-07,
"logits/chosen": -0.38035768270492554,
"logits/rejected": -0.43410953879356384,
"logps/chosen": -0.9504894018173218,
"logps/rejected": -1.033362627029419,
"loss": 1.0219,
"odds_ratio_loss": 0.7138369083404541,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.0950489416718483,
"rewards/margins": 0.008287337608635426,
"rewards/rejected": -0.10333627462387085,
"sft_loss": 0.9504894018173218,
"step": 1430
},
{
"epoch": 2.5605690153367417,
"grad_norm": 0.6664097905158997,
"learning_rate": 2.5810582770057325e-07,
"logits/chosen": -0.3502410054206848,
"logits/rejected": -0.31972765922546387,
"logps/chosen": -0.912204384803772,
"logps/rejected": -1.0270380973815918,
"loss": 0.9827,
"odds_ratio_loss": 0.7054314613342285,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.09122045338153839,
"rewards/margins": 0.011483349837362766,
"rewards/rejected": -0.10270379483699799,
"sft_loss": 0.912204384803772,
"step": 1440
},
{
"epoch": 2.578350744609913,
"grad_norm": 0.5214207768440247,
"learning_rate": 2.3788079925484402e-07,
"logits/chosen": -0.2704157829284668,
"logits/rejected": -0.30042511224746704,
"logps/chosen": -0.980503261089325,
"logps/rejected": -1.0476016998291016,
"loss": 1.054,
"odds_ratio_loss": 0.7349393963813782,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.0980503261089325,
"rewards/margins": 0.006709852255880833,
"rewards/rejected": -0.10476018488407135,
"sft_loss": 0.980503261089325,
"step": 1450
},
{
"epoch": 2.596132473883085,
"grad_norm": 0.3559114336967468,
"learning_rate": 2.1844125788342661e-07,
"logits/chosen": -0.3745304048061371,
"logits/rejected": -0.3963877558708191,
"logps/chosen": -0.8978282809257507,
"logps/rejected": -1.1463072299957275,
"loss": 0.966,
"odds_ratio_loss": 0.6815627813339233,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.08978282660245895,
"rewards/margins": 0.02484789676964283,
"rewards/rejected": -0.11463073641061783,
"sft_loss": 0.8978282809257507,
"step": 1460
},
{
"epoch": 2.613914203156257,
"grad_norm": 0.4206191599369049,
"learning_rate": 1.9979395367644428e-07,
"logits/chosen": -0.3081280589103699,
"logits/rejected": -0.2860923111438751,
"logps/chosen": -0.8848710060119629,
"logps/rejected": -1.030397653579712,
"loss": 0.9502,
"odds_ratio_loss": 0.6536397337913513,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.08848710358142853,
"rewards/margins": 0.014552672393620014,
"rewards/rejected": -0.10303977876901627,
"sft_loss": 0.8848710060119629,
"step": 1470
},
{
"epoch": 2.631695932429429,
"grad_norm": 0.6648186445236206,
"learning_rate": 1.81945361631512e-07,
"logits/chosen": -0.3387419283390045,
"logits/rejected": -0.2922862768173218,
"logps/chosen": -0.927925705909729,
"logps/rejected": -0.9954597353935242,
"loss": 1.0003,
"odds_ratio_loss": 0.7234224081039429,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0927925705909729,
"rewards/margins": 0.006753397174179554,
"rewards/rejected": -0.09954597055912018,
"sft_loss": 0.927925705909729,
"step": 1480
},
{
"epoch": 2.6494776617026004,
"grad_norm": 0.5596628189086914,
"learning_rate": 1.6490167940538343e-07,
"logits/chosen": -0.3137277066707611,
"logits/rejected": -0.3255840241909027,
"logps/chosen": -0.9538249969482422,
"logps/rejected": -1.0488290786743164,
"loss": 1.0255,
"odds_ratio_loss": 0.7165058851242065,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.09538250416517258,
"rewards/margins": 0.009500409476459026,
"rewards/rejected": -0.10488291084766388,
"sft_loss": 0.9538249969482422,
"step": 1490
},
{
"epoch": 2.6672593909757722,
"grad_norm": 0.4116540849208832,
"learning_rate": 1.4866882516191339e-07,
"logits/chosen": -0.31974849104881287,
"logits/rejected": -0.27599194645881653,
"logps/chosen": -0.9288945198059082,
"logps/rejected": -1.0830228328704834,
"loss": 0.9998,
"odds_ratio_loss": 0.7095054984092712,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.0928894504904747,
"rewards/margins": 0.015412822365760803,
"rewards/rejected": -0.1083022803068161,
"sft_loss": 0.9288945198059082,
"step": 1500
},
{
"epoch": 2.6672593909757722,
"eval_logits/chosen": -0.3320940136909485,
"eval_logits/rejected": -0.29884636402130127,
"eval_logps/chosen": -0.9399133324623108,
"eval_logps/rejected": -1.080655574798584,
"eval_loss": 1.0073015689849854,
"eval_odds_ratio_loss": 0.6738813519477844,
"eval_rewards/accuracies": 0.515999972820282,
"eval_rewards/chosen": -0.09399133920669556,
"eval_rewards/margins": 0.01407422125339508,
"eval_rewards/rejected": -0.10806556046009064,
"eval_runtime": 185.9317,
"eval_samples_per_second": 5.378,
"eval_sft_loss": 0.9399133324623108,
"eval_steps_per_second": 2.689,
"step": 1500
},
{
"epoch": 2.685041120248944,
"grad_norm": 0.6644484996795654,
"learning_rate": 1.3325243551706057e-07,
"logits/chosen": -0.3859871029853821,
"logits/rejected": -0.36218634247779846,
"logps/chosen": -0.9241644144058228,
"logps/rejected": -1.1543761491775513,
"loss": 0.9915,
"odds_ratio_loss": 0.6730437874794006,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0924164205789566,
"rewards/margins": 0.023021187633275986,
"rewards/rejected": -0.11543761193752289,
"sft_loss": 0.9241644144058228,
"step": 1510
},
{
"epoch": 2.702822849522116,
"grad_norm": 0.6883984208106995,
"learning_rate": 1.1865786358165737e-07,
"logits/chosen": -0.3818913400173187,
"logits/rejected": -0.27337896823883057,
"logps/chosen": -0.9033206701278687,
"logps/rejected": -1.0108495950698853,
"loss": 0.9727,
"odds_ratio_loss": 0.6942235827445984,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.09033207595348358,
"rewards/margins": 0.010752884671092033,
"rewards/rejected": -0.10108494758605957,
"sft_loss": 0.9033206701278687,
"step": 1520
},
{
"epoch": 2.720604578795288,
"grad_norm": 1.4156850576400757,
"learning_rate": 1.0489017710262311e-07,
"logits/chosen": -0.39080482721328735,
"logits/rejected": -0.3747466206550598,
"logps/chosen": -1.0374637842178345,
"logps/rejected": -1.1824612617492676,
"loss": 1.1147,
"odds_ratio_loss": 0.7718855142593384,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.10374637693166733,
"rewards/margins": 0.014499744400382042,
"rewards/rejected": -0.11824611574411392,
"sft_loss": 1.0374637842178345,
"step": 1530
},
{
"epoch": 2.73838630806846,
"grad_norm": 0.4921424984931946,
"learning_rate": 9.195415670326446e-08,
"logits/chosen": -0.326080858707428,
"logits/rejected": -0.321908175945282,
"logps/chosen": -0.9485294222831726,
"logps/rejected": -1.082155704498291,
"loss": 1.0195,
"odds_ratio_loss": 0.7096532583236694,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.09485294669866562,
"rewards/margins": 0.013362633064389229,
"rewards/rejected": -0.1082155704498291,
"sft_loss": 0.9485294222831726,
"step": 1540
},
{
"epoch": 2.7561680373416317,
"grad_norm": 0.686665415763855,
"learning_rate": 7.985429422327384e-08,
"logits/chosen": -0.35336002707481384,
"logits/rejected": -0.3244116008281708,
"logps/chosen": -0.9436219930648804,
"logps/rejected": -0.975549578666687,
"loss": 1.0188,
"odds_ratio_loss": 0.7518836855888367,
"rewards/accuracies": 0.4124999940395355,
"rewards/chosen": -0.09436219930648804,
"rewards/margins": 0.0031927600502967834,
"rewards/rejected": -0.09755495190620422,
"sft_loss": 0.9436219930648804,
"step": 1550
},
{
"epoch": 2.773949766614803,
"grad_norm": 0.30419808626174927,
"learning_rate": 6.859479115900818e-08,
"logits/chosen": -0.31769606471061707,
"logits/rejected": -0.31846362352371216,
"logps/chosen": -0.9142364263534546,
"logps/rejected": -1.0324945449829102,
"loss": 0.9834,
"odds_ratio_loss": 0.6916245222091675,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.0914236530661583,
"rewards/margins": 0.011825799010694027,
"rewards/rejected": -0.1032494530081749,
"sft_loss": 0.9142364263534546,
"step": 1560
},
{
"epoch": 2.791731495887975,
"grad_norm": 1.5349509716033936,
"learning_rate": 5.817955720457902e-08,
"logits/chosen": -0.33953648805618286,
"logits/rejected": -0.297925740480423,
"logps/chosen": -0.9395607709884644,
"logps/rejected": -1.0038203001022339,
"loss": 1.0133,
"odds_ratio_loss": 0.7371524572372437,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.09395607560873032,
"rewards/margins": 0.006425946019589901,
"rewards/rejected": -0.10038203001022339,
"sft_loss": 0.9395607709884644,
"step": 1570
},
{
"epoch": 2.809513225161147,
"grad_norm": 0.36313971877098083,
"learning_rate": 4.861220889427199e-08,
"logits/chosen": -0.35685330629348755,
"logits/rejected": -0.35064131021499634,
"logps/chosen": -0.9390374422073364,
"logps/rejected": -1.019951581954956,
"loss": 1.012,
"odds_ratio_loss": 0.7297292351722717,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.09390375763177872,
"rewards/margins": 0.008091414347290993,
"rewards/rejected": -0.10199517011642456,
"sft_loss": 0.9390374422073364,
"step": 1580
},
{
"epoch": 2.827294954434319,
"grad_norm": 0.26599186658859253,
"learning_rate": 3.9896068346758074e-08,
"logits/chosen": -0.39413073658943176,
"logits/rejected": -0.38061630725860596,
"logps/chosen": -0.948017954826355,
"logps/rejected": -1.034618616104126,
"loss": 1.0172,
"odds_ratio_loss": 0.6922141313552856,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.09480179846286774,
"rewards/margins": 0.008660053834319115,
"rewards/rejected": -0.1034618467092514,
"sft_loss": 0.948017954826355,
"step": 1590
},
{
"epoch": 2.8450766837074903,
"grad_norm": 0.9985164403915405,
"learning_rate": 3.203416211153832e-08,
"logits/chosen": -0.3526967763900757,
"logits/rejected": -0.25582748651504517,
"logps/chosen": -0.9348894357681274,
"logps/rejected": -1.0583240985870361,
"loss": 1.0071,
"odds_ratio_loss": 0.7220235466957092,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -0.09348894655704498,
"rewards/margins": 0.01234346441924572,
"rewards/rejected": -0.10583242028951645,
"sft_loss": 0.9348894357681274,
"step": 1600
},
{
"epoch": 2.8628584129806622,
"grad_norm": 0.4895220994949341,
"learning_rate": 2.5029220118019393e-08,
"logits/chosen": -0.3774477243423462,
"logits/rejected": -0.34018778800964355,
"logps/chosen": -0.9445845484733582,
"logps/rejected": -0.9962360262870789,
"loss": 1.0176,
"odds_ratio_loss": 0.7305063009262085,
"rewards/accuracies": 0.4124999940395355,
"rewards/chosen": -0.0944584533572197,
"rewards/margins": 0.0051651508547365665,
"rewards/rejected": -0.09962360560894012,
"sft_loss": 0.9445845484733582,
"step": 1610
},
{
"epoch": 2.880640142253834,
"grad_norm": 0.39454635977745056,
"learning_rate": 1.8883674727586122e-08,
"logits/chosen": -0.3457157611846924,
"logits/rejected": -0.33168259263038635,
"logps/chosen": -0.8693550825119019,
"logps/rejected": -1.09225332736969,
"loss": 0.9328,
"odds_ratio_loss": 0.6342187523841858,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.08693551272153854,
"rewards/margins": 0.022289803251624107,
"rewards/rejected": -0.1092253178358078,
"sft_loss": 0.8693550825119019,
"step": 1620
},
{
"epoch": 2.898421871527006,
"grad_norm": 0.29763612151145935,
"learning_rate": 1.3599659889000639e-08,
"logits/chosen": -0.26188623905181885,
"logits/rejected": -0.27545788884162903,
"logps/chosen": -0.9086050987243652,
"logps/rejected": -0.9591732025146484,
"loss": 0.9816,
"odds_ratio_loss": 0.7299038171768188,
"rewards/accuracies": 0.4124999940395355,
"rewards/chosen": -0.0908605083823204,
"rewards/margins": 0.005056814290583134,
"rewards/rejected": -0.09591732919216156,
"sft_loss": 0.9086050987243652,
"step": 1630
},
{
"epoch": 2.916203600800178,
"grad_norm": 3.087757110595703,
"learning_rate": 9.179010397421528e-09,
"logits/chosen": -0.29684725403785706,
"logits/rejected": -0.26544058322906494,
"logps/chosen": -1.0444749593734741,
"logps/rejected": -1.1464588642120361,
"loss": 1.1156,
"odds_ratio_loss": 0.7117230892181396,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.10444750636816025,
"rewards/margins": 0.010198366828262806,
"rewards/rejected": -0.11464587599039078,
"sft_loss": 1.0444749593734741,
"step": 1640
},
{
"epoch": 2.93398533007335,
"grad_norm": 0.7389609813690186,
"learning_rate": 5.623261257296509e-09,
"logits/chosen": -0.33190470933914185,
"logits/rejected": -0.2921023964881897,
"logps/chosen": -0.8605577349662781,
"logps/rejected": -0.9687950015068054,
"loss": 0.9291,
"odds_ratio_loss": 0.6854843497276306,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.08605578541755676,
"rewards/margins": 0.010823719203472137,
"rewards/rejected": -0.0968794971704483,
"sft_loss": 0.8605577349662781,
"step": 1650
},
{
"epoch": 2.9517670593465217,
"grad_norm": 0.49204200506210327,
"learning_rate": 2.933647149357122e-09,
"logits/chosen": -0.3684224784374237,
"logits/rejected": -0.3360394537448883,
"logps/chosen": -0.9260095357894897,
"logps/rejected": -1.059597373008728,
"loss": 0.9945,
"odds_ratio_loss": 0.6844674348831177,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.09260095655918121,
"rewards/margins": 0.013358776457607746,
"rewards/rejected": -0.10595973581075668,
"sft_loss": 0.9260095357894897,
"step": 1660
},
{
"epoch": 2.969548788619693,
"grad_norm": 0.4070994257926941,
"learning_rate": 1.1111020018930717e-09,
"logits/chosen": -0.2591468393802643,
"logits/rejected": -0.31176748871803284,
"logps/chosen": -0.9283815622329712,
"logps/rejected": -0.9903603792190552,
"loss": 1.0009,
"odds_ratio_loss": 0.7251425981521606,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.09283814579248428,
"rewards/margins": 0.006197893992066383,
"rewards/rejected": -0.09903603792190552,
"sft_loss": 0.9283815622329712,
"step": 1670
},
{
"epoch": 2.987330517892865,
"grad_norm": 0.31971636414527893,
"learning_rate": 1.5625866646051813e-10,
"logits/chosen": -0.3598848283290863,
"logits/rejected": -0.3403863310813904,
"logps/chosen": -0.9049466252326965,
"logps/rejected": -1.057483434677124,
"loss": 0.9695,
"odds_ratio_loss": 0.6452642679214478,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.09049466997385025,
"rewards/margins": 0.015253685414791107,
"rewards/rejected": -0.10574835538864136,
"sft_loss": 0.9049466252326965,
"step": 1680
},
{
"epoch": 2.997999555456768,
"step": 1686,
"total_flos": 1.8817568285770383e+18,
"train_loss": 1.0353579054523618,
"train_runtime": 16950.0138,
"train_samples_per_second": 1.593,
"train_steps_per_second": 0.099
}
],
"logging_steps": 10,
"max_steps": 1686,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 1.8817568285770383e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}