{ "best_metric": 1.0073015689849854, "best_model_checkpoint": "saves/Vicuna-7B-v1.5/lora/orpo/checkpoint-1500", "epoch": 2.997999555456768, "eval_steps": 500, "global_step": 1686, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017781729273171815, "grad_norm": 0.3158996105194092, "learning_rate": 4.9995745934141085e-06, "logits/chosen": -0.7898403406143188, "logits/rejected": -0.7731221914291382, "logps/chosen": -1.1474043130874634, "logps/rejected": -1.2031431198120117, "loss": 1.227, "odds_ratio_loss": 0.7959282994270325, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.11474044620990753, "rewards/margins": 0.005573858506977558, "rewards/rejected": -0.12031430006027222, "sft_loss": 1.1474043130874634, "step": 10 }, { "epoch": 0.03556345854634363, "grad_norm": 0.8646821975708008, "learning_rate": 4.9982812903243405e-06, "logits/chosen": -0.7618139982223511, "logits/rejected": -0.7260042428970337, "logps/chosen": -0.9931285977363586, "logps/rejected": -1.050875186920166, "loss": 1.0707, "odds_ratio_loss": 0.7757659554481506, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.09931285679340363, "rewards/margins": 0.005774644669145346, "rewards/rejected": -0.10508750379085541, "sft_loss": 0.9931285977363586, "step": 20 }, { "epoch": 0.05334518781951545, "grad_norm": 0.2927573025226593, "learning_rate": 4.996120496405222e-06, "logits/chosen": -0.7767494916915894, "logits/rejected": -0.7559677362442017, "logps/chosen": -1.040177345275879, "logps/rejected": -1.2401186227798462, "loss": 1.1087, "odds_ratio_loss": 0.6853717565536499, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.10401773452758789, "rewards/margins": 0.019994117319583893, "rewards/rejected": -0.12401185184717178, "sft_loss": 1.040177345275879, "step": 30 }, { "epoch": 0.07112691709268726, "grad_norm": 0.3339848518371582, "learning_rate": 4.99309296196014e-06, "logits/chosen": -0.7875353693962097, "logits/rejected": -0.7857375741004944, "logps/chosen": -1.0764983892440796, "logps/rejected": -1.1753004789352417, "loss": 1.1498, "odds_ratio_loss": 0.7328984141349792, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.10764984041452408, "rewards/margins": 0.009880214929580688, "rewards/rejected": -0.11753007024526596, "sft_loss": 1.0764983892440796, "step": 40 }, { "epoch": 0.08890864636585907, "grad_norm": 0.3153611719608307, "learning_rate": 4.989199738255166e-06, "logits/chosen": -0.7786640524864197, "logits/rejected": -0.7964621782302856, "logps/chosen": -1.0476799011230469, "logps/rejected": -1.1452114582061768, "loss": 1.1221, "odds_ratio_loss": 0.7446193099021912, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.10476799309253693, "rewards/margins": 0.009753172285854816, "rewards/rejected": -0.11452116817235947, "sft_loss": 1.0476799011230469, "step": 50 }, { "epoch": 0.1066903756390309, "grad_norm": 2.7500874996185303, "learning_rate": 4.984442177154031e-06, "logits/chosen": -0.7653383612632751, "logits/rejected": -0.7529075741767883, "logps/chosen": -1.1525957584381104, "logps/rejected": -1.2310835123062134, "loss": 1.2305, "odds_ratio_loss": 0.7788733243942261, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.1152595728635788, "rewards/margins": 0.007848784327507019, "rewards/rejected": -0.12310836464166641, "sft_loss": 1.1525957584381104, "step": 60 }, { "epoch": 0.12447210491220272, "grad_norm": 0.3525276184082031, "learning_rate": 4.978821930648704e-06, "logits/chosen": -0.8071187734603882, "logits/rejected": -0.7696810364723206, "logps/chosen": -1.0399789810180664, "logps/rejected": -1.0721027851104736, "loss": 1.1208, "odds_ratio_loss": 0.8085241317749023, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.10399790853261948, "rewards/margins": 0.003212365787476301, "rewards/rejected": -0.10721027851104736, "sft_loss": 1.0399789810180664, "step": 70 }, { "epoch": 0.14225383418537452, "grad_norm": 0.6355476379394531, "learning_rate": 4.97234095028576e-06, "logits/chosen": -0.738179624080658, "logits/rejected": -0.7453175783157349, "logps/chosen": -1.1585901975631714, "logps/rejected": -1.2273097038269043, "loss": 1.2343, "odds_ratio_loss": 0.7569113969802856, "rewards/accuracies": 0.5, "rewards/chosen": -0.1158590167760849, "rewards/margins": 0.0068719410337507725, "rewards/rejected": -0.12273095548152924, "sft_loss": 1.1585901975631714, "step": 80 }, { "epoch": 0.16003556345854633, "grad_norm": 0.2942532002925873, "learning_rate": 4.965001486488743e-06, "logits/chosen": -0.7591525316238403, "logits/rejected": -0.7494860887527466, "logps/chosen": -1.0791616439819336, "logps/rejected": -1.2336231470108032, "loss": 1.1471, "odds_ratio_loss": 0.6791869401931763, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.10791617631912231, "rewards/margins": 0.015446141362190247, "rewards/rejected": -0.12336231768131256, "sft_loss": 1.0791616439819336, "step": 90 }, { "epoch": 0.17781729273171815, "grad_norm": 0.35266247391700745, "learning_rate": 4.956806087776732e-06, "logits/chosen": -0.6999791860580444, "logits/rejected": -0.6948890686035156, "logps/chosen": -1.0402957201004028, "logps/rejected": -1.2390520572662354, "loss": 1.1124, "odds_ratio_loss": 0.7215061187744141, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.10402955859899521, "rewards/margins": 0.01987563632428646, "rewards/rejected": -0.12390519678592682, "sft_loss": 1.0402957201004028, "step": 100 }, { "epoch": 0.19559902200489, "grad_norm": 0.4545610845088959, "learning_rate": 4.947757599879411e-06, "logits/chosen": -0.7189663052558899, "logits/rejected": -0.6851673126220703, "logps/chosen": -1.147323489189148, "logps/rejected": -1.289452314376831, "loss": 1.2227, "odds_ratio_loss": 0.7533982396125793, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.11473236232995987, "rewards/margins": 0.014212870970368385, "rewards/rejected": -0.1289452314376831, "sft_loss": 1.147323489189148, "step": 110 }, { "epoch": 0.2133807512780618, "grad_norm": 0.6324980854988098, "learning_rate": 4.937859164748931e-06, "logits/chosen": -0.7043695449829102, "logits/rejected": -0.6795639991760254, "logps/chosen": -1.0146863460540771, "logps/rejected": -1.0826324224472046, "loss": 1.0907, "odds_ratio_loss": 0.760542094707489, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.10146863758563995, "rewards/margins": 0.006794607732445002, "rewards/rejected": -0.10826325416564941, "sft_loss": 1.0146863460540771, "step": 120 }, { "epoch": 0.23116248055123362, "grad_norm": 0.4255826771259308, "learning_rate": 4.92711421946891e-06, "logits/chosen": -0.6701909899711609, "logits/rejected": -0.7547520995140076, "logps/chosen": -1.0397005081176758, "logps/rejected": -1.1938796043395996, "loss": 1.1117, "odds_ratio_loss": 0.7198113799095154, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.10397003591060638, "rewards/margins": 0.015417915768921375, "rewards/rejected": -0.11938796192407608, "sft_loss": 1.0397005081176758, "step": 130 }, { "epoch": 0.24894420982440543, "grad_norm": 0.7161264419555664, "learning_rate": 4.915526495060961e-06, "logits/chosen": -0.6202753782272339, "logits/rejected": -0.64984530210495, "logps/chosen": -1.0066936016082764, "logps/rejected": -1.1723135709762573, "loss": 1.0745, "odds_ratio_loss": 0.6777721643447876, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.10066936165094376, "rewards/margins": 0.016561999917030334, "rewards/rejected": -0.1172313541173935, "sft_loss": 1.0066936016082764, "step": 140 }, { "epoch": 0.26672593909757725, "grad_norm": 0.540038526058197, "learning_rate": 4.903100015189153e-06, "logits/chosen": -0.5942473411560059, "logits/rejected": -0.5408576726913452, "logps/chosen": -0.9665758013725281, "logps/rejected": -1.1337311267852783, "loss": 1.0386, "odds_ratio_loss": 0.719926118850708, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.09665757417678833, "rewards/margins": 0.01671554148197174, "rewards/rejected": -0.11337311565876007, "sft_loss": 0.9665758013725281, "step": 150 }, { "epoch": 0.28450766837074903, "grad_norm": 2.370271682739258, "learning_rate": 4.889839094762848e-06, "logits/chosen": -0.5599099397659302, "logits/rejected": -0.5666571855545044, "logps/chosen": -1.0475890636444092, "logps/rejected": -1.1946136951446533, "loss": 1.1206, "odds_ratio_loss": 0.7300440073013306, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.10475890338420868, "rewards/margins": 0.014702451415359974, "rewards/rejected": -0.11946137249469757, "sft_loss": 1.0475890636444092, "step": 160 }, { "epoch": 0.3022893976439209, "grad_norm": 0.37259843945503235, "learning_rate": 4.875748338438416e-06, "logits/chosen": -0.5827142000198364, "logits/rejected": -0.5626250505447388, "logps/chosen": -0.9911508560180664, "logps/rejected": -1.0813571214675903, "loss": 1.0632, "odds_ratio_loss": 0.720399022102356, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.09911508113145828, "rewards/margins": 0.009020629338920116, "rewards/rejected": -0.10813571512699127, "sft_loss": 0.9911508560180664, "step": 170 }, { "epoch": 0.32007112691709266, "grad_norm": 0.3821701109409332, "learning_rate": 4.8608326390203386e-06, "logits/chosen": -0.6059321165084839, "logits/rejected": -0.5918234586715698, "logps/chosen": -0.9553475379943848, "logps/rejected": -1.1111819744110107, "loss": 1.0245, "odds_ratio_loss": 0.6911659240722656, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.09553476423025131, "rewards/margins": 0.01558343879878521, "rewards/rejected": -0.11111819744110107, "sft_loss": 0.9553475379943848, "step": 180 }, { "epoch": 0.3378528561902645, "grad_norm": 0.3977317810058594, "learning_rate": 4.845097175762251e-06, "logits/chosen": -0.49882182478904724, "logits/rejected": -0.48370814323425293, "logps/chosen": -0.989281952381134, "logps/rejected": -1.0615712404251099, "loss": 1.0617, "odds_ratio_loss": 0.7244290113449097, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.09892819821834564, "rewards/margins": 0.007228921167552471, "rewards/rejected": -0.10615710914134979, "sft_loss": 0.989281952381134, "step": 190 }, { "epoch": 0.3556345854634363, "grad_norm": 0.46290695667266846, "learning_rate": 4.8285474125685286e-06, "logits/chosen": -0.518696129322052, "logits/rejected": -0.5193291306495667, "logps/chosen": -1.1205590963363647, "logps/rejected": -1.1714627742767334, "loss": 1.198, "odds_ratio_loss": 0.7740126252174377, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.11205589771270752, "rewards/margins": 0.00509037496522069, "rewards/rejected": -0.11714627593755722, "sft_loss": 1.1205590963363647, "step": 200 }, { "epoch": 0.37341631473660813, "grad_norm": 0.32425227761268616, "learning_rate": 4.811189096097025e-06, "logits/chosen": -0.5530649423599243, "logits/rejected": -0.5483794808387756, "logps/chosen": -0.9994535446166992, "logps/rejected": -1.1620233058929443, "loss": 1.0712, "odds_ratio_loss": 0.7175347208976746, "rewards/accuracies": 0.5, "rewards/chosen": -0.09994535893201828, "rewards/margins": 0.01625697687268257, "rewards/rejected": -0.11620233952999115, "sft_loss": 0.9994535446166992, "step": 210 }, { "epoch": 0.39119804400978, "grad_norm": 0.5374495387077332, "learning_rate": 4.793028253763633e-06, "logits/chosen": -0.46489372849464417, "logits/rejected": -0.49711689352989197, "logps/chosen": -0.9644722938537598, "logps/rejected": -1.098311185836792, "loss": 1.0422, "odds_ratio_loss": 0.7768682837486267, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09644722938537598, "rewards/margins": 0.013383878394961357, "rewards/rejected": -0.10983110964298248, "sft_loss": 0.9644722938537598, "step": 220 }, { "epoch": 0.40897977328295176, "grad_norm": 0.7932880520820618, "learning_rate": 4.774071191649352e-06, "logits/chosen": -0.5470231771469116, "logits/rejected": -0.5435986518859863, "logps/chosen": -0.9579310417175293, "logps/rejected": -1.1810802221298218, "loss": 1.0212, "odds_ratio_loss": 0.6330138444900513, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.09579310566186905, "rewards/margins": 0.02231491729617119, "rewards/rejected": -0.11810803413391113, "sft_loss": 0.9579310417175293, "step": 230 }, { "epoch": 0.4267615025561236, "grad_norm": 0.618280291557312, "learning_rate": 4.7543244923105975e-06, "logits/chosen": -0.5025745630264282, "logits/rejected": -0.4722610414028168, "logps/chosen": -1.0212466716766357, "logps/rejected": -1.0026448965072632, "loss": 1.1058, "odds_ratio_loss": 0.8450964093208313, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.1021246686577797, "rewards/margins": -0.0018601752817630768, "rewards/rejected": -0.10026448965072632, "sft_loss": 1.0212466716766357, "step": 240 }, { "epoch": 0.4445432318292954, "grad_norm": 0.39385247230529785, "learning_rate": 4.733795012493506e-06, "logits/chosen": -0.5138652324676514, "logits/rejected": -0.4715350270271301, "logps/chosen": -1.0123497247695923, "logps/rejected": -1.13383150100708, "loss": 1.0857, "odds_ratio_loss": 0.7335414886474609, "rewards/accuracies": 0.46875, "rewards/chosen": -0.10123495757579803, "rewards/margins": 0.012148191221058369, "rewards/rejected": -0.11338315904140472, "sft_loss": 1.0123497247695923, "step": 250 }, { "epoch": 0.46232496110246724, "grad_norm": 0.3666248619556427, "learning_rate": 4.712489880753035e-06, "logits/chosen": -0.3967147171497345, "logits/rejected": -0.3805852234363556, "logps/chosen": -0.946629524230957, "logps/rejected": -1.0246347188949585, "loss": 1.0164, "odds_ratio_loss": 0.6973500847816467, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.09466294944286346, "rewards/margins": 0.007800529710948467, "rewards/rejected": -0.1024634838104248, "sft_loss": 0.946629524230957, "step": 260 }, { "epoch": 0.480106690375639, "grad_norm": 0.6196191906929016, "learning_rate": 4.690416494977673e-06, "logits/chosen": -0.3590370714664459, "logits/rejected": -0.3209628164768219, "logps/chosen": -0.9477987289428711, "logps/rejected": -1.1744658946990967, "loss": 1.0133, "odds_ratio_loss": 0.654593825340271, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.09477987140417099, "rewards/margins": 0.02266671508550644, "rewards/rejected": -0.11744660139083862, "sft_loss": 0.9477987289428711, "step": 270 }, { "epoch": 0.49788841964881086, "grad_norm": 0.38255006074905396, "learning_rate": 4.667582519820639e-06, "logits/chosen": -0.4478569030761719, "logits/rejected": -0.40335726737976074, "logps/chosen": -1.0600357055664062, "logps/rejected": -1.0844862461090088, "loss": 1.1374, "odds_ratio_loss": 0.7734627723693848, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.10600356757640839, "rewards/margins": 0.002445052145048976, "rewards/rejected": -0.10844862461090088, "sft_loss": 1.0600357055664062, "step": 280 }, { "epoch": 0.5156701489219827, "grad_norm": 0.6143254637718201, "learning_rate": 4.643995884038443e-06, "logits/chosen": -0.42634057998657227, "logits/rejected": -0.4024909436702728, "logps/chosen": -1.0625637769699097, "logps/rejected": -1.2203805446624756, "loss": 1.1314, "odds_ratio_loss": 0.6885315179824829, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1062563806772232, "rewards/margins": 0.01578168198466301, "rewards/rejected": -0.12203805148601532, "sft_loss": 1.0625637769699097, "step": 290 }, { "epoch": 0.5334518781951545, "grad_norm": 0.3366183042526245, "learning_rate": 4.6196647777377475e-06, "logits/chosen": -0.37543022632598877, "logits/rejected": -0.3797139525413513, "logps/chosen": -0.9299192428588867, "logps/rejected": -0.9767643213272095, "loss": 1.0053, "odds_ratio_loss": 0.7540563344955444, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.09299192577600479, "rewards/margins": 0.004684499930590391, "rewards/rejected": -0.09767641872167587, "sft_loss": 0.9299192428588867, "step": 300 }, { "epoch": 0.5512336074683263, "grad_norm": 0.5256261825561523, "learning_rate": 4.59459764953147e-06, "logits/chosen": -0.3965223431587219, "logits/rejected": -0.4247291684150696, "logps/chosen": -1.0226197242736816, "logps/rejected": -1.121930718421936, "loss": 1.0919, "odds_ratio_loss": 0.6925050616264343, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.1022619754076004, "rewards/margins": 0.00993109680712223, "rewards/rejected": -0.11219307035207748, "sft_loss": 1.0226197242736816, "step": 310 }, { "epoch": 0.5690153367414981, "grad_norm": 0.5753230452537537, "learning_rate": 4.568803203605133e-06, "logits/chosen": -0.38987019658088684, "logits/rejected": -0.40249496698379517, "logps/chosen": -1.0238714218139648, "logps/rejected": -1.191584825515747, "loss": 1.0951, "odds_ratio_loss": 0.7120264768600464, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.10238714516162872, "rewards/margins": 0.016771327704191208, "rewards/rejected": -0.11915846914052963, "sft_loss": 1.0238714218139648, "step": 320 }, { "epoch": 0.58679706601467, "grad_norm": 0.40169399976730347, "learning_rate": 4.542290396694462e-06, "logits/chosen": -0.4059433043003082, "logits/rejected": -0.4052697718143463, "logps/chosen": -0.9671312570571899, "logps/rejected": -1.0644605159759521, "loss": 1.0391, "odds_ratio_loss": 0.7196342349052429, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.09671313315629959, "rewards/margins": 0.009732924401760101, "rewards/rejected": -0.1064460501074791, "sft_loss": 0.9671312570571899, "step": 330 }, { "epoch": 0.6045787952878418, "grad_norm": 0.5619000792503357, "learning_rate": 4.515068434975298e-06, "logits/chosen": -0.4578043818473816, "logits/rejected": -0.4284750819206238, "logps/chosen": -0.9811161756515503, "logps/rejected": -1.1456761360168457, "loss": 1.0484, "odds_ratio_loss": 0.6727977991104126, "rewards/accuracies": 0.5, "rewards/chosen": -0.09811162203550339, "rewards/margins": 0.016455989331007004, "rewards/rejected": -0.11456761509180069, "sft_loss": 0.9811161756515503, "step": 340 }, { "epoch": 0.6223605245610135, "grad_norm": 0.5821824073791504, "learning_rate": 4.487146770866887e-06, "logits/chosen": -0.34484004974365234, "logits/rejected": -0.3222612738609314, "logps/chosen": -1.0583232641220093, "logps/rejected": -1.117333173751831, "loss": 1.1304, "odds_ratio_loss": 0.7205663919448853, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.10583231598138809, "rewards/margins": 0.005901001859456301, "rewards/rejected": -0.1117333322763443, "sft_loss": 1.0583232641220093, "step": 350 }, { "epoch": 0.6401422538341853, "grad_norm": 0.28447961807250977, "learning_rate": 4.458535099749666e-06, "logits/chosen": -0.43229636549949646, "logits/rejected": -0.40540462732315063, "logps/chosen": -1.1308929920196533, "logps/rejected": -1.0958976745605469, "loss": 1.2174, "odds_ratio_loss": 0.8652679324150085, "rewards/accuracies": 0.46875, "rewards/chosen": -0.11308930814266205, "rewards/margins": -0.0034995335154235363, "rewards/rejected": -0.10958977788686752, "sft_loss": 1.1308929920196533, "step": 360 }, { "epoch": 0.6579239831073572, "grad_norm": 0.27178603410720825, "learning_rate": 4.429243356598694e-06, "logits/chosen": -0.40932542085647583, "logits/rejected": -0.3859841227531433, "logps/chosen": -0.9554696083068848, "logps/rejected": -1.1517064571380615, "loss": 1.0243, "odds_ratio_loss": 0.6880883574485779, "rewards/accuracies": 0.53125, "rewards/chosen": -0.09554696083068848, "rewards/margins": 0.019623693078756332, "rewards/rejected": -0.11517064273357391, "sft_loss": 0.9554696083068848, "step": 370 }, { "epoch": 0.675705712380529, "grad_norm": 0.34544578194618225, "learning_rate": 4.399281712533875e-06, "logits/chosen": -0.32934245467185974, "logits/rejected": -0.3599315285682678, "logps/chosen": -0.9367265701293945, "logps/rejected": -1.0202996730804443, "loss": 1.0101, "odds_ratio_loss": 0.7333763837814331, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.09367264807224274, "rewards/margins": 0.008357317186892033, "rewards/rejected": -0.1020299643278122, "sft_loss": 0.9367265701293945, "step": 380 }, { "epoch": 0.6934874416537008, "grad_norm": 0.48474597930908203, "learning_rate": 4.368660571288192e-06, "logits/chosen": -0.3377426266670227, "logits/rejected": -0.32565537095069885, "logps/chosen": -0.9353078007698059, "logps/rejected": -1.0242602825164795, "loss": 1.0071, "odds_ratio_loss": 0.7176766395568848, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.09353077411651611, "rewards/margins": 0.008895261213183403, "rewards/rejected": -0.10242603719234467, "sft_loss": 0.9353078007698059, "step": 390 }, { "epoch": 0.7112691709268726, "grad_norm": 0.3825822174549103, "learning_rate": 4.337390565595163e-06, "logits/chosen": -0.4158423840999603, "logits/rejected": -0.36646509170532227, "logps/chosen": -1.0673354864120483, "logps/rejected": -1.0877690315246582, "loss": 1.1448, "odds_ratio_loss": 0.7746785879135132, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.1067335456609726, "rewards/margins": 0.0020433522295206785, "rewards/rejected": -0.10877690464258194, "sft_loss": 1.0673354864120483, "step": 400 }, { "epoch": 0.7290509002000445, "grad_norm": 0.36279189586639404, "learning_rate": 4.305482553496786e-06, "logits/chosen": -0.33700472116470337, "logits/rejected": -0.3831488788127899, "logps/chosen": -0.9607623815536499, "logps/rejected": -1.0405422449111938, "loss": 1.0363, "odds_ratio_loss": 0.7554237842559814, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09607623517513275, "rewards/margins": 0.007977982982993126, "rewards/rejected": -0.10405422747135162, "sft_loss": 0.9607623815536499, "step": 410 }, { "epoch": 0.7468326294732163, "grad_norm": 0.457087904214859, "learning_rate": 4.272947614573244e-06, "logits/chosen": -0.3999176621437073, "logits/rejected": -0.3756122291088104, "logps/chosen": -1.0111384391784668, "logps/rejected": -1.0757354497909546, "loss": 1.0826, "odds_ratio_loss": 0.7144282460212708, "rewards/accuracies": 0.53125, "rewards/chosen": -0.10111384093761444, "rewards/margins": 0.006459714379161596, "rewards/rejected": -0.10757355391979218, "sft_loss": 1.0111384391784668, "step": 420 }, { "epoch": 0.7646143587463881, "grad_norm": 0.2605019509792328, "learning_rate": 4.23979704609569e-06, "logits/chosen": -0.36384835839271545, "logits/rejected": -0.34030967950820923, "logps/chosen": -0.9615520238876343, "logps/rejected": -1.0373448133468628, "loss": 1.0309, "odds_ratio_loss": 0.6935026049613953, "rewards/accuracies": 0.53125, "rewards/chosen": -0.09615520387887955, "rewards/margins": 0.007579285651445389, "rewards/rejected": -0.10373447835445404, "sft_loss": 0.9615520238876343, "step": 430 }, { "epoch": 0.78239608801956, "grad_norm": 0.41911929845809937, "learning_rate": 4.206042359103435e-06, "logits/chosen": -0.38596296310424805, "logits/rejected": -0.37879234552383423, "logps/chosen": -0.9808257222175598, "logps/rejected": -1.121048927307129, "loss": 1.0531, "odds_ratio_loss": 0.7229377627372742, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.09808257222175598, "rewards/margins": 0.014022317714989185, "rewards/rejected": -0.11210489273071289, "sft_loss": 0.9808257222175598, "step": 440 }, { "epoch": 0.8001778172927317, "grad_norm": 0.7460839748382568, "learning_rate": 4.17169527440691e-06, "logits/chosen": -0.39514169096946716, "logits/rejected": -0.3737938106060028, "logps/chosen": -0.9438737630844116, "logps/rejected": -1.0060594081878662, "loss": 1.0182, "odds_ratio_loss": 0.7436385154724121, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.09438737481832504, "rewards/margins": 0.006218560039997101, "rewards/rejected": -0.10060594230890274, "sft_loss": 0.9438737630844116, "step": 450 }, { "epoch": 0.8179595465659035, "grad_norm": 0.5300458669662476, "learning_rate": 4.136767718517797e-06, "logits/chosen": -0.3699805736541748, "logits/rejected": -0.3850511312484741, "logps/chosen": -0.959467887878418, "logps/rejected": -1.100988507270813, "loss": 1.0256, "odds_ratio_loss": 0.6614881753921509, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0959467813372612, "rewards/margins": 0.014152060262858868, "rewards/rejected": -0.11009885370731354, "sft_loss": 0.959467887878418, "step": 460 }, { "epoch": 0.8357412758390753, "grad_norm": 0.9485012292861938, "learning_rate": 4.1012718195077196e-06, "logits/chosen": -0.37103739380836487, "logits/rejected": -0.3039020895957947, "logps/chosen": -0.9647709131240845, "logps/rejected": -1.0279747247695923, "loss": 1.0376, "odds_ratio_loss": 0.7286756038665771, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.09647707641124725, "rewards/margins": 0.006320389453321695, "rewards/rejected": -0.10279747098684311, "sft_loss": 0.9647709131240845, "step": 470 }, { "epoch": 0.8535230051122472, "grad_norm": 0.5754956603050232, "learning_rate": 4.065219902796953e-06, "logits/chosen": -0.40020495653152466, "logits/rejected": -0.39535146951675415, "logps/chosen": -0.9706109166145325, "logps/rejected": -1.093976378440857, "loss": 1.0453, "odds_ratio_loss": 0.7464355230331421, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.09706110507249832, "rewards/margins": 0.01233654748648405, "rewards/rejected": -0.10939764976501465, "sft_loss": 0.9706109166145325, "step": 480 }, { "epoch": 0.871304734385419, "grad_norm": 0.3195387125015259, "learning_rate": 4.028624486874608e-06, "logits/chosen": -0.4315417408943176, "logits/rejected": -0.36453911662101746, "logps/chosen": -0.9465911984443665, "logps/rejected": -1.1121985912322998, "loss": 1.0194, "odds_ratio_loss": 0.7276239991188049, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.09465911984443665, "rewards/margins": 0.016560742631554604, "rewards/rejected": -0.1112198606133461, "sft_loss": 0.9465911984443665, "step": 490 }, { "epoch": 0.8890864636585908, "grad_norm": 0.6305994391441345, "learning_rate": 3.99149827895177e-06, "logits/chosen": -0.38445502519607544, "logits/rejected": -0.38218945264816284, "logps/chosen": -1.0171244144439697, "logps/rejected": -1.0506142377853394, "loss": 1.0913, "odds_ratio_loss": 0.7415187358856201, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.10171245038509369, "rewards/margins": 0.0033489768393337727, "rewards/rejected": -0.10506142675876617, "sft_loss": 1.0171244144439697, "step": 500 }, { "epoch": 0.8890864636585908, "eval_logits/chosen": -0.34904247522354126, "eval_logits/rejected": -0.31755369901657104, "eval_logps/chosen": -0.9676439166069031, "eval_logps/rejected": -1.1074860095977783, "eval_loss": 1.0354068279266357, "eval_odds_ratio_loss": 0.6776295900344849, "eval_rewards/accuracies": 0.5180000066757202, "eval_rewards/chosen": -0.09676438570022583, "eval_rewards/margins": 0.013984210789203644, "eval_rewards/rejected": -0.11074860394001007, "eval_runtime": 185.9798, "eval_samples_per_second": 5.377, "eval_sft_loss": 0.9676439166069031, "eval_steps_per_second": 2.688, "step": 500 }, { "epoch": 0.9068681929317626, "grad_norm": 0.33740749955177307, "learning_rate": 3.953854170549114e-06, "logits/chosen": -0.3074025809764862, "logits/rejected": -0.30263853073120117, "logps/chosen": -0.9824435114860535, "logps/rejected": -1.0204169750213623, "loss": 1.0555, "odds_ratio_loss": 0.7308207750320435, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.09824434667825699, "rewards/margins": 0.0037973597645759583, "rewards/rejected": -0.10204169899225235, "sft_loss": 0.9824435114860535, "step": 510 }, { "epoch": 0.9246499222049345, "grad_norm": 0.4032406210899353, "learning_rate": 3.91570523302051e-06, "logits/chosen": -0.3414192199707031, "logits/rejected": -0.36243736743927, "logps/chosen": -0.8989545702934265, "logps/rejected": -1.0376076698303223, "loss": 0.9695, "odds_ratio_loss": 0.7055255174636841, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.08989545702934265, "rewards/margins": 0.013865319080650806, "rewards/rejected": -0.10376076400279999, "sft_loss": 0.8989545702934265, "step": 520 }, { "epoch": 0.9424316514781063, "grad_norm": 0.3632182776927948, "learning_rate": 3.8770647130141996e-06, "logits/chosen": -0.3258126378059387, "logits/rejected": -0.33273980021476746, "logps/chosen": -0.9584708213806152, "logps/rejected": -1.0552600622177124, "loss": 1.0316, "odds_ratio_loss": 0.731722891330719, "rewards/accuracies": 0.46875, "rewards/chosen": -0.09584707766771317, "rewards/margins": 0.009678924456238747, "rewards/rejected": -0.10552600771188736, "sft_loss": 0.9584708213806152, "step": 530 }, { "epoch": 0.960213380751278, "grad_norm": 0.3121795058250427, "learning_rate": 3.837946027873086e-06, "logits/chosen": -0.32046863436698914, "logits/rejected": -0.3653668463230133, "logps/chosen": -0.966636061668396, "logps/rejected": -1.1031057834625244, "loss": 1.0367, "odds_ratio_loss": 0.7007311582565308, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09666360169649124, "rewards/margins": 0.01364696491509676, "rewards/rejected": -0.11031056940555573, "sft_loss": 0.966636061668396, "step": 540 }, { "epoch": 0.9779951100244498, "grad_norm": 0.6487416625022888, "learning_rate": 3.7983627609757713e-06, "logits/chosen": -0.34747475385665894, "logits/rejected": -0.3490690290927887, "logps/chosen": -0.9615602493286133, "logps/rejected": -1.0271753072738647, "loss": 1.0318, "odds_ratio_loss": 0.702663779258728, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.09615601599216461, "rewards/margins": 0.0065615237690508366, "rewards/rejected": -0.10271754115819931, "sft_loss": 0.9615602493286133, "step": 550 }, { "epoch": 0.9957768392976217, "grad_norm": 0.3890874683856964, "learning_rate": 3.758328657019924e-06, "logits/chosen": -0.37014687061309814, "logits/rejected": -0.4008961319923401, "logps/chosen": -0.9199098348617554, "logps/rejected": -1.0562833547592163, "loss": 0.9886, "odds_ratio_loss": 0.6868860721588135, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.09199099242687225, "rewards/margins": 0.013637351803481579, "rewards/rejected": -0.1056283488869667, "sft_loss": 0.9199098348617554, "step": 560 }, { "epoch": 1.0135585685707935, "grad_norm": 1.5021965503692627, "learning_rate": 3.717857617249642e-06, "logits/chosen": -0.409252405166626, "logits/rejected": -0.3774147033691406, "logps/chosen": -1.0592560768127441, "logps/rejected": -1.1887257099151611, "loss": 1.135, "odds_ratio_loss": 0.7577823400497437, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.10592560470104218, "rewards/margins": 0.012946966104209423, "rewards/rejected": -0.11887258291244507, "sft_loss": 1.0592560768127441, "step": 570 }, { "epoch": 1.0313402978439654, "grad_norm": 0.36601969599723816, "learning_rate": 3.6769636946284543e-06, "logits/chosen": -0.33855992555618286, "logits/rejected": -0.38329094648361206, "logps/chosen": -0.9246651530265808, "logps/rejected": -1.0259661674499512, "loss": 0.9949, "odds_ratio_loss": 0.7019587755203247, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.09246651828289032, "rewards/margins": 0.01013010274618864, "rewards/rejected": -0.10259661823511124, "sft_loss": 0.9246651530265808, "step": 580 }, { "epoch": 1.049122027117137, "grad_norm": 0.3644584119319916, "learning_rate": 3.6356610889596355e-06, "logits/chosen": -0.3362785577774048, "logits/rejected": -0.3195570707321167, "logps/chosen": -0.9757383465766907, "logps/rejected": -1.0168259143829346, "loss": 1.0499, "odds_ratio_loss": 0.7411800622940063, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09757383167743683, "rewards/margins": 0.004108763299882412, "rewards/rejected": -0.10168258845806122, "sft_loss": 0.9757383465766907, "step": 590 }, { "epoch": 1.066903756390309, "grad_norm": 0.38790592551231384, "learning_rate": 3.593964141955541e-06, "logits/chosen": -0.31955039501190186, "logits/rejected": -0.3287174701690674, "logps/chosen": -0.9446002244949341, "logps/rejected": -0.9857986569404602, "loss": 1.0183, "odds_ratio_loss": 0.7368658185005188, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09446002542972565, "rewards/margins": 0.004119834862649441, "rewards/rejected": -0.09857985377311707, "sft_loss": 0.9446002244949341, "step": 600 }, { "epoch": 1.0846854856634809, "grad_norm": 0.3323744237422943, "learning_rate": 3.5518873322576573e-06, "logits/chosen": -0.43425217270851135, "logits/rejected": -0.3568256199359894, "logps/chosen": -0.9986424446105957, "logps/rejected": -1.0531480312347412, "loss": 1.073, "odds_ratio_loss": 0.7439261674880981, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.09986423701047897, "rewards/margins": 0.005450558383017778, "rewards/rejected": -0.10531480610370636, "sft_loss": 0.9986424446105957, "step": 610 }, { "epoch": 1.1024672149366526, "grad_norm": 0.45893725752830505, "learning_rate": 3.5094452704091143e-06, "logits/chosen": -0.3812747299671173, "logits/rejected": -0.36471351981163025, "logps/chosen": -0.9423580169677734, "logps/rejected": -1.0641114711761475, "loss": 1.0114, "odds_ratio_loss": 0.6907029747962952, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.09423580020666122, "rewards/margins": 0.01217535138130188, "rewards/rejected": -0.10641114413738251, "sft_loss": 0.9423580169677734, "step": 620 }, { "epoch": 1.1202489442098245, "grad_norm": 0.5117968916893005, "learning_rate": 3.46665269378139e-06, "logits/chosen": -0.3292369842529297, "logits/rejected": -0.3725055158138275, "logps/chosen": -0.9826286435127258, "logps/rejected": -1.0871622562408447, "loss": 1.0548, "odds_ratio_loss": 0.7213753461837769, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09826286137104034, "rewards/margins": 0.010453373193740845, "rewards/rejected": -0.10871622711420059, "sft_loss": 0.9826286435127258, "step": 630 }, { "epoch": 1.1380306734829961, "grad_norm": 0.5622742176055908, "learning_rate": 3.4235244614569794e-06, "logits/chosen": -0.3315224051475525, "logits/rejected": -0.3257826566696167, "logps/chosen": -1.1072447299957275, "logps/rejected": -1.0443857908248901, "loss": 1.1924, "odds_ratio_loss": 0.8511736989021301, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.11072447150945663, "rewards/margins": -0.006285896059125662, "rewards/rejected": -0.10443858057260513, "sft_loss": 1.1072447299957275, "step": 640 }, { "epoch": 1.155812402756168, "grad_norm": 0.27428311109542847, "learning_rate": 3.3800755490698008e-06, "logits/chosen": -0.30900219082832336, "logits/rejected": -0.33938735723495483, "logps/chosen": -0.9312244653701782, "logps/rejected": -1.0983222723007202, "loss": 0.9964, "odds_ratio_loss": 0.651997447013855, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0931224375963211, "rewards/margins": 0.01670977845788002, "rewards/rejected": -0.10983221232891083, "sft_loss": 0.9312244653701782, "step": 650 }, { "epoch": 1.17359413202934, "grad_norm": 1.0422977209091187, "learning_rate": 3.3363210436051287e-06, "logits/chosen": -0.3527902662754059, "logits/rejected": -0.3563137948513031, "logps/chosen": -0.978245735168457, "logps/rejected": -1.0940849781036377, "loss": 1.0514, "odds_ratio_loss": 0.73140949010849, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0978245884180069, "rewards/margins": 0.011583918705582619, "rewards/rejected": -0.10940849781036377, "sft_loss": 0.978245735168457, "step": 660 }, { "epoch": 1.1913758613025116, "grad_norm": 0.4168451428413391, "learning_rate": 3.292276138160867e-06, "logits/chosen": -0.28714054822921753, "logits/rejected": -0.30155253410339355, "logps/chosen": -0.934456467628479, "logps/rejected": -1.0636101961135864, "loss": 1.0032, "odds_ratio_loss": 0.6879295110702515, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.09344564378261566, "rewards/margins": 0.012915370985865593, "rewards/rejected": -0.1063610091805458, "sft_loss": 0.934456467628479, "step": 670 }, { "epoch": 1.2091575905756835, "grad_norm": 0.34239086508750916, "learning_rate": 3.2479561266719694e-06, "logits/chosen": -0.381683886051178, "logits/rejected": -0.37388402223587036, "logps/chosen": -0.9762662649154663, "logps/rejected": -1.0414526462554932, "loss": 1.0493, "odds_ratio_loss": 0.7306024432182312, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09762662649154663, "rewards/margins": 0.006518647074699402, "rewards/rejected": -0.10414527356624603, "sft_loss": 0.9762662649154663, "step": 680 }, { "epoch": 1.2269393198488552, "grad_norm": 0.4666767716407776, "learning_rate": 3.2033763985998533e-06, "logits/chosen": -0.3561275601387024, "logits/rejected": -0.3666972517967224, "logps/chosen": -0.9278993606567383, "logps/rejected": -1.172456979751587, "loss": 0.9924, "odds_ratio_loss": 0.6447319984436035, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.09278994053602219, "rewards/margins": 0.024455763399600983, "rewards/rejected": -0.11724568903446198, "sft_loss": 0.9278993606567383, "step": 690 }, { "epoch": 1.244721049122027, "grad_norm": 0.4466889202594757, "learning_rate": 3.1585524335886335e-06, "logits/chosen": -0.3700794279575348, "logits/rejected": -0.37532711029052734, "logps/chosen": -0.893964409828186, "logps/rejected": -1.0242712497711182, "loss": 0.9628, "odds_ratio_loss": 0.6878638863563538, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.08939644694328308, "rewards/margins": 0.013030675239861012, "rewards/rejected": -0.10242712497711182, "sft_loss": 0.893964409828186, "step": 700 }, { "epoch": 1.262502778395199, "grad_norm": 0.6432116031646729, "learning_rate": 3.1134997960900536e-06, "logits/chosen": -0.3843459486961365, "logits/rejected": -0.4183478355407715, "logps/chosen": -0.8787266612052917, "logps/rejected": -1.1227346658706665, "loss": 0.9417, "odds_ratio_loss": 0.6295467615127563, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08787266910076141, "rewards/margins": 0.024400796741247177, "rewards/rejected": -0.11227346956729889, "sft_loss": 0.8787266612052917, "step": 710 }, { "epoch": 1.2802845076683709, "grad_norm": 0.47079232335090637, "learning_rate": 3.0682341299589583e-06, "logits/chosen": -0.3750189244747162, "logits/rejected": -0.33040302991867065, "logps/chosen": -0.9284566640853882, "logps/rejected": -0.9662970304489136, "loss": 1.0031, "odds_ratio_loss": 0.7467560172080994, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.09284567832946777, "rewards/margins": 0.0037840281147509813, "rewards/rejected": -0.09662970155477524, "sft_loss": 0.9284566640853882, "step": 720 }, { "epoch": 1.2980662369415426, "grad_norm": 0.4881021976470947, "learning_rate": 3.022771153021201e-06, "logits/chosen": -0.3772386610507965, "logits/rejected": -0.3512099087238312, "logps/chosen": -0.9160524606704712, "logps/rejected": -1.0388538837432861, "loss": 0.986, "odds_ratio_loss": 0.6990936994552612, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.09160524606704712, "rewards/margins": 0.012280138209462166, "rewards/rejected": -0.10388537496328354, "sft_loss": 0.9160524606704712, "step": 730 }, { "epoch": 1.3158479662147144, "grad_norm": 0.3279300034046173, "learning_rate": 2.9771266516158625e-06, "logits/chosen": -0.33211830258369446, "logits/rejected": -0.3039989471435547, "logps/chosen": -0.9333264231681824, "logps/rejected": -1.0419334173202515, "loss": 1.0054, "odds_ratio_loss": 0.72088623046875, "rewards/accuracies": 0.5, "rewards/chosen": -0.09333264082670212, "rewards/margins": 0.010860702954232693, "rewards/rejected": -0.10419335216283798, "sft_loss": 0.9333264231681824, "step": 740 }, { "epoch": 1.3336296954878861, "grad_norm": 0.311788409948349, "learning_rate": 2.9313164751136802e-06, "logits/chosen": -0.3910767436027527, "logits/rejected": -0.36302170157432556, "logps/chosen": -0.9149459004402161, "logps/rejected": -1.0412867069244385, "loss": 0.9824, "odds_ratio_loss": 0.6748364567756653, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0914945974946022, "rewards/margins": 0.01263406127691269, "rewards/rejected": -0.1041286438703537, "sft_loss": 0.9149459004402161, "step": 750 }, { "epoch": 1.351411424761058, "grad_norm": 0.5009350180625916, "learning_rate": 2.8853565304135956e-06, "logits/chosen": -0.28646108508110046, "logits/rejected": -0.3241187632083893, "logps/chosen": -0.988601803779602, "logps/rejected": -1.0276473760604858, "loss": 1.0645, "odds_ratio_loss": 0.759224534034729, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.09886018931865692, "rewards/margins": 0.0039045563898980618, "rewards/rejected": -0.10276474803686142, "sft_loss": 0.988601803779602, "step": 760 }, { "epoch": 1.36919315403423, "grad_norm": 0.5821639895439148, "learning_rate": 2.839262776419313e-06, "logits/chosen": -0.345294713973999, "logits/rejected": -0.34865519404411316, "logps/chosen": -0.9152688980102539, "logps/rejected": -1.12654709815979, "loss": 0.9828, "odds_ratio_loss": 0.6755408644676208, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09152691066265106, "rewards/margins": 0.021127816289663315, "rewards/rejected": -0.11265470832586288, "sft_loss": 0.9152688980102539, "step": 770 }, { "epoch": 1.3869748833074016, "grad_norm": 0.39795824885368347, "learning_rate": 2.793051218497817e-06, "logits/chosen": -0.27542608976364136, "logits/rejected": -0.27257028222084045, "logps/chosen": -0.931863009929657, "logps/rejected": -0.9498918652534485, "loss": 1.0074, "odds_ratio_loss": 0.7550782561302185, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09318631142377853, "rewards/margins": 0.0018028710037469864, "rewards/rejected": -0.09498917311429977, "sft_loss": 0.931863009929657, "step": 780 }, { "epoch": 1.4047566125805735, "grad_norm": 0.37384262681007385, "learning_rate": 2.7467379029217437e-06, "logits/chosen": -0.34524422883987427, "logits/rejected": -0.36011195182800293, "logps/chosen": -0.9515836834907532, "logps/rejected": -1.0694557428359985, "loss": 1.0211, "odds_ratio_loss": 0.6952496767044067, "rewards/accuracies": 0.5, "rewards/chosen": -0.09515835344791412, "rewards/margins": 0.011787201277911663, "rewards/rejected": -0.10694557428359985, "sft_loss": 0.9515836834907532, "step": 790 }, { "epoch": 1.4225383418537452, "grad_norm": 0.30680692195892334, "learning_rate": 2.7003389112975546e-06, "logits/chosen": -0.26400548219680786, "logits/rejected": -0.20824924111366272, "logps/chosen": -0.9995955228805542, "logps/rejected": -1.0734318494796753, "loss": 1.0721, "odds_ratio_loss": 0.7255308628082275, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.09995955973863602, "rewards/margins": 0.007383632007986307, "rewards/rejected": -0.10734319686889648, "sft_loss": 0.9995955228805542, "step": 800 }, { "epoch": 1.440320071126917, "grad_norm": 0.7603825926780701, "learning_rate": 2.653870354981437e-06, "logits/chosen": -0.36708512902259827, "logits/rejected": -0.4067977964878082, "logps/chosen": -0.869776725769043, "logps/rejected": -0.9957377314567566, "loss": 0.9397, "odds_ratio_loss": 0.6991982460021973, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08697767555713654, "rewards/margins": 0.012596105225384235, "rewards/rejected": -0.0995737761259079, "sft_loss": 0.869776725769043, "step": 810 }, { "epoch": 1.458101800400089, "grad_norm": 0.8572419881820679, "learning_rate": 2.6073483694848777e-06, "logits/chosen": -0.3313853442668915, "logits/rejected": -0.2504517734050751, "logps/chosen": -0.9180091619491577, "logps/rejected": -1.0551806688308716, "loss": 0.9865, "odds_ratio_loss": 0.6853691339492798, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.09180092811584473, "rewards/margins": 0.013717141933739185, "rewards/rejected": -0.10551806539297104, "sft_loss": 0.9180091619491577, "step": 820 }, { "epoch": 1.4758835296732609, "grad_norm": 0.2907600700855255, "learning_rate": 2.560789108871847e-06, "logits/chosen": -0.35712695121765137, "logits/rejected": -0.34705477952957153, "logps/chosen": -0.9147292971611023, "logps/rejected": -1.1361644268035889, "loss": 0.9806, "odds_ratio_loss": 0.6587303280830383, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.09147293865680695, "rewards/margins": 0.02214350923895836, "rewards/rejected": -0.113616444170475, "sft_loss": 0.9147292971611023, "step": 830 }, { "epoch": 1.4936652589464325, "grad_norm": 0.9957931637763977, "learning_rate": 2.514208740149544e-06, "logits/chosen": -0.38370782136917114, "logits/rejected": -0.372738778591156, "logps/chosen": -1.0301647186279297, "logps/rejected": -1.131388783454895, "loss": 1.1016, "odds_ratio_loss": 0.7141064405441284, "rewards/accuracies": 0.46875, "rewards/chosen": -0.10301647335290909, "rewards/margins": 0.010122401639819145, "rewards/rejected": -0.11313886940479279, "sft_loss": 1.0301647186279297, "step": 840 }, { "epoch": 1.5114469882196042, "grad_norm": 0.3347834050655365, "learning_rate": 2.46762343765464e-06, "logits/chosen": -0.33272939920425415, "logits/rejected": -0.3354397416114807, "logps/chosen": -0.9821497797966003, "logps/rejected": -1.1356861591339111, "loss": 1.0494, "odds_ratio_loss": 0.672347903251648, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09821496903896332, "rewards/margins": 0.015353633090853691, "rewards/rejected": -0.11356861889362335, "sft_loss": 0.9821497797966003, "step": 850 }, { "epoch": 1.5292287174927761, "grad_norm": 0.40781450271606445, "learning_rate": 2.4210493774369903e-06, "logits/chosen": -0.3659764528274536, "logits/rejected": -0.34343641996383667, "logps/chosen": -0.9932387471199036, "logps/rejected": -1.0735210180282593, "loss": 1.0663, "odds_ratio_loss": 0.7305824160575867, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.09932386875152588, "rewards/margins": 0.008028226904571056, "rewards/rejected": -0.10735210031270981, "sft_loss": 0.9932387471199036, "step": 860 }, { "epoch": 1.547010446765948, "grad_norm": 0.33270904421806335, "learning_rate": 2.374502731642732e-06, "logits/chosen": -0.33156028389930725, "logits/rejected": -0.3256151080131531, "logps/chosen": -0.9762036204338074, "logps/rejected": -1.0732605457305908, "loss": 1.0483, "odds_ratio_loss": 0.7209652662277222, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09762036800384521, "rewards/margins": 0.009705697186291218, "rewards/rejected": -0.10732606798410416, "sft_loss": 0.9762036204338074, "step": 870 }, { "epoch": 1.56479217603912, "grad_norm": 0.46649253368377686, "learning_rate": 2.3279996628987556e-06, "logits/chosen": -0.3505496084690094, "logits/rejected": -0.3284318149089813, "logps/chosen": -0.9539216756820679, "logps/rejected": -1.0178234577178955, "loss": 1.0269, "odds_ratio_loss": 0.7295688390731812, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09539216756820679, "rewards/margins": 0.006390177644789219, "rewards/rejected": -0.10178234428167343, "sft_loss": 0.9539216756820679, "step": 880 }, { "epoch": 1.5825739053122916, "grad_norm": 0.343382865190506, "learning_rate": 2.281556318700474e-06, "logits/chosen": -0.2859468460083008, "logits/rejected": -0.25978535413742065, "logps/chosen": -0.904071033000946, "logps/rejected": -0.9673022031784058, "loss": 0.9788, "odds_ratio_loss": 0.7473067045211792, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.0904071107506752, "rewards/margins": 0.00632312148809433, "rewards/rejected": -0.09673022478818893, "sft_loss": 0.904071033000946, "step": 890 }, { "epoch": 1.6003556345854635, "grad_norm": 0.6206201314926147, "learning_rate": 2.2351888258048408e-06, "logits/chosen": -0.3074144423007965, "logits/rejected": -0.2645527720451355, "logps/chosen": -0.8916131854057312, "logps/rejected": -0.9986615180969238, "loss": 0.9603, "odds_ratio_loss": 0.6866299510002136, "rewards/accuracies": 0.5, "rewards/chosen": -0.08916132152080536, "rewards/margins": 0.010704840533435345, "rewards/rejected": -0.09986615926027298, "sft_loss": 0.8916131854057312, "step": 900 }, { "epoch": 1.6181373638586352, "grad_norm": 0.3601900339126587, "learning_rate": 2.188913284630584e-06, "logits/chosen": -0.33852243423461914, "logits/rejected": -0.3135743737220764, "logps/chosen": -0.9911006689071655, "logps/rejected": -1.016789197921753, "loss": 1.0679, "odds_ratio_loss": 0.7680201530456543, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.09911007434129715, "rewards/margins": 0.0025688547175377607, "rewards/rejected": -0.10167893022298813, "sft_loss": 0.9911006689071655, "step": 910 }, { "epoch": 1.635919093131807, "grad_norm": 0.6057630777359009, "learning_rate": 2.1427457636675652e-06, "logits/chosen": -0.3320189118385315, "logits/rejected": -0.28204983472824097, "logps/chosen": -1.0480351448059082, "logps/rejected": -1.1421617269515991, "loss": 1.1202, "odds_ratio_loss": 0.7219060659408569, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.10480351746082306, "rewards/margins": 0.00941266119480133, "rewards/rejected": -0.11421617120504379, "sft_loss": 1.0480351448059082, "step": 920 }, { "epoch": 1.653700822404979, "grad_norm": 0.27687886357307434, "learning_rate": 2.096702293897247e-06, "logits/chosen": -0.3558569550514221, "logits/rejected": -0.4100232720375061, "logps/chosen": -0.9075578451156616, "logps/rejected": -1.1192221641540527, "loss": 0.9773, "odds_ratio_loss": 0.6971360445022583, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.09075579047203064, "rewards/margins": 0.02116643264889717, "rewards/rejected": -0.11192221939563751, "sft_loss": 0.9075578451156616, "step": 930 }, { "epoch": 1.6714825516781509, "grad_norm": 0.5104541182518005, "learning_rate": 2.0507988632261672e-06, "logits/chosen": -0.37269848585128784, "logits/rejected": -0.3488038182258606, "logps/chosen": -0.8780601620674133, "logps/rejected": -1.035788893699646, "loss": 0.9453, "odds_ratio_loss": 0.6724425554275513, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08780601620674133, "rewards/margins": 0.015772882848978043, "rewards/rejected": -0.10357888787984848, "sft_loss": 0.8780601620674133, "step": 940 }, { "epoch": 1.6892642809513225, "grad_norm": 1.108080506324768, "learning_rate": 2.005051410934382e-06, "logits/chosen": -0.3843027949333191, "logits/rejected": -0.36695486307144165, "logps/chosen": -1.0294411182403564, "logps/rejected": -1.073974847793579, "loss": 1.1057, "odds_ratio_loss": 0.7625271081924438, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.10294412076473236, "rewards/margins": 0.004453369881957769, "rewards/rejected": -0.10739749670028687, "sft_loss": 1.0294411182403564, "step": 950 }, { "epoch": 1.7070460102244942, "grad_norm": 0.6668155789375305, "learning_rate": 1.9594758221407843e-06, "logits/chosen": -0.30207034945487976, "logits/rejected": -0.31365981698036194, "logps/chosen": -0.8924224972724915, "logps/rejected": -1.0662165880203247, "loss": 0.9564, "odds_ratio_loss": 0.6395965218544006, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08924224227666855, "rewards/margins": 0.017379416152834892, "rewards/rejected": -0.10662166774272919, "sft_loss": 0.8924224972724915, "step": 960 }, { "epoch": 1.724827739497666, "grad_norm": 0.5297231674194336, "learning_rate": 1.9140879222872408e-06, "logits/chosen": -0.3790926933288574, "logits/rejected": -0.34034663438796997, "logps/chosen": -0.9109382629394531, "logps/rejected": -0.9725145101547241, "loss": 0.9864, "odds_ratio_loss": 0.7550500631332397, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.09109383821487427, "rewards/margins": 0.006157620809972286, "rewards/rejected": -0.09725145250558853, "sft_loss": 0.9109382629394531, "step": 970 }, { "epoch": 1.742609468770838, "grad_norm": 0.2978646457195282, "learning_rate": 1.8689034716434346e-06, "logits/chosen": -0.3594937026500702, "logits/rejected": -0.3786514699459076, "logps/chosen": -0.9791936874389648, "logps/rejected": -1.0208795070648193, "loss": 1.054, "odds_ratio_loss": 0.7475694417953491, "rewards/accuracies": 0.46875, "rewards/chosen": -0.09791935980319977, "rewards/margins": 0.0041685826145112514, "rewards/rejected": -0.10208795219659805, "sft_loss": 0.9791936874389648, "step": 980 }, { "epoch": 1.76039119804401, "grad_norm": 0.3484848439693451, "learning_rate": 1.8239381598343576e-06, "logits/chosen": -0.29449883103370667, "logits/rejected": -0.3054262697696686, "logps/chosen": -0.9115015864372253, "logps/rejected": -1.0031999349594116, "loss": 0.9826, "odds_ratio_loss": 0.7106297016143799, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.09115016460418701, "rewards/margins": 0.00916983187198639, "rewards/rejected": -0.1003199964761734, "sft_loss": 0.9115015864372253, "step": 990 }, { "epoch": 1.7781729273171816, "grad_norm": 2.2374985218048096, "learning_rate": 1.779207600392312e-06, "logits/chosen": -0.2810733914375305, "logits/rejected": -0.27120235562324524, "logps/chosen": -0.9607506990432739, "logps/rejected": -1.0408788919448853, "loss": 1.0328, "odds_ratio_loss": 0.7200591564178467, "rewards/accuracies": 0.46875, "rewards/chosen": -0.09607508033514023, "rewards/margins": 0.008012807928025723, "rewards/rejected": -0.10408788919448853, "sft_loss": 0.9607506990432739, "step": 1000 }, { "epoch": 1.7781729273171816, "eval_logits/chosen": -0.33078742027282715, "eval_logits/rejected": -0.29791274666786194, "eval_logps/chosen": -0.9451074004173279, "eval_logps/rejected": -1.0856181383132935, "eval_loss": 1.0125839710235596, "eval_odds_ratio_loss": 0.6747645735740662, "eval_rewards/accuracies": 0.515999972820282, "eval_rewards/chosen": -0.0945107489824295, "eval_rewards/margins": 0.014051074162125587, "eval_rewards/rejected": -0.10856182873249054, "eval_runtime": 185.8537, "eval_samples_per_second": 5.381, "eval_sft_loss": 0.9451074004173279, "eval_steps_per_second": 2.69, "step": 1000 }, { "epoch": 1.7959546565903532, "grad_norm": 0.7795166373252869, "learning_rate": 1.7347273253353552e-06, "logits/chosen": -0.33356940746307373, "logits/rejected": -0.3380289077758789, "logps/chosen": -0.918900191783905, "logps/rejected": -0.9768841862678528, "loss": 0.9932, "odds_ratio_loss": 0.7426038980484009, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09189002215862274, "rewards/margins": 0.005798395723104477, "rewards/rejected": -0.09768841415643692, "sft_loss": 0.918900191783905, "step": 1010 }, { "epoch": 1.8137363858635251, "grad_norm": 0.8157365322113037, "learning_rate": 1.690512779774029e-06, "logits/chosen": -0.3094736635684967, "logits/rejected": -0.28969138860702515, "logps/chosen": -0.9715908765792847, "logps/rejected": -1.1499989032745361, "loss": 1.037, "odds_ratio_loss": 0.6542772054672241, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.09715909510850906, "rewards/margins": 0.017840798944234848, "rewards/rejected": -0.11499989032745361, "sft_loss": 0.9715908765792847, "step": 1020 }, { "epoch": 1.831518115136697, "grad_norm": 0.5331993103027344, "learning_rate": 1.6465793165482838e-06, "logits/chosen": -0.274508535861969, "logits/rejected": -0.26048415899276733, "logps/chosen": -0.9679173231124878, "logps/rejected": -1.0533314943313599, "loss": 1.0376, "odds_ratio_loss": 0.6963869333267212, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.09679173678159714, "rewards/margins": 0.008541420102119446, "rewards/rejected": -0.10533314943313599, "sft_loss": 0.9679173231124878, "step": 1030 }, { "epoch": 1.849299844409869, "grad_norm": 0.4930827021598816, "learning_rate": 1.6029421908964305e-06, "logits/chosen": -0.3850288391113281, "logits/rejected": -0.3791029155254364, "logps/chosen": -0.8834483027458191, "logps/rejected": -1.2469079494476318, "loss": 0.9502, "odds_ratio_loss": 0.6672720313072205, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08834483474493027, "rewards/margins": 0.03634597733616829, "rewards/rejected": -0.12469079345464706, "sft_loss": 0.8834483027458191, "step": 1040 }, { "epoch": 1.8670815736830408, "grad_norm": 0.7664922475814819, "learning_rate": 1.559616555157985e-06, "logits/chosen": -0.30128011107444763, "logits/rejected": -0.33186617493629456, "logps/chosen": -0.9356236457824707, "logps/rejected": -1.047398328781128, "loss": 1.0066, "odds_ratio_loss": 0.7096288800239563, "rewards/accuracies": 0.46875, "rewards/chosen": -0.09356234967708588, "rewards/margins": 0.01117746438831091, "rewards/rejected": -0.10473982989788055, "sft_loss": 0.9356236457824707, "step": 1050 }, { "epoch": 1.8848633029562125, "grad_norm": 0.465348482131958, "learning_rate": 1.516617453512252e-06, "logits/chosen": -0.36206910014152527, "logits/rejected": -0.34239286184310913, "logps/chosen": -0.9592390060424805, "logps/rejected": -1.0232237577438354, "loss": 1.0338, "odds_ratio_loss": 0.7456762194633484, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.09592391550540924, "rewards/margins": 0.006398468278348446, "rewards/rejected": -0.10232237726449966, "sft_loss": 0.9592390060424805, "step": 1060 }, { "epoch": 1.9026450322293842, "grad_norm": 0.830959677696228, "learning_rate": 1.473959816754449e-06, "logits/chosen": -0.39980772137641907, "logits/rejected": -0.3537663221359253, "logps/chosen": -0.920127272605896, "logps/rejected": -0.9525257349014282, "loss": 0.9942, "odds_ratio_loss": 0.7409034967422485, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.09201272577047348, "rewards/margins": 0.0032398372422903776, "rewards/rejected": -0.09525256603956223, "sft_loss": 0.920127272605896, "step": 1070 }, { "epoch": 1.920426761502556, "grad_norm": 0.442227303981781, "learning_rate": 1.4316584571112213e-06, "logits/chosen": -0.23950842022895813, "logits/rejected": -0.25979962944984436, "logps/chosen": -0.9493446350097656, "logps/rejected": -1.02411687374115, "loss": 1.022, "odds_ratio_loss": 0.7267680764198303, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.09493447840213776, "rewards/margins": 0.007477219216525555, "rewards/rejected": -0.10241168737411499, "sft_loss": 0.9493446350097656, "step": 1080 }, { "epoch": 1.938208490775728, "grad_norm": 0.4206017851829529, "learning_rate": 1.389728063097306e-06, "logits/chosen": -0.23708462715148926, "logits/rejected": -0.24299781024456024, "logps/chosen": -0.9439695477485657, "logps/rejected": -1.1116364002227783, "loss": 1.0118, "odds_ratio_loss": 0.6782708764076233, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09439694881439209, "rewards/margins": 0.016766689717769623, "rewards/rejected": -0.1111636534333229, "sft_loss": 0.9439695477485657, "step": 1090 }, { "epoch": 1.9559902200488999, "grad_norm": 0.3826051354408264, "learning_rate": 1.348183194415179e-06, "logits/chosen": -0.332774817943573, "logits/rejected": -0.35824882984161377, "logps/chosen": -0.9340184926986694, "logps/rejected": -1.110877275466919, "loss": 1.0005, "odds_ratio_loss": 0.6648778915405273, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.09340184926986694, "rewards/margins": 0.01768588088452816, "rewards/rejected": -0.11108773946762085, "sft_loss": 0.9340184926986694, "step": 1100 }, { "epoch": 1.9737719493220716, "grad_norm": 0.3005673587322235, "learning_rate": 1.3070382768994015e-06, "logits/chosen": -0.30200204253196716, "logits/rejected": -0.3130107522010803, "logps/chosen": -0.9192419052124023, "logps/rejected": -0.9889400601387024, "loss": 0.9898, "odds_ratio_loss": 0.7055012583732605, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.09192419052124023, "rewards/margins": 0.006969820708036423, "rewards/rejected": -0.09889401495456696, "sft_loss": 0.9192419052124023, "step": 1110 }, { "epoch": 1.9915536785952432, "grad_norm": 0.4379596710205078, "learning_rate": 1.2663075975074746e-06, "logits/chosen": -0.3314594626426697, "logits/rejected": -0.33315131068229675, "logps/chosen": -0.9054539799690247, "logps/rejected": -1.0939247608184814, "loss": 0.9734, "odds_ratio_loss": 0.6797955632209778, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.0905454009771347, "rewards/margins": 0.018847089260816574, "rewards/rejected": -0.10939247906208038, "sft_loss": 0.9054539799690247, "step": 1120 }, { "epoch": 2.009335407868415, "grad_norm": 0.6127385497093201, "learning_rate": 1.2260052993589034e-06, "logits/chosen": -0.382732093334198, "logits/rejected": -0.36521822214126587, "logps/chosen": -1.0369594097137451, "logps/rejected": -1.0331060886383057, "loss": 1.1183, "odds_ratio_loss": 0.8130975961685181, "rewards/accuracies": 0.46875, "rewards/chosen": -0.10369595140218735, "rewards/margins": -0.0003853384405374527, "rewards/rejected": -0.10331060737371445, "sft_loss": 1.0369594097137451, "step": 1130 }, { "epoch": 2.027117137141587, "grad_norm": 0.3373187780380249, "learning_rate": 1.1861453768242099e-06, "logits/chosen": -0.3635232448577881, "logits/rejected": -0.3613505959510803, "logps/chosen": -0.9056431651115417, "logps/rejected": -1.0306495428085327, "loss": 0.9749, "odds_ratio_loss": 0.6926708221435547, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0905643105506897, "rewards/margins": 0.012500641867518425, "rewards/rejected": -0.10306496918201447, "sft_loss": 0.9056431651115417, "step": 1140 }, { "epoch": 2.044898866414759, "grad_norm": 0.9102166891098022, "learning_rate": 1.1467416706655982e-06, "logits/chosen": -0.2888937294483185, "logits/rejected": -0.26064902544021606, "logps/chosen": -0.9796838760375977, "logps/rejected": -1.1222679615020752, "loss": 1.0522, "odds_ratio_loss": 0.7250452637672424, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.09796838462352753, "rewards/margins": 0.014258405193686485, "rewards/rejected": -0.11222679913043976, "sft_loss": 0.9796838760375977, "step": 1150 }, { "epoch": 2.062680595687931, "grad_norm": 0.3294011652469635, "learning_rate": 1.1078078632309559e-06, "logits/chosen": -0.34561508893966675, "logits/rejected": -0.3147248923778534, "logps/chosen": -0.9134725332260132, "logps/rejected": -1.0285111665725708, "loss": 0.9808, "odds_ratio_loss": 0.6730369329452515, "rewards/accuracies": 0.53125, "rewards/chosen": -0.09134725481271744, "rewards/margins": 0.011503859423100948, "rewards/rejected": -0.10285113006830215, "sft_loss": 0.9134725332260132, "step": 1160 }, { "epoch": 2.0804623249611023, "grad_norm": 0.34308087825775146, "learning_rate": 1.0693574737028627e-06, "logits/chosen": -0.3372167944908142, "logits/rejected": -0.33946290612220764, "logps/chosen": -0.9201191067695618, "logps/rejected": -1.0031434297561646, "loss": 0.9946, "odds_ratio_loss": 0.744364321231842, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.09201192110776901, "rewards/margins": 0.008302421309053898, "rewards/rejected": -0.10031434148550034, "sft_loss": 0.9201191067695618, "step": 1170 }, { "epoch": 2.098244054234274, "grad_norm": 0.5865955948829651, "learning_rate": 1.0314038534042586e-06, "logits/chosen": -0.2901017963886261, "logits/rejected": -0.32853323221206665, "logps/chosen": -0.9257968068122864, "logps/rejected": -1.0451035499572754, "loss": 0.9964, "odds_ratio_loss": 0.7055808901786804, "rewards/accuracies": 0.5, "rewards/chosen": -0.0925796777009964, "rewards/margins": 0.01193068828433752, "rewards/rejected": -0.10451038181781769, "sft_loss": 0.9257968068122864, "step": 1180 }, { "epoch": 2.116025783507446, "grad_norm": 0.41964584589004517, "learning_rate": 9.939601811623946e-07, "logits/chosen": -0.31542712450027466, "logits/rejected": -0.30006498098373413, "logps/chosen": -0.9362471699714661, "logps/rejected": -1.0245290994644165, "loss": 1.0084, "odds_ratio_loss": 0.7219125032424927, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09362472593784332, "rewards/margins": 0.008828198537230492, "rewards/rejected": -0.10245291888713837, "sft_loss": 0.9362471699714661, "step": 1190 }, { "epoch": 2.133807512780618, "grad_norm": 0.48077794909477234, "learning_rate": 9.570394587326825e-07, "logits/chosen": -0.29744619131088257, "logits/rejected": -0.34743356704711914, "logps/chosen": -0.9422229528427124, "logps/rejected": -1.1074718236923218, "loss": 1.0093, "odds_ratio_loss": 0.6704057455062866, "rewards/accuracies": 0.59375, "rewards/chosen": -0.094222292304039, "rewards/margins": 0.016524888575077057, "rewards/rejected": -0.11074719578027725, "sft_loss": 0.9422229528427124, "step": 1200 }, { "epoch": 2.15158924205379, "grad_norm": 0.3064732253551483, "learning_rate": 9.206545062840302e-07, "logits/chosen": -0.2666998505592346, "logits/rejected": -0.3201262652873993, "logps/chosen": -0.8927067518234253, "logps/rejected": -1.0634257793426514, "loss": 0.9575, "odds_ratio_loss": 0.6478100419044495, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.08927067369222641, "rewards/margins": 0.017071900889277458, "rewards/rejected": -0.10634257644414902, "sft_loss": 0.8927067518234253, "step": 1210 }, { "epoch": 2.1693709713269618, "grad_norm": 0.3534330725669861, "learning_rate": 8.848179579472285e-07, "logits/chosen": -0.3102249801158905, "logits/rejected": -0.2955402433872223, "logps/chosen": -0.9082851409912109, "logps/rejected": -0.9553133845329285, "loss": 0.9795, "odds_ratio_loss": 0.7121320962905884, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09082850813865662, "rewards/margins": 0.00470283068716526, "rewards/rejected": -0.09553134441375732, "sft_loss": 0.9082851409912109, "step": 1220 }, { "epoch": 2.1871527006001332, "grad_norm": 0.6444931626319885, "learning_rate": 8.495422574279403e-07, "logits/chosen": -0.3936762809753418, "logits/rejected": -0.42016810178756714, "logps/chosen": -0.8496967554092407, "logps/rejected": -1.0362155437469482, "loss": 0.9135, "odds_ratio_loss": 0.6377807855606079, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.08496967703104019, "rewards/margins": 0.018651869148015976, "rewards/rejected": -0.10362155735492706, "sft_loss": 0.8496967554092407, "step": 1230 }, { "epoch": 2.204934429873305, "grad_norm": 0.4805600941181183, "learning_rate": 8.148396536858063e-07, "logits/chosen": -0.3237206041812897, "logits/rejected": -0.3143185079097748, "logps/chosen": -0.9960983991622925, "logps/rejected": -1.1420572996139526, "loss": 1.0672, "odds_ratio_loss": 0.7113397121429443, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0996098443865776, "rewards/margins": 0.014595886692404747, "rewards/rejected": -0.1142057403922081, "sft_loss": 0.9960983991622925, "step": 1240 }, { "epoch": 2.222716159146477, "grad_norm": 0.676315188407898, "learning_rate": 7.807221966811815e-07, "logits/chosen": -0.29545170068740845, "logits/rejected": -0.31817343831062317, "logps/chosen": -0.9420124292373657, "logps/rejected": -1.0276824235916138, "loss": 1.0181, "odds_ratio_loss": 0.7609573006629944, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.0942012369632721, "rewards/margins": 0.008567007258534431, "rewards/rejected": -0.10276825726032257, "sft_loss": 0.9420124292373657, "step": 1250 }, { "epoch": 2.240497888419649, "grad_norm": 0.3943430781364441, "learning_rate": 7.47201733190962e-07, "logits/chosen": -0.3520922362804413, "logits/rejected": -0.3318483829498291, "logps/chosen": -0.8970060348510742, "logps/rejected": -0.9855879545211792, "loss": 0.9669, "odds_ratio_loss": 0.6993352174758911, "rewards/accuracies": 0.5, "rewards/chosen": -0.0897006094455719, "rewards/margins": 0.008858194574713707, "rewards/rejected": -0.09855880588293076, "sft_loss": 0.8970060348510742, "step": 1260 }, { "epoch": 2.258279617692821, "grad_norm": 0.5184921026229858, "learning_rate": 7.142899026949721e-07, "logits/chosen": -0.33211636543273926, "logits/rejected": -0.3313821256160736, "logps/chosen": -0.9101552963256836, "logps/rejected": -0.9938360452651978, "loss": 0.9798, "odds_ratio_loss": 0.6968866586685181, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.09101552516222, "rewards/margins": 0.00836807768791914, "rewards/rejected": -0.09938360750675201, "sft_loss": 0.9101552963256836, "step": 1270 }, { "epoch": 2.2760613469659923, "grad_norm": 1.8007909059524536, "learning_rate": 6.819981333343273e-07, "logits/chosen": -0.3704894185066223, "logits/rejected": -0.3426709771156311, "logps/chosen": -0.9317655563354492, "logps/rejected": -1.0302845239639282, "loss": 1.003, "odds_ratio_loss": 0.7128146886825562, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.09317655861377716, "rewards/margins": 0.009851890616118908, "rewards/rejected": -0.10302845388650894, "sft_loss": 0.9317655563354492, "step": 1280 }, { "epoch": 2.293843076239164, "grad_norm": 0.4554091989994049, "learning_rate": 6.503376379431839e-07, "logits/chosen": -0.2947995066642761, "logits/rejected": -0.279682457447052, "logps/chosen": -0.9925037622451782, "logps/rejected": -0.9870964884757996, "loss": 1.068, "odds_ratio_loss": 0.7550127506256104, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.09925039112567902, "rewards/margins": -0.0005407325807027519, "rewards/rejected": -0.09870964288711548, "sft_loss": 0.9925037622451782, "step": 1290 }, { "epoch": 2.311624805512336, "grad_norm": 1.7697697877883911, "learning_rate": 6.193194101552502e-07, "logits/chosen": -0.31604236364364624, "logits/rejected": -0.35974448919296265, "logps/chosen": -0.936480700969696, "logps/rejected": -1.0702247619628906, "loss": 1.002, "odds_ratio_loss": 0.655421793460846, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.09364806860685349, "rewards/margins": 0.013374416157603264, "rewards/rejected": -0.1070224866271019, "sft_loss": 0.936480700969696, "step": 1300 }, { "epoch": 2.329406534785508, "grad_norm": 0.6282922625541687, "learning_rate": 5.889542205864083e-07, "logits/chosen": -0.3355167806148529, "logits/rejected": -0.3377595543861389, "logps/chosen": -0.9515066146850586, "logps/rejected": -1.0681602954864502, "loss": 1.0205, "odds_ratio_loss": 0.6903635859489441, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0951506644487381, "rewards/margins": 0.01166537031531334, "rewards/rejected": -0.10681603848934174, "sft_loss": 0.9515066146850586, "step": 1310 }, { "epoch": 2.34718826405868, "grad_norm": 0.3864741027355194, "learning_rate": 5.592526130947862e-07, "logits/chosen": -0.31521058082580566, "logits/rejected": -0.3186022937297821, "logps/chosen": -0.9329264760017395, "logps/rejected": -1.0726194381713867, "loss": 1.0056, "odds_ratio_loss": 0.7264095544815063, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.09329266101121902, "rewards/margins": 0.013969297520816326, "rewards/rejected": -0.10726194083690643, "sft_loss": 0.9329264760017395, "step": 1320 }, { "epoch": 2.3649699933318518, "grad_norm": 0.8674092292785645, "learning_rate": 5.302249011195507e-07, "logits/chosen": -0.3717043995857239, "logits/rejected": -0.3457496166229248, "logps/chosen": -0.9407739639282227, "logps/rejected": -0.9671589136123657, "loss": 1.015, "odds_ratio_loss": 0.7421091198921204, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09407740086317062, "rewards/margins": 0.0026384838856756687, "rewards/rejected": -0.09671588987112045, "sft_loss": 0.9407739639282227, "step": 1330 }, { "epoch": 2.382751722605023, "grad_norm": 0.8201255798339844, "learning_rate": 5.018811640997307e-07, "logits/chosen": -0.3262820839881897, "logits/rejected": -0.28208276629447937, "logps/chosen": -0.9741110801696777, "logps/rejected": -1.1972548961639404, "loss": 1.0409, "odds_ratio_loss": 0.6679055690765381, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0974111258983612, "rewards/margins": 0.022314375266432762, "rewards/rejected": -0.11972548812627792, "sft_loss": 0.9741110801696777, "step": 1340 }, { "epoch": 2.400533451878195, "grad_norm": 0.3292596638202667, "learning_rate": 4.7423124397427105e-07, "logits/chosen": -0.37047189474105835, "logits/rejected": -0.31794866919517517, "logps/chosen": -0.9531441926956177, "logps/rejected": -1.015749216079712, "loss": 1.0256, "odds_ratio_loss": 0.7250458002090454, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09531442075967789, "rewards/margins": 0.006260508205741644, "rewards/rejected": -0.10157492011785507, "sft_loss": 0.9531441926956177, "step": 1350 }, { "epoch": 2.418315181151367, "grad_norm": 0.4776778817176819, "learning_rate": 4.472847417645787e-07, "logits/chosen": -0.2806258201599121, "logits/rejected": -0.3024401366710663, "logps/chosen": -0.9200853109359741, "logps/rejected": -1.114600419998169, "loss": 0.9877, "odds_ratio_loss": 0.6760807633399963, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.09200852364301682, "rewards/margins": 0.01945151947438717, "rewards/rejected": -0.11146005243062973, "sft_loss": 0.9200853109359741, "step": 1360 }, { "epoch": 2.436096910424539, "grad_norm": 0.3043542802333832, "learning_rate": 4.210510142406993e-07, "logits/chosen": -0.32727354764938354, "logits/rejected": -0.3754233717918396, "logps/chosen": -0.9101996421813965, "logps/rejected": -1.0942609310150146, "loss": 0.977, "odds_ratio_loss": 0.6675896644592285, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.09101996570825577, "rewards/margins": 0.018406113609671593, "rewards/rejected": -0.10942608118057251, "sft_loss": 0.9101996421813965, "step": 1370 }, { "epoch": 2.4538786396977104, "grad_norm": 0.4151700437068939, "learning_rate": 3.9553917067232966e-07, "logits/chosen": -0.33969706296920776, "logits/rejected": -0.36881956458091736, "logps/chosen": -0.9399350881576538, "logps/rejected": -1.071777105331421, "loss": 1.0133, "odds_ratio_loss": 0.7333552241325378, "rewards/accuracies": 0.5, "rewards/chosen": -0.09399349987506866, "rewards/margins": 0.013184216804802418, "rewards/rejected": -0.10717771202325821, "sft_loss": 0.9399350881576538, "step": 1380 }, { "epoch": 2.4716603689708823, "grad_norm": 0.4568045437335968, "learning_rate": 3.707580696657509e-07, "logits/chosen": -0.2799975275993347, "logits/rejected": -0.30841827392578125, "logps/chosen": -0.9116710424423218, "logps/rejected": -0.9513956308364868, "loss": 0.9844, "odds_ratio_loss": 0.7269908785820007, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.09116710722446442, "rewards/margins": 0.003972449339926243, "rewards/rejected": -0.09513955563306808, "sft_loss": 0.9116710424423218, "step": 1390 }, { "epoch": 2.489442098244054, "grad_norm": 0.425468772649765, "learning_rate": 3.4671631608781815e-07, "logits/chosen": -0.3139536380767822, "logits/rejected": -0.32965949177742004, "logps/chosen": -0.9703924059867859, "logps/rejected": -1.079158067703247, "loss": 1.0439, "odds_ratio_loss": 0.7353022694587708, "rewards/accuracies": 0.46875, "rewards/chosen": -0.09703925251960754, "rewards/margins": 0.010876556858420372, "rewards/rejected": -0.10791579633951187, "sft_loss": 0.9703924059867859, "step": 1400 }, { "epoch": 2.507223827517226, "grad_norm": 0.6458228826522827, "learning_rate": 3.234222580780405e-07, "logits/chosen": -0.3632466197013855, "logits/rejected": -0.3340745270252228, "logps/chosen": -0.942143440246582, "logps/rejected": -0.9809234738349915, "loss": 1.0153, "odds_ratio_loss": 0.7311049103736877, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.09421434998512268, "rewards/margins": 0.0038779997266829014, "rewards/rejected": -0.09809235483407974, "sft_loss": 0.942143440246582, "step": 1410 }, { "epoch": 2.525005556790398, "grad_norm": 0.7571399211883545, "learning_rate": 3.0088398414982375e-07, "logits/chosen": -0.40216293931007385, "logits/rejected": -0.3554636836051941, "logps/chosen": -0.9506216049194336, "logps/rejected": -1.1040918827056885, "loss": 1.0238, "odds_ratio_loss": 0.7313109636306763, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.09506215900182724, "rewards/margins": 0.015347021631896496, "rewards/rejected": -0.11040918529033661, "sft_loss": 0.9506216049194336, "step": 1420 }, { "epoch": 2.54278728606357, "grad_norm": 0.41928017139434814, "learning_rate": 2.7910932038184487e-07, "logits/chosen": -0.38035768270492554, "logits/rejected": -0.43410953879356384, "logps/chosen": -0.9504894018173218, "logps/rejected": -1.033362627029419, "loss": 1.0219, "odds_ratio_loss": 0.7138369083404541, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0950489416718483, "rewards/margins": 0.008287337608635426, "rewards/rejected": -0.10333627462387085, "sft_loss": 0.9504894018173218, "step": 1430 }, { "epoch": 2.5605690153367417, "grad_norm": 0.6664097905158997, "learning_rate": 2.5810582770057325e-07, "logits/chosen": -0.3502410054206848, "logits/rejected": -0.31972765922546387, "logps/chosen": -0.912204384803772, "logps/rejected": -1.0270380973815918, "loss": 0.9827, "odds_ratio_loss": 0.7054314613342285, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.09122045338153839, "rewards/margins": 0.011483349837362766, "rewards/rejected": -0.10270379483699799, "sft_loss": 0.912204384803772, "step": 1440 }, { "epoch": 2.578350744609913, "grad_norm": 0.5214207768440247, "learning_rate": 2.3788079925484402e-07, "logits/chosen": -0.2704157829284668, "logits/rejected": -0.30042511224746704, "logps/chosen": -0.980503261089325, "logps/rejected": -1.0476016998291016, "loss": 1.054, "odds_ratio_loss": 0.7349393963813782, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0980503261089325, "rewards/margins": 0.006709852255880833, "rewards/rejected": -0.10476018488407135, "sft_loss": 0.980503261089325, "step": 1450 }, { "epoch": 2.596132473883085, "grad_norm": 0.3559114336967468, "learning_rate": 2.1844125788342661e-07, "logits/chosen": -0.3745304048061371, "logits/rejected": -0.3963877558708191, "logps/chosen": -0.8978282809257507, "logps/rejected": -1.1463072299957275, "loss": 0.966, "odds_ratio_loss": 0.6815627813339233, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.08978282660245895, "rewards/margins": 0.02484789676964283, "rewards/rejected": -0.11463073641061783, "sft_loss": 0.8978282809257507, "step": 1460 }, { "epoch": 2.613914203156257, "grad_norm": 0.4206191599369049, "learning_rate": 1.9979395367644428e-07, "logits/chosen": -0.3081280589103699, "logits/rejected": -0.2860923111438751, "logps/chosen": -0.8848710060119629, "logps/rejected": -1.030397653579712, "loss": 0.9502, "odds_ratio_loss": 0.6536397337913513, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08848710358142853, "rewards/margins": 0.014552672393620014, "rewards/rejected": -0.10303977876901627, "sft_loss": 0.8848710060119629, "step": 1470 }, { "epoch": 2.631695932429429, "grad_norm": 0.6648186445236206, "learning_rate": 1.81945361631512e-07, "logits/chosen": -0.3387419283390045, "logits/rejected": -0.2922862768173218, "logps/chosen": -0.927925705909729, "logps/rejected": -0.9954597353935242, "loss": 1.0003, "odds_ratio_loss": 0.7234224081039429, "rewards/accuracies": 0.5, "rewards/chosen": -0.0927925705909729, "rewards/margins": 0.006753397174179554, "rewards/rejected": -0.09954597055912018, "sft_loss": 0.927925705909729, "step": 1480 }, { "epoch": 2.6494776617026004, "grad_norm": 0.5596628189086914, "learning_rate": 1.6490167940538343e-07, "logits/chosen": -0.3137277066707611, "logits/rejected": -0.3255840241909027, "logps/chosen": -0.9538249969482422, "logps/rejected": -1.0488290786743164, "loss": 1.0255, "odds_ratio_loss": 0.7165058851242065, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.09538250416517258, "rewards/margins": 0.009500409476459026, "rewards/rejected": -0.10488291084766388, "sft_loss": 0.9538249969482422, "step": 1490 }, { "epoch": 2.6672593909757722, "grad_norm": 0.4116540849208832, "learning_rate": 1.4866882516191339e-07, "logits/chosen": -0.31974849104881287, "logits/rejected": -0.27599194645881653, "logps/chosen": -0.9288945198059082, "logps/rejected": -1.0830228328704834, "loss": 0.9998, "odds_ratio_loss": 0.7095054984092712, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0928894504904747, "rewards/margins": 0.015412822365760803, "rewards/rejected": -0.1083022803068161, "sft_loss": 0.9288945198059082, "step": 1500 }, { "epoch": 2.6672593909757722, "eval_logits/chosen": -0.3320940136909485, "eval_logits/rejected": -0.29884636402130127, "eval_logps/chosen": -0.9399133324623108, "eval_logps/rejected": -1.080655574798584, "eval_loss": 1.0073015689849854, "eval_odds_ratio_loss": 0.6738813519477844, "eval_rewards/accuracies": 0.515999972820282, "eval_rewards/chosen": -0.09399133920669556, "eval_rewards/margins": 0.01407422125339508, "eval_rewards/rejected": -0.10806556046009064, "eval_runtime": 185.9317, "eval_samples_per_second": 5.378, "eval_sft_loss": 0.9399133324623108, "eval_steps_per_second": 2.689, "step": 1500 }, { "epoch": 2.685041120248944, "grad_norm": 0.6644484996795654, "learning_rate": 1.3325243551706057e-07, "logits/chosen": -0.3859871029853821, "logits/rejected": -0.36218634247779846, "logps/chosen": -0.9241644144058228, "logps/rejected": -1.1543761491775513, "loss": 0.9915, "odds_ratio_loss": 0.6730437874794006, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0924164205789566, "rewards/margins": 0.023021187633275986, "rewards/rejected": -0.11543761193752289, "sft_loss": 0.9241644144058228, "step": 1510 }, { "epoch": 2.702822849522116, "grad_norm": 0.6883984208106995, "learning_rate": 1.1865786358165737e-07, "logits/chosen": -0.3818913400173187, "logits/rejected": -0.27337896823883057, "logps/chosen": -0.9033206701278687, "logps/rejected": -1.0108495950698853, "loss": 0.9727, "odds_ratio_loss": 0.6942235827445984, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.09033207595348358, "rewards/margins": 0.010752884671092033, "rewards/rejected": -0.10108494758605957, "sft_loss": 0.9033206701278687, "step": 1520 }, { "epoch": 2.720604578795288, "grad_norm": 1.4156850576400757, "learning_rate": 1.0489017710262311e-07, "logits/chosen": -0.39080482721328735, "logits/rejected": -0.3747466206550598, "logps/chosen": -1.0374637842178345, "logps/rejected": -1.1824612617492676, "loss": 1.1147, "odds_ratio_loss": 0.7718855142593384, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.10374637693166733, "rewards/margins": 0.014499744400382042, "rewards/rejected": -0.11824611574411392, "sft_loss": 1.0374637842178345, "step": 1530 }, { "epoch": 2.73838630806846, "grad_norm": 0.4921424984931946, "learning_rate": 9.195415670326446e-08, "logits/chosen": -0.326080858707428, "logits/rejected": -0.321908175945282, "logps/chosen": -0.9485294222831726, "logps/rejected": -1.082155704498291, "loss": 1.0195, "odds_ratio_loss": 0.7096532583236694, "rewards/accuracies": 0.46875, "rewards/chosen": -0.09485294669866562, "rewards/margins": 0.013362633064389229, "rewards/rejected": -0.1082155704498291, "sft_loss": 0.9485294222831726, "step": 1540 }, { "epoch": 2.7561680373416317, "grad_norm": 0.686665415763855, "learning_rate": 7.985429422327384e-08, "logits/chosen": -0.35336002707481384, "logits/rejected": -0.3244116008281708, "logps/chosen": -0.9436219930648804, "logps/rejected": -0.975549578666687, "loss": 1.0188, "odds_ratio_loss": 0.7518836855888367, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.09436219930648804, "rewards/margins": 0.0031927600502967834, "rewards/rejected": -0.09755495190620422, "sft_loss": 0.9436219930648804, "step": 1550 }, { "epoch": 2.773949766614803, "grad_norm": 0.30419808626174927, "learning_rate": 6.859479115900818e-08, "logits/chosen": -0.31769606471061707, "logits/rejected": -0.31846362352371216, "logps/chosen": -0.9142364263534546, "logps/rejected": -1.0324945449829102, "loss": 0.9834, "odds_ratio_loss": 0.6916245222091675, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0914236530661583, "rewards/margins": 0.011825799010694027, "rewards/rejected": -0.1032494530081749, "sft_loss": 0.9142364263534546, "step": 1560 }, { "epoch": 2.791731495887975, "grad_norm": 1.5349509716033936, "learning_rate": 5.817955720457902e-08, "logits/chosen": -0.33953648805618286, "logits/rejected": -0.297925740480423, "logps/chosen": -0.9395607709884644, "logps/rejected": -1.0038203001022339, "loss": 1.0133, "odds_ratio_loss": 0.7371524572372437, "rewards/accuracies": 0.5, "rewards/chosen": -0.09395607560873032, "rewards/margins": 0.006425946019589901, "rewards/rejected": -0.10038203001022339, "sft_loss": 0.9395607709884644, "step": 1570 }, { "epoch": 2.809513225161147, "grad_norm": 0.36313971877098083, "learning_rate": 4.861220889427199e-08, "logits/chosen": -0.35685330629348755, "logits/rejected": -0.35064131021499634, "logps/chosen": -0.9390374422073364, "logps/rejected": -1.019951581954956, "loss": 1.012, "odds_ratio_loss": 0.7297292351722717, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09390375763177872, "rewards/margins": 0.008091414347290993, "rewards/rejected": -0.10199517011642456, "sft_loss": 0.9390374422073364, "step": 1580 }, { "epoch": 2.827294954434319, "grad_norm": 0.26599186658859253, "learning_rate": 3.9896068346758074e-08, "logits/chosen": -0.39413073658943176, "logits/rejected": -0.38061630725860596, "logps/chosen": -0.948017954826355, "logps/rejected": -1.034618616104126, "loss": 1.0172, "odds_ratio_loss": 0.6922141313552856, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.09480179846286774, "rewards/margins": 0.008660053834319115, "rewards/rejected": -0.1034618467092514, "sft_loss": 0.948017954826355, "step": 1590 }, { "epoch": 2.8450766837074903, "grad_norm": 0.9985164403915405, "learning_rate": 3.203416211153832e-08, "logits/chosen": -0.3526967763900757, "logits/rejected": -0.25582748651504517, "logps/chosen": -0.9348894357681274, "logps/rejected": -1.0583240985870361, "loss": 1.0071, "odds_ratio_loss": 0.7220235466957092, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.09348894655704498, "rewards/margins": 0.01234346441924572, "rewards/rejected": -0.10583242028951645, "sft_loss": 0.9348894357681274, "step": 1600 }, { "epoch": 2.8628584129806622, "grad_norm": 0.4895220994949341, "learning_rate": 2.5029220118019393e-08, "logits/chosen": -0.3774477243423462, "logits/rejected": -0.34018778800964355, "logps/chosen": -0.9445845484733582, "logps/rejected": -0.9962360262870789, "loss": 1.0176, "odds_ratio_loss": 0.7305063009262085, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.0944584533572197, "rewards/margins": 0.0051651508547365665, "rewards/rejected": -0.09962360560894012, "sft_loss": 0.9445845484733582, "step": 1610 }, { "epoch": 2.880640142253834, "grad_norm": 0.39454635977745056, "learning_rate": 1.8883674727586122e-08, "logits/chosen": -0.3457157611846924, "logits/rejected": -0.33168259263038635, "logps/chosen": -0.8693550825119019, "logps/rejected": -1.09225332736969, "loss": 0.9328, "odds_ratio_loss": 0.6342187523841858, "rewards/accuracies": 0.625, "rewards/chosen": -0.08693551272153854, "rewards/margins": 0.022289803251624107, "rewards/rejected": -0.1092253178358078, "sft_loss": 0.8693550825119019, "step": 1620 }, { "epoch": 2.898421871527006, "grad_norm": 0.29763612151145935, "learning_rate": 1.3599659889000639e-08, "logits/chosen": -0.26188623905181885, "logits/rejected": -0.27545788884162903, "logps/chosen": -0.9086050987243652, "logps/rejected": -0.9591732025146484, "loss": 0.9816, "odds_ratio_loss": 0.7299038171768188, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.0908605083823204, "rewards/margins": 0.005056814290583134, "rewards/rejected": -0.09591732919216156, "sft_loss": 0.9086050987243652, "step": 1630 }, { "epoch": 2.916203600800178, "grad_norm": 3.087757110595703, "learning_rate": 9.179010397421528e-09, "logits/chosen": -0.29684725403785706, "logits/rejected": -0.26544058322906494, "logps/chosen": -1.0444749593734741, "logps/rejected": -1.1464588642120361, "loss": 1.1156, "odds_ratio_loss": 0.7117230892181396, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.10444750636816025, "rewards/margins": 0.010198366828262806, "rewards/rejected": -0.11464587599039078, "sft_loss": 1.0444749593734741, "step": 1640 }, { "epoch": 2.93398533007335, "grad_norm": 0.7389609813690186, "learning_rate": 5.623261257296509e-09, "logits/chosen": -0.33190470933914185, "logits/rejected": -0.2921023964881897, "logps/chosen": -0.8605577349662781, "logps/rejected": -0.9687950015068054, "loss": 0.9291, "odds_ratio_loss": 0.6854843497276306, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.08605578541755676, "rewards/margins": 0.010823719203472137, "rewards/rejected": -0.0968794971704483, "sft_loss": 0.8605577349662781, "step": 1650 }, { "epoch": 2.9517670593465217, "grad_norm": 0.49204200506210327, "learning_rate": 2.933647149357122e-09, "logits/chosen": -0.3684224784374237, "logits/rejected": -0.3360394537448883, "logps/chosen": -0.9260095357894897, "logps/rejected": -1.059597373008728, "loss": 0.9945, "odds_ratio_loss": 0.6844674348831177, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.09260095655918121, "rewards/margins": 0.013358776457607746, "rewards/rejected": -0.10595973581075668, "sft_loss": 0.9260095357894897, "step": 1660 }, { "epoch": 2.969548788619693, "grad_norm": 0.4070994257926941, "learning_rate": 1.1111020018930717e-09, "logits/chosen": -0.2591468393802643, "logits/rejected": -0.31176748871803284, "logps/chosen": -0.9283815622329712, "logps/rejected": -0.9903603792190552, "loss": 1.0009, "odds_ratio_loss": 0.7251425981521606, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.09283814579248428, "rewards/margins": 0.006197893992066383, "rewards/rejected": -0.09903603792190552, "sft_loss": 0.9283815622329712, "step": 1670 }, { "epoch": 2.987330517892865, "grad_norm": 0.31971636414527893, "learning_rate": 1.5625866646051813e-10, "logits/chosen": -0.3598848283290863, "logits/rejected": -0.3403863310813904, "logps/chosen": -0.9049466252326965, "logps/rejected": -1.057483434677124, "loss": 0.9695, "odds_ratio_loss": 0.6452642679214478, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.09049466997385025, "rewards/margins": 0.015253685414791107, "rewards/rejected": -0.10574835538864136, "sft_loss": 0.9049466252326965, "step": 1680 }, { "epoch": 2.997999555456768, "step": 1686, "total_flos": 1.8817568285770383e+18, "train_loss": 1.0353579054523618, "train_runtime": 16950.0138, "train_samples_per_second": 1.593, "train_steps_per_second": 0.099 } ], "logging_steps": 10, "max_steps": 1686, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1.8817568285770383e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }