{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 400, "global_step": 17412, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00017229496898690558, "grad_norm": 2.0902597904205322, "learning_rate": 1.148105625717566e-10, "logits/chosen": -2.8080272674560547, "logits/rejected": -2.785019874572754, "logps/chosen": -44.8405876159668, "logps/rejected": -39.36625671386719, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0017229496898690559, "grad_norm": 2.0985445976257324, "learning_rate": 1.148105625717566e-09, "logits/chosen": -2.9043519496917725, "logits/rejected": -2.881565570831299, "logps/chosen": -51.813934326171875, "logps/rejected": -49.24929428100586, "loss": 0.6931, "rewards/accuracies": 0.4652777910232544, "rewards/chosen": -5.939431503065862e-05, "rewards/margins": 6.864402530482039e-05, "rewards/rejected": -0.00012803831486962736, "step": 10 }, { "epoch": 0.0034458993797381117, "grad_norm": 2.1032633781433105, "learning_rate": 2.296211251435132e-09, "logits/chosen": -2.9463086128234863, "logits/rejected": -2.941572427749634, "logps/chosen": -53.832359313964844, "logps/rejected": -52.88805389404297, "loss": 0.6931, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0001046212637447752, "rewards/margins": 2.1037729311501607e-05, "rewards/rejected": -0.000125658989418298, "step": 20 }, { "epoch": 0.005168849069607168, "grad_norm": 2.233656406402588, "learning_rate": 3.4443168771526976e-09, "logits/chosen": -2.9105770587921143, "logits/rejected": -2.8925628662109375, "logps/chosen": -57.676544189453125, "logps/rejected": -57.83379364013672, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 2.8576847398653626e-05, "rewards/margins": 5.704880550183589e-06, "rewards/rejected": 2.2871969122206792e-05, "step": 30 }, { "epoch": 0.006891798759476223, "grad_norm": 1.8444969654083252, "learning_rate": 4.592422502870264e-09, "logits/chosen": -2.926358938217163, "logits/rejected": -2.9026577472686768, "logps/chosen": -56.067359924316406, "logps/rejected": -50.16572189331055, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 4.615211582859047e-05, "rewards/margins": 0.00012403741129674017, "rewards/rejected": -7.788527000229806e-05, "step": 40 }, { "epoch": 0.00861474844934528, "grad_norm": 1.9836063385009766, "learning_rate": 5.74052812858783e-09, "logits/chosen": -2.9309310913085938, "logits/rejected": -2.9199869632720947, "logps/chosen": -53.15720748901367, "logps/rejected": -50.464534759521484, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.00011061689292546362, "rewards/margins": 5.0750844820868224e-05, "rewards/rejected": 5.986605538055301e-05, "step": 50 }, { "epoch": 0.010337698139214336, "grad_norm": 2.349973678588867, "learning_rate": 6.888633754305395e-09, "logits/chosen": -2.9500718116760254, "logits/rejected": -2.9269402027130127, "logps/chosen": -58.4092903137207, "logps/rejected": -53.900306701660156, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0001145638307207264, "rewards/margins": -0.00019984866958111525, "rewards/rejected": 8.528483886038885e-05, "step": 60 }, { "epoch": 0.012060647829083391, "grad_norm": 2.0366384983062744, "learning_rate": 8.036739380022962e-09, "logits/chosen": -2.906822681427002, "logits/rejected": -2.894425868988037, "logps/chosen": -54.81329345703125, "logps/rejected": -52.41325759887695, "loss": 0.693, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.00014264558558352292, "rewards/margins": 0.00025373551761731505, "rewards/rejected": -0.00011108988110208884, "step": 70 }, { "epoch": 0.013783597518952447, "grad_norm": 2.247792959213257, "learning_rate": 9.184845005740529e-09, "logits/chosen": -2.9640133380889893, "logits/rejected": -2.9426205158233643, "logps/chosen": -60.194786071777344, "logps/rejected": -53.240272521972656, "loss": 0.6932, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 3.302104232716374e-05, "rewards/margins": -5.603021782008e-05, "rewards/rejected": 8.905124559532851e-05, "step": 80 }, { "epoch": 0.015506547208821502, "grad_norm": 2.1423492431640625, "learning_rate": 1.0332950631458094e-08, "logits/chosen": -2.868622303009033, "logits/rejected": -2.861255407333374, "logps/chosen": -54.9310417175293, "logps/rejected": -51.796836853027344, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 1.1550308954610955e-05, "rewards/margins": -1.8355758584220894e-05, "rewards/rejected": 2.9906092095188797e-05, "step": 90 }, { "epoch": 0.01722949689869056, "grad_norm": 2.204350471496582, "learning_rate": 1.148105625717566e-08, "logits/chosen": -2.968977451324463, "logits/rejected": -2.9209835529327393, "logps/chosen": -57.40143966674805, "logps/rejected": -48.77809524536133, "loss": 0.693, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 8.09053672128357e-05, "rewards/margins": 0.00019563671958167106, "rewards/rejected": -0.00011473130143713206, "step": 100 }, { "epoch": 0.018952446588559616, "grad_norm": 2.2462244033813477, "learning_rate": 1.2629161882893224e-08, "logits/chosen": -2.9463882446289062, "logits/rejected": -2.927314281463623, "logps/chosen": -56.65543746948242, "logps/rejected": -51.98151779174805, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00016289362974930555, "rewards/margins": 0.0002862837864086032, "rewards/rejected": -0.00012339015665929765, "step": 110 }, { "epoch": 0.02067539627842867, "grad_norm": 2.3142030239105225, "learning_rate": 1.377726750861079e-08, "logits/chosen": -2.885002613067627, "logits/rejected": -2.8734192848205566, "logps/chosen": -53.67094802856445, "logps/rejected": -54.91790771484375, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0002903799177147448, "rewards/margins": 0.0002786249096971005, "rewards/rejected": 1.1754990737244952e-05, "step": 120 }, { "epoch": 0.022398345968297727, "grad_norm": 1.963403582572937, "learning_rate": 1.4925373134328357e-08, "logits/chosen": -2.932080030441284, "logits/rejected": -2.92702054977417, "logps/chosen": -56.63043212890625, "logps/rejected": -53.09601974487305, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": -6.796454545110464e-05, "rewards/margins": 2.1783866941404995e-06, "rewards/rejected": -7.014292350504547e-05, "step": 130 }, { "epoch": 0.024121295658166782, "grad_norm": 2.389951467514038, "learning_rate": 1.6073478760045924e-08, "logits/chosen": -2.9416565895080566, "logits/rejected": -2.931546449661255, "logps/chosen": -54.46699905395508, "logps/rejected": -52.611183166503906, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 7.823264604667202e-05, "rewards/margins": 0.00021567563817370683, "rewards/rejected": -0.00013744299940299243, "step": 140 }, { "epoch": 0.025844245348035838, "grad_norm": 2.066187620162964, "learning_rate": 1.722158438576349e-08, "logits/chosen": -2.8867287635803223, "logits/rejected": -2.8737969398498535, "logps/chosen": -53.074363708496094, "logps/rejected": -51.20355224609375, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00011352014553267509, "rewards/margins": -0.0001107621137634851, "rewards/rejected": -2.7580172172747552e-06, "step": 150 }, { "epoch": 0.027567195037904894, "grad_norm": 1.8884066343307495, "learning_rate": 1.8369690011481057e-08, "logits/chosen": -2.9306740760803223, "logits/rejected": -2.916821002960205, "logps/chosen": -54.592872619628906, "logps/rejected": -54.249855041503906, "loss": 0.6932, "rewards/accuracies": 0.46875, "rewards/chosen": 3.338105670991354e-05, "rewards/margins": -9.989602403948084e-05, "rewards/rejected": 0.00013327706255950034, "step": 160 }, { "epoch": 0.02929014472777395, "grad_norm": 2.0824570655822754, "learning_rate": 1.9517795637198624e-08, "logits/chosen": -2.915168285369873, "logits/rejected": -2.900991439819336, "logps/chosen": -56.46404266357422, "logps/rejected": -50.75846481323242, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -9.851455979514867e-05, "rewards/margins": -1.5921646991046146e-05, "rewards/rejected": -8.259294554591179e-05, "step": 170 }, { "epoch": 0.031013094417643005, "grad_norm": 2.266218423843384, "learning_rate": 2.0665901262916187e-08, "logits/chosen": -2.914151906967163, "logits/rejected": -2.898178815841675, "logps/chosen": -57.04075241088867, "logps/rejected": -52.43286895751953, "loss": 0.6931, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -6.253592346183723e-06, "rewards/margins": 0.00015795855142641813, "rewards/rejected": -0.00016421216423623264, "step": 180 }, { "epoch": 0.03273604410751206, "grad_norm": 2.5385985374450684, "learning_rate": 2.1814006888633754e-08, "logits/chosen": -2.9488868713378906, "logits/rejected": -2.9160306453704834, "logps/chosen": -59.6348876953125, "logps/rejected": -51.66065216064453, "loss": 0.6931, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00011362945951987058, "rewards/margins": 2.3863278784119757e-06, "rewards/rejected": -0.00011601579899434, "step": 190 }, { "epoch": 0.03445899379738112, "grad_norm": 2.239220380783081, "learning_rate": 2.296211251435132e-08, "logits/chosen": -2.9068408012390137, "logits/rejected": -2.897510051727295, "logps/chosen": -54.77439498901367, "logps/rejected": -53.70017623901367, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": -0.00012153792340541258, "rewards/margins": 2.215528002125211e-05, "rewards/rejected": -0.00014369319251272827, "step": 200 }, { "epoch": 0.03618194348725017, "grad_norm": 2.1656925678253174, "learning_rate": 2.4110218140068887e-08, "logits/chosen": -2.864589214324951, "logits/rejected": -2.8619420528411865, "logps/chosen": -54.096771240234375, "logps/rejected": -56.413795471191406, "loss": 0.6932, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -7.368248770944774e-05, "rewards/margins": -8.52059674798511e-05, "rewards/rejected": 1.1523479770403355e-05, "step": 210 }, { "epoch": 0.03790489317711923, "grad_norm": 2.0480666160583496, "learning_rate": 2.5258323765786448e-08, "logits/chosen": -2.904480457305908, "logits/rejected": -2.8816940784454346, "logps/chosen": -53.449119567871094, "logps/rejected": -50.03138732910156, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0002139102143701166, "rewards/margins": -3.339463728480041e-05, "rewards/rejected": -0.00018051560618914664, "step": 220 }, { "epoch": 0.03962784286698828, "grad_norm": 2.2359554767608643, "learning_rate": 2.6406429391504014e-08, "logits/chosen": -2.9000978469848633, "logits/rejected": -2.8889379501342773, "logps/chosen": -49.822628021240234, "logps/rejected": -49.380130767822266, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0002377505588810891, "rewards/margins": 8.301097113871947e-05, "rewards/rejected": -0.0003207615518476814, "step": 230 }, { "epoch": 0.04135079255685734, "grad_norm": 1.985116958618164, "learning_rate": 2.755453501722158e-08, "logits/chosen": -2.8757596015930176, "logits/rejected": -2.8462939262390137, "logps/chosen": -56.7255859375, "logps/rejected": -51.6666145324707, "loss": 0.6931, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -9.991443948820233e-05, "rewards/margins": 0.00014959466352593154, "rewards/rejected": -0.00024950908846221864, "step": 240 }, { "epoch": 0.043073742246726394, "grad_norm": 2.0716514587402344, "learning_rate": 2.8702640642939148e-08, "logits/chosen": -2.948909044265747, "logits/rejected": -2.931750535964966, "logps/chosen": -53.45426559448242, "logps/rejected": -50.105186462402344, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.00011698435992002487, "rewards/margins": 6.186254177009687e-05, "rewards/rejected": -0.00017884690896607935, "step": 250 }, { "epoch": 0.044796691936595454, "grad_norm": 1.9955936670303345, "learning_rate": 2.9850746268656714e-08, "logits/chosen": -2.9301047325134277, "logits/rejected": -2.922619342803955, "logps/chosen": -55.73781204223633, "logps/rejected": -55.22661209106445, "loss": 0.6929, "rewards/accuracies": 0.625, "rewards/chosen": -3.683334944071248e-05, "rewards/margins": 0.0005753434961661696, "rewards/rejected": -0.0006121768383309245, "step": 260 }, { "epoch": 0.046519641626464506, "grad_norm": 2.1792263984680176, "learning_rate": 3.099885189437428e-08, "logits/chosen": -2.8962621688842773, "logits/rejected": -2.888561487197876, "logps/chosen": -53.812705993652344, "logps/rejected": -53.4767951965332, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.00036246198578737676, "rewards/margins": 0.00020978061365894973, "rewards/rejected": -0.0005722425994463265, "step": 270 }, { "epoch": 0.048242591316333565, "grad_norm": 2.0500059127807617, "learning_rate": 3.214695752009185e-08, "logits/chosen": -2.9554762840270996, "logits/rejected": -2.9338696002960205, "logps/chosen": -58.78718948364258, "logps/rejected": -52.61870193481445, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00021337093494366854, "rewards/margins": 0.0003163707733619958, "rewards/rejected": -0.00052974175196141, "step": 280 }, { "epoch": 0.04996554100620262, "grad_norm": 1.9906699657440186, "learning_rate": 3.3295063145809414e-08, "logits/chosen": -2.9041569232940674, "logits/rejected": -2.895073175430298, "logps/chosen": -56.95964431762695, "logps/rejected": -53.340545654296875, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00025733906659297645, "rewards/margins": 0.00026806717505678535, "rewards/rejected": -0.0005254062125459313, "step": 290 }, { "epoch": 0.051688490696071676, "grad_norm": 2.0617942810058594, "learning_rate": 3.444316877152698e-08, "logits/chosen": -2.8528239727020264, "logits/rejected": -2.8550000190734863, "logps/chosen": -55.023704528808594, "logps/rejected": -53.3245735168457, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.00032503431430086493, "rewards/margins": 0.00029767598607577384, "rewards/rejected": -0.0006227103294804692, "step": 300 }, { "epoch": 0.05341144038594073, "grad_norm": 2.061204433441162, "learning_rate": 3.559127439724455e-08, "logits/chosen": -2.901181221008301, "logits/rejected": -2.901825428009033, "logps/chosen": -54.8587532043457, "logps/rejected": -52.582496643066406, "loss": 0.6929, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0002555266546551138, "rewards/margins": 0.0005454671336337924, "rewards/rejected": -0.0008009938756003976, "step": 310 }, { "epoch": 0.05513439007580979, "grad_norm": 2.3455982208251953, "learning_rate": 3.6739380022962115e-08, "logits/chosen": -2.887146472930908, "logits/rejected": -2.8691606521606445, "logps/chosen": -56.60396194458008, "logps/rejected": -48.95844650268555, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0003960584872402251, "rewards/margins": 0.0003540136094670743, "rewards/rejected": -0.0007500721258111298, "step": 320 }, { "epoch": 0.05685733976567884, "grad_norm": 2.033954381942749, "learning_rate": 3.788748564867968e-08, "logits/chosen": -2.9154181480407715, "logits/rejected": -2.8979363441467285, "logps/chosen": -56.212135314941406, "logps/rejected": -51.129173278808594, "loss": 0.6927, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.00016808233340270817, "rewards/margins": 0.0009294062620028853, "rewards/rejected": -0.001097488566301763, "step": 330 }, { "epoch": 0.0585802894555479, "grad_norm": 2.0031909942626953, "learning_rate": 3.903559127439725e-08, "logits/chosen": -2.8824925422668457, "logits/rejected": -2.869655132293701, "logps/chosen": -52.89168167114258, "logps/rejected": -51.91801834106445, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0004165670252405107, "rewards/margins": 0.0003390835190657526, "rewards/rejected": -0.0007556505734100938, "step": 340 }, { "epoch": 0.06030323914541695, "grad_norm": 2.2410380840301514, "learning_rate": 4.018369690011481e-08, "logits/chosen": -2.855290412902832, "logits/rejected": -2.822758436203003, "logps/chosen": -57.14137649536133, "logps/rejected": -53.933021545410156, "loss": 0.6928, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0003419906715862453, "rewards/margins": 0.0006515433778986335, "rewards/rejected": -0.0009935342241078615, "step": 350 }, { "epoch": 0.06202618883528601, "grad_norm": 2.1879637241363525, "learning_rate": 4.1331802525832375e-08, "logits/chosen": -2.9514498710632324, "logits/rejected": -2.933030605316162, "logps/chosen": -56.0881233215332, "logps/rejected": -49.44280242919922, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0008741362253203988, "rewards/margins": 0.00042976863915100694, "rewards/rejected": -0.0013039048062637448, "step": 360 }, { "epoch": 0.06374913852515507, "grad_norm": 2.0574898719787598, "learning_rate": 4.247990815154994e-08, "logits/chosen": -2.9376654624938965, "logits/rejected": -2.9129061698913574, "logps/chosen": -54.38572311401367, "logps/rejected": -51.12152099609375, "loss": 0.6925, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.000479917653137818, "rewards/margins": 0.0012370418990030885, "rewards/rejected": -0.001716959523037076, "step": 370 }, { "epoch": 0.06547208821502412, "grad_norm": 1.9364941120147705, "learning_rate": 4.362801377726751e-08, "logits/chosen": -2.9991958141326904, "logits/rejected": -2.979586601257324, "logps/chosen": -55.15120315551758, "logps/rejected": -51.248802185058594, "loss": 0.6926, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0007126034470275044, "rewards/margins": 0.001187127665616572, "rewards/rejected": -0.0018997311126440763, "step": 380 }, { "epoch": 0.06719503790489317, "grad_norm": 2.235966920852661, "learning_rate": 4.4776119402985075e-08, "logits/chosen": -2.931478977203369, "logits/rejected": -2.916945219039917, "logps/chosen": -57.42863845825195, "logps/rejected": -54.23737716674805, "loss": 0.6927, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0010502212680876255, "rewards/margins": 0.0009564169449731708, "rewards/rejected": -0.0020066378638148308, "step": 390 }, { "epoch": 0.06891798759476224, "grad_norm": 1.7472114562988281, "learning_rate": 4.592422502870264e-08, "logits/chosen": -2.918935537338257, "logits/rejected": -2.9072728157043457, "logps/chosen": -54.609474182128906, "logps/rejected": -51.67049026489258, "loss": 0.6924, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0010759534779936075, "rewards/margins": 0.0014685506466776133, "rewards/rejected": -0.002544503891840577, "step": 400 }, { "epoch": 0.06891798759476224, "eval_logits/chosen": -2.972291946411133, "eval_logits/rejected": -2.968651294708252, "eval_logps/chosen": -58.909366607666016, "eval_logps/rejected": -62.675540924072266, "eval_loss": 0.692988395690918, "eval_rewards/accuracies": 0.5390334725379944, "eval_rewards/chosen": 0.0010610398603603244, "eval_rewards/margins": 0.000320415070746094, "eval_rewards/rejected": 0.0007406247896142304, "eval_runtime": 384.3708, "eval_samples_per_second": 11.198, "eval_steps_per_second": 1.4, "step": 400 }, { "epoch": 0.07064093728463129, "grad_norm": 1.9456653594970703, "learning_rate": 4.707233065442021e-08, "logits/chosen": -2.9112446308135986, "logits/rejected": -2.9126250743865967, "logps/chosen": -51.31296920776367, "logps/rejected": -54.56962203979492, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0019060971681028605, "rewards/margins": 0.0004303969908505678, "rewards/rejected": -0.0023364939261227846, "step": 410 }, { "epoch": 0.07236388697450034, "grad_norm": 2.3853089809417725, "learning_rate": 4.8220436280137775e-08, "logits/chosen": -2.8980908393859863, "logits/rejected": -2.8948521614074707, "logps/chosen": -55.48943328857422, "logps/rejected": -53.71550369262695, "loss": 0.6926, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0012784868013113737, "rewards/margins": 0.0010058528278023005, "rewards/rejected": -0.0022843393962830305, "step": 420 }, { "epoch": 0.0740868366643694, "grad_norm": 2.0890214443206787, "learning_rate": 4.9368541905855335e-08, "logits/chosen": -2.936516284942627, "logits/rejected": -2.9260849952697754, "logps/chosen": -54.843605041503906, "logps/rejected": -53.190086364746094, "loss": 0.6926, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0008548603509552777, "rewards/margins": 0.0012039890279993415, "rewards/rejected": -0.0020588492043316364, "step": 430 }, { "epoch": 0.07580978635423846, "grad_norm": 2.3940982818603516, "learning_rate": 5.0516647531572895e-08, "logits/chosen": -2.9783670902252197, "logits/rejected": -2.952179193496704, "logps/chosen": -55.11848831176758, "logps/rejected": -53.0138053894043, "loss": 0.6918, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.001027547288686037, "rewards/margins": 0.002638460136950016, "rewards/rejected": -0.0036660078912973404, "step": 440 }, { "epoch": 0.07753273604410751, "grad_norm": 2.0819737911224365, "learning_rate": 5.166475315729046e-08, "logits/chosen": -2.916598320007324, "logits/rejected": -2.8953187465667725, "logps/chosen": -57.37543869018555, "logps/rejected": -54.627662658691406, "loss": 0.6923, "rewards/accuracies": 0.625, "rewards/chosen": -0.0009044323232956231, "rewards/margins": 0.0017119159456342459, "rewards/rejected": -0.002616348210722208, "step": 450 }, { "epoch": 0.07925568573397657, "grad_norm": 2.0121195316314697, "learning_rate": 5.281285878300803e-08, "logits/chosen": -2.8869118690490723, "logits/rejected": -2.8755950927734375, "logps/chosen": -57.134727478027344, "logps/rejected": -52.3559455871582, "loss": 0.6925, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.001536081894300878, "rewards/margins": 0.001315793488174677, "rewards/rejected": -0.0028518750332295895, "step": 460 }, { "epoch": 0.08097863542384562, "grad_norm": 2.0747272968292236, "learning_rate": 5.3960964408725595e-08, "logits/chosen": -2.899627923965454, "logits/rejected": -2.875185966491699, "logps/chosen": -54.55614471435547, "logps/rejected": -50.614784240722656, "loss": 0.692, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0010662561981007457, "rewards/margins": 0.0022261999547481537, "rewards/rejected": -0.003292456269264221, "step": 470 }, { "epoch": 0.08270158511371468, "grad_norm": 2.3838884830474854, "learning_rate": 5.510907003444316e-08, "logits/chosen": -2.9103691577911377, "logits/rejected": -2.902712345123291, "logps/chosen": -54.980934143066406, "logps/rejected": -58.52118682861328, "loss": 0.6922, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.001894455635920167, "rewards/margins": 0.0019050573464483023, "rewards/rejected": -0.003799512516707182, "step": 480 }, { "epoch": 0.08442453480358374, "grad_norm": 2.2515811920166016, "learning_rate": 5.625717566016073e-08, "logits/chosen": -2.858151912689209, "logits/rejected": -2.8224198818206787, "logps/chosen": -61.32958221435547, "logps/rejected": -50.735389709472656, "loss": 0.6915, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.002147951629012823, "rewards/margins": 0.0033252262510359287, "rewards/rejected": -0.005473177880048752, "step": 490 }, { "epoch": 0.08614748449345279, "grad_norm": 1.9670840501785278, "learning_rate": 5.7405281285878295e-08, "logits/chosen": -2.8843283653259277, "logits/rejected": -2.8672218322753906, "logps/chosen": -56.59968185424805, "logps/rejected": -51.90899658203125, "loss": 0.6922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0031538717448711395, "rewards/margins": 0.00200037844479084, "rewards/rejected": -0.005154250655323267, "step": 500 }, { "epoch": 0.08787043418332184, "grad_norm": 1.9930695295333862, "learning_rate": 5.855338691159586e-08, "logits/chosen": -2.857835292816162, "logits/rejected": -2.846858501434326, "logps/chosen": -59.034812927246094, "logps/rejected": -52.214622497558594, "loss": 0.6928, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.0035990376491099596, "rewards/margins": 0.0006619172054342926, "rewards/rejected": -0.004260954912751913, "step": 510 }, { "epoch": 0.08959338387319091, "grad_norm": 1.9633930921554565, "learning_rate": 5.970149253731343e-08, "logits/chosen": -2.9089746475219727, "logits/rejected": -2.8926620483398438, "logps/chosen": -57.21832275390625, "logps/rejected": -51.88628005981445, "loss": 0.6917, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.004442922305315733, "rewards/margins": 0.0029019941575825214, "rewards/rejected": -0.007344916462898254, "step": 520 }, { "epoch": 0.09131633356305996, "grad_norm": 1.8883012533187866, "learning_rate": 6.084959816303099e-08, "logits/chosen": -2.9070680141448975, "logits/rejected": -2.8766965866088867, "logps/chosen": -56.9815673828125, "logps/rejected": -50.6800651550293, "loss": 0.6909, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0032037501223385334, "rewards/margins": 0.004527016542851925, "rewards/rejected": -0.007730766199529171, "step": 530 }, { "epoch": 0.09303928325292901, "grad_norm": 1.9685890674591064, "learning_rate": 6.199770378874856e-08, "logits/chosen": -2.898916482925415, "logits/rejected": -2.8864660263061523, "logps/chosen": -54.44707489013672, "logps/rejected": -53.032958984375, "loss": 0.6917, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.0050659701228141785, "rewards/margins": 0.002865965710952878, "rewards/rejected": -0.007931936532258987, "step": 540 }, { "epoch": 0.09476223294279806, "grad_norm": 2.0686702728271484, "learning_rate": 6.314580941446614e-08, "logits/chosen": -2.9271841049194336, "logits/rejected": -2.9111077785491943, "logps/chosen": -55.30144119262695, "logps/rejected": -51.95891571044922, "loss": 0.691, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.005565012339502573, "rewards/margins": 0.004354250617325306, "rewards/rejected": -0.009919262491166592, "step": 550 }, { "epoch": 0.09648518263266713, "grad_norm": 2.248969078063965, "learning_rate": 6.42939150401837e-08, "logits/chosen": -2.9135735034942627, "logits/rejected": -2.903707981109619, "logps/chosen": -54.01261520385742, "logps/rejected": -54.91992950439453, "loss": 0.6914, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.006114738993346691, "rewards/margins": 0.0035691908560693264, "rewards/rejected": -0.00968393124639988, "step": 560 }, { "epoch": 0.09820813232253618, "grad_norm": 2.045034170150757, "learning_rate": 6.544202066590127e-08, "logits/chosen": -2.886688709259033, "logits/rejected": -2.8828177452087402, "logps/chosen": -53.0010986328125, "logps/rejected": -54.485321044921875, "loss": 0.6916, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0064199878834187984, "rewards/margins": 0.0031796726398169994, "rewards/rejected": -0.009599661454558372, "step": 570 }, { "epoch": 0.09993108201240523, "grad_norm": 1.7068687677383423, "learning_rate": 6.659012629161883e-08, "logits/chosen": -2.9013490676879883, "logits/rejected": -2.895749568939209, "logps/chosen": -52.609703063964844, "logps/rejected": -52.71235275268555, "loss": 0.6919, "rewards/accuracies": 0.5, "rewards/chosen": -0.0066453502513468266, "rewards/margins": 0.0026767298113554716, "rewards/rejected": -0.009322079829871655, "step": 580 }, { "epoch": 0.1016540317022743, "grad_norm": 1.9736390113830566, "learning_rate": 6.77382319173364e-08, "logits/chosen": -2.897890567779541, "logits/rejected": -2.8842339515686035, "logps/chosen": -55.642784118652344, "logps/rejected": -55.381492614746094, "loss": 0.692, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.006683549843728542, "rewards/margins": 0.002430492080748081, "rewards/rejected": -0.009114041924476624, "step": 590 }, { "epoch": 0.10337698139214335, "grad_norm": 2.4153261184692383, "learning_rate": 6.888633754305396e-08, "logits/chosen": -2.898934841156006, "logits/rejected": -2.879272937774658, "logps/chosen": -56.03900146484375, "logps/rejected": -56.54254150390625, "loss": 0.689, "rewards/accuracies": 0.625, "rewards/chosen": -0.004332136828452349, "rewards/margins": 0.008359143510460854, "rewards/rejected": -0.012691279873251915, "step": 600 }, { "epoch": 0.1050999310820124, "grad_norm": 2.184461832046509, "learning_rate": 7.003444316877152e-08, "logits/chosen": -2.8567090034484863, "logits/rejected": -2.855968952178955, "logps/chosen": -55.145233154296875, "logps/rejected": -53.976341247558594, "loss": 0.6927, "rewards/accuracies": 0.53125, "rewards/chosen": -0.008213085122406483, "rewards/margins": 0.0010757189011201262, "rewards/rejected": -0.009288804605603218, "step": 610 }, { "epoch": 0.10682288077188146, "grad_norm": 2.221895694732666, "learning_rate": 7.11825487944891e-08, "logits/chosen": -2.945406675338745, "logits/rejected": -2.923915386199951, "logps/chosen": -57.062896728515625, "logps/rejected": -53.8463020324707, "loss": 0.6899, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007125864736735821, "rewards/margins": 0.006631826050579548, "rewards/rejected": -0.013757690787315369, "step": 620 }, { "epoch": 0.10854583046175052, "grad_norm": 2.301266670227051, "learning_rate": 7.233065442020666e-08, "logits/chosen": -2.939603328704834, "logits/rejected": -2.91625714302063, "logps/chosen": -56.117332458496094, "logps/rejected": -51.174259185791016, "loss": 0.6909, "rewards/accuracies": 0.625, "rewards/chosen": -0.009119195863604546, "rewards/margins": 0.004569724667817354, "rewards/rejected": -0.013688920065760612, "step": 630 }, { "epoch": 0.11026878015161957, "grad_norm": 2.2985427379608154, "learning_rate": 7.347876004592423e-08, "logits/chosen": -2.9217209815979004, "logits/rejected": -2.9191370010375977, "logps/chosen": -54.47275924682617, "logps/rejected": -54.68788528442383, "loss": 0.6922, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.010385403409600258, "rewards/margins": 0.002089323475956917, "rewards/rejected": -0.012474726885557175, "step": 640 }, { "epoch": 0.11199172984148863, "grad_norm": 2.41143536567688, "learning_rate": 7.462686567164179e-08, "logits/chosen": -2.9253361225128174, "logits/rejected": -2.928819179534912, "logps/chosen": -54.176780700683594, "logps/rejected": -55.32817459106445, "loss": 0.6932, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.012147116474807262, "rewards/margins": 0.00017380015924572945, "rewards/rejected": -0.01232091709971428, "step": 650 }, { "epoch": 0.11371467953135768, "grad_norm": 2.1162109375, "learning_rate": 7.577497129735936e-08, "logits/chosen": -2.869983196258545, "logits/rejected": -2.8705716133117676, "logps/chosen": -56.631378173828125, "logps/rejected": -53.262786865234375, "loss": 0.6915, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.009453224018216133, "rewards/margins": 0.003567267907783389, "rewards/rejected": -0.013020491227507591, "step": 660 }, { "epoch": 0.11543762922122675, "grad_norm": 2.0193326473236084, "learning_rate": 7.692307692307692e-08, "logits/chosen": -2.8858590126037598, "logits/rejected": -2.881812572479248, "logps/chosen": -55.13932418823242, "logps/rejected": -58.1707649230957, "loss": 0.6916, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.008889088407158852, "rewards/margins": 0.0032468761783093214, "rewards/rejected": -0.012135963886976242, "step": 670 }, { "epoch": 0.1171605789110958, "grad_norm": 2.262868881225586, "learning_rate": 7.80711825487945e-08, "logits/chosen": -2.8544554710388184, "logits/rejected": -2.831420660018921, "logps/chosen": -56.3204231262207, "logps/rejected": -52.126670837402344, "loss": 0.6905, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.012224559672176838, "rewards/margins": 0.005443849600851536, "rewards/rejected": -0.017668409273028374, "step": 680 }, { "epoch": 0.11888352860096485, "grad_norm": 2.2813498973846436, "learning_rate": 7.921928817451206e-08, "logits/chosen": -2.943873643875122, "logits/rejected": -2.9195990562438965, "logps/chosen": -61.373748779296875, "logps/rejected": -51.66704559326172, "loss": 0.6906, "rewards/accuracies": 0.59375, "rewards/chosen": -0.008815468288958073, "rewards/margins": 0.005301609635353088, "rewards/rejected": -0.014117076992988586, "step": 690 }, { "epoch": 0.1206064782908339, "grad_norm": 2.2850728034973145, "learning_rate": 8.036739380022962e-08, "logits/chosen": -2.9098591804504395, "logits/rejected": -2.886373519897461, "logps/chosen": -57.46692657470703, "logps/rejected": -53.712921142578125, "loss": 0.6898, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.009994572028517723, "rewards/margins": 0.00689247390255332, "rewards/rejected": -0.01688704639673233, "step": 700 }, { "epoch": 0.12232942798070297, "grad_norm": 2.093459129333496, "learning_rate": 8.151549942594719e-08, "logits/chosen": -2.9022059440612793, "logits/rejected": -2.885436534881592, "logps/chosen": -56.37190628051758, "logps/rejected": -55.75042724609375, "loss": 0.6896, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.011153196915984154, "rewards/margins": 0.007232338190078735, "rewards/rejected": -0.01838553510606289, "step": 710 }, { "epoch": 0.12405237767057202, "grad_norm": 2.2593727111816406, "learning_rate": 8.266360505166475e-08, "logits/chosen": -2.8753161430358887, "logits/rejected": -2.871997833251953, "logps/chosen": -55.58568572998047, "logps/rejected": -55.225379943847656, "loss": 0.6912, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.012562957592308521, "rewards/margins": 0.004113520495593548, "rewards/rejected": -0.01667647436261177, "step": 720 }, { "epoch": 0.12577532736044109, "grad_norm": 2.259854316711426, "learning_rate": 8.381171067738232e-08, "logits/chosen": -2.9437055587768555, "logits/rejected": -2.9245526790618896, "logps/chosen": -58.900779724121094, "logps/rejected": -54.2612190246582, "loss": 0.689, "rewards/accuracies": 0.625, "rewards/chosen": -0.014078138396143913, "rewards/margins": 0.008628633804619312, "rewards/rejected": -0.0227067731320858, "step": 730 }, { "epoch": 0.12749827705031014, "grad_norm": 2.150418281555176, "learning_rate": 8.495981630309988e-08, "logits/chosen": -2.8817498683929443, "logits/rejected": -2.8615224361419678, "logps/chosen": -56.9492073059082, "logps/rejected": -55.37824249267578, "loss": 0.6894, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.012495382688939571, "rewards/margins": 0.007908121682703495, "rewards/rejected": -0.020403504371643066, "step": 740 }, { "epoch": 0.1292212267401792, "grad_norm": 2.1469054222106934, "learning_rate": 8.610792192881746e-08, "logits/chosen": -2.9918925762176514, "logits/rejected": -2.9728503227233887, "logps/chosen": -58.38031005859375, "logps/rejected": -55.4575080871582, "loss": 0.6866, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.01179544534534216, "rewards/margins": 0.013385293073952198, "rewards/rejected": -0.025180738419294357, "step": 750 }, { "epoch": 0.13094417643004824, "grad_norm": 2.4583590030670166, "learning_rate": 8.725602755453502e-08, "logits/chosen": -2.9008548259735107, "logits/rejected": -2.8728179931640625, "logps/chosen": -57.165260314941406, "logps/rejected": -51.254608154296875, "loss": 0.6873, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.016394078731536865, "rewards/margins": 0.012260092422366142, "rewards/rejected": -0.028654176741838455, "step": 760 }, { "epoch": 0.1326671261199173, "grad_norm": 2.0270307064056396, "learning_rate": 8.840413318025258e-08, "logits/chosen": -2.9242424964904785, "logits/rejected": -2.9103903770446777, "logps/chosen": -56.168739318847656, "logps/rejected": -54.60688018798828, "loss": 0.6881, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.018161626532673836, "rewards/margins": 0.010471840389072895, "rewards/rejected": -0.028633465990424156, "step": 770 }, { "epoch": 0.13439007580978635, "grad_norm": 2.446073055267334, "learning_rate": 8.955223880597015e-08, "logits/chosen": -2.922211170196533, "logits/rejected": -2.8983256816864014, "logps/chosen": -56.91460418701172, "logps/rejected": -53.85261154174805, "loss": 0.6874, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.023327644914388657, "rewards/margins": 0.011989260092377663, "rewards/rejected": -0.03531690686941147, "step": 780 }, { "epoch": 0.1361130254996554, "grad_norm": 2.0871706008911133, "learning_rate": 9.070034443168771e-08, "logits/chosen": -2.9211132526397705, "logits/rejected": -2.895343065261841, "logps/chosen": -57.1553840637207, "logps/rejected": -56.386634826660156, "loss": 0.6869, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0219894852489233, "rewards/margins": 0.013090623542666435, "rewards/rejected": -0.03508010879158974, "step": 790 }, { "epoch": 0.13783597518952448, "grad_norm": 2.458371639251709, "learning_rate": 9.184845005740528e-08, "logits/chosen": -2.858908176422119, "logits/rejected": -2.839359998703003, "logps/chosen": -58.53376007080078, "logps/rejected": -58.27695846557617, "loss": 0.6891, "rewards/accuracies": 0.59375, "rewards/chosen": -0.02825082466006279, "rewards/margins": 0.008799943141639233, "rewards/rejected": -0.0370507650077343, "step": 800 }, { "epoch": 0.13783597518952448, "eval_logits/chosen": -2.9621548652648926, "eval_logits/rejected": -2.9587860107421875, "eval_logps/chosen": -59.62390899658203, "eval_logps/rejected": -63.83048629760742, "eval_loss": 0.690947413444519, "eval_rewards/accuracies": 0.5748141407966614, "eval_rewards/chosen": -0.006084338761866093, "eval_rewards/margins": 0.004724523052573204, "eval_rewards/rejected": -0.010808861814439297, "eval_runtime": 384.4775, "eval_samples_per_second": 11.194, "eval_steps_per_second": 1.399, "step": 800 }, { "epoch": 0.13955892487939353, "grad_norm": 2.213287591934204, "learning_rate": 9.299655568312284e-08, "logits/chosen": -2.8889384269714355, "logits/rejected": -2.8673510551452637, "logps/chosen": -59.30315017700195, "logps/rejected": -57.970436096191406, "loss": 0.6881, "rewards/accuracies": 0.59375, "rewards/chosen": -0.023655524477362633, "rewards/margins": 0.01091140415519476, "rewards/rejected": -0.034566931426525116, "step": 810 }, { "epoch": 0.14128187456926258, "grad_norm": 2.1062045097351074, "learning_rate": 9.414466130884042e-08, "logits/chosen": -2.921374797821045, "logits/rejected": -2.9040017127990723, "logps/chosen": -55.840232849121094, "logps/rejected": -54.697853088378906, "loss": 0.6873, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.030127938836812973, "rewards/margins": 0.012304485775530338, "rewards/rejected": -0.04243241995573044, "step": 820 }, { "epoch": 0.14300482425913164, "grad_norm": 2.369229555130005, "learning_rate": 9.529276693455798e-08, "logits/chosen": -2.9105210304260254, "logits/rejected": -2.8936829566955566, "logps/chosen": -59.06435012817383, "logps/rejected": -58.14280319213867, "loss": 0.6845, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03192334994673729, "rewards/margins": 0.01835811696946621, "rewards/rejected": -0.05028147250413895, "step": 830 }, { "epoch": 0.1447277739490007, "grad_norm": 2.176154136657715, "learning_rate": 9.644087256027555e-08, "logits/chosen": -2.9422030448913574, "logits/rejected": -2.92073917388916, "logps/chosen": -58.568359375, "logps/rejected": -54.570281982421875, "loss": 0.6855, "rewards/accuracies": 0.59375, "rewards/chosen": -0.034468166530132294, "rewards/margins": 0.016375292092561722, "rewards/rejected": -0.050843458622694016, "step": 840 }, { "epoch": 0.14645072363886974, "grad_norm": 2.1147189140319824, "learning_rate": 9.758897818599311e-08, "logits/chosen": -2.8813514709472656, "logits/rejected": -2.872459888458252, "logps/chosen": -55.68572998046875, "logps/rejected": -59.400367736816406, "loss": 0.6894, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.04169295355677605, "rewards/margins": 0.008527868427336216, "rewards/rejected": -0.05022081732749939, "step": 850 }, { "epoch": 0.1481736733287388, "grad_norm": 2.189018726348877, "learning_rate": 9.873708381171067e-08, "logits/chosen": -2.906050205230713, "logits/rejected": -2.891706705093384, "logps/chosen": -58.0321044921875, "logps/rejected": -56.58018112182617, "loss": 0.686, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04129540175199509, "rewards/margins": 0.015181129798293114, "rewards/rejected": -0.05647652596235275, "step": 860 }, { "epoch": 0.14989662301860784, "grad_norm": 2.0911967754364014, "learning_rate": 9.988518943742824e-08, "logits/chosen": -2.9313414096832275, "logits/rejected": -2.9336588382720947, "logps/chosen": -56.133277893066406, "logps/rejected": -58.191932678222656, "loss": 0.6877, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.04252300783991814, "rewards/margins": 0.01182483322918415, "rewards/rejected": -0.054347842931747437, "step": 870 }, { "epoch": 0.15161957270847692, "grad_norm": 2.3610286712646484, "learning_rate": 1.0103329506314579e-07, "logits/chosen": -2.8736624717712402, "logits/rejected": -2.8538830280303955, "logps/chosen": -57.096893310546875, "logps/rejected": -55.92301559448242, "loss": 0.6833, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03677918389439583, "rewards/margins": 0.020569270476698875, "rewards/rejected": -0.057348452508449554, "step": 880 }, { "epoch": 0.15334252239834598, "grad_norm": 2.0646069049835205, "learning_rate": 1.0218140068886336e-07, "logits/chosen": -2.897963047027588, "logits/rejected": -2.871859312057495, "logps/chosen": -62.93019485473633, "logps/rejected": -58.28081512451172, "loss": 0.6852, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.031163910403847694, "rewards/margins": 0.016853151842951775, "rewards/rejected": -0.04801706224679947, "step": 890 }, { "epoch": 0.15506547208821503, "grad_norm": 2.542097806930542, "learning_rate": 1.0332950631458092e-07, "logits/chosen": -2.917391061782837, "logits/rejected": -2.912045955657959, "logps/chosen": -58.973899841308594, "logps/rejected": -56.362701416015625, "loss": 0.6872, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03930676728487015, "rewards/margins": 0.01295884232968092, "rewards/rejected": -0.05226561427116394, "step": 900 }, { "epoch": 0.15678842177808408, "grad_norm": 2.199962615966797, "learning_rate": 1.044776119402985e-07, "logits/chosen": -2.893681287765503, "logits/rejected": -2.891852855682373, "logps/chosen": -56.727622985839844, "logps/rejected": -57.78075408935547, "loss": 0.6875, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.04665730893611908, "rewards/margins": 0.01240667887032032, "rewards/rejected": -0.05906399339437485, "step": 910 }, { "epoch": 0.15851137146795313, "grad_norm": 2.394650936126709, "learning_rate": 1.0562571756601606e-07, "logits/chosen": -2.916598081588745, "logits/rejected": -2.885108709335327, "logps/chosen": -60.41682815551758, "logps/rejected": -54.64247512817383, "loss": 0.686, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.04918244481086731, "rewards/margins": 0.015305593609809875, "rewards/rejected": -0.06448803842067719, "step": 920 }, { "epoch": 0.16023432115782218, "grad_norm": 2.60247540473938, "learning_rate": 1.0677382319173363e-07, "logits/chosen": -2.9422740936279297, "logits/rejected": -2.930769681930542, "logps/chosen": -58.148521423339844, "logps/rejected": -59.06476593017578, "loss": 0.6864, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.04540998488664627, "rewards/margins": 0.014568351209163666, "rewards/rejected": -0.05997832864522934, "step": 930 }, { "epoch": 0.16195727084769124, "grad_norm": 2.2433910369873047, "learning_rate": 1.0792192881745119e-07, "logits/chosen": -2.9525837898254395, "logits/rejected": -2.9253265857696533, "logps/chosen": -65.92704010009766, "logps/rejected": -60.785980224609375, "loss": 0.6856, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04621075838804245, "rewards/margins": 0.01654568873345852, "rewards/rejected": -0.06275644898414612, "step": 940 }, { "epoch": 0.16368022053756032, "grad_norm": 2.3847455978393555, "learning_rate": 1.0907003444316875e-07, "logits/chosen": -2.8176932334899902, "logits/rejected": -2.8024542331695557, "logps/chosen": -61.851539611816406, "logps/rejected": -61.24384689331055, "loss": 0.6891, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.059170741587877274, "rewards/margins": 0.009380336850881577, "rewards/rejected": -0.06855107843875885, "step": 950 }, { "epoch": 0.16540317022742937, "grad_norm": 2.2610373497009277, "learning_rate": 1.1021814006888632e-07, "logits/chosen": -2.808915615081787, "logits/rejected": -2.811962604522705, "logps/chosen": -57.79589080810547, "logps/rejected": -60.714317321777344, "loss": 0.6945, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.06247736141085625, "rewards/margins": -0.0015118229202926159, "rewards/rejected": -0.060965538024902344, "step": 960 }, { "epoch": 0.16712611991729842, "grad_norm": 2.532683849334717, "learning_rate": 1.1136624569460388e-07, "logits/chosen": -2.8959765434265137, "logits/rejected": -2.869158983230591, "logps/chosen": -66.4780502319336, "logps/rejected": -57.21562957763672, "loss": 0.6874, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.052064068615436554, "rewards/margins": 0.01277141459286213, "rewards/rejected": -0.06483548879623413, "step": 970 }, { "epoch": 0.16884906960716747, "grad_norm": 2.805853843688965, "learning_rate": 1.1251435132032146e-07, "logits/chosen": -2.9637677669525146, "logits/rejected": -2.945610761642456, "logps/chosen": -62.131736755371094, "logps/rejected": -59.7706298828125, "loss": 0.6864, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.04719981551170349, "rewards/margins": 0.014899635687470436, "rewards/rejected": -0.06209943816065788, "step": 980 }, { "epoch": 0.17057201929703653, "grad_norm": 2.672757148742676, "learning_rate": 1.1366245694603902e-07, "logits/chosen": -2.8849339485168457, "logits/rejected": -2.866464376449585, "logps/chosen": -60.4627571105957, "logps/rejected": -58.09014129638672, "loss": 0.6881, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.05641549080610275, "rewards/margins": 0.011244365945458412, "rewards/rejected": -0.06765986233949661, "step": 990 }, { "epoch": 0.17229496898690558, "grad_norm": 2.177389621734619, "learning_rate": 1.1481056257175659e-07, "logits/chosen": -2.851919651031494, "logits/rejected": -2.82978892326355, "logps/chosen": -63.827980041503906, "logps/rejected": -58.105125427246094, "loss": 0.6865, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.056848056614398956, "rewards/margins": 0.014737958088517189, "rewards/rejected": -0.071586012840271, "step": 1000 }, { "epoch": 0.17401791867677463, "grad_norm": 2.3776516914367676, "learning_rate": 1.1595866819747415e-07, "logits/chosen": -2.8108181953430176, "logits/rejected": -2.816577672958374, "logps/chosen": -60.68506622314453, "logps/rejected": -63.4400749206543, "loss": 0.69, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.06378235667943954, "rewards/margins": 0.007416282780468464, "rewards/rejected": -0.07119864225387573, "step": 1010 }, { "epoch": 0.17574086836664368, "grad_norm": 2.2297322750091553, "learning_rate": 1.1710677382319172e-07, "logits/chosen": -2.9127297401428223, "logits/rejected": -2.889315128326416, "logps/chosen": -63.0562629699707, "logps/rejected": -60.82661819458008, "loss": 0.6802, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.048960812389850616, "rewards/margins": 0.02726539969444275, "rewards/rejected": -0.07622621953487396, "step": 1020 }, { "epoch": 0.17746381805651276, "grad_norm": 2.291780471801758, "learning_rate": 1.1825487944890928e-07, "logits/chosen": -2.9396207332611084, "logits/rejected": -2.9200801849365234, "logps/chosen": -60.832740783691406, "logps/rejected": -58.027183532714844, "loss": 0.6856, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.05459435656666756, "rewards/margins": 0.016271965578198433, "rewards/rejected": -0.07086631655693054, "step": 1030 }, { "epoch": 0.17918676774638181, "grad_norm": 2.8056833744049072, "learning_rate": 1.1940298507462686e-07, "logits/chosen": -2.8990871906280518, "logits/rejected": -2.88954758644104, "logps/chosen": -59.774559020996094, "logps/rejected": -62.26892852783203, "loss": 0.6898, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.06455157697200775, "rewards/margins": 0.008438936434686184, "rewards/rejected": -0.0729905217885971, "step": 1040 }, { "epoch": 0.18090971743625087, "grad_norm": 2.4161949157714844, "learning_rate": 1.205510907003444e-07, "logits/chosen": -2.8405020236968994, "logits/rejected": -2.8092803955078125, "logps/chosen": -64.3838882446289, "logps/rejected": -58.1729736328125, "loss": 0.6834, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.06246421858668327, "rewards/margins": 0.02075287327170372, "rewards/rejected": -0.08321709930896759, "step": 1050 }, { "epoch": 0.18263266712611992, "grad_norm": 2.3523647785186768, "learning_rate": 1.2169919632606198e-07, "logits/chosen": -2.858825206756592, "logits/rejected": -2.8416731357574463, "logps/chosen": -64.46825408935547, "logps/rejected": -62.1015625, "loss": 0.6884, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.06816236674785614, "rewards/margins": 0.010872631333768368, "rewards/rejected": -0.07903499901294708, "step": 1060 }, { "epoch": 0.18435561681598897, "grad_norm": 2.313222646713257, "learning_rate": 1.2284730195177955e-07, "logits/chosen": -2.9378304481506348, "logits/rejected": -2.9094176292419434, "logps/chosen": -64.5782470703125, "logps/rejected": -60.97203826904297, "loss": 0.6826, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.06780969351530075, "rewards/margins": 0.022828945890069008, "rewards/rejected": -0.09063863754272461, "step": 1070 }, { "epoch": 0.18607856650585802, "grad_norm": 2.3939578533172607, "learning_rate": 1.2399540757749712e-07, "logits/chosen": -2.936554431915283, "logits/rejected": -2.918708086013794, "logps/chosen": -64.82514190673828, "logps/rejected": -59.743865966796875, "loss": 0.6852, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07702767848968506, "rewards/margins": 0.01737195998430252, "rewards/rejected": -0.09439964592456818, "step": 1080 }, { "epoch": 0.18780151619572708, "grad_norm": 2.4799766540527344, "learning_rate": 1.251435132032147e-07, "logits/chosen": -2.8482329845428467, "logits/rejected": -2.8464126586914062, "logps/chosen": -61.432395935058594, "logps/rejected": -60.96601104736328, "loss": 0.6888, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08055031299591064, "rewards/margins": 0.010185223072767258, "rewards/rejected": -0.0907355397939682, "step": 1090 }, { "epoch": 0.18952446588559613, "grad_norm": 2.4442903995513916, "learning_rate": 1.2629161882893227e-07, "logits/chosen": -2.869385242462158, "logits/rejected": -2.8703014850616455, "logps/chosen": -59.78876876831055, "logps/rejected": -63.30439376831055, "loss": 0.6866, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0754605308175087, "rewards/margins": 0.015025362372398376, "rewards/rejected": -0.09048587828874588, "step": 1100 }, { "epoch": 0.1912474155754652, "grad_norm": 2.785104990005493, "learning_rate": 1.2743972445464984e-07, "logits/chosen": -2.8878140449523926, "logits/rejected": -2.897512912750244, "logps/chosen": -61.808631896972656, "logps/rejected": -63.763877868652344, "loss": 0.6892, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.07973656803369522, "rewards/margins": 0.009631333872675896, "rewards/rejected": -0.08936790376901627, "step": 1110 }, { "epoch": 0.19297036526533426, "grad_norm": 2.718078374862671, "learning_rate": 1.285878300803674e-07, "logits/chosen": -2.9095702171325684, "logits/rejected": -2.892914056777954, "logps/chosen": -64.25332641601562, "logps/rejected": -61.17406463623047, "loss": 0.6865, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0720595195889473, "rewards/margins": 0.014846527948975563, "rewards/rejected": -0.08690604567527771, "step": 1120 }, { "epoch": 0.1946933149552033, "grad_norm": 2.2941906452178955, "learning_rate": 1.2973593570608496e-07, "logits/chosen": -2.9266607761383057, "logits/rejected": -2.9116671085357666, "logps/chosen": -59.73681640625, "logps/rejected": -62.68779754638672, "loss": 0.6803, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0724368542432785, "rewards/margins": 0.02737646922469139, "rewards/rejected": -0.09981332719326019, "step": 1130 }, { "epoch": 0.19641626464507236, "grad_norm": 2.5265445709228516, "learning_rate": 1.3088404133180254e-07, "logits/chosen": -2.8809289932250977, "logits/rejected": -2.860023021697998, "logps/chosen": -64.78260040283203, "logps/rejected": -60.81663131713867, "loss": 0.6861, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.06992609798908234, "rewards/margins": 0.016008172184228897, "rewards/rejected": -0.08593426644802094, "step": 1140 }, { "epoch": 0.19813921433494142, "grad_norm": 2.750117063522339, "learning_rate": 1.3203214695752008e-07, "logits/chosen": -2.8423831462860107, "logits/rejected": -2.832681655883789, "logps/chosen": -61.31705856323242, "logps/rejected": -61.98966598510742, "loss": 0.682, "rewards/accuracies": 0.625, "rewards/chosen": -0.0669320821762085, "rewards/margins": 0.023900505155324936, "rewards/rejected": -0.09083259105682373, "step": 1150 }, { "epoch": 0.19986216402481047, "grad_norm": 2.309347152709961, "learning_rate": 1.3318025258323766e-07, "logits/chosen": -2.9073128700256348, "logits/rejected": -2.900723695755005, "logps/chosen": -61.77983856201172, "logps/rejected": -61.5976676940918, "loss": 0.6836, "rewards/accuracies": 0.65625, "rewards/chosen": -0.060506559908390045, "rewards/margins": 0.020602624863386154, "rewards/rejected": -0.0811091959476471, "step": 1160 }, { "epoch": 0.20158511371467952, "grad_norm": 2.2978622913360596, "learning_rate": 1.3432835820895523e-07, "logits/chosen": -2.8373751640319824, "logits/rejected": -2.828467845916748, "logps/chosen": -59.83747482299805, "logps/rejected": -61.81534957885742, "loss": 0.6866, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0691220685839653, "rewards/margins": 0.014754706993699074, "rewards/rejected": -0.08387677371501923, "step": 1170 }, { "epoch": 0.2033080634045486, "grad_norm": 2.4647703170776367, "learning_rate": 1.354764638346728e-07, "logits/chosen": -2.8165135383605957, "logits/rejected": -2.794498920440674, "logps/chosen": -61.49424362182617, "logps/rejected": -59.40460205078125, "loss": 0.6837, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07810764014720917, "rewards/margins": 0.020892778411507607, "rewards/rejected": -0.09900043159723282, "step": 1180 }, { "epoch": 0.20503101309441765, "grad_norm": 2.720895767211914, "learning_rate": 1.3662456946039035e-07, "logits/chosen": -2.922314405441284, "logits/rejected": -2.900804042816162, "logps/chosen": -66.13607788085938, "logps/rejected": -58.975799560546875, "loss": 0.6807, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07079530507326126, "rewards/margins": 0.02672487497329712, "rewards/rejected": -0.09752018749713898, "step": 1190 }, { "epoch": 0.2067539627842867, "grad_norm": 2.2189040184020996, "learning_rate": 1.3777267508610792e-07, "logits/chosen": -2.825812816619873, "logits/rejected": -2.8158926963806152, "logps/chosen": -61.317138671875, "logps/rejected": -58.899383544921875, "loss": 0.6874, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.07221807539463043, "rewards/margins": 0.013589012436568737, "rewards/rejected": -0.08580709993839264, "step": 1200 }, { "epoch": 0.2067539627842867, "eval_logits/chosen": -2.9395346641540527, "eval_logits/rejected": -2.9361255168914795, "eval_logps/chosen": -62.038490295410156, "eval_logps/rejected": -67.01734924316406, "eval_loss": 0.687571108341217, "eval_rewards/accuracies": 0.5871282815933228, "eval_rewards/chosen": -0.030230168253183365, "eval_rewards/margins": 0.012447311542928219, "eval_rewards/rejected": -0.04267747700214386, "eval_runtime": 384.82, "eval_samples_per_second": 11.184, "eval_steps_per_second": 1.398, "step": 1200 }, { "epoch": 0.20847691247415576, "grad_norm": 3.0337154865264893, "learning_rate": 1.389207807118255e-07, "logits/chosen": -2.8906006813049316, "logits/rejected": -2.865419387817383, "logps/chosen": -62.349098205566406, "logps/rejected": -62.536407470703125, "loss": 0.6814, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.07368902117013931, "rewards/margins": 0.025377903133630753, "rewards/rejected": -0.09906691312789917, "step": 1210 }, { "epoch": 0.2101998621640248, "grad_norm": 2.5097906589508057, "learning_rate": 1.4006888633754304e-07, "logits/chosen": -2.89139986038208, "logits/rejected": -2.8693718910217285, "logps/chosen": -60.9052619934082, "logps/rejected": -61.641456604003906, "loss": 0.6812, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07215452194213867, "rewards/margins": 0.025878047570586205, "rewards/rejected": -0.09803257137537003, "step": 1220 }, { "epoch": 0.21192281185389386, "grad_norm": 2.8678061962127686, "learning_rate": 1.4121699196326062e-07, "logits/chosen": -2.9567160606384277, "logits/rejected": -2.925724506378174, "logps/chosen": -64.60700988769531, "logps/rejected": -61.93334197998047, "loss": 0.6792, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07604970782995224, "rewards/margins": 0.0297552403062582, "rewards/rejected": -0.10580495744943619, "step": 1230 }, { "epoch": 0.2136457615437629, "grad_norm": 2.821640729904175, "learning_rate": 1.423650975889782e-07, "logits/chosen": -2.855620861053467, "logits/rejected": -2.845649480819702, "logps/chosen": -61.8939094543457, "logps/rejected": -63.21649169921875, "loss": 0.6837, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.088918536901474, "rewards/margins": 0.020954841747879982, "rewards/rejected": -0.10987337678670883, "step": 1240 }, { "epoch": 0.21536871123363197, "grad_norm": 2.81172776222229, "learning_rate": 1.4351320321469576e-07, "logits/chosen": -2.9326090812683105, "logits/rejected": -2.9026684761047363, "logps/chosen": -63.4760627746582, "logps/rejected": -61.67022705078125, "loss": 0.6807, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08308391273021698, "rewards/margins": 0.027814438566565514, "rewards/rejected": -0.11089835315942764, "step": 1250 }, { "epoch": 0.21709166092350105, "grad_norm": 2.774033308029175, "learning_rate": 1.446613088404133e-07, "logits/chosen": -2.819913148880005, "logits/rejected": -2.800523519515991, "logps/chosen": -62.77690887451172, "logps/rejected": -60.63389205932617, "loss": 0.681, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08326883614063263, "rewards/margins": 0.026683837175369263, "rewards/rejected": -0.10995267331600189, "step": 1260 }, { "epoch": 0.2188146106133701, "grad_norm": 2.880520820617676, "learning_rate": 1.4580941446613089e-07, "logits/chosen": -2.856600522994995, "logits/rejected": -2.8484253883361816, "logps/chosen": -61.86387252807617, "logps/rejected": -64.2399673461914, "loss": 0.687, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08681348711252213, "rewards/margins": 0.014278444461524487, "rewards/rejected": -0.10109193623065948, "step": 1270 }, { "epoch": 0.22053756030323915, "grad_norm": 2.6990182399749756, "learning_rate": 1.4695752009184846e-07, "logits/chosen": -2.8809759616851807, "logits/rejected": -2.8884129524230957, "logps/chosen": -59.63513946533203, "logps/rejected": -68.55680847167969, "loss": 0.6853, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.07746431231498718, "rewards/margins": 0.017880458384752274, "rewards/rejected": -0.09534476697444916, "step": 1280 }, { "epoch": 0.2222605099931082, "grad_norm": 3.019570827484131, "learning_rate": 1.4810562571756603e-07, "logits/chosen": -2.8340437412261963, "logits/rejected": -2.8122403621673584, "logps/chosen": -63.11928176879883, "logps/rejected": -60.483482360839844, "loss": 0.6792, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07383199036121368, "rewards/margins": 0.03013637661933899, "rewards/rejected": -0.10396836698055267, "step": 1290 }, { "epoch": 0.22398345968297725, "grad_norm": 2.6284210681915283, "learning_rate": 1.4925373134328358e-07, "logits/chosen": -2.8784289360046387, "logits/rejected": -2.870450973510742, "logps/chosen": -59.34076690673828, "logps/rejected": -64.48880767822266, "loss": 0.6847, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.09290703386068344, "rewards/margins": 0.01890570856630802, "rewards/rejected": -0.11181274801492691, "step": 1300 }, { "epoch": 0.2257064093728463, "grad_norm": 3.0810391902923584, "learning_rate": 1.5040183696900115e-07, "logits/chosen": -2.8738865852355957, "logits/rejected": -2.8507232666015625, "logps/chosen": -62.38170623779297, "logps/rejected": -60.55586624145508, "loss": 0.6773, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08380500972270966, "rewards/margins": 0.03459765389561653, "rewards/rejected": -0.11840268224477768, "step": 1310 }, { "epoch": 0.22742935906271536, "grad_norm": 2.8459906578063965, "learning_rate": 1.5154994259471873e-07, "logits/chosen": -2.895685911178589, "logits/rejected": -2.8805301189422607, "logps/chosen": -64.91357421875, "logps/rejected": -61.18379592895508, "loss": 0.6817, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.08374587446451187, "rewards/margins": 0.02516447938978672, "rewards/rejected": -0.10891035944223404, "step": 1320 }, { "epoch": 0.22915230875258444, "grad_norm": 2.7527568340301514, "learning_rate": 1.5269804822043627e-07, "logits/chosen": -2.9197354316711426, "logits/rejected": -2.8971869945526123, "logps/chosen": -65.04910278320312, "logps/rejected": -61.24757766723633, "loss": 0.6812, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0866512805223465, "rewards/margins": 0.02625083550810814, "rewards/rejected": -0.11290212720632553, "step": 1330 }, { "epoch": 0.2308752584424535, "grad_norm": 2.576115846633911, "learning_rate": 1.5384615384615385e-07, "logits/chosen": -2.81272292137146, "logits/rejected": -2.7970199584960938, "logps/chosen": -64.03965759277344, "logps/rejected": -62.298255920410156, "loss": 0.6826, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08078499883413315, "rewards/margins": 0.02357543632388115, "rewards/rejected": -0.10436044633388519, "step": 1340 }, { "epoch": 0.23259820813232254, "grad_norm": 2.717984199523926, "learning_rate": 1.5499425947187142e-07, "logits/chosen": -2.855532169342041, "logits/rejected": -2.8379480838775635, "logps/chosen": -64.80758666992188, "logps/rejected": -64.42263793945312, "loss": 0.6885, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.10266206413507462, "rewards/margins": 0.01211500447243452, "rewards/rejected": -0.11477706581354141, "step": 1350 }, { "epoch": 0.2343211578221916, "grad_norm": 2.949355125427246, "learning_rate": 1.56142365097589e-07, "logits/chosen": -2.814122200012207, "logits/rejected": -2.802731990814209, "logps/chosen": -65.5488510131836, "logps/rejected": -65.61713409423828, "loss": 0.6864, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.09135110676288605, "rewards/margins": 0.015937041491270065, "rewards/rejected": -0.10728814452886581, "step": 1360 }, { "epoch": 0.23604410751206065, "grad_norm": 3.703526020050049, "learning_rate": 1.5729047072330654e-07, "logits/chosen": -2.8992276191711426, "logits/rejected": -2.883058547973633, "logps/chosen": -63.055274963378906, "logps/rejected": -64.58672332763672, "loss": 0.6793, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.09181781858205795, "rewards/margins": 0.03024251200258732, "rewards/rejected": -0.12206032127141953, "step": 1370 }, { "epoch": 0.2377670572019297, "grad_norm": 3.2197022438049316, "learning_rate": 1.584385763490241e-07, "logits/chosen": -2.841649293899536, "logits/rejected": -2.833360433578491, "logps/chosen": -64.94597625732422, "logps/rejected": -62.3499870300293, "loss": 0.683, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0867023766040802, "rewards/margins": 0.022678587585687637, "rewards/rejected": -0.10938096046447754, "step": 1380 }, { "epoch": 0.23949000689179875, "grad_norm": 3.200864553451538, "learning_rate": 1.5958668197474169e-07, "logits/chosen": -2.9327969551086426, "logits/rejected": -2.9127845764160156, "logps/chosen": -66.3973388671875, "logps/rejected": -63.198211669921875, "loss": 0.6827, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08382676541805267, "rewards/margins": 0.023857740685343742, "rewards/rejected": -0.10768450796604156, "step": 1390 }, { "epoch": 0.2412129565816678, "grad_norm": 2.830458641052246, "learning_rate": 1.6073478760045923e-07, "logits/chosen": -2.8190531730651855, "logits/rejected": -2.8015060424804688, "logps/chosen": -65.630859375, "logps/rejected": -63.66929244995117, "loss": 0.6794, "rewards/accuracies": 0.625, "rewards/chosen": -0.08757500350475311, "rewards/margins": 0.03044615313410759, "rewards/rejected": -0.1180211678147316, "step": 1400 }, { "epoch": 0.24293590627153688, "grad_norm": 2.9669930934906006, "learning_rate": 1.618828932261768e-07, "logits/chosen": -2.8535845279693604, "logits/rejected": -2.8465869426727295, "logps/chosen": -63.831695556640625, "logps/rejected": -64.20135498046875, "loss": 0.6833, "rewards/accuracies": 0.625, "rewards/chosen": -0.09446011483669281, "rewards/margins": 0.02210450917482376, "rewards/rejected": -0.11656463146209717, "step": 1410 }, { "epoch": 0.24465885596140594, "grad_norm": 3.1416923999786377, "learning_rate": 1.6303099885189438e-07, "logits/chosen": -2.804192304611206, "logits/rejected": -2.807462692260742, "logps/chosen": -61.50630569458008, "logps/rejected": -65.51634216308594, "loss": 0.6896, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.10375789552927017, "rewards/margins": 0.009949756786227226, "rewards/rejected": -0.11370766162872314, "step": 1420 }, { "epoch": 0.246381805651275, "grad_norm": 3.045356512069702, "learning_rate": 1.6417910447761195e-07, "logits/chosen": -2.877074718475342, "logits/rejected": -2.8679327964782715, "logps/chosen": -64.40380859375, "logps/rejected": -67.46661376953125, "loss": 0.6804, "rewards/accuracies": 0.59375, "rewards/chosen": -0.09864847362041473, "rewards/margins": 0.028175463899970055, "rewards/rejected": -0.12682393193244934, "step": 1430 }, { "epoch": 0.24810475534114404, "grad_norm": 2.733543634414673, "learning_rate": 1.653272101033295e-07, "logits/chosen": -2.8185031414031982, "logits/rejected": -2.799570083618164, "logps/chosen": -61.34563446044922, "logps/rejected": -60.22956466674805, "loss": 0.6799, "rewards/accuracies": 0.59375, "rewards/chosen": -0.09759248793125153, "rewards/margins": 0.029250025749206543, "rewards/rejected": -0.12684249877929688, "step": 1440 }, { "epoch": 0.2498277050310131, "grad_norm": 3.470953941345215, "learning_rate": 1.6647531572904707e-07, "logits/chosen": -2.8219285011291504, "logits/rejected": -2.7927727699279785, "logps/chosen": -64.85969543457031, "logps/rejected": -64.55895233154297, "loss": 0.6786, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.09148148447275162, "rewards/margins": 0.03249611333012581, "rewards/rejected": -0.12397760152816772, "step": 1450 }, { "epoch": 0.25155065472088217, "grad_norm": 3.3708927631378174, "learning_rate": 1.6762342135476465e-07, "logits/chosen": -2.840583324432373, "logits/rejected": -2.8144376277923584, "logps/chosen": -63.61417770385742, "logps/rejected": -63.9477653503418, "loss": 0.6756, "rewards/accuracies": 0.59375, "rewards/chosen": -0.10387835651636124, "rewards/margins": 0.038820572197437286, "rewards/rejected": -0.14269892871379852, "step": 1460 }, { "epoch": 0.2532736044107512, "grad_norm": 3.427384614944458, "learning_rate": 1.687715269804822e-07, "logits/chosen": -2.9569153785705566, "logits/rejected": -2.9273934364318848, "logps/chosen": -67.2813720703125, "logps/rejected": -66.46237182617188, "loss": 0.6765, "rewards/accuracies": 0.625, "rewards/chosen": -0.08899648487567902, "rewards/margins": 0.03610143065452576, "rewards/rejected": -0.12509790062904358, "step": 1470 }, { "epoch": 0.2549965541006203, "grad_norm": 3.171168565750122, "learning_rate": 1.6991963260619977e-07, "logits/chosen": -2.9081575870513916, "logits/rejected": -2.8784821033477783, "logps/chosen": -61.340660095214844, "logps/rejected": -63.49712371826172, "loss": 0.6797, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.09166257083415985, "rewards/margins": 0.029424916952848434, "rewards/rejected": -0.12108749151229858, "step": 1480 }, { "epoch": 0.2567195037904893, "grad_norm": 3.0038366317749023, "learning_rate": 1.7106773823191734e-07, "logits/chosen": -2.890097141265869, "logits/rejected": -2.8695075511932373, "logps/chosen": -68.01859283447266, "logps/rejected": -64.6750259399414, "loss": 0.6807, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.09513401985168457, "rewards/margins": 0.027182336896657944, "rewards/rejected": -0.12231633812189102, "step": 1490 }, { "epoch": 0.2584424534803584, "grad_norm": 3.3400094509124756, "learning_rate": 1.722158438576349e-07, "logits/chosen": -2.8165030479431152, "logits/rejected": -2.815304756164551, "logps/chosen": -62.07170486450195, "logps/rejected": -65.22959899902344, "loss": 0.6835, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.1098056212067604, "rewards/margins": 0.02261367253959179, "rewards/rejected": -0.13241930305957794, "step": 1500 }, { "epoch": 0.2601654031702274, "grad_norm": 4.100131988525391, "learning_rate": 1.7336394948335246e-07, "logits/chosen": -2.81020188331604, "logits/rejected": -2.7851126194000244, "logps/chosen": -68.38099670410156, "logps/rejected": -66.84229278564453, "loss": 0.6716, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.09384194761514664, "rewards/margins": 0.04618433117866516, "rewards/rejected": -0.1400262713432312, "step": 1510 }, { "epoch": 0.2618883528600965, "grad_norm": 3.5354981422424316, "learning_rate": 1.7451205510907003e-07, "logits/chosen": -2.865053653717041, "logits/rejected": -2.862806558609009, "logps/chosen": -64.65748596191406, "logps/rejected": -63.5691032409668, "loss": 0.6868, "rewards/accuracies": 0.5625, "rewards/chosen": -0.11419986188411713, "rewards/margins": 0.01575290784239769, "rewards/rejected": -0.12995277345180511, "step": 1520 }, { "epoch": 0.26361130254996556, "grad_norm": 3.4934535026550293, "learning_rate": 1.756601607347876e-07, "logits/chosen": -2.794623851776123, "logits/rejected": -2.7821030616760254, "logps/chosen": -65.29815673828125, "logps/rejected": -64.14790344238281, "loss": 0.6832, "rewards/accuracies": 0.59375, "rewards/chosen": -0.12472577393054962, "rewards/margins": 0.023620011284947395, "rewards/rejected": -0.14834578335285187, "step": 1530 }, { "epoch": 0.2653342522398346, "grad_norm": 3.742764711380005, "learning_rate": 1.7680826636050515e-07, "logits/chosen": -2.871792793273926, "logits/rejected": -2.846169948577881, "logps/chosen": -68.71684265136719, "logps/rejected": -66.43431091308594, "loss": 0.675, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.12237291038036346, "rewards/margins": 0.040168728679418564, "rewards/rejected": -0.16254164278507233, "step": 1540 }, { "epoch": 0.26705720192970367, "grad_norm": 3.3786399364471436, "learning_rate": 1.7795637198622273e-07, "logits/chosen": -2.806138038635254, "logits/rejected": -2.7949366569519043, "logps/chosen": -67.20889282226562, "logps/rejected": -65.76429748535156, "loss": 0.6752, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1240202784538269, "rewards/margins": 0.03948847949504852, "rewards/rejected": -0.16350875794887543, "step": 1550 }, { "epoch": 0.2687801516195727, "grad_norm": 3.4467484951019287, "learning_rate": 1.791044776119403e-07, "logits/chosen": -2.817890167236328, "logits/rejected": -2.80246639251709, "logps/chosen": -65.50123596191406, "logps/rejected": -66.32374572753906, "loss": 0.6821, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.13890376687049866, "rewards/margins": 0.02497437223792076, "rewards/rejected": -0.16387812793254852, "step": 1560 }, { "epoch": 0.2705031013094418, "grad_norm": 3.2813069820404053, "learning_rate": 1.8025258323765787e-07, "logits/chosen": -2.8562819957733154, "logits/rejected": -2.861222982406616, "logps/chosen": -67.8829116821289, "logps/rejected": -72.48780822753906, "loss": 0.6844, "rewards/accuracies": 0.5625, "rewards/chosen": -0.14664840698242188, "rewards/margins": 0.021208012476563454, "rewards/rejected": -0.16785642504692078, "step": 1570 }, { "epoch": 0.2722260509993108, "grad_norm": 3.9598214626312256, "learning_rate": 1.8140068886337542e-07, "logits/chosen": -2.8063104152679443, "logits/rejected": -2.801279306411743, "logps/chosen": -67.23238372802734, "logps/rejected": -71.10231018066406, "loss": 0.6778, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.143377423286438, "rewards/margins": 0.035317786037921906, "rewards/rejected": -0.1786952167749405, "step": 1580 }, { "epoch": 0.2739490006891799, "grad_norm": 5.066261291503906, "learning_rate": 1.82548794489093e-07, "logits/chosen": -2.838998317718506, "logits/rejected": -2.8293375968933105, "logps/chosen": -68.40142822265625, "logps/rejected": -72.52519226074219, "loss": 0.6739, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.1380554735660553, "rewards/margins": 0.04200926795601845, "rewards/rejected": -0.18006475269794464, "step": 1590 }, { "epoch": 0.27567195037904896, "grad_norm": 3.795305013656616, "learning_rate": 1.8369690011481057e-07, "logits/chosen": -2.824371814727783, "logits/rejected": -2.8006680011749268, "logps/chosen": -68.08535766601562, "logps/rejected": -69.51658630371094, "loss": 0.676, "rewards/accuracies": 0.59375, "rewards/chosen": -0.14742091298103333, "rewards/margins": 0.03879866749048233, "rewards/rejected": -0.18621957302093506, "step": 1600 }, { "epoch": 0.27567195037904896, "eval_logits/chosen": -2.8975741863250732, "eval_logits/rejected": -2.8941969871520996, "eval_logps/chosen": -69.5812759399414, "eval_logps/rejected": -75.9064712524414, "eval_loss": 0.6819967031478882, "eval_rewards/accuracies": 0.5850371718406677, "eval_rewards/chosen": -0.10565808415412903, "eval_rewards/margins": 0.025910574942827225, "eval_rewards/rejected": -0.13156867027282715, "eval_runtime": 384.5219, "eval_samples_per_second": 11.193, "eval_steps_per_second": 1.399, "step": 1600 }, { "epoch": 0.277394900068918, "grad_norm": 4.144330978393555, "learning_rate": 1.848450057405281e-07, "logits/chosen": -2.845106840133667, "logits/rejected": -2.825932502746582, "logps/chosen": -73.3796615600586, "logps/rejected": -77.50773620605469, "loss": 0.6769, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.17480914294719696, "rewards/margins": 0.03731060028076172, "rewards/rejected": -0.21211972832679749, "step": 1610 }, { "epoch": 0.27911784975878706, "grad_norm": 3.920064926147461, "learning_rate": 1.8599311136624569e-07, "logits/chosen": -2.7817695140838623, "logits/rejected": -2.765631914138794, "logps/chosen": -71.98372650146484, "logps/rejected": -76.48005676269531, "loss": 0.6773, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.194097101688385, "rewards/margins": 0.037283383309841156, "rewards/rejected": -0.23138046264648438, "step": 1620 }, { "epoch": 0.2808407994486561, "grad_norm": 5.102138996124268, "learning_rate": 1.8714121699196326e-07, "logits/chosen": -2.826125383377075, "logits/rejected": -2.8114256858825684, "logps/chosen": -73.12959289550781, "logps/rejected": -71.62154388427734, "loss": 0.6734, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.1605428159236908, "rewards/margins": 0.04527025297284126, "rewards/rejected": -0.20581302046775818, "step": 1630 }, { "epoch": 0.28256374913852517, "grad_norm": 3.954345226287842, "learning_rate": 1.8828932261768083e-07, "logits/chosen": -2.8946547508239746, "logits/rejected": -2.8683266639709473, "logps/chosen": -75.14608001708984, "logps/rejected": -70.92219543457031, "loss": 0.6789, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.1698450744152069, "rewards/margins": 0.034184712916612625, "rewards/rejected": -0.20402979850769043, "step": 1640 }, { "epoch": 0.2842866988283942, "grad_norm": 4.2861151695251465, "learning_rate": 1.8943742824339838e-07, "logits/chosen": -2.864513874053955, "logits/rejected": -2.8481781482696533, "logps/chosen": -74.4118423461914, "logps/rejected": -76.10733795166016, "loss": 0.6717, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.16274502873420715, "rewards/margins": 0.04893755167722702, "rewards/rejected": -0.21168258786201477, "step": 1650 }, { "epoch": 0.28600964851826327, "grad_norm": 4.130656719207764, "learning_rate": 1.9058553386911595e-07, "logits/chosen": -2.8591482639312744, "logits/rejected": -2.830831527709961, "logps/chosen": -73.754638671875, "logps/rejected": -70.54668426513672, "loss": 0.6742, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.17435503005981445, "rewards/margins": 0.04359282925724983, "rewards/rejected": -0.21794787049293518, "step": 1660 }, { "epoch": 0.2877325982081323, "grad_norm": 4.471282958984375, "learning_rate": 1.9173363949483353e-07, "logits/chosen": -2.8563740253448486, "logits/rejected": -2.8407225608825684, "logps/chosen": -72.65254974365234, "logps/rejected": -74.63856506347656, "loss": 0.6757, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.16366687417030334, "rewards/margins": 0.039933811873197556, "rewards/rejected": -0.2036006897687912, "step": 1670 }, { "epoch": 0.2894555478980014, "grad_norm": 4.474172115325928, "learning_rate": 1.928817451205511e-07, "logits/chosen": -2.8281986713409424, "logits/rejected": -2.8113129138946533, "logps/chosen": -75.64710998535156, "logps/rejected": -76.26982116699219, "loss": 0.6753, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.18650095164775848, "rewards/margins": 0.04180046170949936, "rewards/rejected": -0.22830140590667725, "step": 1680 }, { "epoch": 0.29117849758787046, "grad_norm": 4.739830493927002, "learning_rate": 1.9402985074626865e-07, "logits/chosen": -2.8134493827819824, "logits/rejected": -2.8036468029022217, "logps/chosen": -73.00105285644531, "logps/rejected": -70.45274353027344, "loss": 0.6853, "rewards/accuracies": 0.5625, "rewards/chosen": -0.17002905905246735, "rewards/margins": 0.019979029893875122, "rewards/rejected": -0.19000807404518127, "step": 1690 }, { "epoch": 0.2929014472777395, "grad_norm": 4.144531726837158, "learning_rate": 1.9517795637198622e-07, "logits/chosen": -2.769505262374878, "logits/rejected": -2.7662644386291504, "logps/chosen": -69.11921691894531, "logps/rejected": -75.20559692382812, "loss": 0.685, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.18292875587940216, "rewards/margins": 0.021692339330911636, "rewards/rejected": -0.2046210765838623, "step": 1700 }, { "epoch": 0.29462439696760856, "grad_norm": 4.729769706726074, "learning_rate": 1.963260619977038e-07, "logits/chosen": -2.8375561237335205, "logits/rejected": -2.8075573444366455, "logps/chosen": -76.96826171875, "logps/rejected": -75.673583984375, "loss": 0.6739, "rewards/accuracies": 0.59375, "rewards/chosen": -0.17701391875743866, "rewards/margins": 0.043523646891117096, "rewards/rejected": -0.22053758800029755, "step": 1710 }, { "epoch": 0.2963473466574776, "grad_norm": 4.864485740661621, "learning_rate": 1.9747416762342134e-07, "logits/chosen": -2.818294048309326, "logits/rejected": -2.7952685356140137, "logps/chosen": -76.0293197631836, "logps/rejected": -72.1862564086914, "loss": 0.6797, "rewards/accuracies": 0.59375, "rewards/chosen": -0.18885084986686707, "rewards/margins": 0.03140731528401375, "rewards/rejected": -0.2202581912279129, "step": 1720 }, { "epoch": 0.29807029634734666, "grad_norm": 4.814679145812988, "learning_rate": 1.9862227324913891e-07, "logits/chosen": -2.8460941314697266, "logits/rejected": -2.8359787464141846, "logps/chosen": -76.57044219970703, "logps/rejected": -74.95271301269531, "loss": 0.6882, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.21247780323028564, "rewards/margins": 0.016523724421858788, "rewards/rejected": -0.22900155186653137, "step": 1730 }, { "epoch": 0.2997932460372157, "grad_norm": 4.04186487197876, "learning_rate": 1.997703788748565e-07, "logits/chosen": -2.8214850425720215, "logits/rejected": -2.814258098602295, "logps/chosen": -73.9696273803711, "logps/rejected": -79.91903686523438, "loss": 0.6719, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2130396068096161, "rewards/margins": 0.049149997532367706, "rewards/rejected": -0.2621895968914032, "step": 1740 }, { "epoch": 0.30151619572708477, "grad_norm": 4.963952541351318, "learning_rate": 1.999998713790723e-07, "logits/chosen": -2.817506790161133, "logits/rejected": -2.8111929893493652, "logps/chosen": -76.08406066894531, "logps/rejected": -80.0066909790039, "loss": 0.6764, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.20121629536151886, "rewards/margins": 0.03890758752822876, "rewards/rejected": -0.24012386798858643, "step": 1750 }, { "epoch": 0.30323914541695385, "grad_norm": 4.477365493774414, "learning_rate": 1.999993488571206e-07, "logits/chosen": -2.837259292602539, "logits/rejected": -2.8106486797332764, "logps/chosen": -76.87786865234375, "logps/rejected": -76.6913070678711, "loss": 0.6719, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.19860495626926422, "rewards/margins": 0.04847431927919388, "rewards/rejected": -0.2470792829990387, "step": 1760 }, { "epoch": 0.3049620951068229, "grad_norm": 4.9479498863220215, "learning_rate": 1.9999842439743547e-07, "logits/chosen": -2.833933115005493, "logits/rejected": -2.8048694133758545, "logps/chosen": -75.09016418457031, "logps/rejected": -72.16284942626953, "loss": 0.6684, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1871923804283142, "rewards/margins": 0.05504288524389267, "rewards/rejected": -0.2422352284193039, "step": 1770 }, { "epoch": 0.30668504479669195, "grad_norm": 4.612185955047607, "learning_rate": 1.999970980037328e-07, "logits/chosen": -2.7986302375793457, "logits/rejected": -2.7996907234191895, "logps/chosen": -74.53868865966797, "logps/rejected": -82.4207992553711, "loss": 0.6697, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.19617575407028198, "rewards/margins": 0.05358383059501648, "rewards/rejected": -0.24975958466529846, "step": 1780 }, { "epoch": 0.308407994486561, "grad_norm": 5.077559471130371, "learning_rate": 1.999953696813438e-07, "logits/chosen": -2.8587779998779297, "logits/rejected": -2.8454761505126953, "logps/chosen": -75.32337188720703, "logps/rejected": -79.9382553100586, "loss": 0.6684, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.22314408421516418, "rewards/margins": 0.05672816187143326, "rewards/rejected": -0.27987223863601685, "step": 1790 }, { "epoch": 0.31013094417643006, "grad_norm": 5.012825965881348, "learning_rate": 1.9999323943721533e-07, "logits/chosen": -2.857203960418701, "logits/rejected": -2.8387646675109863, "logps/chosen": -76.38417053222656, "logps/rejected": -79.95955657958984, "loss": 0.6739, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.24100014567375183, "rewards/margins": 0.044186659157276154, "rewards/rejected": -0.2851868271827698, "step": 1800 }, { "epoch": 0.3118538938662991, "grad_norm": 6.740519046783447, "learning_rate": 1.9999070727990972e-07, "logits/chosen": -2.835402727127075, "logits/rejected": -2.8102526664733887, "logps/chosen": -81.8633041381836, "logps/rejected": -81.54666137695312, "loss": 0.6754, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.24266846477985382, "rewards/margins": 0.04269016906619072, "rewards/rejected": -0.28535860776901245, "step": 1810 }, { "epoch": 0.31357684355616816, "grad_norm": 5.182940483093262, "learning_rate": 1.999877732196047e-07, "logits/chosen": -2.8230953216552734, "logits/rejected": -2.799881935119629, "logps/chosen": -81.13172912597656, "logps/rejected": -79.09537506103516, "loss": 0.6814, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.25399473309516907, "rewards/margins": 0.031056974083185196, "rewards/rejected": -0.28505173325538635, "step": 1820 }, { "epoch": 0.31529979324603724, "grad_norm": 6.279678821563721, "learning_rate": 1.9998443726809344e-07, "logits/chosen": -2.760938882827759, "logits/rejected": -2.7558131217956543, "logps/chosen": -77.61692810058594, "logps/rejected": -80.39018249511719, "loss": 0.6739, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2132168710231781, "rewards/margins": 0.04549407586455345, "rewards/rejected": -0.25871092081069946, "step": 1830 }, { "epoch": 0.31702274293590627, "grad_norm": 5.1286301612854, "learning_rate": 1.9998069943878452e-07, "logits/chosen": -2.885629415512085, "logits/rejected": -2.880615472793579, "logps/chosen": -81.87084197998047, "logps/rejected": -84.01935577392578, "loss": 0.6761, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.259524941444397, "rewards/margins": 0.04153280705213547, "rewards/rejected": -0.30105772614479065, "step": 1840 }, { "epoch": 0.31874569262577535, "grad_norm": 6.253596782684326, "learning_rate": 1.9997655974670177e-07, "logits/chosen": -2.8121185302734375, "logits/rejected": -2.8132262229919434, "logps/chosen": -80.99190521240234, "logps/rejected": -83.85938262939453, "loss": 0.6805, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.2681719660758972, "rewards/margins": 0.032716695219278336, "rewards/rejected": -0.30088865756988525, "step": 1850 }, { "epoch": 0.32046864231564437, "grad_norm": 4.878740310668945, "learning_rate": 1.9997201820848421e-07, "logits/chosen": -2.771169424057007, "logits/rejected": -2.7482142448425293, "logps/chosen": -80.3410415649414, "logps/rejected": -78.98155975341797, "loss": 0.6681, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2096785604953766, "rewards/margins": 0.056961387395858765, "rewards/rejected": -0.26663994789123535, "step": 1860 }, { "epoch": 0.32219159200551345, "grad_norm": 8.953726768493652, "learning_rate": 1.999670748423862e-07, "logits/chosen": -2.773393154144287, "logits/rejected": -2.7547454833984375, "logps/chosen": -77.43568420410156, "logps/rejected": -80.14945983886719, "loss": 0.6631, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.20601901412010193, "rewards/margins": 0.06925632059574127, "rewards/rejected": -0.2752753496170044, "step": 1870 }, { "epoch": 0.3239145416953825, "grad_norm": 5.518745422363281, "learning_rate": 1.9996172966827712e-07, "logits/chosen": -2.8194966316223145, "logits/rejected": -2.7963428497314453, "logps/chosen": -76.18521118164062, "logps/rejected": -78.38603210449219, "loss": 0.668, "rewards/accuracies": 0.625, "rewards/chosen": -0.21804073452949524, "rewards/margins": 0.0592648908495903, "rewards/rejected": -0.27730563282966614, "step": 1880 }, { "epoch": 0.32563749138525155, "grad_norm": 5.660772323608398, "learning_rate": 1.9995598270764132e-07, "logits/chosen": -2.8152401447296143, "logits/rejected": -2.809969902038574, "logps/chosen": -75.25090789794922, "logps/rejected": -80.28077697753906, "loss": 0.67, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.21531884372234344, "rewards/margins": 0.05318418890237808, "rewards/rejected": -0.2685030400753021, "step": 1890 }, { "epoch": 0.32736044107512063, "grad_norm": 5.434048175811768, "learning_rate": 1.9994983398357822e-07, "logits/chosen": -2.803793430328369, "logits/rejected": -2.7829272747039795, "logps/chosen": -82.60762786865234, "logps/rejected": -79.85516357421875, "loss": 0.675, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.24345216155052185, "rewards/margins": 0.043585650622844696, "rewards/rejected": -0.28703781962394714, "step": 1900 }, { "epoch": 0.32908339076498966, "grad_norm": 5.854109764099121, "learning_rate": 1.9994328352080197e-07, "logits/chosen": -2.7232251167297363, "logits/rejected": -2.6980268955230713, "logps/chosen": -81.93873596191406, "logps/rejected": -87.40576171875, "loss": 0.6598, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.27153441309928894, "rewards/margins": 0.07509879767894745, "rewards/rejected": -0.3466332256793976, "step": 1910 }, { "epoch": 0.33080634045485874, "grad_norm": 7.13450288772583, "learning_rate": 1.9993633134564157e-07, "logits/chosen": -2.7804644107818604, "logits/rejected": -2.7603726387023926, "logps/chosen": -85.59014892578125, "logps/rejected": -87.0826187133789, "loss": 0.6706, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.2912139296531677, "rewards/margins": 0.054294537752866745, "rewards/rejected": -0.34550851583480835, "step": 1920 }, { "epoch": 0.33252929014472776, "grad_norm": 7.671220779418945, "learning_rate": 1.9992897748604057e-07, "logits/chosen": -2.740054130554199, "logits/rejected": -2.715630054473877, "logps/chosen": -86.39445495605469, "logps/rejected": -88.7936019897461, "loss": 0.6712, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.3088904023170471, "rewards/margins": 0.0533197820186615, "rewards/rejected": -0.36221015453338623, "step": 1930 }, { "epoch": 0.33425223983459684, "grad_norm": 6.598614692687988, "learning_rate": 1.9992122197155713e-07, "logits/chosen": -2.7623841762542725, "logits/rejected": -2.752276659011841, "logps/chosen": -79.55069732666016, "logps/rejected": -82.56233215332031, "loss": 0.6708, "rewards/accuracies": 0.625, "rewards/chosen": -0.27555832266807556, "rewards/margins": 0.05399967357516289, "rewards/rejected": -0.32955801486968994, "step": 1940 }, { "epoch": 0.33597518952446587, "grad_norm": 8.005949020385742, "learning_rate": 1.9991306483336379e-07, "logits/chosen": -2.789032459259033, "logits/rejected": -2.787119150161743, "logps/chosen": -82.3895034790039, "logps/rejected": -88.818603515625, "loss": 0.6738, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.2936074137687683, "rewards/margins": 0.04737875610589981, "rewards/rejected": -0.3409861624240875, "step": 1950 }, { "epoch": 0.33769813921433495, "grad_norm": 6.6296305656433105, "learning_rate": 1.9990450610424739e-07, "logits/chosen": -2.7821130752563477, "logits/rejected": -2.7680463790893555, "logps/chosen": -85.0245590209961, "logps/rejected": -89.01289367675781, "loss": 0.6709, "rewards/accuracies": 0.59375, "rewards/chosen": -0.29906749725341797, "rewards/margins": 0.053921766579151154, "rewards/rejected": -0.3529892563819885, "step": 1960 }, { "epoch": 0.33942108890420397, "grad_norm": 5.420909881591797, "learning_rate": 1.9989554581860885e-07, "logits/chosen": -2.7975800037384033, "logits/rejected": -2.7792773246765137, "logps/chosen": -86.5243148803711, "logps/rejected": -83.91753387451172, "loss": 0.6821, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.29315876960754395, "rewards/margins": 0.032392654567956924, "rewards/rejected": -0.32555145025253296, "step": 1970 }, { "epoch": 0.34114403859407305, "grad_norm": 5.429892063140869, "learning_rate": 1.9988618401246327e-07, "logits/chosen": -2.7424893379211426, "logits/rejected": -2.7354159355163574, "logps/chosen": -85.56155395507812, "logps/rejected": -84.77156066894531, "loss": 0.6859, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2920836806297302, "rewards/margins": 0.022945603355765343, "rewards/rejected": -0.31502923369407654, "step": 1980 }, { "epoch": 0.34286698828394213, "grad_norm": 4.82956600189209, "learning_rate": 1.9987642072343948e-07, "logits/chosen": -2.8278892040252686, "logits/rejected": -2.801731586456299, "logps/chosen": -78.64983367919922, "logps/rejected": -82.09465789794922, "loss": 0.6565, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2664022147655487, "rewards/margins": 0.08327129483222961, "rewards/rejected": -0.34967344999313354, "step": 1990 }, { "epoch": 0.34458993797381116, "grad_norm": 7.895046234130859, "learning_rate": 1.9986625599078007e-07, "logits/chosen": -2.777226686477661, "logits/rejected": -2.7833473682403564, "logps/chosen": -78.89508056640625, "logps/rejected": -88.81587982177734, "loss": 0.6751, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.2843848168849945, "rewards/margins": 0.04450703412294388, "rewards/rejected": -0.3288918137550354, "step": 2000 }, { "epoch": 0.34458993797381116, "eval_logits/chosen": -2.8467745780944824, "eval_logits/rejected": -2.843392848968506, "eval_logps/chosen": -76.16111755371094, "eval_logps/rejected": -83.7308349609375, "eval_loss": 0.677007257938385, "eval_rewards/accuracies": 0.5889869928359985, "eval_rewards/chosen": -0.17145642638206482, "eval_rewards/margins": 0.03835584595799446, "eval_rewards/rejected": -0.20981229841709137, "eval_runtime": 398.552, "eval_samples_per_second": 10.799, "eval_steps_per_second": 1.35, "step": 2000 }, { "epoch": 0.34631288766368024, "grad_norm": 7.642477035522461, "learning_rate": 1.9985568985534123e-07, "logits/chosen": -2.80598521232605, "logits/rejected": -2.784700393676758, "logps/chosen": -80.71785736083984, "logps/rejected": -80.69572448730469, "loss": 0.6681, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.23994019627571106, "rewards/margins": 0.05967951938509941, "rewards/rejected": -0.29961973428726196, "step": 2010 }, { "epoch": 0.34803583735354926, "grad_norm": 6.802818298339844, "learning_rate": 1.9984472235959246e-07, "logits/chosen": -2.7718281745910645, "logits/rejected": -2.7579102516174316, "logps/chosen": -77.16645812988281, "logps/rejected": -87.4555435180664, "loss": 0.6615, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2625424563884735, "rewards/margins": 0.07505720853805542, "rewards/rejected": -0.33759966492652893, "step": 2020 }, { "epoch": 0.34975878704341834, "grad_norm": 7.350240707397461, "learning_rate": 1.9983335354761662e-07, "logits/chosen": -2.8413634300231934, "logits/rejected": -2.824218988418579, "logps/chosen": -84.81204986572266, "logps/rejected": -88.4823226928711, "loss": 0.6693, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.27297335863113403, "rewards/margins": 0.058346938341856, "rewards/rejected": -0.33132028579711914, "step": 2030 }, { "epoch": 0.35148173673328736, "grad_norm": 6.206594467163086, "learning_rate": 1.9982158346510952e-07, "logits/chosen": -2.740448474884033, "logits/rejected": -2.735485076904297, "logps/chosen": -81.70101165771484, "logps/rejected": -87.45999908447266, "loss": 0.6639, "rewards/accuracies": 0.625, "rewards/chosen": -0.2743123173713684, "rewards/margins": 0.07054021954536438, "rewards/rejected": -0.3448525071144104, "step": 2040 }, { "epoch": 0.35320468642315644, "grad_norm": 6.593442440032959, "learning_rate": 1.998094121593799e-07, "logits/chosen": -2.806100845336914, "logits/rejected": -2.7891507148742676, "logps/chosen": -77.92963409423828, "logps/rejected": -85.45118713378906, "loss": 0.6689, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2585117518901825, "rewards/margins": 0.058480918407440186, "rewards/rejected": -0.31699270009994507, "step": 2050 }, { "epoch": 0.3549276361130255, "grad_norm": 7.442837238311768, "learning_rate": 1.9979683967934911e-07, "logits/chosen": -2.809927463531494, "logits/rejected": -2.784133195877075, "logps/chosen": -80.66841125488281, "logps/rejected": -82.72510528564453, "loss": 0.6659, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.24928779900074005, "rewards/margins": 0.06473737210035324, "rewards/rejected": -0.3140251934528351, "step": 2060 }, { "epoch": 0.35665058580289455, "grad_norm": 6.645026683807373, "learning_rate": 1.9978386607555103e-07, "logits/chosen": -2.825409412384033, "logits/rejected": -2.8098556995391846, "logps/chosen": -84.98794555664062, "logps/rejected": -89.08955383300781, "loss": 0.6676, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2859798073768616, "rewards/margins": 0.06237654760479927, "rewards/rejected": -0.34835633635520935, "step": 2070 }, { "epoch": 0.35837353549276363, "grad_norm": 5.906277179718018, "learning_rate": 1.9977049140013183e-07, "logits/chosen": -2.758378744125366, "logits/rejected": -2.7402453422546387, "logps/chosen": -83.45210266113281, "logps/rejected": -87.67256927490234, "loss": 0.6625, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3064289689064026, "rewards/margins": 0.0735897570848465, "rewards/rejected": -0.3800187110900879, "step": 2080 }, { "epoch": 0.36009648518263265, "grad_norm": 5.731607437133789, "learning_rate": 1.997567157068497e-07, "logits/chosen": -2.7923924922943115, "logits/rejected": -2.7936103343963623, "logps/chosen": -86.95465087890625, "logps/rejected": -90.43916320800781, "loss": 0.6733, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.31404390931129456, "rewards/margins": 0.051842041313648224, "rewards/rejected": -0.36588597297668457, "step": 2090 }, { "epoch": 0.36181943487250173, "grad_norm": 5.9960479736328125, "learning_rate": 1.997425390510747e-07, "logits/chosen": -2.756856918334961, "logits/rejected": -2.741403579711914, "logps/chosen": -85.68098449707031, "logps/rejected": -86.40949249267578, "loss": 0.6662, "rewards/accuracies": 0.625, "rewards/chosen": -0.2941354513168335, "rewards/margins": 0.0659971535205841, "rewards/rejected": -0.3601325750350952, "step": 2100 }, { "epoch": 0.36354238456237076, "grad_norm": 6.551288604736328, "learning_rate": 1.9972796148978856e-07, "logits/chosen": -2.7723071575164795, "logits/rejected": -2.7736432552337646, "logps/chosen": -80.19908905029297, "logps/rejected": -90.48481750488281, "loss": 0.6665, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.29734283685684204, "rewards/margins": 0.06577859073877335, "rewards/rejected": -0.3631214201450348, "step": 2110 }, { "epoch": 0.36526533425223984, "grad_norm": 6.967728137969971, "learning_rate": 1.9971298308158441e-07, "logits/chosen": -2.7236945629119873, "logits/rejected": -2.7059988975524902, "logps/chosen": -80.61097717285156, "logps/rejected": -84.89838409423828, "loss": 0.6569, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.26060813665390015, "rewards/margins": 0.08486412465572357, "rewards/rejected": -0.3454722762107849, "step": 2120 }, { "epoch": 0.3669882839421089, "grad_norm": 9.18038558959961, "learning_rate": 1.9969760388666645e-07, "logits/chosen": -2.7267282009124756, "logits/rejected": -2.7112841606140137, "logps/chosen": -85.85907745361328, "logps/rejected": -92.39701843261719, "loss": 0.6504, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3092494308948517, "rewards/margins": 0.0988670364022255, "rewards/rejected": -0.4081164300441742, "step": 2130 }, { "epoch": 0.36871123363197794, "grad_norm": 8.384355545043945, "learning_rate": 1.996818239668499e-07, "logits/chosen": -2.7107224464416504, "logits/rejected": -2.705289125442505, "logps/chosen": -81.84632110595703, "logps/rejected": -93.03125762939453, "loss": 0.6584, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.32457980513572693, "rewards/margins": 0.07963573187589645, "rewards/rejected": -0.4042155146598816, "step": 2140 }, { "epoch": 0.370434183321847, "grad_norm": 7.8044514656066895, "learning_rate": 1.9966564338556065e-07, "logits/chosen": -2.733471155166626, "logits/rejected": -2.701479434967041, "logps/chosen": -87.98902893066406, "logps/rejected": -88.12610626220703, "loss": 0.6534, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3127537965774536, "rewards/margins": 0.09064503014087677, "rewards/rejected": -0.4033988416194916, "step": 2150 }, { "epoch": 0.37215713301171605, "grad_norm": 8.45705509185791, "learning_rate": 1.9964906220783492e-07, "logits/chosen": -2.716222047805786, "logits/rejected": -2.705082893371582, "logps/chosen": -94.84622955322266, "logps/rejected": -92.92621612548828, "loss": 0.6708, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3836338222026825, "rewards/margins": 0.056928135454654694, "rewards/rejected": -0.4405619502067566, "step": 2160 }, { "epoch": 0.3738800827015851, "grad_norm": 8.92052936553955, "learning_rate": 1.9963208050031922e-07, "logits/chosen": -2.8072280883789062, "logits/rejected": -2.7956135272979736, "logps/chosen": -93.62696838378906, "logps/rejected": -100.60514831542969, "loss": 0.6451, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.386393666267395, "rewards/margins": 0.11263259500265121, "rewards/rejected": -0.49902623891830444, "step": 2170 }, { "epoch": 0.37560303239145415, "grad_norm": 9.09363842010498, "learning_rate": 1.9961469833126987e-07, "logits/chosen": -2.8279776573181152, "logits/rejected": -2.803985595703125, "logps/chosen": -106.29798889160156, "logps/rejected": -105.96732330322266, "loss": 0.6672, "rewards/accuracies": 0.5625, "rewards/chosen": -0.46758827567100525, "rewards/margins": 0.07012106478214264, "rewards/rejected": -0.5377094149589539, "step": 2180 }, { "epoch": 0.37732598208132323, "grad_norm": 7.870185375213623, "learning_rate": 1.995969157705528e-07, "logits/chosen": -2.871009588241577, "logits/rejected": -2.8679919242858887, "logps/chosen": -95.17460632324219, "logps/rejected": -101.10989379882812, "loss": 0.67, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4195151925086975, "rewards/margins": 0.060843341052532196, "rewards/rejected": -0.4803585410118103, "step": 2190 }, { "epoch": 0.37904893177119225, "grad_norm": 8.457090377807617, "learning_rate": 1.995787328896433e-07, "logits/chosen": -2.7545523643493652, "logits/rejected": -2.7420895099639893, "logps/chosen": -90.79286193847656, "logps/rejected": -100.09413146972656, "loss": 0.659, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3792213797569275, "rewards/margins": 0.08726942539215088, "rewards/rejected": -0.4664907455444336, "step": 2200 }, { "epoch": 0.38077188146106133, "grad_norm": 8.350848197937012, "learning_rate": 1.9956014976162572e-07, "logits/chosen": -2.796229839324951, "logits/rejected": -2.7860822677612305, "logps/chosen": -92.96107482910156, "logps/rejected": -95.38506317138672, "loss": 0.6708, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.37106844782829285, "rewards/margins": 0.058512069284915924, "rewards/rejected": -0.42958053946495056, "step": 2210 }, { "epoch": 0.3824948311509304, "grad_norm": 8.519368171691895, "learning_rate": 1.9954116646119315e-07, "logits/chosen": -2.674560308456421, "logits/rejected": -2.668351650238037, "logps/chosen": -90.71675872802734, "logps/rejected": -96.26646423339844, "loss": 0.6669, "rewards/accuracies": 0.625, "rewards/chosen": -0.35924988985061646, "rewards/margins": 0.06914254277944565, "rewards/rejected": -0.4283924698829651, "step": 2220 }, { "epoch": 0.38421778084079944, "grad_norm": 8.899928092956543, "learning_rate": 1.9952178306464708e-07, "logits/chosen": -2.7807633876800537, "logits/rejected": -2.7601094245910645, "logps/chosen": -92.98501586914062, "logps/rejected": -93.89694213867188, "loss": 0.6722, "rewards/accuracies": 0.625, "rewards/chosen": -0.38546445965766907, "rewards/margins": 0.054776061326265335, "rewards/rejected": -0.4402404725551605, "step": 2230 }, { "epoch": 0.3859407305306685, "grad_norm": 7.316110610961914, "learning_rate": 1.9950199964989728e-07, "logits/chosen": -2.772390604019165, "logits/rejected": -2.74361515045166, "logps/chosen": -94.11860656738281, "logps/rejected": -94.66634368896484, "loss": 0.6754, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.367158979177475, "rewards/margins": 0.050020165741443634, "rewards/rejected": -0.4171791076660156, "step": 2240 }, { "epoch": 0.38766368022053754, "grad_norm": 13.700274467468262, "learning_rate": 1.9948181629646125e-07, "logits/chosen": -2.707676410675049, "logits/rejected": -2.6810202598571777, "logps/chosen": -93.48914337158203, "logps/rejected": -94.83229064941406, "loss": 0.672, "rewards/accuracies": 0.625, "rewards/chosen": -0.3691152334213257, "rewards/margins": 0.05425702780485153, "rewards/rejected": -0.4233722686767578, "step": 2250 }, { "epoch": 0.3893866299104066, "grad_norm": 7.7341108322143555, "learning_rate": 1.99461233085464e-07, "logits/chosen": -2.712756395339966, "logits/rejected": -2.686300039291382, "logps/chosen": -96.61649322509766, "logps/rejected": -99.14938354492188, "loss": 0.6654, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3552453815937042, "rewards/margins": 0.07118260115385056, "rewards/rejected": -0.4264279901981354, "step": 2260 }, { "epoch": 0.39110957960027565, "grad_norm": 8.820086479187012, "learning_rate": 1.9944025009963783e-07, "logits/chosen": -2.701951265335083, "logits/rejected": -2.6750802993774414, "logps/chosen": -91.39891052246094, "logps/rejected": -94.74635314941406, "loss": 0.6633, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.364845335483551, "rewards/margins": 0.07676877081394196, "rewards/rejected": -0.4416140615940094, "step": 2270 }, { "epoch": 0.3928325292901447, "grad_norm": 7.709187984466553, "learning_rate": 1.9941886742332175e-07, "logits/chosen": -2.724069833755493, "logits/rejected": -2.720072031021118, "logps/chosen": -89.43894958496094, "logps/rejected": -98.11272430419922, "loss": 0.6601, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.34747377038002014, "rewards/margins": 0.08022281527519226, "rewards/rejected": -0.42769655585289, "step": 2280 }, { "epoch": 0.3945554789800138, "grad_norm": 7.868160247802734, "learning_rate": 1.9939708514246143e-07, "logits/chosen": -2.6722376346588135, "logits/rejected": -2.6551549434661865, "logps/chosen": -91.05497741699219, "logps/rejected": -99.68929290771484, "loss": 0.6515, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.376445472240448, "rewards/margins": 0.09919819980859756, "rewards/rejected": -0.47564369440078735, "step": 2290 }, { "epoch": 0.39627842866988283, "grad_norm": 8.843903541564941, "learning_rate": 1.9937490334460857e-07, "logits/chosen": -2.7841269969940186, "logits/rejected": -2.759320020675659, "logps/chosen": -99.24239349365234, "logps/rejected": -103.56510925292969, "loss": 0.6549, "rewards/accuracies": 0.6875, "rewards/chosen": -0.41207608580589294, "rewards/margins": 0.09394040703773499, "rewards/rejected": -0.5060164928436279, "step": 2300 }, { "epoch": 0.3980013783597519, "grad_norm": 7.929662704467773, "learning_rate": 1.9935232211892083e-07, "logits/chosen": -2.714627742767334, "logits/rejected": -2.7024972438812256, "logps/chosen": -96.11257934570312, "logps/rejected": -103.85871887207031, "loss": 0.655, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4413214325904846, "rewards/margins": 0.09551285207271576, "rewards/rejected": -0.5368342399597168, "step": 2310 }, { "epoch": 0.39972432804962094, "grad_norm": 7.020380020141602, "learning_rate": 1.9932934155616127e-07, "logits/chosen": -2.7679104804992676, "logits/rejected": -2.7354788780212402, "logps/chosen": -104.21220397949219, "logps/rejected": -106.46934509277344, "loss": 0.6535, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.4562048316001892, "rewards/margins": 0.09959544241428375, "rewards/rejected": -0.5558002591133118, "step": 2320 }, { "epoch": 0.40144727773949, "grad_norm": 8.07779598236084, "learning_rate": 1.9930596174869797e-07, "logits/chosen": -2.723388195037842, "logits/rejected": -2.7130160331726074, "logps/chosen": -99.20726013183594, "logps/rejected": -105.5082015991211, "loss": 0.642, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4184534549713135, "rewards/margins": 0.12403968721628189, "rewards/rejected": -0.5424931645393372, "step": 2330 }, { "epoch": 0.40317022742935904, "grad_norm": 11.052251815795898, "learning_rate": 1.992821827905039e-07, "logits/chosen": -2.74686336517334, "logits/rejected": -2.7400949001312256, "logps/chosen": -95.10917663574219, "logps/rejected": -102.4656982421875, "loss": 0.6719, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.41894015669822693, "rewards/margins": 0.05962671712040901, "rewards/rejected": -0.47856688499450684, "step": 2340 }, { "epoch": 0.4048931771192281, "grad_norm": 13.94660758972168, "learning_rate": 1.9925800477715623e-07, "logits/chosen": -2.7292239665985107, "logits/rejected": -2.7181527614593506, "logps/chosen": -97.06452941894531, "logps/rejected": -103.667236328125, "loss": 0.6482, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.3829440474510193, "rewards/margins": 0.11175689846277237, "rewards/rejected": -0.49470096826553345, "step": 2350 }, { "epoch": 0.4066161268090972, "grad_norm": 9.01716136932373, "learning_rate": 1.992334278058362e-07, "logits/chosen": -2.7573258876800537, "logits/rejected": -2.746138334274292, "logps/chosen": -92.14654541015625, "logps/rejected": -101.3851089477539, "loss": 0.6496, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4030567705631256, "rewards/margins": 0.10895220935344696, "rewards/rejected": -0.5120089650154114, "step": 2360 }, { "epoch": 0.4083390764989662, "grad_norm": 11.883419036865234, "learning_rate": 1.9920845197532854e-07, "logits/chosen": -2.75362229347229, "logits/rejected": -2.7488625049591064, "logps/chosen": -100.34485626220703, "logps/rejected": -108.6867904663086, "loss": 0.6528, "rewards/accuracies": 0.625, "rewards/chosen": -0.45487451553344727, "rewards/margins": 0.10138414055109024, "rewards/rejected": -0.5562586784362793, "step": 2370 }, { "epoch": 0.4100620261888353, "grad_norm": 8.847671508789062, "learning_rate": 1.991830773860212e-07, "logits/chosen": -2.713022470474243, "logits/rejected": -2.696528911590576, "logps/chosen": -100.02056884765625, "logps/rejected": -104.38777160644531, "loss": 0.6686, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4528826177120209, "rewards/margins": 0.0709812194108963, "rewards/rejected": -0.5238637924194336, "step": 2380 }, { "epoch": 0.41178497587870433, "grad_norm": 12.014663696289062, "learning_rate": 1.9915730413990486e-07, "logits/chosen": -2.75661301612854, "logits/rejected": -2.738581895828247, "logps/chosen": -106.23995208740234, "logps/rejected": -110.62483978271484, "loss": 0.6544, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4921490550041199, "rewards/margins": 0.0999661311507225, "rewards/rejected": -0.5921152830123901, "step": 2390 }, { "epoch": 0.4135079255685734, "grad_norm": 10.028485298156738, "learning_rate": 1.9913113234057264e-07, "logits/chosen": -2.7949843406677246, "logits/rejected": -2.7854666709899902, "logps/chosen": -96.82019805908203, "logps/rejected": -107.0666275024414, "loss": 0.6518, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4390484690666199, "rewards/margins": 0.10471247136592865, "rewards/rejected": -0.5437608957290649, "step": 2400 }, { "epoch": 0.4135079255685734, "eval_logits/chosen": -2.79258131980896, "eval_logits/rejected": -2.7893073558807373, "eval_logps/chosen": -96.29039764404297, "eval_logps/rejected": -106.56369018554688, "eval_loss": 0.6676347851753235, "eval_rewards/accuracies": 0.606877326965332, "eval_rewards/chosen": -0.37274929881095886, "eval_rewards/margins": 0.06539163738489151, "eval_rewards/rejected": -0.43814095854759216, "eval_runtime": 384.7623, "eval_samples_per_second": 11.186, "eval_steps_per_second": 1.398, "step": 2400 }, { "epoch": 0.41523087525844243, "grad_norm": 11.282934188842773, "learning_rate": 1.9910456209321956e-07, "logits/chosen": -2.7576167583465576, "logits/rejected": -2.73296856880188, "logps/chosen": -101.84904479980469, "logps/rejected": -103.86643981933594, "loss": 0.6567, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4611765742301941, "rewards/margins": 0.09233228862285614, "rewards/rejected": -0.5535088777542114, "step": 2410 }, { "epoch": 0.4169538249483115, "grad_norm": 13.180535316467285, "learning_rate": 1.9907759350464212e-07, "logits/chosen": -2.7671501636505127, "logits/rejected": -2.7475426197052, "logps/chosen": -102.40633392333984, "logps/rejected": -110.27593994140625, "loss": 0.649, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4542531371116638, "rewards/margins": 0.1068098396062851, "rewards/rejected": -0.5610629916191101, "step": 2420 }, { "epoch": 0.41867677463818054, "grad_norm": 9.693948745727539, "learning_rate": 1.9905022668323803e-07, "logits/chosen": -2.7108891010284424, "logits/rejected": -2.6946499347686768, "logps/chosen": -100.01832580566406, "logps/rejected": -101.14888000488281, "loss": 0.6754, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4425184726715088, "rewards/margins": 0.05519191175699234, "rewards/rejected": -0.49771031737327576, "step": 2430 }, { "epoch": 0.4203997243280496, "grad_norm": 13.568098068237305, "learning_rate": 1.9902246173900554e-07, "logits/chosen": -2.7337582111358643, "logits/rejected": -2.724292755126953, "logps/chosen": -96.25423431396484, "logps/rejected": -104.6993179321289, "loss": 0.6456, "rewards/accuracies": 0.65625, "rewards/chosen": -0.39634186029434204, "rewards/margins": 0.1141355037689209, "rewards/rejected": -0.5104773640632629, "step": 2440 }, { "epoch": 0.4221226740179187, "grad_norm": 9.999527931213379, "learning_rate": 1.9899429878354318e-07, "logits/chosen": -2.710087776184082, "logits/rejected": -2.6934750080108643, "logps/chosen": -100.3898696899414, "logps/rejected": -104.9481201171875, "loss": 0.6662, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4584103524684906, "rewards/margins": 0.07401929795742035, "rewards/rejected": -0.532429575920105, "step": 2450 }, { "epoch": 0.4238456237077877, "grad_norm": 6.6206536293029785, "learning_rate": 1.989657379300492e-07, "logits/chosen": -2.712404727935791, "logits/rejected": -2.688204288482666, "logps/chosen": -100.12140655517578, "logps/rejected": -103.390625, "loss": 0.6607, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.42770424485206604, "rewards/margins": 0.09007831662893295, "rewards/rejected": -0.5177825093269348, "step": 2460 }, { "epoch": 0.4255685733976568, "grad_norm": 10.734334945678711, "learning_rate": 1.9893677929332123e-07, "logits/chosen": -2.7907164096832275, "logits/rejected": -2.77321195602417, "logps/chosen": -99.40889739990234, "logps/rejected": -104.71065521240234, "loss": 0.6583, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4294106364250183, "rewards/margins": 0.09319963306188583, "rewards/rejected": -0.5226103067398071, "step": 2470 }, { "epoch": 0.4272915230875258, "grad_norm": 9.096755981445312, "learning_rate": 1.9890742298975574e-07, "logits/chosen": -2.7226462364196777, "logits/rejected": -2.6995315551757812, "logps/chosen": -99.27317810058594, "logps/rejected": -102.8205795288086, "loss": 0.6601, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4444698393344879, "rewards/margins": 0.08868283033370972, "rewards/rejected": -0.53315269947052, "step": 2480 }, { "epoch": 0.4290144727773949, "grad_norm": 12.619534492492676, "learning_rate": 1.9887766913734748e-07, "logits/chosen": -2.735262870788574, "logits/rejected": -2.7315163612365723, "logps/chosen": -91.02969360351562, "logps/rejected": -103.66337585449219, "loss": 0.6457, "rewards/accuracies": 0.625, "rewards/chosen": -0.407538503408432, "rewards/margins": 0.1184179037809372, "rewards/rejected": -0.5259564518928528, "step": 2490 }, { "epoch": 0.43073742246726393, "grad_norm": 9.47433853149414, "learning_rate": 1.9884751785568928e-07, "logits/chosen": -2.763235569000244, "logits/rejected": -2.744462490081787, "logps/chosen": -107.6962890625, "logps/rejected": -116.11775207519531, "loss": 0.6578, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5090689063072205, "rewards/margins": 0.09511864930391312, "rewards/rejected": -0.6041876077651978, "step": 2500 }, { "epoch": 0.432460372157133, "grad_norm": 9.720632553100586, "learning_rate": 1.9881696926597125e-07, "logits/chosen": -2.6765334606170654, "logits/rejected": -2.6696765422821045, "logps/chosen": -101.8257827758789, "logps/rejected": -110.89205169677734, "loss": 0.6486, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4863107204437256, "rewards/margins": 0.11515744030475616, "rewards/rejected": -0.6014681458473206, "step": 2510 }, { "epoch": 0.4341833218470021, "grad_norm": 11.542259216308594, "learning_rate": 1.987860234909805e-07, "logits/chosen": -2.6656720638275146, "logits/rejected": -2.6460516452789307, "logps/chosen": -103.51522064208984, "logps/rejected": -109.52400207519531, "loss": 0.6524, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5028563737869263, "rewards/margins": 0.10998809337615967, "rewards/rejected": -0.6128444075584412, "step": 2520 }, { "epoch": 0.4359062715368711, "grad_norm": 11.412103652954102, "learning_rate": 1.987546806551006e-07, "logits/chosen": -2.726616382598877, "logits/rejected": -2.72737455368042, "logps/chosen": -101.48871612548828, "logps/rejected": -109.21578216552734, "loss": 0.6715, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.4935535490512848, "rewards/margins": 0.06767278909683228, "rewards/rejected": -0.5612263083457947, "step": 2530 }, { "epoch": 0.4376292212267402, "grad_norm": 11.56679630279541, "learning_rate": 1.9872294088431105e-07, "logits/chosen": -2.719217538833618, "logits/rejected": -2.706432580947876, "logps/chosen": -106.15666198730469, "logps/rejected": -116.7001724243164, "loss": 0.6531, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5228097438812256, "rewards/margins": 0.11348120123147964, "rewards/rejected": -0.6362909078598022, "step": 2540 }, { "epoch": 0.4393521709166092, "grad_norm": 15.340383529663086, "learning_rate": 1.9869080430618684e-07, "logits/chosen": -2.716742515563965, "logits/rejected": -2.690329074859619, "logps/chosen": -109.13045501708984, "logps/rejected": -112.8603744506836, "loss": 0.6542, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.4905763566493988, "rewards/margins": 0.1051291674375534, "rewards/rejected": -0.595705509185791, "step": 2550 }, { "epoch": 0.4410751206064783, "grad_norm": 13.995706558227539, "learning_rate": 1.9865827104989774e-07, "logits/chosen": -2.7623062133789062, "logits/rejected": -2.743518352508545, "logps/chosen": -105.69709777832031, "logps/rejected": -112.81187438964844, "loss": 0.6601, "rewards/accuracies": 0.625, "rewards/chosen": -0.49545541405677795, "rewards/margins": 0.09465087950229645, "rewards/rejected": -0.5901063084602356, "step": 2560 }, { "epoch": 0.4427980702963473, "grad_norm": 11.846534729003906, "learning_rate": 1.9862534124620814e-07, "logits/chosen": -2.704515218734741, "logits/rejected": -2.6942319869995117, "logps/chosen": -114.15962982177734, "logps/rejected": -117.64332580566406, "loss": 0.6697, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5424486398696899, "rewards/margins": 0.07557938992977142, "rewards/rejected": -0.6180279850959778, "step": 2570 }, { "epoch": 0.4445210199862164, "grad_norm": 13.413008689880371, "learning_rate": 1.9859201502747614e-07, "logits/chosen": -2.722397804260254, "logits/rejected": -2.7104969024658203, "logps/chosen": -109.5277099609375, "logps/rejected": -115.61268615722656, "loss": 0.6698, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.5544854998588562, "rewards/margins": 0.07775478065013885, "rewards/rejected": -0.6322402954101562, "step": 2580 }, { "epoch": 0.4462439696760855, "grad_norm": 10.059220314025879, "learning_rate": 1.985582925276533e-07, "logits/chosen": -2.694520950317383, "logits/rejected": -2.6677818298339844, "logps/chosen": -107.39048767089844, "logps/rejected": -109.03306579589844, "loss": 0.6586, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5027720332145691, "rewards/margins": 0.08564041554927826, "rewards/rejected": -0.5884124040603638, "step": 2590 }, { "epoch": 0.4479669193659545, "grad_norm": 14.61945629119873, "learning_rate": 1.9852417388228392e-07, "logits/chosen": -2.7207679748535156, "logits/rejected": -2.6922860145568848, "logps/chosen": -110.1673355102539, "logps/rejected": -108.0245361328125, "loss": 0.6654, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.515986442565918, "rewards/margins": 0.08189831674098969, "rewards/rejected": -0.5978847742080688, "step": 2600 }, { "epoch": 0.4496898690558236, "grad_norm": 10.718831062316895, "learning_rate": 1.9848965922850464e-07, "logits/chosen": -2.703439235687256, "logits/rejected": -2.6759893894195557, "logps/chosen": -105.53524017333984, "logps/rejected": -105.3673324584961, "loss": 0.6693, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4681882858276367, "rewards/margins": 0.07423902302980423, "rewards/rejected": -0.5424273610115051, "step": 2610 }, { "epoch": 0.4514128187456926, "grad_norm": 10.117892265319824, "learning_rate": 1.9845474870504378e-07, "logits/chosen": -2.7295031547546387, "logits/rejected": -2.7077276706695557, "logps/chosen": -93.91828918457031, "logps/rejected": -104.33548736572266, "loss": 0.6406, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.4194442331790924, "rewards/margins": 0.12605123221874237, "rewards/rejected": -0.5454954504966736, "step": 2620 }, { "epoch": 0.4531357684355617, "grad_norm": 8.47807788848877, "learning_rate": 1.984194424522208e-07, "logits/chosen": -2.667053699493408, "logits/rejected": -2.6478772163391113, "logps/chosen": -96.94065856933594, "logps/rejected": -108.85382080078125, "loss": 0.6343, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.41304540634155273, "rewards/margins": 0.14610633254051208, "rewards/rejected": -0.5591517686843872, "step": 2630 }, { "epoch": 0.4548587181254307, "grad_norm": 11.933856010437012, "learning_rate": 1.9838374061194575e-07, "logits/chosen": -2.683161497116089, "logits/rejected": -2.6706173419952393, "logps/chosen": -97.94457244873047, "logps/rejected": -106.34178161621094, "loss": 0.6477, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.46626657247543335, "rewards/margins": 0.11536351591348648, "rewards/rejected": -0.5816301107406616, "step": 2640 }, { "epoch": 0.4565816678152998, "grad_norm": 10.261037826538086, "learning_rate": 1.983476433277188e-07, "logits/chosen": -2.6465036869049072, "logits/rejected": -2.635824203491211, "logps/chosen": -102.07220458984375, "logps/rejected": -112.19637298583984, "loss": 0.6536, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4834628701210022, "rewards/margins": 0.10731105506420135, "rewards/rejected": -0.59077388048172, "step": 2650 }, { "epoch": 0.4583046175051689, "grad_norm": 11.558037757873535, "learning_rate": 1.9831115074462944e-07, "logits/chosen": -2.6853396892547607, "logits/rejected": -2.6572957038879395, "logps/chosen": -109.86177825927734, "logps/rejected": -115.7259750366211, "loss": 0.6476, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5141522288322449, "rewards/margins": 0.12386395037174225, "rewards/rejected": -0.6380161046981812, "step": 2660 }, { "epoch": 0.4600275671950379, "grad_norm": 12.215116500854492, "learning_rate": 1.982742630093561e-07, "logits/chosen": -2.6593005657196045, "logits/rejected": -2.6388514041900635, "logps/chosen": -115.66468811035156, "logps/rejected": -121.42977142333984, "loss": 0.6646, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.6060430407524109, "rewards/margins": 0.09283698350191116, "rewards/rejected": -0.6988800764083862, "step": 2670 }, { "epoch": 0.461750516884907, "grad_norm": 10.993180274963379, "learning_rate": 1.9823698027016548e-07, "logits/chosen": -2.7275149822235107, "logits/rejected": -2.703504800796509, "logps/chosen": -114.2564926147461, "logps/rejected": -116.74766540527344, "loss": 0.6615, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.546272337436676, "rewards/margins": 0.09389664232730865, "rewards/rejected": -0.6401689648628235, "step": 2680 }, { "epoch": 0.463473466574776, "grad_norm": 11.470463752746582, "learning_rate": 1.98199302676912e-07, "logits/chosen": -2.690214157104492, "logits/rejected": -2.6763110160827637, "logps/chosen": -100.8199462890625, "logps/rejected": -111.06422424316406, "loss": 0.6436, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.49626702070236206, "rewards/margins": 0.12201160192489624, "rewards/rejected": -0.6182786226272583, "step": 2690 }, { "epoch": 0.4651964162646451, "grad_norm": 9.260663032531738, "learning_rate": 1.9816123038103701e-07, "logits/chosen": -2.6932690143585205, "logits/rejected": -2.6703147888183594, "logps/chosen": -99.10530090332031, "logps/rejected": -108.68851470947266, "loss": 0.6524, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.45348605513572693, "rewards/margins": 0.10938525199890137, "rewards/rejected": -0.5628713369369507, "step": 2700 }, { "epoch": 0.4669193659545141, "grad_norm": 11.823142051696777, "learning_rate": 1.9812276353556852e-07, "logits/chosen": -2.716191053390503, "logits/rejected": -2.698429584503174, "logps/chosen": -105.06886291503906, "logps/rejected": -108.39021301269531, "loss": 0.6578, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.44185343384742737, "rewards/margins": 0.09492018818855286, "rewards/rejected": -0.536773681640625, "step": 2710 }, { "epoch": 0.4686423156443832, "grad_norm": 11.33167839050293, "learning_rate": 1.9808390229512026e-07, "logits/chosen": -2.686174154281616, "logits/rejected": -2.6930418014526367, "logps/chosen": -99.7154312133789, "logps/rejected": -110.9734878540039, "loss": 0.6506, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4637439250946045, "rewards/margins": 0.11209309101104736, "rewards/rejected": -0.5758370161056519, "step": 2720 }, { "epoch": 0.4703652653342522, "grad_norm": 13.145691871643066, "learning_rate": 1.980446468158912e-07, "logits/chosen": -2.693547248840332, "logits/rejected": -2.684006929397583, "logps/chosen": -108.5434341430664, "logps/rejected": -113.65617370605469, "loss": 0.6567, "rewards/accuracies": 0.625, "rewards/chosen": -0.4874972403049469, "rewards/margins": 0.10460158437490463, "rewards/rejected": -0.5920988321304321, "step": 2730 }, { "epoch": 0.4720882150241213, "grad_norm": 10.7879638671875, "learning_rate": 1.9800499725566506e-07, "logits/chosen": -2.676889657974243, "logits/rejected": -2.6644585132598877, "logps/chosen": -100.70921325683594, "logps/rejected": -102.6191177368164, "loss": 0.6691, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.4652387499809265, "rewards/margins": 0.07463903725147247, "rewards/rejected": -0.5398778319358826, "step": 2740 }, { "epoch": 0.4738111647139904, "grad_norm": 12.473271369934082, "learning_rate": 1.9796495377380933e-07, "logits/chosen": -2.6230242252349854, "logits/rejected": -2.6228199005126953, "logps/chosen": -92.10359191894531, "logps/rejected": -106.92274475097656, "loss": 0.6424, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.42038828134536743, "rewards/margins": 0.1267349123954773, "rewards/rejected": -0.5471231341362, "step": 2750 }, { "epoch": 0.4755341144038594, "grad_norm": 12.147249221801758, "learning_rate": 1.9792451653127496e-07, "logits/chosen": -2.6653361320495605, "logits/rejected": -2.657927989959717, "logps/chosen": -97.95817565917969, "logps/rejected": -110.66314697265625, "loss": 0.6398, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.42441844940185547, "rewards/margins": 0.1387939751148224, "rewards/rejected": -0.5632123947143555, "step": 2760 }, { "epoch": 0.4772570640937285, "grad_norm": 10.381051063537598, "learning_rate": 1.9788368569059551e-07, "logits/chosen": -2.7360317707061768, "logits/rejected": -2.7096409797668457, "logps/chosen": -106.78315734863281, "logps/rejected": -111.8558349609375, "loss": 0.6596, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5055192112922668, "rewards/margins": 0.0921645388007164, "rewards/rejected": -0.5976837873458862, "step": 2770 }, { "epoch": 0.4789800137835975, "grad_norm": 13.484970092773438, "learning_rate": 1.9784246141588662e-07, "logits/chosen": -2.6131300926208496, "logits/rejected": -2.5922048091888428, "logps/chosen": -112.4483642578125, "logps/rejected": -119.95368957519531, "loss": 0.6543, "rewards/accuracies": 0.625, "rewards/chosen": -0.5929507613182068, "rewards/margins": 0.10826816409826279, "rewards/rejected": -0.7012189030647278, "step": 2780 }, { "epoch": 0.4807029634734666, "grad_norm": 12.457329750061035, "learning_rate": 1.9780084387284535e-07, "logits/chosen": -2.688363552093506, "logits/rejected": -2.664832353591919, "logps/chosen": -109.06170654296875, "logps/rejected": -117.6565933227539, "loss": 0.6457, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5575080513954163, "rewards/margins": 0.12331205606460571, "rewards/rejected": -0.680820107460022, "step": 2790 }, { "epoch": 0.4824259131633356, "grad_norm": 13.478117942810059, "learning_rate": 1.977588332287493e-07, "logits/chosen": -2.707609176635742, "logits/rejected": -2.6857008934020996, "logps/chosen": -124.88829040527344, "logps/rejected": -128.7651824951172, "loss": 0.6695, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6486111283302307, "rewards/margins": 0.08287011831998825, "rewards/rejected": -0.7314812541007996, "step": 2800 }, { "epoch": 0.4824259131633356, "eval_logits/chosen": -2.7450242042541504, "eval_logits/rejected": -2.7414777278900146, "eval_logps/chosen": -106.3523178100586, "eval_logps/rejected": -118.35004425048828, "eval_loss": 0.6630884408950806, "eval_rewards/accuracies": 0.6140799522399902, "eval_rewards/chosen": -0.47336843609809875, "eval_rewards/margins": 0.0826360210776329, "eval_rewards/rejected": -0.5560044646263123, "eval_runtime": 384.301, "eval_samples_per_second": 11.2, "eval_steps_per_second": 1.4, "step": 2800 }, { "epoch": 0.4841488628532047, "grad_norm": 16.544485092163086, "learning_rate": 1.9771642965245623e-07, "logits/chosen": -2.634906768798828, "logits/rejected": -2.6151344776153564, "logps/chosen": -112.07231140136719, "logps/rejected": -121.56315612792969, "loss": 0.656, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5526391863822937, "rewards/margins": 0.10764841735363007, "rewards/rejected": -0.6602876782417297, "step": 2810 }, { "epoch": 0.48587181254307377, "grad_norm": 13.295195579528809, "learning_rate": 1.9767363331440324e-07, "logits/chosen": -2.7115139961242676, "logits/rejected": -2.7019412517547607, "logps/chosen": -111.91889953613281, "logps/rejected": -114.33284759521484, "loss": 0.6771, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5611149072647095, "rewards/margins": 0.06479386240243912, "rewards/rejected": -0.6259086728096008, "step": 2820 }, { "epoch": 0.4875947622329428, "grad_norm": 10.603714942932129, "learning_rate": 1.9763044438660606e-07, "logits/chosen": -2.591468334197998, "logits/rejected": -2.5788750648498535, "logps/chosen": -108.39720153808594, "logps/rejected": -118.72285461425781, "loss": 0.6448, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5164579153060913, "rewards/margins": 0.12574324011802673, "rewards/rejected": -0.6422011852264404, "step": 2830 }, { "epoch": 0.48931771192281187, "grad_norm": 10.576966285705566, "learning_rate": 1.9758686304265845e-07, "logits/chosen": -2.697780132293701, "logits/rejected": -2.6885037422180176, "logps/chosen": -109.06404876708984, "logps/rejected": -115.7732162475586, "loss": 0.6539, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5489058494567871, "rewards/margins": 0.10463793575763702, "rewards/rejected": -0.6535437703132629, "step": 2840 }, { "epoch": 0.4910406616126809, "grad_norm": 14.944794654846191, "learning_rate": 1.975428894577314e-07, "logits/chosen": -2.6671526432037354, "logits/rejected": -2.653252124786377, "logps/chosen": -110.45272064208984, "logps/rejected": -124.46026611328125, "loss": 0.6443, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.5722262263298035, "rewards/margins": 0.13191375136375427, "rewards/rejected": -0.7041400074958801, "step": 2850 }, { "epoch": 0.49276361130255, "grad_norm": 11.296040534973145, "learning_rate": 1.9749852380857247e-07, "logits/chosen": -2.6553471088409424, "logits/rejected": -2.637782573699951, "logps/chosen": -112.59574890136719, "logps/rejected": -122.50654602050781, "loss": 0.6428, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5746721029281616, "rewards/margins": 0.13814301788806915, "rewards/rejected": -0.7128151655197144, "step": 2860 }, { "epoch": 0.494486560992419, "grad_norm": 11.216682434082031, "learning_rate": 1.9745376627350515e-07, "logits/chosen": -2.7220160961151123, "logits/rejected": -2.7088305950164795, "logps/chosen": -113.31159973144531, "logps/rejected": -120.91353607177734, "loss": 0.6485, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5657340884208679, "rewards/margins": 0.12379582971334457, "rewards/rejected": -0.6895298957824707, "step": 2870 }, { "epoch": 0.4962095106822881, "grad_norm": 11.167985916137695, "learning_rate": 1.9740861703242797e-07, "logits/chosen": -2.7385172843933105, "logits/rejected": -2.715331554412842, "logps/chosen": -111.76835632324219, "logps/rejected": -119.62261962890625, "loss": 0.6401, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5546174645423889, "rewards/margins": 0.14337070286273956, "rewards/rejected": -0.6979882121086121, "step": 2880 }, { "epoch": 0.49793246037215716, "grad_norm": 14.719575881958008, "learning_rate": 1.97363076266814e-07, "logits/chosen": -2.737037181854248, "logits/rejected": -2.729886531829834, "logps/chosen": -112.28880310058594, "logps/rejected": -123.53924560546875, "loss": 0.6474, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5839313268661499, "rewards/margins": 0.13259650766849518, "rewards/rejected": -0.7165278196334839, "step": 2890 }, { "epoch": 0.4996554100620262, "grad_norm": 15.115251541137695, "learning_rate": 1.9731714415970998e-07, "logits/chosen": -2.683711528778076, "logits/rejected": -2.678250789642334, "logps/chosen": -109.89457702636719, "logps/rejected": -120.73185729980469, "loss": 0.6465, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5626266598701477, "rewards/margins": 0.13326212763786316, "rewards/rejected": -0.6958888173103333, "step": 2900 }, { "epoch": 0.5013783597518953, "grad_norm": 14.059300422668457, "learning_rate": 1.9727082089573552e-07, "logits/chosen": -2.7109973430633545, "logits/rejected": -2.700655221939087, "logps/chosen": -117.3915786743164, "logps/rejected": -132.0679931640625, "loss": 0.6301, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6010380983352661, "rewards/margins": 0.16865003108978271, "rewards/rejected": -0.7696880102157593, "step": 2910 }, { "epoch": 0.5031013094417643, "grad_norm": 12.552085876464844, "learning_rate": 1.9722410666108251e-07, "logits/chosen": -2.6676573753356934, "logits/rejected": -2.6615819931030273, "logps/chosen": -117.3313980102539, "logps/rejected": -135.73495483398438, "loss": 0.6261, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6300705671310425, "rewards/margins": 0.1892434060573578, "rewards/rejected": -0.8193138837814331, "step": 2920 }, { "epoch": 0.5048242591316333, "grad_norm": 9.844046592712402, "learning_rate": 1.9717700164351435e-07, "logits/chosen": -2.638333559036255, "logits/rejected": -2.618190288543701, "logps/chosen": -119.87908935546875, "logps/rejected": -129.7265625, "loss": 0.6452, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6576894521713257, "rewards/margins": 0.1377858966588974, "rewards/rejected": -0.7954753637313843, "step": 2930 }, { "epoch": 0.5065472088215024, "grad_norm": 13.81851577758789, "learning_rate": 1.9712950603236508e-07, "logits/chosen": -2.6915862560272217, "logits/rejected": -2.664580821990967, "logps/chosen": -115.47206115722656, "logps/rejected": -120.67753601074219, "loss": 0.6709, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6299867630004883, "rewards/margins": 0.07517345994710922, "rewards/rejected": -0.7051601409912109, "step": 2940 }, { "epoch": 0.5082701585113715, "grad_norm": 16.14995765686035, "learning_rate": 1.9708162001853873e-07, "logits/chosen": -2.679628372192383, "logits/rejected": -2.6672160625457764, "logps/chosen": -115.45826721191406, "logps/rejected": -128.9546661376953, "loss": 0.6392, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6021052002906799, "rewards/margins": 0.150004580616951, "rewards/rejected": -0.7521097660064697, "step": 2950 }, { "epoch": 0.5099931082012406, "grad_norm": 14.37147045135498, "learning_rate": 1.9703334379450855e-07, "logits/chosen": -2.662416458129883, "logits/rejected": -2.642178535461426, "logps/chosen": -113.80577087402344, "logps/rejected": -125.65077209472656, "loss": 0.6401, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5899726152420044, "rewards/margins": 0.1525414139032364, "rewards/rejected": -0.7425141334533691, "step": 2960 }, { "epoch": 0.5117160578911096, "grad_norm": 15.08690357208252, "learning_rate": 1.969846775543161e-07, "logits/chosen": -2.6417126655578613, "logits/rejected": -2.6231141090393066, "logps/chosen": -120.83221435546875, "logps/rejected": -128.4465789794922, "loss": 0.6525, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6522833704948425, "rewards/margins": 0.11867114156484604, "rewards/rejected": -0.7709546089172363, "step": 2970 }, { "epoch": 0.5134390075809786, "grad_norm": 18.869640350341797, "learning_rate": 1.9693562149357072e-07, "logits/chosen": -2.600480794906616, "logits/rejected": -2.5805163383483887, "logps/chosen": -113.0972900390625, "logps/rejected": -125.45975494384766, "loss": 0.6352, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5786206126213074, "rewards/margins": 0.1582469642162323, "rewards/rejected": -0.7368675470352173, "step": 2980 }, { "epoch": 0.5151619572708477, "grad_norm": 13.40176010131836, "learning_rate": 1.9688617580944843e-07, "logits/chosen": -2.647470712661743, "logits/rejected": -2.635159969329834, "logps/chosen": -123.00044250488281, "logps/rejected": -128.8345489501953, "loss": 0.6638, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6534823179244995, "rewards/margins": 0.09893546253442764, "rewards/rejected": -0.7524177432060242, "step": 2990 }, { "epoch": 0.5168849069607168, "grad_norm": 11.170113563537598, "learning_rate": 1.9683634070069143e-07, "logits/chosen": -2.6602885723114014, "logits/rejected": -2.654470920562744, "logps/chosen": -114.0888671875, "logps/rejected": -125.89949798583984, "loss": 0.6579, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6134128570556641, "rewards/margins": 0.10806174576282501, "rewards/rejected": -0.7214745283126831, "step": 3000 }, { "epoch": 0.5186078566505858, "grad_norm": 11.310858726501465, "learning_rate": 1.967861163676071e-07, "logits/chosen": -2.6761560440063477, "logits/rejected": -2.6583657264709473, "logps/chosen": -118.22630310058594, "logps/rejected": -124.42864990234375, "loss": 0.663, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6270531415939331, "rewards/margins": 0.09819537401199341, "rewards/rejected": -0.7252485156059265, "step": 3010 }, { "epoch": 0.5203308063404548, "grad_norm": 16.713150024414062, "learning_rate": 1.9673550301206733e-07, "logits/chosen": -2.722543239593506, "logits/rejected": -2.7021353244781494, "logps/chosen": -119.0759048461914, "logps/rejected": -125.45185852050781, "loss": 0.6528, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6386953592300415, "rewards/margins": 0.12587383389472961, "rewards/rejected": -0.764569103717804, "step": 3020 }, { "epoch": 0.5220537560303239, "grad_norm": 10.162747383117676, "learning_rate": 1.9668450083750762e-07, "logits/chosen": -2.6935548782348633, "logits/rejected": -2.6769282817840576, "logps/chosen": -115.16400146484375, "logps/rejected": -122.29130554199219, "loss": 0.6621, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.5884724855422974, "rewards/margins": 0.09111051261425018, "rewards/rejected": -0.679582953453064, "step": 3030 }, { "epoch": 0.523776705720193, "grad_norm": 10.428220748901367, "learning_rate": 1.9663311004892628e-07, "logits/chosen": -2.7106406688690186, "logits/rejected": -2.707573175430298, "logps/chosen": -106.23811340332031, "logps/rejected": -118.0770492553711, "loss": 0.6545, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5447341203689575, "rewards/margins": 0.1072479635477066, "rewards/rejected": -0.6519821286201477, "step": 3040 }, { "epoch": 0.525499655410062, "grad_norm": 18.292776107788086, "learning_rate": 1.9658133085288365e-07, "logits/chosen": -2.650239944458008, "logits/rejected": -2.6476333141326904, "logps/chosen": -105.09783935546875, "logps/rejected": -115.9225845336914, "loss": 0.658, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5097786784172058, "rewards/margins": 0.09773959219455719, "rewards/rejected": -0.6075183153152466, "step": 3050 }, { "epoch": 0.5272226050999311, "grad_norm": 10.919912338256836, "learning_rate": 1.965291634575011e-07, "logits/chosen": -2.6657092571258545, "logits/rejected": -2.6515870094299316, "logps/chosen": -107.94815826416016, "logps/rejected": -117.40814208984375, "loss": 0.6516, "rewards/accuracies": 0.625, "rewards/chosen": -0.5158464312553406, "rewards/margins": 0.11876648664474487, "rewards/rejected": -0.6346129179000854, "step": 3060 }, { "epoch": 0.5289455547898001, "grad_norm": 12.27825927734375, "learning_rate": 1.9647660807246063e-07, "logits/chosen": -2.64638090133667, "logits/rejected": -2.6269311904907227, "logps/chosen": -112.88493347167969, "logps/rejected": -117.7209243774414, "loss": 0.6524, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5371401309967041, "rewards/margins": 0.11090948432683945, "rewards/rejected": -0.6480496525764465, "step": 3070 }, { "epoch": 0.5306685044796692, "grad_norm": 12.138772010803223, "learning_rate": 1.9642366490900337e-07, "logits/chosen": -2.602574586868286, "logits/rejected": -2.59029221534729, "logps/chosen": -107.50993347167969, "logps/rejected": -120.93980407714844, "loss": 0.6581, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.566285252571106, "rewards/margins": 0.10775377601385117, "rewards/rejected": -0.6740390658378601, "step": 3080 }, { "epoch": 0.5323914541695383, "grad_norm": 15.335344314575195, "learning_rate": 1.9637033417992936e-07, "logits/chosen": -2.634575605392456, "logits/rejected": -2.6185824871063232, "logps/chosen": -109.16261291503906, "logps/rejected": -121.67292785644531, "loss": 0.6343, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5395610928535461, "rewards/margins": 0.1506732702255249, "rewards/rejected": -0.6902343034744263, "step": 3090 }, { "epoch": 0.5341144038594073, "grad_norm": 19.178672790527344, "learning_rate": 1.9631661609959628e-07, "logits/chosen": -2.633981943130493, "logits/rejected": -2.6135735511779785, "logps/chosen": -117.04927062988281, "logps/rejected": -128.36834716796875, "loss": 0.6363, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6114171743392944, "rewards/margins": 0.1613766849040985, "rewards/rejected": -0.7727938294410706, "step": 3100 }, { "epoch": 0.5358373535492763, "grad_norm": 15.843265533447266, "learning_rate": 1.9626251088391876e-07, "logits/chosen": -2.607421398162842, "logits/rejected": -2.6124348640441895, "logps/chosen": -122.96217346191406, "logps/rejected": -138.46749877929688, "loss": 0.6534, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7146931886672974, "rewards/margins": 0.12729968130588531, "rewards/rejected": -0.8419928550720215, "step": 3110 }, { "epoch": 0.5375603032391454, "grad_norm": 13.338793754577637, "learning_rate": 1.9620801875036753e-07, "logits/chosen": -2.6482999324798584, "logits/rejected": -2.6301398277282715, "logps/chosen": -122.67982482910156, "logps/rejected": -135.84725952148438, "loss": 0.6384, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6544739603996277, "rewards/margins": 0.1578807681798935, "rewards/rejected": -0.8123547434806824, "step": 3120 }, { "epoch": 0.5392832529290145, "grad_norm": 16.10958480834961, "learning_rate": 1.9615313991796843e-07, "logits/chosen": -2.5828936100006104, "logits/rejected": -2.577544927597046, "logps/chosen": -116.6275634765625, "logps/rejected": -133.2178955078125, "loss": 0.6392, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6371970772743225, "rewards/margins": 0.1632428765296936, "rewards/rejected": -0.8004400134086609, "step": 3130 }, { "epoch": 0.5410062026188835, "grad_norm": 15.640116691589355, "learning_rate": 1.960978746073016e-07, "logits/chosen": -2.643578290939331, "logits/rejected": -2.630753517150879, "logps/chosen": -124.42008972167969, "logps/rejected": -140.39816284179688, "loss": 0.6407, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7190755009651184, "rewards/margins": 0.15711475908756256, "rewards/rejected": -0.8761903047561646, "step": 3140 }, { "epoch": 0.5427291523087526, "grad_norm": 19.58635711669922, "learning_rate": 1.9604222304050074e-07, "logits/chosen": -2.6569530963897705, "logits/rejected": -2.6379687786102295, "logps/chosen": -127.451904296875, "logps/rejected": -133.31912231445312, "loss": 0.6666, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7172044515609741, "rewards/margins": 0.09000442177057266, "rewards/rejected": -0.8072088360786438, "step": 3150 }, { "epoch": 0.5444521019986216, "grad_norm": 16.635807037353516, "learning_rate": 1.9598618544125184e-07, "logits/chosen": -2.603647232055664, "logits/rejected": -2.5840981006622314, "logps/chosen": -119.03878021240234, "logps/rejected": -128.95834350585938, "loss": 0.6452, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6372925043106079, "rewards/margins": 0.1405438780784607, "rewards/rejected": -0.7778364419937134, "step": 3160 }, { "epoch": 0.5461750516884907, "grad_norm": 12.24067211151123, "learning_rate": 1.9592976203479266e-07, "logits/chosen": -2.6436235904693604, "logits/rejected": -2.6211209297180176, "logps/chosen": -121.8818130493164, "logps/rejected": -126.7547378540039, "loss": 0.6423, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.647641658782959, "rewards/margins": 0.14087779819965363, "rewards/rejected": -0.7885195016860962, "step": 3170 }, { "epoch": 0.5478980013783598, "grad_norm": 11.4162015914917, "learning_rate": 1.9587295304791164e-07, "logits/chosen": -2.676767349243164, "logits/rejected": -2.6484227180480957, "logps/chosen": -118.13658142089844, "logps/rejected": -129.58218383789062, "loss": 0.6398, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6407798528671265, "rewards/margins": 0.14756932854652405, "rewards/rejected": -0.7883491516113281, "step": 3180 }, { "epoch": 0.5496209510682288, "grad_norm": 13.021843910217285, "learning_rate": 1.95815758708947e-07, "logits/chosen": -2.6646246910095215, "logits/rejected": -2.6626439094543457, "logps/chosen": -119.4345703125, "logps/rejected": -142.91665649414062, "loss": 0.615, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6598604321479797, "rewards/margins": 0.20836925506591797, "rewards/rejected": -0.8682296872138977, "step": 3190 }, { "epoch": 0.5513439007580979, "grad_norm": 13.713115692138672, "learning_rate": 1.957581792477859e-07, "logits/chosen": -2.5963289737701416, "logits/rejected": -2.579953670501709, "logps/chosen": -133.90020751953125, "logps/rejected": -143.35369873046875, "loss": 0.6467, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7653565406799316, "rewards/margins": 0.15095801651477814, "rewards/rejected": -0.9163146018981934, "step": 3200 }, { "epoch": 0.5513439007580979, "eval_logits/chosen": -2.6902225017547607, "eval_logits/rejected": -2.686380386352539, "eval_logps/chosen": -126.01993560791016, "eval_logps/rejected": -140.8850555419922, "eval_loss": 0.6583073735237122, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": -0.6700446605682373, "eval_rewards/margins": 0.11130973696708679, "eval_rewards/rejected": -0.7813544273376465, "eval_runtime": 384.2204, "eval_samples_per_second": 11.202, "eval_steps_per_second": 1.4, "step": 3200 }, { "epoch": 0.5530668504479669, "grad_norm": 14.34167194366455, "learning_rate": 1.9570021489586344e-07, "logits/chosen": -2.537349224090576, "logits/rejected": -2.5172617435455322, "logps/chosen": -136.31301879882812, "logps/rejected": -147.375244140625, "loss": 0.6499, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8167101740837097, "rewards/margins": 0.1374952495098114, "rewards/rejected": -0.9542053937911987, "step": 3210 }, { "epoch": 0.554789800137836, "grad_norm": 13.798596382141113, "learning_rate": 1.956418658861617e-07, "logits/chosen": -2.5830676555633545, "logits/rejected": -2.579763889312744, "logps/chosen": -126.21321105957031, "logps/rejected": -140.56715393066406, "loss": 0.6511, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.7518718242645264, "rewards/margins": 0.13411535322666168, "rewards/rejected": -0.8859871625900269, "step": 3220 }, { "epoch": 0.556512749827705, "grad_norm": 21.124483108520508, "learning_rate": 1.9558313245320888e-07, "logits/chosen": -2.6222100257873535, "logits/rejected": -2.610461950302124, "logps/chosen": -123.21502685546875, "logps/rejected": -138.0326690673828, "loss": 0.6526, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6875979900360107, "rewards/margins": 0.13721275329589844, "rewards/rejected": -0.8248106241226196, "step": 3230 }, { "epoch": 0.5582356995175741, "grad_norm": 16.622541427612305, "learning_rate": 1.955240148330784e-07, "logits/chosen": -2.7097229957580566, "logits/rejected": -2.693171262741089, "logps/chosen": -127.66766357421875, "logps/rejected": -132.49832153320312, "loss": 0.6612, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7031322717666626, "rewards/margins": 0.1105247363448143, "rewards/rejected": -0.8136569857597351, "step": 3240 }, { "epoch": 0.5599586492074431, "grad_norm": 15.283352851867676, "learning_rate": 1.954645132633878e-07, "logits/chosen": -2.6242454051971436, "logits/rejected": -2.6123194694519043, "logps/chosen": -114.93122863769531, "logps/rejected": -128.0443572998047, "loss": 0.6458, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6113388538360596, "rewards/margins": 0.14095903933048248, "rewards/rejected": -0.7522978782653809, "step": 3250 }, { "epoch": 0.5616815988973122, "grad_norm": 17.378856658935547, "learning_rate": 1.9540462798329788e-07, "logits/chosen": -2.6355347633361816, "logits/rejected": -2.624072551727295, "logps/chosen": -108.41911315917969, "logps/rejected": -124.9243392944336, "loss": 0.6285, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5605282783508301, "rewards/margins": 0.1749248504638672, "rewards/rejected": -0.7354531288146973, "step": 3260 }, { "epoch": 0.5634045485871813, "grad_norm": 13.821595191955566, "learning_rate": 1.953443592335118e-07, "logits/chosen": -2.6720964908599854, "logits/rejected": -2.6658260822296143, "logps/chosen": -121.88392639160156, "logps/rejected": -136.3887939453125, "loss": 0.6467, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6581703424453735, "rewards/margins": 0.14219583570957184, "rewards/rejected": -0.800366222858429, "step": 3270 }, { "epoch": 0.5651274982770503, "grad_norm": 15.801923751831055, "learning_rate": 1.9528370725627393e-07, "logits/chosen": -2.675248384475708, "logits/rejected": -2.6648693084716797, "logps/chosen": -117.72242736816406, "logps/rejected": -132.8204803466797, "loss": 0.6477, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6743533611297607, "rewards/margins": 0.13626965880393982, "rewards/rejected": -0.810623049736023, "step": 3280 }, { "epoch": 0.5668504479669194, "grad_norm": 13.726994514465332, "learning_rate": 1.9522267229536907e-07, "logits/chosen": -2.698050022125244, "logits/rejected": -2.6729812622070312, "logps/chosen": -129.0258331298828, "logps/rejected": -145.4852294921875, "loss": 0.6327, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7511179447174072, "rewards/margins": 0.1727326661348343, "rewards/rejected": -0.9238505363464355, "step": 3290 }, { "epoch": 0.5685733976567884, "grad_norm": 20.418867111206055, "learning_rate": 1.9516125459612133e-07, "logits/chosen": -2.6035115718841553, "logits/rejected": -2.584538459777832, "logps/chosen": -142.64402770996094, "logps/rejected": -157.33480834960938, "loss": 0.6359, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8705562353134155, "rewards/margins": 0.1773010492324829, "rewards/rejected": -1.0478572845458984, "step": 3300 }, { "epoch": 0.5702963473466575, "grad_norm": 18.851089477539062, "learning_rate": 1.9509945440539328e-07, "logits/chosen": -2.546842098236084, "logits/rejected": -2.5207438468933105, "logps/chosen": -147.82745361328125, "logps/rejected": -162.19891357421875, "loss": 0.6334, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9319332838058472, "rewards/margins": 0.18691737949848175, "rewards/rejected": -1.1188507080078125, "step": 3310 }, { "epoch": 0.5720192970365265, "grad_norm": 22.01388168334961, "learning_rate": 1.9503727197158475e-07, "logits/chosen": -2.5725531578063965, "logits/rejected": -2.549002170562744, "logps/chosen": -152.4097137451172, "logps/rejected": -159.28634643554688, "loss": 0.6629, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.9795411825180054, "rewards/margins": 0.13039647042751312, "rewards/rejected": -1.1099377870559692, "step": 3320 }, { "epoch": 0.5737422467263956, "grad_norm": 20.20602798461914, "learning_rate": 1.949747075446321e-07, "logits/chosen": -2.658933162689209, "logits/rejected": -2.6387689113616943, "logps/chosen": -144.72409057617188, "logps/rejected": -164.0241241455078, "loss": 0.6238, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8950947523117065, "rewards/margins": 0.21241405606269836, "rewards/rejected": -1.107508897781372, "step": 3330 }, { "epoch": 0.5754651964162646, "grad_norm": 22.419872283935547, "learning_rate": 1.9491176137600695e-07, "logits/chosen": -2.620138645172119, "logits/rejected": -2.6018574237823486, "logps/chosen": -146.81373596191406, "logps/rejected": -160.92575073242188, "loss": 0.633, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8906152844429016, "rewards/margins": 0.18924424052238464, "rewards/rejected": -1.0798594951629639, "step": 3340 }, { "epoch": 0.5771881461061337, "grad_norm": 17.183467864990234, "learning_rate": 1.9484843371871538e-07, "logits/chosen": -2.5479228496551514, "logits/rejected": -2.5361313819885254, "logps/chosen": -137.95021057128906, "logps/rejected": -154.76429748535156, "loss": 0.635, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8102957010269165, "rewards/margins": 0.1809847056865692, "rewards/rejected": -0.9912804365158081, "step": 3350 }, { "epoch": 0.5789110957960028, "grad_norm": 16.16291618347168, "learning_rate": 1.9478472482729677e-07, "logits/chosen": -2.6077377796173096, "logits/rejected": -2.585120916366577, "logps/chosen": -132.53025817871094, "logps/rejected": -144.04335021972656, "loss": 0.6483, "rewards/accuracies": 0.625, "rewards/chosen": -0.7691653966903687, "rewards/margins": 0.15932944416999817, "rewards/rejected": -0.9284948110580444, "step": 3360 }, { "epoch": 0.5806340454858718, "grad_norm": 20.075923919677734, "learning_rate": 1.947206349578229e-07, "logits/chosen": -2.596151113510132, "logits/rejected": -2.592029094696045, "logps/chosen": -119.96165466308594, "logps/rejected": -141.7924346923828, "loss": 0.6138, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6749607920646667, "rewards/margins": 0.2100691795349121, "rewards/rejected": -0.8850299715995789, "step": 3370 }, { "epoch": 0.5823569951757409, "grad_norm": 25.711334228515625, "learning_rate": 1.9465616436789683e-07, "logits/chosen": -2.6455085277557373, "logits/rejected": -2.6214022636413574, "logps/chosen": -127.19728088378906, "logps/rejected": -135.57278442382812, "loss": 0.6411, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7012184858322144, "rewards/margins": 0.1534968614578247, "rewards/rejected": -0.8547152280807495, "step": 3380 }, { "epoch": 0.5840799448656099, "grad_norm": 14.334299087524414, "learning_rate": 1.9459131331665183e-07, "logits/chosen": -2.5958807468414307, "logits/rejected": -2.5762410163879395, "logps/chosen": -126.18077087402344, "logps/rejected": -138.03382873535156, "loss": 0.6477, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.7176672220230103, "rewards/margins": 0.15622806549072266, "rewards/rejected": -0.8738951683044434, "step": 3390 }, { "epoch": 0.585802894555479, "grad_norm": 14.765299797058105, "learning_rate": 1.9452608206475044e-07, "logits/chosen": -2.6166718006134033, "logits/rejected": -2.5864694118499756, "logps/chosen": -129.44992065429688, "logps/rejected": -144.64212036132812, "loss": 0.6391, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7583649158477783, "rewards/margins": 0.18269336223602295, "rewards/rejected": -0.9410582780838013, "step": 3400 }, { "epoch": 0.587525844245348, "grad_norm": 21.21045684814453, "learning_rate": 1.9446047087438342e-07, "logits/chosen": -2.541520833969116, "logits/rejected": -2.521432399749756, "logps/chosen": -121.6983871459961, "logps/rejected": -129.38424682617188, "loss": 0.6591, "rewards/accuracies": 0.65625, "rewards/chosen": -0.71497642993927, "rewards/margins": 0.11594994366168976, "rewards/rejected": -0.8309264183044434, "step": 3410 }, { "epoch": 0.5892487939352171, "grad_norm": 16.52368927001953, "learning_rate": 1.9439448000926859e-07, "logits/chosen": -2.561218738555908, "logits/rejected": -2.546726703643799, "logps/chosen": -118.3755874633789, "logps/rejected": -137.21578979492188, "loss": 0.6195, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6452140808105469, "rewards/margins": 0.19474345445632935, "rewards/rejected": -0.8399575352668762, "step": 3420 }, { "epoch": 0.5909717436250862, "grad_norm": 17.433828353881836, "learning_rate": 1.9432810973464988e-07, "logits/chosen": -2.6282083988189697, "logits/rejected": -2.6128320693969727, "logps/chosen": -123.78349304199219, "logps/rejected": -144.3318328857422, "loss": 0.6289, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.715265154838562, "rewards/margins": 0.18964102864265442, "rewards/rejected": -0.904906153678894, "step": 3430 }, { "epoch": 0.5926946933149552, "grad_norm": 18.383983612060547, "learning_rate": 1.942613603172962e-07, "logits/chosen": -2.552217721939087, "logits/rejected": -2.5373892784118652, "logps/chosen": -138.661376953125, "logps/rejected": -157.70814514160156, "loss": 0.6307, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8593077659606934, "rewards/margins": 0.19985555112361908, "rewards/rejected": -1.0591633319854736, "step": 3440 }, { "epoch": 0.5944176430048242, "grad_norm": 16.855152130126953, "learning_rate": 1.9419423202550037e-07, "logits/chosen": -2.723828077316284, "logits/rejected": -2.694596767425537, "logps/chosen": -149.76248168945312, "logps/rejected": -157.1855926513672, "loss": 0.6574, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.8911663293838501, "rewards/margins": 0.14522525668144226, "rewards/rejected": -1.0363914966583252, "step": 3450 }, { "epoch": 0.5961405926946933, "grad_norm": 14.812005996704102, "learning_rate": 1.9412672512907812e-07, "logits/chosen": -2.566176414489746, "logits/rejected": -2.560554265975952, "logps/chosen": -126.41849517822266, "logps/rejected": -144.072509765625, "loss": 0.6337, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7349315881729126, "rewards/margins": 0.17624779045581818, "rewards/rejected": -0.9111794233322144, "step": 3460 }, { "epoch": 0.5978635423845624, "grad_norm": 14.932448387145996, "learning_rate": 1.940588398993669e-07, "logits/chosen": -2.6209988594055176, "logits/rejected": -2.6123223304748535, "logps/chosen": -130.9401092529297, "logps/rejected": -146.76885986328125, "loss": 0.6326, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7369273900985718, "rewards/margins": 0.19775813817977905, "rewards/rejected": -0.9346855282783508, "step": 3470 }, { "epoch": 0.5995864920744314, "grad_norm": 26.05785369873047, "learning_rate": 1.9399057660922482e-07, "logits/chosen": -2.5855112075805664, "logits/rejected": -2.5643677711486816, "logps/chosen": -133.5073699951172, "logps/rejected": -150.279296875, "loss": 0.629, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7769684195518494, "rewards/margins": 0.21371857821941376, "rewards/rejected": -0.9906870126724243, "step": 3480 }, { "epoch": 0.6013094417643005, "grad_norm": 21.644672393798828, "learning_rate": 1.939219355330296e-07, "logits/chosen": -2.575160503387451, "logits/rejected": -2.5529327392578125, "logps/chosen": -138.62142944335938, "logps/rejected": -158.4349365234375, "loss": 0.6285, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8397635221481323, "rewards/margins": 0.20524868369102478, "rewards/rejected": -1.0450122356414795, "step": 3490 }, { "epoch": 0.6030323914541695, "grad_norm": 18.445178985595703, "learning_rate": 1.9385291694667742e-07, "logits/chosen": -2.5392565727233887, "logits/rejected": -2.5323662757873535, "logps/chosen": -137.585693359375, "logps/rejected": -152.7445526123047, "loss": 0.6506, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8483207821846008, "rewards/margins": 0.14356929063796997, "rewards/rejected": -0.9918900728225708, "step": 3500 }, { "epoch": 0.6047553411440386, "grad_norm": 14.520369529724121, "learning_rate": 1.9378352112758182e-07, "logits/chosen": -2.505514621734619, "logits/rejected": -2.483396053314209, "logps/chosen": -143.06138610839844, "logps/rejected": -152.49606323242188, "loss": 0.656, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8901029825210571, "rewards/margins": 0.1305324286222458, "rewards/rejected": -1.0206353664398193, "step": 3510 }, { "epoch": 0.6064782908339077, "grad_norm": 21.934551239013672, "learning_rate": 1.937137483546726e-07, "logits/chosen": -2.5207297801971436, "logits/rejected": -2.49786639213562, "logps/chosen": -133.8035888671875, "logps/rejected": -147.05685424804688, "loss": 0.6334, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.7806012034416199, "rewards/margins": 0.17642517387866974, "rewards/rejected": -0.957026481628418, "step": 3520 }, { "epoch": 0.6082012405237767, "grad_norm": 32.22636413574219, "learning_rate": 1.936435989083947e-07, "logits/chosen": -2.644346237182617, "logits/rejected": -2.6069703102111816, "logps/chosen": -137.42364501953125, "logps/rejected": -151.6114501953125, "loss": 0.6311, "rewards/accuracies": 0.65625, "rewards/chosen": -0.808983325958252, "rewards/margins": 0.20544257760047913, "rewards/rejected": -1.0144258737564087, "step": 3530 }, { "epoch": 0.6099241902136457, "grad_norm": 18.672882080078125, "learning_rate": 1.9357307307070706e-07, "logits/chosen": -2.5201008319854736, "logits/rejected": -2.499207019805908, "logps/chosen": -138.7521209716797, "logps/rejected": -151.4750213623047, "loss": 0.649, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.8334717750549316, "rewards/margins": 0.16167697310447693, "rewards/rejected": -0.9951488375663757, "step": 3540 }, { "epoch": 0.6116471399035148, "grad_norm": 15.886916160583496, "learning_rate": 1.9350217112508145e-07, "logits/chosen": -2.5792183876037598, "logits/rejected": -2.5726521015167236, "logps/chosen": -136.02572631835938, "logps/rejected": -143.35610961914062, "loss": 0.6642, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.8100021481513977, "rewards/margins": 0.10618770122528076, "rewards/rejected": -0.9161897897720337, "step": 3550 }, { "epoch": 0.6133700895933839, "grad_norm": 16.117509841918945, "learning_rate": 1.934308933565014e-07, "logits/chosen": -2.546569585800171, "logits/rejected": -2.5285823345184326, "logps/chosen": -130.1741180419922, "logps/rejected": -140.90634155273438, "loss": 0.6484, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.7551542520523071, "rewards/margins": 0.139734148979187, "rewards/rejected": -0.8948885202407837, "step": 3560 }, { "epoch": 0.6150930392832529, "grad_norm": 15.810790061950684, "learning_rate": 1.9335924005146106e-07, "logits/chosen": -2.640364170074463, "logits/rejected": -2.613745927810669, "logps/chosen": -141.25216674804688, "logps/rejected": -148.0939178466797, "loss": 0.6697, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8347728848457336, "rewards/margins": 0.11924564838409424, "rewards/rejected": -0.9540184736251831, "step": 3570 }, { "epoch": 0.616815988973122, "grad_norm": 14.903305053710938, "learning_rate": 1.9328721149796392e-07, "logits/chosen": -2.613096237182617, "logits/rejected": -2.596419095993042, "logps/chosen": -138.2270050048828, "logps/rejected": -153.5955047607422, "loss": 0.6374, "rewards/accuracies": 0.65625, "rewards/chosen": -0.791134238243103, "rewards/margins": 0.1797659695148468, "rewards/rejected": -0.9709001779556274, "step": 3580 }, { "epoch": 0.618538938662991, "grad_norm": 15.328279495239258, "learning_rate": 1.9321480798552184e-07, "logits/chosen": -2.5752346515655518, "logits/rejected": -2.564344644546509, "logps/chosen": -138.5938720703125, "logps/rejected": -153.5648193359375, "loss": 0.6433, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.8292818069458008, "rewards/margins": 0.17741826176643372, "rewards/rejected": -1.006700038909912, "step": 3590 }, { "epoch": 0.6202618883528601, "grad_norm": 26.101503372192383, "learning_rate": 1.9314202980515378e-07, "logits/chosen": -2.572817325592041, "logits/rejected": -2.5499234199523926, "logps/chosen": -129.71156311035156, "logps/rejected": -144.66526794433594, "loss": 0.6264, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7399894595146179, "rewards/margins": 0.19516396522521973, "rewards/rejected": -0.9351534843444824, "step": 3600 }, { "epoch": 0.6202618883528601, "eval_logits/chosen": -2.622478723526001, "eval_logits/rejected": -2.6176483631134033, "eval_logps/chosen": -122.61001586914062, "eval_logps/rejected": -136.5857391357422, "eval_loss": 0.6586455702781677, "eval_rewards/accuracies": 0.6105948090553284, "eval_rewards/chosen": -0.6359453797340393, "eval_rewards/margins": 0.10241586714982986, "eval_rewards/rejected": -0.7383612394332886, "eval_runtime": 383.2298, "eval_samples_per_second": 11.231, "eval_steps_per_second": 1.404, "step": 3600 }, { "epoch": 0.6219848380427292, "grad_norm": 22.015100479125977, "learning_rate": 1.9306887724938452e-07, "logits/chosen": -2.5326743125915527, "logits/rejected": -2.522927761077881, "logps/chosen": -143.52890014648438, "logps/rejected": -151.68173217773438, "loss": 0.6676, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9094659686088562, "rewards/margins": 0.12106283009052277, "rewards/rejected": -1.0305287837982178, "step": 3610 }, { "epoch": 0.6237077877325982, "grad_norm": 22.287555694580078, "learning_rate": 1.929953506122438e-07, "logits/chosen": -2.470320463180542, "logits/rejected": -2.449979066848755, "logps/chosen": -142.80337524414062, "logps/rejected": -159.98110961914062, "loss": 0.6196, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8603125810623169, "rewards/margins": 0.2076341211795807, "rewards/rejected": -1.0679466724395752, "step": 3620 }, { "epoch": 0.6254307374224672, "grad_norm": 20.508764266967773, "learning_rate": 1.9292145018926478e-07, "logits/chosen": -2.556079626083374, "logits/rejected": -2.5517899990081787, "logps/chosen": -148.13914489746094, "logps/rejected": -177.71499633789062, "loss": 0.6036, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9577075839042664, "rewards/margins": 0.2712717354297638, "rewards/rejected": -1.228979468345642, "step": 3630 }, { "epoch": 0.6271536871123363, "grad_norm": 25.0677547454834, "learning_rate": 1.9284717627748308e-07, "logits/chosen": -2.5317368507385254, "logits/rejected": -2.520453453063965, "logps/chosen": -155.36209106445312, "logps/rejected": -174.81427001953125, "loss": 0.6275, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.9892951250076294, "rewards/margins": 0.20692935585975647, "rewards/rejected": -1.196224570274353, "step": 3640 }, { "epoch": 0.6288766368022054, "grad_norm": 23.427276611328125, "learning_rate": 1.9277252917543557e-07, "logits/chosen": -2.5060718059539795, "logits/rejected": -2.5108015537261963, "logps/chosen": -147.3946533203125, "logps/rejected": -173.7878875732422, "loss": 0.6163, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9550941586494446, "rewards/margins": 0.2426663339138031, "rewards/rejected": -1.1977603435516357, "step": 3650 }, { "epoch": 0.6305995864920745, "grad_norm": 22.165502548217773, "learning_rate": 1.92697509183159e-07, "logits/chosen": -2.522451639175415, "logits/rejected": -2.4953484535217285, "logps/chosen": -160.2294464111328, "logps/rejected": -180.65109252929688, "loss": 0.6194, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.046242117881775, "rewards/margins": 0.23893484473228455, "rewards/rejected": -1.2851769924163818, "step": 3660 }, { "epoch": 0.6323225361819435, "grad_norm": 16.42013931274414, "learning_rate": 1.926221166021891e-07, "logits/chosen": -2.520784854888916, "logits/rejected": -2.5007686614990234, "logps/chosen": -150.28506469726562, "logps/rejected": -163.83880615234375, "loss": 0.6523, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9631339311599731, "rewards/margins": 0.15850678086280823, "rewards/rejected": -1.1216405630111694, "step": 3670 }, { "epoch": 0.6340454858718125, "grad_norm": 19.383630752563477, "learning_rate": 1.9254635173555895e-07, "logits/chosen": -2.568835496902466, "logits/rejected": -2.5408787727355957, "logps/chosen": -144.97140502929688, "logps/rejected": -154.52496337890625, "loss": 0.6447, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8866475820541382, "rewards/margins": 0.16674400866031647, "rewards/rejected": -1.053391695022583, "step": 3680 }, { "epoch": 0.6357684355616816, "grad_norm": 19.868803024291992, "learning_rate": 1.9247021488779817e-07, "logits/chosen": -2.513503313064575, "logits/rejected": -2.5117948055267334, "logps/chosen": -134.81625366210938, "logps/rejected": -166.44728088378906, "loss": 0.6101, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8184541463851929, "rewards/margins": 0.26437991857528687, "rewards/rejected": -1.082834005355835, "step": 3690 }, { "epoch": 0.6374913852515507, "grad_norm": 20.203310012817383, "learning_rate": 1.923937063649315e-07, "logits/chosen": -2.530961513519287, "logits/rejected": -2.503265619277954, "logps/chosen": -146.6129913330078, "logps/rejected": -168.82098388671875, "loss": 0.6264, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.8800510168075562, "rewards/margins": 0.22863145172595978, "rewards/rejected": -1.108682632446289, "step": 3700 }, { "epoch": 0.6392143349414197, "grad_norm": 23.444467544555664, "learning_rate": 1.9231682647447757e-07, "logits/chosen": -2.5593762397766113, "logits/rejected": -2.542349338531494, "logps/chosen": -141.5807342529297, "logps/rejected": -151.15762329101562, "loss": 0.6617, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.8652147054672241, "rewards/margins": 0.14454950392246246, "rewards/rejected": -1.009764313697815, "step": 3710 }, { "epoch": 0.6409372846312887, "grad_norm": 19.26589584350586, "learning_rate": 1.9223957552544762e-07, "logits/chosen": -2.5640718936920166, "logits/rejected": -2.547266721725464, "logps/chosen": -126.90428161621094, "logps/rejected": -150.60821533203125, "loss": 0.6083, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7400155067443848, "rewards/margins": 0.23334841430187225, "rewards/rejected": -0.973363995552063, "step": 3720 }, { "epoch": 0.6426602343211578, "grad_norm": 17.89430809020996, "learning_rate": 1.9216195382834445e-07, "logits/chosen": -2.556276559829712, "logits/rejected": -2.5290729999542236, "logps/chosen": -137.04393005371094, "logps/rejected": -157.21775817871094, "loss": 0.6117, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8089156150817871, "rewards/margins": 0.25906902551651, "rewards/rejected": -1.067984700202942, "step": 3730 }, { "epoch": 0.6443831840110269, "grad_norm": 15.389662742614746, "learning_rate": 1.9208396169516092e-07, "logits/chosen": -2.535585880279541, "logits/rejected": -2.519484043121338, "logps/chosen": -139.1349334716797, "logps/rejected": -161.3406219482422, "loss": 0.6303, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8444369435310364, "rewards/margins": 0.20586533844470978, "rewards/rejected": -1.050302267074585, "step": 3740 }, { "epoch": 0.646106133700896, "grad_norm": 17.75441551208496, "learning_rate": 1.9200559943937895e-07, "logits/chosen": -2.592435359954834, "logits/rejected": -2.5743930339813232, "logps/chosen": -141.97390747070312, "logps/rejected": -160.23193359375, "loss": 0.6287, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8718975186347961, "rewards/margins": 0.2032506763935089, "rewards/rejected": -1.0751482248306274, "step": 3750 }, { "epoch": 0.647829083390765, "grad_norm": 22.844364166259766, "learning_rate": 1.91926867375968e-07, "logits/chosen": -2.547484874725342, "logits/rejected": -2.5382332801818848, "logps/chosen": -156.30105590820312, "logps/rejected": -165.59693908691406, "loss": 0.6719, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.9945210218429565, "rewards/margins": 0.11567112058401108, "rewards/rejected": -1.110192060470581, "step": 3760 }, { "epoch": 0.649552033080634, "grad_norm": 19.29290199279785, "learning_rate": 1.9184776582138408e-07, "logits/chosen": -2.5390050411224365, "logits/rejected": -2.5214526653289795, "logps/chosen": -157.914794921875, "logps/rejected": -172.47427368164062, "loss": 0.6534, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0316898822784424, "rewards/margins": 0.15778304636478424, "rewards/rejected": -1.1894729137420654, "step": 3770 }, { "epoch": 0.6512749827705031, "grad_norm": 23.615245819091797, "learning_rate": 1.9176829509356817e-07, "logits/chosen": -2.54388427734375, "logits/rejected": -2.512617826461792, "logps/chosen": -164.77212524414062, "logps/rejected": -178.50497436523438, "loss": 0.6339, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0995312929153442, "rewards/margins": 0.1998249590396881, "rewards/rejected": -1.2993561029434204, "step": 3780 }, { "epoch": 0.6529979324603722, "grad_norm": 22.931257247924805, "learning_rate": 1.9168845551194526e-07, "logits/chosen": -2.5178706645965576, "logits/rejected": -2.4947054386138916, "logps/chosen": -163.10385131835938, "logps/rejected": -176.40982055664062, "loss": 0.6431, "rewards/accuracies": 0.59375, "rewards/chosen": -1.081376552581787, "rewards/margins": 0.18316730856895447, "rewards/rejected": -1.2645437717437744, "step": 3790 }, { "epoch": 0.6547208821502413, "grad_norm": 38.20081329345703, "learning_rate": 1.916082473974228e-07, "logits/chosen": -2.548003673553467, "logits/rejected": -2.526517152786255, "logps/chosen": -156.9646453857422, "logps/rejected": -167.34617614746094, "loss": 0.6432, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9847234487533569, "rewards/margins": 0.16609299182891846, "rewards/rejected": -1.150816559791565, "step": 3800 }, { "epoch": 0.6564438318401102, "grad_norm": 21.577075958251953, "learning_rate": 1.9152767107238957e-07, "logits/chosen": -2.5703864097595215, "logits/rejected": -2.5399067401885986, "logps/chosen": -156.12286376953125, "logps/rejected": -171.01058959960938, "loss": 0.6165, "rewards/accuracies": 0.625, "rewards/chosen": -0.9712567329406738, "rewards/margins": 0.22613871097564697, "rewards/rejected": -1.1973953247070312, "step": 3810 }, { "epoch": 0.6581667815299793, "grad_norm": 20.8365535736084, "learning_rate": 1.9144672686071437e-07, "logits/chosen": -2.5181126594543457, "logits/rejected": -2.497028112411499, "logps/chosen": -149.05239868164062, "logps/rejected": -167.2176513671875, "loss": 0.6232, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9377164840698242, "rewards/margins": 0.2112659215927124, "rewards/rejected": -1.148982286453247, "step": 3820 }, { "epoch": 0.6598897312198484, "grad_norm": 25.174989700317383, "learning_rate": 1.913654150877446e-07, "logits/chosen": -2.547309398651123, "logits/rejected": -2.5087881088256836, "logps/chosen": -154.81350708007812, "logps/rejected": -162.9886932373047, "loss": 0.6444, "rewards/accuracies": 0.59375, "rewards/chosen": -0.9520630836486816, "rewards/margins": 0.1712508648633957, "rewards/rejected": -1.1233139038085938, "step": 3830 }, { "epoch": 0.6616126809097175, "grad_norm": 15.482654571533203, "learning_rate": 1.9128373608030513e-07, "logits/chosen": -2.5088038444519043, "logits/rejected": -2.500941038131714, "logps/chosen": -147.49221801757812, "logps/rejected": -174.9437255859375, "loss": 0.6126, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9215294122695923, "rewards/margins": 0.24697823822498322, "rewards/rejected": -1.1685075759887695, "step": 3840 }, { "epoch": 0.6633356305995864, "grad_norm": 20.026220321655273, "learning_rate": 1.9120169016669683e-07, "logits/chosen": -2.584136486053467, "logits/rejected": -2.56913423538208, "logps/chosen": -149.8631134033203, "logps/rejected": -163.15603637695312, "loss": 0.6351, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.9149333834648132, "rewards/margins": 0.18745577335357666, "rewards/rejected": -1.1023890972137451, "step": 3850 }, { "epoch": 0.6650585802894555, "grad_norm": 17.616666793823242, "learning_rate": 1.9111927767669531e-07, "logits/chosen": -2.5894997119903564, "logits/rejected": -2.5635571479797363, "logps/chosen": -156.982421875, "logps/rejected": -167.90719604492188, "loss": 0.658, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.0112090110778809, "rewards/margins": 0.14642414450645447, "rewards/rejected": -1.1576330661773682, "step": 3860 }, { "epoch": 0.6667815299793246, "grad_norm": 24.253337860107422, "learning_rate": 1.9103649894154965e-07, "logits/chosen": -2.5155365467071533, "logits/rejected": -2.494205951690674, "logps/chosen": -154.18704223632812, "logps/rejected": -175.84947204589844, "loss": 0.6094, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9422490000724792, "rewards/margins": 0.27016550302505493, "rewards/rejected": -1.2124145030975342, "step": 3870 }, { "epoch": 0.6685044796691937, "grad_norm": 25.2159481048584, "learning_rate": 1.90953354293981e-07, "logits/chosen": -2.534515857696533, "logits/rejected": -2.5364620685577393, "logps/chosen": -149.01571655273438, "logps/rejected": -162.34564208984375, "loss": 0.6638, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.971118152141571, "rewards/margins": 0.12453125417232513, "rewards/rejected": -1.0956494808197021, "step": 3880 }, { "epoch": 0.6702274293590628, "grad_norm": 16.787229537963867, "learning_rate": 1.908698440681812e-07, "logits/chosen": -2.60170316696167, "logits/rejected": -2.582350254058838, "logps/chosen": -134.91282653808594, "logps/rejected": -150.58224487304688, "loss": 0.6284, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7993661165237427, "rewards/margins": 0.19520600140094757, "rewards/rejected": -0.9945721626281738, "step": 3890 }, { "epoch": 0.6719503790489317, "grad_norm": 15.2194185256958, "learning_rate": 1.9078596859981163e-07, "logits/chosen": -2.584822416305542, "logits/rejected": -2.5468311309814453, "logps/chosen": -134.71353149414062, "logps/rejected": -147.935791015625, "loss": 0.6201, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7654783725738525, "rewards/margins": 0.21653859317302704, "rewards/rejected": -0.982016921043396, "step": 3900 }, { "epoch": 0.6736733287388008, "grad_norm": 15.335397720336914, "learning_rate": 1.9070172822600152e-07, "logits/chosen": -2.5756888389587402, "logits/rejected": -2.558799982070923, "logps/chosen": -140.26095581054688, "logps/rejected": -163.974609375, "loss": 0.6046, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8372836112976074, "rewards/margins": 0.27128416299819946, "rewards/rejected": -1.1085678339004517, "step": 3910 }, { "epoch": 0.6753962784286699, "grad_norm": 16.354459762573242, "learning_rate": 1.90617123285347e-07, "logits/chosen": -2.594193458557129, "logits/rejected": -2.5603878498077393, "logps/chosen": -137.20114135742188, "logps/rejected": -147.0207061767578, "loss": 0.6273, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7684726715087891, "rewards/margins": 0.20228557288646698, "rewards/rejected": -0.9707581400871277, "step": 3920 }, { "epoch": 0.677119228118539, "grad_norm": 22.57278060913086, "learning_rate": 1.9053215411790945e-07, "logits/chosen": -2.569798231124878, "logits/rejected": -2.563342571258545, "logps/chosen": -141.64752197265625, "logps/rejected": -160.0135955810547, "loss": 0.6395, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.889459490776062, "rewards/margins": 0.19048067927360535, "rewards/rejected": -1.0799401998519897, "step": 3930 }, { "epoch": 0.6788421778084079, "grad_norm": 26.700321197509766, "learning_rate": 1.9044682106521428e-07, "logits/chosen": -2.454202890396118, "logits/rejected": -2.4379258155822754, "logps/chosen": -144.43515014648438, "logps/rejected": -162.84532165527344, "loss": 0.6277, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.8958985209465027, "rewards/margins": 0.2047765702009201, "rewards/rejected": -1.100675106048584, "step": 3940 }, { "epoch": 0.680565127498277, "grad_norm": 24.479074478149414, "learning_rate": 1.903611244702494e-07, "logits/chosen": -2.4695637226104736, "logits/rejected": -2.4349091053009033, "logps/chosen": -139.02603149414062, "logps/rejected": -158.69528198242188, "loss": 0.6007, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8301140666007996, "rewards/margins": 0.2578274607658386, "rewards/rejected": -1.0879415273666382, "step": 3950 }, { "epoch": 0.6822880771881461, "grad_norm": 24.88389015197754, "learning_rate": 1.9027506467746404e-07, "logits/chosen": -2.534788131713867, "logits/rejected": -2.533944606781006, "logps/chosen": -149.79391479492188, "logps/rejected": -176.94375610351562, "loss": 0.6237, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9553853869438171, "rewards/margins": 0.2335664927959442, "rewards/rejected": -1.188951849937439, "step": 3960 }, { "epoch": 0.6840110268780152, "grad_norm": 21.85911750793457, "learning_rate": 1.901886420327672e-07, "logits/chosen": -2.505385637283325, "logits/rejected": -2.48795747756958, "logps/chosen": -161.1486053466797, "logps/rejected": -184.07723999023438, "loss": 0.6175, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.059510350227356, "rewards/margins": 0.2523019015789032, "rewards/rejected": -1.3118122816085815, "step": 3970 }, { "epoch": 0.6857339765678843, "grad_norm": 20.922971725463867, "learning_rate": 1.9010185688352643e-07, "logits/chosen": -2.443387508392334, "logits/rejected": -2.4325804710388184, "logps/chosen": -154.60977172851562, "logps/rejected": -184.83419799804688, "loss": 0.5929, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.005619764328003, "rewards/margins": 0.3168439269065857, "rewards/rejected": -1.3224637508392334, "step": 3980 }, { "epoch": 0.6874569262577532, "grad_norm": 17.087665557861328, "learning_rate": 1.9001470957856615e-07, "logits/chosen": -2.4980862140655518, "logits/rejected": -2.4844629764556885, "logps/chosen": -153.3562774658203, "logps/rejected": -174.1827392578125, "loss": 0.6393, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9912816882133484, "rewards/margins": 0.21040551364421844, "rewards/rejected": -1.201687216758728, "step": 3990 }, { "epoch": 0.6891798759476223, "grad_norm": 26.346763610839844, "learning_rate": 1.8992720046816664e-07, "logits/chosen": -2.566901922225952, "logits/rejected": -2.5389089584350586, "logps/chosen": -149.53170776367188, "logps/rejected": -166.23568725585938, "loss": 0.6203, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.918075680732727, "rewards/margins": 0.22810205817222595, "rewards/rejected": -1.1461777687072754, "step": 4000 }, { "epoch": 0.6891798759476223, "eval_logits/chosen": -2.5642409324645996, "eval_logits/rejected": -2.558300256729126, "eval_logps/chosen": -137.52479553222656, "eval_logps/rejected": -154.57752990722656, "eval_loss": 0.6523177623748779, "eval_rewards/accuracies": 0.6166356801986694, "eval_rewards/chosen": -0.7850932478904724, "eval_rewards/margins": 0.1331859678030014, "eval_rewards/rejected": -0.9182791709899902, "eval_runtime": 382.9463, "eval_samples_per_second": 11.239, "eval_steps_per_second": 1.405, "step": 4000 }, { "epoch": 0.6909028256374914, "grad_norm": 20.584022521972656, "learning_rate": 1.8983932990406229e-07, "logits/chosen": -2.4782555103302, "logits/rejected": -2.4685940742492676, "logps/chosen": -147.3070526123047, "logps/rejected": -178.8524932861328, "loss": 0.6062, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9803979992866516, "rewards/margins": 0.2875971496105194, "rewards/rejected": -1.2679951190948486, "step": 4010 }, { "epoch": 0.6926257753273605, "grad_norm": 18.147808074951172, "learning_rate": 1.8975109823944039e-07, "logits/chosen": -2.484112501144409, "logits/rejected": -2.479996919631958, "logps/chosen": -154.5767822265625, "logps/rejected": -175.74847412109375, "loss": 0.6362, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0183777809143066, "rewards/margins": 0.2127773016691208, "rewards/rejected": -1.2311551570892334, "step": 4020 }, { "epoch": 0.6943487250172296, "grad_norm": 16.098173141479492, "learning_rate": 1.8966250582893953e-07, "logits/chosen": -2.4954991340637207, "logits/rejected": -2.477116346359253, "logps/chosen": -149.50643920898438, "logps/rejected": -165.57916259765625, "loss": 0.6407, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.9233428835868835, "rewards/margins": 0.1793515682220459, "rewards/rejected": -1.1026945114135742, "step": 4030 }, { "epoch": 0.6960716747070985, "grad_norm": 21.584396362304688, "learning_rate": 1.8957355302864842e-07, "logits/chosen": -2.548405885696411, "logits/rejected": -2.535510301589966, "logps/chosen": -143.77305603027344, "logps/rejected": -166.7368621826172, "loss": 0.6117, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8692083358764648, "rewards/margins": 0.2582603096961975, "rewards/rejected": -1.1274688243865967, "step": 4040 }, { "epoch": 0.6977946243969676, "grad_norm": 19.66901206970215, "learning_rate": 1.894842401961042e-07, "logits/chosen": -2.4985928535461426, "logits/rejected": -2.481311321258545, "logps/chosen": -143.23513793945312, "logps/rejected": -169.25555419921875, "loss": 0.6129, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9076918363571167, "rewards/margins": 0.2686161398887634, "rewards/rejected": -1.1763079166412354, "step": 4050 }, { "epoch": 0.6995175740868367, "grad_norm": 19.5911865234375, "learning_rate": 1.8939456769029122e-07, "logits/chosen": -2.475037097930908, "logits/rejected": -2.4540417194366455, "logps/chosen": -163.9054412841797, "logps/rejected": -176.58795166015625, "loss": 0.6458, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.0669587850570679, "rewards/margins": 0.18664845824241638, "rewards/rejected": -1.2536073923110962, "step": 4060 }, { "epoch": 0.7012405237767058, "grad_norm": 21.787147521972656, "learning_rate": 1.8930453587163949e-07, "logits/chosen": -2.4452805519104004, "logits/rejected": -2.424757719039917, "logps/chosen": -152.8466033935547, "logps/rejected": -182.3805694580078, "loss": 0.5859, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9907079935073853, "rewards/margins": 0.31428462266921997, "rewards/rejected": -1.30499267578125, "step": 4070 }, { "epoch": 0.7029634734665747, "grad_norm": 30.879796981811523, "learning_rate": 1.8921414510202317e-07, "logits/chosen": -2.441105604171753, "logits/rejected": -2.4298994541168213, "logps/chosen": -159.96908569335938, "logps/rejected": -184.31655883789062, "loss": 0.6232, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0666249990463257, "rewards/margins": 0.24226295948028564, "rewards/rejected": -1.3088879585266113, "step": 4080 }, { "epoch": 0.7046864231564438, "grad_norm": 25.079172134399414, "learning_rate": 1.8912339574475925e-07, "logits/chosen": -2.4596691131591797, "logits/rejected": -2.4346156120300293, "logps/chosen": -163.6860809326172, "logps/rejected": -190.64089965820312, "loss": 0.6097, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.0688049793243408, "rewards/margins": 0.30191558599472046, "rewards/rejected": -1.3707208633422852, "step": 4090 }, { "epoch": 0.7064093728463129, "grad_norm": 31.3631534576416, "learning_rate": 1.8903228816460598e-07, "logits/chosen": -2.458258628845215, "logits/rejected": -2.435795545578003, "logps/chosen": -164.78323364257812, "logps/rejected": -185.1956024169922, "loss": 0.6139, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1078027486801147, "rewards/margins": 0.2592393755912781, "rewards/rejected": -1.3670421838760376, "step": 4100 }, { "epoch": 0.708132322536182, "grad_norm": 21.405792236328125, "learning_rate": 1.8894082272776156e-07, "logits/chosen": -2.42065691947937, "logits/rejected": -2.409475803375244, "logps/chosen": -173.5276641845703, "logps/rejected": -180.2970733642578, "loss": 0.6803, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1685502529144287, "rewards/margins": 0.1270175725221634, "rewards/rejected": -1.2955677509307861, "step": 4110 }, { "epoch": 0.709855272226051, "grad_norm": 17.203845977783203, "learning_rate": 1.8884899980186248e-07, "logits/chosen": -2.423537254333496, "logits/rejected": -2.426220417022705, "logps/chosen": -153.18441772460938, "logps/rejected": -176.77294921875, "loss": 0.6317, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0196375846862793, "rewards/margins": 0.21294169127941132, "rewards/rejected": -1.2325794696807861, "step": 4120 }, { "epoch": 0.71157822191592, "grad_norm": 19.912616729736328, "learning_rate": 1.8875681975598207e-07, "logits/chosen": -2.497934579849243, "logits/rejected": -2.4791147708892822, "logps/chosen": -146.62086486816406, "logps/rejected": -164.03176879882812, "loss": 0.6238, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9095829725265503, "rewards/margins": 0.21481628715991974, "rewards/rejected": -1.1243993043899536, "step": 4130 }, { "epoch": 0.7133011716057891, "grad_norm": 18.456600189208984, "learning_rate": 1.8866428296062916e-07, "logits/chosen": -2.5012519359588623, "logits/rejected": -2.4845516681671143, "logps/chosen": -154.83172607421875, "logps/rejected": -159.90524291992188, "loss": 0.6716, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.9765934944152832, "rewards/margins": 0.11888899654150009, "rewards/rejected": -1.095482587814331, "step": 4140 }, { "epoch": 0.7150241212956582, "grad_norm": 15.824003219604492, "learning_rate": 1.8857138978774647e-07, "logits/chosen": -2.5242793560028076, "logits/rejected": -2.506749391555786, "logps/chosen": -147.20632934570312, "logps/rejected": -158.5218963623047, "loss": 0.6329, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8848239183425903, "rewards/margins": 0.19589009881019592, "rewards/rejected": -1.0807139873504639, "step": 4150 }, { "epoch": 0.7167470709855273, "grad_norm": 15.421948432922363, "learning_rate": 1.8847814061070917e-07, "logits/chosen": -2.468259334564209, "logits/rejected": -2.4435718059539795, "logps/chosen": -138.98873901367188, "logps/rejected": -157.41748046875, "loss": 0.6159, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.857384204864502, "rewards/margins": 0.23457948863506317, "rewards/rejected": -1.091963529586792, "step": 4160 }, { "epoch": 0.7184700206753962, "grad_norm": 16.917064666748047, "learning_rate": 1.8838453580432328e-07, "logits/chosen": -2.503840208053589, "logits/rejected": -2.495299816131592, "logps/chosen": -145.8829345703125, "logps/rejected": -167.2362060546875, "loss": 0.6283, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9338346719741821, "rewards/margins": 0.21018731594085693, "rewards/rejected": -1.1440218687057495, "step": 4170 }, { "epoch": 0.7201929703652653, "grad_norm": 21.029468536376953, "learning_rate": 1.882905757448243e-07, "logits/chosen": -2.4621734619140625, "logits/rejected": -2.442660093307495, "logps/chosen": -156.11219787597656, "logps/rejected": -180.0277557373047, "loss": 0.6166, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.992640495300293, "rewards/margins": 0.24560074508190155, "rewards/rejected": -1.238241195678711, "step": 4180 }, { "epoch": 0.7219159200551344, "grad_norm": 21.128299713134766, "learning_rate": 1.8819626080987567e-07, "logits/chosen": -2.4580259323120117, "logits/rejected": -2.445139169692993, "logps/chosen": -163.23568725585938, "logps/rejected": -186.8887176513672, "loss": 0.6251, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.112467646598816, "rewards/margins": 0.236087366938591, "rewards/rejected": -1.3485549688339233, "step": 4190 }, { "epoch": 0.7236388697450035, "grad_norm": 17.36494255065918, "learning_rate": 1.881015913785671e-07, "logits/chosen": -2.4943463802337646, "logits/rejected": -2.4810996055603027, "logps/chosen": -159.11692810058594, "logps/rejected": -166.8761749267578, "loss": 0.6656, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.0203524827957153, "rewards/margins": 0.1343226134777069, "rewards/rejected": -1.154675006866455, "step": 4200 }, { "epoch": 0.7253618194348725, "grad_norm": 26.34899139404297, "learning_rate": 1.880065678314133e-07, "logits/chosen": -2.479694128036499, "logits/rejected": -2.4616291522979736, "logps/chosen": -154.9434051513672, "logps/rejected": -165.20925903320312, "loss": 0.6586, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9922011494636536, "rewards/margins": 0.1440073698759079, "rewards/rejected": -1.1362085342407227, "step": 4210 }, { "epoch": 0.7270847691247415, "grad_norm": 16.302705764770508, "learning_rate": 1.8791119055035221e-07, "logits/chosen": -2.394646167755127, "logits/rejected": -2.3819780349731445, "logps/chosen": -142.0032958984375, "logps/rejected": -160.90249633789062, "loss": 0.6331, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8893952369689941, "rewards/margins": 0.19879098236560822, "rewards/rejected": -1.088186264038086, "step": 4220 }, { "epoch": 0.7288077188146106, "grad_norm": 19.511754989624023, "learning_rate": 1.8781545991874362e-07, "logits/chosen": -2.534529209136963, "logits/rejected": -2.5153164863586426, "logps/chosen": -151.25851440429688, "logps/rejected": -167.86810302734375, "loss": 0.6339, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9434124231338501, "rewards/margins": 0.20468302071094513, "rewards/rejected": -1.1480954885482788, "step": 4230 }, { "epoch": 0.7305306685044797, "grad_norm": 17.765453338623047, "learning_rate": 1.8771937632136753e-07, "logits/chosen": -2.4428322315216064, "logits/rejected": -2.4241092205047607, "logps/chosen": -145.80564880371094, "logps/rejected": -167.83908081054688, "loss": 0.6246, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9024696350097656, "rewards/margins": 0.22954091429710388, "rewards/rejected": -1.1320104598999023, "step": 4240 }, { "epoch": 0.7322536181943488, "grad_norm": 18.9899845123291, "learning_rate": 1.8762294014442275e-07, "logits/chosen": -2.456968307495117, "logits/rejected": -2.4411208629608154, "logps/chosen": -141.48629760742188, "logps/rejected": -157.35751342773438, "loss": 0.6276, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8507699966430664, "rewards/margins": 0.20453086495399475, "rewards/rejected": -1.0553009510040283, "step": 4250 }, { "epoch": 0.7339765678842178, "grad_norm": 18.64900779724121, "learning_rate": 1.8752615177552515e-07, "logits/chosen": -2.440948724746704, "logits/rejected": -2.423311710357666, "logps/chosen": -142.525634765625, "logps/rejected": -165.6720733642578, "loss": 0.6281, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.866360068321228, "rewards/margins": 0.2523943781852722, "rewards/rejected": -1.118754506111145, "step": 4260 }, { "epoch": 0.7356995175740868, "grad_norm": 26.834325790405273, "learning_rate": 1.8742901160370629e-07, "logits/chosen": -2.439136028289795, "logits/rejected": -2.4189846515655518, "logps/chosen": -136.86460876464844, "logps/rejected": -156.23016357421875, "loss": 0.6264, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8397220373153687, "rewards/margins": 0.21862992644309998, "rewards/rejected": -1.058351755142212, "step": 4270 }, { "epoch": 0.7374224672639559, "grad_norm": 21.31093978881836, "learning_rate": 1.8733152001941162e-07, "logits/chosen": -2.433461904525757, "logits/rejected": -2.416278600692749, "logps/chosen": -143.94137573242188, "logps/rejected": -168.7376251220703, "loss": 0.6115, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9106764793395996, "rewards/margins": 0.2685638964176178, "rewards/rejected": -1.1792404651641846, "step": 4280 }, { "epoch": 0.739145416953825, "grad_norm": 23.622499465942383, "learning_rate": 1.872336774144992e-07, "logits/chosen": -2.4091594219207764, "logits/rejected": -2.389822006225586, "logps/chosen": -163.57362365722656, "logps/rejected": -182.37948608398438, "loss": 0.6325, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0774534940719604, "rewards/margins": 0.2432311475276947, "rewards/rejected": -1.3206846714019775, "step": 4290 }, { "epoch": 0.740868366643694, "grad_norm": 15.690317153930664, "learning_rate": 1.8713548418223797e-07, "logits/chosen": -2.4367196559906006, "logits/rejected": -2.425534725189209, "logps/chosen": -162.27609252929688, "logps/rejected": -184.33816528320312, "loss": 0.6228, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.085301160812378, "rewards/margins": 0.24133506417274475, "rewards/rejected": -1.3266363143920898, "step": 4300 }, { "epoch": 0.742591316333563, "grad_norm": 22.461139678955078, "learning_rate": 1.8703694071730612e-07, "logits/chosen": -2.364335060119629, "logits/rejected": -2.343078851699829, "logps/chosen": -166.93553161621094, "logps/rejected": -187.39309692382812, "loss": 0.6172, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.115869164466858, "rewards/margins": 0.24821288883686066, "rewards/rejected": -1.3640820980072021, "step": 4310 }, { "epoch": 0.7443142660234321, "grad_norm": 22.394622802734375, "learning_rate": 1.8693804741578964e-07, "logits/chosen": -2.4276387691497803, "logits/rejected": -2.403562307357788, "logps/chosen": -168.36390686035156, "logps/rejected": -196.73269653320312, "loss": 0.5796, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0858043432235718, "rewards/margins": 0.34008654952049255, "rewards/rejected": -1.4258909225463867, "step": 4320 }, { "epoch": 0.7460372157133012, "grad_norm": 19.556331634521484, "learning_rate": 1.8683880467518055e-07, "logits/chosen": -2.3903796672821045, "logits/rejected": -2.366075038909912, "logps/chosen": -168.3106689453125, "logps/rejected": -182.62008666992188, "loss": 0.6442, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1263628005981445, "rewards/margins": 0.20899710059165955, "rewards/rejected": -1.3353599309921265, "step": 4330 }, { "epoch": 0.7477601654031703, "grad_norm": 22.436267852783203, "learning_rate": 1.8673921289437554e-07, "logits/chosen": -2.3715744018554688, "logits/rejected": -2.3499209880828857, "logps/chosen": -154.8798828125, "logps/rejected": -184.8426055908203, "loss": 0.6044, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0422241687774658, "rewards/margins": 0.3092927634716034, "rewards/rejected": -1.3515169620513916, "step": 4340 }, { "epoch": 0.7494831150930393, "grad_norm": 23.099063873291016, "learning_rate": 1.8663927247367407e-07, "logits/chosen": -2.3912670612335205, "logits/rejected": -2.3795111179351807, "logps/chosen": -151.63742065429688, "logps/rejected": -180.13919067382812, "loss": 0.6018, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9931322932243347, "rewards/margins": 0.2769755423069, "rewards/rejected": -1.2701078653335571, "step": 4350 }, { "epoch": 0.7512060647829083, "grad_norm": 21.606122970581055, "learning_rate": 1.865389838147771e-07, "logits/chosen": -2.4218342304229736, "logits/rejected": -2.4023971557617188, "logps/chosen": -164.91665649414062, "logps/rejected": -180.6358642578125, "loss": 0.6554, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1022279262542725, "rewards/margins": 0.19608908891677856, "rewards/rejected": -1.2983169555664062, "step": 4360 }, { "epoch": 0.7529290144727774, "grad_norm": 28.500917434692383, "learning_rate": 1.864383473207852e-07, "logits/chosen": -2.443657398223877, "logits/rejected": -2.423034191131592, "logps/chosen": -149.47891235351562, "logps/rejected": -171.32008361816406, "loss": 0.6229, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9359003901481628, "rewards/margins": 0.23575489223003387, "rewards/rejected": -1.171655535697937, "step": 4370 }, { "epoch": 0.7546519641626465, "grad_norm": 25.91777992248535, "learning_rate": 1.8633736339619702e-07, "logits/chosen": -2.4580273628234863, "logits/rejected": -2.4401497840881348, "logps/chosen": -155.8708953857422, "logps/rejected": -171.21290588378906, "loss": 0.6473, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.0010490417480469, "rewards/margins": 0.1942082643508911, "rewards/rejected": -1.195257306098938, "step": 4380 }, { "epoch": 0.7563749138525155, "grad_norm": 23.267330169677734, "learning_rate": 1.8623603244690772e-07, "logits/chosen": -2.4025485515594482, "logits/rejected": -2.3881447315216064, "logps/chosen": -147.8549041748047, "logps/rejected": -173.3671875, "loss": 0.6106, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9391400218009949, "rewards/margins": 0.2640628516674042, "rewards/rejected": -1.2032029628753662, "step": 4390 }, { "epoch": 0.7580978635423845, "grad_norm": 33.47300338745117, "learning_rate": 1.861343548802073e-07, "logits/chosen": -2.4176034927368164, "logits/rejected": -2.3994877338409424, "logps/chosen": -165.08901977539062, "logps/rejected": -180.73426818847656, "loss": 0.6341, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0616172552108765, "rewards/margins": 0.19609788060188293, "rewards/rejected": -1.257715106010437, "step": 4400 }, { "epoch": 0.7580978635423845, "eval_logits/chosen": -2.4722869396209717, "eval_logits/rejected": -2.4643163681030273, "eval_logps/chosen": -146.87518310546875, "eval_logps/rejected": -165.33773803710938, "eval_loss": 0.6487003564834595, "eval_rewards/accuracies": 0.6129181981086731, "eval_rewards/chosen": -0.8785969614982605, "eval_rewards/margins": 0.14728423953056335, "eval_rewards/rejected": -1.0258814096450806, "eval_runtime": 382.734, "eval_samples_per_second": 11.245, "eval_steps_per_second": 1.406, "step": 4400 }, { "epoch": 0.7598208132322536, "grad_norm": 26.31522560119629, "learning_rate": 1.8603233110477884e-07, "logits/chosen": -2.35225510597229, "logits/rejected": -2.3370361328125, "logps/chosen": -167.1627197265625, "logps/rejected": -185.4983367919922, "loss": 0.6481, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1053941249847412, "rewards/margins": 0.20654484629631042, "rewards/rejected": -1.311939001083374, "step": 4410 }, { "epoch": 0.7615437629221227, "grad_norm": 26.59756851196289, "learning_rate": 1.8592996153069715e-07, "logits/chosen": -2.471127986907959, "logits/rejected": -2.4345784187316895, "logps/chosen": -166.3771209716797, "logps/rejected": -178.87313842773438, "loss": 0.6456, "rewards/accuracies": 0.625, "rewards/chosen": -1.103097915649414, "rewards/margins": 0.18923258781433105, "rewards/rejected": -1.2923305034637451, "step": 4420 }, { "epoch": 0.7632667126119917, "grad_norm": 21.480712890625, "learning_rate": 1.8582724656942683e-07, "logits/chosen": -2.392627477645874, "logits/rejected": -2.3656272888183594, "logps/chosen": -159.09445190429688, "logps/rejected": -176.07310485839844, "loss": 0.6416, "rewards/accuracies": 0.625, "rewards/chosen": -1.0466508865356445, "rewards/margins": 0.21135036647319794, "rewards/rejected": -1.2580013275146484, "step": 4430 }, { "epoch": 0.7649896623018608, "grad_norm": 21.1779727935791, "learning_rate": 1.8572418663382074e-07, "logits/chosen": -2.434896945953369, "logits/rejected": -2.4075655937194824, "logps/chosen": -163.56765747070312, "logps/rejected": -185.07579040527344, "loss": 0.6057, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0352531671524048, "rewards/margins": 0.2752748131752014, "rewards/rejected": -1.310528039932251, "step": 4440 }, { "epoch": 0.7667126119917298, "grad_norm": 17.894512176513672, "learning_rate": 1.8562078213811833e-07, "logits/chosen": -2.3739511966705322, "logits/rejected": -2.3611817359924316, "logps/chosen": -149.56271362304688, "logps/rejected": -168.64669799804688, "loss": 0.62, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9247000813484192, "rewards/margins": 0.23195107281208038, "rewards/rejected": -1.1566511392593384, "step": 4450 }, { "epoch": 0.7684355616815989, "grad_norm": 20.049449920654297, "learning_rate": 1.8551703349794406e-07, "logits/chosen": -2.448498249053955, "logits/rejected": -2.4351491928100586, "logps/chosen": -145.53578186035156, "logps/rejected": -174.42141723632812, "loss": 0.6027, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9232245683670044, "rewards/margins": 0.30191126465797424, "rewards/rejected": -1.2251358032226562, "step": 4460 }, { "epoch": 0.770158511371468, "grad_norm": 19.79228401184082, "learning_rate": 1.854129411303055e-07, "logits/chosen": -2.47465181350708, "logits/rejected": -2.4493346214294434, "logps/chosen": -153.1445770263672, "logps/rejected": -166.31228637695312, "loss": 0.6392, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.9666484594345093, "rewards/margins": 0.20043261349201202, "rewards/rejected": -1.1670811176300049, "step": 4470 }, { "epoch": 0.771881461061337, "grad_norm": 24.481473922729492, "learning_rate": 1.8530850545359193e-07, "logits/chosen": -2.4872870445251465, "logits/rejected": -2.4746615886688232, "logps/chosen": -152.59925842285156, "logps/rejected": -171.68936157226562, "loss": 0.643, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0053293704986572, "rewards/margins": 0.20094509422779083, "rewards/rejected": -1.2062745094299316, "step": 4480 }, { "epoch": 0.7736044107512061, "grad_norm": 21.01685333251953, "learning_rate": 1.8520372688757245e-07, "logits/chosen": -2.4067769050598145, "logits/rejected": -2.3791446685791016, "logps/chosen": -146.48553466796875, "logps/rejected": -164.83828735351562, "loss": 0.6286, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9418350458145142, "rewards/margins": 0.2116595208644867, "rewards/rejected": -1.1534945964813232, "step": 4490 }, { "epoch": 0.7753273604410751, "grad_norm": 21.07688331604004, "learning_rate": 1.8509860585339446e-07, "logits/chosen": -2.392287015914917, "logits/rejected": -2.371561288833618, "logps/chosen": -159.52886962890625, "logps/rejected": -180.74732971191406, "loss": 0.6216, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.017209768295288, "rewards/margins": 0.2693372368812561, "rewards/rejected": -1.286547064781189, "step": 4500 }, { "epoch": 0.7770503101309442, "grad_norm": 23.083614349365234, "learning_rate": 1.8499314277358167e-07, "logits/chosen": -2.4393038749694824, "logits/rejected": -2.4085822105407715, "logps/chosen": -176.33653259277344, "logps/rejected": -210.8615264892578, "loss": 0.5921, "rewards/accuracies": 0.71875, "rewards/chosen": -1.209841012954712, "rewards/margins": 0.34841373562812805, "rewards/rejected": -1.558254599571228, "step": 4510 }, { "epoch": 0.7787732598208132, "grad_norm": 26.21770477294922, "learning_rate": 1.848873380720329e-07, "logits/chosen": -2.3809621334075928, "logits/rejected": -2.3610141277313232, "logps/chosen": -198.77658081054688, "logps/rejected": -216.4871826171875, "loss": 0.6556, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4526500701904297, "rewards/margins": 0.215979665517807, "rewards/rejected": -1.6686298847198486, "step": 4520 }, { "epoch": 0.7804962095106823, "grad_norm": 35.826942443847656, "learning_rate": 1.8478119217401985e-07, "logits/chosen": -2.3922924995422363, "logits/rejected": -2.3780174255371094, "logps/chosen": -169.76864624023438, "logps/rejected": -187.02670288085938, "loss": 0.6569, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1504439115524292, "rewards/margins": 0.19426843523979187, "rewards/rejected": -1.3447123765945435, "step": 4530 }, { "epoch": 0.7822191592005513, "grad_norm": 36.92649841308594, "learning_rate": 1.8467470550618574e-07, "logits/chosen": -2.3660781383514404, "logits/rejected": -2.3454301357269287, "logps/chosen": -154.78317260742188, "logps/rejected": -174.80319213867188, "loss": 0.6292, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0085923671722412, "rewards/margins": 0.23993799090385437, "rewards/rejected": -1.248530387878418, "step": 4540 }, { "epoch": 0.7839421088904204, "grad_norm": 25.094932556152344, "learning_rate": 1.8456787849654347e-07, "logits/chosen": -2.4405195713043213, "logits/rejected": -2.4174976348876953, "logps/chosen": -152.58892822265625, "logps/rejected": -170.98582458496094, "loss": 0.6282, "rewards/accuracies": 0.65625, "rewards/chosen": -0.946140468120575, "rewards/margins": 0.23043744266033173, "rewards/rejected": -1.1765779256820679, "step": 4550 }, { "epoch": 0.7856650585802895, "grad_norm": 21.068693161010742, "learning_rate": 1.844607115744739e-07, "logits/chosen": -2.3593530654907227, "logits/rejected": -2.326908826828003, "logps/chosen": -164.55909729003906, "logps/rejected": -188.50650024414062, "loss": 0.6079, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0752851963043213, "rewards/margins": 0.2878347933292389, "rewards/rejected": -1.3631200790405273, "step": 4560 }, { "epoch": 0.7873880082701585, "grad_norm": 30.99504852294922, "learning_rate": 1.8435320517072408e-07, "logits/chosen": -2.3525474071502686, "logits/rejected": -2.3298683166503906, "logps/chosen": -182.09889221191406, "logps/rejected": -209.79025268554688, "loss": 0.6312, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2944326400756836, "rewards/margins": 0.2926630973815918, "rewards/rejected": -1.5870954990386963, "step": 4570 }, { "epoch": 0.7891109579600276, "grad_norm": 18.812456130981445, "learning_rate": 1.842453597174057e-07, "logits/chosen": -2.3480334281921387, "logits/rejected": -2.322362184524536, "logps/chosen": -163.5109405517578, "logps/rejected": -184.9033203125, "loss": 0.6169, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0780930519104004, "rewards/margins": 0.25485703349113464, "rewards/rejected": -1.3329499959945679, "step": 4580 }, { "epoch": 0.7908339076498966, "grad_norm": 24.78331756591797, "learning_rate": 1.841371756479931e-07, "logits/chosen": -2.414440155029297, "logits/rejected": -2.389694929122925, "logps/chosen": -177.2312469482422, "logps/rejected": -192.85855102539062, "loss": 0.6596, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2167154550552368, "rewards/margins": 0.19507446885108948, "rewards/rejected": -1.4117896556854248, "step": 4590 }, { "epoch": 0.7925568573397657, "grad_norm": 26.33585548400879, "learning_rate": 1.8402865339732171e-07, "logits/chosen": -2.340993642807007, "logits/rejected": -2.3130643367767334, "logps/chosen": -172.70472717285156, "logps/rejected": -210.70401000976562, "loss": 0.5706, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1847777366638184, "rewards/margins": 0.3849853575229645, "rewards/rejected": -1.5697630643844604, "step": 4600 }, { "epoch": 0.7942798070296347, "grad_norm": 26.14021873474121, "learning_rate": 1.8391979340158627e-07, "logits/chosen": -2.32139253616333, "logits/rejected": -2.3075783252716064, "logps/chosen": -203.4938507080078, "logps/rejected": -219.33523559570312, "loss": 0.6653, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5140093564987183, "rewards/margins": 0.18047411739826202, "rewards/rejected": -1.6944833993911743, "step": 4610 }, { "epoch": 0.7960027567195038, "grad_norm": 27.7562255859375, "learning_rate": 1.8381059609833904e-07, "logits/chosen": -2.35945200920105, "logits/rejected": -2.3298919200897217, "logps/chosen": -203.76455688476562, "logps/rejected": -238.9661102294922, "loss": 0.5745, "rewards/accuracies": 0.71875, "rewards/chosen": -1.470356822013855, "rewards/margins": 0.36763960123062134, "rewards/rejected": -1.837996244430542, "step": 4620 }, { "epoch": 0.7977257064093728, "grad_norm": 39.02242660522461, "learning_rate": 1.83701061926488e-07, "logits/chosen": -2.339899778366089, "logits/rejected": -2.3107967376708984, "logps/chosen": -224.0686798095703, "logps/rejected": -250.54379272460938, "loss": 0.6192, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6597421169281006, "rewards/margins": 0.29094910621643066, "rewards/rejected": -1.9506912231445312, "step": 4630 }, { "epoch": 0.7994486560992419, "grad_norm": 22.518293380737305, "learning_rate": 1.8359119132629522e-07, "logits/chosen": -2.3632376194000244, "logits/rejected": -2.351179838180542, "logps/chosen": -203.08949279785156, "logps/rejected": -226.6865692138672, "loss": 0.6346, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.48097825050354, "rewards/margins": 0.2385386973619461, "rewards/rejected": -1.7195169925689697, "step": 4640 }, { "epoch": 0.801171605789111, "grad_norm": 17.876625061035156, "learning_rate": 1.8348098473937498e-07, "logits/chosen": -2.40993595123291, "logits/rejected": -2.384605884552002, "logps/chosen": -165.33267211914062, "logps/rejected": -181.77304077148438, "loss": 0.6362, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.100512981414795, "rewards/margins": 0.21383455395698547, "rewards/rejected": -1.3143476247787476, "step": 4650 }, { "epoch": 0.80289455547898, "grad_norm": 18.722620010375977, "learning_rate": 1.8337044260869195e-07, "logits/chosen": -2.4083523750305176, "logits/rejected": -2.3905911445617676, "logps/chosen": -142.04159545898438, "logps/rejected": -158.0908660888672, "loss": 0.6309, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.8668336868286133, "rewards/margins": 0.1838662177324295, "rewards/rejected": -1.0506998300552368, "step": 4660 }, { "epoch": 0.8046175051688491, "grad_norm": 16.902860641479492, "learning_rate": 1.8325956537855964e-07, "logits/chosen": -2.4333815574645996, "logits/rejected": -2.4127719402313232, "logps/chosen": -142.946533203125, "logps/rejected": -163.60421752929688, "loss": 0.6064, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.900127112865448, "rewards/margins": 0.25424256920814514, "rewards/rejected": -1.1543697118759155, "step": 4670 }, { "epoch": 0.8063404548587181, "grad_norm": 17.532752990722656, "learning_rate": 1.8314835349463834e-07, "logits/chosen": -2.4113612174987793, "logits/rejected": -2.3857650756835938, "logps/chosen": -160.74105834960938, "logps/rejected": -179.22518920898438, "loss": 0.6465, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0686453580856323, "rewards/margins": 0.20836098492145538, "rewards/rejected": -1.2770063877105713, "step": 4680 }, { "epoch": 0.8080634045485872, "grad_norm": 20.075408935546875, "learning_rate": 1.8303680740393354e-07, "logits/chosen": -2.432438373565674, "logits/rejected": -2.411315441131592, "logps/chosen": -167.2003936767578, "logps/rejected": -194.3954620361328, "loss": 0.6118, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1071606874465942, "rewards/margins": 0.3014344871044159, "rewards/rejected": -1.408595323562622, "step": 4690 }, { "epoch": 0.8097863542384562, "grad_norm": 23.070444107055664, "learning_rate": 1.829249275547939e-07, "logits/chosen": -2.41184139251709, "logits/rejected": -2.3968803882598877, "logps/chosen": -161.8149871826172, "logps/rejected": -190.91075134277344, "loss": 0.6151, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0889852046966553, "rewards/margins": 0.2751777768135071, "rewards/rejected": -1.3641631603240967, "step": 4700 }, { "epoch": 0.8115093039283253, "grad_norm": 22.357309341430664, "learning_rate": 1.8281271439690972e-07, "logits/chosen": -2.4170899391174316, "logits/rejected": -2.387962818145752, "logps/chosen": -178.35140991210938, "logps/rejected": -203.06228637695312, "loss": 0.5967, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1947414875030518, "rewards/margins": 0.3194465637207031, "rewards/rejected": -1.5141879320144653, "step": 4710 }, { "epoch": 0.8132322536181944, "grad_norm": 21.492481231689453, "learning_rate": 1.8270016838131098e-07, "logits/chosen": -2.328721523284912, "logits/rejected": -2.2992498874664307, "logps/chosen": -178.8650360107422, "logps/rejected": -209.41500854492188, "loss": 0.5832, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.211004614830017, "rewards/margins": 0.36220628023147583, "rewards/rejected": -1.5732109546661377, "step": 4720 }, { "epoch": 0.8149552033080634, "grad_norm": 27.255842208862305, "learning_rate": 1.825872899603655e-07, "logits/chosen": -2.312281370162964, "logits/rejected": -2.2868740558624268, "logps/chosen": -181.79415893554688, "logps/rejected": -201.48757934570312, "loss": 0.6487, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.268235445022583, "rewards/margins": 0.2422148734331131, "rewards/rejected": -1.5104503631591797, "step": 4730 }, { "epoch": 0.8166781529979324, "grad_norm": 21.115175247192383, "learning_rate": 1.824740795877772e-07, "logits/chosen": -2.390545606613159, "logits/rejected": -2.3746256828308105, "logps/chosen": -158.5949249267578, "logps/rejected": -193.73794555664062, "loss": 0.5753, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0280969142913818, "rewards/margins": 0.3749907612800598, "rewards/rejected": -1.403087854385376, "step": 4740 }, { "epoch": 0.8184011026878015, "grad_norm": 18.447275161743164, "learning_rate": 1.8236053771858428e-07, "logits/chosen": -2.355696678161621, "logits/rejected": -2.348707914352417, "logps/chosen": -179.67068481445312, "logps/rejected": -197.00306701660156, "loss": 0.6373, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.231192946434021, "rewards/margins": 0.2000356912612915, "rewards/rejected": -1.4312288761138916, "step": 4750 }, { "epoch": 0.8201240523776706, "grad_norm": 22.534955978393555, "learning_rate": 1.8224666480915732e-07, "logits/chosen": -2.3500988483428955, "logits/rejected": -2.3278253078460693, "logps/chosen": -184.92636108398438, "logps/rejected": -212.4356231689453, "loss": 0.61, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.32185959815979, "rewards/margins": 0.295890748500824, "rewards/rejected": -1.6177504062652588, "step": 4760 }, { "epoch": 0.8218470020675396, "grad_norm": 18.955162048339844, "learning_rate": 1.8213246131719746e-07, "logits/chosen": -2.377048969268799, "logits/rejected": -2.357697010040283, "logps/chosen": -209.9337615966797, "logps/rejected": -226.17276000976562, "loss": 0.6535, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.494385838508606, "rewards/margins": 0.22841958701610565, "rewards/rejected": -1.722805380821228, "step": 4770 }, { "epoch": 0.8235699517574087, "grad_norm": 25.681936264038086, "learning_rate": 1.8201792770173462e-07, "logits/chosen": -2.277048110961914, "logits/rejected": -2.257699728012085, "logps/chosen": -206.84872436523438, "logps/rejected": -243.0885772705078, "loss": 0.5753, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5417685508728027, "rewards/margins": 0.36321893334388733, "rewards/rejected": -1.9049873352050781, "step": 4780 }, { "epoch": 0.8252929014472777, "grad_norm": 19.8004093170166, "learning_rate": 1.8190306442312565e-07, "logits/chosen": -2.3256642818450928, "logits/rejected": -2.3049635887145996, "logps/chosen": -219.4740753173828, "logps/rejected": -230.257568359375, "loss": 0.6755, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6086790561676025, "rewards/margins": 0.17308612167835236, "rewards/rejected": -1.7817652225494385, "step": 4790 }, { "epoch": 0.8270158511371468, "grad_norm": 21.14756202697754, "learning_rate": 1.8178787194305239e-07, "logits/chosen": -2.319889545440674, "logits/rejected": -2.3040847778320312, "logps/chosen": -186.53793334960938, "logps/rejected": -211.5950927734375, "loss": 0.6184, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3383642435073853, "rewards/margins": 0.27897733449935913, "rewards/rejected": -1.6173416376113892, "step": 4800 }, { "epoch": 0.8270158511371468, "eval_logits/chosen": -2.424163818359375, "eval_logits/rejected": -2.414050817489624, "eval_logps/chosen": -166.67298889160156, "eval_logps/rejected": -187.56295776367188, "eval_loss": 0.6454273462295532, "eval_rewards/accuracies": 0.6129181981086731, "eval_rewards/chosen": -1.0765751600265503, "eval_rewards/margins": 0.17155835032463074, "eval_rewards/rejected": -1.2481335401535034, "eval_runtime": 383.5899, "eval_samples_per_second": 11.22, "eval_steps_per_second": 1.403, "step": 4800 }, { "epoch": 0.8287388008270159, "grad_norm": 24.968095779418945, "learning_rate": 1.816723507245199e-07, "logits/chosen": -2.3386826515197754, "logits/rejected": -2.3073079586029053, "logps/chosen": -182.45309448242188, "logps/rejected": -206.2501220703125, "loss": 0.6021, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2412832975387573, "rewards/margins": 0.31198614835739136, "rewards/rejected": -1.553269386291504, "step": 4810 }, { "epoch": 0.8304617505168849, "grad_norm": 23.982210159301758, "learning_rate": 1.8155650123185458e-07, "logits/chosen": -2.3738133907318115, "logits/rejected": -2.3586814403533936, "logps/chosen": -171.57936096191406, "logps/rejected": -196.97628784179688, "loss": 0.6235, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1862354278564453, "rewards/margins": 0.2727213203907013, "rewards/rejected": -1.4589568376541138, "step": 4820 }, { "epoch": 0.832184700206754, "grad_norm": 20.661731719970703, "learning_rate": 1.8144032393070225e-07, "logits/chosen": -2.35894513130188, "logits/rejected": -2.3394012451171875, "logps/chosen": -171.9304656982422, "logps/rejected": -195.32716369628906, "loss": 0.6235, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.168662190437317, "rewards/margins": 0.26239484548568726, "rewards/rejected": -1.4310569763183594, "step": 4830 }, { "epoch": 0.833907649896623, "grad_norm": 20.392499923706055, "learning_rate": 1.8132381928802643e-07, "logits/chosen": -2.3282320499420166, "logits/rejected": -2.2940051555633545, "logps/chosen": -185.9119110107422, "logps/rejected": -214.93667602539062, "loss": 0.5982, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2838799953460693, "rewards/margins": 0.3473908305168152, "rewards/rejected": -1.6312707662582397, "step": 4840 }, { "epoch": 0.8356305995864921, "grad_norm": 20.10554313659668, "learning_rate": 1.8120698777210626e-07, "logits/chosen": -2.3485913276672363, "logits/rejected": -2.3278183937072754, "logps/chosen": -195.07545471191406, "logps/rejected": -225.80789184570312, "loss": 0.5997, "rewards/accuracies": 0.75, "rewards/chosen": -1.397279143333435, "rewards/margins": 0.33234673738479614, "rewards/rejected": -1.729625940322876, "step": 4850 }, { "epoch": 0.8373535492763611, "grad_norm": 36.35572814941406, "learning_rate": 1.8108982985253472e-07, "logits/chosen": -2.330827474594116, "logits/rejected": -2.2981231212615967, "logps/chosen": -211.63613891601562, "logps/rejected": -226.32748413085938, "loss": 0.6464, "rewards/accuracies": 0.625, "rewards/chosen": -1.5219459533691406, "rewards/margins": 0.24193425476551056, "rewards/rejected": -1.7638801336288452, "step": 4860 }, { "epoch": 0.8390764989662302, "grad_norm": 24.555248260498047, "learning_rate": 1.8097234600021679e-07, "logits/chosen": -2.361845016479492, "logits/rejected": -2.331868886947632, "logps/chosen": -208.45864868164062, "logps/rejected": -239.99966430664062, "loss": 0.5848, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4959218502044678, "rewards/margins": 0.3945196568965912, "rewards/rejected": -1.8904415369033813, "step": 4870 }, { "epoch": 0.8407994486560992, "grad_norm": 31.07630157470703, "learning_rate": 1.8085453668736745e-07, "logits/chosen": -2.2912700176239014, "logits/rejected": -2.261535406112671, "logps/chosen": -196.89691162109375, "logps/rejected": -217.8070831298828, "loss": 0.6382, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.431493878364563, "rewards/margins": 0.2531737685203552, "rewards/rejected": -1.6846675872802734, "step": 4880 }, { "epoch": 0.8425223983459683, "grad_norm": 27.161630630493164, "learning_rate": 1.8073640238750988e-07, "logits/chosen": -2.3718910217285156, "logits/rejected": -2.345691680908203, "logps/chosen": -184.5735321044922, "logps/rejected": -217.81478881835938, "loss": 0.5836, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2863454818725586, "rewards/margins": 0.36429768800735474, "rewards/rejected": -1.6506431102752686, "step": 4890 }, { "epoch": 0.8442453480358374, "grad_norm": 24.7296199798584, "learning_rate": 1.806179435754735e-07, "logits/chosen": -2.3248345851898193, "logits/rejected": -2.295557975769043, "logps/chosen": -179.61318969726562, "logps/rejected": -199.75857543945312, "loss": 0.6453, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2582156658172607, "rewards/margins": 0.245769664645195, "rewards/rejected": -1.5039854049682617, "step": 4900 }, { "epoch": 0.8459682977257064, "grad_norm": 26.73059844970703, "learning_rate": 1.804991607273921e-07, "logits/chosen": -2.432668924331665, "logits/rejected": -2.3986926078796387, "logps/chosen": -181.2235565185547, "logps/rejected": -205.87939453125, "loss": 0.611, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2396998405456543, "rewards/margins": 0.3025253415107727, "rewards/rejected": -1.5422253608703613, "step": 4910 }, { "epoch": 0.8476912474155754, "grad_norm": 25.337461471557617, "learning_rate": 1.8038005432070183e-07, "logits/chosen": -2.291698455810547, "logits/rejected": -2.2636170387268066, "logps/chosen": -193.2530059814453, "logps/rejected": -228.5478515625, "loss": 0.5935, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3832341432571411, "rewards/margins": 0.38054174184799194, "rewards/rejected": -1.7637760639190674, "step": 4920 }, { "epoch": 0.8494141971054445, "grad_norm": 30.94846534729004, "learning_rate": 1.8026062483413943e-07, "logits/chosen": -2.388110637664795, "logits/rejected": -2.367668628692627, "logps/chosen": -214.96780395507812, "logps/rejected": -244.97134399414062, "loss": 0.6433, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.6070770025253296, "rewards/margins": 0.27405670285224915, "rewards/rejected": -1.8811336755752563, "step": 4930 }, { "epoch": 0.8511371467953136, "grad_norm": 26.88053321838379, "learning_rate": 1.8014087274774018e-07, "logits/chosen": -2.3821301460266113, "logits/rejected": -2.354160785675049, "logps/chosen": -198.7532958984375, "logps/rejected": -225.3708038330078, "loss": 0.6198, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4216960668563843, "rewards/margins": 0.3176547884941101, "rewards/rejected": -1.7393509149551392, "step": 4940 }, { "epoch": 0.8528600964851827, "grad_norm": 22.982711791992188, "learning_rate": 1.8002079854283605e-07, "logits/chosen": -2.257017135620117, "logits/rejected": -2.2330005168914795, "logps/chosen": -185.67608642578125, "logps/rejected": -210.94851684570312, "loss": 0.6271, "rewards/accuracies": 0.65625, "rewards/chosen": -1.29585862159729, "rewards/margins": 0.2753117084503174, "rewards/rejected": -1.5711703300476074, "step": 4950 }, { "epoch": 0.8545830461750517, "grad_norm": 33.373817443847656, "learning_rate": 1.799004027020537e-07, "logits/chosen": -2.32515025138855, "logits/rejected": -2.3151698112487793, "logps/chosen": -180.51144409179688, "logps/rejected": -223.17764282226562, "loss": 0.5729, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2766039371490479, "rewards/margins": 0.4046260714530945, "rewards/rejected": -1.6812299489974976, "step": 4960 }, { "epoch": 0.8563059958649207, "grad_norm": 27.193952560424805, "learning_rate": 1.7977968570931262e-07, "logits/chosen": -2.282940149307251, "logits/rejected": -2.2675819396972656, "logps/chosen": -193.19253540039062, "logps/rejected": -230.05044555664062, "loss": 0.5858, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.404479742050171, "rewards/margins": 0.3858822286128998, "rewards/rejected": -1.790361762046814, "step": 4970 }, { "epoch": 0.8580289455547898, "grad_norm": 28.381431579589844, "learning_rate": 1.796586480498231e-07, "logits/chosen": -2.3110270500183105, "logits/rejected": -2.2937839031219482, "logps/chosen": -199.82888793945312, "logps/rejected": -233.05908203125, "loss": 0.5956, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4531872272491455, "rewards/margins": 0.3392183184623718, "rewards/rejected": -1.7924054861068726, "step": 4980 }, { "epoch": 0.8597518952446589, "grad_norm": 32.66222381591797, "learning_rate": 1.7953729021008434e-07, "logits/chosen": -2.265355348587036, "logits/rejected": -2.245307445526123, "logps/chosen": -211.9461669921875, "logps/rejected": -252.12063598632812, "loss": 0.5988, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5934269428253174, "rewards/margins": 0.3837546706199646, "rewards/rejected": -1.9771816730499268, "step": 4990 }, { "epoch": 0.8614748449345279, "grad_norm": 29.829771041870117, "learning_rate": 1.7941561267788245e-07, "logits/chosen": -2.279102325439453, "logits/rejected": -2.249908924102783, "logps/chosen": -200.36363220214844, "logps/rejected": -233.7318572998047, "loss": 0.5876, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4332067966461182, "rewards/margins": 0.3809499740600586, "rewards/rejected": -1.8141567707061768, "step": 5000 }, { "epoch": 0.8631977946243969, "grad_norm": 23.35953712463379, "learning_rate": 1.7929361594228852e-07, "logits/chosen": -2.2788968086242676, "logits/rejected": -2.2506165504455566, "logps/chosen": -198.2695770263672, "logps/rejected": -231.7457733154297, "loss": 0.5872, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4459125995635986, "rewards/margins": 0.37506669759750366, "rewards/rejected": -1.820979356765747, "step": 5010 }, { "epoch": 0.864920744314266, "grad_norm": 23.324260711669922, "learning_rate": 1.7917130049365672e-07, "logits/chosen": -2.2702226638793945, "logits/rejected": -2.237178325653076, "logps/chosen": -186.34371948242188, "logps/rejected": -223.6638946533203, "loss": 0.59, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3197271823883057, "rewards/margins": 0.4005703032016754, "rewards/rejected": -1.7202975749969482, "step": 5020 }, { "epoch": 0.8666436940041351, "grad_norm": 48.629798889160156, "learning_rate": 1.7904866682362213e-07, "logits/chosen": -2.2448463439941406, "logits/rejected": -2.2259209156036377, "logps/chosen": -208.24423217773438, "logps/rejected": -233.9544677734375, "loss": 0.6497, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5443228483200073, "rewards/margins": 0.2657424807548523, "rewards/rejected": -1.8100652694702148, "step": 5030 }, { "epoch": 0.8683666436940042, "grad_norm": 37.488189697265625, "learning_rate": 1.7892571542509896e-07, "logits/chosen": -2.3680639266967773, "logits/rejected": -2.33235239982605, "logps/chosen": -208.89395141601562, "logps/rejected": -230.7420196533203, "loss": 0.6191, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5072259902954102, "rewards/margins": 0.2992441654205322, "rewards/rejected": -1.8064699172973633, "step": 5040 }, { "epoch": 0.8700895933838731, "grad_norm": 18.682756423950195, "learning_rate": 1.7880244679227853e-07, "logits/chosen": -2.2868175506591797, "logits/rejected": -2.2622358798980713, "logps/chosen": -205.58200073242188, "logps/rejected": -235.55917358398438, "loss": 0.6223, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.48475182056427, "rewards/margins": 0.3248194754123688, "rewards/rejected": -1.8095712661743164, "step": 5050 }, { "epoch": 0.8718125430737422, "grad_norm": 23.736186981201172, "learning_rate": 1.7867886142062717e-07, "logits/chosen": -2.2453980445861816, "logits/rejected": -2.2342467308044434, "logps/chosen": -182.3429412841797, "logps/rejected": -209.33657836914062, "loss": 0.6408, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3162187337875366, "rewards/margins": 0.2594860792160034, "rewards/rejected": -1.5757049322128296, "step": 5060 }, { "epoch": 0.8735354927636113, "grad_norm": 20.68201446533203, "learning_rate": 1.785549598068844e-07, "logits/chosen": -2.3142597675323486, "logits/rejected": -2.2893218994140625, "logps/chosen": -162.87265014648438, "logps/rejected": -180.55992126464844, "loss": 0.6424, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0877201557159424, "rewards/margins": 0.24460335075855255, "rewards/rejected": -1.3323233127593994, "step": 5070 }, { "epoch": 0.8752584424534804, "grad_norm": 25.388307571411133, "learning_rate": 1.7843074244906075e-07, "logits/chosen": -2.499162197113037, "logits/rejected": -2.461242198944092, "logps/chosen": -154.22413635253906, "logps/rejected": -178.7101287841797, "loss": 0.5933, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9620031118392944, "rewards/margins": 0.3197912871837616, "rewards/rejected": -1.2817943096160889, "step": 5080 }, { "epoch": 0.8769813921433495, "grad_norm": 27.99173355102539, "learning_rate": 1.7830620984643597e-07, "logits/chosen": -2.353234052658081, "logits/rejected": -2.3224358558654785, "logps/chosen": -174.60372924804688, "logps/rejected": -208.08639526367188, "loss": 0.578, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1949115991592407, "rewards/margins": 0.3782222867012024, "rewards/rejected": -1.573133945465088, "step": 5090 }, { "epoch": 0.8787043418332184, "grad_norm": 37.61249542236328, "learning_rate": 1.7818136249955678e-07, "logits/chosen": -2.229835271835327, "logits/rejected": -2.1953840255737305, "logps/chosen": -207.0838165283203, "logps/rejected": -227.10610961914062, "loss": 0.6509, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5098870992660522, "rewards/margins": 0.2407897710800171, "rewards/rejected": -1.7506768703460693, "step": 5100 }, { "epoch": 0.8804272915230875, "grad_norm": 25.037174224853516, "learning_rate": 1.7805620091023505e-07, "logits/chosen": -2.3433327674865723, "logits/rejected": -2.314168930053711, "logps/chosen": -204.13002014160156, "logps/rejected": -220.486572265625, "loss": 0.6719, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4547873735427856, "rewards/margins": 0.21724343299865723, "rewards/rejected": -1.672031044960022, "step": 5110 }, { "epoch": 0.8821502412129566, "grad_norm": 22.87247657775879, "learning_rate": 1.7793072558154573e-07, "logits/chosen": -2.378096103668213, "logits/rejected": -2.348536252975464, "logps/chosen": -164.9044647216797, "logps/rejected": -187.47647094726562, "loss": 0.6194, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1046850681304932, "rewards/margins": 0.27192893624305725, "rewards/rejected": -1.376613974571228, "step": 5120 }, { "epoch": 0.8838731909028257, "grad_norm": 24.949138641357422, "learning_rate": 1.778049370178248e-07, "logits/chosen": -2.429076671600342, "logits/rejected": -2.3960869312286377, "logps/chosen": -163.94699096679688, "logps/rejected": -187.69308471679688, "loss": 0.5998, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0935906171798706, "rewards/margins": 0.29233506321907043, "rewards/rejected": -1.3859257698059082, "step": 5130 }, { "epoch": 0.8855961405926946, "grad_norm": 23.759702682495117, "learning_rate": 1.7767883572466726e-07, "logits/chosen": -2.3825690746307373, "logits/rejected": -2.352921724319458, "logps/chosen": -161.80252075195312, "logps/rejected": -172.0554656982422, "loss": 0.6553, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0280125141143799, "rewards/margins": 0.17422954738140106, "rewards/rejected": -1.2022418975830078, "step": 5140 }, { "epoch": 0.8873190902825637, "grad_norm": 21.064332962036133, "learning_rate": 1.7755242220892507e-07, "logits/chosen": -2.3944315910339355, "logits/rejected": -2.3759605884552, "logps/chosen": -147.7733154296875, "logps/rejected": -173.36581420898438, "loss": 0.6133, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9367380142211914, "rewards/margins": 0.258679062128067, "rewards/rejected": -1.1954171657562256, "step": 5150 }, { "epoch": 0.8890420399724328, "grad_norm": 28.148576736450195, "learning_rate": 1.7742569697870512e-07, "logits/chosen": -2.40571665763855, "logits/rejected": -2.3768887519836426, "logps/chosen": -152.49459838867188, "logps/rejected": -167.98675537109375, "loss": 0.6408, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.999171257019043, "rewards/margins": 0.20267994701862335, "rewards/rejected": -1.2018513679504395, "step": 5160 }, { "epoch": 0.8907649896623019, "grad_norm": 19.74530792236328, "learning_rate": 1.7729866054336734e-07, "logits/chosen": -2.3742120265960693, "logits/rejected": -2.345825672149658, "logps/chosen": -163.65151977539062, "logps/rejected": -194.41513061523438, "loss": 0.5832, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0627447366714478, "rewards/margins": 0.3682798743247986, "rewards/rejected": -1.4310245513916016, "step": 5170 }, { "epoch": 0.892487939352171, "grad_norm": 24.458284378051758, "learning_rate": 1.7717131341352235e-07, "logits/chosen": -2.4376699924468994, "logits/rejected": -2.4229021072387695, "logps/chosen": -189.02191162109375, "logps/rejected": -206.2496795654297, "loss": 0.6558, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3285324573516846, "rewards/margins": 0.1917664110660553, "rewards/rejected": -1.520298957824707, "step": 5180 }, { "epoch": 0.8942108890420399, "grad_norm": 22.02104377746582, "learning_rate": 1.770436561010297e-07, "logits/chosen": -2.3305296897888184, "logits/rejected": -2.3187999725341797, "logps/chosen": -180.78086853027344, "logps/rejected": -199.74057006835938, "loss": 0.6602, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2771596908569336, "rewards/margins": 0.18791182339191437, "rewards/rejected": -1.465071439743042, "step": 5190 }, { "epoch": 0.895933838731909, "grad_norm": 23.871702194213867, "learning_rate": 1.7691568911899556e-07, "logits/chosen": -2.412647008895874, "logits/rejected": -2.384337902069092, "logps/chosen": -174.80752563476562, "logps/rejected": -202.4107208251953, "loss": 0.609, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.219252347946167, "rewards/margins": 0.27993494272232056, "rewards/rejected": -1.4991872310638428, "step": 5200 }, { "epoch": 0.895933838731909, "eval_logits/chosen": -2.4079904556274414, "eval_logits/rejected": -2.3969969749450684, "eval_logps/chosen": -158.2065887451172, "eval_logps/rejected": -179.52780151367188, "eval_loss": 0.6414390206336975, "eval_rewards/accuracies": 0.616403341293335, "eval_rewards/chosen": -0.9919112324714661, "eval_rewards/margins": 0.1758705973625183, "eval_rewards/rejected": -1.1677817106246948, "eval_runtime": 382.9928, "eval_samples_per_second": 11.238, "eval_steps_per_second": 1.405, "step": 5200 }, { "epoch": 0.8976567884217781, "grad_norm": 30.867982864379883, "learning_rate": 1.7678741298177092e-07, "logits/chosen": -2.3551595211029053, "logits/rejected": -2.3347275257110596, "logps/chosen": -173.14175415039062, "logps/rejected": -192.95025634765625, "loss": 0.6329, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2061526775360107, "rewards/margins": 0.23082256317138672, "rewards/rejected": -1.4369752407073975, "step": 5210 }, { "epoch": 0.8993797381116472, "grad_norm": 17.44063377380371, "learning_rate": 1.766588282049494e-07, "logits/chosen": -2.377192258834839, "logits/rejected": -2.361964702606201, "logps/chosen": -160.16224670410156, "logps/rejected": -179.97178649902344, "loss": 0.6529, "rewards/accuracies": 0.625, "rewards/chosen": -1.0694485902786255, "rewards/margins": 0.1831921637058258, "rewards/rejected": -1.252640962600708, "step": 5220 }, { "epoch": 0.9011026878015161, "grad_norm": 21.062334060668945, "learning_rate": 1.7652993530536497e-07, "logits/chosen": -2.3476855754852295, "logits/rejected": -2.332172155380249, "logps/chosen": -156.99844360351562, "logps/rejected": -190.17465209960938, "loss": 0.5777, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9969981908798218, "rewards/margins": 0.3543012738227844, "rewards/rejected": -1.3512994050979614, "step": 5230 }, { "epoch": 0.9028256374913852, "grad_norm": 26.4909610748291, "learning_rate": 1.764007348010903e-07, "logits/chosen": -2.314365863800049, "logits/rejected": -2.2813925743103027, "logps/chosen": -164.72598266601562, "logps/rejected": -209.2113800048828, "loss": 0.5639, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1105834245681763, "rewards/margins": 0.4559425413608551, "rewards/rejected": -1.566525936126709, "step": 5240 }, { "epoch": 0.9045485871812543, "grad_norm": 30.440879821777344, "learning_rate": 1.762712272114343e-07, "logits/chosen": -2.2364721298217773, "logits/rejected": -2.218665599822998, "logps/chosen": -184.82301330566406, "logps/rejected": -216.07345581054688, "loss": 0.5961, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3264563083648682, "rewards/margins": 0.3302633464336395, "rewards/rejected": -1.6567198038101196, "step": 5250 }, { "epoch": 0.9062715368711234, "grad_norm": 35.71326446533203, "learning_rate": 1.7614141305694029e-07, "logits/chosen": -2.2694973945617676, "logits/rejected": -2.2302098274230957, "logps/chosen": -191.68019104003906, "logps/rejected": -218.50619506835938, "loss": 0.6129, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3447864055633545, "rewards/margins": 0.33376210927963257, "rewards/rejected": -1.6785485744476318, "step": 5260 }, { "epoch": 0.9079944865609925, "grad_norm": 26.6447696685791, "learning_rate": 1.7601129285938364e-07, "logits/chosen": -2.3759639263153076, "logits/rejected": -2.351966381072998, "logps/chosen": -187.26132202148438, "logps/rejected": -216.2740478515625, "loss": 0.6218, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.343926191329956, "rewards/margins": 0.3137759268283844, "rewards/rejected": -1.657702088356018, "step": 5270 }, { "epoch": 0.9097174362508614, "grad_norm": 29.428714752197266, "learning_rate": 1.7588086714177003e-07, "logits/chosen": -2.320923328399658, "logits/rejected": -2.2858223915100098, "logps/chosen": -187.89013671875, "logps/rejected": -211.83566284179688, "loss": 0.6178, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3284814357757568, "rewards/margins": 0.2929961681365967, "rewards/rejected": -1.621477484703064, "step": 5280 }, { "epoch": 0.9114403859407305, "grad_norm": 25.15055274963379, "learning_rate": 1.7575013642833295e-07, "logits/chosen": -2.295776844024658, "logits/rejected": -2.2582039833068848, "logps/chosen": -174.5590057373047, "logps/rejected": -198.55734252929688, "loss": 0.6163, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1752634048461914, "rewards/margins": 0.30027860403060913, "rewards/rejected": -1.4755419492721558, "step": 5290 }, { "epoch": 0.9131633356305996, "grad_norm": 22.768001556396484, "learning_rate": 1.7561910124453195e-07, "logits/chosen": -2.313915729522705, "logits/rejected": -2.2940611839294434, "logps/chosen": -169.04296875, "logps/rejected": -199.2408447265625, "loss": 0.5988, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1217864751815796, "rewards/margins": 0.3264429271221161, "rewards/rejected": -1.4482295513153076, "step": 5300 }, { "epoch": 0.9148862853204687, "grad_norm": 25.457565307617188, "learning_rate": 1.7548776211705034e-07, "logits/chosen": -2.3798553943634033, "logits/rejected": -2.3660120964050293, "logps/chosen": -183.68081665039062, "logps/rejected": -200.96652221679688, "loss": 0.6349, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2447845935821533, "rewards/margins": 0.24797436594963074, "rewards/rejected": -1.4927589893341064, "step": 5310 }, { "epoch": 0.9166092350103378, "grad_norm": 27.405920028686523, "learning_rate": 1.7535611957379302e-07, "logits/chosen": -2.317561149597168, "logits/rejected": -2.263969898223877, "logps/chosen": -192.5063018798828, "logps/rejected": -217.94100952148438, "loss": 0.5931, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3361905813217163, "rewards/margins": 0.35336703062057495, "rewards/rejected": -1.689557433128357, "step": 5320 }, { "epoch": 0.9183321847002067, "grad_norm": 22.66073989868164, "learning_rate": 1.7522417414388446e-07, "logits/chosen": -2.248110294342041, "logits/rejected": -2.2363169193267822, "logps/chosen": -207.2706298828125, "logps/rejected": -251.25704956054688, "loss": 0.5868, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5597829818725586, "rewards/margins": 0.4321991503238678, "rewards/rejected": -1.9919822216033936, "step": 5330 }, { "epoch": 0.9200551343900758, "grad_norm": 29.119892120361328, "learning_rate": 1.7509192635766664e-07, "logits/chosen": -2.262770414352417, "logits/rejected": -2.2210538387298584, "logps/chosen": -215.5501708984375, "logps/rejected": -244.3115692138672, "loss": 0.5908, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5742887258529663, "rewards/margins": 0.3592612147331238, "rewards/rejected": -1.9335498809814453, "step": 5340 }, { "epoch": 0.9217780840799449, "grad_norm": 20.251754760742188, "learning_rate": 1.7495937674669675e-07, "logits/chosen": -2.272902011871338, "logits/rejected": -2.245993137359619, "logps/chosen": -193.52423095703125, "logps/rejected": -218.2841339111328, "loss": 0.6288, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3998287916183472, "rewards/margins": 0.2747870087623596, "rewards/rejected": -1.6746160984039307, "step": 5350 }, { "epoch": 0.923501033769814, "grad_norm": 27.8018856048584, "learning_rate": 1.7482652584374514e-07, "logits/chosen": -2.339388847351074, "logits/rejected": -2.3210878372192383, "logps/chosen": -181.2473602294922, "logps/rejected": -221.5663299560547, "loss": 0.5801, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.266035556793213, "rewards/margins": 0.38970786333084106, "rewards/rejected": -1.6557432413101196, "step": 5360 }, { "epoch": 0.9252239834596829, "grad_norm": 25.928937911987305, "learning_rate": 1.7469337418279325e-07, "logits/chosen": -2.283285140991211, "logits/rejected": -2.267566680908203, "logps/chosen": -182.66424560546875, "logps/rejected": -206.1977996826172, "loss": 0.6343, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2891303300857544, "rewards/margins": 0.2453964203596115, "rewards/rejected": -1.5345267057418823, "step": 5370 }, { "epoch": 0.926946933149552, "grad_norm": 24.324359893798828, "learning_rate": 1.7455992229903133e-07, "logits/chosen": -2.3700790405273438, "logits/rejected": -2.339428424835205, "logps/chosen": -183.50149536132812, "logps/rejected": -204.10726928710938, "loss": 0.6165, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2439554929733276, "rewards/margins": 0.2912697494029999, "rewards/rejected": -1.53522527217865, "step": 5380 }, { "epoch": 0.9286698828394211, "grad_norm": 35.17918395996094, "learning_rate": 1.7442617072885627e-07, "logits/chosen": -2.2917733192443848, "logits/rejected": -2.2521700859069824, "logps/chosen": -189.371826171875, "logps/rejected": -213.36972045898438, "loss": 0.6084, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3059356212615967, "rewards/margins": 0.3214091360569, "rewards/rejected": -1.6273447275161743, "step": 5390 }, { "epoch": 0.9303928325292902, "grad_norm": 25.27411651611328, "learning_rate": 1.7429212000986965e-07, "logits/chosen": -2.3175926208496094, "logits/rejected": -2.293071985244751, "logps/chosen": -167.50717163085938, "logps/rejected": -209.6229248046875, "loss": 0.5747, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1563806533813477, "rewards/margins": 0.40642791986465454, "rewards/rejected": -1.562808632850647, "step": 5400 }, { "epoch": 0.9321157822191593, "grad_norm": 40.84873962402344, "learning_rate": 1.7415777068087545e-07, "logits/chosen": -2.3327441215515137, "logits/rejected": -2.3204197883605957, "logps/chosen": -183.882080078125, "logps/rejected": -197.12765502929688, "loss": 0.6669, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.25808584690094, "rewards/margins": 0.17094475030899048, "rewards/rejected": -1.4290306568145752, "step": 5410 }, { "epoch": 0.9338387319090282, "grad_norm": 31.94063949584961, "learning_rate": 1.7402312328187776e-07, "logits/chosen": -2.3866710662841797, "logits/rejected": -2.3662643432617188, "logps/chosen": -176.82510375976562, "logps/rejected": -197.71966552734375, "loss": 0.6343, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2160383462905884, "rewards/margins": 0.23323245346546173, "rewards/rejected": -1.4492708444595337, "step": 5420 }, { "epoch": 0.9355616815988973, "grad_norm": 28.33039665222168, "learning_rate": 1.7388817835407884e-07, "logits/chosen": -2.353658676147461, "logits/rejected": -2.3364737033843994, "logps/chosen": -170.1305694580078, "logps/rejected": -195.0408172607422, "loss": 0.6107, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1594005823135376, "rewards/margins": 0.2835468053817749, "rewards/rejected": -1.4429473876953125, "step": 5430 }, { "epoch": 0.9372846312887664, "grad_norm": 26.746353149414062, "learning_rate": 1.737529364398768e-07, "logits/chosen": -2.3374722003936768, "logits/rejected": -2.3144500255584717, "logps/chosen": -180.90878295898438, "logps/rejected": -211.0740509033203, "loss": 0.6003, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2434018850326538, "rewards/margins": 0.32336291670799255, "rewards/rejected": -1.5667648315429688, "step": 5440 }, { "epoch": 0.9390075809786355, "grad_norm": 43.07821273803711, "learning_rate": 1.7361739808286343e-07, "logits/chosen": -2.2768020629882812, "logits/rejected": -2.2596564292907715, "logps/chosen": -194.68710327148438, "logps/rejected": -224.31982421875, "loss": 0.6064, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3785072565078735, "rewards/margins": 0.33700355887413025, "rewards/rejected": -1.7155107259750366, "step": 5450 }, { "epoch": 0.9407305306685044, "grad_norm": 33.96110916137695, "learning_rate": 1.7348156382782215e-07, "logits/chosen": -2.274019956588745, "logits/rejected": -2.2504825592041016, "logps/chosen": -198.7681884765625, "logps/rejected": -219.91854858398438, "loss": 0.6386, "rewards/accuracies": 0.625, "rewards/chosen": -1.4427436590194702, "rewards/margins": 0.24368536472320557, "rewards/rejected": -1.6864287853240967, "step": 5460 }, { "epoch": 0.9424534803583735, "grad_norm": 24.447280883789062, "learning_rate": 1.733454342207256e-07, "logits/chosen": -2.259782075881958, "logits/rejected": -2.245398998260498, "logps/chosen": -197.2213897705078, "logps/rejected": -221.02444458007812, "loss": 0.6565, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4507030248641968, "rewards/margins": 0.25823861360549927, "rewards/rejected": -1.7089416980743408, "step": 5470 }, { "epoch": 0.9441764300482426, "grad_norm": 32.96196746826172, "learning_rate": 1.732090098087336e-07, "logits/chosen": -2.270646572113037, "logits/rejected": -2.243563652038574, "logps/chosen": -195.88558959960938, "logps/rejected": -227.06527709960938, "loss": 0.5953, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4136393070220947, "rewards/margins": 0.3479146361351013, "rewards/rejected": -1.7615541219711304, "step": 5480 }, { "epoch": 0.9458993797381117, "grad_norm": 19.306108474731445, "learning_rate": 1.7307229114019091e-07, "logits/chosen": -2.2577857971191406, "logits/rejected": -2.2329039573669434, "logps/chosen": -195.81765747070312, "logps/rejected": -209.193359375, "loss": 0.6551, "rewards/accuracies": 0.625, "rewards/chosen": -1.3897420167922974, "rewards/margins": 0.19239407777786255, "rewards/rejected": -1.5821361541748047, "step": 5490 }, { "epoch": 0.9476223294279807, "grad_norm": 26.862762451171875, "learning_rate": 1.7293527876462504e-07, "logits/chosen": -2.3610310554504395, "logits/rejected": -2.340768575668335, "logps/chosen": -176.81341552734375, "logps/rejected": -212.5503387451172, "loss": 0.585, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2214715480804443, "rewards/margins": 0.3627847731113434, "rewards/rejected": -1.5842561721801758, "step": 5500 }, { "epoch": 0.9493452791178497, "grad_norm": 23.03242301940918, "learning_rate": 1.72797973232744e-07, "logits/chosen": -2.288675308227539, "logits/rejected": -2.272275447845459, "logps/chosen": -187.26974487304688, "logps/rejected": -210.4651336669922, "loss": 0.6331, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.336241364479065, "rewards/margins": 0.24365845322608948, "rewards/rejected": -1.5798996686935425, "step": 5510 }, { "epoch": 0.9510682288077188, "grad_norm": 24.83022117614746, "learning_rate": 1.726603750964341e-07, "logits/chosen": -2.2835960388183594, "logits/rejected": -2.2611312866210938, "logps/chosen": -185.1580810546875, "logps/rejected": -211.9181365966797, "loss": 0.5992, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2729332447052002, "rewards/margins": 0.33776092529296875, "rewards/rejected": -1.610694169998169, "step": 5520 }, { "epoch": 0.9527911784975879, "grad_norm": 20.41211700439453, "learning_rate": 1.725224849087578e-07, "logits/chosen": -2.3336851596832275, "logits/rejected": -2.2991302013397217, "logps/chosen": -187.03628540039062, "logps/rejected": -202.1573486328125, "loss": 0.6331, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2588682174682617, "rewards/margins": 0.24945969879627228, "rewards/rejected": -1.508327841758728, "step": 5530 }, { "epoch": 0.954514128187457, "grad_norm": 23.775251388549805, "learning_rate": 1.723843032239514e-07, "logits/chosen": -2.3340952396392822, "logits/rejected": -2.3236658573150635, "logps/chosen": -169.35516357421875, "logps/rejected": -204.19137573242188, "loss": 0.6068, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1559011936187744, "rewards/margins": 0.32853391766548157, "rewards/rejected": -1.4844350814819336, "step": 5540 }, { "epoch": 0.956237077877326, "grad_norm": 23.238468170166016, "learning_rate": 1.722458305974229e-07, "logits/chosen": -2.251826763153076, "logits/rejected": -2.2365410327911377, "logps/chosen": -165.81344604492188, "logps/rejected": -181.73863220214844, "loss": 0.6747, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1286054849624634, "rewards/margins": 0.1507066935300827, "rewards/rejected": -1.279312252998352, "step": 5550 }, { "epoch": 0.957960027567195, "grad_norm": 27.57474708557129, "learning_rate": 1.7210706758574957e-07, "logits/chosen": -2.3349127769470215, "logits/rejected": -2.307807207107544, "logps/chosen": -145.54527282714844, "logps/rejected": -165.34902954101562, "loss": 0.6212, "rewards/accuracies": 0.625, "rewards/chosen": -0.9249517321586609, "rewards/margins": 0.24045920372009277, "rewards/rejected": -1.1654109954833984, "step": 5560 }, { "epoch": 0.9596829772570641, "grad_norm": 18.97833824157715, "learning_rate": 1.71968014746676e-07, "logits/chosen": -2.3977248668670654, "logits/rejected": -2.3879504203796387, "logps/chosen": -142.7986297607422, "logps/rejected": -164.1881866455078, "loss": 0.647, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.908721923828125, "rewards/margins": 0.1725355088710785, "rewards/rejected": -1.0812573432922363, "step": 5570 }, { "epoch": 0.9614059269469332, "grad_norm": 24.691076278686523, "learning_rate": 1.7182867263911163e-07, "logits/chosen": -2.3252456188201904, "logits/rejected": -2.3095462322235107, "logps/chosen": -144.26171875, "logps/rejected": -170.215087890625, "loss": 0.6079, "rewards/accuracies": 0.6875, "rewards/chosen": -0.901009738445282, "rewards/margins": 0.26762694120407104, "rewards/rejected": -1.1686367988586426, "step": 5580 }, { "epoch": 0.9631288766368022, "grad_norm": 22.33702850341797, "learning_rate": 1.7168904182312863e-07, "logits/chosen": -2.4048874378204346, "logits/rejected": -2.365469455718994, "logps/chosen": -154.96484375, "logps/rejected": -179.8811492919922, "loss": 0.6156, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9779122471809387, "rewards/margins": 0.27446627616882324, "rewards/rejected": -1.2523784637451172, "step": 5590 }, { "epoch": 0.9648518263266712, "grad_norm": 23.034870147705078, "learning_rate": 1.715491228599596e-07, "logits/chosen": -2.3783340454101562, "logits/rejected": -2.3681511878967285, "logps/chosen": -158.8350830078125, "logps/rejected": -193.6468505859375, "loss": 0.5977, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.056706190109253, "rewards/margins": 0.31611010432243347, "rewards/rejected": -1.3728163242340088, "step": 5600 }, { "epoch": 0.9648518263266712, "eval_logits/chosen": -2.404153347015381, "eval_logits/rejected": -2.3932902812957764, "eval_logps/chosen": -150.67100524902344, "eval_logps/rejected": -170.78880310058594, "eval_loss": 0.643197238445282, "eval_rewards/accuracies": 0.6273234486579895, "eval_rewards/chosen": -0.9165552258491516, "eval_rewards/margins": 0.1638367623090744, "eval_rewards/rejected": -1.0803918838500977, "eval_runtime": 383.0072, "eval_samples_per_second": 11.237, "eval_steps_per_second": 1.405, "step": 5600 }, { "epoch": 0.9665747760165403, "grad_norm": 24.86272430419922, "learning_rate": 1.7140891631199533e-07, "logits/chosen": -2.353733777999878, "logits/rejected": -2.333936929702759, "logps/chosen": -175.52102661132812, "logps/rejected": -216.1461181640625, "loss": 0.5819, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.226787805557251, "rewards/margins": 0.3806641697883606, "rewards/rejected": -1.6074520349502563, "step": 5610 }, { "epoch": 0.9682977257064094, "grad_norm": 36.44122314453125, "learning_rate": 1.7126842274278245e-07, "logits/chosen": -2.2590763568878174, "logits/rejected": -2.2382168769836426, "logps/chosen": -203.39913940429688, "logps/rejected": -224.2606658935547, "loss": 0.6405, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.479689598083496, "rewards/margins": 0.26059383153915405, "rewards/rejected": -1.7402836084365845, "step": 5620 }, { "epoch": 0.9700206753962785, "grad_norm": 26.534252166748047, "learning_rate": 1.7112764271702135e-07, "logits/chosen": -2.3296680450439453, "logits/rejected": -2.298461675643921, "logps/chosen": -189.0676727294922, "logps/rejected": -214.03378295898438, "loss": 0.6184, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3230987787246704, "rewards/margins": 0.3118281364440918, "rewards/rejected": -1.6349267959594727, "step": 5630 }, { "epoch": 0.9717436250861475, "grad_norm": 26.497467041015625, "learning_rate": 1.7098657680056373e-07, "logits/chosen": -2.3253607749938965, "logits/rejected": -2.3052287101745605, "logps/chosen": -165.99765014648438, "logps/rejected": -201.12632751464844, "loss": 0.6061, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1070231199264526, "rewards/margins": 0.35045483708381653, "rewards/rejected": -1.4574780464172363, "step": 5640 }, { "epoch": 0.9734665747760165, "grad_norm": 16.514123916625977, "learning_rate": 1.7084522556041049e-07, "logits/chosen": -2.2654869556427, "logits/rejected": -2.243253469467163, "logps/chosen": -162.08592224121094, "logps/rejected": -198.29437255859375, "loss": 0.5992, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1030868291854858, "rewards/margins": 0.36720800399780273, "rewards/rejected": -1.4702950716018677, "step": 5650 }, { "epoch": 0.9751895244658856, "grad_norm": 24.211523056030273, "learning_rate": 1.7070358956470923e-07, "logits/chosen": -2.2788543701171875, "logits/rejected": -2.263421058654785, "logps/chosen": -169.59774780273438, "logps/rejected": -196.69061279296875, "loss": 0.6203, "rewards/accuracies": 0.625, "rewards/chosen": -1.1747559309005737, "rewards/margins": 0.2927972674369812, "rewards/rejected": -1.4675531387329102, "step": 5660 }, { "epoch": 0.9769124741557547, "grad_norm": 26.447824478149414, "learning_rate": 1.705616693827522e-07, "logits/chosen": -2.276493787765503, "logits/rejected": -2.2538936138153076, "logps/chosen": -184.60488891601562, "logps/rejected": -212.2623748779297, "loss": 0.6032, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.292311429977417, "rewards/margins": 0.30952146649360657, "rewards/rejected": -1.6018329858779907, "step": 5670 }, { "epoch": 0.9786354238456237, "grad_norm": 48.45136260986328, "learning_rate": 1.7041946558497388e-07, "logits/chosen": -2.2704081535339355, "logits/rejected": -2.2329323291778564, "logps/chosen": -212.3938751220703, "logps/rejected": -245.57284545898438, "loss": 0.5841, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5729804039001465, "rewards/margins": 0.3733724355697632, "rewards/rejected": -1.9463527202606201, "step": 5680 }, { "epoch": 0.9803583735354927, "grad_norm": 27.563390731811523, "learning_rate": 1.7027697874294867e-07, "logits/chosen": -2.284641742706299, "logits/rejected": -2.244694471359253, "logps/chosen": -222.4856414794922, "logps/rejected": -256.3599853515625, "loss": 0.5792, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6246564388275146, "rewards/margins": 0.4309469759464264, "rewards/rejected": -2.055603504180908, "step": 5690 }, { "epoch": 0.9820813232253618, "grad_norm": 80.95060729980469, "learning_rate": 1.7013420942938876e-07, "logits/chosen": -2.1855034828186035, "logits/rejected": -2.1641812324523926, "logps/chosen": -216.79989624023438, "logps/rejected": -245.7619171142578, "loss": 0.6365, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5874277353286743, "rewards/margins": 0.339304655790329, "rewards/rejected": -1.9267324209213257, "step": 5700 }, { "epoch": 0.9838042729152309, "grad_norm": 24.923404693603516, "learning_rate": 1.6999115821814155e-07, "logits/chosen": -2.312826633453369, "logits/rejected": -2.2875611782073975, "logps/chosen": -202.64010620117188, "logps/rejected": -239.53475952148438, "loss": 0.5897, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.464739441871643, "rewards/margins": 0.41259440779685974, "rewards/rejected": -1.8773338794708252, "step": 5710 }, { "epoch": 0.9855272226051, "grad_norm": 25.859790802001953, "learning_rate": 1.6984782568418766e-07, "logits/chosen": -2.2918834686279297, "logits/rejected": -2.259382963180542, "logps/chosen": -179.59243774414062, "logps/rejected": -216.98263549804688, "loss": 0.5698, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2602999210357666, "rewards/margins": 0.42483147978782654, "rewards/rejected": -1.6851314306259155, "step": 5720 }, { "epoch": 0.987250172294969, "grad_norm": 29.39626693725586, "learning_rate": 1.697042124036383e-07, "logits/chosen": -2.349903106689453, "logits/rejected": -2.3303074836730957, "logps/chosen": -182.60787963867188, "logps/rejected": -214.63491821289062, "loss": 0.6162, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2972140312194824, "rewards/margins": 0.33766549825668335, "rewards/rejected": -1.634879469871521, "step": 5730 }, { "epoch": 0.988973121984838, "grad_norm": 26.878019332885742, "learning_rate": 1.6956031895373327e-07, "logits/chosen": -2.27702260017395, "logits/rejected": -2.2376182079315186, "logps/chosen": -189.97862243652344, "logps/rejected": -228.1022186279297, "loss": 0.5716, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.303370475769043, "rewards/margins": 0.44443821907043457, "rewards/rejected": -1.7478086948394775, "step": 5740 }, { "epoch": 0.9906960716747071, "grad_norm": 26.576335906982422, "learning_rate": 1.6941614591283834e-07, "logits/chosen": -2.340158462524414, "logits/rejected": -2.319542407989502, "logps/chosen": -189.45004272460938, "logps/rejected": -203.67274475097656, "loss": 0.65, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3123247623443604, "rewards/margins": 0.22555959224700928, "rewards/rejected": -1.5378843545913696, "step": 5750 }, { "epoch": 0.9924190213645762, "grad_norm": 21.105655670166016, "learning_rate": 1.6927169386044313e-07, "logits/chosen": -2.3314685821533203, "logits/rejected": -2.3016769886016846, "logps/chosen": -168.71255493164062, "logps/rejected": -194.60256958007812, "loss": 0.6264, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1167190074920654, "rewards/margins": 0.2818935811519623, "rewards/rejected": -1.398612380027771, "step": 5760 }, { "epoch": 0.9941419710544452, "grad_norm": 26.614208221435547, "learning_rate": 1.691269633771588e-07, "logits/chosen": -2.2715003490448, "logits/rejected": -2.2406039237976074, "logps/chosen": -163.55667114257812, "logps/rejected": -199.54388427734375, "loss": 0.5905, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1084734201431274, "rewards/margins": 0.35367265343666077, "rewards/rejected": -1.4621461629867554, "step": 5770 }, { "epoch": 0.9958649207443143, "grad_norm": 25.3371524810791, "learning_rate": 1.6898195504471552e-07, "logits/chosen": -2.252403497695923, "logits/rejected": -2.225072145462036, "logps/chosen": -195.75697326660156, "logps/rejected": -231.71957397460938, "loss": 0.6095, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.423097014427185, "rewards/margins": 0.3620423674583435, "rewards/rejected": -1.7851394414901733, "step": 5780 }, { "epoch": 0.9975878704341833, "grad_norm": 25.647926330566406, "learning_rate": 1.688366694459603e-07, "logits/chosen": -2.201721668243408, "logits/rejected": -2.1597495079040527, "logps/chosen": -188.7086181640625, "logps/rejected": -233.0748291015625, "loss": 0.5789, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3738667964935303, "rewards/margins": 0.41834861040115356, "rewards/rejected": -1.792215347290039, "step": 5790 }, { "epoch": 0.9993108201240524, "grad_norm": 22.43425941467285, "learning_rate": 1.6869110716485456e-07, "logits/chosen": -2.3050150871276855, "logits/rejected": -2.269188165664673, "logps/chosen": -186.7384033203125, "logps/rejected": -226.17269897460938, "loss": 0.5916, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.349612832069397, "rewards/margins": 0.4007296562194824, "rewards/rejected": -1.750342607498169, "step": 5800 }, { "epoch": 1.0010337698139213, "grad_norm": 24.5869197845459, "learning_rate": 1.6854526878647186e-07, "logits/chosen": -2.322516918182373, "logits/rejected": -2.297133207321167, "logps/chosen": -185.1601104736328, "logps/rejected": -216.9307098388672, "loss": 0.612, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2970949411392212, "rewards/margins": 0.3414715826511383, "rewards/rejected": -1.6385663747787476, "step": 5810 }, { "epoch": 1.0027567195037905, "grad_norm": 27.19829750061035, "learning_rate": 1.6839915489699545e-07, "logits/chosen": -2.316255807876587, "logits/rejected": -2.269670009613037, "logps/chosen": -188.31253051757812, "logps/rejected": -236.2301483154297, "loss": 0.5283, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.343603491783142, "rewards/margins": 0.5325624346733093, "rewards/rejected": -1.8761659860610962, "step": 5820 }, { "epoch": 1.0044796691936595, "grad_norm": 46.75299835205078, "learning_rate": 1.682527660837161e-07, "logits/chosen": -2.220951557159424, "logits/rejected": -2.1911635398864746, "logps/chosen": -211.2012176513672, "logps/rejected": -245.8602294921875, "loss": 0.6076, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5650999546051025, "rewards/margins": 0.377856969833374, "rewards/rejected": -1.9429566860198975, "step": 5830 }, { "epoch": 1.0062026188835287, "grad_norm": 21.179813385009766, "learning_rate": 1.6810610293502944e-07, "logits/chosen": -2.259906053543091, "logits/rejected": -2.233856439590454, "logps/chosen": -202.88821411132812, "logps/rejected": -246.70016479492188, "loss": 0.5882, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4954583644866943, "rewards/margins": 0.4338740408420563, "rewards/rejected": -1.9293327331542969, "step": 5840 }, { "epoch": 1.0079255685733977, "grad_norm": 23.58659553527832, "learning_rate": 1.679591660404339e-07, "logits/chosen": -2.303954601287842, "logits/rejected": -2.271392345428467, "logps/chosen": -183.2528076171875, "logps/rejected": -234.5919647216797, "loss": 0.5451, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.316575050354004, "rewards/margins": 0.4936788082122803, "rewards/rejected": -1.8102538585662842, "step": 5850 }, { "epoch": 1.0096485182632666, "grad_norm": 21.694034576416016, "learning_rate": 1.6781195599052807e-07, "logits/chosen": -2.206890106201172, "logits/rejected": -2.181102991104126, "logps/chosen": -190.34397888183594, "logps/rejected": -238.74783325195312, "loss": 0.5629, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.372124433517456, "rewards/margins": 0.49931803345680237, "rewards/rejected": -1.8714425563812256, "step": 5860 }, { "epoch": 1.0113714679531358, "grad_norm": 26.660905838012695, "learning_rate": 1.6766447337700865e-07, "logits/chosen": -2.247291088104248, "logits/rejected": -2.2175240516662598, "logps/chosen": -198.14712524414062, "logps/rejected": -241.3429412841797, "loss": 0.5602, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.415946125984192, "rewards/margins": 0.4595022201538086, "rewards/rejected": -1.875448226928711, "step": 5870 }, { "epoch": 1.0130944176430048, "grad_norm": 25.15273094177246, "learning_rate": 1.6751671879266769e-07, "logits/chosen": -2.2789111137390137, "logits/rejected": -2.254117965698242, "logps/chosen": -191.4722137451172, "logps/rejected": -225.45297241210938, "loss": 0.5961, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3582924604415894, "rewards/margins": 0.37257465720176697, "rewards/rejected": -1.7308671474456787, "step": 5880 }, { "epoch": 1.014817367332874, "grad_norm": 32.9681510925293, "learning_rate": 1.673686928313905e-07, "logits/chosen": -2.307744264602661, "logits/rejected": -2.2846286296844482, "logps/chosen": -187.4040985107422, "logps/rejected": -220.6470184326172, "loss": 0.6146, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3102037906646729, "rewards/margins": 0.32465046644210815, "rewards/rejected": -1.6348543167114258, "step": 5890 }, { "epoch": 1.016540317022743, "grad_norm": 26.85865020751953, "learning_rate": 1.6722039608815315e-07, "logits/chosen": -2.2252914905548096, "logits/rejected": -2.1969614028930664, "logps/chosen": -187.75491333007812, "logps/rejected": -230.99462890625, "loss": 0.5611, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3111248016357422, "rewards/margins": 0.4657079577445984, "rewards/rejected": -1.7768325805664062, "step": 5900 }, { "epoch": 1.018263266712612, "grad_norm": 24.568252563476562, "learning_rate": 1.670718291590201e-07, "logits/chosen": -2.2330894470214844, "logits/rejected": -2.2195515632629395, "logps/chosen": -185.08482360839844, "logps/rejected": -224.62344360351562, "loss": 0.5979, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3206746578216553, "rewards/margins": 0.35854750871658325, "rewards/rejected": -1.6792221069335938, "step": 5910 }, { "epoch": 1.019986216402481, "grad_norm": 35.23897171020508, "learning_rate": 1.6692299264114178e-07, "logits/chosen": -2.1981778144836426, "logits/rejected": -2.1714072227478027, "logps/chosen": -200.17672729492188, "logps/rejected": -229.83621215820312, "loss": 0.6394, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4837713241577148, "rewards/margins": 0.28825488686561584, "rewards/rejected": -1.7720263004302979, "step": 5920 }, { "epoch": 1.02170916609235, "grad_norm": 22.59590721130371, "learning_rate": 1.6677388713275224e-07, "logits/chosen": -2.262528657913208, "logits/rejected": -2.239509105682373, "logps/chosen": -191.60922241210938, "logps/rejected": -227.3905029296875, "loss": 0.6215, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4099931716918945, "rewards/margins": 0.3598439693450928, "rewards/rejected": -1.7698370218276978, "step": 5930 }, { "epoch": 1.0234321157822193, "grad_norm": 20.984752655029297, "learning_rate": 1.6662451323316663e-07, "logits/chosen": -2.261719226837158, "logits/rejected": -2.2246999740600586, "logps/chosen": -164.25592041015625, "logps/rejected": -204.17506408691406, "loss": 0.5654, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0919129848480225, "rewards/margins": 0.4257412850856781, "rewards/rejected": -1.5176541805267334, "step": 5940 }, { "epoch": 1.0251550654720882, "grad_norm": 27.647113800048828, "learning_rate": 1.6647487154277897e-07, "logits/chosen": -2.2034451961517334, "logits/rejected": -2.184757709503174, "logps/chosen": -173.5892791748047, "logps/rejected": -207.5119171142578, "loss": 0.5837, "rewards/accuracies": 0.75, "rewards/chosen": -1.1839631795883179, "rewards/margins": 0.3621217608451843, "rewards/rejected": -1.546085000038147, "step": 5950 }, { "epoch": 1.0268780151619572, "grad_norm": 24.661508560180664, "learning_rate": 1.6632496266305958e-07, "logits/chosen": -2.2062265872955322, "logits/rejected": -2.164212703704834, "logps/chosen": -191.03012084960938, "logps/rejected": -208.45529174804688, "loss": 0.6431, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3232829570770264, "rewards/margins": 0.2540973126888275, "rewards/rejected": -1.5773800611495972, "step": 5960 }, { "epoch": 1.0286009648518264, "grad_norm": 24.430320739746094, "learning_rate": 1.661747871965527e-07, "logits/chosen": -2.2344512939453125, "logits/rejected": -2.209048271179199, "logps/chosen": -177.21939086914062, "logps/rejected": -210.49755859375, "loss": 0.5936, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2416160106658936, "rewards/margins": 0.3695695698261261, "rewards/rejected": -1.6111854314804077, "step": 5970 }, { "epoch": 1.0303239145416954, "grad_norm": 25.474464416503906, "learning_rate": 1.6602434574687417e-07, "logits/chosen": -2.223212718963623, "logits/rejected": -2.207329511642456, "logps/chosen": -168.71603393554688, "logps/rejected": -211.7668914794922, "loss": 0.575, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1495542526245117, "rewards/margins": 0.4093645215034485, "rewards/rejected": -1.5589187145233154, "step": 5980 }, { "epoch": 1.0320468642315643, "grad_norm": 24.24454689025879, "learning_rate": 1.658736389187089e-07, "logits/chosen": -2.284614086151123, "logits/rejected": -2.246558666229248, "logps/chosen": -173.7165069580078, "logps/rejected": -209.68753051757812, "loss": 0.5746, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1815001964569092, "rewards/margins": 0.40414220094680786, "rewards/rejected": -1.5856424570083618, "step": 5990 }, { "epoch": 1.0337698139214335, "grad_norm": 35.86470031738281, "learning_rate": 1.6572266731780842e-07, "logits/chosen": -2.2233195304870605, "logits/rejected": -2.196089744567871, "logps/chosen": -201.31045532226562, "logps/rejected": -242.53005981445312, "loss": 0.5845, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.48953378200531, "rewards/margins": 0.42532163858413696, "rewards/rejected": -1.9148553609848022, "step": 6000 }, { "epoch": 1.0337698139214335, "eval_logits/chosen": -2.2815613746643066, "eval_logits/rejected": -2.2640297412872314, "eval_logps/chosen": -195.87583923339844, "eval_logps/rejected": -223.07241821289062, "eval_loss": 0.6437787413597107, "eval_rewards/accuracies": 0.624535322189331, "eval_rewards/chosen": -1.3686038255691528, "eval_rewards/margins": 0.23462428152561188, "eval_rewards/rejected": -1.6032280921936035, "eval_runtime": 383.8708, "eval_samples_per_second": 11.212, "eval_steps_per_second": 1.402, "step": 6000 }, { "epoch": 1.0354927636113025, "grad_norm": 32.003517150878906, "learning_rate": 1.655714315509885e-07, "logits/chosen": -2.2215774059295654, "logits/rejected": -2.2042527198791504, "logps/chosen": -207.7661895751953, "logps/rejected": -231.06192016601562, "loss": 0.661, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5392072200775146, "rewards/margins": 0.268566370010376, "rewards/rejected": -1.8077733516693115, "step": 6010 }, { "epoch": 1.0372157133011717, "grad_norm": 22.12173843383789, "learning_rate": 1.654199322261267e-07, "logits/chosen": -2.349522352218628, "logits/rejected": -2.317288875579834, "logps/chosen": -193.19227600097656, "logps/rejected": -214.86181640625, "loss": 0.645, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3733975887298584, "rewards/margins": 0.286784827709198, "rewards/rejected": -1.6601823568344116, "step": 6020 }, { "epoch": 1.0389386629910407, "grad_norm": 22.20186996459961, "learning_rate": 1.6526816995215995e-07, "logits/chosen": -2.1700615882873535, "logits/rejected": -2.143908739089966, "logps/chosen": -164.04891967773438, "logps/rejected": -192.51254272460938, "loss": 0.6245, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1379988193511963, "rewards/margins": 0.28763145208358765, "rewards/rejected": -1.4256302118301392, "step": 6030 }, { "epoch": 1.0406616126809096, "grad_norm": 30.76277732849121, "learning_rate": 1.651161453390821e-07, "logits/chosen": -2.309840202331543, "logits/rejected": -2.2857794761657715, "logps/chosen": -154.37887573242188, "logps/rejected": -181.8328094482422, "loss": 0.5997, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0026304721832275, "rewards/margins": 0.30957844853401184, "rewards/rejected": -1.3122087717056274, "step": 6040 }, { "epoch": 1.0423845623707788, "grad_norm": 24.196666717529297, "learning_rate": 1.6496385899794135e-07, "logits/chosen": -2.2179017066955566, "logits/rejected": -2.1860344409942627, "logps/chosen": -193.98912048339844, "logps/rejected": -227.5110626220703, "loss": 0.5726, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.335597038269043, "rewards/margins": 0.41937771439552307, "rewards/rejected": -1.7549747228622437, "step": 6050 }, { "epoch": 1.0441075120606478, "grad_norm": 28.210494995117188, "learning_rate": 1.64811311540838e-07, "logits/chosen": -2.2067856788635254, "logits/rejected": -2.1841800212860107, "logps/chosen": -208.82028198242188, "logps/rejected": -239.56918334960938, "loss": 0.6251, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.547461748123169, "rewards/margins": 0.3352636694908142, "rewards/rejected": -1.882725477218628, "step": 6060 }, { "epoch": 1.045830461750517, "grad_norm": 42.14088439941406, "learning_rate": 1.6465850358092184e-07, "logits/chosen": -2.2099831104278564, "logits/rejected": -2.1768100261688232, "logps/chosen": -210.5271759033203, "logps/rejected": -253.0932159423828, "loss": 0.5568, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5555570125579834, "rewards/margins": 0.4656600058078766, "rewards/rejected": -2.021216869354248, "step": 6070 }, { "epoch": 1.047553411440386, "grad_norm": 21.90705680847168, "learning_rate": 1.645054357323897e-07, "logits/chosen": -2.2115724086761475, "logits/rejected": -2.1864047050476074, "logps/chosen": -218.80557250976562, "logps/rejected": -249.6724090576172, "loss": 0.603, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5844274759292603, "rewards/margins": 0.390811026096344, "rewards/rejected": -1.9752384424209595, "step": 6080 }, { "epoch": 1.049276361130255, "grad_norm": 23.00667381286621, "learning_rate": 1.6435210861048302e-07, "logits/chosen": -2.263542413711548, "logits/rejected": -2.2285208702087402, "logps/chosen": -197.87942504882812, "logps/rejected": -247.3519287109375, "loss": 0.5377, "rewards/accuracies": 0.75, "rewards/chosen": -1.4560437202453613, "rewards/margins": 0.5109397768974304, "rewards/rejected": -1.9669831991195679, "step": 6090 }, { "epoch": 1.050999310820124, "grad_norm": 27.286256790161133, "learning_rate": 1.6419852283148535e-07, "logits/chosen": -2.259631872177124, "logits/rejected": -2.231081485748291, "logps/chosen": -197.79969787597656, "logps/rejected": -248.64035034179688, "loss": 0.5464, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3961166143417358, "rewards/margins": 0.5630569458007812, "rewards/rejected": -1.9591734409332275, "step": 6100 }, { "epoch": 1.052722260509993, "grad_norm": 30.823551177978516, "learning_rate": 1.6404467901271998e-07, "logits/chosen": -2.211205244064331, "logits/rejected": -2.180361032485962, "logps/chosen": -212.4109344482422, "logps/rejected": -263.77099609375, "loss": 0.5624, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5987329483032227, "rewards/margins": 0.5072905421257019, "rewards/rejected": -2.1060233116149902, "step": 6110 }, { "epoch": 1.0544452101998623, "grad_norm": 40.18345642089844, "learning_rate": 1.6389057777254722e-07, "logits/chosen": -2.2784905433654785, "logits/rejected": -2.2230658531188965, "logps/chosen": -229.3824462890625, "logps/rejected": -293.1062316894531, "loss": 0.5076, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7395166158676147, "rewards/margins": 0.669109582901001, "rewards/rejected": -2.408626079559326, "step": 6120 }, { "epoch": 1.0561681598897312, "grad_norm": 25.442590713500977, "learning_rate": 1.6373621973036224e-07, "logits/chosen": -2.1934523582458496, "logits/rejected": -2.1601486206054688, "logps/chosen": -239.6211700439453, "logps/rejected": -286.9929504394531, "loss": 0.5719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8426984548568726, "rewards/margins": 0.4940398335456848, "rewards/rejected": -2.336738109588623, "step": 6130 }, { "epoch": 1.0578911095796002, "grad_norm": 49.917945861816406, "learning_rate": 1.6358160550659213e-07, "logits/chosen": -2.2199857234954834, "logits/rejected": -2.187546968460083, "logps/chosen": -225.39395141601562, "logps/rejected": -270.90447998046875, "loss": 0.5977, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7169145345687866, "rewards/margins": 0.4762873649597168, "rewards/rejected": -2.193201780319214, "step": 6140 }, { "epoch": 1.0596140592694694, "grad_norm": 22.244651794433594, "learning_rate": 1.6342673572269398e-07, "logits/chosen": -2.197420597076416, "logits/rejected": -2.1729331016540527, "logps/chosen": -213.5404052734375, "logps/rejected": -249.8678741455078, "loss": 0.6162, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6002448797225952, "rewards/margins": 0.4091455340385437, "rewards/rejected": -2.009390354156494, "step": 6150 }, { "epoch": 1.0613370089593384, "grad_norm": 28.97780418395996, "learning_rate": 1.632716110011519e-07, "logits/chosen": -2.186614513397217, "logits/rejected": -2.165234088897705, "logps/chosen": -201.4184112548828, "logps/rejected": -235.50375366210938, "loss": 0.6203, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4898805618286133, "rewards/margins": 0.3466905355453491, "rewards/rejected": -1.8365710973739624, "step": 6160 }, { "epoch": 1.0630599586492075, "grad_norm": 32.25000762939453, "learning_rate": 1.6311623196547474e-07, "logits/chosen": -2.2768733501434326, "logits/rejected": -2.2490344047546387, "logps/chosen": -219.40673828125, "logps/rejected": -263.5992126464844, "loss": 0.5747, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5952725410461426, "rewards/margins": 0.47955289483070374, "rewards/rejected": -2.0748257637023926, "step": 6170 }, { "epoch": 1.0647829083390765, "grad_norm": 25.306297302246094, "learning_rate": 1.6296059924019353e-07, "logits/chosen": -2.275926351547241, "logits/rejected": -2.2481603622436523, "logps/chosen": -205.8504638671875, "logps/rejected": -236.63137817382812, "loss": 0.5982, "rewards/accuracies": 0.71875, "rewards/chosen": -1.46571946144104, "rewards/margins": 0.3783527910709381, "rewards/rejected": -1.8440719842910767, "step": 6180 }, { "epoch": 1.0665058580289455, "grad_norm": 29.48764419555664, "learning_rate": 1.6280471345085901e-07, "logits/chosen": -2.2747912406921387, "logits/rejected": -2.246176242828369, "logps/chosen": -203.13949584960938, "logps/rejected": -234.68856811523438, "loss": 0.6018, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4325004816055298, "rewards/margins": 0.39268478751182556, "rewards/rejected": -1.8251851797103882, "step": 6190 }, { "epoch": 1.0682288077188147, "grad_norm": 40.502254486083984, "learning_rate": 1.6264857522403906e-07, "logits/chosen": -2.1964752674102783, "logits/rejected": -2.1596341133117676, "logps/chosen": -186.2880859375, "logps/rejected": -229.4107208251953, "loss": 0.5771, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3087577819824219, "rewards/margins": 0.44233304262161255, "rewards/rejected": -1.7510907649993896, "step": 6200 }, { "epoch": 1.0699517574086836, "grad_norm": 32.917842864990234, "learning_rate": 1.6249218518731623e-07, "logits/chosen": -2.2598016262054443, "logits/rejected": -2.2300467491149902, "logps/chosen": -195.76866149902344, "logps/rejected": -231.24526977539062, "loss": 0.5751, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.394277811050415, "rewards/margins": 0.3902966380119324, "rewards/rejected": -1.784574270248413, "step": 6210 }, { "epoch": 1.0716747070985528, "grad_norm": 23.806379318237305, "learning_rate": 1.6233554396928515e-07, "logits/chosen": -2.2467989921569824, "logits/rejected": -2.22261381149292, "logps/chosen": -194.40060424804688, "logps/rejected": -229.7987518310547, "loss": 0.61, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4125274419784546, "rewards/margins": 0.3693087697029114, "rewards/rejected": -1.7818362712860107, "step": 6220 }, { "epoch": 1.0733976567884218, "grad_norm": 26.85658073425293, "learning_rate": 1.6217865219955008e-07, "logits/chosen": -2.338665008544922, "logits/rejected": -2.3019931316375732, "logps/chosen": -192.6621551513672, "logps/rejected": -246.43557739257812, "loss": 0.5288, "rewards/accuracies": 0.75, "rewards/chosen": -1.3202307224273682, "rewards/margins": 0.5889842510223389, "rewards/rejected": -1.909214973449707, "step": 6230 }, { "epoch": 1.0751206064782908, "grad_norm": 29.26054573059082, "learning_rate": 1.6202151050872242e-07, "logits/chosen": -2.1933677196502686, "logits/rejected": -2.163956880569458, "logps/chosen": -201.94302368164062, "logps/rejected": -236.05807495117188, "loss": 0.6181, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.449880599975586, "rewards/margins": 0.38049185276031494, "rewards/rejected": -1.8303724527359009, "step": 6240 }, { "epoch": 1.07684355616816, "grad_norm": 32.749603271484375, "learning_rate": 1.618641195284179e-07, "logits/chosen": -2.305246353149414, "logits/rejected": -2.2783963680267334, "logps/chosen": -173.74647521972656, "logps/rejected": -196.07456970214844, "loss": 0.6333, "rewards/accuracies": 0.65625, "rewards/chosen": -1.168368935585022, "rewards/margins": 0.26043254137039185, "rewards/rejected": -1.428801417350769, "step": 6250 }, { "epoch": 1.078566505858029, "grad_norm": 16.783361434936523, "learning_rate": 1.6170647989125455e-07, "logits/chosen": -2.2322545051574707, "logits/rejected": -2.208922863006592, "logps/chosen": -155.6553955078125, "logps/rejected": -178.28318786621094, "loss": 0.6247, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0038013458251953, "rewards/margins": 0.2728382647037506, "rewards/rejected": -1.2766395807266235, "step": 6260 }, { "epoch": 1.080289455547898, "grad_norm": 15.635846138000488, "learning_rate": 1.6154859223084953e-07, "logits/chosen": -2.4459521770477295, "logits/rejected": -2.439850330352783, "logps/chosen": -144.04306030273438, "logps/rejected": -171.6459197998047, "loss": 0.6206, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.9036776423454285, "rewards/margins": 0.2726871371269226, "rewards/rejected": -1.1763646602630615, "step": 6270 }, { "epoch": 1.082012405237767, "grad_norm": 20.35181999206543, "learning_rate": 1.613904571818171e-07, "logits/chosen": -2.2114737033843994, "logits/rejected": -2.184677839279175, "logps/chosen": -146.02078247070312, "logps/rejected": -174.31607055664062, "loss": 0.5935, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9112964868545532, "rewards/margins": 0.3144376873970032, "rewards/rejected": -1.225733995437622, "step": 6280 }, { "epoch": 1.083735354927636, "grad_norm": 19.11197853088379, "learning_rate": 1.6123207537976588e-07, "logits/chosen": -2.2452197074890137, "logits/rejected": -2.2174201011657715, "logps/chosen": -162.3245086669922, "logps/rejected": -197.2689971923828, "loss": 0.5868, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1011226177215576, "rewards/margins": 0.35962381958961487, "rewards/rejected": -1.4607465267181396, "step": 6290 }, { "epoch": 1.0854583046175053, "grad_norm": 23.40118980407715, "learning_rate": 1.6107344746129622e-07, "logits/chosen": -2.261756658554077, "logits/rejected": -2.2390553951263428, "logps/chosen": -180.0264434814453, "logps/rejected": -208.5572052001953, "loss": 0.6152, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2631686925888062, "rewards/margins": 0.3133259117603302, "rewards/rejected": -1.5764944553375244, "step": 6300 }, { "epoch": 1.0871812543073742, "grad_norm": 38.912540435791016, "learning_rate": 1.609145740639977e-07, "logits/chosen": -2.254859447479248, "logits/rejected": -2.2240333557128906, "logps/chosen": -165.6407470703125, "logps/rejected": -196.86428833007812, "loss": 0.6129, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1381008625030518, "rewards/margins": 0.3193897306919098, "rewards/rejected": -1.4574905633926392, "step": 6310 }, { "epoch": 1.0889042039972432, "grad_norm": 22.782106399536133, "learning_rate": 1.6075545582644663e-07, "logits/chosen": -2.222576856613159, "logits/rejected": -2.1984236240386963, "logps/chosen": -162.3043670654297, "logps/rejected": -195.146484375, "loss": 0.5999, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.068186640739441, "rewards/margins": 0.3570778965950012, "rewards/rejected": -1.425264596939087, "step": 6320 }, { "epoch": 1.0906271536871124, "grad_norm": 20.50153350830078, "learning_rate": 1.6059609338820342e-07, "logits/chosen": -2.2440178394317627, "logits/rejected": -2.2133121490478516, "logps/chosen": -166.8011016845703, "logps/rejected": -211.500732421875, "loss": 0.5555, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1194723844528198, "rewards/margins": 0.47705450654029846, "rewards/rejected": -1.5965269804000854, "step": 6330 }, { "epoch": 1.0923501033769814, "grad_norm": 28.833908081054688, "learning_rate": 1.6043648738981e-07, "logits/chosen": -2.269195556640625, "logits/rejected": -2.2403371334075928, "logps/chosen": -185.642822265625, "logps/rejected": -217.7083740234375, "loss": 0.5996, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2849600315093994, "rewards/margins": 0.36721131205558777, "rewards/rejected": -1.6521714925765991, "step": 6340 }, { "epoch": 1.0940730530668505, "grad_norm": 36.53247833251953, "learning_rate": 1.6027663847278725e-07, "logits/chosen": -2.1933300495147705, "logits/rejected": -2.176132917404175, "logps/chosen": -200.3306884765625, "logps/rejected": -239.5429229736328, "loss": 0.5821, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4361860752105713, "rewards/margins": 0.4046160578727722, "rewards/rejected": -1.8408019542694092, "step": 6350 }, { "epoch": 1.0957960027567195, "grad_norm": 23.467296600341797, "learning_rate": 1.6011654727963252e-07, "logits/chosen": -2.1536242961883545, "logits/rejected": -2.1328327655792236, "logps/chosen": -211.01321411132812, "logps/rejected": -261.79376220703125, "loss": 0.5684, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5633403062820435, "rewards/margins": 0.5159338712692261, "rewards/rejected": -2.0792741775512695, "step": 6360 }, { "epoch": 1.0975189524465885, "grad_norm": 28.605619430541992, "learning_rate": 1.599562144538169e-07, "logits/chosen": -2.1722331047058105, "logits/rejected": -2.1576180458068848, "logps/chosen": -221.28628540039062, "logps/rejected": -250.8852996826172, "loss": 0.6574, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.660936951637268, "rewards/margins": 0.3105929493904114, "rewards/rejected": -1.9715297222137451, "step": 6370 }, { "epoch": 1.0992419021364577, "grad_norm": 31.780277252197266, "learning_rate": 1.597956406397827e-07, "logits/chosen": -2.2239651679992676, "logits/rejected": -2.1921591758728027, "logps/chosen": -212.7681121826172, "logps/rejected": -263.39239501953125, "loss": 0.5588, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5899945497512817, "rewards/margins": 0.4881893992424011, "rewards/rejected": -2.078183889389038, "step": 6380 }, { "epoch": 1.1009648518263266, "grad_norm": 30.17275047302246, "learning_rate": 1.5963482648294085e-07, "logits/chosen": -2.259859085083008, "logits/rejected": -2.2155401706695557, "logps/chosen": -197.69004821777344, "logps/rejected": -241.78903198242188, "loss": 0.5508, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4024711847305298, "rewards/margins": 0.5164901614189148, "rewards/rejected": -1.9189612865447998, "step": 6390 }, { "epoch": 1.1026878015161956, "grad_norm": 30.444303512573242, "learning_rate": 1.5947377262966842e-07, "logits/chosen": -2.223111629486084, "logits/rejected": -2.1871323585510254, "logps/chosen": -206.0340118408203, "logps/rejected": -249.349365234375, "loss": 0.5789, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4722647666931152, "rewards/margins": 0.4635472893714905, "rewards/rejected": -1.9358123540878296, "step": 6400 }, { "epoch": 1.1026878015161956, "eval_logits/chosen": -2.2595252990722656, "eval_logits/rejected": -2.242840528488159, "eval_logps/chosen": -197.83056640625, "eval_logps/rejected": -224.87249755859375, "eval_loss": 0.6455348134040833, "eval_rewards/accuracies": 0.616403341293335, "eval_rewards/chosen": -1.388150691986084, "eval_rewards/margins": 0.23307843506336212, "eval_rewards/rejected": -1.6212290525436401, "eval_runtime": 384.0489, "eval_samples_per_second": 11.207, "eval_steps_per_second": 1.401, "step": 6400 }, { "epoch": 1.1044107512060648, "grad_norm": 39.10361862182617, "learning_rate": 1.5931247972730572e-07, "logits/chosen": -2.2497591972351074, "logits/rejected": -2.220106601715088, "logps/chosen": -230.1884765625, "logps/rejected": -270.73980712890625, "loss": 0.6077, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7419331073760986, "rewards/margins": 0.4585306644439697, "rewards/rejected": -2.2004637718200684, "step": 6410 }, { "epoch": 1.1061337008959338, "grad_norm": 24.094772338867188, "learning_rate": 1.591509484241541e-07, "logits/chosen": -2.2276716232299805, "logits/rejected": -2.200310230255127, "logps/chosen": -219.2551727294922, "logps/rejected": -252.8025665283203, "loss": 0.6332, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6354185342788696, "rewards/margins": 0.36073872447013855, "rewards/rejected": -1.9961572885513306, "step": 6420 }, { "epoch": 1.107856650585803, "grad_norm": 26.919458389282227, "learning_rate": 1.5898917936947297e-07, "logits/chosen": -2.1865882873535156, "logits/rejected": -2.1732869148254395, "logps/chosen": -189.06442260742188, "logps/rejected": -213.3035125732422, "loss": 0.6421, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3809840679168701, "rewards/margins": 0.2711717486381531, "rewards/rejected": -1.652155876159668, "step": 6430 }, { "epoch": 1.109579600275672, "grad_norm": 21.60903549194336, "learning_rate": 1.5882717321347752e-07, "logits/chosen": -2.241295099258423, "logits/rejected": -2.215834379196167, "logps/chosen": -186.6502685546875, "logps/rejected": -218.45486450195312, "loss": 0.6103, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3116780519485474, "rewards/margins": 0.34163540601730347, "rewards/rejected": -1.653313398361206, "step": 6440 }, { "epoch": 1.111302549965541, "grad_norm": 21.67346954345703, "learning_rate": 1.5866493060733576e-07, "logits/chosen": -2.2527382373809814, "logits/rejected": -2.21574330329895, "logps/chosen": -177.25755310058594, "logps/rejected": -212.2408905029297, "loss": 0.5797, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1989918947219849, "rewards/margins": 0.4047037959098816, "rewards/rejected": -1.6036956310272217, "step": 6450 }, { "epoch": 1.11302549965541, "grad_norm": 26.42724609375, "learning_rate": 1.585024522031663e-07, "logits/chosen": -2.1832480430603027, "logits/rejected": -2.1704015731811523, "logps/chosen": -181.95785522460938, "logps/rejected": -235.64987182617188, "loss": 0.5557, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2918856143951416, "rewards/margins": 0.498269259929657, "rewards/rejected": -1.7901546955108643, "step": 6460 }, { "epoch": 1.114748449345279, "grad_norm": 27.41206932067871, "learning_rate": 1.5833973865403533e-07, "logits/chosen": -2.124350070953369, "logits/rejected": -2.0984079837799072, "logps/chosen": -197.56777954101562, "logps/rejected": -235.68283081054688, "loss": 0.5946, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.433988332748413, "rewards/margins": 0.4133404791355133, "rewards/rejected": -1.8473289012908936, "step": 6470 }, { "epoch": 1.1164713990351482, "grad_norm": 23.416603088378906, "learning_rate": 1.5817679061395426e-07, "logits/chosen": -2.2085471153259277, "logits/rejected": -2.1690986156463623, "logps/chosen": -193.58267211914062, "logps/rejected": -229.81869506835938, "loss": 0.5803, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3631269931793213, "rewards/margins": 0.43150943517684937, "rewards/rejected": -1.7946363687515259, "step": 6480 }, { "epoch": 1.1181943487250172, "grad_norm": 27.429529190063477, "learning_rate": 1.5801360873787704e-07, "logits/chosen": -2.3445587158203125, "logits/rejected": -2.3242859840393066, "logps/chosen": -196.75357055664062, "logps/rejected": -231.71450805664062, "loss": 0.6102, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4204045534133911, "rewards/margins": 0.36140257120132446, "rewards/rejected": -1.78180730342865, "step": 6490 }, { "epoch": 1.1199172984148862, "grad_norm": 25.771257400512695, "learning_rate": 1.5785019368169748e-07, "logits/chosen": -2.2539565563201904, "logits/rejected": -2.233546733856201, "logps/chosen": -189.86428833007812, "logps/rejected": -215.73965454101562, "loss": 0.6098, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3247458934783936, "rewards/margins": 0.3122237026691437, "rewards/rejected": -1.6369695663452148, "step": 6500 }, { "epoch": 1.1216402481047554, "grad_norm": 27.679521560668945, "learning_rate": 1.5768654610224664e-07, "logits/chosen": -2.2240753173828125, "logits/rejected": -2.1769585609436035, "logps/chosen": -189.4766845703125, "logps/rejected": -226.5435028076172, "loss": 0.5855, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3213176727294922, "rewards/margins": 0.42212343215942383, "rewards/rejected": -1.7434409856796265, "step": 6510 }, { "epoch": 1.1233631977946243, "grad_norm": 35.53666305541992, "learning_rate": 1.575226666572901e-07, "logits/chosen": -2.2380692958831787, "logits/rejected": -2.206503391265869, "logps/chosen": -176.08033752441406, "logps/rejected": -212.88357543945312, "loss": 0.5733, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2142693996429443, "rewards/margins": 0.40591007471084595, "rewards/rejected": -1.620179533958435, "step": 6520 }, { "epoch": 1.1250861474844935, "grad_norm": 16.802722930908203, "learning_rate": 1.573585560055256e-07, "logits/chosen": -2.190563201904297, "logits/rejected": -2.149428367614746, "logps/chosen": -183.85194396972656, "logps/rejected": -234.85830688476562, "loss": 0.5242, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2935733795166016, "rewards/margins": 0.5506505966186523, "rewards/rejected": -1.8442237377166748, "step": 6530 }, { "epoch": 1.1268090971743625, "grad_norm": 31.97213363647461, "learning_rate": 1.5719421480657996e-07, "logits/chosen": -2.1805355548858643, "logits/rejected": -2.1448729038238525, "logps/chosen": -213.38833618164062, "logps/rejected": -248.4428253173828, "loss": 0.6279, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.553464412689209, "rewards/margins": 0.37057262659072876, "rewards/rejected": -1.924036979675293, "step": 6540 }, { "epoch": 1.1285320468642315, "grad_norm": 36.40608215332031, "learning_rate": 1.570296437210068e-07, "logits/chosen": -2.133267641067505, "logits/rejected": -2.1039319038391113, "logps/chosen": -206.63760375976562, "logps/rejected": -245.5586395263672, "loss": 0.6107, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5502650737762451, "rewards/margins": 0.39422768354415894, "rewards/rejected": -1.9444925785064697, "step": 6550 }, { "epoch": 1.1302549965541007, "grad_norm": 33.15107727050781, "learning_rate": 1.5686484341028374e-07, "logits/chosen": -2.217454195022583, "logits/rejected": -2.178605079650879, "logps/chosen": -198.23617553710938, "logps/rejected": -239.9998321533203, "loss": 0.5656, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.414640188217163, "rewards/margins": 0.4772668778896332, "rewards/rejected": -1.8919073343276978, "step": 6560 }, { "epoch": 1.1319779462439696, "grad_norm": 29.158788681030273, "learning_rate": 1.566998145368097e-07, "logits/chosen": -2.2073724269866943, "logits/rejected": -2.1710896492004395, "logps/chosen": -200.75865173339844, "logps/rejected": -242.34756469726562, "loss": 0.5686, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4528988599777222, "rewards/margins": 0.4519721567630768, "rewards/rejected": -1.9048709869384766, "step": 6570 }, { "epoch": 1.1337008959338388, "grad_norm": 26.331056594848633, "learning_rate": 1.5653455776390235e-07, "logits/chosen": -2.22875714302063, "logits/rejected": -2.187915802001953, "logps/chosen": -197.57884216308594, "logps/rejected": -227.84872436523438, "loss": 0.6095, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4123129844665527, "rewards/margins": 0.37162190675735474, "rewards/rejected": -1.7839349508285522, "step": 6580 }, { "epoch": 1.1354238456237078, "grad_norm": 31.246917724609375, "learning_rate": 1.563690737557953e-07, "logits/chosen": -2.20268177986145, "logits/rejected": -2.16776180267334, "logps/chosen": -184.05455017089844, "logps/rejected": -226.29769897460938, "loss": 0.5749, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3052704334259033, "rewards/margins": 0.42799264192581177, "rewards/rejected": -1.7332630157470703, "step": 6590 }, { "epoch": 1.1371467953135768, "grad_norm": 33.766841888427734, "learning_rate": 1.562033631776356e-07, "logits/chosen": -2.2406680583953857, "logits/rejected": -2.210420608520508, "logps/chosen": -203.17628479003906, "logps/rejected": -243.853515625, "loss": 0.5912, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4638261795043945, "rewards/margins": 0.44317826628685, "rewards/rejected": -1.9070043563842773, "step": 6600 }, { "epoch": 1.138869745003446, "grad_norm": 26.146657943725586, "learning_rate": 1.560374266954809e-07, "logits/chosen": -2.1933908462524414, "logits/rejected": -2.166749954223633, "logps/chosen": -219.3288116455078, "logps/rejected": -266.52911376953125, "loss": 0.5595, "rewards/accuracies": 0.71875, "rewards/chosen": -1.662623405456543, "rewards/margins": 0.49311646819114685, "rewards/rejected": -2.1557400226593018, "step": 6610 }, { "epoch": 1.140592694693315, "grad_norm": 30.15679359436035, "learning_rate": 1.5587126497629686e-07, "logits/chosen": -2.1265695095062256, "logits/rejected": -2.0984911918640137, "logps/chosen": -235.67855834960938, "logps/rejected": -269.5155944824219, "loss": 0.6436, "rewards/accuracies": 0.625, "rewards/chosen": -1.8247787952423096, "rewards/margins": 0.3574926257133484, "rewards/rejected": -2.1822714805603027, "step": 6620 }, { "epoch": 1.1423156443831841, "grad_norm": 29.55496597290039, "learning_rate": 1.557048786879545e-07, "logits/chosen": -2.1744980812072754, "logits/rejected": -2.1456077098846436, "logps/chosen": -190.76315307617188, "logps/rejected": -218.13583374023438, "loss": 0.6088, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3612579107284546, "rewards/margins": 0.3343326449394226, "rewards/rejected": -1.6955903768539429, "step": 6630 }, { "epoch": 1.144038594073053, "grad_norm": 29.953752517700195, "learning_rate": 1.5553826849922747e-07, "logits/chosen": -2.2497270107269287, "logits/rejected": -2.2161362171173096, "logps/chosen": -180.73580932617188, "logps/rejected": -204.81265258789062, "loss": 0.6203, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.261747121810913, "rewards/margins": 0.30038005113601685, "rewards/rejected": -1.5621273517608643, "step": 6640 }, { "epoch": 1.145761543762922, "grad_norm": 26.446489334106445, "learning_rate": 1.553714350797893e-07, "logits/chosen": -2.290693759918213, "logits/rejected": -2.2610902786254883, "logps/chosen": -167.90994262695312, "logps/rejected": -211.5741424560547, "loss": 0.5604, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1483453512191772, "rewards/margins": 0.43969884514808655, "rewards/rejected": -1.5880441665649414, "step": 6650 }, { "epoch": 1.1474844934527912, "grad_norm": 29.325246810913086, "learning_rate": 1.5520437910021084e-07, "logits/chosen": -2.2955827713012695, "logits/rejected": -2.276799440383911, "logps/chosen": -176.8847198486328, "logps/rejected": -213.06546020507812, "loss": 0.5975, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2398980855941772, "rewards/margins": 0.360373854637146, "rewards/rejected": -1.6002719402313232, "step": 6660 }, { "epoch": 1.1492074431426602, "grad_norm": 24.36836814880371, "learning_rate": 1.550371012319575e-07, "logits/chosen": -2.17775821685791, "logits/rejected": -2.1535000801086426, "logps/chosen": -192.9346160888672, "logps/rejected": -252.8832244873047, "loss": 0.5325, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4218976497650146, "rewards/margins": 0.5638034343719482, "rewards/rejected": -1.9857012033462524, "step": 6670 }, { "epoch": 1.1509303928325294, "grad_norm": 31.592592239379883, "learning_rate": 1.5486960214738648e-07, "logits/chosen": -2.1380467414855957, "logits/rejected": -2.1065773963928223, "logps/chosen": -217.91091918945312, "logps/rejected": -257.14508056640625, "loss": 0.6023, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6398321390151978, "rewards/margins": 0.4262255132198334, "rewards/rejected": -2.0660576820373535, "step": 6680 }, { "epoch": 1.1526533425223984, "grad_norm": 22.85630226135254, "learning_rate": 1.547018825197443e-07, "logits/chosen": -2.188157081604004, "logits/rejected": -2.1534061431884766, "logps/chosen": -216.86978149414062, "logps/rejected": -265.63018798828125, "loss": 0.5528, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5875884294509888, "rewards/margins": 0.5234032869338989, "rewards/rejected": -2.1109914779663086, "step": 6690 }, { "epoch": 1.1543762922122673, "grad_norm": 44.523284912109375, "learning_rate": 1.5453394302316366e-07, "logits/chosen": -2.1419711112976074, "logits/rejected": -2.1201653480529785, "logps/chosen": -236.85684204101562, "logps/rejected": -283.3230895996094, "loss": 0.5803, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7883342504501343, "rewards/margins": 0.4932478368282318, "rewards/rejected": -2.2815825939178467, "step": 6700 }, { "epoch": 1.1560992419021365, "grad_norm": 26.147733688354492, "learning_rate": 1.5436578433266126e-07, "logits/chosen": -2.1709694862365723, "logits/rejected": -2.128147602081299, "logps/chosen": -219.7293243408203, "logps/rejected": -272.3164978027344, "loss": 0.5688, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.617134690284729, "rewards/margins": 0.5824311375617981, "rewards/rejected": -2.1995654106140137, "step": 6710 }, { "epoch": 1.1578221915920055, "grad_norm": 32.670108795166016, "learning_rate": 1.5419740712413472e-07, "logits/chosen": -2.177053451538086, "logits/rejected": -2.1449925899505615, "logps/chosen": -198.5723876953125, "logps/rejected": -241.6822967529297, "loss": 0.5817, "rewards/accuracies": 0.65625, "rewards/chosen": -1.459101915359497, "rewards/margins": 0.44461125135421753, "rewards/rejected": -1.9037132263183594, "step": 6720 }, { "epoch": 1.1595451412818747, "grad_norm": 30.65315818786621, "learning_rate": 1.5402881207436e-07, "logits/chosen": -2.1471118927001953, "logits/rejected": -2.1220784187316895, "logps/chosen": -207.4105682373047, "logps/rejected": -236.3886260986328, "loss": 0.6307, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4984217882156372, "rewards/margins": 0.32371002435684204, "rewards/rejected": -1.8221315145492554, "step": 6730 }, { "epoch": 1.1612680909717437, "grad_norm": 37.31703567504883, "learning_rate": 1.5385999986098858e-07, "logits/chosen": -2.18902850151062, "logits/rejected": -2.1660685539245605, "logps/chosen": -197.50247192382812, "logps/rejected": -240.78140258789062, "loss": 0.5585, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.421494960784912, "rewards/margins": 0.43881043791770935, "rewards/rejected": -1.8603054285049438, "step": 6740 }, { "epoch": 1.1629910406616126, "grad_norm": 39.52729797363281, "learning_rate": 1.5369097116254493e-07, "logits/chosen": -2.223015785217285, "logits/rejected": -2.1959691047668457, "logps/chosen": -207.1807861328125, "logps/rejected": -252.77059936523438, "loss": 0.5831, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5373988151550293, "rewards/margins": 0.4599340856075287, "rewards/rejected": -1.9973331689834595, "step": 6750 }, { "epoch": 1.1647139903514818, "grad_norm": 37.49037170410156, "learning_rate": 1.5352172665842351e-07, "logits/chosen": -2.1724252700805664, "logits/rejected": -2.137951374053955, "logps/chosen": -202.9478302001953, "logps/rejected": -243.4915771484375, "loss": 0.5801, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4955527782440186, "rewards/margins": 0.4567495882511139, "rewards/rejected": -1.9523022174835205, "step": 6760 }, { "epoch": 1.1664369400413508, "grad_norm": 33.874122619628906, "learning_rate": 1.5335226702888636e-07, "logits/chosen": -2.20847749710083, "logits/rejected": -2.1878161430358887, "logps/chosen": -202.27294921875, "logps/rejected": -246.615478515625, "loss": 0.581, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4698498249053955, "rewards/margins": 0.4451657831668854, "rewards/rejected": -1.9150155782699585, "step": 6770 }, { "epoch": 1.1681598897312198, "grad_norm": 26.217775344848633, "learning_rate": 1.5318259295506004e-07, "logits/chosen": -2.2057294845581055, "logits/rejected": -2.170468330383301, "logps/chosen": -199.53536987304688, "logps/rejected": -232.54922485351562, "loss": 0.6079, "rewards/accuracies": 0.625, "rewards/chosen": -1.4228346347808838, "rewards/margins": 0.37124061584472656, "rewards/rejected": -1.7940753698349, "step": 6780 }, { "epoch": 1.169882839421089, "grad_norm": 24.48525619506836, "learning_rate": 1.5301270511893315e-07, "logits/chosen": -2.247795820236206, "logits/rejected": -2.213374614715576, "logps/chosen": -176.34242248535156, "logps/rejected": -228.22158813476562, "loss": 0.5359, "rewards/accuracies": 0.75, "rewards/chosen": -1.24251389503479, "rewards/margins": 0.5219536423683167, "rewards/rejected": -1.764467477798462, "step": 6790 }, { "epoch": 1.171605789110958, "grad_norm": 20.723039627075195, "learning_rate": 1.5284260420335345e-07, "logits/chosen": -2.151221513748169, "logits/rejected": -2.111957311630249, "logps/chosen": -192.44325256347656, "logps/rejected": -236.78494262695312, "loss": 0.5681, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3916516304016113, "rewards/margins": 0.4804829955101013, "rewards/rejected": -1.872134804725647, "step": 6800 }, { "epoch": 1.171605789110958, "eval_logits/chosen": -2.2592737674713135, "eval_logits/rejected": -2.2435038089752197, "eval_logps/chosen": -192.49172973632812, "eval_logps/rejected": -217.7540283203125, "eval_loss": 0.6433852314949036, "eval_rewards/accuracies": 0.6129181981086731, "eval_rewards/chosen": -1.3347625732421875, "eval_rewards/margins": 0.21528159081935883, "eval_rewards/rejected": -1.550044298171997, "eval_runtime": 382.3053, "eval_samples_per_second": 11.258, "eval_steps_per_second": 1.407, "step": 6800 }, { "epoch": 1.173328738800827, "grad_norm": 28.285385131835938, "learning_rate": 1.5267229089202514e-07, "logits/chosen": -2.159170627593994, "logits/rejected": -2.125225067138672, "logps/chosen": -220.8817138671875, "logps/rejected": -257.7799987792969, "loss": 0.5872, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6409006118774414, "rewards/margins": 0.4281826913356781, "rewards/rejected": -2.0690832138061523, "step": 6810 }, { "epoch": 1.175051688490696, "grad_norm": 24.356565475463867, "learning_rate": 1.5250176586950615e-07, "logits/chosen": -2.2474682331085205, "logits/rejected": -2.2131118774414062, "logps/chosen": -224.6580047607422, "logps/rejected": -268.55230712890625, "loss": 0.5788, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6821105480194092, "rewards/margins": 0.485291063785553, "rewards/rejected": -2.1674017906188965, "step": 6820 }, { "epoch": 1.176774638180565, "grad_norm": 26.990726470947266, "learning_rate": 1.523310298212054e-07, "logits/chosen": -2.242975950241089, "logits/rejected": -2.226395606994629, "logps/chosen": -213.24368286132812, "logps/rejected": -257.6997985839844, "loss": 0.5952, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5863196849822998, "rewards/margins": 0.4677578806877136, "rewards/rejected": -2.054077625274658, "step": 6830 }, { "epoch": 1.1784975878704342, "grad_norm": 30.25794792175293, "learning_rate": 1.5216008343337987e-07, "logits/chosen": -2.2314229011535645, "logits/rejected": -2.197105884552002, "logps/chosen": -213.30313110351562, "logps/rejected": -253.2093963623047, "loss": 0.6025, "rewards/accuracies": 0.75, "rewards/chosen": -1.5536158084869385, "rewards/margins": 0.43098410964012146, "rewards/rejected": -1.9845998287200928, "step": 6840 }, { "epoch": 1.1802205375603032, "grad_norm": 31.903627395629883, "learning_rate": 1.5198892739313216e-07, "logits/chosen": -2.160515546798706, "logits/rejected": -2.1238436698913574, "logps/chosen": -196.7727508544922, "logps/rejected": -231.3065948486328, "loss": 0.6101, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4143223762512207, "rewards/margins": 0.37205883860588074, "rewards/rejected": -1.7863811254501343, "step": 6850 }, { "epoch": 1.1819434872501722, "grad_norm": 23.95057487487793, "learning_rate": 1.518175623884074e-07, "logits/chosen": -2.245159864425659, "logits/rejected": -2.203448534011841, "logps/chosen": -199.4086456298828, "logps/rejected": -227.75155639648438, "loss": 0.5858, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3946173191070557, "rewards/margins": 0.3646080493927002, "rewards/rejected": -1.7592252492904663, "step": 6860 }, { "epoch": 1.1836664369400414, "grad_norm": 28.446277618408203, "learning_rate": 1.516459891079907e-07, "logits/chosen": -2.1487252712249756, "logits/rejected": -2.1328117847442627, "logps/chosen": -192.9253692626953, "logps/rejected": -231.7952117919922, "loss": 0.586, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4090341329574585, "rewards/margins": 0.39750105142593384, "rewards/rejected": -1.8065353631973267, "step": 6870 }, { "epoch": 1.1853893866299103, "grad_norm": 26.053268432617188, "learning_rate": 1.5147420824150435e-07, "logits/chosen": -2.2122786045074463, "logits/rejected": -2.1730782985687256, "logps/chosen": -196.47982788085938, "logps/rejected": -238.5743408203125, "loss": 0.5652, "rewards/accuracies": 0.6875, "rewards/chosen": -1.40488862991333, "rewards/margins": 0.4748099446296692, "rewards/rejected": -1.879698395729065, "step": 6880 }, { "epoch": 1.1871123363197795, "grad_norm": 24.240577697753906, "learning_rate": 1.5130222047940492e-07, "logits/chosen": -2.132542848587036, "logits/rejected": -2.0986483097076416, "logps/chosen": -203.53436279296875, "logps/rejected": -254.85830688476562, "loss": 0.5518, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5031158924102783, "rewards/margins": 0.525459885597229, "rewards/rejected": -2.0285756587982178, "step": 6890 }, { "epoch": 1.1888352860096485, "grad_norm": 28.958574295043945, "learning_rate": 1.5113002651298062e-07, "logits/chosen": -2.17870831489563, "logits/rejected": -2.1460628509521484, "logps/chosen": -215.60385131835938, "logps/rejected": -255.82870483398438, "loss": 0.5897, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6127774715423584, "rewards/margins": 0.41913384199142456, "rewards/rejected": -2.0319113731384277, "step": 6900 }, { "epoch": 1.1905582356995175, "grad_norm": 27.42066764831543, "learning_rate": 1.509576270343485e-07, "logits/chosen": -2.1975607872009277, "logits/rejected": -2.165322780609131, "logps/chosen": -222.44503784179688, "logps/rejected": -268.72802734375, "loss": 0.5683, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.646388292312622, "rewards/margins": 0.4927579462528229, "rewards/rejected": -2.139146327972412, "step": 6910 }, { "epoch": 1.1922811853893867, "grad_norm": 31.671611785888672, "learning_rate": 1.5078502273645164e-07, "logits/chosen": -2.240344762802124, "logits/rejected": -2.2021901607513428, "logps/chosen": -225.1938018798828, "logps/rejected": -262.28289794921875, "loss": 0.6135, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.685520887374878, "rewards/margins": 0.4017064571380615, "rewards/rejected": -2.0872275829315186, "step": 6920 }, { "epoch": 1.1940041350792556, "grad_norm": 27.858375549316406, "learning_rate": 1.5061221431305632e-07, "logits/chosen": -2.1529998779296875, "logits/rejected": -2.108311414718628, "logps/chosen": -202.53628540039062, "logps/rejected": -253.01431274414062, "loss": 0.5529, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4367836713790894, "rewards/margins": 0.5604701638221741, "rewards/rejected": -1.9972540140151978, "step": 6930 }, { "epoch": 1.1957270847691248, "grad_norm": 33.72744369506836, "learning_rate": 1.5043920245874937e-07, "logits/chosen": -2.111140727996826, "logits/rejected": -2.0591862201690674, "logps/chosen": -204.8585205078125, "logps/rejected": -251.541748046875, "loss": 0.5423, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4479444026947021, "rewards/margins": 0.5700526833534241, "rewards/rejected": -2.0179970264434814, "step": 6940 }, { "epoch": 1.1974500344589938, "grad_norm": 29.296106338500977, "learning_rate": 1.5026598786893522e-07, "logits/chosen": -2.1314244270324707, "logits/rejected": -2.098526954650879, "logps/chosen": -229.43649291992188, "logps/rejected": -279.2596130371094, "loss": 0.5683, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7518694400787354, "rewards/margins": 0.513970136642456, "rewards/rejected": -2.2658395767211914, "step": 6950 }, { "epoch": 1.1991729841488628, "grad_norm": 26.940580368041992, "learning_rate": 1.5009257123983322e-07, "logits/chosen": -2.258476734161377, "logits/rejected": -2.2213873863220215, "logps/chosen": -235.5916748046875, "logps/rejected": -263.60174560546875, "loss": 0.6182, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.795373558998108, "rewards/margins": 0.3696233332157135, "rewards/rejected": -2.16499662399292, "step": 6960 }, { "epoch": 1.200895933838732, "grad_norm": 26.772729873657227, "learning_rate": 1.499189532684747e-07, "logits/chosen": -2.210001230239868, "logits/rejected": -2.1754508018493652, "logps/chosen": -209.4125518798828, "logps/rejected": -254.1207733154297, "loss": 0.5482, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5401835441589355, "rewards/margins": 0.5008086562156677, "rewards/rejected": -2.040992259979248, "step": 6970 }, { "epoch": 1.202618883528601, "grad_norm": 28.58230209350586, "learning_rate": 1.4974513465270049e-07, "logits/chosen": -2.1468191146850586, "logits/rejected": -2.111788272857666, "logps/chosen": -212.5028839111328, "logps/rejected": -256.9225769042969, "loss": 0.5604, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5714538097381592, "rewards/margins": 0.47803419828414917, "rewards/rejected": -2.049488067626953, "step": 6980 }, { "epoch": 1.20434183321847, "grad_norm": 25.351545333862305, "learning_rate": 1.4957111609115761e-07, "logits/chosen": -2.1056313514709473, "logits/rejected": -2.079068422317505, "logps/chosen": -216.9329071044922, "logps/rejected": -250.76547241210938, "loss": 0.6179, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5887540578842163, "rewards/margins": 0.3808908462524414, "rewards/rejected": -1.9696447849273682, "step": 6990 }, { "epoch": 1.206064782908339, "grad_norm": 23.871530532836914, "learning_rate": 1.4939689828329694e-07, "logits/chosen": -2.3162684440612793, "logits/rejected": -2.2757205963134766, "logps/chosen": -211.6276397705078, "logps/rejected": -262.7303466796875, "loss": 0.5402, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5442078113555908, "rewards/margins": 0.5424421429634094, "rewards/rejected": -2.0866498947143555, "step": 7000 }, { "epoch": 1.207787732598208, "grad_norm": 19.846067428588867, "learning_rate": 1.492224819293701e-07, "logits/chosen": -2.2165017127990723, "logits/rejected": -2.1862950325012207, "logps/chosen": -192.9996337890625, "logps/rejected": -227.0289764404297, "loss": 0.5975, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3586089611053467, "rewards/margins": 0.39048129320144653, "rewards/rejected": -1.7490901947021484, "step": 7010 }, { "epoch": 1.2095106822880772, "grad_norm": 30.123388290405273, "learning_rate": 1.490478677304268e-07, "logits/chosen": -2.1815481185913086, "logits/rejected": -2.1474032402038574, "logps/chosen": -179.94114685058594, "logps/rejected": -219.311279296875, "loss": 0.5872, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.279196858406067, "rewards/margins": 0.3986092209815979, "rewards/rejected": -1.6778061389923096, "step": 7020 }, { "epoch": 1.2112336319779462, "grad_norm": 49.9505729675293, "learning_rate": 1.4887305638831207e-07, "logits/chosen": -2.226543426513672, "logits/rejected": -2.192350149154663, "logps/chosen": -198.51046752929688, "logps/rejected": -240.5613555908203, "loss": 0.588, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4425830841064453, "rewards/margins": 0.4182146191596985, "rewards/rejected": -1.860797643661499, "step": 7030 }, { "epoch": 1.2129565816678154, "grad_norm": 74.18596649169922, "learning_rate": 1.486980486056631e-07, "logits/chosen": -2.1702017784118652, "logits/rejected": -2.1381258964538574, "logps/chosen": -212.8441619873047, "logps/rejected": -259.13226318359375, "loss": 0.5804, "rewards/accuracies": 0.75, "rewards/chosen": -1.5529807806015015, "rewards/margins": 0.4791305959224701, "rewards/rejected": -2.032111406326294, "step": 7040 }, { "epoch": 1.2146795313576844, "grad_norm": 27.31275177001953, "learning_rate": 1.4852284508590686e-07, "logits/chosen": -2.1526901721954346, "logits/rejected": -2.128535032272339, "logps/chosen": -198.78216552734375, "logps/rejected": -236.4357147216797, "loss": 0.62, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4617741107940674, "rewards/margins": 0.37819933891296387, "rewards/rejected": -1.8399736881256104, "step": 7050 }, { "epoch": 1.2164024810475533, "grad_norm": 28.094131469726562, "learning_rate": 1.483474465332569e-07, "logits/chosen": -2.2378499507904053, "logits/rejected": -2.2252941131591797, "logps/chosen": -185.1388397216797, "logps/rejected": -211.9171600341797, "loss": 0.6271, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3142409324645996, "rewards/margins": 0.2868518829345703, "rewards/rejected": -1.6010926961898804, "step": 7060 }, { "epoch": 1.2181254307374225, "grad_norm": 29.921306610107422, "learning_rate": 1.4817185365271092e-07, "logits/chosen": -2.2271568775177, "logits/rejected": -2.1991305351257324, "logps/chosen": -167.1840057373047, "logps/rejected": -190.1419677734375, "loss": 0.6348, "rewards/accuracies": 0.625, "rewards/chosen": -1.118652582168579, "rewards/margins": 0.25607386231422424, "rewards/rejected": -1.3747262954711914, "step": 7070 }, { "epoch": 1.2198483804272915, "grad_norm": 31.323341369628906, "learning_rate": 1.4799606715004744e-07, "logits/chosen": -2.3093223571777344, "logits/rejected": -2.2754738330841064, "logps/chosen": -154.3928985595703, "logps/rejected": -181.71890258789062, "loss": 0.6099, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.999815821647644, "rewards/margins": 0.28015637397766113, "rewards/rejected": -1.2799723148345947, "step": 7080 }, { "epoch": 1.2215713301171607, "grad_norm": 38.321563720703125, "learning_rate": 1.4782008773182342e-07, "logits/chosen": -2.3063206672668457, "logits/rejected": -2.2809898853302, "logps/chosen": -166.0218048095703, "logps/rejected": -211.0521697998047, "loss": 0.546, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1230088472366333, "rewards/margins": 0.48068147897720337, "rewards/rejected": -1.6036901473999023, "step": 7090 }, { "epoch": 1.2232942798070296, "grad_norm": 29.58236312866211, "learning_rate": 1.476439161053711e-07, "logits/chosen": -2.159698009490967, "logits/rejected": -2.1280694007873535, "logps/chosen": -210.9506378173828, "logps/rejected": -251.65866088867188, "loss": 0.6058, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5356671810150146, "rewards/margins": 0.42311763763427734, "rewards/rejected": -1.958784818649292, "step": 7100 }, { "epoch": 1.2250172294968986, "grad_norm": 21.98423957824707, "learning_rate": 1.4746755297879535e-07, "logits/chosen": -2.1789379119873047, "logits/rejected": -2.1484053134918213, "logps/chosen": -204.1139678955078, "logps/rejected": -236.242919921875, "loss": 0.6129, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4982563257217407, "rewards/margins": 0.36157017946243286, "rewards/rejected": -1.8598264455795288, "step": 7110 }, { "epoch": 1.2267401791867678, "grad_norm": 38.57432174682617, "learning_rate": 1.4729099906097074e-07, "logits/chosen": -2.189185857772827, "logits/rejected": -2.14705753326416, "logps/chosen": -204.56578063964844, "logps/rejected": -229.01296997070312, "loss": 0.61, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4135963916778564, "rewards/margins": 0.35852089524269104, "rewards/rejected": -1.772117257118225, "step": 7120 }, { "epoch": 1.2284631288766368, "grad_norm": 40.41454315185547, "learning_rate": 1.4711425506153872e-07, "logits/chosen": -2.1416923999786377, "logits/rejected": -2.108492612838745, "logps/chosen": -184.8870391845703, "logps/rejected": -223.3743438720703, "loss": 0.5817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2780604362487793, "rewards/margins": 0.42994004487991333, "rewards/rejected": -1.7080005407333374, "step": 7130 }, { "epoch": 1.230186078566506, "grad_norm": 29.750425338745117, "learning_rate": 1.4693732169090472e-07, "logits/chosen": -2.2249319553375244, "logits/rejected": -2.2040207386016846, "logps/chosen": -176.0301513671875, "logps/rejected": -214.18447875976562, "loss": 0.5941, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2371530532836914, "rewards/margins": 0.3913215100765228, "rewards/rejected": -1.6284745931625366, "step": 7140 }, { "epoch": 1.231909028256375, "grad_norm": 37.294952392578125, "learning_rate": 1.4676019966023537e-07, "logits/chosen": -2.2079851627349854, "logits/rejected": -2.1838433742523193, "logps/chosen": -224.13235473632812, "logps/rejected": -258.48931884765625, "loss": 0.6243, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6731014251708984, "rewards/margins": 0.3684059679508209, "rewards/rejected": -2.0415074825286865, "step": 7150 }, { "epoch": 1.233631977946244, "grad_norm": 42.03847885131836, "learning_rate": 1.4658288968145556e-07, "logits/chosen": -2.19736909866333, "logits/rejected": -2.154069423675537, "logps/chosen": -187.75115966796875, "logps/rejected": -230.8140869140625, "loss": 0.5753, "rewards/accuracies": 0.6875, "rewards/chosen": -1.335819959640503, "rewards/margins": 0.44506731629371643, "rewards/rejected": -1.7808876037597656, "step": 7160 }, { "epoch": 1.235354927636113, "grad_norm": 45.9422721862793, "learning_rate": 1.4640539246724565e-07, "logits/chosen": -2.196071147918701, "logits/rejected": -2.1541483402252197, "logps/chosen": -178.31436157226562, "logps/rejected": -224.1558380126953, "loss": 0.5716, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2226836681365967, "rewards/margins": 0.4886323809623718, "rewards/rejected": -1.7113158702850342, "step": 7170 }, { "epoch": 1.237077877325982, "grad_norm": 28.878841400146484, "learning_rate": 1.4622770873103857e-07, "logits/chosen": -2.2925209999084473, "logits/rejected": -2.2665061950683594, "logps/chosen": -177.89590454101562, "logps/rejected": -213.99520874023438, "loss": 0.5741, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2149291038513184, "rewards/margins": 0.3878980576992035, "rewards/rejected": -1.6028270721435547, "step": 7180 }, { "epoch": 1.2388008270158513, "grad_norm": 29.684158325195312, "learning_rate": 1.4604983918701692e-07, "logits/chosen": -2.1275532245635986, "logits/rejected": -2.08992338180542, "logps/chosen": -190.8628692626953, "logps/rejected": -236.9882049560547, "loss": 0.5687, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3781225681304932, "rewards/margins": 0.46524912118911743, "rewards/rejected": -1.8433716297149658, "step": 7190 }, { "epoch": 1.2405237767057202, "grad_norm": 30.833763122558594, "learning_rate": 1.4587178455011021e-07, "logits/chosen": -2.1466710567474365, "logits/rejected": -2.109937906265259, "logps/chosen": -215.5335693359375, "logps/rejected": -269.4213562011719, "loss": 0.5602, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.616548776626587, "rewards/margins": 0.5279964208602905, "rewards/rejected": -2.144545316696167, "step": 7200 }, { "epoch": 1.2405237767057202, "eval_logits/chosen": -2.2377779483795166, "eval_logits/rejected": -2.2210443019866943, "eval_logps/chosen": -195.74278259277344, "eval_logps/rejected": -222.339111328125, "eval_loss": 0.6448310017585754, "eval_rewards/accuracies": 0.6233736276626587, "eval_rewards/chosen": -1.367273211479187, "eval_rewards/margins": 0.22862191498279572, "eval_rewards/rejected": -1.5958951711654663, "eval_runtime": 383.222, "eval_samples_per_second": 11.231, "eval_steps_per_second": 1.404, "step": 7200 }, { "epoch": 1.2422467263955892, "grad_norm": 39.25101852416992, "learning_rate": 1.4569354553599186e-07, "logits/chosen": -2.211578845977783, "logits/rejected": -2.1857690811157227, "logps/chosen": -227.9265899658203, "logps/rejected": -249.36239624023438, "loss": 0.6531, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6993329524993896, "rewards/margins": 0.2680916488170624, "rewards/rejected": -1.967424750328064, "step": 7210 }, { "epoch": 1.2439696760854584, "grad_norm": 29.855588912963867, "learning_rate": 1.4551512286107642e-07, "logits/chosen": -2.142920970916748, "logits/rejected": -2.0990686416625977, "logps/chosen": -193.0585479736328, "logps/rejected": -237.34970092773438, "loss": 0.5624, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3795111179351807, "rewards/margins": 0.4934348464012146, "rewards/rejected": -1.872945785522461, "step": 7220 }, { "epoch": 1.2456926257753274, "grad_norm": 28.475894927978516, "learning_rate": 1.4533651724251654e-07, "logits/chosen": -2.1553685665130615, "logits/rejected": -2.127537250518799, "logps/chosen": -188.8539581298828, "logps/rejected": -226.0784912109375, "loss": 0.5846, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.336523413658142, "rewards/margins": 0.407946914434433, "rewards/rejected": -1.7444703578948975, "step": 7230 }, { "epoch": 1.2474155754651963, "grad_norm": 25.94778060913086, "learning_rate": 1.4515772939820036e-07, "logits/chosen": -2.198272228240967, "logits/rejected": -2.176140308380127, "logps/chosen": -200.8077392578125, "logps/rejected": -236.87417602539062, "loss": 0.5909, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4269249439239502, "rewards/margins": 0.39614081382751465, "rewards/rejected": -1.8230657577514648, "step": 7240 }, { "epoch": 1.2491385251550655, "grad_norm": 29.854503631591797, "learning_rate": 1.4497876004674824e-07, "logits/chosen": -2.2051305770874023, "logits/rejected": -2.167301893234253, "logps/chosen": -193.99215698242188, "logps/rejected": -232.569091796875, "loss": 0.565, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.38264799118042, "rewards/margins": 0.4309522211551666, "rewards/rejected": -1.813599944114685, "step": 7250 }, { "epoch": 1.2508614748449345, "grad_norm": 33.17638397216797, "learning_rate": 1.4479960990751037e-07, "logits/chosen": -2.19197154045105, "logits/rejected": -2.1598422527313232, "logps/chosen": -205.8832550048828, "logps/rejected": -246.341552734375, "loss": 0.5765, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.507854700088501, "rewards/margins": 0.44655442237854004, "rewards/rejected": -1.9544092416763306, "step": 7260 }, { "epoch": 1.2525844245348035, "grad_norm": 33.7602424621582, "learning_rate": 1.4462027970056336e-07, "logits/chosen": -2.221886396408081, "logits/rejected": -2.180298328399658, "logps/chosen": -192.14157104492188, "logps/rejected": -230.5829315185547, "loss": 0.5885, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3673036098480225, "rewards/margins": 0.4185425341129303, "rewards/rejected": -1.7858461141586304, "step": 7270 }, { "epoch": 1.2543073742246726, "grad_norm": 28.689376831054688, "learning_rate": 1.4444077014670767e-07, "logits/chosen": -2.2600386142730713, "logits/rejected": -2.216193437576294, "logps/chosen": -200.1356964111328, "logps/rejected": -241.4607696533203, "loss": 0.5815, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4219920635223389, "rewards/margins": 0.4745899736881256, "rewards/rejected": -1.896582007408142, "step": 7280 }, { "epoch": 1.2560303239145416, "grad_norm": 27.535062789916992, "learning_rate": 1.4426108196746465e-07, "logits/chosen": -2.136287212371826, "logits/rejected": -2.109666347503662, "logps/chosen": -191.55276489257812, "logps/rejected": -227.65219116210938, "loss": 0.5933, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3572779893875122, "rewards/margins": 0.367828905582428, "rewards/rejected": -1.725106954574585, "step": 7290 }, { "epoch": 1.2577532736044108, "grad_norm": 25.037973403930664, "learning_rate": 1.4408121588507358e-07, "logits/chosen": -2.101698398590088, "logits/rejected": -2.072650194168091, "logps/chosen": -185.10435485839844, "logps/rejected": -224.55752563476562, "loss": 0.6116, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3446990251541138, "rewards/margins": 0.37932828068733215, "rewards/rejected": -1.724027395248413, "step": 7300 }, { "epoch": 1.2594762232942798, "grad_norm": 24.10384750366211, "learning_rate": 1.4390117262248886e-07, "logits/chosen": -2.235344409942627, "logits/rejected": -2.1998746395111084, "logps/chosen": -192.3137969970703, "logps/rejected": -235.06784057617188, "loss": 0.578, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3667701482772827, "rewards/margins": 0.47494545578956604, "rewards/rejected": -1.8417155742645264, "step": 7310 }, { "epoch": 1.2611991729841487, "grad_norm": 21.041650772094727, "learning_rate": 1.4372095290337697e-07, "logits/chosen": -2.248053789138794, "logits/rejected": -2.21608829498291, "logps/chosen": -176.29461669921875, "logps/rejected": -209.7880096435547, "loss": 0.6027, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.205918788909912, "rewards/margins": 0.38784992694854736, "rewards/rejected": -1.5937687158584595, "step": 7320 }, { "epoch": 1.262922122674018, "grad_norm": 43.863494873046875, "learning_rate": 1.4354055745211372e-07, "logits/chosen": -2.1404201984405518, "logits/rejected": -2.103395938873291, "logps/chosen": -191.5880584716797, "logps/rejected": -242.8134002685547, "loss": 0.5418, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3741697072982788, "rewards/margins": 0.5526102185249329, "rewards/rejected": -1.9267799854278564, "step": 7330 }, { "epoch": 1.264645072363887, "grad_norm": 22.694883346557617, "learning_rate": 1.4335998699378123e-07, "logits/chosen": -2.2119076251983643, "logits/rejected": -2.1771349906921387, "logps/chosen": -206.36874389648438, "logps/rejected": -241.12820434570312, "loss": 0.6079, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.503580093383789, "rewards/margins": 0.3770178556442261, "rewards/rejected": -1.8805980682373047, "step": 7340 }, { "epoch": 1.266368022053756, "grad_norm": 33.1309700012207, "learning_rate": 1.4317924225416493e-07, "logits/chosen": -2.2810635566711426, "logits/rejected": -2.243312120437622, "logps/chosen": -177.04080200195312, "logps/rejected": -215.0124969482422, "loss": 0.5747, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1863033771514893, "rewards/margins": 0.45680537819862366, "rewards/rejected": -1.6431087255477905, "step": 7350 }, { "epoch": 1.268090971743625, "grad_norm": 36.34376525878906, "learning_rate": 1.42998323959751e-07, "logits/chosen": -2.171330213546753, "logits/rejected": -2.145925998687744, "logps/chosen": -194.1071319580078, "logps/rejected": -231.82693481445312, "loss": 0.5934, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.385913610458374, "rewards/margins": 0.4217739701271057, "rewards/rejected": -1.807687759399414, "step": 7360 }, { "epoch": 1.269813921433494, "grad_norm": 44.59245681762695, "learning_rate": 1.4281723283772297e-07, "logits/chosen": -2.1258740425109863, "logits/rejected": -2.0956592559814453, "logps/chosen": -197.22552490234375, "logps/rejected": -241.16964721679688, "loss": 0.5757, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4306964874267578, "rewards/margins": 0.47660359740257263, "rewards/rejected": -1.9073002338409424, "step": 7370 }, { "epoch": 1.2715368711233632, "grad_norm": 31.368579864501953, "learning_rate": 1.4263596961595913e-07, "logits/chosen": -2.1727757453918457, "logits/rejected": -2.1414594650268555, "logps/chosen": -196.63665771484375, "logps/rejected": -243.6747589111328, "loss": 0.5809, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4313442707061768, "rewards/margins": 0.47405076026916504, "rewards/rejected": -1.9053949117660522, "step": 7380 }, { "epoch": 1.2732598208132322, "grad_norm": 47.17496871948242, "learning_rate": 1.424545350230296e-07, "logits/chosen": -2.1499431133270264, "logits/rejected": -2.1169068813323975, "logps/chosen": -197.3967742919922, "logps/rejected": -244.5073699951172, "loss": 0.5753, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4444047212600708, "rewards/margins": 0.4816197454929352, "rewards/rejected": -1.9260244369506836, "step": 7390 }, { "epoch": 1.2749827705031014, "grad_norm": 23.355920791625977, "learning_rate": 1.422729297881931e-07, "logits/chosen": -2.127746343612671, "logits/rejected": -2.0778279304504395, "logps/chosen": -226.56710815429688, "logps/rejected": -276.52569580078125, "loss": 0.5395, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7020788192749023, "rewards/margins": 0.5654906034469604, "rewards/rejected": -2.267569065093994, "step": 7400 }, { "epoch": 1.2767057201929704, "grad_norm": 45.91021728515625, "learning_rate": 1.4209115464139445e-07, "logits/chosen": -2.117419958114624, "logits/rejected": -2.0680668354034424, "logps/chosen": -225.8282470703125, "logps/rejected": -279.32623291015625, "loss": 0.5735, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7132211923599243, "rewards/margins": 0.5609859824180603, "rewards/rejected": -2.274207353591919, "step": 7410 }, { "epoch": 1.2784286698828393, "grad_norm": 30.91571044921875, "learning_rate": 1.419092103132612e-07, "logits/chosen": -2.076505661010742, "logits/rejected": -2.050628423690796, "logps/chosen": -228.55966186523438, "logps/rejected": -265.3526916503906, "loss": 0.5955, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.7106529474258423, "rewards/margins": 0.41547513008117676, "rewards/rejected": -2.1261279582977295, "step": 7420 }, { "epoch": 1.2801516195727085, "grad_norm": 39.247623443603516, "learning_rate": 1.4172709753510117e-07, "logits/chosen": -2.0895779132843018, "logits/rejected": -2.051273822784424, "logps/chosen": -219.62612915039062, "logps/rejected": -268.3280334472656, "loss": 0.5791, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6621767282485962, "rewards/margins": 0.5093218088150024, "rewards/rejected": -2.1714985370635986, "step": 7430 }, { "epoch": 1.2818745692625775, "grad_norm": 26.95642852783203, "learning_rate": 1.41544817038899e-07, "logits/chosen": -2.2014544010162354, "logits/rejected": -2.1671154499053955, "logps/chosen": -202.35275268554688, "logps/rejected": -237.54611206054688, "loss": 0.61, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4587031602859497, "rewards/margins": 0.3934493660926819, "rewards/rejected": -1.8521524667739868, "step": 7440 }, { "epoch": 1.2835975189524467, "grad_norm": 28.73160171508789, "learning_rate": 1.4136236955731354e-07, "logits/chosen": -2.317675828933716, "logits/rejected": -2.2803618907928467, "logps/chosen": -173.7164764404297, "logps/rejected": -201.6647186279297, "loss": 0.6046, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1620408296585083, "rewards/margins": 0.33942776918411255, "rewards/rejected": -1.5014686584472656, "step": 7450 }, { "epoch": 1.2853204686423156, "grad_norm": 26.703969955444336, "learning_rate": 1.4117975582367488e-07, "logits/chosen": -2.168306350708008, "logits/rejected": -2.148745536804199, "logps/chosen": -176.16409301757812, "logps/rejected": -217.79684448242188, "loss": 0.5928, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.25123131275177, "rewards/margins": 0.4050864577293396, "rewards/rejected": -1.6563177108764648, "step": 7460 }, { "epoch": 1.2870434183321846, "grad_norm": 33.33496856689453, "learning_rate": 1.4099697657198128e-07, "logits/chosen": -2.2102653980255127, "logits/rejected": -2.189234733581543, "logps/chosen": -196.8131103515625, "logps/rejected": -221.9905548095703, "loss": 0.65, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4170329570770264, "rewards/margins": 0.29639095067977905, "rewards/rejected": -1.7134240865707397, "step": 7470 }, { "epoch": 1.2887663680220538, "grad_norm": 32.69403839111328, "learning_rate": 1.4081403253689638e-07, "logits/chosen": -2.1745803356170654, "logits/rejected": -2.140530586242676, "logps/chosen": -177.96560668945312, "logps/rejected": -204.87222290039062, "loss": 0.6119, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2249789237976074, "rewards/margins": 0.3372560739517212, "rewards/rejected": -1.5622351169586182, "step": 7480 }, { "epoch": 1.2904893177119228, "grad_norm": 27.952274322509766, "learning_rate": 1.4063092445374591e-07, "logits/chosen": -2.1473286151885986, "logits/rejected": -2.1264710426330566, "logps/chosen": -187.06661987304688, "logps/rejected": -220.3990936279297, "loss": 0.6118, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3386499881744385, "rewards/margins": 0.33736392855644226, "rewards/rejected": -1.6760139465332031, "step": 7490 }, { "epoch": 1.292212267401792, "grad_norm": 27.661653518676758, "learning_rate": 1.404476530585153e-07, "logits/chosen": -2.1809844970703125, "logits/rejected": -2.1510908603668213, "logps/chosen": -175.53463745117188, "logps/rejected": -207.1257781982422, "loss": 0.6244, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2388243675231934, "rewards/margins": 0.32033151388168335, "rewards/rejected": -1.559156060218811, "step": 7500 }, { "epoch": 1.293935217091661, "grad_norm": 26.77812957763672, "learning_rate": 1.402642190878462e-07, "logits/chosen": -2.1876208782196045, "logits/rejected": -2.161717414855957, "logps/chosen": -178.3339080810547, "logps/rejected": -207.5714111328125, "loss": 0.5876, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1986923217773438, "rewards/margins": 0.36776408553123474, "rewards/rejected": -1.5664561986923218, "step": 7510 }, { "epoch": 1.29565816678153, "grad_norm": 26.58428192138672, "learning_rate": 1.4008062327903373e-07, "logits/chosen": -2.1912448406219482, "logits/rejected": -2.166442394256592, "logps/chosen": -173.25840759277344, "logps/rejected": -209.9363555908203, "loss": 0.5816, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1874148845672607, "rewards/margins": 0.38662785291671753, "rewards/rejected": -1.574042797088623, "step": 7520 }, { "epoch": 1.297381116471399, "grad_norm": 24.964679718017578, "learning_rate": 1.398968663700235e-07, "logits/chosen": -2.139922618865967, "logits/rejected": -2.1178131103515625, "logps/chosen": -172.1335906982422, "logps/rejected": -214.1690216064453, "loss": 0.5787, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.194275140762329, "rewards/margins": 0.4243914484977722, "rewards/rejected": -1.618666410446167, "step": 7530 }, { "epoch": 1.299104066161268, "grad_norm": 24.18116569519043, "learning_rate": 1.3971294909940872e-07, "logits/chosen": -2.2325923442840576, "logits/rejected": -2.204035758972168, "logps/chosen": -176.44895935058594, "logps/rejected": -231.82229614257812, "loss": 0.5317, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2390239238739014, "rewards/margins": 0.5569388270378113, "rewards/rejected": -1.7959626913070679, "step": 7540 }, { "epoch": 1.3008270158511372, "grad_norm": 33.15352249145508, "learning_rate": 1.395288722064271e-07, "logits/chosen": -2.1470162868499756, "logits/rejected": -2.121925115585327, "logps/chosen": -198.5567169189453, "logps/rejected": -244.9143829345703, "loss": 0.585, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4541960954666138, "rewards/margins": 0.4865036606788635, "rewards/rejected": -1.940699577331543, "step": 7550 }, { "epoch": 1.3025499655410062, "grad_norm": 27.650455474853516, "learning_rate": 1.39344636430958e-07, "logits/chosen": -2.2052130699157715, "logits/rejected": -2.1664042472839355, "logps/chosen": -195.47647094726562, "logps/rejected": -255.9792938232422, "loss": 0.5353, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4195246696472168, "rewards/margins": 0.5902754068374634, "rewards/rejected": -2.0097999572753906, "step": 7560 }, { "epoch": 1.3042729152308752, "grad_norm": 37.809913635253906, "learning_rate": 1.3916024251351922e-07, "logits/chosen": -2.1798274517059326, "logits/rejected": -2.1409709453582764, "logps/chosen": -233.6822052001953, "logps/rejected": -287.028564453125, "loss": 0.5546, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7669155597686768, "rewards/margins": 0.5862165093421936, "rewards/rejected": -2.3531317710876465, "step": 7570 }, { "epoch": 1.3059958649207444, "grad_norm": 38.519527435302734, "learning_rate": 1.3897569119526442e-07, "logits/chosen": -2.1253037452697754, "logits/rejected": -2.095015287399292, "logps/chosen": -234.3262939453125, "logps/rejected": -282.79449462890625, "loss": 0.5699, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7710899114608765, "rewards/margins": 0.5180790424346924, "rewards/rejected": -2.2891690731048584, "step": 7580 }, { "epoch": 1.3077188146106133, "grad_norm": 24.080333709716797, "learning_rate": 1.387909832179798e-07, "logits/chosen": -2.1358370780944824, "logits/rejected": -2.0884411334991455, "logps/chosen": -253.19937133789062, "logps/rejected": -307.0641784667969, "loss": 0.5695, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9425344467163086, "rewards/margins": 0.632226824760437, "rewards/rejected": -2.574761390686035, "step": 7590 }, { "epoch": 1.3094417643004825, "grad_norm": 45.20371627807617, "learning_rate": 1.3860611932408118e-07, "logits/chosen": -2.122115135192871, "logits/rejected": -2.0939908027648926, "logps/chosen": -235.76028442382812, "logps/rejected": -265.4013366699219, "loss": 0.6357, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8253326416015625, "rewards/margins": 0.3514913022518158, "rewards/rejected": -2.176823854446411, "step": 7600 }, { "epoch": 1.3094417643004825, "eval_logits/chosen": -2.2208468914031982, "eval_logits/rejected": -2.203432083129883, "eval_logps/chosen": -198.77024841308594, "eval_logps/rejected": -226.18759155273438, "eval_loss": 0.6412925720214844, "eval_rewards/accuracies": 0.6124535202980042, "eval_rewards/chosen": -1.3975476026535034, "eval_rewards/margins": 0.23683220148086548, "eval_rewards/rejected": -1.6343798637390137, "eval_runtime": 383.4213, "eval_samples_per_second": 11.225, "eval_steps_per_second": 1.403, "step": 7600 }, { "epoch": 1.3111647139903515, "grad_norm": 22.482765197753906, "learning_rate": 1.3842110025661126e-07, "logits/chosen": -2.1169135570526123, "logits/rejected": -2.0805201530456543, "logps/chosen": -200.65127563476562, "logps/rejected": -246.5351104736328, "loss": 0.5702, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4636411666870117, "rewards/margins": 0.5099278688430786, "rewards/rejected": -1.9735692739486694, "step": 7610 }, { "epoch": 1.3128876636802205, "grad_norm": 31.662145614624023, "learning_rate": 1.3823592675923625e-07, "logits/chosen": -2.166860342025757, "logits/rejected": -2.135286569595337, "logps/chosen": -199.06710815429688, "logps/rejected": -237.6704864501953, "loss": 0.5748, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4023758172988892, "rewards/margins": 0.43845334649086, "rewards/rejected": -1.8408292531967163, "step": 7620 }, { "epoch": 1.3146106133700897, "grad_norm": 30.58859634399414, "learning_rate": 1.3805059957624318e-07, "logits/chosen": -2.1249918937683105, "logits/rejected": -2.1092543601989746, "logps/chosen": -196.4220733642578, "logps/rejected": -248.9763946533203, "loss": 0.5485, "rewards/accuracies": 0.75, "rewards/chosen": -1.4566963911056519, "rewards/margins": 0.49759864807128906, "rewards/rejected": -1.9542949199676514, "step": 7630 }, { "epoch": 1.3163335630599586, "grad_norm": 38.346683502197266, "learning_rate": 1.3786511945253675e-07, "logits/chosen": -2.1035542488098145, "logits/rejected": -2.066838502883911, "logps/chosen": -229.5228271484375, "logps/rejected": -273.35430908203125, "loss": 0.5913, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7108421325683594, "rewards/margins": 0.5063773989677429, "rewards/rejected": -2.217219591140747, "step": 7640 }, { "epoch": 1.3180565127498278, "grad_norm": 25.326038360595703, "learning_rate": 1.3767948713363646e-07, "logits/chosen": -2.168118476867676, "logits/rejected": -2.138068199157715, "logps/chosen": -226.3521270751953, "logps/rejected": -265.44915771484375, "loss": 0.6025, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6705223321914673, "rewards/margins": 0.4374934136867523, "rewards/rejected": -2.108015775680542, "step": 7650 }, { "epoch": 1.3197794624396968, "grad_norm": 21.727615356445312, "learning_rate": 1.374937033656735e-07, "logits/chosen": -2.2056868076324463, "logits/rejected": -2.164641857147217, "logps/chosen": -193.8480682373047, "logps/rejected": -246.1538848876953, "loss": 0.5336, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.3655202388763428, "rewards/margins": 0.5554262399673462, "rewards/rejected": -1.920946717262268, "step": 7660 }, { "epoch": 1.3215024121295658, "grad_norm": 31.452966690063477, "learning_rate": 1.3730776889538776e-07, "logits/chosen": -2.1546008586883545, "logits/rejected": -2.1246042251586914, "logps/chosen": -194.42176818847656, "logps/rejected": -227.3402557373047, "loss": 0.6119, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3616440296173096, "rewards/margins": 0.3867500424385071, "rewards/rejected": -1.7483940124511719, "step": 7670 }, { "epoch": 1.323225361819435, "grad_norm": 27.648405075073242, "learning_rate": 1.3712168447012493e-07, "logits/chosen": -2.192735195159912, "logits/rejected": -2.1616322994232178, "logps/chosen": -187.9457244873047, "logps/rejected": -228.00625610351562, "loss": 0.5635, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3604012727737427, "rewards/margins": 0.43012505769729614, "rewards/rejected": -1.7905261516571045, "step": 7680 }, { "epoch": 1.324948311509304, "grad_norm": 24.951980590820312, "learning_rate": 1.369354508378334e-07, "logits/chosen": -2.2712864875793457, "logits/rejected": -2.2198472023010254, "logps/chosen": -194.9119873046875, "logps/rejected": -235.7847137451172, "loss": 0.5708, "rewards/accuracies": 0.6875, "rewards/chosen": -1.385386347770691, "rewards/margins": 0.48633819818496704, "rewards/rejected": -1.8717244863510132, "step": 7690 }, { "epoch": 1.3266712611991731, "grad_norm": 22.935848236083984, "learning_rate": 1.3674906874706129e-07, "logits/chosen": -2.1742866039276123, "logits/rejected": -2.1302480697631836, "logps/chosen": -194.0531005859375, "logps/rejected": -235.50125122070312, "loss": 0.5815, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3940820693969727, "rewards/margins": 0.42350903153419495, "rewards/rejected": -1.8175909519195557, "step": 7700 }, { "epoch": 1.328394210889042, "grad_norm": 31.301496505737305, "learning_rate": 1.365625389469534e-07, "logits/chosen": -2.1869258880615234, "logits/rejected": -2.160074234008789, "logps/chosen": -198.0553436279297, "logps/rejected": -230.83023071289062, "loss": 0.6123, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4490340948104858, "rewards/margins": 0.3512323498725891, "rewards/rejected": -1.8002665042877197, "step": 7710 }, { "epoch": 1.330117160578911, "grad_norm": 32.13523864746094, "learning_rate": 1.363758621872483e-07, "logits/chosen": -2.193956136703491, "logits/rejected": -2.156627893447876, "logps/chosen": -196.5673370361328, "logps/rejected": -227.8412628173828, "loss": 0.5943, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3913943767547607, "rewards/margins": 0.3918333947658539, "rewards/rejected": -1.7832276821136475, "step": 7720 }, { "epoch": 1.33184011026878, "grad_norm": 36.192283630371094, "learning_rate": 1.361890392182752e-07, "logits/chosen": -2.131290912628174, "logits/rejected": -2.0990428924560547, "logps/chosen": -181.3380584716797, "logps/rejected": -223.23141479492188, "loss": 0.5653, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2632769346237183, "rewards/margins": 0.4550713002681732, "rewards/rejected": -1.7183481454849243, "step": 7730 }, { "epoch": 1.3335630599586492, "grad_norm": 26.759370803833008, "learning_rate": 1.3600207079095097e-07, "logits/chosen": -2.1518218517303467, "logits/rejected": -2.1181464195251465, "logps/chosen": -205.61929321289062, "logps/rejected": -261.2309875488281, "loss": 0.5487, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5240737199783325, "rewards/margins": 0.5604895353317261, "rewards/rejected": -2.0845632553100586, "step": 7740 }, { "epoch": 1.3352860096485184, "grad_norm": 34.86008071899414, "learning_rate": 1.3581495765677718e-07, "logits/chosen": -2.1620683670043945, "logits/rejected": -2.1198601722717285, "logps/chosen": -224.58352661132812, "logps/rejected": -274.17803955078125, "loss": 0.5766, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.69365656375885, "rewards/margins": 0.5385280847549438, "rewards/rejected": -2.232184648513794, "step": 7750 }, { "epoch": 1.3370089593383874, "grad_norm": 34.262413024902344, "learning_rate": 1.3562770056783702e-07, "logits/chosen": -2.0609984397888184, "logits/rejected": -2.026923894882202, "logps/chosen": -197.84727478027344, "logps/rejected": -253.30307006835938, "loss": 0.5368, "rewards/accuracies": 0.75, "rewards/chosen": -1.4634578227996826, "rewards/margins": 0.5615237951278687, "rewards/rejected": -2.0249814987182617, "step": 7760 }, { "epoch": 1.3387319090282563, "grad_norm": 24.656667709350586, "learning_rate": 1.3544030027679232e-07, "logits/chosen": -2.1213603019714355, "logits/rejected": -2.085960865020752, "logps/chosen": -206.48831176757812, "logps/rejected": -255.5338134765625, "loss": 0.5678, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.532331109046936, "rewards/margins": 0.5047855377197266, "rewards/rejected": -2.037116527557373, "step": 7770 }, { "epoch": 1.3404548587181253, "grad_norm": 29.939220428466797, "learning_rate": 1.3525275753688042e-07, "logits/chosen": -2.193809986114502, "logits/rejected": -2.168917179107666, "logps/chosen": -219.8052520751953, "logps/rejected": -268.90814208984375, "loss": 0.6032, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6730769872665405, "rewards/margins": 0.4895497262477875, "rewards/rejected": -2.1626267433166504, "step": 7780 }, { "epoch": 1.3421778084079945, "grad_norm": 45.841880798339844, "learning_rate": 1.350650731019113e-07, "logits/chosen": -2.1801536083221436, "logits/rejected": -2.1468937397003174, "logps/chosen": -216.2582244873047, "logps/rejected": -274.93438720703125, "loss": 0.5399, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.641721487045288, "rewards/margins": 0.5975975394248962, "rewards/rejected": -2.239319324493408, "step": 7790 }, { "epoch": 1.3439007580978635, "grad_norm": 32.21296691894531, "learning_rate": 1.3487724772626439e-07, "logits/chosen": -2.1707370281219482, "logits/rejected": -2.138662338256836, "logps/chosen": -228.29141235351562, "logps/rejected": -283.7637939453125, "loss": 0.5713, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7291542291641235, "rewards/margins": 0.5668483972549438, "rewards/rejected": -2.2960026264190674, "step": 7800 }, { "epoch": 1.3456237077877327, "grad_norm": 50.75136184692383, "learning_rate": 1.346892821648857e-07, "logits/chosen": -2.175175428390503, "logits/rejected": -2.130139112472534, "logps/chosen": -235.88687133789062, "logps/rejected": -280.6183776855469, "loss": 0.5859, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7639777660369873, "rewards/margins": 0.5005490779876709, "rewards/rejected": -2.264526844024658, "step": 7810 }, { "epoch": 1.3473466574776016, "grad_norm": 52.12438201904297, "learning_rate": 1.3450117717328468e-07, "logits/chosen": -2.166834592819214, "logits/rejected": -2.1256909370422363, "logps/chosen": -219.2531280517578, "logps/rejected": -271.19378662109375, "loss": 0.581, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.616594672203064, "rewards/margins": 0.5541667342185974, "rewards/rejected": -2.1707613468170166, "step": 7820 }, { "epoch": 1.3490696071674706, "grad_norm": 31.646299362182617, "learning_rate": 1.3431293350753115e-07, "logits/chosen": -2.1095938682556152, "logits/rejected": -2.0786352157592773, "logps/chosen": -205.64035034179688, "logps/rejected": -261.259521484375, "loss": 0.5568, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5484790802001953, "rewards/margins": 0.5396331548690796, "rewards/rejected": -2.0881123542785645, "step": 7830 }, { "epoch": 1.3507925568573398, "grad_norm": 28.87881851196289, "learning_rate": 1.341245519242524e-07, "logits/chosen": -2.1058099269866943, "logits/rejected": -2.0774412155151367, "logps/chosen": -197.8090362548828, "logps/rejected": -233.66806030273438, "loss": 0.6179, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4320785999298096, "rewards/margins": 0.37303489446640015, "rewards/rejected": -1.805113434791565, "step": 7840 }, { "epoch": 1.3525155065472088, "grad_norm": 27.308841705322266, "learning_rate": 1.3393603318063e-07, "logits/chosen": -2.0920326709747314, "logits/rejected": -2.0379960536956787, "logps/chosen": -205.4201202392578, "logps/rejected": -240.3004913330078, "loss": 0.6017, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4918204545974731, "rewards/margins": 0.4359316825866699, "rewards/rejected": -1.927752137184143, "step": 7850 }, { "epoch": 1.354238456237078, "grad_norm": 36.3019905090332, "learning_rate": 1.3374737803439685e-07, "logits/chosen": -2.107954502105713, "logits/rejected": -2.058523416519165, "logps/chosen": -224.4720916748047, "logps/rejected": -279.4774169921875, "loss": 0.5657, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.71206533908844, "rewards/margins": 0.566798210144043, "rewards/rejected": -2.2788634300231934, "step": 7860 }, { "epoch": 1.355961405926947, "grad_norm": 34.743247985839844, "learning_rate": 1.3355858724383415e-07, "logits/chosen": -2.0951883792877197, "logits/rejected": -2.0639822483062744, "logps/chosen": -243.0995635986328, "logps/rejected": -293.8856506347656, "loss": 0.5744, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.863247275352478, "rewards/margins": 0.5420467257499695, "rewards/rejected": -2.405294179916382, "step": 7870 }, { "epoch": 1.3576843556168159, "grad_norm": 42.764366149902344, "learning_rate": 1.3336966156776822e-07, "logits/chosen": -2.1480154991149902, "logits/rejected": -2.1229541301727295, "logps/chosen": -233.7257537841797, "logps/rejected": -269.1768798828125, "loss": 0.6355, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7973674535751343, "rewards/margins": 0.3781353533267975, "rewards/rejected": -2.1755027770996094, "step": 7880 }, { "epoch": 1.359407305306685, "grad_norm": 29.920547485351562, "learning_rate": 1.3318060176556756e-07, "logits/chosen": -2.1473171710968018, "logits/rejected": -2.112473487854004, "logps/chosen": -203.85462951660156, "logps/rejected": -251.13766479492188, "loss": 0.5794, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4900063276290894, "rewards/margins": 0.47319668531417847, "rewards/rejected": -1.9632028341293335, "step": 7890 }, { "epoch": 1.361130254996554, "grad_norm": 35.13427734375, "learning_rate": 1.3299140859713983e-07, "logits/chosen": -2.195388078689575, "logits/rejected": -2.1696789264678955, "logps/chosen": -184.6055908203125, "logps/rejected": -229.09329223632812, "loss": 0.5793, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3148016929626465, "rewards/margins": 0.45367079973220825, "rewards/rejected": -1.768472671508789, "step": 7900 }, { "epoch": 1.3628532046864232, "grad_norm": 42.53324890136719, "learning_rate": 1.3280208282292878e-07, "logits/chosen": -2.192570686340332, "logits/rejected": -2.1600852012634277, "logps/chosen": -189.4005889892578, "logps/rejected": -221.49951171875, "loss": 0.6167, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3212350606918335, "rewards/margins": 0.36447951197624207, "rewards/rejected": -1.6857147216796875, "step": 7910 }, { "epoch": 1.3645761543762922, "grad_norm": 26.738149642944336, "learning_rate": 1.3261262520391097e-07, "logits/chosen": -2.1648764610290527, "logits/rejected": -2.136169195175171, "logps/chosen": -188.26707458496094, "logps/rejected": -223.86648559570312, "loss": 0.5765, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3203630447387695, "rewards/margins": 0.4093676209449768, "rewards/rejected": -1.7297306060791016, "step": 7920 }, { "epoch": 1.3662991040661612, "grad_norm": 35.8544921875, "learning_rate": 1.3242303650159313e-07, "logits/chosen": -2.196377992630005, "logits/rejected": -2.1608593463897705, "logps/chosen": -200.53225708007812, "logps/rejected": -236.4687957763672, "loss": 0.5847, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4146637916564941, "rewards/margins": 0.43156057596206665, "rewards/rejected": -1.8462244272232056, "step": 7930 }, { "epoch": 1.3680220537560304, "grad_norm": 47.39460754394531, "learning_rate": 1.3223331747800867e-07, "logits/chosen": -2.1359665393829346, "logits/rejected": -2.1010541915893555, "logps/chosen": -199.37338256835938, "logps/rejected": -249.5021209716797, "loss": 0.5582, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4021588563919067, "rewards/margins": 0.5483847856521606, "rewards/rejected": -1.9505436420440674, "step": 7940 }, { "epoch": 1.3697450034458993, "grad_norm": 25.43636703491211, "learning_rate": 1.3204346889571494e-07, "logits/chosen": -2.12013840675354, "logits/rejected": -2.100837230682373, "logps/chosen": -177.21873474121094, "logps/rejected": -224.29898071289062, "loss": 0.5749, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.251285195350647, "rewards/margins": 0.47028857469558716, "rewards/rejected": -1.721573829650879, "step": 7950 }, { "epoch": 1.3714679531357685, "grad_norm": 24.467222213745117, "learning_rate": 1.3185349151779e-07, "logits/chosen": -2.1382768154144287, "logits/rejected": -2.104619264602661, "logps/chosen": -182.4629364013672, "logps/rejected": -221.2128143310547, "loss": 0.5913, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.31582510471344, "rewards/margins": 0.38097429275512695, "rewards/rejected": -1.6967992782592773, "step": 7960 }, { "epoch": 1.3731909028256375, "grad_norm": 27.679777145385742, "learning_rate": 1.3166338610782957e-07, "logits/chosen": -2.211303234100342, "logits/rejected": -2.1780335903167725, "logps/chosen": -182.8100128173828, "logps/rejected": -221.35983276367188, "loss": 0.5754, "rewards/accuracies": 0.6875, "rewards/chosen": -1.26361083984375, "rewards/margins": 0.43397146463394165, "rewards/rejected": -1.6975822448730469, "step": 7970 }, { "epoch": 1.3749138525155065, "grad_norm": 27.631725311279297, "learning_rate": 1.31473153429944e-07, "logits/chosen": -2.2298529148101807, "logits/rejected": -2.2019245624542236, "logps/chosen": -190.2526092529297, "logps/rejected": -237.2704620361328, "loss": 0.5636, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3224337100982666, "rewards/margins": 0.49690714478492737, "rewards/rejected": -1.819340705871582, "step": 7980 }, { "epoch": 1.3766368022053757, "grad_norm": 36.38218688964844, "learning_rate": 1.3128279424875523e-07, "logits/chosen": -2.2090554237365723, "logits/rejected": -2.1736884117126465, "logps/chosen": -195.2666778564453, "logps/rejected": -249.32376098632812, "loss": 0.5378, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4387704133987427, "rewards/margins": 0.5402941703796387, "rewards/rejected": -1.9790645837783813, "step": 7990 }, { "epoch": 1.3783597518952446, "grad_norm": 25.795717239379883, "learning_rate": 1.3109230932939354e-07, "logits/chosen": -2.0892562866210938, "logits/rejected": -2.057300090789795, "logps/chosen": -210.97891235351562, "logps/rejected": -268.6546325683594, "loss": 0.5491, "rewards/accuracies": 0.75, "rewards/chosen": -1.5498253107070923, "rewards/margins": 0.5857113599777222, "rewards/rejected": -2.1355366706848145, "step": 8000 }, { "epoch": 1.3783597518952446, "eval_logits/chosen": -2.2084763050079346, "eval_logits/rejected": -2.190638542175293, "eval_logps/chosen": -205.56568908691406, "eval_logps/rejected": -233.9599151611328, "eval_loss": 0.6438331604003906, "eval_rewards/accuracies": 0.6054832935333252, "eval_rewards/chosen": -1.4655022621154785, "eval_rewards/margins": 0.24660077691078186, "eval_rewards/rejected": -1.712103009223938, "eval_runtime": 382.7297, "eval_samples_per_second": 11.246, "eval_steps_per_second": 1.406, "step": 8000 }, { "epoch": 1.3800827015851138, "grad_norm": 50.01524353027344, "learning_rate": 1.3090169943749475e-07, "logits/chosen": -2.1011762619018555, "logits/rejected": -2.053696632385254, "logps/chosen": -227.21572875976562, "logps/rejected": -275.5449523925781, "loss": 0.576, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6916106939315796, "rewards/margins": 0.5524572134017944, "rewards/rejected": -2.244067907333374, "step": 8010 }, { "epoch": 1.3818056512749828, "grad_norm": 31.691274642944336, "learning_rate": 1.307109653391969e-07, "logits/chosen": -2.124875068664551, "logits/rejected": -2.08036208152771, "logps/chosen": -237.47872924804688, "logps/rejected": -280.66436767578125, "loss": 0.5605, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7674553394317627, "rewards/margins": 0.5002973079681396, "rewards/rejected": -2.2677524089813232, "step": 8020 }, { "epoch": 1.3835286009648518, "grad_norm": 43.40495681762695, "learning_rate": 1.3052010780113726e-07, "logits/chosen": -2.1229870319366455, "logits/rejected": -2.0922768115997314, "logps/chosen": -206.2229461669922, "logps/rejected": -257.94525146484375, "loss": 0.5747, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5460470914840698, "rewards/margins": 0.5066577196121216, "rewards/rejected": -2.0527050495147705, "step": 8030 }, { "epoch": 1.385251550654721, "grad_norm": 28.32925796508789, "learning_rate": 1.3032912759044937e-07, "logits/chosen": -2.1125435829162598, "logits/rejected": -2.062844753265381, "logps/chosen": -220.2933349609375, "logps/rejected": -269.1329650878906, "loss": 0.5803, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6463918685913086, "rewards/margins": 0.5379444360733032, "rewards/rejected": -2.1843366622924805, "step": 8040 }, { "epoch": 1.38697450034459, "grad_norm": 26.020471572875977, "learning_rate": 1.301380254747597e-07, "logits/chosen": -2.1440205574035645, "logits/rejected": -2.113903045654297, "logps/chosen": -203.15582275390625, "logps/rejected": -256.64044189453125, "loss": 0.5576, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4739043712615967, "rewards/margins": 0.5403920412063599, "rewards/rejected": -2.014296531677246, "step": 8050 }, { "epoch": 1.388697450034459, "grad_norm": 32.8132209777832, "learning_rate": 1.2994680222218478e-07, "logits/chosen": -2.1887154579162598, "logits/rejected": -2.1529600620269775, "logps/chosen": -211.48086547851562, "logps/rejected": -248.77041625976562, "loss": 0.598, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5500705242156982, "rewards/margins": 0.44143232703208923, "rewards/rejected": -1.9915030002593994, "step": 8060 }, { "epoch": 1.390420399724328, "grad_norm": 39.48831558227539, "learning_rate": 1.29755458601328e-07, "logits/chosen": -2.1170506477355957, "logits/rejected": -2.078484058380127, "logps/chosen": -200.0701904296875, "logps/rejected": -240.6426544189453, "loss": 0.5641, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4589961767196655, "rewards/margins": 0.45816683769226074, "rewards/rejected": -1.9171631336212158, "step": 8070 }, { "epoch": 1.392143349414197, "grad_norm": 33.201900482177734, "learning_rate": 1.2956399538127665e-07, "logits/chosen": -2.114403247833252, "logits/rejected": -2.0826992988586426, "logps/chosen": -205.1845245361328, "logps/rejected": -267.910888671875, "loss": 0.5387, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.513293981552124, "rewards/margins": 0.5943515300750732, "rewards/rejected": -2.1076455116271973, "step": 8080 }, { "epoch": 1.3938662991040662, "grad_norm": 28.73116111755371, "learning_rate": 1.2937241333159854e-07, "logits/chosen": -2.089317798614502, "logits/rejected": -2.0446722507476807, "logps/chosen": -224.7921600341797, "logps/rejected": -283.7295837402344, "loss": 0.5499, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7055130004882812, "rewards/margins": 0.6278866529464722, "rewards/rejected": -2.333399772644043, "step": 8090 }, { "epoch": 1.3955892487939352, "grad_norm": 36.38704299926758, "learning_rate": 1.2918071322233933e-07, "logits/chosen": -2.109844446182251, "logits/rejected": -2.093400478363037, "logps/chosen": -251.6876220703125, "logps/rejected": -291.4291687011719, "loss": 0.628, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9619048833847046, "rewards/margins": 0.37870702147483826, "rewards/rejected": -2.3406119346618652, "step": 8100 }, { "epoch": 1.3973121984838044, "grad_norm": 37.000370025634766, "learning_rate": 1.2898889582401912e-07, "logits/chosen": -2.089730739593506, "logits/rejected": -2.046140193939209, "logps/chosen": -251.7401580810547, "logps/rejected": -300.5556640625, "loss": 0.5798, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9494653940200806, "rewards/margins": 0.5562061071395874, "rewards/rejected": -2.505671262741089, "step": 8110 }, { "epoch": 1.3990351481736734, "grad_norm": 35.38205337524414, "learning_rate": 1.287969619076294e-07, "logits/chosen": -2.1034393310546875, "logits/rejected": -2.068429470062256, "logps/chosen": -234.1424102783203, "logps/rejected": -277.00262451171875, "loss": 0.5932, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.783160924911499, "rewards/margins": 0.46717602014541626, "rewards/rejected": -2.2503371238708496, "step": 8120 }, { "epoch": 1.4007580978635423, "grad_norm": 43.41401672363281, "learning_rate": 1.2860491224463003e-07, "logits/chosen": -2.146254777908325, "logits/rejected": -2.1182961463928223, "logps/chosen": -212.2090606689453, "logps/rejected": -252.13058471679688, "loss": 0.5811, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5653047561645508, "rewards/margins": 0.45889678597450256, "rewards/rejected": -2.0242016315460205, "step": 8130 }, { "epoch": 1.4024810475534115, "grad_norm": 36.6799430847168, "learning_rate": 1.2841274760694607e-07, "logits/chosen": -2.1333212852478027, "logits/rejected": -2.093632936477661, "logps/chosen": -203.4479217529297, "logps/rejected": -258.7621154785156, "loss": 0.553, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4967834949493408, "rewards/margins": 0.5681794881820679, "rewards/rejected": -2.064962863922119, "step": 8140 }, { "epoch": 1.4042039972432805, "grad_norm": 24.73906135559082, "learning_rate": 1.282204687669648e-07, "logits/chosen": -2.1712634563446045, "logits/rejected": -2.13863468170166, "logps/chosen": -212.09573364257812, "logps/rejected": -257.94891357421875, "loss": 0.6103, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5868251323699951, "rewards/margins": 0.47013989090919495, "rewards/rejected": -2.0569651126861572, "step": 8150 }, { "epoch": 1.4059269469331497, "grad_norm": 26.953794479370117, "learning_rate": 1.280280764975324e-07, "logits/chosen": -2.1399598121643066, "logits/rejected": -2.097625732421875, "logps/chosen": -194.59352111816406, "logps/rejected": -259.80902099609375, "loss": 0.4902, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.381890058517456, "rewards/margins": 0.706649661064148, "rewards/rejected": -2.0885396003723145, "step": 8160 }, { "epoch": 1.4076498966230186, "grad_norm": 36.16432571411133, "learning_rate": 1.278355715719511e-07, "logits/chosen": -2.164865016937256, "logits/rejected": -2.1224586963653564, "logps/chosen": -215.82766723632812, "logps/rejected": -252.830078125, "loss": 0.5694, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5576438903808594, "rewards/margins": 0.45905083417892456, "rewards/rejected": -2.0166945457458496, "step": 8170 }, { "epoch": 1.4093728463128876, "grad_norm": 37.798011779785156, "learning_rate": 1.276429547639758e-07, "logits/chosen": -2.149423837661743, "logits/rejected": -2.1127476692199707, "logps/chosen": -241.2819061279297, "logps/rejected": -288.2625732421875, "loss": 0.5852, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.8370189666748047, "rewards/margins": 0.4997822344303131, "rewards/rejected": -2.336801290512085, "step": 8180 }, { "epoch": 1.4110957960027566, "grad_norm": 38.44746398925781, "learning_rate": 1.274502268478112e-07, "logits/chosen": -2.0557899475097656, "logits/rejected": -2.0156021118164062, "logps/chosen": -246.6701202392578, "logps/rejected": -301.4805603027344, "loss": 0.5658, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.8727941513061523, "rewards/margins": 0.6068886518478394, "rewards/rejected": -2.4796829223632812, "step": 8190 }, { "epoch": 1.4128187456926258, "grad_norm": 46.51035690307617, "learning_rate": 1.2725738859810862e-07, "logits/chosen": -2.112189769744873, "logits/rejected": -2.080653667449951, "logps/chosen": -244.71798706054688, "logps/rejected": -282.700439453125, "loss": 0.6207, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8888120651245117, "rewards/margins": 0.4255140721797943, "rewards/rejected": -2.314326286315918, "step": 8200 }, { "epoch": 1.414541695382495, "grad_norm": 24.586009979248047, "learning_rate": 1.270644407899627e-07, "logits/chosen": -2.067437171936035, "logits/rejected": -2.0337612628936768, "logps/chosen": -203.90200805664062, "logps/rejected": -251.04037475585938, "loss": 0.5897, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4770008325576782, "rewards/margins": 0.4779542088508606, "rewards/rejected": -1.9549548625946045, "step": 8210 }, { "epoch": 1.416264645072364, "grad_norm": 32.18049240112305, "learning_rate": 1.2687138419890863e-07, "logits/chosen": -2.1636199951171875, "logits/rejected": -2.1185073852539062, "logps/chosen": -184.97103881835938, "logps/rejected": -235.7440948486328, "loss": 0.5581, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3181358575820923, "rewards/margins": 0.5168372392654419, "rewards/rejected": -1.8349730968475342, "step": 8220 }, { "epoch": 1.417987594762233, "grad_norm": 30.544574737548828, "learning_rate": 1.2667821960091865e-07, "logits/chosen": -2.14854097366333, "logits/rejected": -2.1254589557647705, "logps/chosen": -202.51792907714844, "logps/rejected": -237.36862182617188, "loss": 0.6038, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4582659006118774, "rewards/margins": 0.41172361373901367, "rewards/rejected": -1.8699896335601807, "step": 8230 }, { "epoch": 1.4197105444521019, "grad_norm": 48.499568939208984, "learning_rate": 1.2648494777239934e-07, "logits/chosen": -2.19791579246521, "logits/rejected": -2.153782844543457, "logps/chosen": -221.105712890625, "logps/rejected": -264.9075622558594, "loss": 0.572, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6414047479629517, "rewards/margins": 0.4974588453769684, "rewards/rejected": -2.1388633251190186, "step": 8240 }, { "epoch": 1.421433494141971, "grad_norm": 37.22239303588867, "learning_rate": 1.2629156949018805e-07, "logits/chosen": -2.154099464416504, "logits/rejected": -2.1236770153045654, "logps/chosen": -220.8912811279297, "logps/rejected": -274.5480041503906, "loss": 0.5441, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6146440505981445, "rewards/margins": 0.589509129524231, "rewards/rejected": -2.204153537750244, "step": 8250 }, { "epoch": 1.42315644383184, "grad_norm": 28.23543930053711, "learning_rate": 1.260980855315502e-07, "logits/chosen": -2.190843105316162, "logits/rejected": -2.158343553543091, "logps/chosen": -228.07455444335938, "logps/rejected": -278.42626953125, "loss": 0.5663, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6998857259750366, "rewards/margins": 0.5324597954750061, "rewards/rejected": -2.2323453426361084, "step": 8260 }, { "epoch": 1.4248793935217092, "grad_norm": 35.08918380737305, "learning_rate": 1.2590449667417585e-07, "logits/chosen": -2.1991262435913086, "logits/rejected": -2.1735010147094727, "logps/chosen": -216.458251953125, "logps/rejected": -261.60736083984375, "loss": 0.6025, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6242341995239258, "rewards/margins": 0.4514681398868561, "rewards/rejected": -2.07570219039917, "step": 8270 }, { "epoch": 1.4266023432115782, "grad_norm": 32.83218002319336, "learning_rate": 1.2571080369617673e-07, "logits/chosen": -2.139239549636841, "logits/rejected": -2.1168651580810547, "logps/chosen": -204.988525390625, "logps/rejected": -241.57247924804688, "loss": 0.615, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5196573734283447, "rewards/margins": 0.3771902322769165, "rewards/rejected": -1.8968474864959717, "step": 8280 }, { "epoch": 1.4283252929014472, "grad_norm": 22.9846134185791, "learning_rate": 1.2551700737608313e-07, "logits/chosen": -2.14395809173584, "logits/rejected": -2.0958104133605957, "logps/chosen": -183.81094360351562, "logps/rejected": -212.01416015625, "loss": 0.5932, "rewards/accuracies": 0.6875, "rewards/chosen": -1.231366753578186, "rewards/margins": 0.3542328476905823, "rewards/rejected": -1.585599660873413, "step": 8290 }, { "epoch": 1.4300482425913164, "grad_norm": 20.401521682739258, "learning_rate": 1.253231084928406e-07, "logits/chosen": -2.2636139392852783, "logits/rejected": -2.2296109199523926, "logps/chosen": -199.21340942382812, "logps/rejected": -241.51412963867188, "loss": 0.5865, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3988856077194214, "rewards/margins": 0.4828069806098938, "rewards/rejected": -1.8816925287246704, "step": 8300 }, { "epoch": 1.4317711922811853, "grad_norm": 36.791229248046875, "learning_rate": 1.2512910782580704e-07, "logits/chosen": -2.1130568981170654, "logits/rejected": -2.073232889175415, "logps/chosen": -195.42286682128906, "logps/rejected": -232.5188446044922, "loss": 0.5896, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3795632123947144, "rewards/margins": 0.4427759051322937, "rewards/rejected": -1.8223390579223633, "step": 8310 }, { "epoch": 1.4334941419710545, "grad_norm": 28.587932586669922, "learning_rate": 1.2493500615474937e-07, "logits/chosen": -2.128284215927124, "logits/rejected": -2.10135817527771, "logps/chosen": -190.30288696289062, "logps/rejected": -242.4776611328125, "loss": 0.5698, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.366477131843567, "rewards/margins": 0.523558497428894, "rewards/rejected": -1.89003586769104, "step": 8320 }, { "epoch": 1.4352170916609235, "grad_norm": 30.92583656311035, "learning_rate": 1.2474080425984056e-07, "logits/chosen": -2.1785707473754883, "logits/rejected": -2.1576743125915527, "logps/chosen": -188.43667602539062, "logps/rejected": -229.5851593017578, "loss": 0.615, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3890917301177979, "rewards/margins": 0.37919071316719055, "rewards/rejected": -1.7682822942733765, "step": 8330 }, { "epoch": 1.4369400413507925, "grad_norm": 24.35154151916504, "learning_rate": 1.2454650292165634e-07, "logits/chosen": -2.2455568313598633, "logits/rejected": -2.2208244800567627, "logps/chosen": -173.263427734375, "logps/rejected": -210.4772491455078, "loss": 0.5819, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1742428541183472, "rewards/margins": 0.41545358300209045, "rewards/rejected": -1.5896964073181152, "step": 8340 }, { "epoch": 1.4386629910406616, "grad_norm": 25.503570556640625, "learning_rate": 1.2435210292117223e-07, "logits/chosen": -2.1394078731536865, "logits/rejected": -2.112879753112793, "logps/chosen": -168.69577026367188, "logps/rejected": -195.05099487304688, "loss": 0.615, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1397912502288818, "rewards/margins": 0.28986939787864685, "rewards/rejected": -1.4296607971191406, "step": 8350 }, { "epoch": 1.4403859407305306, "grad_norm": 29.409194946289062, "learning_rate": 1.2415760503976027e-07, "logits/chosen": -2.1161611080169678, "logits/rejected": -2.072409152984619, "logps/chosen": -164.82449340820312, "logps/rejected": -202.44180297851562, "loss": 0.5674, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0871448516845703, "rewards/margins": 0.4222991466522217, "rewards/rejected": -1.509443998336792, "step": 8360 }, { "epoch": 1.4421088904203998, "grad_norm": 31.52328109741211, "learning_rate": 1.2396301005918592e-07, "logits/chosen": -2.1593313217163086, "logits/rejected": -2.1244845390319824, "logps/chosen": -173.5267791748047, "logps/rejected": -224.0096435546875, "loss": 0.5643, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.214634656906128, "rewards/margins": 0.5000126361846924, "rewards/rejected": -1.7146470546722412, "step": 8370 }, { "epoch": 1.4438318401102688, "grad_norm": 26.57062339782715, "learning_rate": 1.2376831876160493e-07, "logits/chosen": -2.207657814025879, "logits/rejected": -2.174440860748291, "logps/chosen": -182.63824462890625, "logps/rejected": -229.58169555664062, "loss": 0.5749, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2988611459732056, "rewards/margins": 0.46366414427757263, "rewards/rejected": -1.7625253200531006, "step": 8380 }, { "epoch": 1.4455547898001377, "grad_norm": 33.905601501464844, "learning_rate": 1.2357353192956015e-07, "logits/chosen": -2.181260824203491, "logits/rejected": -2.130100727081299, "logps/chosen": -204.32766723632812, "logps/rejected": -269.24566650390625, "loss": 0.5149, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.469922423362732, "rewards/margins": 0.6692734956741333, "rewards/rejected": -2.1391959190368652, "step": 8390 }, { "epoch": 1.447277739490007, "grad_norm": 32.535919189453125, "learning_rate": 1.2337865034597853e-07, "logits/chosen": -2.1397032737731934, "logits/rejected": -2.1052467823028564, "logps/chosen": -210.44638061523438, "logps/rejected": -265.6088562011719, "loss": 0.5537, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5950980186462402, "rewards/margins": 0.5580428242683411, "rewards/rejected": -2.1531407833099365, "step": 8400 }, { "epoch": 1.447277739490007, "eval_logits/chosen": -2.198362350463867, "eval_logits/rejected": -2.1797170639038086, "eval_logps/chosen": -202.7633819580078, "eval_logps/rejected": -230.68116760253906, "eval_loss": 0.6445255875587463, "eval_rewards/accuracies": 0.6259293556213379, "eval_rewards/chosen": -1.4374792575836182, "eval_rewards/margins": 0.24183642864227295, "eval_rewards/rejected": -1.679315447807312, "eval_runtime": 383.012, "eval_samples_per_second": 11.237, "eval_steps_per_second": 1.405, "step": 8400 }, { "epoch": 1.449000689179876, "grad_norm": 31.82181167602539, "learning_rate": 1.2318367479416772e-07, "logits/chosen": -2.113138198852539, "logits/rejected": -2.0723438262939453, "logps/chosen": -223.67822265625, "logps/rejected": -277.77703857421875, "loss": 0.5478, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6481117010116577, "rewards/margins": 0.6274040937423706, "rewards/rejected": -2.2755157947540283, "step": 8410 }, { "epoch": 1.450723638869745, "grad_norm": 31.47911262512207, "learning_rate": 1.2298860605781317e-07, "logits/chosen": -2.025463581085205, "logits/rejected": -1.9937463998794556, "logps/chosen": -209.2928466796875, "logps/rejected": -263.2427673339844, "loss": 0.5496, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5683337450027466, "rewards/margins": 0.5581804513931274, "rewards/rejected": -2.126513957977295, "step": 8420 }, { "epoch": 1.452446588559614, "grad_norm": 36.388572692871094, "learning_rate": 1.2279344492097482e-07, "logits/chosen": -2.0982298851013184, "logits/rejected": -2.066349744796753, "logps/chosen": -224.09603881835938, "logps/rejected": -277.3132629394531, "loss": 0.5998, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6971712112426758, "rewards/margins": 0.5131348371505737, "rewards/rejected": -2.21030592918396, "step": 8430 }, { "epoch": 1.454169538249483, "grad_norm": 28.68044662475586, "learning_rate": 1.2259819216808406e-07, "logits/chosen": -2.1119582653045654, "logits/rejected": -2.089153289794922, "logps/chosen": -222.8911895751953, "logps/rejected": -261.83636474609375, "loss": 0.6012, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6672919988632202, "rewards/margins": 0.4484184682369232, "rewards/rejected": -2.115710496902466, "step": 8440 }, { "epoch": 1.4558924879393522, "grad_norm": 44.86223220825195, "learning_rate": 1.2240284858394048e-07, "logits/chosen": -2.0294086933135986, "logits/rejected": -2.006143093109131, "logps/chosen": -217.24111938476562, "logps/rejected": -272.67327880859375, "loss": 0.5672, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.685194969177246, "rewards/margins": 0.5192277431488037, "rewards/rejected": -2.20442271232605, "step": 8450 }, { "epoch": 1.4576154376292212, "grad_norm": 28.0307559967041, "learning_rate": 1.2220741495370875e-07, "logits/chosen": -2.113776683807373, "logits/rejected": -2.079357147216797, "logps/chosen": -225.5147247314453, "logps/rejected": -274.3211669921875, "loss": 0.5735, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7199233770370483, "rewards/margins": 0.49351826310157776, "rewards/rejected": -2.2134416103363037, "step": 8460 }, { "epoch": 1.4593383873190904, "grad_norm": 45.38313293457031, "learning_rate": 1.220118920629155e-07, "logits/chosen": -2.125673770904541, "logits/rejected": -2.0898425579071045, "logps/chosen": -226.78396606445312, "logps/rejected": -271.1668395996094, "loss": 0.6055, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7033987045288086, "rewards/margins": 0.4875558316707611, "rewards/rejected": -2.1909546852111816, "step": 8470 }, { "epoch": 1.4610613370089593, "grad_norm": 34.2986946105957, "learning_rate": 1.2181628069744613e-07, "logits/chosen": -2.1393344402313232, "logits/rejected": -2.0992624759674072, "logps/chosen": -187.6902313232422, "logps/rejected": -234.9958038330078, "loss": 0.5692, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3351308107376099, "rewards/margins": 0.5066913366317749, "rewards/rejected": -1.8418220281600952, "step": 8480 }, { "epoch": 1.4627842866988283, "grad_norm": 27.435794830322266, "learning_rate": 1.216205816435416e-07, "logits/chosen": -2.2194714546203613, "logits/rejected": -2.195204257965088, "logps/chosen": -175.31588745117188, "logps/rejected": -224.5002899169922, "loss": 0.5579, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.201210618019104, "rewards/margins": 0.48213768005371094, "rewards/rejected": -1.6833486557006836, "step": 8490 }, { "epoch": 1.4645072363886975, "grad_norm": 32.16939926147461, "learning_rate": 1.2142479568779545e-07, "logits/chosen": -2.1218631267547607, "logits/rejected": -2.0974326133728027, "logps/chosen": -182.5852813720703, "logps/rejected": -221.85195922851562, "loss": 0.579, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2722389698028564, "rewards/margins": 0.42209678888320923, "rewards/rejected": -1.694335699081421, "step": 8500 }, { "epoch": 1.4662301860785665, "grad_norm": 40.23246765136719, "learning_rate": 1.2122892361715042e-07, "logits/chosen": -2.1258957386016846, "logits/rejected": -2.0866127014160156, "logps/chosen": -197.31124877929688, "logps/rejected": -246.4371795654297, "loss": 0.5417, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3823117017745972, "rewards/margins": 0.5729239583015442, "rewards/rejected": -1.9552357196807861, "step": 8510 }, { "epoch": 1.4679531357684357, "grad_norm": 26.580339431762695, "learning_rate": 1.2103296621889531e-07, "logits/chosen": -2.098947525024414, "logits/rejected": -2.0675082206726074, "logps/chosen": -206.1828155517578, "logps/rejected": -247.58139038085938, "loss": 0.5763, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5146195888519287, "rewards/margins": 0.4820035994052887, "rewards/rejected": -1.9966232776641846, "step": 8520 }, { "epoch": 1.4696760854583046, "grad_norm": 24.382217407226562, "learning_rate": 1.2083692428066207e-07, "logits/chosen": -2.1002607345581055, "logits/rejected": -2.0706868171691895, "logps/chosen": -195.86622619628906, "logps/rejected": -232.44155883789062, "loss": 0.6081, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4051289558410645, "rewards/margins": 0.4165209233760834, "rewards/rejected": -1.8216501474380493, "step": 8530 }, { "epoch": 1.4713990351481736, "grad_norm": 25.53142738342285, "learning_rate": 1.2064079859042237e-07, "logits/chosen": -2.24005126953125, "logits/rejected": -2.216982126235962, "logps/chosen": -178.56973266601562, "logps/rejected": -211.1771240234375, "loss": 0.6123, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2287400960922241, "rewards/margins": 0.34191378951072693, "rewards/rejected": -1.570654034614563, "step": 8540 }, { "epoch": 1.4731219848380428, "grad_norm": 26.57423973083496, "learning_rate": 1.204445899364844e-07, "logits/chosen": -2.174487590789795, "logits/rejected": -2.1445860862731934, "logps/chosen": -174.37026977539062, "logps/rejected": -221.49685668945312, "loss": 0.5647, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1900184154510498, "rewards/margins": 0.47659429907798767, "rewards/rejected": -1.6666128635406494, "step": 8550 }, { "epoch": 1.4748449345279118, "grad_norm": 26.368349075317383, "learning_rate": 1.2024829910749e-07, "logits/chosen": -2.24088978767395, "logits/rejected": -2.1960439682006836, "logps/chosen": -182.83432006835938, "logps/rejected": -239.61111450195312, "loss": 0.5278, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2800990343093872, "rewards/margins": 0.5987238883972168, "rewards/rejected": -1.878822684288025, "step": 8560 }, { "epoch": 1.476567884217781, "grad_norm": 36.1506233215332, "learning_rate": 1.2005192689241111e-07, "logits/chosen": -2.098243236541748, "logits/rejected": -2.067201614379883, "logps/chosen": -207.9953155517578, "logps/rejected": -254.19345092773438, "loss": 0.5628, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5292612314224243, "rewards/margins": 0.5006064176559448, "rewards/rejected": -2.029867649078369, "step": 8570 }, { "epoch": 1.47829083390765, "grad_norm": 36.61823272705078, "learning_rate": 1.1985547408054707e-07, "logits/chosen": -2.137028217315674, "logits/rejected": -2.096841335296631, "logps/chosen": -226.59426879882812, "logps/rejected": -289.16400146484375, "loss": 0.5243, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.712284803390503, "rewards/margins": 0.639613926410675, "rewards/rejected": -2.351898670196533, "step": 8580 }, { "epoch": 1.480013783597519, "grad_norm": 27.322467803955078, "learning_rate": 1.1965894146152083e-07, "logits/chosen": -2.114105701446533, "logits/rejected": -2.0724968910217285, "logps/chosen": -236.0207977294922, "logps/rejected": -287.5677185058594, "loss": 0.5503, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7737798690795898, "rewards/margins": 0.5756506323814392, "rewards/rejected": -2.349430561065674, "step": 8590 }, { "epoch": 1.481736733287388, "grad_norm": 39.7527961730957, "learning_rate": 1.1946232982527637e-07, "logits/chosen": -2.091597080230713, "logits/rejected": -2.0650010108947754, "logps/chosen": -234.6602020263672, "logps/rejected": -267.87298583984375, "loss": 0.647, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8054077625274658, "rewards/margins": 0.36355963349342346, "rewards/rejected": -2.1689672470092773, "step": 8600 }, { "epoch": 1.483459682977257, "grad_norm": 32.23507308959961, "learning_rate": 1.1926563996207518e-07, "logits/chosen": -2.141597270965576, "logits/rejected": -2.107510566711426, "logps/chosen": -205.731689453125, "logps/rejected": -252.4749755859375, "loss": 0.5796, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.503330945968628, "rewards/margins": 0.518940806388855, "rewards/rejected": -2.0222718715667725, "step": 8610 }, { "epoch": 1.4851826326671262, "grad_norm": 25.941207885742188, "learning_rate": 1.1906887266249317e-07, "logits/chosen": -2.1043217182159424, "logits/rejected": -2.088639736175537, "logps/chosen": -187.4060821533203, "logps/rejected": -212.50051879882812, "loss": 0.6189, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3152862787246704, "rewards/margins": 0.3075936436653137, "rewards/rejected": -1.622880220413208, "step": 8620 }, { "epoch": 1.4869055823569952, "grad_norm": 27.588563919067383, "learning_rate": 1.1887202871741757e-07, "logits/chosen": -2.0996415615081787, "logits/rejected": -2.0707783699035645, "logps/chosen": -168.55072021484375, "logps/rejected": -217.21298217773438, "loss": 0.5502, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1530864238739014, "rewards/margins": 0.5020955204963684, "rewards/rejected": -1.655182123184204, "step": 8630 }, { "epoch": 1.4886285320468642, "grad_norm": 24.333717346191406, "learning_rate": 1.1867510891804353e-07, "logits/chosen": -2.1870296001434326, "logits/rejected": -2.1533381938934326, "logps/chosen": -199.0816650390625, "logps/rejected": -234.541015625, "loss": 0.6065, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4219980239868164, "rewards/margins": 0.4048318862915039, "rewards/rejected": -1.8268299102783203, "step": 8640 }, { "epoch": 1.4903514817367332, "grad_norm": 39.82074737548828, "learning_rate": 1.1847811405587127e-07, "logits/chosen": -2.109865665435791, "logits/rejected": -2.071108341217041, "logps/chosen": -205.94320678710938, "logps/rejected": -248.83682250976562, "loss": 0.5983, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4994192123413086, "rewards/margins": 0.4557233452796936, "rewards/rejected": -1.955142617225647, "step": 8650 }, { "epoch": 1.4920744314266023, "grad_norm": 42.07971954345703, "learning_rate": 1.1828104492270254e-07, "logits/chosen": -2.117985963821411, "logits/rejected": -2.086888551712036, "logps/chosen": -208.15737915039062, "logps/rejected": -256.1336975097656, "loss": 0.592, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5353782176971436, "rewards/margins": 0.5151991248130798, "rewards/rejected": -2.05057692527771, "step": 8660 }, { "epoch": 1.4937973811164715, "grad_norm": 26.2050724029541, "learning_rate": 1.1808390231063783e-07, "logits/chosen": -2.1967854499816895, "logits/rejected": -2.16111421585083, "logps/chosen": -198.25198364257812, "logps/rejected": -248.2962188720703, "loss": 0.5703, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4539095163345337, "rewards/margins": 0.5109778642654419, "rewards/rejected": -1.964887261390686, "step": 8670 }, { "epoch": 1.4955203308063405, "grad_norm": 33.02647399902344, "learning_rate": 1.1788668701207274e-07, "logits/chosen": -2.121121883392334, "logits/rejected": -2.1066901683807373, "logps/chosen": -204.23362731933594, "logps/rejected": -239.91250610351562, "loss": 0.6268, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5213901996612549, "rewards/margins": 0.34961163997650146, "rewards/rejected": -1.871001958847046, "step": 8680 }, { "epoch": 1.4972432804962095, "grad_norm": 28.559772491455078, "learning_rate": 1.1768939981969515e-07, "logits/chosen": -2.1288161277770996, "logits/rejected": -2.0944597721099854, "logps/chosen": -211.2086181640625, "logps/rejected": -243.73361206054688, "loss": 0.6311, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5508366823196411, "rewards/margins": 0.41266554594039917, "rewards/rejected": -1.9635021686553955, "step": 8690 }, { "epoch": 1.4989662301860784, "grad_norm": 23.26927947998047, "learning_rate": 1.1749204152648191e-07, "logits/chosen": -2.1648428440093994, "logits/rejected": -2.130405902862549, "logps/chosen": -206.45309448242188, "logps/rejected": -235.69580078125, "loss": 0.6144, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.493087887763977, "rewards/margins": 0.3440720736980438, "rewards/rejected": -1.8371598720550537, "step": 8700 }, { "epoch": 1.5006891798759476, "grad_norm": 21.11803436279297, "learning_rate": 1.1729461292569563e-07, "logits/chosen": -2.1213269233703613, "logits/rejected": -2.1003379821777344, "logps/chosen": -190.43356323242188, "logps/rejected": -219.8393096923828, "loss": 0.6394, "rewards/accuracies": 0.59375, "rewards/chosen": -1.364392876625061, "rewards/margins": 0.30296334624290466, "rewards/rejected": -1.6673561334609985, "step": 8710 }, { "epoch": 1.5024121295658168, "grad_norm": 33.64053726196289, "learning_rate": 1.1709711481088156e-07, "logits/chosen": -2.1832005977630615, "logits/rejected": -2.1413581371307373, "logps/chosen": -174.26988220214844, "logps/rejected": -220.2556610107422, "loss": 0.541, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1718385219573975, "rewards/margins": 0.49143847823143005, "rewards/rejected": -1.66327702999115, "step": 8720 }, { "epoch": 1.5041350792556858, "grad_norm": 27.05516242980957, "learning_rate": 1.1689954797586422e-07, "logits/chosen": -2.1730306148529053, "logits/rejected": -2.127138614654541, "logps/chosen": -193.1197052001953, "logps/rejected": -243.0399169921875, "loss": 0.5735, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3963727951049805, "rewards/margins": 0.519677996635437, "rewards/rejected": -1.916050672531128, "step": 8730 }, { "epoch": 1.5058580289455548, "grad_norm": 57.3426628112793, "learning_rate": 1.1670191321474457e-07, "logits/chosen": -2.146721363067627, "logits/rejected": -2.1224703788757324, "logps/chosen": -215.717529296875, "logps/rejected": -269.1415710449219, "loss": 0.5754, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.623270034790039, "rewards/margins": 0.546475887298584, "rewards/rejected": -2.169745922088623, "step": 8740 }, { "epoch": 1.5075809786354237, "grad_norm": 33.93220138549805, "learning_rate": 1.1650421132189634e-07, "logits/chosen": -2.145536422729492, "logits/rejected": -2.1085126399993896, "logps/chosen": -213.81161499023438, "logps/rejected": -272.66522216796875, "loss": 0.5375, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5925045013427734, "rewards/margins": 0.6129032969474792, "rewards/rejected": -2.2054076194763184, "step": 8750 }, { "epoch": 1.509303928325293, "grad_norm": 26.928314208984375, "learning_rate": 1.1630644309196327e-07, "logits/chosen": -2.095851421356201, "logits/rejected": -2.0776500701904297, "logps/chosen": -219.29922485351562, "logps/rejected": -266.6626892089844, "loss": 0.5936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6660619974136353, "rewards/margins": 0.457836389541626, "rewards/rejected": -2.1238980293273926, "step": 8760 }, { "epoch": 1.5110268780151621, "grad_norm": 22.584171295166016, "learning_rate": 1.1610860931985566e-07, "logits/chosen": -2.140084743499756, "logits/rejected": -2.1087329387664795, "logps/chosen": -208.4669952392578, "logps/rejected": -253.07095336914062, "loss": 0.5809, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5258299112319946, "rewards/margins": 0.5107992887496948, "rewards/rejected": -2.0366291999816895, "step": 8770 }, { "epoch": 1.512749827705031, "grad_norm": 30.782161712646484, "learning_rate": 1.1591071080074727e-07, "logits/chosen": -2.200284957885742, "logits/rejected": -2.1839470863342285, "logps/chosen": -196.86715698242188, "logps/rejected": -236.0321044921875, "loss": 0.6009, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4668762683868408, "rewards/margins": 0.3684500455856323, "rewards/rejected": -1.8353259563446045, "step": 8780 }, { "epoch": 1.5144727773949, "grad_norm": 31.48596954345703, "learning_rate": 1.1571274833007214e-07, "logits/chosen": -2.2006895542144775, "logits/rejected": -2.162903308868408, "logps/chosen": -187.66943359375, "logps/rejected": -231.6300048828125, "loss": 0.5716, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3145688772201538, "rewards/margins": 0.4905197024345398, "rewards/rejected": -1.8050886392593384, "step": 8790 }, { "epoch": 1.516195727084769, "grad_norm": 22.056011199951172, "learning_rate": 1.1551472270352125e-07, "logits/chosen": -2.150611162185669, "logits/rejected": -2.116769790649414, "logps/chosen": -187.0447540283203, "logps/rejected": -218.8388214111328, "loss": 0.61, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.307472586631775, "rewards/margins": 0.36511510610580444, "rewards/rejected": -1.6725876331329346, "step": 8800 }, { "epoch": 1.516195727084769, "eval_logits/chosen": -2.2578678131103516, "eval_logits/rejected": -2.242769479751587, "eval_logps/chosen": -168.42660522460938, "eval_logps/rejected": -192.21197509765625, "eval_loss": 0.640521764755249, "eval_rewards/accuracies": 0.616403341293335, "eval_rewards/chosen": -1.094111442565918, "eval_rewards/margins": 0.20051223039627075, "eval_rewards/rejected": -1.294623613357544, "eval_runtime": 383.423, "eval_samples_per_second": 11.225, "eval_steps_per_second": 1.403, "step": 8800 }, { "epoch": 1.5179186767746382, "grad_norm": 37.015296936035156, "learning_rate": 1.1531663471703956e-07, "logits/chosen": -2.187131404876709, "logits/rejected": -2.144275426864624, "logps/chosen": -187.49765014648438, "logps/rejected": -243.79248046875, "loss": 0.5379, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3532283306121826, "rewards/margins": 0.5681970119476318, "rewards/rejected": -1.921425223350525, "step": 8810 }, { "epoch": 1.5196416264645074, "grad_norm": 32.8203239440918, "learning_rate": 1.1511848516682257e-07, "logits/chosen": -2.192147970199585, "logits/rejected": -2.1544651985168457, "logps/chosen": -194.306640625, "logps/rejected": -248.0606231689453, "loss": 0.5455, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4270803928375244, "rewards/margins": 0.564303994178772, "rewards/rejected": -1.9913842678070068, "step": 8820 }, { "epoch": 1.5213645761543764, "grad_norm": 38.81319808959961, "learning_rate": 1.149202748493133e-07, "logits/chosen": -2.034425735473633, "logits/rejected": -1.9980943202972412, "logps/chosen": -211.1022491455078, "logps/rejected": -260.2487487792969, "loss": 0.5638, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.557013750076294, "rewards/margins": 0.5233016014099121, "rewards/rejected": -2.080315351486206, "step": 8830 }, { "epoch": 1.5230875258442453, "grad_norm": 29.696474075317383, "learning_rate": 1.1472200456119901e-07, "logits/chosen": -2.088050365447998, "logits/rejected": -2.060323476791382, "logps/chosen": -198.53713989257812, "logps/rejected": -258.73529052734375, "loss": 0.537, "rewards/accuracies": 0.75, "rewards/chosen": -1.449790358543396, "rewards/margins": 0.6091513633728027, "rewards/rejected": -2.058941602706909, "step": 8840 }, { "epoch": 1.5248104755341143, "grad_norm": 40.81111145019531, "learning_rate": 1.1452367509940794e-07, "logits/chosen": -2.1819236278533936, "logits/rejected": -2.1462504863739014, "logps/chosen": -206.32754516601562, "logps/rejected": -267.1129455566406, "loss": 0.5548, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5286524295806885, "rewards/margins": 0.6038883924484253, "rewards/rejected": -2.1325409412384033, "step": 8850 }, { "epoch": 1.5265334252239835, "grad_norm": 37.93629455566406, "learning_rate": 1.1432528726110628e-07, "logits/chosen": -2.0919547080993652, "logits/rejected": -2.056307315826416, "logps/chosen": -230.38778686523438, "logps/rejected": -283.297119140625, "loss": 0.5624, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7613922357559204, "rewards/margins": 0.5530128479003906, "rewards/rejected": -2.3144049644470215, "step": 8860 }, { "epoch": 1.5282563749138525, "grad_norm": 28.398059844970703, "learning_rate": 1.1412684184369478e-07, "logits/chosen": -2.2037432193756104, "logits/rejected": -2.154242753982544, "logps/chosen": -233.8716583251953, "logps/rejected": -292.74822998046875, "loss": 0.5315, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7590751647949219, "rewards/margins": 0.623824954032898, "rewards/rejected": -2.3829002380371094, "step": 8870 }, { "epoch": 1.5299793246037217, "grad_norm": 54.44942092895508, "learning_rate": 1.1392833964480564e-07, "logits/chosen": -2.0453591346740723, "logits/rejected": -2.0088441371917725, "logps/chosen": -233.88278198242188, "logps/rejected": -281.6195373535156, "loss": 0.6176, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7956584692001343, "rewards/margins": 0.515036940574646, "rewards/rejected": -2.310695171356201, "step": 8880 }, { "epoch": 1.5317022742935906, "grad_norm": 38.829612731933594, "learning_rate": 1.137297814622993e-07, "logits/chosen": -2.057898759841919, "logits/rejected": -2.014880895614624, "logps/chosen": -218.6936492919922, "logps/rejected": -271.9684753417969, "loss": 0.5398, "rewards/accuracies": 0.75, "rewards/chosen": -1.6280596256256104, "rewards/margins": 0.591567873954773, "rewards/rejected": -2.2196271419525146, "step": 8890 }, { "epoch": 1.5334252239834596, "grad_norm": 34.26298904418945, "learning_rate": 1.1353116809426121e-07, "logits/chosen": -2.137526273727417, "logits/rejected": -2.099030017852783, "logps/chosen": -216.14047241210938, "logps/rejected": -262.841796875, "loss": 0.5809, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6013559103012085, "rewards/margins": 0.5089315176010132, "rewards/rejected": -2.110287666320801, "step": 8900 }, { "epoch": 1.5351481736733288, "grad_norm": 41.93275451660156, "learning_rate": 1.1333250033899867e-07, "logits/chosen": -2.138963222503662, "logits/rejected": -2.1128087043762207, "logps/chosen": -210.839111328125, "logps/rejected": -253.3659210205078, "loss": 0.5888, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.537819504737854, "rewards/margins": 0.4727010130882263, "rewards/rejected": -2.0105204582214355, "step": 8910 }, { "epoch": 1.5368711233631978, "grad_norm": 25.67527198791504, "learning_rate": 1.131337789950375e-07, "logits/chosen": -2.2184395790100098, "logits/rejected": -2.170405149459839, "logps/chosen": -203.72010803222656, "logps/rejected": -259.0718688964844, "loss": 0.5379, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.449211835861206, "rewards/margins": 0.6281321048736572, "rewards/rejected": -2.0773439407348633, "step": 8920 }, { "epoch": 1.538594073053067, "grad_norm": 30.74202537536621, "learning_rate": 1.12935004861119e-07, "logits/chosen": -2.1468801498413086, "logits/rejected": -2.103550434112549, "logps/chosen": -206.2528533935547, "logps/rejected": -254.24356079101562, "loss": 0.5687, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.510958194732666, "rewards/margins": 0.533153772354126, "rewards/rejected": -2.044111967086792, "step": 8930 }, { "epoch": 1.540317022742936, "grad_norm": 31.103778839111328, "learning_rate": 1.1273617873619663e-07, "logits/chosen": -2.155284881591797, "logits/rejected": -2.1240036487579346, "logps/chosen": -204.4677276611328, "logps/rejected": -248.03125, "loss": 0.5825, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4737517833709717, "rewards/margins": 0.4530414044857025, "rewards/rejected": -1.9267933368682861, "step": 8940 }, { "epoch": 1.5420399724328049, "grad_norm": 33.27176284790039, "learning_rate": 1.1253730141943276e-07, "logits/chosen": -2.0801100730895996, "logits/rejected": -2.0663437843322754, "logps/chosen": -206.09115600585938, "logps/rejected": -250.7277374267578, "loss": 0.5877, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5113497972488403, "rewards/margins": 0.43479982018470764, "rewards/rejected": -1.946149230003357, "step": 8950 }, { "epoch": 1.5437629221226739, "grad_norm": 57.94588088989258, "learning_rate": 1.1233837371019566e-07, "logits/chosen": -2.1334681510925293, "logits/rejected": -2.0911877155303955, "logps/chosen": -240.5612030029297, "logps/rejected": -292.9984130859375, "loss": 0.5788, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8518180847167969, "rewards/margins": 0.5720192193984985, "rewards/rejected": -2.423837423324585, "step": 8960 }, { "epoch": 1.545485871812543, "grad_norm": 31.889646530151367, "learning_rate": 1.1213939640805594e-07, "logits/chosen": -2.095510721206665, "logits/rejected": -2.0452637672424316, "logps/chosen": -237.1003875732422, "logps/rejected": -292.24005126953125, "loss": 0.52, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.8281110525131226, "rewards/margins": 0.5949289798736572, "rewards/rejected": -2.4230399131774902, "step": 8970 }, { "epoch": 1.5472088215024122, "grad_norm": 40.414710998535156, "learning_rate": 1.1194037031278378e-07, "logits/chosen": -2.1494863033294678, "logits/rejected": -2.118666172027588, "logps/chosen": -257.7496337890625, "logps/rejected": -294.08026123046875, "loss": 0.6499, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.0259671211242676, "rewards/margins": 0.3900921046733856, "rewards/rejected": -2.4160590171813965, "step": 8980 }, { "epoch": 1.5489317711922812, "grad_norm": 27.115970611572266, "learning_rate": 1.1174129622434531e-07, "logits/chosen": -2.084998607635498, "logits/rejected": -2.046213388442993, "logps/chosen": -217.5188446044922, "logps/rejected": -272.76715087890625, "loss": 0.5303, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6049985885620117, "rewards/margins": 0.5865806341171265, "rewards/rejected": -2.1915793418884277, "step": 8990 }, { "epoch": 1.5506547208821502, "grad_norm": 38.3233528137207, "learning_rate": 1.1154217494289966e-07, "logits/chosen": -2.1537277698516846, "logits/rejected": -2.1174731254577637, "logps/chosen": -232.237060546875, "logps/rejected": -275.6959228515625, "loss": 0.6133, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.7371547222137451, "rewards/margins": 0.5046462416648865, "rewards/rejected": -2.2418007850646973, "step": 9000 }, { "epoch": 1.5523776705720191, "grad_norm": 26.543476104736328, "learning_rate": 1.1134300726879557e-07, "logits/chosen": -2.1145331859588623, "logits/rejected": -2.084182024002075, "logps/chosen": -218.10641479492188, "logps/rejected": -260.49365234375, "loss": 0.5863, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6136271953582764, "rewards/margins": 0.4704989790916443, "rewards/rejected": -2.0841259956359863, "step": 9010 }, { "epoch": 1.5541006202618883, "grad_norm": 32.8950080871582, "learning_rate": 1.1114379400256828e-07, "logits/chosen": -2.0746097564697266, "logits/rejected": -2.040530204772949, "logps/chosen": -196.68701171875, "logps/rejected": -250.39822387695312, "loss": 0.5431, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4095488786697388, "rewards/margins": 0.5949562788009644, "rewards/rejected": -2.004505157470703, "step": 9020 }, { "epoch": 1.5558235699517575, "grad_norm": 34.005210876464844, "learning_rate": 1.1094453594493634e-07, "logits/chosen": -2.1117730140686035, "logits/rejected": -2.1014599800109863, "logps/chosen": -189.75453186035156, "logps/rejected": -237.1101837158203, "loss": 0.5811, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.373258113861084, "rewards/margins": 0.4692157208919525, "rewards/rejected": -1.8424737453460693, "step": 9030 }, { "epoch": 1.5575465196416265, "grad_norm": 35.201900482177734, "learning_rate": 1.107452338967982e-07, "logits/chosen": -2.1079840660095215, "logits/rejected": -2.0829663276672363, "logps/chosen": -194.3500518798828, "logps/rejected": -235.8008575439453, "loss": 0.6012, "rewards/accuracies": 0.6875, "rewards/chosen": -1.430095911026001, "rewards/margins": 0.4139363169670105, "rewards/rejected": -1.8440319299697876, "step": 9040 }, { "epoch": 1.5592694693314955, "grad_norm": 48.223419189453125, "learning_rate": 1.1054588865922931e-07, "logits/chosen": -2.161323308944702, "logits/rejected": -2.1286869049072266, "logps/chosen": -210.0124969482422, "logps/rejected": -252.99826049804688, "loss": 0.6013, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5600329637527466, "rewards/margins": 0.4573109745979309, "rewards/rejected": -2.0173439979553223, "step": 9050 }, { "epoch": 1.5609924190213644, "grad_norm": 32.22039794921875, "learning_rate": 1.1034650103347856e-07, "logits/chosen": -2.17484712600708, "logits/rejected": -2.1460723876953125, "logps/chosen": -193.7638397216797, "logps/rejected": -224.82772827148438, "loss": 0.6186, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3822447061538696, "rewards/margins": 0.34044378995895386, "rewards/rejected": -1.7226884365081787, "step": 9060 }, { "epoch": 1.5627153687112336, "grad_norm": 27.751741409301758, "learning_rate": 1.1014707182096525e-07, "logits/chosen": -2.160062313079834, "logits/rejected": -2.1260383129119873, "logps/chosen": -174.01168823242188, "logps/rejected": -227.6369171142578, "loss": 0.5207, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1757690906524658, "rewards/margins": 0.5599743127822876, "rewards/rejected": -1.735743522644043, "step": 9070 }, { "epoch": 1.5644383184011028, "grad_norm": 29.899240493774414, "learning_rate": 1.0994760182327593e-07, "logits/chosen": -2.1407668590545654, "logits/rejected": -2.116692304611206, "logps/chosen": -184.6007080078125, "logps/rejected": -230.424560546875, "loss": 0.5897, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.315338134765625, "rewards/margins": 0.438321053981781, "rewards/rejected": -1.7536592483520508, "step": 9080 }, { "epoch": 1.5661612680909718, "grad_norm": 29.706432342529297, "learning_rate": 1.0974809184216094e-07, "logits/chosen": -2.0893330574035645, "logits/rejected": -2.04660701751709, "logps/chosen": -212.39706420898438, "logps/rejected": -258.37762451171875, "loss": 0.5413, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5627620220184326, "rewards/margins": 0.5210008025169373, "rewards/rejected": -2.0837631225585938, "step": 9090 }, { "epoch": 1.5678842177808407, "grad_norm": 31.138164520263672, "learning_rate": 1.0954854267953146e-07, "logits/chosen": -2.1368656158447266, "logits/rejected": -2.1022555828094482, "logps/chosen": -227.8143768310547, "logps/rejected": -255.43417358398438, "loss": 0.6453, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.7001497745513916, "rewards/margins": 0.3254784941673279, "rewards/rejected": -2.025627851486206, "step": 9100 }, { "epoch": 1.5696071674707097, "grad_norm": 31.21167755126953, "learning_rate": 1.0934895513745603e-07, "logits/chosen": -2.1031768321990967, "logits/rejected": -2.066413402557373, "logps/chosen": -214.42800903320312, "logps/rejected": -263.53326416015625, "loss": 0.5665, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5775201320648193, "rewards/margins": 0.5366379022598267, "rewards/rejected": -2.1141581535339355, "step": 9110 }, { "epoch": 1.571330117160579, "grad_norm": 32.25068283081055, "learning_rate": 1.0914933001815754e-07, "logits/chosen": -2.144286870956421, "logits/rejected": -2.1097588539123535, "logps/chosen": -217.8202362060547, "logps/rejected": -258.30316162109375, "loss": 0.5685, "rewards/accuracies": 0.71875, "rewards/chosen": -1.560901165008545, "rewards/margins": 0.47228294610977173, "rewards/rejected": -2.033184289932251, "step": 9120 }, { "epoch": 1.573053066850448, "grad_norm": 32.76934051513672, "learning_rate": 1.0894966812400992e-07, "logits/chosen": -2.10341215133667, "logits/rejected": -2.06626558303833, "logps/chosen": -213.362548828125, "logps/rejected": -258.8315734863281, "loss": 0.5847, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.579810380935669, "rewards/margins": 0.4671041965484619, "rewards/rejected": -2.046914577484131, "step": 9130 }, { "epoch": 1.574776016540317, "grad_norm": 21.085250854492188, "learning_rate": 1.0874997025753482e-07, "logits/chosen": -2.157405376434326, "logits/rejected": -2.106078624725342, "logps/chosen": -202.14071655273438, "logps/rejected": -251.3083038330078, "loss": 0.5309, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4339019060134888, "rewards/margins": 0.5942884683609009, "rewards/rejected": -2.0281903743743896, "step": 9140 }, { "epoch": 1.576498966230186, "grad_norm": 29.505142211914062, "learning_rate": 1.0855023722139864e-07, "logits/chosen": -2.1527392864227295, "logits/rejected": -2.1070356369018555, "logps/chosen": -207.39437866210938, "logps/rejected": -264.4066467285156, "loss": 0.5427, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.478251576423645, "rewards/margins": 0.6295614838600159, "rewards/rejected": -2.1078131198883057, "step": 9150 }, { "epoch": 1.578221915920055, "grad_norm": 29.595401763916016, "learning_rate": 1.0835046981840896e-07, "logits/chosen": -2.1166820526123047, "logits/rejected": -2.0935609340667725, "logps/chosen": -195.57730102539062, "logps/rejected": -249.2331085205078, "loss": 0.5487, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.456304907798767, "rewards/margins": 0.5181986093521118, "rewards/rejected": -1.974503517150879, "step": 9160 }, { "epoch": 1.5799448656099242, "grad_norm": 29.17534637451172, "learning_rate": 1.0815066885151165e-07, "logits/chosen": -2.167940616607666, "logits/rejected": -2.1178174018859863, "logps/chosen": -214.2770233154297, "logps/rejected": -270.4759521484375, "loss": 0.5349, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5862025022506714, "rewards/margins": 0.6339669227600098, "rewards/rejected": -2.2201695442199707, "step": 9170 }, { "epoch": 1.5816678152997934, "grad_norm": 25.466968536376953, "learning_rate": 1.0795083512378738e-07, "logits/chosen": -2.119816303253174, "logits/rejected": -2.098113536834717, "logps/chosen": -221.22421264648438, "logps/rejected": -262.0200500488281, "loss": 0.6017, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6510483026504517, "rewards/margins": 0.4491577744483948, "rewards/rejected": -2.100205898284912, "step": 9180 }, { "epoch": 1.5833907649896624, "grad_norm": 29.182552337646484, "learning_rate": 1.077509694384485e-07, "logits/chosen": -2.2061610221862793, "logits/rejected": -2.1876559257507324, "logps/chosen": -216.8682403564453, "logps/rejected": -272.07733154296875, "loss": 0.5311, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6165103912353516, "rewards/margins": 0.5453293919563293, "rewards/rejected": -2.1618399620056152, "step": 9190 }, { "epoch": 1.5851137146795313, "grad_norm": 29.112823486328125, "learning_rate": 1.0755107259883591e-07, "logits/chosen": -2.1319005489349365, "logits/rejected": -2.094393491744995, "logps/chosen": -216.9573974609375, "logps/rejected": -276.7088317871094, "loss": 0.523, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6260998249053955, "rewards/margins": 0.6355565190315247, "rewards/rejected": -2.2616562843322754, "step": 9200 }, { "epoch": 1.5851137146795313, "eval_logits/chosen": -2.1756374835968018, "eval_logits/rejected": -2.156975507736206, "eval_logps/chosen": -204.97225952148438, "eval_logps/rejected": -233.03977966308594, "eval_loss": 0.6431333422660828, "eval_rewards/accuracies": 0.6289498209953308, "eval_rewards/chosen": -1.459567904472351, "eval_rewards/margins": 0.24333389103412628, "eval_rewards/rejected": -1.7029017210006714, "eval_runtime": 383.0245, "eval_samples_per_second": 11.237, "eval_steps_per_second": 1.405, "step": 9200 }, { "epoch": 1.5868366643694003, "grad_norm": 40.64500427246094, "learning_rate": 1.0735114540841565e-07, "logits/chosen": -2.053277015686035, "logits/rejected": -2.010258913040161, "logps/chosen": -233.3690948486328, "logps/rejected": -277.2367248535156, "loss": 0.5958, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7608321905136108, "rewards/margins": 0.5039989352226257, "rewards/rejected": -2.264831066131592, "step": 9210 }, { "epoch": 1.5885596140592695, "grad_norm": 44.250858306884766, "learning_rate": 1.0715118867077575e-07, "logits/chosen": -2.101931095123291, "logits/rejected": -2.070049285888672, "logps/chosen": -214.3753204345703, "logps/rejected": -270.01031494140625, "loss": 0.5596, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6313447952270508, "rewards/margins": 0.5771504640579224, "rewards/rejected": -2.2084951400756836, "step": 9220 }, { "epoch": 1.5902825637491387, "grad_norm": 30.64914894104004, "learning_rate": 1.0695120318962305e-07, "logits/chosen": -2.047006607055664, "logits/rejected": -2.011451482772827, "logps/chosen": -207.5570068359375, "logps/rejected": -269.8608703613281, "loss": 0.5311, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.573559045791626, "rewards/margins": 0.5993128418922424, "rewards/rejected": -2.1728720664978027, "step": 9230 }, { "epoch": 1.5920055134390076, "grad_norm": 33.54546356201172, "learning_rate": 1.0675118976877989e-07, "logits/chosen": -2.0637457370758057, "logits/rejected": -2.0366601943969727, "logps/chosen": -238.0423583984375, "logps/rejected": -293.1909484863281, "loss": 0.569, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.8054208755493164, "rewards/margins": 0.5892099738121033, "rewards/rejected": -2.3946309089660645, "step": 9240 }, { "epoch": 1.5937284631288766, "grad_norm": 25.22513198852539, "learning_rate": 1.0655114921218086e-07, "logits/chosen": -2.0545201301574707, "logits/rejected": -2.0165328979492188, "logps/chosen": -219.8867645263672, "logps/rejected": -272.6210632324219, "loss": 0.5682, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.635759711265564, "rewards/margins": 0.5482776165008545, "rewards/rejected": -2.184037208557129, "step": 9250 }, { "epoch": 1.5954514128187456, "grad_norm": 27.668685913085938, "learning_rate": 1.0635108232386976e-07, "logits/chosen": -2.0562832355499268, "logits/rejected": -2.0279722213745117, "logps/chosen": -224.8714141845703, "logps/rejected": -274.3570861816406, "loss": 0.5967, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7631824016571045, "rewards/margins": 0.5013735890388489, "rewards/rejected": -2.264556407928467, "step": 9260 }, { "epoch": 1.5971743625086148, "grad_norm": 26.909574508666992, "learning_rate": 1.0615098990799607e-07, "logits/chosen": -2.1258835792541504, "logits/rejected": -2.084963798522949, "logps/chosen": -222.63735961914062, "logps/rejected": -271.89166259765625, "loss": 0.5589, "rewards/accuracies": 0.71875, "rewards/chosen": -1.664276123046875, "rewards/margins": 0.5433754920959473, "rewards/rejected": -2.2076516151428223, "step": 9270 }, { "epoch": 1.598897312198484, "grad_norm": 34.04690170288086, "learning_rate": 1.05950872768812e-07, "logits/chosen": -2.1449623107910156, "logits/rejected": -2.1120221614837646, "logps/chosen": -200.7971649169922, "logps/rejected": -247.43359375, "loss": 0.5702, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4581162929534912, "rewards/margins": 0.4871610105037689, "rewards/rejected": -1.945277214050293, "step": 9280 }, { "epoch": 1.600620261888353, "grad_norm": 26.49469566345215, "learning_rate": 1.0575073171066906e-07, "logits/chosen": -2.07810640335083, "logits/rejected": -2.04874324798584, "logps/chosen": -191.39736938476562, "logps/rejected": -225.0348663330078, "loss": 0.6096, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.35977041721344, "rewards/margins": 0.3903138041496277, "rewards/rejected": -1.7500841617584229, "step": 9290 }, { "epoch": 1.602343211578222, "grad_norm": 34.1944694519043, "learning_rate": 1.0555056753801493e-07, "logits/chosen": -2.1365926265716553, "logits/rejected": -2.1075198650360107, "logps/chosen": -190.500732421875, "logps/rejected": -245.53759765625, "loss": 0.5479, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3554012775421143, "rewards/margins": 0.5518912672996521, "rewards/rejected": -1.9072927236557007, "step": 9300 }, { "epoch": 1.6040661612680909, "grad_norm": 26.409666061401367, "learning_rate": 1.0535038105539014e-07, "logits/chosen": -2.129552125930786, "logits/rejected": -2.1025259494781494, "logps/chosen": -191.71160888671875, "logps/rejected": -234.30126953125, "loss": 0.5647, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3693513870239258, "rewards/margins": 0.47751206159591675, "rewards/rejected": -1.8468633890151978, "step": 9310 }, { "epoch": 1.60578911095796, "grad_norm": 28.054391860961914, "learning_rate": 1.0515017306742504e-07, "logits/chosen": -2.186974048614502, "logits/rejected": -2.148070812225342, "logps/chosen": -194.74349975585938, "logps/rejected": -250.1185760498047, "loss": 0.5573, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4252640008926392, "rewards/margins": 0.5447186231613159, "rewards/rejected": -1.9699825048446655, "step": 9320 }, { "epoch": 1.607512060647829, "grad_norm": 34.07315444946289, "learning_rate": 1.0494994437883619e-07, "logits/chosen": -2.1339361667633057, "logits/rejected": -2.0924479961395264, "logps/chosen": -207.88687133789062, "logps/rejected": -262.8285827636719, "loss": 0.5524, "rewards/accuracies": 0.75, "rewards/chosen": -1.4944640398025513, "rewards/margins": 0.5901376605033875, "rewards/rejected": -2.084601879119873, "step": 9330 }, { "epoch": 1.6092350103376982, "grad_norm": 26.45479393005371, "learning_rate": 1.0474969579442356e-07, "logits/chosen": -2.067122459411621, "logits/rejected": -2.0326459407806396, "logps/chosen": -214.92770385742188, "logps/rejected": -278.4351806640625, "loss": 0.5254, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6262165307998657, "rewards/margins": 0.654121994972229, "rewards/rejected": -2.2803382873535156, "step": 9340 }, { "epoch": 1.6109579600275672, "grad_norm": 55.737125396728516, "learning_rate": 1.0454942811906703e-07, "logits/chosen": -2.0701682567596436, "logits/rejected": -2.0222160816192627, "logps/chosen": -223.65353393554688, "logps/rejected": -282.44549560546875, "loss": 0.5307, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7031676769256592, "rewards/margins": 0.6135429739952087, "rewards/rejected": -2.3167104721069336, "step": 9350 }, { "epoch": 1.6126809097174362, "grad_norm": 42.1988639831543, "learning_rate": 1.0434914215772318e-07, "logits/chosen": -2.1071135997772217, "logits/rejected": -2.058732509613037, "logps/chosen": -254.6279296875, "logps/rejected": -318.9466552734375, "loss": 0.5517, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.9617589712142944, "rewards/margins": 0.7104604244232178, "rewards/rejected": -2.6722190380096436, "step": 9360 }, { "epoch": 1.6144038594073054, "grad_norm": 55.07982635498047, "learning_rate": 1.0414883871542208e-07, "logits/chosen": -2.111906051635742, "logits/rejected": -2.054779529571533, "logps/chosen": -245.83413696289062, "logps/rejected": -314.3738098144531, "loss": 0.5225, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9080016613006592, "rewards/margins": 0.7261230945587158, "rewards/rejected": -2.634124994277954, "step": 9370 }, { "epoch": 1.6161268090971743, "grad_norm": 30.782182693481445, "learning_rate": 1.0394851859726408e-07, "logits/chosen": -2.116983652114868, "logits/rejected": -2.0886120796203613, "logps/chosen": -228.97787475585938, "logps/rejected": -272.6360168457031, "loss": 0.6418, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.7381610870361328, "rewards/margins": 0.4510871469974518, "rewards/rejected": -2.189248561859131, "step": 9380 }, { "epoch": 1.6178497587870435, "grad_norm": 28.4465389251709, "learning_rate": 1.0374818260841663e-07, "logits/chosen": -2.0388429164886475, "logits/rejected": -2.00158953666687, "logps/chosen": -199.9791717529297, "logps/rejected": -260.9677734375, "loss": 0.5261, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4609673023223877, "rewards/margins": 0.6105450391769409, "rewards/rejected": -2.071512460708618, "step": 9390 }, { "epoch": 1.6195727084769125, "grad_norm": 29.523244857788086, "learning_rate": 1.035478315541108e-07, "logits/chosen": -2.10453462600708, "logits/rejected": -2.0754971504211426, "logps/chosen": -196.71092224121094, "logps/rejected": -237.4402618408203, "loss": 0.6086, "rewards/accuracies": 0.65625, "rewards/chosen": -1.41031813621521, "rewards/margins": 0.42263931035995483, "rewards/rejected": -1.8329575061798096, "step": 9400 }, { "epoch": 1.6212956581667815, "grad_norm": 40.687496185302734, "learning_rate": 1.0334746623963843e-07, "logits/chosen": -2.087872266769409, "logits/rejected": -2.050535202026367, "logps/chosen": -208.1427764892578, "logps/rejected": -262.15130615234375, "loss": 0.5531, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5345708131790161, "rewards/margins": 0.5761247277259827, "rewards/rejected": -2.1106953620910645, "step": 9410 }, { "epoch": 1.6230186078566504, "grad_norm": 50.26692581176758, "learning_rate": 1.031470874703485e-07, "logits/chosen": -2.109248161315918, "logits/rejected": -2.0769641399383545, "logps/chosen": -220.5753936767578, "logps/rejected": -263.15814208984375, "loss": 0.5914, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6488628387451172, "rewards/margins": 0.47126445174217224, "rewards/rejected": -2.1201274394989014, "step": 9420 }, { "epoch": 1.6247415575465196, "grad_norm": 28.470251083374023, "learning_rate": 1.0294669605164417e-07, "logits/chosen": -2.0839807987213135, "logits/rejected": -2.043217182159424, "logps/chosen": -210.2068328857422, "logps/rejected": -260.85699462890625, "loss": 0.5825, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5591367483139038, "rewards/margins": 0.5711666345596313, "rewards/rejected": -2.130303144454956, "step": 9430 }, { "epoch": 1.6264645072363888, "grad_norm": 39.39202117919922, "learning_rate": 1.0274629278897941e-07, "logits/chosen": -2.091012716293335, "logits/rejected": -2.070128917694092, "logps/chosen": -195.10609436035156, "logps/rejected": -238.289306640625, "loss": 0.5901, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4520760774612427, "rewards/margins": 0.41639605164527893, "rewards/rejected": -1.8684720993041992, "step": 9440 }, { "epoch": 1.6281874569262578, "grad_norm": 40.777000427246094, "learning_rate": 1.0254587848785574e-07, "logits/chosen": -2.208874464035034, "logits/rejected": -2.1783530712127686, "logps/chosen": -209.24563598632812, "logps/rejected": -247.0983123779297, "loss": 0.6091, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5514070987701416, "rewards/margins": 0.4226817190647125, "rewards/rejected": -1.9740889072418213, "step": 9450 }, { "epoch": 1.6299104066161267, "grad_norm": 30.348520278930664, "learning_rate": 1.0234545395381922e-07, "logits/chosen": -2.1149468421936035, "logits/rejected": -2.081460475921631, "logps/chosen": -191.42092895507812, "logps/rejected": -270.44873046875, "loss": 0.4516, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.3773084878921509, "rewards/margins": 0.7978097796440125, "rewards/rejected": -2.1751179695129395, "step": 9460 }, { "epoch": 1.6316333563059957, "grad_norm": 31.95038414001465, "learning_rate": 1.021450199924568e-07, "logits/chosen": -1.9736360311508179, "logits/rejected": -1.9359562397003174, "logps/chosen": -214.228759765625, "logps/rejected": -256.5456237792969, "loss": 0.6069, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6125974655151367, "rewards/margins": 0.4563944339752197, "rewards/rejected": -2.0689916610717773, "step": 9470 }, { "epoch": 1.633356305995865, "grad_norm": 37.818477630615234, "learning_rate": 1.0194457740939353e-07, "logits/chosen": -2.091583490371704, "logits/rejected": -2.0452983379364014, "logps/chosen": -221.3782958984375, "logps/rejected": -272.5511474609375, "loss": 0.5538, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6660854816436768, "rewards/margins": 0.5488374829292297, "rewards/rejected": -2.2149226665496826, "step": 9480 }, { "epoch": 1.635079255685734, "grad_norm": 37.699649810791016, "learning_rate": 1.0174412701028899e-07, "logits/chosen": -2.0410289764404297, "logits/rejected": -2.002805233001709, "logps/chosen": -231.63192749023438, "logps/rejected": -287.02618408203125, "loss": 0.5512, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7601169347763062, "rewards/margins": 0.5951749682426453, "rewards/rejected": -2.3552918434143066, "step": 9490 }, { "epoch": 1.636802205375603, "grad_norm": 26.676788330078125, "learning_rate": 1.0154366960083422e-07, "logits/chosen": -2.0825207233428955, "logits/rejected": -2.049968719482422, "logps/chosen": -237.82357788085938, "logps/rejected": -303.791259765625, "loss": 0.5249, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8297290802001953, "rewards/margins": 0.6732373237609863, "rewards/rejected": -2.5029664039611816, "step": 9500 }, { "epoch": 1.638525155065472, "grad_norm": 36.48911666870117, "learning_rate": 1.0134320598674846e-07, "logits/chosen": -1.985640287399292, "logits/rejected": -1.9444854259490967, "logps/chosen": -244.42471313476562, "logps/rejected": -323.17071533203125, "loss": 0.5188, "rewards/accuracies": 0.71875, "rewards/chosen": -1.922539472579956, "rewards/margins": 0.7798765301704407, "rewards/rejected": -2.70241641998291, "step": 9510 }, { "epoch": 1.640248104755341, "grad_norm": 28.293848037719727, "learning_rate": 1.0114273697377583e-07, "logits/chosen": -2.153301239013672, "logits/rejected": -2.1260287761688232, "logps/chosen": -258.92950439453125, "logps/rejected": -325.4205627441406, "loss": 0.6037, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.056684970855713, "rewards/margins": 0.6256363987922668, "rewards/rejected": -2.682321786880493, "step": 9520 }, { "epoch": 1.6419710544452102, "grad_norm": 31.049102783203125, "learning_rate": 1.0094226336768224e-07, "logits/chosen": -2.119263172149658, "logits/rejected": -2.0707802772521973, "logps/chosen": -223.88095092773438, "logps/rejected": -281.79473876953125, "loss": 0.5493, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6941604614257812, "rewards/margins": 0.6301865577697754, "rewards/rejected": -2.3243470191955566, "step": 9530 }, { "epoch": 1.6436940041350794, "grad_norm": 27.43020248413086, "learning_rate": 1.0074178597425194e-07, "logits/chosen": -2.0635290145874023, "logits/rejected": -2.0217814445495605, "logps/chosen": -212.50009155273438, "logps/rejected": -273.19610595703125, "loss": 0.5317, "rewards/accuracies": 0.75, "rewards/chosen": -1.5750280618667603, "rewards/margins": 0.6327553391456604, "rewards/rejected": -2.2077832221984863, "step": 9540 }, { "epoch": 1.6454169538249483, "grad_norm": 24.99987030029297, "learning_rate": 1.0054130559928451e-07, "logits/chosen": -2.129505157470703, "logits/rejected": -2.1067185401916504, "logps/chosen": -204.6131134033203, "logps/rejected": -259.88726806640625, "loss": 0.5647, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.504603624343872, "rewards/margins": 0.5379934310913086, "rewards/rejected": -2.0425968170166016, "step": 9550 }, { "epoch": 1.6471399035148173, "grad_norm": 40.675254821777344, "learning_rate": 1.0034082304859144e-07, "logits/chosen": -2.1753644943237305, "logits/rejected": -2.1537044048309326, "logps/chosen": -207.90188598632812, "logps/rejected": -250.1908416748047, "loss": 0.5972, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5333625078201294, "rewards/margins": 0.4467272162437439, "rewards/rejected": -1.9800899028778076, "step": 9560 }, { "epoch": 1.6488628532046863, "grad_norm": 35.79283905029297, "learning_rate": 1.00140339127993e-07, "logits/chosen": -2.075749397277832, "logits/rejected": -2.0463900566101074, "logps/chosen": -202.37844848632812, "logps/rejected": -250.78274536132812, "loss": 0.5997, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5012257099151611, "rewards/margins": 0.5024011135101318, "rewards/rejected": -2.003626823425293, "step": 9570 }, { "epoch": 1.6505858028945555, "grad_norm": 35.26013946533203, "learning_rate": 9.9939854643315e-08, "logits/chosen": -2.158979654312134, "logits/rejected": -2.122878313064575, "logps/chosen": -195.38125610351562, "logps/rejected": -243.55880737304688, "loss": 0.576, "rewards/accuracies": 0.71875, "rewards/chosen": -1.401939868927002, "rewards/margins": 0.46915024518966675, "rewards/rejected": -1.8710901737213135, "step": 9580 }, { "epoch": 1.6523087525844247, "grad_norm": 30.336355209350586, "learning_rate": 9.973937040038544e-08, "logits/chosen": -2.2403571605682373, "logits/rejected": -2.203962802886963, "logps/chosen": -197.30946350097656, "logps/rejected": -245.12820434570312, "loss": 0.564, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.399221420288086, "rewards/margins": 0.49407368898391724, "rewards/rejected": -1.8932949304580688, "step": 9590 }, { "epoch": 1.6540317022742936, "grad_norm": 45.379878997802734, "learning_rate": 9.953888720503145e-08, "logits/chosen": -2.093346118927002, "logits/rejected": -2.0450000762939453, "logps/chosen": -209.98733520507812, "logps/rejected": -269.8240966796875, "loss": 0.5412, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5394213199615479, "rewards/margins": 0.6430832147598267, "rewards/rejected": -2.182504892349243, "step": 9600 }, { "epoch": 1.6540317022742936, "eval_logits/chosen": -2.1707704067230225, "eval_logits/rejected": -2.151291608810425, "eval_logps/chosen": -201.29856872558594, "eval_logps/rejected": -231.70973205566406, "eval_loss": 0.6392949223518372, "eval_rewards/accuracies": 0.63150554895401, "eval_rewards/chosen": -1.4228310585021973, "eval_rewards/margins": 0.26677021384239197, "eval_rewards/rejected": -1.689601182937622, "eval_runtime": 383.1623, "eval_samples_per_second": 11.233, "eval_steps_per_second": 1.404, "step": 9600 }, { "epoch": 1.6557546519641626, "grad_norm": 27.446199417114258, "learning_rate": 9.933840586307579e-08, "logits/chosen": -2.0719919204711914, "logits/rejected": -2.036891460418701, "logps/chosen": -212.3845672607422, "logps/rejected": -278.9921875, "loss": 0.5134, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5430954694747925, "rewards/margins": 0.7068275809288025, "rewards/rejected": -2.2499232292175293, "step": 9610 }, { "epoch": 1.6574776016540316, "grad_norm": 31.442110061645508, "learning_rate": 9.913792718033396e-08, "logits/chosen": -2.150604724884033, "logits/rejected": -2.118518352508545, "logps/chosen": -223.6409454345703, "logps/rejected": -266.01214599609375, "loss": 0.6095, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6785471439361572, "rewards/margins": 0.43278011679649353, "rewards/rejected": -2.1113271713256836, "step": 9620 }, { "epoch": 1.6592005513439008, "grad_norm": 61.02971649169922, "learning_rate": 9.893745196261062e-08, "logits/chosen": -2.092367649078369, "logits/rejected": -2.0572047233581543, "logps/chosen": -236.3819580078125, "logps/rejected": -284.91693115234375, "loss": 0.6022, "rewards/accuracies": 0.625, "rewards/chosen": -1.8070704936981201, "rewards/margins": 0.5147414207458496, "rewards/rejected": -2.3218119144439697, "step": 9630 }, { "epoch": 1.66092350103377, "grad_norm": 31.099504470825195, "learning_rate": 9.873698101569657e-08, "logits/chosen": -2.1342499256134033, "logits/rejected": -2.097806692123413, "logps/chosen": -211.4412384033203, "logps/rejected": -256.29473876953125, "loss": 0.56, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5404903888702393, "rewards/margins": 0.5262420773506165, "rewards/rejected": -2.066732406616211, "step": 9640 }, { "epoch": 1.662646450723639, "grad_norm": 44.89314270019531, "learning_rate": 9.853651514536552e-08, "logits/chosen": -2.0789711475372314, "logits/rejected": -2.0402519702911377, "logps/chosen": -206.84707641601562, "logps/rejected": -245.1453399658203, "loss": 0.6071, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5060863494873047, "rewards/margins": 0.4056490361690521, "rewards/rejected": -1.9117352962493896, "step": 9650 }, { "epoch": 1.664369400413508, "grad_norm": 31.32309341430664, "learning_rate": 9.833605515737058e-08, "logits/chosen": -2.0809390544891357, "logits/rejected": -2.055072069168091, "logps/chosen": -187.5325469970703, "logps/rejected": -240.6793670654297, "loss": 0.5671, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3576128482818604, "rewards/margins": 0.5182755589485168, "rewards/rejected": -1.875888466835022, "step": 9660 }, { "epoch": 1.6660923501033769, "grad_norm": 33.44990539550781, "learning_rate": 9.813560185744138e-08, "logits/chosen": -2.1692049503326416, "logits/rejected": -2.1284148693084717, "logps/chosen": -202.41641235351562, "logps/rejected": -259.0638122558594, "loss": 0.5392, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.44061279296875, "rewards/margins": 0.6232479214668274, "rewards/rejected": -2.0638606548309326, "step": 9670 }, { "epoch": 1.667815299793246, "grad_norm": 33.01234436035156, "learning_rate": 9.79351560512806e-08, "logits/chosen": -2.1055922508239746, "logits/rejected": -2.0845696926116943, "logps/chosen": -209.5716094970703, "logps/rejected": -242.68685913085938, "loss": 0.6386, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.539527177810669, "rewards/margins": 0.371540904045105, "rewards/rejected": -1.9110679626464844, "step": 9680 }, { "epoch": 1.6695382494831152, "grad_norm": 42.91765594482422, "learning_rate": 9.773471854456087e-08, "logits/chosen": -2.057504177093506, "logits/rejected": -2.0265281200408936, "logps/chosen": -196.57354736328125, "logps/rejected": -242.1881866455078, "loss": 0.566, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4017528295516968, "rewards/margins": 0.4889547824859619, "rewards/rejected": -1.8907076120376587, "step": 9690 }, { "epoch": 1.6712611991729842, "grad_norm": 59.99947738647461, "learning_rate": 9.753429014292132e-08, "logits/chosen": -2.0934269428253174, "logits/rejected": -2.0551304817199707, "logps/chosen": -189.7425537109375, "logps/rejected": -230.2326202392578, "loss": 0.6123, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3643513917922974, "rewards/margins": 0.42653173208236694, "rewards/rejected": -1.7908833026885986, "step": 9700 }, { "epoch": 1.6729841488628532, "grad_norm": 31.751218795776367, "learning_rate": 9.73338716519646e-08, "logits/chosen": -2.0292251110076904, "logits/rejected": -2.010593891143799, "logps/chosen": -180.52139282226562, "logps/rejected": -216.5252227783203, "loss": 0.6059, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2649705410003662, "rewards/margins": 0.3905658423900604, "rewards/rejected": -1.6555362939834595, "step": 9710 }, { "epoch": 1.6747070985527222, "grad_norm": 30.22354507446289, "learning_rate": 9.713346387725355e-08, "logits/chosen": -2.165675640106201, "logits/rejected": -2.1414732933044434, "logps/chosen": -177.21224975585938, "logps/rejected": -205.3203887939453, "loss": 0.6241, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2220103740692139, "rewards/margins": 0.3224605917930603, "rewards/rejected": -1.5444709062576294, "step": 9720 }, { "epoch": 1.6764300482425913, "grad_norm": 29.720321655273438, "learning_rate": 9.693306762430782e-08, "logits/chosen": -2.1688475608825684, "logits/rejected": -2.1419014930725098, "logps/chosen": -172.5311279296875, "logps/rejected": -221.6881561279297, "loss": 0.5561, "rewards/accuracies": 0.75, "rewards/chosen": -1.1749764680862427, "rewards/margins": 0.504775881767273, "rewards/rejected": -1.6797523498535156, "step": 9730 }, { "epoch": 1.6781529979324605, "grad_norm": 26.8512020111084, "learning_rate": 9.673268369860086e-08, "logits/chosen": -2.133647918701172, "logits/rejected": -2.107252359390259, "logps/chosen": -191.29776000976562, "logps/rejected": -230.453369140625, "loss": 0.5964, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.356992244720459, "rewards/margins": 0.4239605963230133, "rewards/rejected": -1.780953049659729, "step": 9740 }, { "epoch": 1.6798759476223295, "grad_norm": 24.874778747558594, "learning_rate": 9.653231290555647e-08, "logits/chosen": -2.1468117237091064, "logits/rejected": -2.0880696773529053, "logps/chosen": -199.73204040527344, "logps/rejected": -246.09619140625, "loss": 0.5717, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.437829613685608, "rewards/margins": 0.5362062454223633, "rewards/rejected": -1.9740358591079712, "step": 9750 }, { "epoch": 1.6815988973121985, "grad_norm": 23.847570419311523, "learning_rate": 9.633195605054573e-08, "logits/chosen": -2.125284194946289, "logits/rejected": -2.079078197479248, "logps/chosen": -195.61141967773438, "logps/rejected": -247.21658325195312, "loss": 0.5554, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4141006469726562, "rewards/margins": 0.5550005435943604, "rewards/rejected": -1.9691009521484375, "step": 9760 }, { "epoch": 1.6833218470020674, "grad_norm": 37.763572692871094, "learning_rate": 9.613161393888372e-08, "logits/chosen": -2.0627262592315674, "logits/rejected": -2.0276284217834473, "logps/chosen": -195.24488830566406, "logps/rejected": -244.3018341064453, "loss": 0.5826, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4011054039001465, "rewards/margins": 0.5025665163993835, "rewards/rejected": -1.9036719799041748, "step": 9770 }, { "epoch": 1.6850447966919366, "grad_norm": 24.46339225769043, "learning_rate": 9.593128737582623e-08, "logits/chosen": -2.1057629585266113, "logits/rejected": -2.049747943878174, "logps/chosen": -201.80274963378906, "logps/rejected": -250.95144653320312, "loss": 0.5648, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4022512435913086, "rewards/margins": 0.5962937474250793, "rewards/rejected": -1.9985450506210327, "step": 9780 }, { "epoch": 1.6867677463818056, "grad_norm": 28.699012756347656, "learning_rate": 9.57309771665665e-08, "logits/chosen": -2.119412899017334, "logits/rejected": -2.0996031761169434, "logps/chosen": -199.32797241210938, "logps/rejected": -255.63925170898438, "loss": 0.582, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4494515657424927, "rewards/margins": 0.5416026711463928, "rewards/rejected": -1.9910542964935303, "step": 9790 }, { "epoch": 1.6884906960716748, "grad_norm": 47.394649505615234, "learning_rate": 9.553068411623211e-08, "logits/chosen": -2.157869815826416, "logits/rejected": -2.1125385761260986, "logps/chosen": -199.79861450195312, "logps/rejected": -255.11672973632812, "loss": 0.562, "rewards/accuracies": 0.75, "rewards/chosen": -1.412087321281433, "rewards/margins": 0.6073322296142578, "rewards/rejected": -2.0194194316864014, "step": 9800 }, { "epoch": 1.6902136457615438, "grad_norm": 30.579559326171875, "learning_rate": 9.533040902988164e-08, "logits/chosen": -2.1238341331481934, "logits/rejected": -2.079685688018799, "logps/chosen": -200.06643676757812, "logps/rejected": -249.8910675048828, "loss": 0.5945, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4329220056533813, "rewards/margins": 0.5416523218154907, "rewards/rejected": -1.9745744466781616, "step": 9810 }, { "epoch": 1.6919365954514127, "grad_norm": 33.1264533996582, "learning_rate": 9.51301527125015e-08, "logits/chosen": -2.131040573120117, "logits/rejected": -2.09142804145813, "logps/chosen": -185.69674682617188, "logps/rejected": -241.4114227294922, "loss": 0.5386, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.306038498878479, "rewards/margins": 0.5765220522880554, "rewards/rejected": -1.8825607299804688, "step": 9820 }, { "epoch": 1.693659545141282, "grad_norm": 35.00567626953125, "learning_rate": 9.492991596900265e-08, "logits/chosen": -2.1659820079803467, "logits/rejected": -2.143907070159912, "logps/chosen": -197.84617614746094, "logps/rejected": -236.71890258789062, "loss": 0.6339, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.385030746459961, "rewards/margins": 0.4190130829811096, "rewards/rejected": -1.8040437698364258, "step": 9830 }, { "epoch": 1.6953824948311509, "grad_norm": 31.27936363220215, "learning_rate": 9.47296996042173e-08, "logits/chosen": -2.1307952404022217, "logits/rejected": -2.0879383087158203, "logps/chosen": -185.07919311523438, "logps/rejected": -237.0386962890625, "loss": 0.5739, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2881401777267456, "rewards/margins": 0.5533360242843628, "rewards/rejected": -1.8414760828018188, "step": 9840 }, { "epoch": 1.69710544452102, "grad_norm": 29.849496841430664, "learning_rate": 9.452950442289582e-08, "logits/chosen": -2.148524284362793, "logits/rejected": -2.125277042388916, "logps/chosen": -176.24105834960938, "logps/rejected": -219.44589233398438, "loss": 0.5566, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1986567974090576, "rewards/margins": 0.4844595789909363, "rewards/rejected": -1.6831163167953491, "step": 9850 }, { "epoch": 1.698828394210889, "grad_norm": 28.343416213989258, "learning_rate": 9.432933122970347e-08, "logits/chosen": -2.17924165725708, "logits/rejected": -2.159785509109497, "logps/chosen": -195.09347534179688, "logps/rejected": -228.8980712890625, "loss": 0.6072, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3785400390625, "rewards/margins": 0.3773375153541565, "rewards/rejected": -1.7558774948120117, "step": 9860 }, { "epoch": 1.700551343900758, "grad_norm": 30.498952865600586, "learning_rate": 9.412918082921706e-08, "logits/chosen": -2.1416916847229004, "logits/rejected": -2.1032252311706543, "logps/chosen": -188.11207580566406, "logps/rejected": -235.4365997314453, "loss": 0.5707, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3446654081344604, "rewards/margins": 0.5133527517318726, "rewards/rejected": -1.858018159866333, "step": 9870 }, { "epoch": 1.7022742935906272, "grad_norm": 26.755144119262695, "learning_rate": 9.39290540259218e-08, "logits/chosen": -2.0809402465820312, "logits/rejected": -2.058351993560791, "logps/chosen": -189.55882263183594, "logps/rejected": -243.50753784179688, "loss": 0.5537, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3670248985290527, "rewards/margins": 0.5220685005187988, "rewards/rejected": -1.8890936374664307, "step": 9880 }, { "epoch": 1.7039972432804962, "grad_norm": 26.49530029296875, "learning_rate": 9.372895162420808e-08, "logits/chosen": -2.071610689163208, "logits/rejected": -2.044877052307129, "logps/chosen": -198.6932373046875, "logps/rejected": -251.8302001953125, "loss": 0.5383, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.408941388130188, "rewards/margins": 0.580849289894104, "rewards/rejected": -1.9897905588150024, "step": 9890 }, { "epoch": 1.7057201929703654, "grad_norm": 33.74192428588867, "learning_rate": 9.352887442836816e-08, "logits/chosen": -2.1049561500549316, "logits/rejected": -2.0604281425476074, "logps/chosen": -195.46896362304688, "logps/rejected": -265.776123046875, "loss": 0.4979, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4245421886444092, "rewards/margins": 0.7445545792579651, "rewards/rejected": -2.1690969467163086, "step": 9900 }, { "epoch": 1.7074431426602343, "grad_norm": 26.481822967529297, "learning_rate": 9.332882324259306e-08, "logits/chosen": -2.1466808319091797, "logits/rejected": -2.0927011966705322, "logps/chosen": -213.91171264648438, "logps/rejected": -260.396728515625, "loss": 0.5752, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5574681758880615, "rewards/margins": 0.5506702065467834, "rewards/rejected": -2.1081383228302, "step": 9910 }, { "epoch": 1.7091660923501033, "grad_norm": 35.15553283691406, "learning_rate": 9.312879887096923e-08, "logits/chosen": -2.167757272720337, "logits/rejected": -2.1290271282196045, "logps/chosen": -212.5194091796875, "logps/rejected": -266.25323486328125, "loss": 0.5405, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5846225023269653, "rewards/margins": 0.5880954265594482, "rewards/rejected": -2.172717809677124, "step": 9920 }, { "epoch": 1.7108890420399723, "grad_norm": 38.19834518432617, "learning_rate": 9.292880211747528e-08, "logits/chosen": -2.097388744354248, "logits/rejected": -2.0738513469696045, "logps/chosen": -213.891357421875, "logps/rejected": -275.8308410644531, "loss": 0.555, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6055524349212646, "rewards/margins": 0.6078485250473022, "rewards/rejected": -2.2134010791778564, "step": 9930 }, { "epoch": 1.7126119917298415, "grad_norm": 24.613040924072266, "learning_rate": 9.27288337859789e-08, "logits/chosen": -2.1469640731811523, "logits/rejected": -2.1228179931640625, "logps/chosen": -225.4186553955078, "logps/rejected": -278.17218017578125, "loss": 0.59, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6956018209457397, "rewards/margins": 0.48676204681396484, "rewards/rejected": -2.182363986968994, "step": 9940 }, { "epoch": 1.7143349414197107, "grad_norm": 49.375003814697266, "learning_rate": 9.252889468023348e-08, "logits/chosen": -2.115926504135132, "logits/rejected": -2.065687894821167, "logps/chosen": -225.0879364013672, "logps/rejected": -289.8050537109375, "loss": 0.5322, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7030729055404663, "rewards/margins": 0.6950501203536987, "rewards/rejected": -2.398123264312744, "step": 9950 }, { "epoch": 1.7160578911095796, "grad_norm": 36.60076904296875, "learning_rate": 9.232898560387503e-08, "logits/chosen": -2.1651546955108643, "logits/rejected": -2.1324679851531982, "logps/chosen": -222.7748260498047, "logps/rejected": -272.51092529296875, "loss": 0.589, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6857795715332031, "rewards/margins": 0.4993601441383362, "rewards/rejected": -2.1851398944854736, "step": 9960 }, { "epoch": 1.7177808407994486, "grad_norm": 37.716888427734375, "learning_rate": 9.212910736041868e-08, "logits/chosen": -2.1696953773498535, "logits/rejected": -2.1427950859069824, "logps/chosen": -206.0946502685547, "logps/rejected": -253.1869354248047, "loss": 0.5856, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.536563754081726, "rewards/margins": 0.4869001507759094, "rewards/rejected": -2.0234639644622803, "step": 9970 }, { "epoch": 1.7195037904893176, "grad_norm": 31.95079803466797, "learning_rate": 9.19292607532558e-08, "logits/chosen": -2.0850770473480225, "logits/rejected": -2.0535874366760254, "logps/chosen": -207.0938262939453, "logps/rejected": -266.87335205078125, "loss": 0.5404, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5402209758758545, "rewards/margins": 0.6110003590583801, "rewards/rejected": -2.151221513748169, "step": 9980 }, { "epoch": 1.7212267401791868, "grad_norm": 48.57706832885742, "learning_rate": 9.172944658565057e-08, "logits/chosen": -2.1648459434509277, "logits/rejected": -2.1238019466400146, "logps/chosen": -201.11058044433594, "logps/rejected": -244.0185089111328, "loss": 0.5645, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4623303413391113, "rewards/margins": 0.49238890409469604, "rewards/rejected": -1.9547193050384521, "step": 9990 }, { "epoch": 1.722949689869056, "grad_norm": 28.529874801635742, "learning_rate": 9.15296656607367e-08, "logits/chosen": -2.1567516326904297, "logits/rejected": -2.124053478240967, "logps/chosen": -200.27005004882812, "logps/rejected": -253.435791015625, "loss": 0.5368, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4336096048355103, "rewards/margins": 0.5446447134017944, "rewards/rejected": -1.9782540798187256, "step": 10000 }, { "epoch": 1.722949689869056, "eval_logits/chosen": -2.191511869430542, "eval_logits/rejected": -2.17295241355896, "eval_logps/chosen": -192.59469604492188, "eval_logps/rejected": -221.33299255371094, "eval_loss": 0.6408481597900391, "eval_rewards/accuracies": 0.6236059665679932, "eval_rewards/chosen": -1.335792064666748, "eval_rewards/margins": 0.2500417232513428, "eval_rewards/rejected": -1.5858336687088013, "eval_runtime": 382.9314, "eval_samples_per_second": 11.24, "eval_steps_per_second": 1.405, "step": 10000 }, { "epoch": 1.724672639558925, "grad_norm": 28.102582931518555, "learning_rate": 9.132991878151444e-08, "logits/chosen": -2.148301362991333, "logits/rejected": -2.11006760597229, "logps/chosen": -207.0460662841797, "logps/rejected": -269.1331787109375, "loss": 0.5292, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.54012930393219, "rewards/margins": 0.6273112297058105, "rewards/rejected": -2.167440414428711, "step": 10010 }, { "epoch": 1.7263955892487939, "grad_norm": 38.713417053222656, "learning_rate": 9.113020675084693e-08, "logits/chosen": -2.0745723247528076, "logits/rejected": -2.028005599975586, "logps/chosen": -222.94821166992188, "logps/rejected": -279.4356384277344, "loss": 0.5582, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6800209283828735, "rewards/margins": 0.6077736020088196, "rewards/rejected": -2.287794351577759, "step": 10020 }, { "epoch": 1.7281185389386629, "grad_norm": 46.993404388427734, "learning_rate": 9.093053037145756e-08, "logits/chosen": -2.0698273181915283, "logits/rejected": -2.0276541709899902, "logps/chosen": -232.9654998779297, "logps/rejected": -272.5043029785156, "loss": 0.5962, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7468217611312866, "rewards/margins": 0.464282751083374, "rewards/rejected": -2.211104393005371, "step": 10030 }, { "epoch": 1.729841488628532, "grad_norm": 24.55193519592285, "learning_rate": 9.073089044592619e-08, "logits/chosen": -2.2040719985961914, "logits/rejected": -2.1704514026641846, "logps/chosen": -221.79983520507812, "logps/rejected": -280.9211730957031, "loss": 0.5402, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6553106307983398, "rewards/margins": 0.6160081624984741, "rewards/rejected": -2.2713184356689453, "step": 10040 }, { "epoch": 1.7315644383184012, "grad_norm": 35.085670471191406, "learning_rate": 9.053128777668629e-08, "logits/chosen": -2.1237728595733643, "logits/rejected": -2.109290361404419, "logps/chosen": -211.3069305419922, "logps/rejected": -255.8780975341797, "loss": 0.5977, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5639381408691406, "rewards/margins": 0.44324740767478943, "rewards/rejected": -2.007185697555542, "step": 10050 }, { "epoch": 1.7332873880082702, "grad_norm": 41.619598388671875, "learning_rate": 9.033172316602148e-08, "logits/chosen": -2.1192288398742676, "logits/rejected": -2.0995373725891113, "logps/chosen": -197.01919555664062, "logps/rejected": -245.8788299560547, "loss": 0.5834, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4734445810317993, "rewards/margins": 0.4632526934146881, "rewards/rejected": -1.936697006225586, "step": 10060 }, { "epoch": 1.7350103376981392, "grad_norm": 26.65183448791504, "learning_rate": 9.013219741606244e-08, "logits/chosen": -2.086177110671997, "logits/rejected": -2.0609517097473145, "logps/chosen": -180.7833709716797, "logps/rejected": -239.9923858642578, "loss": 0.5384, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2739650011062622, "rewards/margins": 0.6015530824661255, "rewards/rejected": -1.8755180835723877, "step": 10070 }, { "epoch": 1.7367332873880081, "grad_norm": 41.86779022216797, "learning_rate": 8.993271132878371e-08, "logits/chosen": -2.152545690536499, "logits/rejected": -2.116054058074951, "logps/chosen": -191.3165283203125, "logps/rejected": -240.53225708007812, "loss": 0.5592, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3363940715789795, "rewards/margins": 0.5229128003120422, "rewards/rejected": -1.8593066930770874, "step": 10080 }, { "epoch": 1.7384562370778773, "grad_norm": 36.689613342285156, "learning_rate": 8.973326570600038e-08, "logits/chosen": -2.187739849090576, "logits/rejected": -2.1426002979278564, "logps/chosen": -198.7881317138672, "logps/rejected": -266.9776306152344, "loss": 0.5284, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4309791326522827, "rewards/margins": 0.6683201789855957, "rewards/rejected": -2.099299192428589, "step": 10090 }, { "epoch": 1.7401791867677465, "grad_norm": 51.00032424926758, "learning_rate": 8.953386134936489e-08, "logits/chosen": -2.1044974327087402, "logits/rejected": -2.0780110359191895, "logps/chosen": -231.7470245361328, "logps/rejected": -280.9796447753906, "loss": 0.5958, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.752642273902893, "rewards/margins": 0.5219553709030151, "rewards/rejected": -2.2745978832244873, "step": 10100 }, { "epoch": 1.7419021364576155, "grad_norm": 37.555301666259766, "learning_rate": 8.933449906036373e-08, "logits/chosen": -2.141427516937256, "logits/rejected": -2.114920139312744, "logps/chosen": -227.4268035888672, "logps/rejected": -294.4813537597656, "loss": 0.5533, "rewards/accuracies": 0.75, "rewards/chosen": -1.7672802209854126, "rewards/margins": 0.64067542552948, "rewards/rejected": -2.4079554080963135, "step": 10110 }, { "epoch": 1.7436250861474845, "grad_norm": 34.8802490234375, "learning_rate": 8.913517964031447e-08, "logits/chosen": -2.1124048233032227, "logits/rejected": -2.0721170902252197, "logps/chosen": -227.3213348388672, "logps/rejected": -283.9159240722656, "loss": 0.5579, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7284618616104126, "rewards/margins": 0.6203962564468384, "rewards/rejected": -2.348858118057251, "step": 10120 }, { "epoch": 1.7453480358373534, "grad_norm": 28.73194694519043, "learning_rate": 8.893590389036226e-08, "logits/chosen": -2.1395440101623535, "logits/rejected": -2.1035239696502686, "logps/chosen": -214.451904296875, "logps/rejected": -280.0576171875, "loss": 0.5343, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5973799228668213, "rewards/margins": 0.6699446439743042, "rewards/rejected": -2.267324924468994, "step": 10130 }, { "epoch": 1.7470709855272226, "grad_norm": 31.8477840423584, "learning_rate": 8.873667261147673e-08, "logits/chosen": -2.1350624561309814, "logits/rejected": -2.0828769207000732, "logps/chosen": -222.15798950195312, "logps/rejected": -273.2109680175781, "loss": 0.5767, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6050676107406616, "rewards/margins": 0.5911908745765686, "rewards/rejected": -2.196258544921875, "step": 10140 }, { "epoch": 1.7487939352170918, "grad_norm": 26.414133071899414, "learning_rate": 8.853748660444881e-08, "logits/chosen": -2.127324342727661, "logits/rejected": -2.086487293243408, "logps/chosen": -187.64529418945312, "logps/rejected": -249.9339141845703, "loss": 0.5472, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3201484680175781, "rewards/margins": 0.6274799108505249, "rewards/rejected": -1.947628378868103, "step": 10150 }, { "epoch": 1.7505168849069608, "grad_norm": 29.215452194213867, "learning_rate": 8.833834666988738e-08, "logits/chosen": -2.141927719116211, "logits/rejected": -2.097440481185913, "logps/chosen": -179.4386444091797, "logps/rejected": -246.930419921875, "loss": 0.4974, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2599875926971436, "rewards/margins": 0.6941196322441101, "rewards/rejected": -1.9541072845458984, "step": 10160 }, { "epoch": 1.7522398345968297, "grad_norm": 35.651058197021484, "learning_rate": 8.813925360821624e-08, "logits/chosen": -2.0970935821533203, "logits/rejected": -2.0645716190338135, "logps/chosen": -207.51675415039062, "logps/rejected": -269.4786071777344, "loss": 0.5383, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5255186557769775, "rewards/margins": 0.6351886987686157, "rewards/rejected": -2.1607069969177246, "step": 10170 }, { "epoch": 1.7539627842866987, "grad_norm": 24.785844802856445, "learning_rate": 8.794020821967075e-08, "logits/chosen": -2.038064479827881, "logits/rejected": -1.9947633743286133, "logps/chosen": -225.9365234375, "logps/rejected": -299.15802001953125, "loss": 0.5266, "rewards/accuracies": 0.71875, "rewards/chosen": -1.729060173034668, "rewards/margins": 0.7350554466247559, "rewards/rejected": -2.464115619659424, "step": 10180 }, { "epoch": 1.755685733976568, "grad_norm": 44.277008056640625, "learning_rate": 8.774121130429464e-08, "logits/chosen": -2.0331788063049316, "logits/rejected": -1.9945003986358643, "logps/chosen": -229.5591583251953, "logps/rejected": -300.8504333496094, "loss": 0.537, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.771598219871521, "rewards/margins": 0.7276459336280823, "rewards/rejected": -2.499244213104248, "step": 10190 }, { "epoch": 1.757408683666437, "grad_norm": 30.182825088500977, "learning_rate": 8.754226366193677e-08, "logits/chosen": -2.0969910621643066, "logits/rejected": -2.0572707653045654, "logps/chosen": -235.1160430908203, "logps/rejected": -300.4604187011719, "loss": 0.5356, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.8176597356796265, "rewards/margins": 0.687166690826416, "rewards/rejected": -2.504826545715332, "step": 10200 }, { "epoch": 1.759131633356306, "grad_norm": 54.42823028564453, "learning_rate": 8.734336609224798e-08, "logits/chosen": -2.084446430206299, "logits/rejected": -2.0502068996429443, "logps/chosen": -246.3536834716797, "logps/rejected": -325.4996643066406, "loss": 0.5233, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.906688928604126, "rewards/margins": 0.7914060354232788, "rewards/rejected": -2.6980948448181152, "step": 10210 }, { "epoch": 1.760854583046175, "grad_norm": 31.210098266601562, "learning_rate": 8.714451939467793e-08, "logits/chosen": -2.058612585067749, "logits/rejected": -2.026947021484375, "logps/chosen": -226.6744384765625, "logps/rejected": -285.47247314453125, "loss": 0.5576, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7720890045166016, "rewards/margins": 0.5804320573806763, "rewards/rejected": -2.3525211811065674, "step": 10220 }, { "epoch": 1.762577532736044, "grad_norm": 36.709999084472656, "learning_rate": 8.69457243684717e-08, "logits/chosen": -2.0432868003845215, "logits/rejected": -2.003864288330078, "logps/chosen": -226.4574737548828, "logps/rejected": -276.8308410644531, "loss": 0.5898, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7202298641204834, "rewards/margins": 0.5365725755691528, "rewards/rejected": -2.2568023204803467, "step": 10230 }, { "epoch": 1.7643004824259132, "grad_norm": 31.065011978149414, "learning_rate": 8.67469818126667e-08, "logits/chosen": -2.069124460220337, "logits/rejected": -2.023430109024048, "logps/chosen": -216.22946166992188, "logps/rejected": -298.06903076171875, "loss": 0.4958, "rewards/accuracies": 0.78125, "rewards/chosen": -1.641461968421936, "rewards/margins": 0.8219528198242188, "rewards/rejected": -2.4634146690368652, "step": 10240 }, { "epoch": 1.7660234321157822, "grad_norm": 45.54820251464844, "learning_rate": 8.654829252608947e-08, "logits/chosen": -2.118251085281372, "logits/rejected": -2.0692851543426514, "logps/chosen": -220.3714141845703, "logps/rejected": -275.654296875, "loss": 0.5211, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.627004623413086, "rewards/margins": 0.6395336389541626, "rewards/rejected": -2.266538619995117, "step": 10250 }, { "epoch": 1.7677463818056514, "grad_norm": 44.912437438964844, "learning_rate": 8.634965730735238e-08, "logits/chosen": -2.0857231616973877, "logits/rejected": -2.0690531730651855, "logps/chosen": -224.17861938476562, "logps/rejected": -287.48419189453125, "loss": 0.5445, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7005201578140259, "rewards/margins": 0.6115856170654297, "rewards/rejected": -2.312105655670166, "step": 10260 }, { "epoch": 1.7694693314955203, "grad_norm": 30.748966217041016, "learning_rate": 8.615107695485059e-08, "logits/chosen": -2.0707106590270996, "logits/rejected": -2.039233684539795, "logps/chosen": -220.36544799804688, "logps/rejected": -282.8392639160156, "loss": 0.5349, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.639102578163147, "rewards/margins": 0.6370723247528076, "rewards/rejected": -2.276174783706665, "step": 10270 }, { "epoch": 1.7711922811853893, "grad_norm": 40.262969970703125, "learning_rate": 8.595255226675867e-08, "logits/chosen": -2.0837535858154297, "logits/rejected": -2.056248426437378, "logps/chosen": -224.76107788085938, "logps/rejected": -264.0224609375, "loss": 0.6194, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6971912384033203, "rewards/margins": 0.42010498046875, "rewards/rejected": -2.1172964572906494, "step": 10280 }, { "epoch": 1.7729152308752585, "grad_norm": 33.58854675292969, "learning_rate": 8.575408404102739e-08, "logits/chosen": -2.0661511421203613, "logits/rejected": -2.0259451866149902, "logps/chosen": -199.9491424560547, "logps/rejected": -268.76312255859375, "loss": 0.5468, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4780738353729248, "rewards/margins": 0.6608957052230835, "rewards/rejected": -2.1389694213867188, "step": 10290 }, { "epoch": 1.7746381805651275, "grad_norm": 53.497398376464844, "learning_rate": 8.555567307538067e-08, "logits/chosen": -2.1200575828552246, "logits/rejected": -2.092484951019287, "logps/chosen": -225.9880828857422, "logps/rejected": -271.4896545410156, "loss": 0.6079, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6974165439605713, "rewards/margins": 0.4778365194797516, "rewards/rejected": -2.17525315284729, "step": 10300 }, { "epoch": 1.7763611302549966, "grad_norm": 30.578693389892578, "learning_rate": 8.53573201673122e-08, "logits/chosen": -2.051030158996582, "logits/rejected": -2.013582706451416, "logps/chosen": -221.84011840820312, "logps/rejected": -282.8453369140625, "loss": 0.5257, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.641297698020935, "rewards/margins": 0.686009407043457, "rewards/rejected": -2.3273074626922607, "step": 10310 }, { "epoch": 1.7780840799448656, "grad_norm": 39.207130432128906, "learning_rate": 8.515902611408245e-08, "logits/chosen": -2.1029365062713623, "logits/rejected": -2.062652111053467, "logps/chosen": -215.40438842773438, "logps/rejected": -267.7357177734375, "loss": 0.584, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5729243755340576, "rewards/margins": 0.5734225511550903, "rewards/rejected": -2.1463465690612793, "step": 10320 }, { "epoch": 1.7798070296347346, "grad_norm": 35.41927719116211, "learning_rate": 8.496079171271512e-08, "logits/chosen": -2.1222665309906006, "logits/rejected": -2.08558988571167, "logps/chosen": -196.37164306640625, "logps/rejected": -254.0879669189453, "loss": 0.5371, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4327995777130127, "rewards/margins": 0.5754380822181702, "rewards/rejected": -2.008237600326538, "step": 10330 }, { "epoch": 1.7815299793246038, "grad_norm": 27.103147506713867, "learning_rate": 8.476261775999432e-08, "logits/chosen": -2.156501293182373, "logits/rejected": -2.117757558822632, "logps/chosen": -194.71116638183594, "logps/rejected": -252.72793579101562, "loss": 0.5415, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3771531581878662, "rewards/margins": 0.6131818890571594, "rewards/rejected": -1.9903347492218018, "step": 10340 }, { "epoch": 1.7832529290144727, "grad_norm": 41.56328582763672, "learning_rate": 8.45645050524611e-08, "logits/chosen": -2.1711769104003906, "logits/rejected": -2.1348977088928223, "logps/chosen": -204.80001831054688, "logps/rejected": -252.4854736328125, "loss": 0.5598, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4941831827163696, "rewards/margins": 0.5315493941307068, "rewards/rejected": -2.0257325172424316, "step": 10350 }, { "epoch": 1.784975878704342, "grad_norm": 36.49805450439453, "learning_rate": 8.436645438641038e-08, "logits/chosen": -2.102998971939087, "logits/rejected": -2.0662338733673096, "logps/chosen": -203.11863708496094, "logps/rejected": -242.20803833007812, "loss": 0.5992, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.461118459701538, "rewards/margins": 0.4449211061000824, "rewards/rejected": -1.9060395956039429, "step": 10360 }, { "epoch": 1.786698828394211, "grad_norm": 40.32637023925781, "learning_rate": 8.416846655788774e-08, "logits/chosen": -1.995539903640747, "logits/rejected": -1.9595807790756226, "logps/chosen": -184.28761291503906, "logps/rejected": -232.712890625, "loss": 0.5566, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.307677984237671, "rewards/margins": 0.500927209854126, "rewards/rejected": -1.8086051940917969, "step": 10370 }, { "epoch": 1.7884217780840799, "grad_norm": 39.63434982299805, "learning_rate": 8.397054236268611e-08, "logits/chosen": -2.1119132041931152, "logits/rejected": -2.092855215072632, "logps/chosen": -197.7986602783203, "logps/rejected": -221.83645629882812, "loss": 0.6914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4502270221710205, "rewards/margins": 0.2483011782169342, "rewards/rejected": -1.6985282897949219, "step": 10380 }, { "epoch": 1.7901447277739488, "grad_norm": 23.693660736083984, "learning_rate": 8.37726825963427e-08, "logits/chosen": -2.164780378341675, "logits/rejected": -2.127293825149536, "logps/chosen": -174.3961639404297, "logps/rejected": -218.28988647460938, "loss": 0.5728, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1731741428375244, "rewards/margins": 0.47819751501083374, "rewards/rejected": -1.6513715982437134, "step": 10390 }, { "epoch": 1.791867677463818, "grad_norm": 28.680999755859375, "learning_rate": 8.357488805413576e-08, "logits/chosen": -2.198361873626709, "logits/rejected": -2.1695852279663086, "logps/chosen": -170.17636108398438, "logps/rejected": -233.5998077392578, "loss": 0.5064, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1772061586380005, "rewards/margins": 0.6171708106994629, "rewards/rejected": -1.794377088546753, "step": 10400 }, { "epoch": 1.791867677463818, "eval_logits/chosen": -2.2306599617004395, "eval_logits/rejected": -2.215008497238159, "eval_logps/chosen": -165.2631072998047, "eval_logps/rejected": -188.94882202148438, "eval_loss": 0.6423309445381165, "eval_rewards/accuracies": 0.6215148568153381, "eval_rewards/chosen": -1.062476396560669, "eval_rewards/margins": 0.1995159536600113, "eval_rewards/rejected": -1.2619922161102295, "eval_runtime": 383.4393, "eval_samples_per_second": 11.225, "eval_steps_per_second": 1.403, "step": 10400 }, { "epoch": 1.7935906271536872, "grad_norm": 32.13147735595703, "learning_rate": 8.337715953108133e-08, "logits/chosen": -2.1234824657440186, "logits/rejected": -2.08290958404541, "logps/chosen": -185.5501251220703, "logps/rejected": -223.6185760498047, "loss": 0.5874, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.280967354774475, "rewards/margins": 0.43095582723617554, "rewards/rejected": -1.7119232416152954, "step": 10410 }, { "epoch": 1.7953135768435562, "grad_norm": 56.05282974243164, "learning_rate": 8.317949782193021e-08, "logits/chosen": -2.1323277950286865, "logits/rejected": -2.0914833545684814, "logps/chosen": -200.09124755859375, "logps/rejected": -253.63876342773438, "loss": 0.5699, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.452972412109375, "rewards/margins": 0.5477496981620789, "rewards/rejected": -2.0007221698760986, "step": 10420 }, { "epoch": 1.7970365265334252, "grad_norm": 31.1216983795166, "learning_rate": 8.298190372116449e-08, "logits/chosen": -2.1449904441833496, "logits/rejected": -2.109835386276245, "logps/chosen": -215.15225219726562, "logps/rejected": -261.8252258300781, "loss": 0.6041, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5944058895111084, "rewards/margins": 0.4992167055606842, "rewards/rejected": -2.0936226844787598, "step": 10430 }, { "epoch": 1.7987594762232941, "grad_norm": 31.59772491455078, "learning_rate": 8.278437802299462e-08, "logits/chosen": -2.1712965965270996, "logits/rejected": -2.1471519470214844, "logps/chosen": -227.6300506591797, "logps/rejected": -270.65386962890625, "loss": 0.5907, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6827600002288818, "rewards/margins": 0.46045559644699097, "rewards/rejected": -2.1432156562805176, "step": 10440 }, { "epoch": 1.8004824259131633, "grad_norm": 35.741493225097656, "learning_rate": 8.258692152135605e-08, "logits/chosen": -2.1343047618865967, "logits/rejected": -2.1071267127990723, "logps/chosen": -214.2671661376953, "logps/rejected": -278.6723937988281, "loss": 0.5215, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5967563390731812, "rewards/margins": 0.6269431710243225, "rewards/rejected": -2.2236995697021484, "step": 10450 }, { "epoch": 1.8022053756030325, "grad_norm": 28.693790435791016, "learning_rate": 8.238953500990624e-08, "logits/chosen": -2.1232166290283203, "logits/rejected": -2.0772557258605957, "logps/chosen": -214.5017547607422, "logps/rejected": -265.8387145996094, "loss": 0.5561, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5892353057861328, "rewards/margins": 0.5419289469718933, "rewards/rejected": -2.131164073944092, "step": 10460 }, { "epoch": 1.8039283252929015, "grad_norm": 45.37855911254883, "learning_rate": 8.219221928202108e-08, "logits/chosen": -2.009894609451294, "logits/rejected": -1.9749433994293213, "logps/chosen": -222.81887817382812, "logps/rejected": -276.101318359375, "loss": 0.5717, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7024590969085693, "rewards/margins": 0.5605179071426392, "rewards/rejected": -2.262976884841919, "step": 10470 }, { "epoch": 1.8056512749827704, "grad_norm": 36.52731704711914, "learning_rate": 8.199497513079219e-08, "logits/chosen": -2.1089510917663574, "logits/rejected": -2.0611464977264404, "logps/chosen": -226.2388458251953, "logps/rejected": -295.7428283691406, "loss": 0.5359, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6885255575180054, "rewards/margins": 0.7537800073623657, "rewards/rejected": -2.44230580329895, "step": 10480 }, { "epoch": 1.8073742246726394, "grad_norm": 30.162227630615234, "learning_rate": 8.179780334902338e-08, "logits/chosen": -2.099175214767456, "logits/rejected": -2.0624618530273438, "logps/chosen": -213.5786895751953, "logps/rejected": -270.34735107421875, "loss": 0.5526, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5826948881149292, "rewards/margins": 0.59993976354599, "rewards/rejected": -2.1826348304748535, "step": 10490 }, { "epoch": 1.8090971743625086, "grad_norm": 35.62689208984375, "learning_rate": 8.16007047292276e-08, "logits/chosen": -2.1074697971343994, "logits/rejected": -2.058868885040283, "logps/chosen": -213.86355590820312, "logps/rejected": -284.5144958496094, "loss": 0.5221, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.60076904296875, "rewards/margins": 0.7032946944236755, "rewards/rejected": -2.3040637969970703, "step": 10500 }, { "epoch": 1.8108201240523778, "grad_norm": 35.58950424194336, "learning_rate": 8.140368006362378e-08, "logits/chosen": -2.111414909362793, "logits/rejected": -2.0616517066955566, "logps/chosen": -219.7614288330078, "logps/rejected": -283.29779052734375, "loss": 0.5341, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6361150741577148, "rewards/margins": 0.6754859089851379, "rewards/rejected": -2.311600923538208, "step": 10510 }, { "epoch": 1.8125430737422468, "grad_norm": 54.72854232788086, "learning_rate": 8.120673014413346e-08, "logits/chosen": -2.1215085983276367, "logits/rejected": -2.089259624481201, "logps/chosen": -228.813232421875, "logps/rejected": -306.3352966308594, "loss": 0.5364, "rewards/accuracies": 0.71875, "rewards/chosen": -1.767823576927185, "rewards/margins": 0.7603949904441833, "rewards/rejected": -2.5282187461853027, "step": 10520 }, { "epoch": 1.8142660234321157, "grad_norm": 41.22834014892578, "learning_rate": 8.100985576237789e-08, "logits/chosen": -2.053673505783081, "logits/rejected": -2.018805503845215, "logps/chosen": -238.222412109375, "logps/rejected": -289.5137634277344, "loss": 0.5878, "rewards/accuracies": 0.65625, "rewards/chosen": -1.8332278728485107, "rewards/margins": 0.5466124415397644, "rewards/rejected": -2.37984037399292, "step": 10530 }, { "epoch": 1.8159889731219847, "grad_norm": 39.10302734375, "learning_rate": 8.081305770967466e-08, "logits/chosen": -2.0007946491241455, "logits/rejected": -1.9645189046859741, "logps/chosen": -222.1063690185547, "logps/rejected": -276.51312255859375, "loss": 0.5475, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.689347267150879, "rewards/margins": 0.5932210683822632, "rewards/rejected": -2.2825684547424316, "step": 10540 }, { "epoch": 1.817711922811854, "grad_norm": 33.643516540527344, "learning_rate": 8.061633677703457e-08, "logits/chosen": -2.174844741821289, "logits/rejected": -2.1463987827301025, "logps/chosen": -237.1045379638672, "logps/rejected": -293.6675109863281, "loss": 0.5867, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.8306430578231812, "rewards/margins": 0.5582226514816284, "rewards/rejected": -2.3888659477233887, "step": 10550 }, { "epoch": 1.819434872501723, "grad_norm": 35.75984191894531, "learning_rate": 8.041969375515835e-08, "logits/chosen": -2.065913438796997, "logits/rejected": -2.016780138015747, "logps/chosen": -222.2818145751953, "logps/rejected": -298.7868347167969, "loss": 0.5156, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6556761264801025, "rewards/margins": 0.800155758857727, "rewards/rejected": -2.455831527709961, "step": 10560 }, { "epoch": 1.821157822191592, "grad_norm": 53.0742301940918, "learning_rate": 8.022312943443369e-08, "logits/chosen": -2.112668514251709, "logits/rejected": -2.078002691268921, "logps/chosen": -226.45956420898438, "logps/rejected": -291.5998840332031, "loss": 0.5494, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6997945308685303, "rewards/margins": 0.6631422638893127, "rewards/rejected": -2.3629367351531982, "step": 10570 }, { "epoch": 1.822880771881461, "grad_norm": 41.303245544433594, "learning_rate": 8.002664460493194e-08, "logits/chosen": -2.1236252784729004, "logits/rejected": -2.0799834728240967, "logps/chosen": -203.3345947265625, "logps/rejected": -271.3036804199219, "loss": 0.5138, "rewards/accuracies": 0.75, "rewards/chosen": -1.4962794780731201, "rewards/margins": 0.6943894624710083, "rewards/rejected": -2.190668821334839, "step": 10580 }, { "epoch": 1.82460372157133, "grad_norm": 28.450899124145508, "learning_rate": 7.983024005640487e-08, "logits/chosen": -2.090174913406372, "logits/rejected": -2.045700788497925, "logps/chosen": -210.66390991210938, "logps/rejected": -261.7216796875, "loss": 0.5587, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.540217399597168, "rewards/margins": 0.5885982513427734, "rewards/rejected": -2.1288156509399414, "step": 10590 }, { "epoch": 1.8263266712611992, "grad_norm": 24.977964401245117, "learning_rate": 7.963391657828167e-08, "logits/chosen": -2.158010482788086, "logits/rejected": -2.1370630264282227, "logps/chosen": -195.0901641845703, "logps/rejected": -249.91494750976562, "loss": 0.5716, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4173429012298584, "rewards/margins": 0.543049693107605, "rewards/rejected": -1.960392713546753, "step": 10600 }, { "epoch": 1.8280496209510684, "grad_norm": 44.63248825073242, "learning_rate": 7.943767495966556e-08, "logits/chosen": -2.094036102294922, "logits/rejected": -2.066676139831543, "logps/chosen": -208.56723022460938, "logps/rejected": -263.9549560546875, "loss": 0.5582, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5647661685943604, "rewards/margins": 0.5567968487739563, "rewards/rejected": -2.121562957763672, "step": 10610 }, { "epoch": 1.8297725706409373, "grad_norm": 37.00849151611328, "learning_rate": 7.924151598933077e-08, "logits/chosen": -2.0145325660705566, "logits/rejected": -1.9684088230133057, "logps/chosen": -207.1978302001953, "logps/rejected": -270.76348876953125, "loss": 0.5188, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.523241400718689, "rewards/margins": 0.6691671013832092, "rewards/rejected": -2.192408561706543, "step": 10620 }, { "epoch": 1.8314955203308063, "grad_norm": 29.079648971557617, "learning_rate": 7.904544045571942e-08, "logits/chosen": -2.102393627166748, "logits/rejected": -2.0552027225494385, "logps/chosen": -214.1923065185547, "logps/rejected": -284.5091552734375, "loss": 0.545, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.586716890335083, "rewards/margins": 0.7270705103874207, "rewards/rejected": -2.3137869834899902, "step": 10630 }, { "epoch": 1.8332184700206753, "grad_norm": 61.565185546875, "learning_rate": 7.884944914693819e-08, "logits/chosen": -2.1358046531677246, "logits/rejected": -2.094247341156006, "logps/chosen": -218.451171875, "logps/rejected": -275.0665588378906, "loss": 0.5669, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.631365180015564, "rewards/margins": 0.6042715907096863, "rewards/rejected": -2.2356367111206055, "step": 10640 }, { "epoch": 1.8349414197105445, "grad_norm": 30.415767669677734, "learning_rate": 7.865354285075517e-08, "logits/chosen": -2.135023593902588, "logits/rejected": -2.0953783988952637, "logps/chosen": -220.89950561523438, "logps/rejected": -292.2164611816406, "loss": 0.5329, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6609359979629517, "rewards/margins": 0.7331883311271667, "rewards/rejected": -2.3941242694854736, "step": 10650 }, { "epoch": 1.8366643694004137, "grad_norm": 40.1267204284668, "learning_rate": 7.845772235459687e-08, "logits/chosen": -2.0598466396331787, "logits/rejected": -2.025097608566284, "logps/chosen": -226.88601684570312, "logps/rejected": -285.40167236328125, "loss": 0.5567, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7189706563949585, "rewards/margins": 0.5910989046096802, "rewards/rejected": -2.3100695610046387, "step": 10660 }, { "epoch": 1.8383873190902826, "grad_norm": 33.5792236328125, "learning_rate": 7.826198844554484e-08, "logits/chosen": -2.0914905071258545, "logits/rejected": -2.0463545322418213, "logps/chosen": -229.7507781982422, "logps/rejected": -295.3099670410156, "loss": 0.5437, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7596862316131592, "rewards/margins": 0.682495653629303, "rewards/rejected": -2.4421820640563965, "step": 10670 }, { "epoch": 1.8401102687801516, "grad_norm": 34.08005905151367, "learning_rate": 7.806634191033268e-08, "logits/chosen": -2.118601083755493, "logits/rejected": -2.0770506858825684, "logps/chosen": -219.3430633544922, "logps/rejected": -275.48748779296875, "loss": 0.5409, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6182947158813477, "rewards/margins": 0.6389121413230896, "rewards/rejected": -2.257207155227661, "step": 10680 }, { "epoch": 1.8418332184700206, "grad_norm": 35.42080307006836, "learning_rate": 7.787078353534276e-08, "logits/chosen": -2.083477020263672, "logits/rejected": -2.0510470867156982, "logps/chosen": -218.898193359375, "logps/rejected": -286.86431884765625, "loss": 0.5394, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6395857334136963, "rewards/margins": 0.6692296266555786, "rewards/rejected": -2.3088154792785645, "step": 10690 }, { "epoch": 1.8435561681598898, "grad_norm": 51.69084167480469, "learning_rate": 7.767531410660307e-08, "logits/chosen": -2.130699396133423, "logits/rejected": -2.0828940868377686, "logps/chosen": -219.0266876220703, "logps/rejected": -268.32159423828125, "loss": 0.5482, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.624758005142212, "rewards/margins": 0.5619906187057495, "rewards/rejected": -2.1867482662200928, "step": 10700 }, { "epoch": 1.8452791178497587, "grad_norm": 41.16576385498047, "learning_rate": 7.74799344097841e-08, "logits/chosen": -2.089820146560669, "logits/rejected": -2.045987367630005, "logps/chosen": -209.64437866210938, "logps/rejected": -264.726318359375, "loss": 0.5433, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5587536096572876, "rewards/margins": 0.6008284687995911, "rewards/rejected": -2.1595821380615234, "step": 10710 }, { "epoch": 1.847002067539628, "grad_norm": 39.79367446899414, "learning_rate": 7.728464523019574e-08, "logits/chosen": -2.0866949558258057, "logits/rejected": -2.0479319095611572, "logps/chosen": -215.8254852294922, "logps/rejected": -266.1983642578125, "loss": 0.597, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.625404953956604, "rewards/margins": 0.522426426410675, "rewards/rejected": -2.147831439971924, "step": 10720 }, { "epoch": 1.848725017229497, "grad_norm": 31.813405990600586, "learning_rate": 7.7089447352784e-08, "logits/chosen": -2.203573226928711, "logits/rejected": -2.150926113128662, "logps/chosen": -206.60537719726562, "logps/rejected": -267.6660461425781, "loss": 0.5131, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5262092351913452, "rewards/margins": 0.6517717242240906, "rewards/rejected": -2.177980899810791, "step": 10730 }, { "epoch": 1.8504479669193659, "grad_norm": 34.61793899536133, "learning_rate": 7.689434156212788e-08, "logits/chosen": -2.0917916297912598, "logits/rejected": -2.0554144382476807, "logps/chosen": -209.6357879638672, "logps/rejected": -256.06610107421875, "loss": 0.6046, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5490946769714355, "rewards/margins": 0.466376930475235, "rewards/rejected": -2.0154716968536377, "step": 10740 }, { "epoch": 1.852170916609235, "grad_norm": 33.33896255493164, "learning_rate": 7.669932864243627e-08, "logits/chosen": -2.0860400199890137, "logits/rejected": -2.032809019088745, "logps/chosen": -192.8465576171875, "logps/rejected": -261.32330322265625, "loss": 0.5341, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4007537364959717, "rewards/margins": 0.6914193630218506, "rewards/rejected": -2.0921730995178223, "step": 10750 }, { "epoch": 1.853893866299104, "grad_norm": 36.69129180908203, "learning_rate": 7.65044093775448e-08, "logits/chosen": -2.0872907638549805, "logits/rejected": -2.049820899963379, "logps/chosen": -206.46987915039062, "logps/rejected": -257.88897705078125, "loss": 0.5712, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5185004472732544, "rewards/margins": 0.5392345190048218, "rewards/rejected": -2.057734966278076, "step": 10760 }, { "epoch": 1.8556168159889732, "grad_norm": 32.58485794067383, "learning_rate": 7.630958455091266e-08, "logits/chosen": -2.069591522216797, "logits/rejected": -2.039121627807617, "logps/chosen": -213.1195526123047, "logps/rejected": -270.96746826171875, "loss": 0.5377, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5476007461547852, "rewards/margins": 0.6266247630119324, "rewards/rejected": -2.174225330352783, "step": 10770 }, { "epoch": 1.8573397656788422, "grad_norm": 27.233510971069336, "learning_rate": 7.611485494561947e-08, "logits/chosen": -2.190983772277832, "logits/rejected": -2.161463737487793, "logps/chosen": -208.59158325195312, "logps/rejected": -264.21661376953125, "loss": 0.5847, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5422141551971436, "rewards/margins": 0.5495244264602661, "rewards/rejected": -2.091738700866699, "step": 10780 }, { "epoch": 1.8590627153687111, "grad_norm": 44.738311767578125, "learning_rate": 7.592022134436201e-08, "logits/chosen": -2.232229709625244, "logits/rejected": -2.18972110748291, "logps/chosen": -193.24270629882812, "logps/rejected": -248.223876953125, "loss": 0.5538, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3993558883666992, "rewards/margins": 0.5599854588508606, "rewards/rejected": -1.959341287612915, "step": 10790 }, { "epoch": 1.8607856650585803, "grad_norm": 31.418119430541992, "learning_rate": 7.57256845294513e-08, "logits/chosen": -2.178560733795166, "logits/rejected": -2.139875650405884, "logps/chosen": -198.04063415527344, "logps/rejected": -266.0334167480469, "loss": 0.5268, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4451549053192139, "rewards/margins": 0.6842938661575317, "rewards/rejected": -2.129448652267456, "step": 10800 }, { "epoch": 1.8607856650585803, "eval_logits/chosen": -2.183124542236328, "eval_logits/rejected": -2.1644392013549805, "eval_logps/chosen": -201.55577087402344, "eval_logps/rejected": -231.0404052734375, "eval_loss": 0.6405770778656006, "eval_rewards/accuracies": 0.634061336517334, "eval_rewards/chosen": -1.425403118133545, "eval_rewards/margins": 0.2575048804283142, "eval_rewards/rejected": -1.6829079389572144, "eval_runtime": 382.9043, "eval_samples_per_second": 11.24, "eval_steps_per_second": 1.405, "step": 10800 }, { "epoch": 1.8625086147484493, "grad_norm": 28.313385009765625, "learning_rate": 7.553124528280928e-08, "logits/chosen": -2.138568162918091, "logits/rejected": -2.0898187160491943, "logps/chosen": -221.3721466064453, "logps/rejected": -265.95477294921875, "loss": 0.5684, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6270099878311157, "rewards/margins": 0.5401317477226257, "rewards/rejected": -2.1671416759490967, "step": 10810 }, { "epoch": 1.8642315644383185, "grad_norm": 31.738422393798828, "learning_rate": 7.533690438596583e-08, "logits/chosen": -2.091878890991211, "logits/rejected": -2.0571346282958984, "logps/chosen": -203.2316131591797, "logps/rejected": -257.54656982421875, "loss": 0.5733, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4928950071334839, "rewards/margins": 0.5778385996818542, "rewards/rejected": -2.0707337856292725, "step": 10820 }, { "epoch": 1.8659545141281875, "grad_norm": 52.44428253173828, "learning_rate": 7.514266262005528e-08, "logits/chosen": -2.1045467853546143, "logits/rejected": -2.069561004638672, "logps/chosen": -223.9551544189453, "logps/rejected": -274.0635681152344, "loss": 0.5889, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.708439588546753, "rewards/margins": 0.5089355111122131, "rewards/rejected": -2.2173752784729004, "step": 10830 }, { "epoch": 1.8676774638180564, "grad_norm": 28.92299461364746, "learning_rate": 7.494852076581377e-08, "logits/chosen": -2.120326042175293, "logits/rejected": -2.091909408569336, "logps/chosen": -208.5050506591797, "logps/rejected": -250.04397583007812, "loss": 0.6206, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5447773933410645, "rewards/margins": 0.4490395188331604, "rewards/rejected": -1.9938167333602905, "step": 10840 }, { "epoch": 1.8694004135079254, "grad_norm": 32.259178161621094, "learning_rate": 7.475447960357572e-08, "logits/chosen": -2.1164214611053467, "logits/rejected": -2.0852532386779785, "logps/chosen": -182.91921997070312, "logps/rejected": -235.68115234375, "loss": 0.5502, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3145451545715332, "rewards/margins": 0.5021630525588989, "rewards/rejected": -1.8167082071304321, "step": 10850 }, { "epoch": 1.8711233631977946, "grad_norm": 35.98172378540039, "learning_rate": 7.456053991327083e-08, "logits/chosen": -2.1797902584075928, "logits/rejected": -2.1264162063598633, "logps/chosen": -199.8368377685547, "logps/rejected": -259.7995910644531, "loss": 0.5373, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4367259740829468, "rewards/margins": 0.6395610570907593, "rewards/rejected": -2.076287269592285, "step": 10860 }, { "epoch": 1.8728463128876638, "grad_norm": 29.174753189086914, "learning_rate": 7.436670247442107e-08, "logits/chosen": -2.0565848350524902, "logits/rejected": -2.0258164405822754, "logps/chosen": -196.3871612548828, "logps/rejected": -266.41925048828125, "loss": 0.5371, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4551265239715576, "rewards/margins": 0.655053973197937, "rewards/rejected": -2.110180377960205, "step": 10870 }, { "epoch": 1.8745692625775328, "grad_norm": 37.872257232666016, "learning_rate": 7.417296806613718e-08, "logits/chosen": -2.1089484691619873, "logits/rejected": -2.0674309730529785, "logps/chosen": -224.12796020507812, "logps/rejected": -283.5495300292969, "loss": 0.545, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6908639669418335, "rewards/margins": 0.6248513460159302, "rewards/rejected": -2.3157153129577637, "step": 10880 }, { "epoch": 1.8762922122674017, "grad_norm": 36.295127868652344, "learning_rate": 7.397933746711603e-08, "logits/chosen": -2.129281997680664, "logits/rejected": -2.1052193641662598, "logps/chosen": -233.7235870361328, "logps/rejected": -286.4725341796875, "loss": 0.5873, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7942317724227905, "rewards/margins": 0.5287545919418335, "rewards/rejected": -2.322986602783203, "step": 10890 }, { "epoch": 1.8780151619572707, "grad_norm": 42.38193893432617, "learning_rate": 7.378581145563709e-08, "logits/chosen": -2.161618709564209, "logits/rejected": -2.1121408939361572, "logps/chosen": -225.397705078125, "logps/rejected": -281.1845397949219, "loss": 0.5398, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6846339702606201, "rewards/margins": 0.6255574822425842, "rewards/rejected": -2.3101913928985596, "step": 10900 }, { "epoch": 1.8797381116471399, "grad_norm": 55.0797119140625, "learning_rate": 7.35923908095595e-08, "logits/chosen": -2.078524112701416, "logits/rejected": -2.0375468730926514, "logps/chosen": -220.0901641845703, "logps/rejected": -267.397705078125, "loss": 0.5922, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6298377513885498, "rewards/margins": 0.5042181015014648, "rewards/rejected": -2.1340558528900146, "step": 10910 }, { "epoch": 1.881461061337009, "grad_norm": 34.892669677734375, "learning_rate": 7.339907630631886e-08, "logits/chosen": -2.0853939056396484, "logits/rejected": -2.045135736465454, "logps/chosen": -213.90255737304688, "logps/rejected": -266.9146728515625, "loss": 0.5396, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5748045444488525, "rewards/margins": 0.5764104723930359, "rewards/rejected": -2.151215076446533, "step": 10920 }, { "epoch": 1.883184011026878, "grad_norm": 35.38461685180664, "learning_rate": 7.320586872292413e-08, "logits/chosen": -2.049386739730835, "logits/rejected": -2.015204429626465, "logps/chosen": -212.7605743408203, "logps/rejected": -256.4028015136719, "loss": 0.5719, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5758640766143799, "rewards/margins": 0.4759438931941986, "rewards/rejected": -2.0518081188201904, "step": 10930 }, { "epoch": 1.884906960716747, "grad_norm": 41.828216552734375, "learning_rate": 7.301276883595463e-08, "logits/chosen": -2.048062562942505, "logits/rejected": -2.019521713256836, "logps/chosen": -213.22213745117188, "logps/rejected": -262.60064697265625, "loss": 0.5598, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5625088214874268, "rewards/margins": 0.5233115553855896, "rewards/rejected": -2.085820436477661, "step": 10940 }, { "epoch": 1.886629910406616, "grad_norm": 38.982906341552734, "learning_rate": 7.281977742155669e-08, "logits/chosen": -2.1038460731506348, "logits/rejected": -2.074618101119995, "logps/chosen": -204.53701782226562, "logps/rejected": -253.8427276611328, "loss": 0.553, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4922102689743042, "rewards/margins": 0.5051702260971069, "rewards/rejected": -1.9973804950714111, "step": 10950 }, { "epoch": 1.8883528600964852, "grad_norm": 35.97544479370117, "learning_rate": 7.262689525544067e-08, "logits/chosen": -2.1165997982025146, "logits/rejected": -2.082392692565918, "logps/chosen": -208.37155151367188, "logps/rejected": -262.97265625, "loss": 0.5808, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.541710615158081, "rewards/margins": 0.5694451332092285, "rewards/rejected": -2.1111557483673096, "step": 10960 }, { "epoch": 1.8900758097863544, "grad_norm": 35.33456802368164, "learning_rate": 7.243412311287782e-08, "logits/chosen": -2.138899087905884, "logits/rejected": -2.094754457473755, "logps/chosen": -197.99417114257812, "logps/rejected": -256.3907775878906, "loss": 0.5568, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4036202430725098, "rewards/margins": 0.6245434880256653, "rewards/rejected": -2.0281636714935303, "step": 10970 }, { "epoch": 1.8917987594762233, "grad_norm": 47.53961181640625, "learning_rate": 7.224146176869717e-08, "logits/chosen": -2.1551897525787354, "logits/rejected": -2.116971492767334, "logps/chosen": -203.69459533691406, "logps/rejected": -265.29327392578125, "loss": 0.5628, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.477172613143921, "rewards/margins": 0.6235934495925903, "rewards/rejected": -2.100766181945801, "step": 10980 }, { "epoch": 1.8935217091660923, "grad_norm": 31.756277084350586, "learning_rate": 7.204891199728241e-08, "logits/chosen": -2.0958075523376465, "logits/rejected": -2.0669543743133545, "logps/chosen": -186.14865112304688, "logps/rejected": -236.04080200195312, "loss": 0.5647, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3263320922851562, "rewards/margins": 0.5096399188041687, "rewards/rejected": -1.8359720706939697, "step": 10990 }, { "epoch": 1.8952446588559613, "grad_norm": 30.142967224121094, "learning_rate": 7.185647457256879e-08, "logits/chosen": -2.2401175498962402, "logits/rejected": -2.2189393043518066, "logps/chosen": -189.11007690429688, "logps/rejected": -246.180419921875, "loss": 0.5703, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3603506088256836, "rewards/margins": 0.542121410369873, "rewards/rejected": -1.902471899986267, "step": 11000 }, { "epoch": 1.8969676085458305, "grad_norm": 37.16350173950195, "learning_rate": 7.166415026803991e-08, "logits/chosen": -2.147042751312256, "logits/rejected": -2.098593235015869, "logps/chosen": -193.9458770751953, "logps/rejected": -260.72247314453125, "loss": 0.506, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3998019695281982, "rewards/margins": 0.7031914591789246, "rewards/rejected": -2.1029934883117676, "step": 11010 }, { "epoch": 1.8986905582356997, "grad_norm": 25.575349807739258, "learning_rate": 7.147193985672477e-08, "logits/chosen": -2.162492275238037, "logits/rejected": -2.124326467514038, "logps/chosen": -208.29403686523438, "logps/rejected": -263.257568359375, "loss": 0.553, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.522696852684021, "rewards/margins": 0.5898150205612183, "rewards/rejected": -2.1125118732452393, "step": 11020 }, { "epoch": 1.9004135079255686, "grad_norm": 32.54864501953125, "learning_rate": 7.127984411119461e-08, "logits/chosen": -2.151383876800537, "logits/rejected": -2.1169238090515137, "logps/chosen": -231.1976318359375, "logps/rejected": -291.98944091796875, "loss": 0.5982, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7518266439437866, "rewards/margins": 0.6365460157394409, "rewards/rejected": -2.3883728981018066, "step": 11030 }, { "epoch": 1.9021364576154376, "grad_norm": 27.272384643554688, "learning_rate": 7.108786380355971e-08, "logits/chosen": -2.161572217941284, "logits/rejected": -2.126248836517334, "logps/chosen": -217.8511199951172, "logps/rejected": -274.0556640625, "loss": 0.5472, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5999891757965088, "rewards/margins": 0.5932620167732239, "rewards/rejected": -2.193251132965088, "step": 11040 }, { "epoch": 1.9038594073053066, "grad_norm": 33.55147933959961, "learning_rate": 7.089599970546642e-08, "logits/chosen": -2.104125499725342, "logits/rejected": -2.0692391395568848, "logps/chosen": -203.26809692382812, "logps/rejected": -269.63470458984375, "loss": 0.5176, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5091619491577148, "rewards/margins": 0.6466538906097412, "rewards/rejected": -2.155815839767456, "step": 11050 }, { "epoch": 1.9055823569951758, "grad_norm": 30.785818099975586, "learning_rate": 7.070425258809394e-08, "logits/chosen": -2.0970897674560547, "logits/rejected": -2.031811237335205, "logps/chosen": -225.00997924804688, "logps/rejected": -298.5378723144531, "loss": 0.5053, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6961663961410522, "rewards/margins": 0.778182864189148, "rewards/rejected": -2.474349021911621, "step": 11060 }, { "epoch": 1.907305306685045, "grad_norm": 37.45442199707031, "learning_rate": 7.051262322215128e-08, "logits/chosen": -2.1341097354888916, "logits/rejected": -2.072995662689209, "logps/chosen": -230.6807098388672, "logps/rejected": -307.604248046875, "loss": 0.4812, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7601606845855713, "rewards/margins": 0.8273337483406067, "rewards/rejected": -2.587494373321533, "step": 11070 }, { "epoch": 1.909028256374914, "grad_norm": 26.271894454956055, "learning_rate": 7.032111237787424e-08, "logits/chosen": -2.0926690101623535, "logits/rejected": -2.053788661956787, "logps/chosen": -246.2302703857422, "logps/rejected": -309.3777160644531, "loss": 0.5356, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8914282321929932, "rewards/margins": 0.686504602432251, "rewards/rejected": -2.577932834625244, "step": 11080 }, { "epoch": 1.9107512060647829, "grad_norm": 36.41754913330078, "learning_rate": 7.01297208250222e-08, "logits/chosen": -2.080683946609497, "logits/rejected": -2.040173053741455, "logps/chosen": -246.9329833984375, "logps/rejected": -303.186767578125, "loss": 0.5431, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8879804611206055, "rewards/margins": 0.6224623918533325, "rewards/rejected": -2.5104429721832275, "step": 11090 }, { "epoch": 1.9124741557546519, "grad_norm": 34.7519645690918, "learning_rate": 6.993844933287496e-08, "logits/chosen": -2.0179200172424316, "logits/rejected": -1.9861838817596436, "logps/chosen": -230.6690673828125, "logps/rejected": -291.9774475097656, "loss": 0.5703, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7871367931365967, "rewards/margins": 0.6063834428787231, "rewards/rejected": -2.3935203552246094, "step": 11100 }, { "epoch": 1.914197105444521, "grad_norm": 34.446861267089844, "learning_rate": 6.974729867022989e-08, "logits/chosen": -2.06170392036438, "logits/rejected": -2.01041841506958, "logps/chosen": -250.8234100341797, "logps/rejected": -332.9823303222656, "loss": 0.5677, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.9623744487762451, "rewards/margins": 0.8461829423904419, "rewards/rejected": -2.8085572719573975, "step": 11110 }, { "epoch": 1.9159200551343902, "grad_norm": 35.57859802246094, "learning_rate": 6.955626960539855e-08, "logits/chosen": -2.165560245513916, "logits/rejected": -2.1220479011535645, "logps/chosen": -229.8457794189453, "logps/rejected": -289.033203125, "loss": 0.5335, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6932449340820312, "rewards/margins": 0.6686538457870483, "rewards/rejected": -2.36189866065979, "step": 11120 }, { "epoch": 1.9176430048242592, "grad_norm": 52.480201721191406, "learning_rate": 6.936536290620393e-08, "logits/chosen": -2.066098690032959, "logits/rejected": -2.018115520477295, "logps/chosen": -226.6170196533203, "logps/rejected": -289.1579284667969, "loss": 0.5292, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7019894123077393, "rewards/margins": 0.6620699167251587, "rewards/rejected": -2.3640592098236084, "step": 11130 }, { "epoch": 1.9193659545141282, "grad_norm": 42.84165573120117, "learning_rate": 6.917457933997706e-08, "logits/chosen": -2.01469349861145, "logits/rejected": -1.9801337718963623, "logps/chosen": -226.46005249023438, "logps/rejected": -282.3187255859375, "loss": 0.5586, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7090473175048828, "rewards/margins": 0.6025687456130981, "rewards/rejected": -2.3116161823272705, "step": 11140 }, { "epoch": 1.9210889042039971, "grad_norm": 39.90765380859375, "learning_rate": 6.898391967355405e-08, "logits/chosen": -2.1024584770202637, "logits/rejected": -2.076080799102783, "logps/chosen": -231.018310546875, "logps/rejected": -283.4735412597656, "loss": 0.5776, "rewards/accuracies": 0.71875, "rewards/chosen": -1.722633719444275, "rewards/margins": 0.5566567778587341, "rewards/rejected": -2.2792906761169434, "step": 11150 }, { "epoch": 1.9228118538938663, "grad_norm": 52.43754577636719, "learning_rate": 6.879338467327302e-08, "logits/chosen": -2.14897084236145, "logits/rejected": -2.1171956062316895, "logps/chosen": -208.48263549804688, "logps/rejected": -254.87289428710938, "loss": 0.5868, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5512667894363403, "rewards/margins": 0.4904497265815735, "rewards/rejected": -2.0417165756225586, "step": 11160 }, { "epoch": 1.9245348035837355, "grad_norm": 38.62773895263672, "learning_rate": 6.860297510497104e-08, "logits/chosen": -2.0566763877868652, "logits/rejected": -2.0216500759124756, "logps/chosen": -211.69384765625, "logps/rejected": -248.56982421875, "loss": 0.588, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5310242176055908, "rewards/margins": 0.46797189116477966, "rewards/rejected": -1.9989961385726929, "step": 11170 }, { "epoch": 1.9262577532736045, "grad_norm": 32.62820053100586, "learning_rate": 6.841269173398107e-08, "logits/chosen": -2.1622769832611084, "logits/rejected": -2.1254467964172363, "logps/chosen": -216.33853149414062, "logps/rejected": -272.8714294433594, "loss": 0.5488, "rewards/accuracies": 0.6875, "rewards/chosen": -1.611806869506836, "rewards/margins": 0.610049307346344, "rewards/rejected": -2.221856117248535, "step": 11180 }, { "epoch": 1.9279807029634735, "grad_norm": 48.73835372924805, "learning_rate": 6.82225353251286e-08, "logits/chosen": -2.1593518257141113, "logits/rejected": -2.109025239944458, "logps/chosen": -198.0034637451172, "logps/rejected": -267.95458984375, "loss": 0.4723, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.420243263244629, "rewards/margins": 0.7498121857643127, "rewards/rejected": -2.170055389404297, "step": 11190 }, { "epoch": 1.9297036526533424, "grad_norm": 36.46852493286133, "learning_rate": 6.80325066427291e-08, "logits/chosen": -2.1360020637512207, "logits/rejected": -2.0926783084869385, "logps/chosen": -210.9085693359375, "logps/rejected": -271.9500732421875, "loss": 0.5384, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5630364418029785, "rewards/margins": 0.622287929058075, "rewards/rejected": -2.1853244304656982, "step": 11200 }, { "epoch": 1.9297036526533424, "eval_logits/chosen": -2.150330066680908, "eval_logits/rejected": -2.129934072494507, "eval_logps/chosen": -223.87203979492188, "eval_logps/rejected": -257.14404296875, "eval_loss": 0.6418474912643433, "eval_rewards/accuracies": 0.6363847851753235, "eval_rewards/chosen": -1.648565649986267, "eval_rewards/margins": 0.2953791320323944, "eval_rewards/rejected": -1.9439448118209839, "eval_runtime": 383.1567, "eval_samples_per_second": 11.233, "eval_steps_per_second": 1.404, "step": 11200 }, { "epoch": 1.9314266023432116, "grad_norm": 38.86859130859375, "learning_rate": 6.784260645058445e-08, "logits/chosen": -2.0044658184051514, "logits/rejected": -1.9595401287078857, "logps/chosen": -239.3448028564453, "logps/rejected": -320.5731506347656, "loss": 0.5335, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8804552555084229, "rewards/margins": 0.7997304201126099, "rewards/rejected": -2.6801857948303223, "step": 11210 }, { "epoch": 1.9331495520330806, "grad_norm": 52.32000732421875, "learning_rate": 6.765283551198016e-08, "logits/chosen": -2.0517823696136475, "logits/rejected": -2.023305892944336, "logps/chosen": -250.20114135742188, "logps/rejected": -312.5027770996094, "loss": 0.6199, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9770643711090088, "rewards/margins": 0.5979080200195312, "rewards/rejected": -2.574972629547119, "step": 11220 }, { "epoch": 1.9348725017229498, "grad_norm": 30.262535095214844, "learning_rate": 6.746319458968226e-08, "logits/chosen": -2.0434021949768066, "logits/rejected": -2.012618064880371, "logps/chosen": -255.679931640625, "logps/rejected": -310.25408935546875, "loss": 0.5841, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.982190728187561, "rewards/margins": 0.6203389167785645, "rewards/rejected": -2.602529764175415, "step": 11230 }, { "epoch": 1.9365954514128187, "grad_norm": 31.0468807220459, "learning_rate": 6.727368444593408e-08, "logits/chosen": -2.082054376602173, "logits/rejected": -2.0466256141662598, "logps/chosen": -224.1901397705078, "logps/rejected": -274.36688232421875, "loss": 0.5864, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6886026859283447, "rewards/margins": 0.5320411920547485, "rewards/rejected": -2.220643997192383, "step": 11240 }, { "epoch": 1.9383184011026877, "grad_norm": 30.541744232177734, "learning_rate": 6.708430584245337e-08, "logits/chosen": -2.110844373703003, "logits/rejected": -2.0703446865081787, "logps/chosen": -216.69485473632812, "logps/rejected": -277.73150634765625, "loss": 0.5383, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6169925928115845, "rewards/margins": 0.6295633316040039, "rewards/rejected": -2.246555805206299, "step": 11250 }, { "epoch": 1.940041350792557, "grad_norm": 31.27999496459961, "learning_rate": 6.689505954042913e-08, "logits/chosen": -2.0711936950683594, "logits/rejected": -2.0286784172058105, "logps/chosen": -218.43081665039062, "logps/rejected": -267.02716064453125, "loss": 0.5712, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5965648889541626, "rewards/margins": 0.5688450932502747, "rewards/rejected": -2.165409803390503, "step": 11260 }, { "epoch": 1.9417643004824259, "grad_norm": 43.0255126953125, "learning_rate": 6.67059463005187e-08, "logits/chosen": -2.0696632862091064, "logits/rejected": -2.0213420391082764, "logps/chosen": -209.5532989501953, "logps/rejected": -272.81805419921875, "loss": 0.5049, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.532209038734436, "rewards/margins": 0.69169682264328, "rewards/rejected": -2.2239060401916504, "step": 11270 }, { "epoch": 1.943487250172295, "grad_norm": 28.406047821044922, "learning_rate": 6.651696688284438e-08, "logits/chosen": -2.106395959854126, "logits/rejected": -2.059577703475952, "logps/chosen": -232.7011260986328, "logps/rejected": -284.5666198730469, "loss": 0.5597, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7567068338394165, "rewards/margins": 0.5732653737068176, "rewards/rejected": -2.329972505569458, "step": 11280 }, { "epoch": 1.945210199862164, "grad_norm": 37.7677116394043, "learning_rate": 6.632812204699077e-08, "logits/chosen": -2.094848871231079, "logits/rejected": -2.055690288543701, "logps/chosen": -231.9086456298828, "logps/rejected": -297.62689208984375, "loss": 0.5308, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7121360301971436, "rewards/margins": 0.7090049982070923, "rewards/rejected": -2.4211411476135254, "step": 11290 }, { "epoch": 1.946933149552033, "grad_norm": 43.34477233886719, "learning_rate": 6.613941255200147e-08, "logits/chosen": -2.0163540840148926, "logits/rejected": -1.9858640432357788, "logps/chosen": -264.2913513183594, "logps/rejected": -301.52703857421875, "loss": 0.6495, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.0428531169891357, "rewards/margins": 0.442251056432724, "rewards/rejected": -2.4851040840148926, "step": 11300 }, { "epoch": 1.948656099241902, "grad_norm": 47.41953659057617, "learning_rate": 6.595083915637602e-08, "logits/chosen": -2.127748966217041, "logits/rejected": -2.0928280353546143, "logps/chosen": -237.9222412109375, "logps/rejected": -310.1321716308594, "loss": 0.5151, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.854783058166504, "rewards/margins": 0.7411599159240723, "rewards/rejected": -2.595942974090576, "step": 11310 }, { "epoch": 1.9503790489317712, "grad_norm": 32.63195037841797, "learning_rate": 6.576240261806711e-08, "logits/chosen": -2.0840084552764893, "logits/rejected": -2.041901111602783, "logps/chosen": -236.044677734375, "logps/rejected": -304.3868103027344, "loss": 0.5322, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7976115942001343, "rewards/margins": 0.7138528227806091, "rewards/rejected": -2.5114643573760986, "step": 11320 }, { "epoch": 1.9521019986216404, "grad_norm": 81.51802062988281, "learning_rate": 6.557410369447712e-08, "logits/chosen": -2.0825531482696533, "logits/rejected": -2.0385289192199707, "logps/chosen": -242.2621307373047, "logps/rejected": -308.0596008300781, "loss": 0.5261, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8813930749893188, "rewards/margins": 0.7215787172317505, "rewards/rejected": -2.6029715538024902, "step": 11330 }, { "epoch": 1.9538249483115093, "grad_norm": 43.0372200012207, "learning_rate": 6.538594314245541e-08, "logits/chosen": -2.0444424152374268, "logits/rejected": -1.997079849243164, "logps/chosen": -236.3446502685547, "logps/rejected": -302.52606201171875, "loss": 0.5421, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7998119592666626, "rewards/margins": 0.6873337626457214, "rewards/rejected": -2.4871456623077393, "step": 11340 }, { "epoch": 1.9555478980013783, "grad_norm": 46.32229995727539, "learning_rate": 6.51979217182952e-08, "logits/chosen": -2.1301732063293457, "logits/rejected": -2.0888075828552246, "logps/chosen": -244.2246856689453, "logps/rejected": -303.2362365722656, "loss": 0.5816, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.858077049255371, "rewards/margins": 0.6253143548965454, "rewards/rejected": -2.483391284942627, "step": 11350 }, { "epoch": 1.9572708476912473, "grad_norm": 35.97012710571289, "learning_rate": 6.501004017773049e-08, "logits/chosen": -2.0921897888183594, "logits/rejected": -2.056212902069092, "logps/chosen": -232.33438110351562, "logps/rejected": -288.73779296875, "loss": 0.5435, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7822563648223877, "rewards/margins": 0.6243523955345154, "rewards/rejected": -2.406608819961548, "step": 11360 }, { "epoch": 1.9589937973811165, "grad_norm": 33.192787170410156, "learning_rate": 6.482229927593292e-08, "logits/chosen": -2.087029457092285, "logits/rejected": -2.0539119243621826, "logps/chosen": -225.0400848388672, "logps/rejected": -280.0304260253906, "loss": 0.5488, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.682896614074707, "rewards/margins": 0.5910366177558899, "rewards/rejected": -2.273932933807373, "step": 11370 }, { "epoch": 1.9607167470709856, "grad_norm": 47.09540557861328, "learning_rate": 6.463469976750894e-08, "logits/chosen": -2.0447041988372803, "logits/rejected": -2.00447154045105, "logps/chosen": -225.29141235351562, "logps/rejected": -284.82025146484375, "loss": 0.5638, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7298924922943115, "rewards/margins": 0.6187609434127808, "rewards/rejected": -2.3486533164978027, "step": 11380 }, { "epoch": 1.9624396967608546, "grad_norm": 35.9467887878418, "learning_rate": 6.444724240649674e-08, "logits/chosen": -2.1504344940185547, "logits/rejected": -2.1013474464416504, "logps/chosen": -203.40408325195312, "logps/rejected": -275.85430908203125, "loss": 0.5266, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4995545148849487, "rewards/margins": 0.7154166102409363, "rewards/rejected": -2.2149710655212402, "step": 11390 }, { "epoch": 1.9641626464507236, "grad_norm": 34.19363784790039, "learning_rate": 6.425992794636305e-08, "logits/chosen": -2.1202802658081055, "logits/rejected": -2.081855535507202, "logps/chosen": -209.0074462890625, "logps/rejected": -267.097900390625, "loss": 0.5436, "rewards/accuracies": 0.71875, "rewards/chosen": -1.522707223892212, "rewards/margins": 0.6264538764953613, "rewards/rejected": -2.149160861968994, "step": 11400 }, { "epoch": 1.9658855961405926, "grad_norm": 33.694339752197266, "learning_rate": 6.407275714000029e-08, "logits/chosen": -2.1229145526885986, "logits/rejected": -2.084303140640259, "logps/chosen": -214.90896606445312, "logps/rejected": -272.10284423828125, "loss": 0.5436, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5856306552886963, "rewards/margins": 0.5950139164924622, "rewards/rejected": -2.1806445121765137, "step": 11410 }, { "epoch": 1.9676085458304617, "grad_norm": 25.552141189575195, "learning_rate": 6.388573073972341e-08, "logits/chosen": -2.104806661605835, "logits/rejected": -2.069352626800537, "logps/chosen": -215.0218505859375, "logps/rejected": -257.0823669433594, "loss": 0.5875, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6048667430877686, "rewards/margins": 0.4714290201663971, "rewards/rejected": -2.0762956142425537, "step": 11420 }, { "epoch": 1.969331495520331, "grad_norm": 31.175952911376953, "learning_rate": 6.3698849497267e-08, "logits/chosen": -2.075159788131714, "logits/rejected": -2.0389952659606934, "logps/chosen": -196.71681213378906, "logps/rejected": -245.8275909423828, "loss": 0.5744, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4119598865509033, "rewards/margins": 0.5342273712158203, "rewards/rejected": -1.9461870193481445, "step": 11430 }, { "epoch": 1.9710544452102, "grad_norm": 32.55466079711914, "learning_rate": 6.351211416378221e-08, "logits/chosen": -2.113795757293701, "logits/rejected": -2.0957531929016113, "logps/chosen": -195.7284393310547, "logps/rejected": -235.590576171875, "loss": 0.5862, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4285451173782349, "rewards/margins": 0.40397801995277405, "rewards/rejected": -1.8325231075286865, "step": 11440 }, { "epoch": 1.9727773949000689, "grad_norm": 30.875164031982422, "learning_rate": 6.332552548983368e-08, "logits/chosen": -2.1119213104248047, "logits/rejected": -2.0720133781433105, "logps/chosen": -200.646484375, "logps/rejected": -256.79559326171875, "loss": 0.5535, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4703338146209717, "rewards/margins": 0.5860790014266968, "rewards/rejected": -2.056412935256958, "step": 11450 }, { "epoch": 1.9745003445899378, "grad_norm": 30.871906280517578, "learning_rate": 6.313908422539656e-08, "logits/chosen": -2.1062095165252686, "logits/rejected": -2.06668758392334, "logps/chosen": -199.67001342773438, "logps/rejected": -259.66522216796875, "loss": 0.5361, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4358268976211548, "rewards/margins": 0.6267709732055664, "rewards/rejected": -2.0625977516174316, "step": 11460 }, { "epoch": 1.976223294279807, "grad_norm": 28.920886993408203, "learning_rate": 6.295279111985354e-08, "logits/chosen": -2.1364591121673584, "logits/rejected": -2.075352430343628, "logps/chosen": -216.4774627685547, "logps/rejected": -280.35980224609375, "loss": 0.5025, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5842845439910889, "rewards/margins": 0.7140352725982666, "rewards/rejected": -2.2983195781707764, "step": 11470 }, { "epoch": 1.9779462439696762, "grad_norm": 35.6077766418457, "learning_rate": 6.276664692199175e-08, "logits/chosen": -2.1571812629699707, "logits/rejected": -2.110039234161377, "logps/chosen": -202.0142364501953, "logps/rejected": -247.08834838867188, "loss": 0.5817, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.444929838180542, "rewards/margins": 0.5276190042495728, "rewards/rejected": -1.9725488424301147, "step": 11480 }, { "epoch": 1.9796691936595452, "grad_norm": 29.205793380737305, "learning_rate": 6.258065237999988e-08, "logits/chosen": -2.1270751953125, "logits/rejected": -2.0870296955108643, "logps/chosen": -210.76516723632812, "logps/rejected": -252.1230926513672, "loss": 0.5994, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.548182487487793, "rewards/margins": 0.4664430618286133, "rewards/rejected": -2.0146255493164062, "step": 11490 }, { "epoch": 1.9813921433494142, "grad_norm": 35.74567794799805, "learning_rate": 6.239480824146503e-08, "logits/chosen": -2.148181438446045, "logits/rejected": -2.1178340911865234, "logps/chosen": -193.29258728027344, "logps/rejected": -232.23593139648438, "loss": 0.596, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4171593189239502, "rewards/margins": 0.4054163098335266, "rewards/rejected": -1.822575569152832, "step": 11500 }, { "epoch": 1.9831150930392831, "grad_norm": 41.26714324951172, "learning_rate": 6.220911525336977e-08, "logits/chosen": -2.104710578918457, "logits/rejected": -2.073302745819092, "logps/chosen": -203.44204711914062, "logps/rejected": -262.4302062988281, "loss": 0.5674, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.471699595451355, "rewards/margins": 0.6327524781227112, "rewards/rejected": -2.104451894760132, "step": 11510 }, { "epoch": 1.9848380427291523, "grad_norm": 37.78926086425781, "learning_rate": 6.202357416208911e-08, "logits/chosen": -2.152705669403076, "logits/rejected": -2.0834972858428955, "logps/chosen": -195.06761169433594, "logps/rejected": -245.7989959716797, "loss": 0.5314, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3396432399749756, "rewards/margins": 0.6214916706085205, "rewards/rejected": -1.9611351490020752, "step": 11520 }, { "epoch": 1.9865609924190215, "grad_norm": 24.8828067779541, "learning_rate": 6.183818571338766e-08, "logits/chosen": -2.119786024093628, "logits/rejected": -2.073561191558838, "logps/chosen": -188.87936401367188, "logps/rejected": -236.6806640625, "loss": 0.5444, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3401225805282593, "rewards/margins": 0.5436755418777466, "rewards/rejected": -1.8837982416152954, "step": 11530 }, { "epoch": 1.9882839421088905, "grad_norm": 32.03831481933594, "learning_rate": 6.165295065241633e-08, "logits/chosen": -2.1403820514678955, "logits/rejected": -2.1111044883728027, "logps/chosen": -199.05844116210938, "logps/rejected": -277.83038330078125, "loss": 0.5031, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4719569683074951, "rewards/margins": 0.7576948404312134, "rewards/rejected": -2.229651689529419, "step": 11540 }, { "epoch": 1.9900068917987594, "grad_norm": 36.63600158691406, "learning_rate": 6.146786972370959e-08, "logits/chosen": -2.069815158843994, "logits/rejected": -2.041349411010742, "logps/chosen": -222.26174926757812, "logps/rejected": -275.2193908691406, "loss": 0.5902, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.698169469833374, "rewards/margins": 0.5312357544898987, "rewards/rejected": -2.229405164718628, "step": 11550 }, { "epoch": 1.9917298414886284, "grad_norm": 33.422481536865234, "learning_rate": 6.128294367118237e-08, "logits/chosen": -2.1030850410461426, "logits/rejected": -2.0553946495056152, "logps/chosen": -223.8405303955078, "logps/rejected": -285.63922119140625, "loss": 0.5378, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6755192279815674, "rewards/margins": 0.676115870475769, "rewards/rejected": -2.3516347408294678, "step": 11560 }, { "epoch": 1.9934527911784976, "grad_norm": 37.54303741455078, "learning_rate": 6.109817323812706e-08, "logits/chosen": -2.1160786151885986, "logits/rejected": -2.0797441005706787, "logps/chosen": -232.2115936279297, "logps/rejected": -290.12860107421875, "loss": 0.5729, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7748934030532837, "rewards/margins": 0.579551100730896, "rewards/rejected": -2.3544440269470215, "step": 11570 }, { "epoch": 1.9951757408683668, "grad_norm": 36.5296745300293, "learning_rate": 6.091355916721064e-08, "logits/chosen": -2.1580021381378174, "logits/rejected": -2.118701219558716, "logps/chosen": -219.04708862304688, "logps/rejected": -277.0885314941406, "loss": 0.5636, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.618523359298706, "rewards/margins": 0.607399582862854, "rewards/rejected": -2.2259230613708496, "step": 11580 }, { "epoch": 1.9968986905582358, "grad_norm": 40.546112060546875, "learning_rate": 6.072910220047159e-08, "logits/chosen": -2.062598943710327, "logits/rejected": -2.0163540840148926, "logps/chosen": -213.2329864501953, "logps/rejected": -254.9839324951172, "loss": 0.5645, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5311720371246338, "rewards/margins": 0.5165271759033203, "rewards/rejected": -2.047699451446533, "step": 11590 }, { "epoch": 1.9986216402481047, "grad_norm": 36.788360595703125, "learning_rate": 6.054480307931678e-08, "logits/chosen": -2.1328606605529785, "logits/rejected": -2.103621244430542, "logps/chosen": -197.7500457763672, "logps/rejected": -250.41305541992188, "loss": 0.5734, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4637445211410522, "rewards/margins": 0.512778103351593, "rewards/rejected": -1.97652268409729, "step": 11600 }, { "epoch": 1.9986216402481047, "eval_logits/chosen": -2.1813366413116455, "eval_logits/rejected": -2.162360429763794, "eval_logps/chosen": -202.5782012939453, "eval_logps/rejected": -233.75628662109375, "eval_loss": 0.6378381252288818, "eval_rewards/accuracies": 0.6361523866653442, "eval_rewards/chosen": -1.4356271028518677, "eval_rewards/margins": 0.27443984150886536, "eval_rewards/rejected": -1.710066795349121, "eval_runtime": 383.3871, "eval_samples_per_second": 11.226, "eval_steps_per_second": 1.403, "step": 11600 }, { "epoch": 2.0003445899379737, "grad_norm": 38.874786376953125, "learning_rate": 6.036066254451881e-08, "logits/chosen": -2.080693244934082, "logits/rejected": -2.040472984313965, "logps/chosen": -205.5887908935547, "logps/rejected": -267.6993713378906, "loss": 0.5544, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5286204814910889, "rewards/margins": 0.6358743906021118, "rewards/rejected": -2.1644949913024902, "step": 11610 }, { "epoch": 2.0020675396278427, "grad_norm": 42.31683349609375, "learning_rate": 6.017668133621275e-08, "logits/chosen": -2.1189987659454346, "logits/rejected": -2.0837512016296387, "logps/chosen": -216.97024536132812, "logps/rejected": -276.1315002441406, "loss": 0.5511, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5971972942352295, "rewards/margins": 0.6041030883789062, "rewards/rejected": -2.2013003826141357, "step": 11620 }, { "epoch": 2.003790489317712, "grad_norm": 35.677406311035156, "learning_rate": 5.999286019389342e-08, "logits/chosen": -2.1324095726013184, "logits/rejected": -2.083679676055908, "logps/chosen": -208.1287384033203, "logps/rejected": -270.8012390136719, "loss": 0.5135, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.4944071769714355, "rewards/margins": 0.689971923828125, "rewards/rejected": -2.1843791007995605, "step": 11630 }, { "epoch": 2.005513439007581, "grad_norm": 26.854320526123047, "learning_rate": 5.980919985641202e-08, "logits/chosen": -2.13673734664917, "logits/rejected": -2.0990772247314453, "logps/chosen": -206.71560668945312, "logps/rejected": -269.2923889160156, "loss": 0.5106, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5317871570587158, "rewards/margins": 0.6630698442459106, "rewards/rejected": -2.194857120513916, "step": 11640 }, { "epoch": 2.00723638869745, "grad_norm": 45.685150146484375, "learning_rate": 5.962570106197364e-08, "logits/chosen": -2.0522255897521973, "logits/rejected": -1.999353051185608, "logps/chosen": -206.79458618164062, "logps/rejected": -270.4117126464844, "loss": 0.5254, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.514573574066162, "rewards/margins": 0.6830571293830872, "rewards/rejected": -2.1976304054260254, "step": 11650 }, { "epoch": 2.008959338387319, "grad_norm": 28.480182647705078, "learning_rate": 5.944236454813396e-08, "logits/chosen": -2.123844623565674, "logits/rejected": -2.067272186279297, "logps/chosen": -233.8195037841797, "logps/rejected": -292.9638366699219, "loss": 0.5361, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7744499444961548, "rewards/margins": 0.665165364742279, "rewards/rejected": -2.439615488052368, "step": 11660 }, { "epoch": 2.010682288077188, "grad_norm": 39.119380950927734, "learning_rate": 5.9259191051796375e-08, "logits/chosen": -2.0889270305633545, "logits/rejected": -2.028006076812744, "logps/chosen": -230.15066528320312, "logps/rejected": -327.9476623535156, "loss": 0.4392, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7356784343719482, "rewards/margins": 1.015694499015808, "rewards/rejected": -2.751373291015625, "step": 11670 }, { "epoch": 2.0124052377670574, "grad_norm": 42.146400451660156, "learning_rate": 5.907618130920919e-08, "logits/chosen": -2.0312490463256836, "logits/rejected": -2.000492572784424, "logps/chosen": -240.016357421875, "logps/rejected": -307.0270690917969, "loss": 0.558, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8243763446807861, "rewards/margins": 0.6948267221450806, "rewards/rejected": -2.519202947616577, "step": 11680 }, { "epoch": 2.0141281874569263, "grad_norm": 42.349021911621094, "learning_rate": 5.8893336055962254e-08, "logits/chosen": -2.1421942710876465, "logits/rejected": -2.0884110927581787, "logps/chosen": -228.0297088623047, "logps/rejected": -291.3267517089844, "loss": 0.5139, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6707584857940674, "rewards/margins": 0.7115265130996704, "rewards/rejected": -2.3822848796844482, "step": 11690 }, { "epoch": 2.0158511371467953, "grad_norm": 30.39948272705078, "learning_rate": 5.871065602698451e-08, "logits/chosen": -2.1972460746765137, "logits/rejected": -2.1512832641601562, "logps/chosen": -199.59202575683594, "logps/rejected": -259.33209228515625, "loss": 0.5424, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.422917127609253, "rewards/margins": 0.6099113821983337, "rewards/rejected": -2.0328285694122314, "step": 11700 }, { "epoch": 2.0175740868366643, "grad_norm": 39.6872444152832, "learning_rate": 5.852814195654068e-08, "logits/chosen": -2.116191864013672, "logits/rejected": -2.066092014312744, "logps/chosen": -213.960205078125, "logps/rejected": -273.5892028808594, "loss": 0.5265, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5808203220367432, "rewards/margins": 0.6322665810585022, "rewards/rejected": -2.2130866050720215, "step": 11710 }, { "epoch": 2.0192970365265333, "grad_norm": 34.95515823364258, "learning_rate": 5.834579457822848e-08, "logits/chosen": -2.1429433822631836, "logits/rejected": -2.1109375953674316, "logps/chosen": -223.36117553710938, "logps/rejected": -280.72528076171875, "loss": 0.5774, "rewards/accuracies": 0.6875, "rewards/chosen": -1.681788444519043, "rewards/margins": 0.6002216935157776, "rewards/rejected": -2.2820098400115967, "step": 11720 }, { "epoch": 2.0210199862164027, "grad_norm": 32.278526306152344, "learning_rate": 5.81636146249756e-08, "logits/chosen": -2.0954790115356445, "logits/rejected": -2.060276508331299, "logps/chosen": -219.8001251220703, "logps/rejected": -285.2901916503906, "loss": 0.5431, "rewards/accuracies": 0.75, "rewards/chosen": -1.6424379348754883, "rewards/margins": 0.6421239376068115, "rewards/rejected": -2.2845618724823, "step": 11730 }, { "epoch": 2.0227429359062716, "grad_norm": 37.31665802001953, "learning_rate": 5.798160282903672e-08, "logits/chosen": -2.1379740238189697, "logits/rejected": -2.1029396057128906, "logps/chosen": -211.76272583007812, "logps/rejected": -270.9972839355469, "loss": 0.5506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5817822217941284, "rewards/margins": 0.605307400226593, "rewards/rejected": -2.187089681625366, "step": 11740 }, { "epoch": 2.0244658855961406, "grad_norm": 27.076473236083984, "learning_rate": 5.779975992199075e-08, "logits/chosen": -2.137813091278076, "logits/rejected": -2.105919122695923, "logps/chosen": -214.4691925048828, "logps/rejected": -299.862060546875, "loss": 0.4774, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6512985229492188, "rewards/margins": 0.8046330213546753, "rewards/rejected": -2.4559314250946045, "step": 11750 }, { "epoch": 2.0261888352860096, "grad_norm": 34.72412109375, "learning_rate": 5.761808663473775e-08, "logits/chosen": -2.1149559020996094, "logits/rejected": -2.0760014057159424, "logps/chosen": -237.0303192138672, "logps/rejected": -312.3663635253906, "loss": 0.5381, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.812771201133728, "rewards/margins": 0.7852569222450256, "rewards/rejected": -2.5980281829833984, "step": 11760 }, { "epoch": 2.0279117849758785, "grad_norm": 30.311351776123047, "learning_rate": 5.74365836974959e-08, "logits/chosen": -2.0622406005859375, "logits/rejected": -2.0181469917297363, "logps/chosen": -227.7604217529297, "logps/rejected": -292.30255126953125, "loss": 0.5248, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6945585012435913, "rewards/margins": 0.715693473815918, "rewards/rejected": -2.410252094268799, "step": 11770 }, { "epoch": 2.029634734665748, "grad_norm": 45.21958923339844, "learning_rate": 5.7255251839798726e-08, "logits/chosen": -2.038755178451538, "logits/rejected": -2.0028738975524902, "logps/chosen": -227.783203125, "logps/rejected": -288.4049072265625, "loss": 0.5598, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7753404378890991, "rewards/margins": 0.5917962789535522, "rewards/rejected": -2.3671367168426514, "step": 11780 }, { "epoch": 2.031357684355617, "grad_norm": 36.97776412963867, "learning_rate": 5.7074091790492206e-08, "logits/chosen": -2.0641019344329834, "logits/rejected": -2.0340189933776855, "logps/chosen": -220.64047241210938, "logps/rejected": -322.8470153808594, "loss": 0.425, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7013603448867798, "rewards/margins": 0.9990695714950562, "rewards/rejected": -2.700429916381836, "step": 11790 }, { "epoch": 2.033080634045486, "grad_norm": 34.968109130859375, "learning_rate": 5.6893104277731594e-08, "logits/chosen": -2.077822685241699, "logits/rejected": -2.0449321269989014, "logps/chosen": -257.3918762207031, "logps/rejected": -310.2939453125, "loss": 0.5721, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9884878396987915, "rewards/margins": 0.5862862467765808, "rewards/rejected": -2.5747742652893066, "step": 11800 }, { "epoch": 2.034803583735355, "grad_norm": 46.57944869995117, "learning_rate": 5.6712290028978815e-08, "logits/chosen": -2.075214385986328, "logits/rejected": -2.031886577606201, "logps/chosen": -256.56695556640625, "logps/rejected": -326.1233215332031, "loss": 0.5368, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.995705008506775, "rewards/margins": 0.7450501322746277, "rewards/rejected": -2.740755558013916, "step": 11810 }, { "epoch": 2.036526533425224, "grad_norm": 32.89664077758789, "learning_rate": 5.653164977099921e-08, "logits/chosen": -2.0899271965026855, "logits/rejected": -2.056302309036255, "logps/chosen": -241.9139862060547, "logps/rejected": -305.611083984375, "loss": 0.5702, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.9050085544586182, "rewards/margins": 0.6445132493972778, "rewards/rejected": -2.5495216846466064, "step": 11820 }, { "epoch": 2.0382494831150932, "grad_norm": 49.7681999206543, "learning_rate": 5.635118422985896e-08, "logits/chosen": -2.0551645755767822, "logits/rejected": -2.027526378631592, "logps/chosen": -226.86361694335938, "logps/rejected": -297.20831298828125, "loss": 0.5262, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7415897846221924, "rewards/margins": 0.6913633942604065, "rewards/rejected": -2.432953119277954, "step": 11830 }, { "epoch": 2.039972432804962, "grad_norm": 34.186309814453125, "learning_rate": 5.61708941309218e-08, "logits/chosen": -2.0613903999328613, "logits/rejected": -2.0164942741394043, "logps/chosen": -239.87210083007812, "logps/rejected": -312.2968444824219, "loss": 0.5158, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.8667516708374023, "rewards/margins": 0.7428416609764099, "rewards/rejected": -2.609593152999878, "step": 11840 }, { "epoch": 2.041695382494831, "grad_norm": 39.7913703918457, "learning_rate": 5.5990780198846435e-08, "logits/chosen": -2.0198912620544434, "logits/rejected": -1.9685665369033813, "logps/chosen": -253.1337890625, "logps/rejected": -317.5090026855469, "loss": 0.5376, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9773327112197876, "rewards/margins": 0.7132784724235535, "rewards/rejected": -2.6906113624572754, "step": 11850 }, { "epoch": 2.0434183321847, "grad_norm": 56.37113571166992, "learning_rate": 5.581084315758351e-08, "logits/chosen": -2.093982696533203, "logits/rejected": -2.075417995452881, "logps/chosen": -268.6842346191406, "logps/rejected": -316.57073974609375, "loss": 0.6342, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1388251781463623, "rewards/margins": 0.4565099775791168, "rewards/rejected": -2.595335006713867, "step": 11860 }, { "epoch": 2.045141281874569, "grad_norm": 42.2559700012207, "learning_rate": 5.563108373037243e-08, "logits/chosen": -2.0415985584259033, "logits/rejected": -2.0136475563049316, "logps/chosen": -233.3759002685547, "logps/rejected": -308.192626953125, "loss": 0.5188, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8376801013946533, "rewards/margins": 0.7106243371963501, "rewards/rejected": -2.548304796218872, "step": 11870 }, { "epoch": 2.0468642315644385, "grad_norm": 45.246246337890625, "learning_rate": 5.545150263973897e-08, "logits/chosen": -2.063602924346924, "logits/rejected": -2.015317916870117, "logps/chosen": -246.6908721923828, "logps/rejected": -325.9163513183594, "loss": 0.4574, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8842356204986572, "rewards/margins": 0.8306552767753601, "rewards/rejected": -2.714890718460083, "step": 11880 }, { "epoch": 2.0485871812543075, "grad_norm": 43.25898742675781, "learning_rate": 5.527210060749201e-08, "logits/chosen": -2.0884766578674316, "logits/rejected": -2.0456490516662598, "logps/chosen": -256.9544982910156, "logps/rejected": -329.82989501953125, "loss": 0.5009, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9893547296524048, "rewards/margins": 0.7906600832939148, "rewards/rejected": -2.780014753341675, "step": 11890 }, { "epoch": 2.0503101309441765, "grad_norm": 48.36300277709961, "learning_rate": 5.509287835472067e-08, "logits/chosen": -2.000892162322998, "logits/rejected": -1.9563884735107422, "logps/chosen": -238.4857177734375, "logps/rejected": -315.6385192871094, "loss": 0.5189, "rewards/accuracies": 0.75, "rewards/chosen": -1.8447710275650024, "rewards/margins": 0.8057880401611328, "rewards/rejected": -2.6505589485168457, "step": 11900 }, { "epoch": 2.0520330806340454, "grad_norm": 30.14019012451172, "learning_rate": 5.4913836601791497e-08, "logits/chosen": -2.1544063091278076, "logits/rejected": -2.138747215270996, "logps/chosen": -231.66281127929688, "logps/rejected": -307.263916015625, "loss": 0.5387, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7832281589508057, "rewards/margins": 0.7166194915771484, "rewards/rejected": -2.499847888946533, "step": 11910 }, { "epoch": 2.0537560303239144, "grad_norm": 38.02259063720703, "learning_rate": 5.473497606834554e-08, "logits/chosen": -2.119105100631714, "logits/rejected": -2.089402437210083, "logps/chosen": -230.0631866455078, "logps/rejected": -290.7784423828125, "loss": 0.5544, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7121751308441162, "rewards/margins": 0.6320434212684631, "rewards/rejected": -2.3442187309265137, "step": 11920 }, { "epoch": 2.055478980013784, "grad_norm": 31.72987174987793, "learning_rate": 5.4556297473295496e-08, "logits/chosen": -2.1159067153930664, "logits/rejected": -2.081695079803467, "logps/chosen": -210.96792602539062, "logps/rejected": -279.8138732910156, "loss": 0.5045, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5741071701049805, "rewards/margins": 0.67446368932724, "rewards/rejected": -2.2485709190368652, "step": 11930 }, { "epoch": 2.057201929703653, "grad_norm": 29.604206085205078, "learning_rate": 5.4377801534822676e-08, "logits/chosen": -2.078176975250244, "logits/rejected": -2.032797336578369, "logps/chosen": -203.94651794433594, "logps/rejected": -282.87371826171875, "loss": 0.4724, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4840799570083618, "rewards/margins": 0.8179097175598145, "rewards/rejected": -2.301989793777466, "step": 11940 }, { "epoch": 2.0589248793935218, "grad_norm": 39.70060348510742, "learning_rate": 5.419948897037436e-08, "logits/chosen": -2.1387264728546143, "logits/rejected": -2.099928617477417, "logps/chosen": -237.4861602783203, "logps/rejected": -289.05340576171875, "loss": 0.5961, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7850711345672607, "rewards/margins": 0.5495141744613647, "rewards/rejected": -2.334585189819336, "step": 11950 }, { "epoch": 2.0606478290833907, "grad_norm": 47.06940460205078, "learning_rate": 5.4021360496660614e-08, "logits/chosen": -2.024587869644165, "logits/rejected": -1.9838438034057617, "logps/chosen": -224.7746124267578, "logps/rejected": -310.7027893066406, "loss": 0.4805, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7129604816436768, "rewards/margins": 0.8761369585990906, "rewards/rejected": -2.589097499847412, "step": 11960 }, { "epoch": 2.0623707787732597, "grad_norm": 31.619342803955078, "learning_rate": 5.3843416829651713e-08, "logits/chosen": -2.1144022941589355, "logits/rejected": -2.0715739727020264, "logps/chosen": -232.4583740234375, "logps/rejected": -300.1246032714844, "loss": 0.5358, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7532933950424194, "rewards/margins": 0.72503262758255, "rewards/rejected": -2.478325605392456, "step": 11970 }, { "epoch": 2.0640937284631287, "grad_norm": 36.4967041015625, "learning_rate": 5.3665658684574975e-08, "logits/chosen": -2.0988223552703857, "logits/rejected": -2.049405097961426, "logps/chosen": -223.54940795898438, "logps/rejected": -293.61090087890625, "loss": 0.4992, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6739461421966553, "rewards/margins": 0.7265938520431519, "rewards/rejected": -2.4005398750305176, "step": 11980 }, { "epoch": 2.065816678152998, "grad_norm": 32.14418029785156, "learning_rate": 5.348808677591222e-08, "logits/chosen": -2.153207302093506, "logits/rejected": -2.098644495010376, "logps/chosen": -228.08499145507812, "logps/rejected": -290.0260009765625, "loss": 0.5217, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6931899785995483, "rewards/margins": 0.6990076303482056, "rewards/rejected": -2.392197847366333, "step": 11990 }, { "epoch": 2.067539627842867, "grad_norm": 30.61179542541504, "learning_rate": 5.331070181739654e-08, "logits/chosen": -2.0519518852233887, "logits/rejected": -2.02451753616333, "logps/chosen": -229.9219512939453, "logps/rejected": -305.4267578125, "loss": 0.5302, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.780832290649414, "rewards/margins": 0.7345147728919983, "rewards/rejected": -2.5153470039367676, "step": 12000 }, { "epoch": 2.067539627842867, "eval_logits/chosen": -2.1466264724731445, "eval_logits/rejected": -2.125704050064087, "eval_logps/chosen": -229.6514892578125, "eval_logps/rejected": -265.5969543457031, "eval_loss": 0.641311764717102, "eval_rewards/accuracies": 0.6291821599006653, "eval_rewards/chosen": -1.706360101699829, "eval_rewards/margins": 0.32211339473724365, "eval_rewards/rejected": -2.028473377227783, "eval_runtime": 382.9749, "eval_samples_per_second": 11.238, "eval_steps_per_second": 1.405, "step": 12000 }, { "epoch": 2.069262577532736, "grad_norm": 38.363101959228516, "learning_rate": 5.313350452200962e-08, "logits/chosen": -2.1067209243774414, "logits/rejected": -2.0558550357818604, "logps/chosen": -243.1296844482422, "logps/rejected": -316.4700622558594, "loss": 0.5338, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8736377954483032, "rewards/margins": 0.7471028566360474, "rewards/rejected": -2.6207406520843506, "step": 12010 }, { "epoch": 2.070985527222605, "grad_norm": 43.54338455200195, "learning_rate": 5.295649560197895e-08, "logits/chosen": -2.1059651374816895, "logits/rejected": -2.0664753913879395, "logps/chosen": -233.4269561767578, "logps/rejected": -297.7746887207031, "loss": 0.5522, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.8017711639404297, "rewards/margins": 0.6565322279930115, "rewards/rejected": -2.458303451538086, "step": 12020 }, { "epoch": 2.072708476912474, "grad_norm": 54.19285583496094, "learning_rate": 5.27796757687748e-08, "logits/chosen": -2.1758646965026855, "logits/rejected": -2.144629716873169, "logps/chosen": -212.6887664794922, "logps/rejected": -268.6290588378906, "loss": 0.5845, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.625186562538147, "rewards/margins": 0.5565309524536133, "rewards/rejected": -2.1817173957824707, "step": 12030 }, { "epoch": 2.0744314266023434, "grad_norm": 33.641727447509766, "learning_rate": 5.260304573310743e-08, "logits/chosen": -2.0929903984069824, "logits/rejected": -2.04295015335083, "logps/chosen": -227.9827117919922, "logps/rejected": -301.3453063964844, "loss": 0.5225, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7123521566390991, "rewards/margins": 0.7900432348251343, "rewards/rejected": -2.5023951530456543, "step": 12040 }, { "epoch": 2.0761543762922123, "grad_norm": 41.476985931396484, "learning_rate": 5.242660620492416e-08, "logits/chosen": -2.1373560428619385, "logits/rejected": -2.0753097534179688, "logps/chosen": -226.6936492919922, "logps/rejected": -305.44647216796875, "loss": 0.4736, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7126344442367554, "rewards/margins": 0.8372985124588013, "rewards/rejected": -2.5499329566955566, "step": 12050 }, { "epoch": 2.0778773259820813, "grad_norm": 41.41499328613281, "learning_rate": 5.2250357893406703e-08, "logits/chosen": -2.0900511741638184, "logits/rejected": -2.05121111869812, "logps/chosen": -239.5316162109375, "logps/rejected": -307.7794494628906, "loss": 0.5373, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8286758661270142, "rewards/margins": 0.7210630178451538, "rewards/rejected": -2.549738883972168, "step": 12060 }, { "epoch": 2.0796002756719503, "grad_norm": 42.31587219238281, "learning_rate": 5.2074301506968165e-08, "logits/chosen": -2.0873608589172363, "logits/rejected": -2.029822587966919, "logps/chosen": -233.84033203125, "logps/rejected": -314.54278564453125, "loss": 0.4958, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7759023904800415, "rewards/margins": 0.8784404993057251, "rewards/rejected": -2.6543426513671875, "step": 12070 }, { "epoch": 2.0813232253618192, "grad_norm": 66.47350311279297, "learning_rate": 5.189843775325018e-08, "logits/chosen": -2.0308492183685303, "logits/rejected": -1.979935646057129, "logps/chosen": -232.94021606445312, "logps/rejected": -327.6007080078125, "loss": 0.4672, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7581536769866943, "rewards/margins": 0.9758221507072449, "rewards/rejected": -2.733975887298584, "step": 12080 }, { "epoch": 2.0830461750516887, "grad_norm": 53.56596374511719, "learning_rate": 5.172276733912009e-08, "logits/chosen": -2.0624167919158936, "logits/rejected": -2.0192716121673584, "logps/chosen": -261.79852294921875, "logps/rejected": -325.7588806152344, "loss": 0.5509, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0378787517547607, "rewards/margins": 0.6984132528305054, "rewards/rejected": -2.7362923622131348, "step": 12090 }, { "epoch": 2.0847691247415576, "grad_norm": 48.26594543457031, "learning_rate": 5.1547290970668243e-08, "logits/chosen": -2.052945613861084, "logits/rejected": -2.0171191692352295, "logps/chosen": -244.5965118408203, "logps/rejected": -308.4600830078125, "loss": 0.5303, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.859833002090454, "rewards/margins": 0.7063957452774048, "rewards/rejected": -2.5662283897399902, "step": 12100 }, { "epoch": 2.0864920744314266, "grad_norm": 33.33226013183594, "learning_rate": 5.13720093532049e-08, "logits/chosen": -2.085258960723877, "logits/rejected": -2.0338473320007324, "logps/chosen": -251.0377655029297, "logps/rejected": -320.9676818847656, "loss": 0.5262, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9285335540771484, "rewards/margins": 0.7524645328521729, "rewards/rejected": -2.6809983253479004, "step": 12110 }, { "epoch": 2.0882150241212956, "grad_norm": 51.34297561645508, "learning_rate": 5.1196923191257654e-08, "logits/chosen": -2.0758512020111084, "logits/rejected": -2.0404000282287598, "logps/chosen": -242.778076171875, "logps/rejected": -309.3258056640625, "loss": 0.5513, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8604662418365479, "rewards/margins": 0.7200610637664795, "rewards/rejected": -2.5805275440216064, "step": 12120 }, { "epoch": 2.0899379738111645, "grad_norm": 29.25992202758789, "learning_rate": 5.102203318856847e-08, "logits/chosen": -2.0763731002807617, "logits/rejected": -2.0236387252807617, "logps/chosen": -230.5070037841797, "logps/rejected": -313.5376892089844, "loss": 0.4763, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7576888799667358, "rewards/margins": 0.850530743598938, "rewards/rejected": -2.608219623565674, "step": 12130 }, { "epoch": 2.091660923501034, "grad_norm": 49.207420349121094, "learning_rate": 5.084734004809079e-08, "logits/chosen": -2.1137688159942627, "logits/rejected": -2.0833239555358887, "logps/chosen": -229.6987762451172, "logps/rejected": -289.147216796875, "loss": 0.5833, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7667906284332275, "rewards/margins": 0.5884928703308105, "rewards/rejected": -2.355283498764038, "step": 12140 }, { "epoch": 2.093383873190903, "grad_norm": 109.75863647460938, "learning_rate": 5.0672844471986806e-08, "logits/chosen": -2.1269781589508057, "logits/rejected": -2.09053373336792, "logps/chosen": -212.83584594726562, "logps/rejected": -284.8175964355469, "loss": 0.546, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5835081338882446, "rewards/margins": 0.7448925971984863, "rewards/rejected": -2.3284006118774414, "step": 12150 }, { "epoch": 2.095106822880772, "grad_norm": 46.11418151855469, "learning_rate": 5.049854716162469e-08, "logits/chosen": -2.0724129676818848, "logits/rejected": -2.0280184745788574, "logps/chosen": -211.89907836914062, "logps/rejected": -272.09698486328125, "loss": 0.5479, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5520589351654053, "rewards/margins": 0.6403037309646606, "rewards/rejected": -2.1923625469207764, "step": 12160 }, { "epoch": 2.096829772570641, "grad_norm": 30.395431518554688, "learning_rate": 5.032444881757575e-08, "logits/chosen": -2.0958447456359863, "logits/rejected": -2.058408260345459, "logps/chosen": -217.18734741210938, "logps/rejected": -282.3731384277344, "loss": 0.5525, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6155513525009155, "rewards/margins": 0.6562166213989258, "rewards/rejected": -2.271768093109131, "step": 12170 }, { "epoch": 2.09855272226051, "grad_norm": 30.15849494934082, "learning_rate": 5.015055013961129e-08, "logits/chosen": -2.0267977714538574, "logits/rejected": -1.9941260814666748, "logps/chosen": -218.51113891601562, "logps/rejected": -305.34735107421875, "loss": 0.4984, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.692307472229004, "rewards/margins": 0.7915050387382507, "rewards/rejected": -2.4838123321533203, "step": 12180 }, { "epoch": 2.1002756719503792, "grad_norm": 33.532596588134766, "learning_rate": 4.9976851826700385e-08, "logits/chosen": -2.068605661392212, "logits/rejected": -2.0211539268493652, "logps/chosen": -222.3265380859375, "logps/rejected": -281.1471252441406, "loss": 0.5434, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6454464197158813, "rewards/margins": 0.6401292085647583, "rewards/rejected": -2.2855758666992188, "step": 12190 }, { "epoch": 2.101998621640248, "grad_norm": 51.70030212402344, "learning_rate": 4.980335457700665e-08, "logits/chosen": -2.1316757202148438, "logits/rejected": -2.0937085151672363, "logps/chosen": -236.2337646484375, "logps/rejected": -304.72802734375, "loss": 0.5122, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7878990173339844, "rewards/margins": 0.7227575182914734, "rewards/rejected": -2.5106565952301025, "step": 12200 }, { "epoch": 2.103721571330117, "grad_norm": 41.22138595581055, "learning_rate": 4.963005908788547e-08, "logits/chosen": -2.0903918743133545, "logits/rejected": -2.0646119117736816, "logps/chosen": -237.90762329101562, "logps/rejected": -295.3863830566406, "loss": 0.5607, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.829699158668518, "rewards/margins": 0.6166217923164368, "rewards/rejected": -2.4463210105895996, "step": 12210 }, { "epoch": 2.105444521019986, "grad_norm": 57.86821746826172, "learning_rate": 4.945696605588143e-08, "logits/chosen": -2.0225930213928223, "logits/rejected": -1.9905112981796265, "logps/chosen": -229.29971313476562, "logps/rejected": -291.74102783203125, "loss": 0.5468, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7564723491668701, "rewards/margins": 0.6317520141601562, "rewards/rejected": -2.3882243633270264, "step": 12220 }, { "epoch": 2.107167470709855, "grad_norm": 55.78356170654297, "learning_rate": 4.928407617672519e-08, "logits/chosen": -2.09854793548584, "logits/rejected": -2.0631284713745117, "logps/chosen": -224.0506591796875, "logps/rejected": -292.0597839355469, "loss": 0.5564, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7134735584259033, "rewards/margins": 0.7068896293640137, "rewards/rejected": -2.420363664627075, "step": 12230 }, { "epoch": 2.1088904203997245, "grad_norm": 46.384521484375, "learning_rate": 4.911139014533099e-08, "logits/chosen": -2.0313241481781006, "logits/rejected": -1.9866256713867188, "logps/chosen": -211.84573364257812, "logps/rejected": -284.09539794921875, "loss": 0.5232, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5812326669692993, "rewards/margins": 0.743860125541687, "rewards/rejected": -2.3250927925109863, "step": 12240 }, { "epoch": 2.1106133700895935, "grad_norm": 59.478694915771484, "learning_rate": 4.893890865579362e-08, "logits/chosen": -2.075040817260742, "logits/rejected": -2.039846658706665, "logps/chosen": -221.32901000976562, "logps/rejected": -269.36419677734375, "loss": 0.6039, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6808462142944336, "rewards/margins": 0.5116840600967407, "rewards/rejected": -2.1925301551818848, "step": 12250 }, { "epoch": 2.1123363197794625, "grad_norm": 45.6484375, "learning_rate": 4.8766632401385856e-08, "logits/chosen": -2.0814313888549805, "logits/rejected": -2.0419795513153076, "logps/chosen": -213.54916381835938, "logps/rejected": -276.7439880371094, "loss": 0.5496, "rewards/accuracies": 0.71875, "rewards/chosen": -1.599725604057312, "rewards/margins": 0.6187621355056763, "rewards/rejected": -2.2184877395629883, "step": 12260 }, { "epoch": 2.1140592694693314, "grad_norm": 48.90853500366211, "learning_rate": 4.859456207455539e-08, "logits/chosen": -2.153773069381714, "logits/rejected": -2.1088504791259766, "logps/chosen": -224.59121704101562, "logps/rejected": -293.13031005859375, "loss": 0.5484, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6861425638198853, "rewards/margins": 0.7158703804016113, "rewards/rejected": -2.402012825012207, "step": 12270 }, { "epoch": 2.1157822191592004, "grad_norm": 32.83452606201172, "learning_rate": 4.842269836692239e-08, "logits/chosen": -2.0922152996063232, "logits/rejected": -2.051940441131592, "logps/chosen": -226.1486358642578, "logps/rejected": -295.8097839355469, "loss": 0.5137, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6863266229629517, "rewards/margins": 0.7510272860527039, "rewards/rejected": -2.4373538494110107, "step": 12280 }, { "epoch": 2.11750516884907, "grad_norm": 58.551639556884766, "learning_rate": 4.8251041969276355e-08, "logits/chosen": -2.034183979034424, "logits/rejected": -1.9952113628387451, "logps/chosen": -225.10250854492188, "logps/rejected": -283.5815124511719, "loss": 0.5476, "rewards/accuracies": 0.71875, "rewards/chosen": -1.703881025314331, "rewards/margins": 0.6042089462280273, "rewards/rejected": -2.3080897331237793, "step": 12290 }, { "epoch": 2.1192281185389388, "grad_norm": 46.86976623535156, "learning_rate": 4.8079593571573654e-08, "logits/chosen": -2.0990262031555176, "logits/rejected": -2.052614688873291, "logps/chosen": -230.6024169921875, "logps/rejected": -292.49884033203125, "loss": 0.5279, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.749632477760315, "rewards/margins": 0.6554352045059204, "rewards/rejected": -2.4050679206848145, "step": 12300 }, { "epoch": 2.1209510682288077, "grad_norm": 36.16896438598633, "learning_rate": 4.7908353862934645e-08, "logits/chosen": -2.0593631267547607, "logits/rejected": -2.006453037261963, "logps/chosen": -230.8593292236328, "logps/rejected": -307.69720458984375, "loss": 0.5149, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.742922067642212, "rewards/margins": 0.8167440295219421, "rewards/rejected": -2.5596659183502197, "step": 12310 }, { "epoch": 2.1226740179186767, "grad_norm": 37.408653259277344, "learning_rate": 4.773732353164069e-08, "logits/chosen": -2.054842472076416, "logits/rejected": -2.0034537315368652, "logps/chosen": -228.54055786132812, "logps/rejected": -325.37860107421875, "loss": 0.4833, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7277143001556396, "rewards/margins": 0.9802853465080261, "rewards/rejected": -2.7079997062683105, "step": 12320 }, { "epoch": 2.1243969676085457, "grad_norm": 35.757171630859375, "learning_rate": 4.756650326513175e-08, "logits/chosen": -2.073939085006714, "logits/rejected": -2.0456321239471436, "logps/chosen": -239.0164031982422, "logps/rejected": -308.36529541015625, "loss": 0.5265, "rewards/accuracies": 0.75, "rewards/chosen": -1.8104842901229858, "rewards/margins": 0.7307012677192688, "rewards/rejected": -2.5411856174468994, "step": 12330 }, { "epoch": 2.126119917298415, "grad_norm": 32.52260971069336, "learning_rate": 4.739589375000345e-08, "logits/chosen": -2.076432466506958, "logits/rejected": -2.0272200107574463, "logps/chosen": -246.3824920654297, "logps/rejected": -331.07965087890625, "loss": 0.4841, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9130935668945312, "rewards/margins": 0.8666901588439941, "rewards/rejected": -2.7797837257385254, "step": 12340 }, { "epoch": 2.127842866988284, "grad_norm": 44.25779342651367, "learning_rate": 4.722549567200423e-08, "logits/chosen": -1.996809959411621, "logits/rejected": -1.9381519556045532, "logps/chosen": -259.73126220703125, "logps/rejected": -331.24725341796875, "loss": 0.5365, "rewards/accuracies": 0.71875, "rewards/chosen": -2.027064561843872, "rewards/margins": 0.7794581651687622, "rewards/rejected": -2.806522846221924, "step": 12350 }, { "epoch": 2.129565816678153, "grad_norm": 41.06666564941406, "learning_rate": 4.70553097160327e-08, "logits/chosen": -2.079986095428467, "logits/rejected": -2.0405890941619873, "logps/chosen": -229.076904296875, "logps/rejected": -316.8277282714844, "loss": 0.4616, "rewards/accuracies": 0.75, "rewards/chosen": -1.7490637302398682, "rewards/margins": 0.9058129191398621, "rewards/rejected": -2.654876708984375, "step": 12360 }, { "epoch": 2.131288766368022, "grad_norm": 35.618194580078125, "learning_rate": 4.6885336566134905e-08, "logits/chosen": -2.060807466506958, "logits/rejected": -2.0051631927490234, "logps/chosen": -245.77291870117188, "logps/rejected": -335.7901306152344, "loss": 0.4906, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.921326994895935, "rewards/margins": 0.9444554448127747, "rewards/rejected": -2.8657822608947754, "step": 12370 }, { "epoch": 2.133011716057891, "grad_norm": 43.511146545410156, "learning_rate": 4.671557690550158e-08, "logits/chosen": -2.0528554916381836, "logits/rejected": -2.024606704711914, "logps/chosen": -244.91891479492188, "logps/rejected": -336.2535400390625, "loss": 0.4881, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9351543188095093, "rewards/margins": 0.8718324899673462, "rewards/rejected": -2.8069872856140137, "step": 12380 }, { "epoch": 2.13473466574776, "grad_norm": 38.06670379638672, "learning_rate": 4.65460314164652e-08, "logits/chosen": -2.140407085418701, "logits/rejected": -2.1023502349853516, "logps/chosen": -248.9733123779297, "logps/rejected": -341.685546875, "loss": 0.4843, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9468520879745483, "rewards/margins": 0.9342899322509766, "rewards/rejected": -2.8811416625976562, "step": 12390 }, { "epoch": 2.1364576154376294, "grad_norm": 31.036680221557617, "learning_rate": 4.637670078049759e-08, "logits/chosen": -2.056527614593506, "logits/rejected": -2.023624897003174, "logps/chosen": -274.3301086425781, "logps/rejected": -358.85400390625, "loss": 0.4961, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.1827850341796875, "rewards/margins": 0.8480589985847473, "rewards/rejected": -3.03084397315979, "step": 12400 }, { "epoch": 2.1364576154376294, "eval_logits/chosen": -2.11784029006958, "eval_logits/rejected": -2.09578013420105, "eval_logps/chosen": -259.76959228515625, "eval_logps/rejected": -299.8690185546875, "eval_loss": 0.6474471688270569, "eval_rewards/accuracies": 0.6387081742286682, "eval_rewards/chosen": -2.0075414180755615, "eval_rewards/margins": 0.36365291476249695, "eval_rewards/rejected": -2.3711941242218018, "eval_runtime": 383.2126, "eval_samples_per_second": 11.231, "eval_steps_per_second": 1.404, "step": 12400 }, { "epoch": 2.1381805651274983, "grad_norm": 44.605613708496094, "learning_rate": 4.620758567820686e-08, "logits/chosen": -2.03633975982666, "logits/rejected": -1.9880211353302002, "logps/chosen": -259.0669250488281, "logps/rejected": -327.37298583984375, "loss": 0.5481, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.0398917198181152, "rewards/margins": 0.7153648138046265, "rewards/rejected": -2.7552566528320312, "step": 12410 }, { "epoch": 2.1399035148173673, "grad_norm": 69.91634368896484, "learning_rate": 4.60386867893348e-08, "logits/chosen": -1.9803155660629272, "logits/rejected": -1.94023859500885, "logps/chosen": -263.7940368652344, "logps/rejected": -339.0141906738281, "loss": 0.5273, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.0752742290496826, "rewards/margins": 0.7945727109909058, "rewards/rejected": -2.869846820831299, "step": 12420 }, { "epoch": 2.1416264645072363, "grad_norm": 47.40889358520508, "learning_rate": 4.5870004792754257e-08, "logits/chosen": -2.044445514678955, "logits/rejected": -2.0063881874084473, "logps/chosen": -260.7670593261719, "logps/rejected": -335.7140808105469, "loss": 0.5264, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.047086477279663, "rewards/margins": 0.7734764814376831, "rewards/rejected": -2.8205630779266357, "step": 12430 }, { "epoch": 2.1433494141971057, "grad_norm": 30.57667350769043, "learning_rate": 4.570154036646625e-08, "logits/chosen": -2.0202064514160156, "logits/rejected": -1.9847602844238281, "logps/chosen": -236.6773223876953, "logps/rejected": -304.9550476074219, "loss": 0.5395, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8218532800674438, "rewards/margins": 0.7106314897537231, "rewards/rejected": -2.532484531402588, "step": 12440 }, { "epoch": 2.1450723638869746, "grad_norm": 53.94111251831055, "learning_rate": 4.553329418759726e-08, "logits/chosen": -2.0629711151123047, "logits/rejected": -2.032637357711792, "logps/chosen": -234.65023803710938, "logps/rejected": -299.62969970703125, "loss": 0.5676, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8310039043426514, "rewards/margins": 0.6646952033042908, "rewards/rejected": -2.495698928833008, "step": 12450 }, { "epoch": 2.1467953135768436, "grad_norm": 44.45255661010742, "learning_rate": 4.5365266932396526e-08, "logits/chosen": -2.111095666885376, "logits/rejected": -2.0713887214660645, "logps/chosen": -234.33627319335938, "logps/rejected": -308.1429443359375, "loss": 0.5145, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7842357158660889, "rewards/margins": 0.7793278694152832, "rewards/rejected": -2.563563585281372, "step": 12460 }, { "epoch": 2.1485182632667126, "grad_norm": 53.02629089355469, "learning_rate": 4.519745927623344e-08, "logits/chosen": -2.1693835258483887, "logits/rejected": -2.1237571239471436, "logps/chosen": -225.2732696533203, "logps/rejected": -309.4500427246094, "loss": 0.4799, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6921237707138062, "rewards/margins": 0.84943026304245, "rewards/rejected": -2.5415542125701904, "step": 12470 }, { "epoch": 2.1502412129565815, "grad_norm": 42.24469757080078, "learning_rate": 4.5029871893594695e-08, "logits/chosen": -2.152308464050293, "logits/rejected": -2.1171212196350098, "logps/chosen": -230.6731414794922, "logps/rejected": -296.44952392578125, "loss": 0.528, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7500979900360107, "rewards/margins": 0.6756486892700195, "rewards/rejected": -2.4257466793060303, "step": 12480 }, { "epoch": 2.1519641626464505, "grad_norm": 78.8714370727539, "learning_rate": 4.486250545808159e-08, "logits/chosen": -2.0349388122558594, "logits/rejected": -1.998011827468872, "logps/chosen": -241.18063354492188, "logps/rejected": -319.050537109375, "loss": 0.5447, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8526427745819092, "rewards/margins": 0.8055623769760132, "rewards/rejected": -2.6582047939300537, "step": 12490 }, { "epoch": 2.15368711233632, "grad_norm": 48.69804000854492, "learning_rate": 4.469536064240731e-08, "logits/chosen": -2.075000286102295, "logits/rejected": -2.0368382930755615, "logps/chosen": -246.09445190429688, "logps/rejected": -324.2549743652344, "loss": 0.5345, "rewards/accuracies": 0.6875, "rewards/chosen": -1.915993332862854, "rewards/margins": 0.801496148109436, "rewards/rejected": -2.717489242553711, "step": 12500 }, { "epoch": 2.155410062026189, "grad_norm": 35.36015701293945, "learning_rate": 4.452843811839435e-08, "logits/chosen": -2.120659112930298, "logits/rejected": -2.087756872177124, "logps/chosen": -231.48532104492188, "logps/rejected": -295.49078369140625, "loss": 0.5469, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7738559246063232, "rewards/margins": 0.6695488691329956, "rewards/rejected": -2.4434046745300293, "step": 12510 }, { "epoch": 2.157133011716058, "grad_norm": 43.532100677490234, "learning_rate": 4.436173855697174e-08, "logits/chosen": -2.055793046951294, "logits/rejected": -2.0177674293518066, "logps/chosen": -232.77920532226562, "logps/rejected": -294.95440673828125, "loss": 0.5566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7898495197296143, "rewards/margins": 0.6242788434028625, "rewards/rejected": -2.414128303527832, "step": 12520 }, { "epoch": 2.158855961405927, "grad_norm": 47.047569274902344, "learning_rate": 4.4195262628172224e-08, "logits/chosen": -2.0752203464508057, "logits/rejected": -2.0413734912872314, "logps/chosen": -233.0418701171875, "logps/rejected": -306.05621337890625, "loss": 0.5037, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7798078060150146, "rewards/margins": 0.7302197217941284, "rewards/rejected": -2.5100274085998535, "step": 12530 }, { "epoch": 2.160578911095796, "grad_norm": 35.58103561401367, "learning_rate": 4.402901100112972e-08, "logits/chosen": -2.072754144668579, "logits/rejected": -2.0337443351745605, "logps/chosen": -233.8501739501953, "logps/rejected": -298.0557556152344, "loss": 0.5718, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7930688858032227, "rewards/margins": 0.6550329923629761, "rewards/rejected": -2.448101758956909, "step": 12540 }, { "epoch": 2.162301860785665, "grad_norm": 50.7521858215332, "learning_rate": 4.386298434407666e-08, "logits/chosen": -2.1599698066711426, "logits/rejected": -2.133126735687256, "logps/chosen": -221.6575469970703, "logps/rejected": -277.86492919921875, "loss": 0.5649, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6921859979629517, "rewards/margins": 0.5701245069503784, "rewards/rejected": -2.26231050491333, "step": 12550 }, { "epoch": 2.164024810475534, "grad_norm": 44.69491195678711, "learning_rate": 4.369718332434109e-08, "logits/chosen": -2.110529661178589, "logits/rejected": -2.0664124488830566, "logps/chosen": -224.39920043945312, "logps/rejected": -274.45062255859375, "loss": 0.573, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6608068943023682, "rewards/margins": 0.5945785045623779, "rewards/rejected": -2.255385637283325, "step": 12560 }, { "epoch": 2.165747760165403, "grad_norm": 38.576751708984375, "learning_rate": 4.3531608608344274e-08, "logits/chosen": -2.0868732929229736, "logits/rejected": -2.0362396240234375, "logps/chosen": -204.62542724609375, "logps/rejected": -267.41387939453125, "loss": 0.5145, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5071732997894287, "rewards/margins": 0.6663482189178467, "rewards/rejected": -2.1735215187072754, "step": 12570 }, { "epoch": 2.167470709855272, "grad_norm": 44.46379470825195, "learning_rate": 4.3366260861597814e-08, "logits/chosen": -2.106191635131836, "logits/rejected": -2.0512826442718506, "logps/chosen": -219.61532592773438, "logps/rejected": -283.9519348144531, "loss": 0.4881, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5764575004577637, "rewards/margins": 0.7457872629165649, "rewards/rejected": -2.322244882583618, "step": 12580 }, { "epoch": 2.169193659545141, "grad_norm": 44.6986083984375, "learning_rate": 4.3201140748701e-08, "logits/chosen": -2.077023983001709, "logits/rejected": -2.0341241359710693, "logps/chosen": -217.6835174560547, "logps/rejected": -285.7940979003906, "loss": 0.5134, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6051843166351318, "rewards/margins": 0.731010913848877, "rewards/rejected": -2.336195468902588, "step": 12590 }, { "epoch": 2.1709166092350105, "grad_norm": 39.228145599365234, "learning_rate": 4.303624893333816e-08, "logits/chosen": -2.0710034370422363, "logits/rejected": -2.0349724292755127, "logps/chosen": -242.8468475341797, "logps/rejected": -303.30194091796875, "loss": 0.5498, "rewards/accuracies": 0.75, "rewards/chosen": -1.833044409751892, "rewards/margins": 0.6784363985061646, "rewards/rejected": -2.5114808082580566, "step": 12600 }, { "epoch": 2.1726395589248795, "grad_norm": 33.84947967529297, "learning_rate": 4.287158607827607e-08, "logits/chosen": -2.1545472145080566, "logits/rejected": -2.1268463134765625, "logps/chosen": -229.6652374267578, "logps/rejected": -303.5273742675781, "loss": 0.5085, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7480127811431885, "rewards/margins": 0.7504814267158508, "rewards/rejected": -2.4984943866729736, "step": 12610 }, { "epoch": 2.1743625086147484, "grad_norm": 38.461978912353516, "learning_rate": 4.270715284536124e-08, "logits/chosen": -2.1819534301757812, "logits/rejected": -2.143298387527466, "logps/chosen": -260.98065185546875, "logps/rejected": -328.08758544921875, "loss": 0.5528, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.05389142036438, "rewards/margins": 0.7068518996238708, "rewards/rejected": -2.7607436180114746, "step": 12620 }, { "epoch": 2.1760854583046174, "grad_norm": 48.95317077636719, "learning_rate": 4.2542949895517066e-08, "logits/chosen": -2.127363681793213, "logits/rejected": -2.0970330238342285, "logps/chosen": -235.3542022705078, "logps/rejected": -309.9244079589844, "loss": 0.5624, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.8453752994537354, "rewards/margins": 0.7416688203811646, "rewards/rejected": -2.5870440006256104, "step": 12630 }, { "epoch": 2.1778084079944864, "grad_norm": 37.52322006225586, "learning_rate": 4.2378977888741506e-08, "logits/chosen": -2.1374919414520264, "logits/rejected": -2.0961146354675293, "logps/chosen": -223.96456909179688, "logps/rejected": -295.3265380859375, "loss": 0.4979, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6855684518814087, "rewards/margins": 0.7392398118972778, "rewards/rejected": -2.4248080253601074, "step": 12640 }, { "epoch": 2.179531357684356, "grad_norm": 57.112144470214844, "learning_rate": 4.221523748410428e-08, "logits/chosen": -2.089409828186035, "logits/rejected": -2.0558364391326904, "logps/chosen": -222.017578125, "logps/rejected": -278.6932067871094, "loss": 0.5873, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6818082332611084, "rewards/margins": 0.5971177220344543, "rewards/rejected": -2.278926134109497, "step": 12650 }, { "epoch": 2.1812543073742248, "grad_norm": 50.265281677246094, "learning_rate": 4.2051729339744056e-08, "logits/chosen": -2.027480363845825, "logits/rejected": -1.9964735507965088, "logps/chosen": -231.51260375976562, "logps/rejected": -287.72509765625, "loss": 0.5954, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7851423025131226, "rewards/margins": 0.5692311525344849, "rewards/rejected": -2.3543732166290283, "step": 12660 }, { "epoch": 2.1829772570640937, "grad_norm": 51.05306625366211, "learning_rate": 4.1888454112866125e-08, "logits/chosen": -2.2497687339782715, "logits/rejected": -2.1945605278015137, "logps/chosen": -206.6665802001953, "logps/rejected": -275.145263671875, "loss": 0.5233, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5245170593261719, "rewards/margins": 0.7166666984558105, "rewards/rejected": -2.2411837577819824, "step": 12670 }, { "epoch": 2.1847002067539627, "grad_norm": 26.886734008789062, "learning_rate": 4.172541245973943e-08, "logits/chosen": -2.082343101501465, "logits/rejected": -2.0427489280700684, "logps/chosen": -213.1779022216797, "logps/rejected": -282.32244873046875, "loss": 0.5273, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.606439232826233, "rewards/margins": 0.7072060108184814, "rewards/rejected": -2.313645124435425, "step": 12680 }, { "epoch": 2.1864231564438317, "grad_norm": 36.41101837158203, "learning_rate": 4.156260503569423e-08, "logits/chosen": -2.061903953552246, "logits/rejected": -2.029452085494995, "logps/chosen": -211.6961669921875, "logps/rejected": -280.8975830078125, "loss": 0.5686, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5883698463439941, "rewards/margins": 0.6996433734893799, "rewards/rejected": -2.288013219833374, "step": 12690 }, { "epoch": 2.188146106133701, "grad_norm": 28.81426429748535, "learning_rate": 4.1400032495119183e-08, "logits/chosen": -2.0733580589294434, "logits/rejected": -2.033250331878662, "logps/chosen": -213.970947265625, "logps/rejected": -279.64385986328125, "loss": 0.52, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5556237697601318, "rewards/margins": 0.7045634984970093, "rewards/rejected": -2.2601871490478516, "step": 12700 }, { "epoch": 2.18986905582357, "grad_norm": 45.23122787475586, "learning_rate": 4.123769549145901e-08, "logits/chosen": -2.1473305225372314, "logits/rejected": -2.120551347732544, "logps/chosen": -213.69515991210938, "logps/rejected": -287.4021301269531, "loss": 0.5262, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5914419889450073, "rewards/margins": 0.7104102373123169, "rewards/rejected": -2.3018524646759033, "step": 12710 }, { "epoch": 2.191592005513439, "grad_norm": 32.552513122558594, "learning_rate": 4.10755946772116e-08, "logits/chosen": -2.1464271545410156, "logits/rejected": -2.0934534072875977, "logps/chosen": -211.2435302734375, "logps/rejected": -282.8409423828125, "loss": 0.48, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.53597092628479, "rewards/margins": 0.7881087064743042, "rewards/rejected": -2.324079751968384, "step": 12720 }, { "epoch": 2.193314955203308, "grad_norm": 47.42426681518555, "learning_rate": 4.0913730703925485e-08, "logits/chosen": -2.0571787357330322, "logits/rejected": -2.0159764289855957, "logps/chosen": -223.61099243164062, "logps/rejected": -297.05218505859375, "loss": 0.5254, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7240705490112305, "rewards/margins": 0.7499324083328247, "rewards/rejected": -2.4740028381347656, "step": 12730 }, { "epoch": 2.195037904893177, "grad_norm": 32.92298126220703, "learning_rate": 4.075210422219732e-08, "logits/chosen": -2.0610849857330322, "logits/rejected": -2.031686305999756, "logps/chosen": -247.9683380126953, "logps/rejected": -316.2990417480469, "loss": 0.5609, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9526822566986084, "rewards/margins": 0.6972087025642395, "rewards/rejected": -2.6498911380767822, "step": 12740 }, { "epoch": 2.1967608545830464, "grad_norm": 39.98879623413086, "learning_rate": 4.059071588166921e-08, "logits/chosen": -2.052370548248291, "logits/rejected": -2.0056519508361816, "logps/chosen": -234.57046508789062, "logps/rejected": -312.9558410644531, "loss": 0.4892, "rewards/accuracies": 0.75, "rewards/chosen": -1.7930113077163696, "rewards/margins": 0.7980147004127502, "rewards/rejected": -2.5910258293151855, "step": 12750 }, { "epoch": 2.1984838042729153, "grad_norm": 47.036380767822266, "learning_rate": 4.042956633102597e-08, "logits/chosen": -2.0422403812408447, "logits/rejected": -2.0114598274230957, "logps/chosen": -244.37646484375, "logps/rejected": -314.6592712402344, "loss": 0.5813, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.915923833847046, "rewards/margins": 0.7069417238235474, "rewards/rejected": -2.622865676879883, "step": 12760 }, { "epoch": 2.2002067539627843, "grad_norm": 40.41290283203125, "learning_rate": 4.0268656217992615e-08, "logits/chosen": -2.1366629600524902, "logits/rejected": -2.0922679901123047, "logps/chosen": -236.18856811523438, "logps/rejected": -291.73162841796875, "loss": 0.5701, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7758594751358032, "rewards/margins": 0.6260801553726196, "rewards/rejected": -2.401939868927002, "step": 12770 }, { "epoch": 2.2019297036526533, "grad_norm": 29.852205276489258, "learning_rate": 4.0107986189331875e-08, "logits/chosen": -2.1107864379882812, "logits/rejected": -2.072418689727783, "logps/chosen": -237.52664184570312, "logps/rejected": -301.23297119140625, "loss": 0.5871, "rewards/accuracies": 0.71875, "rewards/chosen": -1.852663278579712, "rewards/margins": 0.6307103037834167, "rewards/rejected": -2.4833736419677734, "step": 12780 }, { "epoch": 2.2036526533425222, "grad_norm": 46.336830139160156, "learning_rate": 3.9947556890841464e-08, "logits/chosen": -2.045558214187622, "logits/rejected": -2.007340908050537, "logps/chosen": -240.2716522216797, "logps/rejected": -312.72625732421875, "loss": 0.5075, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8492214679718018, "rewards/margins": 0.757246732711792, "rewards/rejected": -2.6064682006835938, "step": 12790 }, { "epoch": 2.205375603032391, "grad_norm": 44.32802963256836, "learning_rate": 3.978736896735141e-08, "logits/chosen": -2.11002516746521, "logits/rejected": -2.076409101486206, "logps/chosen": -226.1758270263672, "logps/rejected": -287.878662109375, "loss": 0.55, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.701133131980896, "rewards/margins": 0.6254535913467407, "rewards/rejected": -2.326586961746216, "step": 12800 }, { "epoch": 2.205375603032391, "eval_logits/chosen": -2.176112651824951, "eval_logits/rejected": -2.1573870182037354, "eval_logps/chosen": -209.3660430908203, "eval_logps/rejected": -241.4328155517578, "eval_loss": 0.6414783596992493, "eval_rewards/accuracies": 0.63150554895401, "eval_rewards/chosen": -1.503505825996399, "eval_rewards/margins": 0.2833262085914612, "eval_rewards/rejected": -1.7868318557739258, "eval_runtime": 383.2929, "eval_samples_per_second": 11.229, "eval_steps_per_second": 1.404, "step": 12800 }, { "epoch": 2.2070985527222606, "grad_norm": 43.70280456542969, "learning_rate": 3.96274230627216e-08, "logits/chosen": -2.1460890769958496, "logits/rejected": -2.1152615547180176, "logps/chosen": -213.664306640625, "logps/rejected": -281.9350280761719, "loss": 0.5265, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6249897480010986, "rewards/margins": 0.6531460285186768, "rewards/rejected": -2.2781357765197754, "step": 12810 }, { "epoch": 2.2088215024121296, "grad_norm": 47.759342193603516, "learning_rate": 3.9467719819839186e-08, "logits/chosen": -2.0540995597839355, "logits/rejected": -2.0256972312927246, "logps/chosen": -207.12734985351562, "logps/rejected": -273.18743896484375, "loss": 0.5398, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5232833623886108, "rewards/margins": 0.6660471558570862, "rewards/rejected": -2.1893303394317627, "step": 12820 }, { "epoch": 2.2105444521019986, "grad_norm": 32.4177360534668, "learning_rate": 3.930825988061599e-08, "logits/chosen": -2.056870698928833, "logits/rejected": -2.0277340412139893, "logps/chosen": -216.51528930664062, "logps/rejected": -269.9931640625, "loss": 0.5782, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6514508724212646, "rewards/margins": 0.5335295796394348, "rewards/rejected": -2.1849803924560547, "step": 12830 }, { "epoch": 2.2122674017918675, "grad_norm": 34.3258056640625, "learning_rate": 3.914904388598577e-08, "logits/chosen": -2.156665563583374, "logits/rejected": -2.1270575523376465, "logps/chosen": -213.1860809326172, "logps/rejected": -286.87841796875, "loss": 0.5084, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5715808868408203, "rewards/margins": 0.7577574849128723, "rewards/rejected": -2.329338312149048, "step": 12840 }, { "epoch": 2.213990351481737, "grad_norm": 33.174949645996094, "learning_rate": 3.899007247590191e-08, "logits/chosen": -2.1505343914031982, "logits/rejected": -2.1065900325775146, "logps/chosen": -213.80368041992188, "logps/rejected": -275.1285705566406, "loss": 0.5279, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.598930835723877, "rewards/margins": 0.6486135125160217, "rewards/rejected": -2.247544288635254, "step": 12850 }, { "epoch": 2.215713301171606, "grad_norm": 33.14176559448242, "learning_rate": 3.883134628933465e-08, "logits/chosen": -2.048759937286377, "logits/rejected": -2.0071349143981934, "logps/chosen": -229.22830200195312, "logps/rejected": -300.01043701171875, "loss": 0.5409, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7220948934555054, "rewards/margins": 0.7704235315322876, "rewards/rejected": -2.492518424987793, "step": 12860 }, { "epoch": 2.217436250861475, "grad_norm": 58.30504608154297, "learning_rate": 3.867286596426853e-08, "logits/chosen": -2.0994811058044434, "logits/rejected": -2.0638999938964844, "logps/chosen": -221.7117919921875, "logps/rejected": -286.0891418457031, "loss": 0.5738, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6807664632797241, "rewards/margins": 0.6800059676170349, "rewards/rejected": -2.3607726097106934, "step": 12870 }, { "epoch": 2.219159200551344, "grad_norm": 43.13029861450195, "learning_rate": 3.851463213769996e-08, "logits/chosen": -2.1190624237060547, "logits/rejected": -2.084502935409546, "logps/chosen": -210.6239013671875, "logps/rejected": -275.19720458984375, "loss": 0.5292, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5534844398498535, "rewards/margins": 0.6436396837234497, "rewards/rejected": -2.1971240043640137, "step": 12880 }, { "epoch": 2.220882150241213, "grad_norm": 30.03142738342285, "learning_rate": 3.8356645445634575e-08, "logits/chosen": -2.149033308029175, "logits/rejected": -2.120265483856201, "logps/chosen": -218.79638671875, "logps/rejected": -276.3510437011719, "loss": 0.5302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.601214051246643, "rewards/margins": 0.6322570443153381, "rewards/rejected": -2.233471155166626, "step": 12890 }, { "epoch": 2.222605099931082, "grad_norm": 44.66122055053711, "learning_rate": 3.8198906523084594e-08, "logits/chosen": -2.11344313621521, "logits/rejected": -2.062312602996826, "logps/chosen": -233.74307250976562, "logps/rejected": -312.79876708984375, "loss": 0.4754, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.784369707107544, "rewards/margins": 0.8080135583877563, "rewards/rejected": -2.5923831462860107, "step": 12900 }, { "epoch": 2.224328049620951, "grad_norm": 36.82771301269531, "learning_rate": 3.8041416004066364e-08, "logits/chosen": -2.130718231201172, "logits/rejected": -2.0977675914764404, "logps/chosen": -229.9229278564453, "logps/rejected": -296.8409729003906, "loss": 0.5508, "rewards/accuracies": 0.75, "rewards/chosen": -1.7804368734359741, "rewards/margins": 0.6411340832710266, "rewards/rejected": -2.4215707778930664, "step": 12910 }, { "epoch": 2.22605099931082, "grad_norm": 39.755863189697266, "learning_rate": 3.7884174521597866e-08, "logits/chosen": -2.0777835845947266, "logits/rejected": -2.0352044105529785, "logps/chosen": -229.240478515625, "logps/rejected": -318.3649597167969, "loss": 0.499, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7551753520965576, "rewards/margins": 0.9018391370773315, "rewards/rejected": -2.6570143699645996, "step": 12920 }, { "epoch": 2.227773949000689, "grad_norm": 32.51687240600586, "learning_rate": 3.77271827076961e-08, "logits/chosen": -2.1255993843078613, "logits/rejected": -2.096824884414673, "logps/chosen": -221.54824829101562, "logps/rejected": -285.9750061035156, "loss": 0.573, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7131789922714233, "rewards/margins": 0.6252934336662292, "rewards/rejected": -2.338472604751587, "step": 12930 }, { "epoch": 2.229496898690558, "grad_norm": 44.66363525390625, "learning_rate": 3.757044119337449e-08, "logits/chosen": -2.0880866050720215, "logits/rejected": -2.0467846393585205, "logps/chosen": -228.26327514648438, "logps/rejected": -289.1280212402344, "loss": 0.5451, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7492284774780273, "rewards/margins": 0.6592920422554016, "rewards/rejected": -2.4085206985473633, "step": 12940 }, { "epoch": 2.231219848380427, "grad_norm": 51.84122085571289, "learning_rate": 3.741395060864038e-08, "logits/chosen": -2.0882647037506104, "logits/rejected": -2.0473856925964355, "logps/chosen": -216.173583984375, "logps/rejected": -298.5913391113281, "loss": 0.474, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6181936264038086, "rewards/margins": 0.8368695974349976, "rewards/rejected": -2.4550633430480957, "step": 12950 }, { "epoch": 2.2329427980702965, "grad_norm": 40.83433151245117, "learning_rate": 3.7257711582492645e-08, "logits/chosen": -2.043201208114624, "logits/rejected": -2.009258508682251, "logps/chosen": -233.2020263671875, "logps/rejected": -281.8921813964844, "loss": 0.5717, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7400344610214233, "rewards/margins": 0.5515367388725281, "rewards/rejected": -2.2915711402893066, "step": 12960 }, { "epoch": 2.2346657477601655, "grad_norm": 33.69052505493164, "learning_rate": 3.7101724742918915e-08, "logits/chosen": -2.036499500274658, "logits/rejected": -1.9901424646377563, "logps/chosen": -220.5728759765625, "logps/rejected": -300.4973449707031, "loss": 0.4663, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.664202094078064, "rewards/margins": 0.8676424026489258, "rewards/rejected": -2.5318446159362793, "step": 12970 }, { "epoch": 2.2363886974500344, "grad_norm": 53.17600631713867, "learning_rate": 3.694599071689329e-08, "logits/chosen": -2.0466625690460205, "logits/rejected": -2.016874313354492, "logps/chosen": -229.981201171875, "logps/rejected": -301.1022033691406, "loss": 0.5053, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7895774841308594, "rewards/margins": 0.7001364827156067, "rewards/rejected": -2.4897139072418213, "step": 12980 }, { "epoch": 2.2381116471399034, "grad_norm": 39.544010162353516, "learning_rate": 3.679051013037361e-08, "logits/chosen": -2.1186952590942383, "logits/rejected": -2.0678322315216064, "logps/chosen": -251.4757080078125, "logps/rejected": -324.4326171875, "loss": 0.5166, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.916233777999878, "rewards/margins": 0.8067552447319031, "rewards/rejected": -2.722989320755005, "step": 12990 }, { "epoch": 2.2398345968297724, "grad_norm": 37.2462158203125, "learning_rate": 3.663528360829915e-08, "logits/chosen": -2.0603792667388916, "logits/rejected": -2.017498254776001, "logps/chosen": -262.0799560546875, "logps/rejected": -340.74285888671875, "loss": 0.508, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.0763087272644043, "rewards/margins": 0.8207263946533203, "rewards/rejected": -2.8970351219177246, "step": 13000 }, { "epoch": 2.241557546519642, "grad_norm": 39.30596160888672, "learning_rate": 3.6480311774587877e-08, "logits/chosen": -2.0265860557556152, "logits/rejected": -1.9936189651489258, "logps/chosen": -259.8675537109375, "logps/rejected": -325.21942138671875, "loss": 0.5646, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0876617431640625, "rewards/margins": 0.672526478767395, "rewards/rejected": -2.760188579559326, "step": 13010 }, { "epoch": 2.2432804962095108, "grad_norm": 34.74739074707031, "learning_rate": 3.6325595252134144e-08, "logits/chosen": -2.0303027629852295, "logits/rejected": -1.9870803356170654, "logps/chosen": -241.1360321044922, "logps/rejected": -316.6763610839844, "loss": 0.5396, "rewards/accuracies": 0.6875, "rewards/chosen": -1.887058973312378, "rewards/margins": 0.7892248034477234, "rewards/rejected": -2.676283836364746, "step": 13020 }, { "epoch": 2.2450034458993797, "grad_norm": 35.928749084472656, "learning_rate": 3.617113466280612e-08, "logits/chosen": -2.1021182537078857, "logits/rejected": -2.066804885864258, "logps/chosen": -241.94442749023438, "logps/rejected": -299.8307189941406, "loss": 0.5608, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.8644864559173584, "rewards/margins": 0.6163435578346252, "rewards/rejected": -2.480829954147339, "step": 13030 }, { "epoch": 2.2467263955892487, "grad_norm": 52.62453842163086, "learning_rate": 3.601693062744322e-08, "logits/chosen": -2.1262030601501465, "logits/rejected": -2.082648515701294, "logps/chosen": -229.03598022460938, "logps/rejected": -314.4962158203125, "loss": 0.4906, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7390594482421875, "rewards/margins": 0.8555986285209656, "rewards/rejected": -2.594658136367798, "step": 13040 }, { "epoch": 2.2484493452791177, "grad_norm": 52.518775939941406, "learning_rate": 3.586298376585363e-08, "logits/chosen": -2.1036646366119385, "logits/rejected": -2.0692028999328613, "logps/chosen": -240.4881134033203, "logps/rejected": -314.6589050292969, "loss": 0.5534, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8791265487670898, "rewards/margins": 0.7353334426879883, "rewards/rejected": -2.614459753036499, "step": 13050 }, { "epoch": 2.250172294968987, "grad_norm": 43.86840057373047, "learning_rate": 3.5709294696811985e-08, "logits/chosen": -2.1041011810302734, "logits/rejected": -2.069333553314209, "logps/chosen": -238.55062866210938, "logps/rejected": -315.8983459472656, "loss": 0.5483, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.8438698053359985, "rewards/margins": 0.7788059115409851, "rewards/rejected": -2.6226754188537598, "step": 13060 }, { "epoch": 2.251895244658856, "grad_norm": 58.724525451660156, "learning_rate": 3.555586403805663e-08, "logits/chosen": -2.0676779747009277, "logits/rejected": -2.0228145122528076, "logps/chosen": -215.6838836669922, "logps/rejected": -275.9107360839844, "loss": 0.5651, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5825284719467163, "rewards/margins": 0.6505758166313171, "rewards/rejected": -2.2331044673919678, "step": 13070 }, { "epoch": 2.253618194348725, "grad_norm": 32.55445861816406, "learning_rate": 3.540269240628726e-08, "logits/chosen": -2.049349069595337, "logits/rejected": -2.0212624073028564, "logps/chosen": -225.2374725341797, "logps/rejected": -285.44671630859375, "loss": 0.5521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7316248416900635, "rewards/margins": 0.611584484577179, "rewards/rejected": -2.3432092666625977, "step": 13080 }, { "epoch": 2.255341144038594, "grad_norm": 37.775718688964844, "learning_rate": 3.52497804171625e-08, "logits/chosen": -2.148881196975708, "logits/rejected": -2.1036956310272217, "logps/chosen": -224.271484375, "logps/rejected": -284.09429931640625, "loss": 0.5724, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6970704793930054, "rewards/margins": 0.6304658651351929, "rewards/rejected": -2.327536106109619, "step": 13090 }, { "epoch": 2.257064093728463, "grad_norm": 25.495990753173828, "learning_rate": 3.509712868529738e-08, "logits/chosen": -2.2044754028320312, "logits/rejected": -2.1581830978393555, "logps/chosen": -213.9280242919922, "logps/rejected": -284.5126647949219, "loss": 0.5051, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5838515758514404, "rewards/margins": 0.7658999562263489, "rewards/rejected": -2.3497514724731445, "step": 13100 }, { "epoch": 2.2587870434183324, "grad_norm": 33.51130676269531, "learning_rate": 3.494473782426073e-08, "logits/chosen": -2.0514578819274902, "logits/rejected": -2.0082719326019287, "logps/chosen": -219.0296630859375, "logps/rejected": -288.70654296875, "loss": 0.5318, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6402019262313843, "rewards/margins": 0.7379582524299622, "rewards/rejected": -2.378160238265991, "step": 13110 }, { "epoch": 2.2605099931082013, "grad_norm": 31.88288688659668, "learning_rate": 3.479260844657297e-08, "logits/chosen": -2.1669747829437256, "logits/rejected": -2.1406798362731934, "logps/chosen": -217.6679229736328, "logps/rejected": -277.6645812988281, "loss": 0.5686, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.622894048690796, "rewards/margins": 0.6306630373001099, "rewards/rejected": -2.253556728363037, "step": 13120 }, { "epoch": 2.2622329427980703, "grad_norm": 31.417335510253906, "learning_rate": 3.46407411637034e-08, "logits/chosen": -2.207447052001953, "logits/rejected": -2.1617865562438965, "logps/chosen": -203.57965087890625, "logps/rejected": -294.2021484375, "loss": 0.4744, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4846456050872803, "rewards/margins": 0.8848323822021484, "rewards/rejected": -2.3694777488708496, "step": 13130 }, { "epoch": 2.2639558924879393, "grad_norm": 27.322017669677734, "learning_rate": 3.448913658606798e-08, "logits/chosen": -2.078812837600708, "logits/rejected": -2.040693759918213, "logps/chosen": -211.0071258544922, "logps/rejected": -282.39678955078125, "loss": 0.4836, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5680955648422241, "rewards/margins": 0.7172769904136658, "rewards/rejected": -2.285372257232666, "step": 13140 }, { "epoch": 2.2656788421778082, "grad_norm": 38.46590805053711, "learning_rate": 3.43377953230266e-08, "logits/chosen": -2.078317642211914, "logits/rejected": -2.0354676246643066, "logps/chosen": -227.1080780029297, "logps/rejected": -309.87066650390625, "loss": 0.4814, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7072769403457642, "rewards/margins": 0.8658990859985352, "rewards/rejected": -2.573176145553589, "step": 13150 }, { "epoch": 2.2674017918676777, "grad_norm": 40.2093620300293, "learning_rate": 3.418671798288093e-08, "logits/chosen": -2.0533547401428223, "logits/rejected": -2.008688449859619, "logps/chosen": -251.79995727539062, "logps/rejected": -322.4410705566406, "loss": 0.5278, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9460399150848389, "rewards/margins": 0.7493022084236145, "rewards/rejected": -2.6953423023223877, "step": 13160 }, { "epoch": 2.2691247415575466, "grad_norm": 48.867897033691406, "learning_rate": 3.403590517287175e-08, "logits/chosen": -2.0968425273895264, "logits/rejected": -2.0625452995300293, "logps/chosen": -235.0701446533203, "logps/rejected": -298.63433837890625, "loss": 0.5422, "rewards/accuracies": 0.71875, "rewards/chosen": -1.799288034439087, "rewards/margins": 0.670975387096405, "rewards/rejected": -2.4702632427215576, "step": 13170 }, { "epoch": 2.2708476912474156, "grad_norm": 39.24733352661133, "learning_rate": 3.388535749917653e-08, "logits/chosen": -2.1031875610351562, "logits/rejected": -2.054490566253662, "logps/chosen": -225.1061553955078, "logps/rejected": -303.8103332519531, "loss": 0.4691, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7030404806137085, "rewards/margins": 0.8334137201309204, "rewards/rejected": -2.536454200744629, "step": 13180 }, { "epoch": 2.2725706409372846, "grad_norm": 40.70262145996094, "learning_rate": 3.373507556690718e-08, "logits/chosen": -2.094082832336426, "logits/rejected": -2.0526416301727295, "logps/chosen": -237.23123168945312, "logps/rejected": -295.06768798828125, "loss": 0.5715, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8039411306381226, "rewards/margins": 0.6279858946800232, "rewards/rejected": -2.43192720413208, "step": 13190 }, { "epoch": 2.2742935906271535, "grad_norm": 35.791282653808594, "learning_rate": 3.358505998010743e-08, "logits/chosen": -2.080048084259033, "logits/rejected": -2.0455188751220703, "logps/chosen": -233.1878662109375, "logps/rejected": -290.905029296875, "loss": 0.5546, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7701551914215088, "rewards/margins": 0.6163936257362366, "rewards/rejected": -2.3865485191345215, "step": 13200 }, { "epoch": 2.2742935906271535, "eval_logits/chosen": -2.161210298538208, "eval_logits/rejected": -2.1413331031799316, "eval_logps/chosen": -226.16151428222656, "eval_logps/rejected": -261.48590087890625, "eval_loss": 0.6425279974937439, "eval_rewards/accuracies": 0.6303438544273376, "eval_rewards/chosen": -1.6714603900909424, "eval_rewards/margins": 0.31590259075164795, "eval_rewards/rejected": -1.9873627424240112, "eval_runtime": 383.1902, "eval_samples_per_second": 11.232, "eval_steps_per_second": 1.404, "step": 13200 }, { "epoch": 2.2760165403170225, "grad_norm": 36.40027618408203, "learning_rate": 3.343531134175046e-08, "logits/chosen": -2.1576075553894043, "logits/rejected": -2.1340811252593994, "logps/chosen": -220.05160522460938, "logps/rejected": -283.6050109863281, "loss": 0.5517, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6668260097503662, "rewards/margins": 0.6425694227218628, "rewards/rejected": -2.3093955516815186, "step": 13210 }, { "epoch": 2.277739490006892, "grad_norm": 35.08451843261719, "learning_rate": 3.3285830253736405e-08, "logits/chosen": -2.1245293617248535, "logits/rejected": -2.092247724533081, "logps/chosen": -225.3599853515625, "logps/rejected": -289.3499755859375, "loss": 0.5274, "rewards/accuracies": 0.71875, "rewards/chosen": -1.693889856338501, "rewards/margins": 0.6483731865882874, "rewards/rejected": -2.3422627449035645, "step": 13220 }, { "epoch": 2.279462439696761, "grad_norm": 27.97449493408203, "learning_rate": 3.313661731689013e-08, "logits/chosen": -2.0879783630371094, "logits/rejected": -2.0488433837890625, "logps/chosen": -219.3804473876953, "logps/rejected": -301.9391784667969, "loss": 0.5051, "rewards/accuracies": 0.75, "rewards/chosen": -1.670306921005249, "rewards/margins": 0.8253267407417297, "rewards/rejected": -2.495633602142334, "step": 13230 }, { "epoch": 2.28118538938663, "grad_norm": 32.750732421875, "learning_rate": 3.298767313095865e-08, "logits/chosen": -2.1046676635742188, "logits/rejected": -2.0787782669067383, "logps/chosen": -233.48611450195312, "logps/rejected": -293.97784423828125, "loss": 0.5207, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8176323175430298, "rewards/margins": 0.5957657694816589, "rewards/rejected": -2.413398265838623, "step": 13240 }, { "epoch": 2.282908339076499, "grad_norm": 35.13943862915039, "learning_rate": 3.283899829460873e-08, "logits/chosen": -2.0670647621154785, "logits/rejected": -2.035553455352783, "logps/chosen": -227.38436889648438, "logps/rejected": -314.468017578125, "loss": 0.4911, "rewards/accuracies": 0.75, "rewards/chosen": -1.7445780038833618, "rewards/margins": 0.8522817492485046, "rewards/rejected": -2.596859931945801, "step": 13250 }, { "epoch": 2.2846312887663682, "grad_norm": 41.03888702392578, "learning_rate": 3.269059340542448e-08, "logits/chosen": -2.161862373352051, "logits/rejected": -2.1250216960906982, "logps/chosen": -232.2884979248047, "logps/rejected": -315.02960205078125, "loss": 0.5266, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.828884482383728, "rewards/margins": 0.8013812899589539, "rewards/rejected": -2.630265712738037, "step": 13260 }, { "epoch": 2.286354238456237, "grad_norm": 35.590736389160156, "learning_rate": 3.2542459059905127e-08, "logits/chosen": -2.0372190475463867, "logits/rejected": -2.0048985481262207, "logps/chosen": -238.0127716064453, "logps/rejected": -311.5678405761719, "loss": 0.5186, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8236106634140015, "rewards/margins": 0.7575092315673828, "rewards/rejected": -2.581120014190674, "step": 13270 }, { "epoch": 2.288077188146106, "grad_norm": 38.78087615966797, "learning_rate": 3.239459585346228e-08, "logits/chosen": -2.093226909637451, "logits/rejected": -2.046966791152954, "logps/chosen": -222.5635528564453, "logps/rejected": -292.07373046875, "loss": 0.5405, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6759002208709717, "rewards/margins": 0.7385443449020386, "rewards/rejected": -2.4144444465637207, "step": 13280 }, { "epoch": 2.289800137835975, "grad_norm": 46.30903244018555, "learning_rate": 3.224700438041789e-08, "logits/chosen": -2.080031156539917, "logits/rejected": -2.040649652481079, "logps/chosen": -222.87655639648438, "logps/rejected": -294.4432678222656, "loss": 0.5004, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6777915954589844, "rewards/margins": 0.7428628206253052, "rewards/rejected": -2.420654535293579, "step": 13290 }, { "epoch": 2.291523087525844, "grad_norm": 36.877052307128906, "learning_rate": 3.209968523400165e-08, "logits/chosen": -2.088174343109131, "logits/rejected": -2.0509660243988037, "logps/chosen": -248.04592895507812, "logps/rejected": -313.04864501953125, "loss": 0.581, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.9629755020141602, "rewards/margins": 0.662213146686554, "rewards/rejected": -2.6251888275146484, "step": 13300 }, { "epoch": 2.293246037215713, "grad_norm": 34.83769226074219, "learning_rate": 3.195263900634863e-08, "logits/chosen": -2.093043088912964, "logits/rejected": -2.0511364936828613, "logps/chosen": -244.8745880126953, "logps/rejected": -321.04046630859375, "loss": 0.5388, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8718029260635376, "rewards/margins": 0.8050811886787415, "rewards/rejected": -2.676884174346924, "step": 13310 }, { "epoch": 2.2949689869055825, "grad_norm": 50.702056884765625, "learning_rate": 3.180586628849692e-08, "logits/chosen": -2.1294782161712646, "logits/rejected": -2.081721782684326, "logps/chosen": -243.4737548828125, "logps/rejected": -289.72357177734375, "loss": 0.623, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.866064429283142, "rewards/margins": 0.5558760166168213, "rewards/rejected": -2.421940565109253, "step": 13320 }, { "epoch": 2.2966919365954515, "grad_norm": 38.728511810302734, "learning_rate": 3.165936767038534e-08, "logits/chosen": -2.0673575401306152, "logits/rejected": -2.025552988052368, "logps/chosen": -212.02685546875, "logps/rejected": -287.38671875, "loss": 0.5166, "rewards/accuracies": 0.75, "rewards/chosen": -1.5518863201141357, "rewards/margins": 0.8267132639884949, "rewards/rejected": -2.3785996437072754, "step": 13330 }, { "epoch": 2.2984148862853204, "grad_norm": 47.14889907836914, "learning_rate": 3.151314374085097e-08, "logits/chosen": -2.169553518295288, "logits/rejected": -2.141165256500244, "logps/chosen": -221.1737823486328, "logps/rejected": -283.49029541015625, "loss": 0.5646, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6549873352050781, "rewards/margins": 0.6485617756843567, "rewards/rejected": -2.30354905128479, "step": 13340 }, { "epoch": 2.3001378359751894, "grad_norm": 35.23659133911133, "learning_rate": 3.136719508762674e-08, "logits/chosen": -2.1357874870300293, "logits/rejected": -2.0790905952453613, "logps/chosen": -203.52828979492188, "logps/rejected": -281.9134826660156, "loss": 0.4834, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.461071252822876, "rewards/margins": 0.850469708442688, "rewards/rejected": -2.3115413188934326, "step": 13350 }, { "epoch": 2.301860785665059, "grad_norm": 40.01634216308594, "learning_rate": 3.1221522297339177e-08, "logits/chosen": -2.1334757804870605, "logits/rejected": -2.0900192260742188, "logps/chosen": -217.0267333984375, "logps/rejected": -296.7123718261719, "loss": 0.5045, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5906345844268799, "rewards/margins": 0.8470972180366516, "rewards/rejected": -2.4377317428588867, "step": 13360 }, { "epoch": 2.3035837353549278, "grad_norm": 35.108001708984375, "learning_rate": 3.1076125955506015e-08, "logits/chosen": -2.093878746032715, "logits/rejected": -2.039607524871826, "logps/chosen": -217.65939331054688, "logps/rejected": -293.3722229003906, "loss": 0.5265, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6202102899551392, "rewards/margins": 0.7910685539245605, "rewards/rejected": -2.4112792015075684, "step": 13370 }, { "epoch": 2.3053066850447967, "grad_norm": 45.69697952270508, "learning_rate": 3.0931006646533866e-08, "logits/chosen": -2.041187286376953, "logits/rejected": -1.9984385967254639, "logps/chosen": -230.24514770507812, "logps/rejected": -290.89263916015625, "loss": 0.5392, "rewards/accuracies": 0.75, "rewards/chosen": -1.7188899517059326, "rewards/margins": 0.6583026647567749, "rewards/rejected": -2.377192497253418, "step": 13380 }, { "epoch": 2.3070296347346657, "grad_norm": 45.5197639465332, "learning_rate": 3.078616495371574e-08, "logits/chosen": -2.070643186569214, "logits/rejected": -2.014615535736084, "logps/chosen": -212.42764282226562, "logps/rejected": -277.73175048828125, "loss": 0.5393, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5495667457580566, "rewards/margins": 0.7197413444519043, "rewards/rejected": -2.269308090209961, "step": 13390 }, { "epoch": 2.3087525844245347, "grad_norm": 27.229652404785156, "learning_rate": 3.064160145922884e-08, "logits/chosen": -2.0850396156311035, "logits/rejected": -2.0378663539886475, "logps/chosen": -216.22128295898438, "logps/rejected": -297.5529479980469, "loss": 0.4751, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6000601053237915, "rewards/margins": 0.8747854232788086, "rewards/rejected": -2.4748454093933105, "step": 13400 }, { "epoch": 2.3104755341144037, "grad_norm": 36.54592514038086, "learning_rate": 3.0497316744132215e-08, "logits/chosen": -2.104112148284912, "logits/rejected": -2.0560758113861084, "logps/chosen": -241.180908203125, "logps/rejected": -320.4126892089844, "loss": 0.5309, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.864006757736206, "rewards/margins": 0.8387478590011597, "rewards/rejected": -2.702754497528076, "step": 13410 }, { "epoch": 2.312198483804273, "grad_norm": 43.858726501464844, "learning_rate": 3.035331138836431e-08, "logits/chosen": -2.0926592350006104, "logits/rejected": -2.0584444999694824, "logps/chosen": -238.5738983154297, "logps/rejected": -321.810546875, "loss": 0.4983, "rewards/accuracies": 0.75, "rewards/chosen": -1.844842553138733, "rewards/margins": 0.8219674229621887, "rewards/rejected": -2.6668102741241455, "step": 13420 }, { "epoch": 2.313921433494142, "grad_norm": 47.705291748046875, "learning_rate": 3.020958597074081e-08, "logits/chosen": -2.166374683380127, "logits/rejected": -2.1191868782043457, "logps/chosen": -236.5503692626953, "logps/rejected": -315.83587646484375, "loss": 0.5115, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8055598735809326, "rewards/margins": 0.7851177453994751, "rewards/rejected": -2.590677499771118, "step": 13430 }, { "epoch": 2.315644383184011, "grad_norm": 40.79521942138672, "learning_rate": 3.006614106895211e-08, "logits/chosen": -2.026750087738037, "logits/rejected": -1.9977821111679077, "logps/chosen": -223.0307159423828, "logps/rejected": -284.9368896484375, "loss": 0.5683, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6900018453598022, "rewards/margins": 0.6437240839004517, "rewards/rejected": -2.333725929260254, "step": 13440 }, { "epoch": 2.31736733287388, "grad_norm": 32.59689712524414, "learning_rate": 2.992297725956121e-08, "logits/chosen": -2.061788558959961, "logits/rejected": -2.020012378692627, "logps/chosen": -218.10922241210938, "logps/rejected": -291.716552734375, "loss": 0.5251, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6696808338165283, "rewards/margins": 0.7335036993026733, "rewards/rejected": -2.403184413909912, "step": 13450 }, { "epoch": 2.3190902825637494, "grad_norm": 41.41628646850586, "learning_rate": 2.978009511800116e-08, "logits/chosen": -2.0997917652130127, "logits/rejected": -2.0537500381469727, "logps/chosen": -220.21572875976562, "logps/rejected": -301.9630126953125, "loss": 0.4729, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.643982172012329, "rewards/margins": 0.8779670596122742, "rewards/rejected": -2.521949291229248, "step": 13460 }, { "epoch": 2.3208132322536184, "grad_norm": 38.67691421508789, "learning_rate": 2.9637495218572972e-08, "logits/chosen": -2.0158824920654297, "logits/rejected": -1.976339340209961, "logps/chosen": -237.1468048095703, "logps/rejected": -301.9319152832031, "loss": 0.5574, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.8132215738296509, "rewards/margins": 0.709286093711853, "rewards/rejected": -2.522507429122925, "step": 13470 }, { "epoch": 2.3225361819434873, "grad_norm": 44.51760482788086, "learning_rate": 2.9495178134443254e-08, "logits/chosen": -2.125648021697998, "logits/rejected": -2.0716588497161865, "logps/chosen": -220.8754425048828, "logps/rejected": -296.6810607910156, "loss": 0.4888, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6669012308120728, "rewards/margins": 0.7847355008125305, "rewards/rejected": -2.451636552810669, "step": 13480 }, { "epoch": 2.3242591316333563, "grad_norm": 31.296419143676758, "learning_rate": 2.9353144437641662e-08, "logits/chosen": -2.1042752265930176, "logits/rejected": -2.056389570236206, "logps/chosen": -234.0042266845703, "logps/rejected": -307.01837158203125, "loss": 0.5252, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7978731393814087, "rewards/margins": 0.7429569959640503, "rewards/rejected": -2.540830135345459, "step": 13490 }, { "epoch": 2.3259820813232253, "grad_norm": 41.06807327270508, "learning_rate": 2.9211394699058987e-08, "logits/chosen": -2.094754457473755, "logits/rejected": -2.0499045848846436, "logps/chosen": -236.43984985351562, "logps/rejected": -315.6556396484375, "loss": 0.473, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7972850799560547, "rewards/margins": 0.8472954630851746, "rewards/rejected": -2.644580602645874, "step": 13500 }, { "epoch": 2.3277050310130942, "grad_norm": 31.174585342407227, "learning_rate": 2.9069929488444678e-08, "logits/chosen": -2.026517391204834, "logits/rejected": -1.997696876525879, "logps/chosen": -222.0631866455078, "logps/rejected": -294.4779357910156, "loss": 0.5431, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7136270999908447, "rewards/margins": 0.6927502155303955, "rewards/rejected": -2.4063773155212402, "step": 13510 }, { "epoch": 2.3294279807029636, "grad_norm": 31.05910301208496, "learning_rate": 2.8928749374404448e-08, "logits/chosen": -1.9960353374481201, "logits/rejected": -1.9593353271484375, "logps/chosen": -234.98104858398438, "logps/rejected": -313.2256164550781, "loss": 0.5235, "rewards/accuracies": 0.75, "rewards/chosen": -1.8336776494979858, "rewards/margins": 0.7925911545753479, "rewards/rejected": -2.6262686252593994, "step": 13520 }, { "epoch": 2.3311509303928326, "grad_norm": 54.28462219238281, "learning_rate": 2.8787854924398123e-08, "logits/chosen": -2.093662738800049, "logits/rejected": -2.056663990020752, "logps/chosen": -235.8678436279297, "logps/rejected": -285.80438232421875, "loss": 0.5827, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.8033876419067383, "rewards/margins": 0.5410268306732178, "rewards/rejected": -2.344414472579956, "step": 13530 }, { "epoch": 2.3328738800827016, "grad_norm": 67.41534423828125, "learning_rate": 2.8647246704737382e-08, "logits/chosen": -2.041447401046753, "logits/rejected": -1.9953224658966064, "logps/chosen": -233.7212677001953, "logps/rejected": -306.98822021484375, "loss": 0.5158, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7728395462036133, "rewards/margins": 0.8034697771072388, "rewards/rejected": -2.5763089656829834, "step": 13540 }, { "epoch": 2.3345968297725705, "grad_norm": 35.68818664550781, "learning_rate": 2.8506925280583417e-08, "logits/chosen": -2.074463367462158, "logits/rejected": -2.0343222618103027, "logps/chosen": -231.67538452148438, "logps/rejected": -292.61614990234375, "loss": 0.5818, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8013169765472412, "rewards/margins": 0.6350749135017395, "rewards/rejected": -2.436391830444336, "step": 13550 }, { "epoch": 2.3363197794624395, "grad_norm": 47.221675872802734, "learning_rate": 2.8366891215944598e-08, "logits/chosen": -2.104546308517456, "logits/rejected": -2.0793075561523438, "logps/chosen": -211.36093139648438, "logps/rejected": -272.89996337890625, "loss": 0.5679, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5778541564941406, "rewards/margins": 0.6280261874198914, "rewards/rejected": -2.205880641937256, "step": 13560 }, { "epoch": 2.338042729152309, "grad_norm": 31.64224624633789, "learning_rate": 2.8227145073674385e-08, "logits/chosen": -2.0302841663360596, "logits/rejected": -1.9922692775726318, "logps/chosen": -227.22756958007812, "logps/rejected": -306.2268981933594, "loss": 0.4881, "rewards/accuracies": 0.75, "rewards/chosen": -1.7482671737670898, "rewards/margins": 0.8114362955093384, "rewards/rejected": -2.5597033500671387, "step": 13570 }, { "epoch": 2.339765678842178, "grad_norm": 36.071170806884766, "learning_rate": 2.8087687415468896e-08, "logits/chosen": -2.077479600906372, "logits/rejected": -2.052595615386963, "logps/chosen": -217.6588592529297, "logps/rejected": -281.9828796386719, "loss": 0.5402, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.644518256187439, "rewards/margins": 0.6487827897071838, "rewards/rejected": -2.2933011054992676, "step": 13580 }, { "epoch": 2.341488628532047, "grad_norm": 41.25293731689453, "learning_rate": 2.7948518801864697e-08, "logits/chosen": -2.041703939437866, "logits/rejected": -2.0177359580993652, "logps/chosen": -224.66085815429688, "logps/rejected": -298.35089111328125, "loss": 0.5183, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7158304452896118, "rewards/margins": 0.7353793978691101, "rewards/rejected": -2.451209545135498, "step": 13590 }, { "epoch": 2.343211578221916, "grad_norm": 45.824951171875, "learning_rate": 2.780963979223663e-08, "logits/chosen": -2.097001552581787, "logits/rejected": -2.0635221004486084, "logps/chosen": -229.45321655273438, "logps/rejected": -288.4214782714844, "loss": 0.5639, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.695074439048767, "rewards/margins": 0.6171914935112, "rewards/rejected": -2.3122661113739014, "step": 13600 }, { "epoch": 2.343211578221916, "eval_logits/chosen": -2.167468786239624, "eval_logits/rejected": -2.1481270790100098, "eval_logps/chosen": -218.10006713867188, "eval_logps/rejected": -252.5519256591797, "eval_loss": 0.6408571600914001, "eval_rewards/accuracies": 0.6289498209953308, "eval_rewards/chosen": -1.5908458232879639, "eval_rewards/margins": 0.30717742443084717, "eval_rewards/rejected": -1.8980233669281006, "eval_runtime": 382.927, "eval_samples_per_second": 11.24, "eval_steps_per_second": 1.405, "step": 13600 }, { "epoch": 2.344934527911785, "grad_norm": 56.389686584472656, "learning_rate": 2.7671050944795494e-08, "logits/chosen": -2.2281742095947266, "logits/rejected": -2.187246561050415, "logps/chosen": -214.6632537841797, "logps/rejected": -280.9909362792969, "loss": 0.5508, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6076910495758057, "rewards/margins": 0.6706972122192383, "rewards/rejected": -2.278388261795044, "step": 13610 }, { "epoch": 2.346657477601654, "grad_norm": 27.47957420349121, "learning_rate": 2.753275281658578e-08, "logits/chosen": -2.1068856716156006, "logits/rejected": -2.0513110160827637, "logps/chosen": -219.8840789794922, "logps/rejected": -288.0721740722656, "loss": 0.4984, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6228272914886475, "rewards/margins": 0.7473019361495972, "rewards/rejected": -2.370129346847534, "step": 13620 }, { "epoch": 2.348380427291523, "grad_norm": 36.49810791015625, "learning_rate": 2.7394745963483414e-08, "logits/chosen": -2.041313409805298, "logits/rejected": -1.9859342575073242, "logps/chosen": -224.9228973388672, "logps/rejected": -303.43414306640625, "loss": 0.4732, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6923065185546875, "rewards/margins": 0.8486798405647278, "rewards/rejected": -2.5409865379333496, "step": 13630 }, { "epoch": 2.350103376981392, "grad_norm": 33.58571243286133, "learning_rate": 2.725703094019368e-08, "logits/chosen": -2.050534725189209, "logits/rejected": -2.008087396621704, "logps/chosen": -226.65194702148438, "logps/rejected": -307.14324951171875, "loss": 0.4801, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7088916301727295, "rewards/margins": 0.8303858041763306, "rewards/rejected": -2.5392775535583496, "step": 13640 }, { "epoch": 2.351826326671261, "grad_norm": 39.161048889160156, "learning_rate": 2.7119608300248842e-08, "logits/chosen": -2.118753671646118, "logits/rejected": -2.077744245529175, "logps/chosen": -237.79550170898438, "logps/rejected": -312.37933349609375, "loss": 0.508, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.826664686203003, "rewards/margins": 0.7895326018333435, "rewards/rejected": -2.616197109222412, "step": 13650 }, { "epoch": 2.35354927636113, "grad_norm": 46.64421463012695, "learning_rate": 2.698247859600591e-08, "logits/chosen": -2.0097432136535645, "logits/rejected": -1.9697071313858032, "logps/chosen": -232.4499969482422, "logps/rejected": -302.5597229003906, "loss": 0.5348, "rewards/accuracies": 0.75, "rewards/chosen": -1.7880842685699463, "rewards/margins": 0.7070426940917969, "rewards/rejected": -2.495126962661743, "step": 13660 }, { "epoch": 2.3552722260509995, "grad_norm": 42.685482025146484, "learning_rate": 2.6845642378644463e-08, "logits/chosen": -2.0989060401916504, "logits/rejected": -2.05808162689209, "logps/chosen": -238.69668579101562, "logps/rejected": -302.35272216796875, "loss": 0.5426, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8286670446395874, "rewards/margins": 0.6564762592315674, "rewards/rejected": -2.4851431846618652, "step": 13670 }, { "epoch": 2.3569951757408685, "grad_norm": 40.78122329711914, "learning_rate": 2.6709100198164513e-08, "logits/chosen": -2.105421781539917, "logits/rejected": -2.0629665851593018, "logps/chosen": -236.335693359375, "logps/rejected": -299.8169250488281, "loss": 0.5282, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8018970489501953, "rewards/margins": 0.6912158727645874, "rewards/rejected": -2.4931130409240723, "step": 13680 }, { "epoch": 2.3587181254307374, "grad_norm": 33.260982513427734, "learning_rate": 2.657285260338421e-08, "logits/chosen": -2.078080892562866, "logits/rejected": -2.0286943912506104, "logps/chosen": -236.0628204345703, "logps/rejected": -314.62322998046875, "loss": 0.4971, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.814112901687622, "rewards/margins": 0.7981823086738586, "rewards/rejected": -2.612295150756836, "step": 13690 }, { "epoch": 2.3604410751206064, "grad_norm": 58.762420654296875, "learning_rate": 2.643690014193758e-08, "logits/chosen": -2.072925090789795, "logits/rejected": -2.030588150024414, "logps/chosen": -245.0630340576172, "logps/rejected": -304.95379638671875, "loss": 0.5536, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8534374237060547, "rewards/margins": 0.6731254458427429, "rewards/rejected": -2.5265629291534424, "step": 13700 }, { "epoch": 2.3621640248104754, "grad_norm": 53.97275924682617, "learning_rate": 2.6301243360272394e-08, "logits/chosen": -2.022672176361084, "logits/rejected": -1.9718831777572632, "logps/chosen": -233.41122436523438, "logps/rejected": -311.2149353027344, "loss": 0.5107, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.787607192993164, "rewards/margins": 0.790905773639679, "rewards/rejected": -2.578512668609619, "step": 13710 }, { "epoch": 2.3638869745003444, "grad_norm": 61.27710723876953, "learning_rate": 2.6165882803648055e-08, "logits/chosen": -2.044752597808838, "logits/rejected": -2.001466751098633, "logps/chosen": -232.3321533203125, "logps/rejected": -300.99346923828125, "loss": 0.5336, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7707017660140991, "rewards/margins": 0.7224616408348083, "rewards/rejected": -2.4931633472442627, "step": 13720 }, { "epoch": 2.3656099241902138, "grad_norm": 73.16886901855469, "learning_rate": 2.60308190161332e-08, "logits/chosen": -2.161386489868164, "logits/rejected": -2.1212000846862793, "logps/chosen": -235.1083221435547, "logps/rejected": -325.3297424316406, "loss": 0.4772, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7882179021835327, "rewards/margins": 0.8951144218444824, "rewards/rejected": -2.6833324432373047, "step": 13730 }, { "epoch": 2.3673328738800827, "grad_norm": 50.45926284790039, "learning_rate": 2.5896052540603706e-08, "logits/chosen": -2.141047477722168, "logits/rejected": -2.095937490463257, "logps/chosen": -246.432373046875, "logps/rejected": -324.0269775390625, "loss": 0.517, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8807487487792969, "rewards/margins": 0.8315391540527344, "rewards/rejected": -2.7122879028320312, "step": 13740 }, { "epoch": 2.3690558235699517, "grad_norm": 41.474308013916016, "learning_rate": 2.576158391874047e-08, "logits/chosen": -2.1030590534210205, "logits/rejected": -2.0579898357391357, "logps/chosen": -246.07363891601562, "logps/rejected": -331.3897399902344, "loss": 0.5176, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8955215215682983, "rewards/margins": 0.8814752697944641, "rewards/rejected": -2.7769968509674072, "step": 13750 }, { "epoch": 2.3707787732598207, "grad_norm": 52.19126510620117, "learning_rate": 2.562741369102711e-08, "logits/chosen": -2.1607160568237305, "logits/rejected": -2.1174328327178955, "logps/chosen": -229.7025146484375, "logps/rejected": -289.3486328125, "loss": 0.5784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7426655292510986, "rewards/margins": 0.6520506739616394, "rewards/rejected": -2.394716262817383, "step": 13760 }, { "epoch": 2.37250172294969, "grad_norm": 45.66209411621094, "learning_rate": 2.549354239674786e-08, "logits/chosen": -2.155529260635376, "logits/rejected": -2.1266839504241943, "logps/chosen": -226.15097045898438, "logps/rejected": -297.9380798339844, "loss": 0.5314, "rewards/accuracies": 0.71875, "rewards/chosen": -1.746569275856018, "rewards/margins": 0.694832980632782, "rewards/rejected": -2.441401958465576, "step": 13770 }, { "epoch": 2.374224672639559, "grad_norm": 30.729534149169922, "learning_rate": 2.5359970573985524e-08, "logits/chosen": -2.2028872966766357, "logits/rejected": -2.151646137237549, "logps/chosen": -237.97409057617188, "logps/rejected": -293.50445556640625, "loss": 0.5552, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.779500961303711, "rewards/margins": 0.6468724012374878, "rewards/rejected": -2.4263734817504883, "step": 13780 }, { "epoch": 2.375947622329428, "grad_norm": 33.8580207824707, "learning_rate": 2.522669875961919e-08, "logits/chosen": -2.11983060836792, "logits/rejected": -2.080904722213745, "logps/chosen": -215.71340942382812, "logps/rejected": -279.94342041015625, "loss": 0.5113, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5791537761688232, "rewards/margins": 0.7006839513778687, "rewards/rejected": -2.2798376083374023, "step": 13790 }, { "epoch": 2.377670572019297, "grad_norm": 43.75486373901367, "learning_rate": 2.509372748932195e-08, "logits/chosen": -2.186190128326416, "logits/rejected": -2.13720703125, "logps/chosen": -213.9628448486328, "logps/rejected": -287.0473937988281, "loss": 0.4907, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5549368858337402, "rewards/margins": 0.8049592971801758, "rewards/rejected": -2.359896183013916, "step": 13800 }, { "epoch": 2.379393521709166, "grad_norm": 49.993106842041016, "learning_rate": 2.4961057297559064e-08, "logits/chosen": -2.102524995803833, "logits/rejected": -2.061375617980957, "logps/chosen": -212.2505340576172, "logps/rejected": -283.9283447265625, "loss": 0.5364, "rewards/accuracies": 0.6875, "rewards/chosen": -1.581167459487915, "rewards/margins": 0.7305034399032593, "rewards/rejected": -2.3116707801818848, "step": 13810 }, { "epoch": 2.381116471399035, "grad_norm": 39.11954879760742, "learning_rate": 2.4828688717585567e-08, "logits/chosen": -2.153839111328125, "logits/rejected": -2.103940963745117, "logps/chosen": -216.89474487304688, "logps/rejected": -281.2050476074219, "loss": 0.5263, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5603468418121338, "rewards/margins": 0.7254433035850525, "rewards/rejected": -2.285790205001831, "step": 13820 }, { "epoch": 2.3828394210889043, "grad_norm": 41.82209396362305, "learning_rate": 2.4696622281444158e-08, "logits/chosen": -2.1807799339294434, "logits/rejected": -2.154695987701416, "logps/chosen": -209.79354858398438, "logps/rejected": -268.2076416015625, "loss": 0.5323, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5338456630706787, "rewards/margins": 0.6318690180778503, "rewards/rejected": -2.165714979171753, "step": 13830 }, { "epoch": 2.3845623707787733, "grad_norm": 39.15401077270508, "learning_rate": 2.4564858519963195e-08, "logits/chosen": -2.1524453163146973, "logits/rejected": -2.1096901893615723, "logps/chosen": -218.0504150390625, "logps/rejected": -271.6688537597656, "loss": 0.5577, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6052839756011963, "rewards/margins": 0.59880530834198, "rewards/rejected": -2.204089641571045, "step": 13840 }, { "epoch": 2.3862853204686423, "grad_norm": 38.87300109863281, "learning_rate": 2.443339796275432e-08, "logits/chosen": -2.078158140182495, "logits/rejected": -2.0419223308563232, "logps/chosen": -215.25033569335938, "logps/rejected": -275.47967529296875, "loss": 0.5611, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6021534204483032, "rewards/margins": 0.6668301224708557, "rewards/rejected": -2.2689836025238037, "step": 13850 }, { "epoch": 2.3880082701585112, "grad_norm": 37.93467330932617, "learning_rate": 2.4302241138210633e-08, "logits/chosen": -2.0814151763916016, "logits/rejected": -2.04856276512146, "logps/chosen": -221.61465454101562, "logps/rejected": -282.6617736816406, "loss": 0.5406, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6941856145858765, "rewards/margins": 0.6259894371032715, "rewards/rejected": -2.3201751708984375, "step": 13860 }, { "epoch": 2.3897312198483807, "grad_norm": 44.15261459350586, "learning_rate": 2.417138857350428e-08, "logits/chosen": -2.119511365890503, "logits/rejected": -2.085003614425659, "logps/chosen": -235.39614868164062, "logps/rejected": -303.26483154296875, "loss": 0.552, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7935514450073242, "rewards/margins": 0.7201517820358276, "rewards/rejected": -2.513702869415283, "step": 13870 }, { "epoch": 2.3914541695382496, "grad_norm": 36.41903305053711, "learning_rate": 2.404084079458457e-08, "logits/chosen": -2.0601439476013184, "logits/rejected": -2.0118136405944824, "logps/chosen": -226.09365844726562, "logps/rejected": -284.78387451171875, "loss": 0.5702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.706420660018921, "rewards/margins": 0.6236202716827393, "rewards/rejected": -2.3300411701202393, "step": 13880 }, { "epoch": 2.3931771192281186, "grad_norm": 31.09129524230957, "learning_rate": 2.3910598326175635e-08, "logits/chosen": -2.155531644821167, "logits/rejected": -2.1182315349578857, "logps/chosen": -211.36474609375, "logps/rejected": -272.09503173828125, "loss": 0.5368, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5504828691482544, "rewards/margins": 0.6465977430343628, "rewards/rejected": -2.197080612182617, "step": 13890 }, { "epoch": 2.3949000689179876, "grad_norm": 38.452693939208984, "learning_rate": 2.3780661691774585e-08, "logits/chosen": -2.0754923820495605, "logits/rejected": -2.0319905281066895, "logps/chosen": -206.372314453125, "logps/rejected": -269.18389892578125, "loss": 0.5489, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5245025157928467, "rewards/margins": 0.6824948787689209, "rewards/rejected": -2.2069973945617676, "step": 13900 }, { "epoch": 2.3966230186078565, "grad_norm": 47.45440673828125, "learning_rate": 2.3651031413649127e-08, "logits/chosen": -2.1021831035614014, "logits/rejected": -2.0657753944396973, "logps/chosen": -202.1910400390625, "logps/rejected": -255.5797576904297, "loss": 0.5502, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.447701096534729, "rewards/margins": 0.5892106890678406, "rewards/rejected": -2.036911964416504, "step": 13910 }, { "epoch": 2.3983459682977255, "grad_norm": 52.61918640136719, "learning_rate": 2.3521708012835696e-08, "logits/chosen": -2.1503491401672363, "logits/rejected": -2.1002659797668457, "logps/chosen": -216.20458984375, "logps/rejected": -273.5358581542969, "loss": 0.518, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5507729053497314, "rewards/margins": 0.6722269654273987, "rewards/rejected": -2.2229995727539062, "step": 13920 }, { "epoch": 2.400068917987595, "grad_norm": 38.167327880859375, "learning_rate": 2.3392692009137193e-08, "logits/chosen": -2.1040003299713135, "logits/rejected": -2.0707528591156006, "logps/chosen": -205.02896118164062, "logps/rejected": -252.3942413330078, "loss": 0.603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5288441181182861, "rewards/margins": 0.4923536777496338, "rewards/rejected": -2.02119779586792, "step": 13930 }, { "epoch": 2.401791867677464, "grad_norm": 39.31071090698242, "learning_rate": 2.3263983921120987e-08, "logits/chosen": -2.0793838500976562, "logits/rejected": -2.0337014198303223, "logps/chosen": -198.89938354492188, "logps/rejected": -282.0915832519531, "loss": 0.5019, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.450226068496704, "rewards/margins": 0.8140614628791809, "rewards/rejected": -2.2642874717712402, "step": 13940 }, { "epoch": 2.403514817367333, "grad_norm": 49.855323791503906, "learning_rate": 2.3135584266116837e-08, "logits/chosen": -2.140976667404175, "logits/rejected": -2.103226661682129, "logps/chosen": -216.03695678710938, "logps/rejected": -277.74993896484375, "loss": 0.5757, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6169942617416382, "rewards/margins": 0.6376385688781738, "rewards/rejected": -2.2546324729919434, "step": 13950 }, { "epoch": 2.405237767057202, "grad_norm": 34.05168533325195, "learning_rate": 2.3007493560214787e-08, "logits/chosen": -2.008964776992798, "logits/rejected": -1.994206428527832, "logps/chosen": -208.091064453125, "logps/rejected": -256.63726806640625, "loss": 0.5781, "rewards/accuracies": 0.6875, "rewards/chosen": -1.53592848777771, "rewards/margins": 0.5078008770942688, "rewards/rejected": -2.043729305267334, "step": 13960 }, { "epoch": 2.406960716747071, "grad_norm": 27.172494888305664, "learning_rate": 2.2879712318263056e-08, "logits/chosen": -2.1079671382904053, "logits/rejected": -2.0630617141723633, "logps/chosen": -207.2488250732422, "logps/rejected": -272.19744873046875, "loss": 0.5354, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4706203937530518, "rewards/margins": 0.7252450585365295, "rewards/rejected": -2.1958653926849365, "step": 13970 }, { "epoch": 2.40868366643694, "grad_norm": 38.71217346191406, "learning_rate": 2.2752241053865973e-08, "logits/chosen": -2.0822973251342773, "logits/rejected": -2.0517477989196777, "logps/chosen": -209.283447265625, "logps/rejected": -281.3548278808594, "loss": 0.5157, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5362039804458618, "rewards/margins": 0.7391260266304016, "rewards/rejected": -2.275330066680908, "step": 13980 }, { "epoch": 2.410406616126809, "grad_norm": 49.03131103515625, "learning_rate": 2.2625080279382024e-08, "logits/chosen": -2.108020067214966, "logits/rejected": -2.0622098445892334, "logps/chosen": -210.24856567382812, "logps/rejected": -269.4635925292969, "loss": 0.55, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5178747177124023, "rewards/margins": 0.6414793729782104, "rewards/rejected": -2.1593542098999023, "step": 13990 }, { "epoch": 2.412129565816678, "grad_norm": 38.7979736328125, "learning_rate": 2.249823050592169e-08, "logits/chosen": -2.0372188091278076, "logits/rejected": -1.9894564151763916, "logps/chosen": -209.2309112548828, "logps/rejected": -284.48480224609375, "loss": 0.5055, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5764163732528687, "rewards/margins": 0.7421399354934692, "rewards/rejected": -2.318556070327759, "step": 14000 }, { "epoch": 2.412129565816678, "eval_logits/chosen": -2.1856634616851807, "eval_logits/rejected": -2.1664555072784424, "eval_logps/chosen": -205.19786071777344, "eval_logps/rejected": -239.03466796875, "eval_loss": 0.6384242177009583, "eval_rewards/accuracies": 0.6256970167160034, "eval_rewards/chosen": -1.46182382106781, "eval_rewards/margins": 0.3010266423225403, "eval_rewards/rejected": -1.7628505229949951, "eval_runtime": 383.0062, "eval_samples_per_second": 11.237, "eval_steps_per_second": 1.405, "step": 14000 }, { "epoch": 2.413852515506547, "grad_norm": 31.980548858642578, "learning_rate": 2.2371692243345354e-08, "logits/chosen": -2.0730888843536377, "logits/rejected": -2.044764280319214, "logps/chosen": -214.8597869873047, "logps/rejected": -277.0663146972656, "loss": 0.5672, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5847432613372803, "rewards/margins": 0.660110354423523, "rewards/rejected": -2.2448537349700928, "step": 14010 }, { "epoch": 2.415575465196416, "grad_norm": 36.432456970214844, "learning_rate": 2.2245466000261394e-08, "logits/chosen": -2.082345962524414, "logits/rejected": -2.0584168434143066, "logps/chosen": -218.6758575439453, "logps/rejected": -274.13787841796875, "loss": 0.5691, "rewards/accuracies": 0.6875, "rewards/chosen": -1.631087303161621, "rewards/margins": 0.6045592427253723, "rewards/rejected": -2.2356464862823486, "step": 14020 }, { "epoch": 2.4172984148862855, "grad_norm": 40.95353698730469, "learning_rate": 2.211955228402399e-08, "logits/chosen": -2.0695528984069824, "logits/rejected": -2.0302963256835938, "logps/chosen": -220.57418823242188, "logps/rejected": -279.3653259277344, "loss": 0.5479, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6627451181411743, "rewards/margins": 0.6441237926483154, "rewards/rejected": -2.3068687915802, "step": 14030 }, { "epoch": 2.4190213645761545, "grad_norm": 43.107215881347656, "learning_rate": 2.1993951600731154e-08, "logits/chosen": -2.075979709625244, "logits/rejected": -2.0187344551086426, "logps/chosen": -218.7676239013672, "logps/rejected": -283.742431640625, "loss": 0.512, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6136515140533447, "rewards/margins": 0.7331534624099731, "rewards/rejected": -2.3468048572540283, "step": 14040 }, { "epoch": 2.4207443142660234, "grad_norm": 32.694793701171875, "learning_rate": 2.186866445522273e-08, "logits/chosen": -2.1185286045074463, "logits/rejected": -2.072413682937622, "logps/chosen": -202.73074340820312, "logps/rejected": -259.3929443359375, "loss": 0.5442, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4719576835632324, "rewards/margins": 0.622622013092041, "rewards/rejected": -2.0945794582366943, "step": 14050 }, { "epoch": 2.4224672639558924, "grad_norm": 36.23963165283203, "learning_rate": 2.1743691351078332e-08, "logits/chosen": -2.129517078399658, "logits/rejected": -2.070852756500244, "logps/chosen": -210.60415649414062, "logps/rejected": -298.68585205078125, "loss": 0.4587, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.535704255104065, "rewards/margins": 0.9426101446151733, "rewards/rejected": -2.4783143997192383, "step": 14060 }, { "epoch": 2.4241902136457614, "grad_norm": 43.657432556152344, "learning_rate": 2.161903279061529e-08, "logits/chosen": -2.0908563137054443, "logits/rejected": -2.0465846061706543, "logps/chosen": -226.6675262451172, "logps/rejected": -300.82196044921875, "loss": 0.5151, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7071689367294312, "rewards/margins": 0.7603108882904053, "rewards/rejected": -2.467479944229126, "step": 14070 }, { "epoch": 2.425913163335631, "grad_norm": 37.620723724365234, "learning_rate": 2.14946892748866e-08, "logits/chosen": -2.040656805038452, "logits/rejected": -1.9950001239776611, "logps/chosen": -241.02963256835938, "logps/rejected": -301.5330505371094, "loss": 0.5648, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8209514617919922, "rewards/margins": 0.6740165948867798, "rewards/rejected": -2.4949679374694824, "step": 14080 }, { "epoch": 2.4276361130254998, "grad_norm": 43.765499114990234, "learning_rate": 2.1370661303679084e-08, "logits/chosen": -2.074293613433838, "logits/rejected": -2.0272514820098877, "logps/chosen": -216.4875946044922, "logps/rejected": -272.84417724609375, "loss": 0.5578, "rewards/accuracies": 0.71875, "rewards/chosen": -1.590613603591919, "rewards/margins": 0.6262162327766418, "rewards/rejected": -2.216829776763916, "step": 14090 }, { "epoch": 2.4293590627153687, "grad_norm": 25.765214920043945, "learning_rate": 2.1246949375511214e-08, "logits/chosen": -2.1253323554992676, "logits/rejected": -2.080301523208618, "logps/chosen": -219.0335235595703, "logps/rejected": -295.12152099609375, "loss": 0.4907, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5946428775787354, "rewards/margins": 0.8154427409172058, "rewards/rejected": -2.4100852012634277, "step": 14100 }, { "epoch": 2.4310820124052377, "grad_norm": 49.0091552734375, "learning_rate": 2.1123553987631126e-08, "logits/chosen": -2.126990795135498, "logits/rejected": -2.101468801498413, "logps/chosen": -217.9353790283203, "logps/rejected": -278.48260498046875, "loss": 0.5529, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6728681325912476, "rewards/margins": 0.6070038676261902, "rewards/rejected": -2.279871940612793, "step": 14110 }, { "epoch": 2.4328049620951067, "grad_norm": 62.83906555175781, "learning_rate": 2.1000475636014635e-08, "logits/chosen": -2.1041083335876465, "logits/rejected": -2.0625221729278564, "logps/chosen": -219.5806427001953, "logps/rejected": -277.6369934082031, "loss": 0.5599, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6661217212677002, "rewards/margins": 0.5916668176651001, "rewards/rejected": -2.2577884197235107, "step": 14120 }, { "epoch": 2.4345279117849756, "grad_norm": 50.54969787597656, "learning_rate": 2.0877714815363366e-08, "logits/chosen": -2.13179349899292, "logits/rejected": -2.097782611846924, "logps/chosen": -213.8779296875, "logps/rejected": -265.7869567871094, "loss": 0.5524, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5833685398101807, "rewards/margins": 0.5597018003463745, "rewards/rejected": -2.1430704593658447, "step": 14130 }, { "epoch": 2.436250861474845, "grad_norm": 48.95522689819336, "learning_rate": 2.0755272019102542e-08, "logits/chosen": -2.1710026264190674, "logits/rejected": -2.132071018218994, "logps/chosen": -228.41012573242188, "logps/rejected": -292.615966796875, "loss": 0.5522, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7536594867706299, "rewards/margins": 0.6659385561943054, "rewards/rejected": -2.419598340988159, "step": 14140 }, { "epoch": 2.437973811164714, "grad_norm": 54.72209930419922, "learning_rate": 2.063314773937921e-08, "logits/chosen": -2.1553657054901123, "logits/rejected": -2.120232343673706, "logps/chosen": -224.70114135742188, "logps/rejected": -288.44012451171875, "loss": 0.5617, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6969101428985596, "rewards/margins": 0.641015887260437, "rewards/rejected": -2.337926149368286, "step": 14150 }, { "epoch": 2.439696760854583, "grad_norm": 36.77251434326172, "learning_rate": 2.051134246706008e-08, "logits/chosen": -2.1020195484161377, "logits/rejected": -2.0639472007751465, "logps/chosen": -217.10897827148438, "logps/rejected": -275.93084716796875, "loss": 0.5907, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6270267963409424, "rewards/margins": 0.5740610361099243, "rewards/rejected": -2.2010879516601562, "step": 14160 }, { "epoch": 2.441419710544452, "grad_norm": 38.20256423950195, "learning_rate": 2.0389856691729734e-08, "logits/chosen": -2.0549514293670654, "logits/rejected": -2.01374888420105, "logps/chosen": -209.72048950195312, "logps/rejected": -270.4997863769531, "loss": 0.5582, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5325407981872559, "rewards/margins": 0.6379245519638062, "rewards/rejected": -2.1704652309417725, "step": 14170 }, { "epoch": 2.4431426602343214, "grad_norm": 58.34638595581055, "learning_rate": 2.026869090168849e-08, "logits/chosen": -2.117248058319092, "logits/rejected": -2.068472385406494, "logps/chosen": -220.41830444335938, "logps/rejected": -269.4673767089844, "loss": 0.5768, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6232048273086548, "rewards/margins": 0.5637925267219543, "rewards/rejected": -2.186997413635254, "step": 14180 }, { "epoch": 2.4448656099241903, "grad_norm": 46.98957824707031, "learning_rate": 2.0147845583950552e-08, "logits/chosen": -2.1380128860473633, "logits/rejected": -2.101867198944092, "logps/chosen": -221.2222900390625, "logps/rejected": -279.1551208496094, "loss": 0.5456, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6172845363616943, "rewards/margins": 0.6354303359985352, "rewards/rejected": -2.2527148723602295, "step": 14190 }, { "epoch": 2.4465885596140593, "grad_norm": 33.68207550048828, "learning_rate": 2.0027321224242067e-08, "logits/chosen": -2.0417656898498535, "logits/rejected": -2.0019569396972656, "logps/chosen": -204.66732788085938, "logps/rejected": -285.6546936035156, "loss": 0.4812, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.4978091716766357, "rewards/margins": 0.8225471377372742, "rewards/rejected": -2.3203561305999756, "step": 14200 }, { "epoch": 2.4483115093039283, "grad_norm": 56.56424331665039, "learning_rate": 1.9907118306999017e-08, "logits/chosen": -2.112985610961914, "logits/rejected": -2.0740582942962646, "logps/chosen": -222.14852905273438, "logps/rejected": -287.42132568359375, "loss": 0.5402, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6827729940414429, "rewards/margins": 0.6711798906326294, "rewards/rejected": -2.3539528846740723, "step": 14210 }, { "epoch": 2.4500344589937972, "grad_norm": 33.83453369140625, "learning_rate": 1.9787237315365424e-08, "logits/chosen": -2.164368152618408, "logits/rejected": -2.113048791885376, "logps/chosen": -226.94155883789062, "logps/rejected": -297.2632751464844, "loss": 0.5128, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6764634847640991, "rewards/margins": 0.750978410243988, "rewards/rejected": -2.4274418354034424, "step": 14220 }, { "epoch": 2.451757408683666, "grad_norm": 39.73827362060547, "learning_rate": 1.9667678731191373e-08, "logits/chosen": -2.028599262237549, "logits/rejected": -1.974353551864624, "logps/chosen": -228.91439819335938, "logps/rejected": -302.0907897949219, "loss": 0.5124, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7156779766082764, "rewards/margins": 0.795965313911438, "rewards/rejected": -2.511643171310425, "step": 14230 }, { "epoch": 2.4534803583735356, "grad_norm": 48.25334548950195, "learning_rate": 1.9548443035031125e-08, "logits/chosen": -2.028688430786133, "logits/rejected": -1.9942216873168945, "logps/chosen": -236.4214630126953, "logps/rejected": -316.0946960449219, "loss": 0.5161, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8170276880264282, "rewards/margins": 0.8088165521621704, "rewards/rejected": -2.6258442401885986, "step": 14240 }, { "epoch": 2.4552033080634046, "grad_norm": 44.34630584716797, "learning_rate": 1.942953070614094e-08, "logits/chosen": -2.0448529720306396, "logits/rejected": -2.0084269046783447, "logps/chosen": -234.31857299804688, "logps/rejected": -294.1382141113281, "loss": 0.551, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7902448177337646, "rewards/margins": 0.6250909566879272, "rewards/rejected": -2.4153358936309814, "step": 14250 }, { "epoch": 2.4569262577532736, "grad_norm": 63.74422073364258, "learning_rate": 1.93109422224775e-08, "logits/chosen": -2.1369385719299316, "logits/rejected": -2.0834081172943115, "logps/chosen": -230.72824096679688, "logps/rejected": -293.1272888183594, "loss": 0.5428, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7284284830093384, "rewards/margins": 0.7105134725570679, "rewards/rejected": -2.4389424324035645, "step": 14260 }, { "epoch": 2.4586492074431425, "grad_norm": 38.61103057861328, "learning_rate": 1.9192678060695812e-08, "logits/chosen": -2.1000282764434814, "logits/rejected": -2.0557782649993896, "logps/chosen": -233.59414672851562, "logps/rejected": -309.5748291015625, "loss": 0.5062, "rewards/accuracies": 0.75, "rewards/chosen": -1.7813142538070679, "rewards/margins": 0.8087684512138367, "rewards/rejected": -2.590082883834839, "step": 14270 }, { "epoch": 2.460372157133012, "grad_norm": 34.51493835449219, "learning_rate": 1.9074738696147196e-08, "logits/chosen": -2.0385901927948, "logits/rejected": -2.0119054317474365, "logps/chosen": -225.359619140625, "logps/rejected": -284.545654296875, "loss": 0.5792, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7285268306732178, "rewards/margins": 0.5999918580055237, "rewards/rejected": -2.3285186290740967, "step": 14280 }, { "epoch": 2.462095106822881, "grad_norm": 35.47799301147461, "learning_rate": 1.8957124602877618e-08, "logits/chosen": -2.036486864089966, "logits/rejected": -1.9845813512802124, "logps/chosen": -228.3271942138672, "logps/rejected": -290.8971862792969, "loss": 0.5302, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6707541942596436, "rewards/margins": 0.7225546836853027, "rewards/rejected": -2.3933091163635254, "step": 14290 }, { "epoch": 2.46381805651275, "grad_norm": 31.757747650146484, "learning_rate": 1.8839836253625496e-08, "logits/chosen": -2.1560089588165283, "logits/rejected": -2.1177501678466797, "logps/chosen": -210.2001190185547, "logps/rejected": -289.1632385253906, "loss": 0.4882, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5514459609985352, "rewards/margins": 0.7843285202980042, "rewards/rejected": -2.3357746601104736, "step": 14300 }, { "epoch": 2.465541006202619, "grad_norm": 43.582733154296875, "learning_rate": 1.872287411982011e-08, "logits/chosen": -2.0736470222473145, "logits/rejected": -2.032609701156616, "logps/chosen": -228.1890106201172, "logps/rejected": -295.87188720703125, "loss": 0.5463, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7359241247177124, "rewards/margins": 0.7118991613388062, "rewards/rejected": -2.4478230476379395, "step": 14310 }, { "epoch": 2.467263955892488, "grad_norm": 36.0502815246582, "learning_rate": 1.860623867157941e-08, "logits/chosen": -2.0633580684661865, "logits/rejected": -2.0303797721862793, "logps/chosen": -209.2544708251953, "logps/rejected": -286.25189208984375, "loss": 0.4936, "rewards/accuracies": 0.75, "rewards/chosen": -1.5319209098815918, "rewards/margins": 0.784302294254303, "rewards/rejected": -2.31622314453125, "step": 14320 }, { "epoch": 2.468986905582357, "grad_norm": 39.22027587890625, "learning_rate": 1.8489930377708372e-08, "logits/chosen": -2.1764934062957764, "logits/rejected": -2.118807315826416, "logps/chosen": -228.25131225585938, "logps/rejected": -319.8058776855469, "loss": 0.4699, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.723565697669983, "rewards/margins": 0.9397293329238892, "rewards/rejected": -2.663294792175293, "step": 14330 }, { "epoch": 2.470709855272226, "grad_norm": 49.67063903808594, "learning_rate": 1.8373949705696934e-08, "logits/chosen": -2.0703787803649902, "logits/rejected": -2.028900384902954, "logps/chosen": -231.64517211914062, "logps/rejected": -319.1253662109375, "loss": 0.4915, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.758962631225586, "rewards/margins": 0.8747455477714539, "rewards/rejected": -2.6337082386016846, "step": 14340 }, { "epoch": 2.472432804962095, "grad_norm": 36.914669036865234, "learning_rate": 1.8258297121718204e-08, "logits/chosen": -2.1144473552703857, "logits/rejected": -2.0762577056884766, "logps/chosen": -224.0520782470703, "logps/rejected": -297.4252014160156, "loss": 0.5135, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6970233917236328, "rewards/margins": 0.7429527044296265, "rewards/rejected": -2.439976215362549, "step": 14350 }, { "epoch": 2.474155754651964, "grad_norm": 52.473472595214844, "learning_rate": 1.81429730906266e-08, "logits/chosen": -2.0756735801696777, "logits/rejected": -2.0378482341766357, "logps/chosen": -248.0349578857422, "logps/rejected": -300.68670654296875, "loss": 0.6018, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.915371298789978, "rewards/margins": 0.5735548734664917, "rewards/rejected": -2.488926410675049, "step": 14360 }, { "epoch": 2.475878704341833, "grad_norm": 40.73741912841797, "learning_rate": 1.8027978075955953e-08, "logits/chosen": -2.0995495319366455, "logits/rejected": -2.0527689456939697, "logps/chosen": -236.0800018310547, "logps/rejected": -303.43731689453125, "loss": 0.5193, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.79816472530365, "rewards/margins": 0.7360619902610779, "rewards/rejected": -2.534226417541504, "step": 14370 }, { "epoch": 2.4776016540317025, "grad_norm": 36.18812561035156, "learning_rate": 1.7913312539917624e-08, "logits/chosen": -2.168733596801758, "logits/rejected": -2.128675937652588, "logps/chosen": -233.74838256835938, "logps/rejected": -306.06292724609375, "loss": 0.5068, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8042490482330322, "rewards/margins": 0.7510305643081665, "rewards/rejected": -2.555279493331909, "step": 14380 }, { "epoch": 2.4793246037215715, "grad_norm": 52.489402770996094, "learning_rate": 1.7798976943398623e-08, "logits/chosen": -2.078742504119873, "logits/rejected": -2.0347018241882324, "logps/chosen": -230.8499755859375, "logps/rejected": -311.3640441894531, "loss": 0.5033, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7754895687103271, "rewards/margins": 0.8343890309333801, "rewards/rejected": -2.6098785400390625, "step": 14390 }, { "epoch": 2.4810475534114405, "grad_norm": 52.65116882324219, "learning_rate": 1.7684971745959887e-08, "logits/chosen": -2.1193552017211914, "logits/rejected": -2.0765717029571533, "logps/chosen": -232.67263793945312, "logps/rejected": -301.19561767578125, "loss": 0.5404, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7463268041610718, "rewards/margins": 0.7748914957046509, "rewards/rejected": -2.5212185382843018, "step": 14400 }, { "epoch": 2.4810475534114405, "eval_logits/chosen": -2.161259889602661, "eval_logits/rejected": -2.141052007675171, "eval_logps/chosen": -224.158935546875, "eval_logps/rejected": -260.64886474609375, "eval_loss": 0.6405364871025085, "eval_rewards/accuracies": 0.6284851431846619, "eval_rewards/chosen": -1.6514345407485962, "eval_rewards/margins": 0.32755807042121887, "eval_rewards/rejected": -1.9789925813674927, "eval_runtime": 383.2726, "eval_samples_per_second": 11.23, "eval_steps_per_second": 1.404, "step": 14400 }, { "epoch": 2.4827705031013094, "grad_norm": 35.988075256347656, "learning_rate": 1.7571297405834328e-08, "logits/chosen": -2.082426071166992, "logits/rejected": -2.0395476818084717, "logps/chosen": -219.3708953857422, "logps/rejected": -296.60821533203125, "loss": 0.5272, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6755558252334595, "rewards/margins": 0.7709904909133911, "rewards/rejected": -2.4465460777282715, "step": 14410 }, { "epoch": 2.4844934527911784, "grad_norm": 33.371829986572266, "learning_rate": 1.7457954379924967e-08, "logits/chosen": -2.1155333518981934, "logits/rejected": -2.0812180042266846, "logps/chosen": -230.984130859375, "logps/rejected": -296.2799377441406, "loss": 0.571, "rewards/accuracies": 0.71875, "rewards/chosen": -1.782016396522522, "rewards/margins": 0.6671894788742065, "rewards/rejected": -2.4492058753967285, "step": 14420 }, { "epoch": 2.4862164024810474, "grad_norm": 63.244773864746094, "learning_rate": 1.7344943123803126e-08, "logits/chosen": -2.0925819873809814, "logits/rejected": -2.0567777156829834, "logps/chosen": -221.43994140625, "logps/rejected": -288.7841491699219, "loss": 0.5606, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6841884851455688, "rewards/margins": 0.7082483768463135, "rewards/rejected": -2.3924367427825928, "step": 14430 }, { "epoch": 2.4879393521709168, "grad_norm": 36.74225616455078, "learning_rate": 1.7232264091706682e-08, "logits/chosen": -2.0929274559020996, "logits/rejected": -2.0496408939361572, "logps/chosen": -210.4450225830078, "logps/rejected": -288.16119384765625, "loss": 0.4963, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5563026666641235, "rewards/margins": 0.7918773889541626, "rewards/rejected": -2.3481802940368652, "step": 14440 }, { "epoch": 2.4896623018607857, "grad_norm": 35.94378662109375, "learning_rate": 1.7119917736538115e-08, "logits/chosen": -2.0818235874176025, "logits/rejected": -2.0427815914154053, "logps/chosen": -227.7500762939453, "logps/rejected": -294.5572204589844, "loss": 0.5141, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7150242328643799, "rewards/margins": 0.7176655530929565, "rewards/rejected": -2.432689666748047, "step": 14450 }, { "epoch": 2.4913852515506547, "grad_norm": 31.527305603027344, "learning_rate": 1.700790450986276e-08, "logits/chosen": -2.0948452949523926, "logits/rejected": -2.061279058456421, "logps/chosen": -221.8003387451172, "logps/rejected": -284.5101013183594, "loss": 0.5474, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6733448505401611, "rewards/margins": 0.6444646120071411, "rewards/rejected": -2.3178091049194336, "step": 14460 }, { "epoch": 2.4931082012405237, "grad_norm": 39.6318244934082, "learning_rate": 1.6896224861907004e-08, "logits/chosen": -2.1865909099578857, "logits/rejected": -2.1406912803649902, "logps/chosen": -232.9261474609375, "logps/rejected": -295.4991760253906, "loss": 0.5054, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7305749654769897, "rewards/margins": 0.7335001230239868, "rewards/rejected": -2.4640750885009766, "step": 14470 }, { "epoch": 2.4948311509303926, "grad_norm": 31.915298461914062, "learning_rate": 1.6784879241556395e-08, "logits/chosen": -2.1030077934265137, "logits/rejected": -2.082937002182007, "logps/chosen": -222.57131958007812, "logps/rejected": -295.8778991699219, "loss": 0.5156, "rewards/accuracies": 0.75, "rewards/chosen": -1.6824105978012085, "rewards/margins": 0.7339216470718384, "rewards/rejected": -2.416332721710205, "step": 14480 }, { "epoch": 2.496554100620262, "grad_norm": 39.143375396728516, "learning_rate": 1.667386809635387e-08, "logits/chosen": -2.0768282413482666, "logits/rejected": -2.0459749698638916, "logps/chosen": -228.5738983154297, "logps/rejected": -297.850341796875, "loss": 0.5533, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7719007730484009, "rewards/margins": 0.6979681849479675, "rewards/rejected": -2.4698688983917236, "step": 14490 }, { "epoch": 2.498277050310131, "grad_norm": 33.62594985961914, "learning_rate": 1.6563191872498062e-08, "logits/chosen": -2.097094774246216, "logits/rejected": -2.038964033126831, "logps/chosen": -218.2742462158203, "logps/rejected": -294.4154968261719, "loss": 0.4801, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6070648431777954, "rewards/margins": 0.8043497204780579, "rewards/rejected": -2.411414623260498, "step": 14500 }, { "epoch": 2.5, "grad_norm": 41.214996337890625, "learning_rate": 1.6452851014841374e-08, "logits/chosen": -2.1380131244659424, "logits/rejected": -2.1041531562805176, "logps/chosen": -228.376220703125, "logps/rejected": -275.10205078125, "loss": 0.5938, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7192304134368896, "rewards/margins": 0.5255845785140991, "rewards/rejected": -2.2448153495788574, "step": 14510 }, { "epoch": 2.501722949689869, "grad_norm": 46.394371032714844, "learning_rate": 1.634284596688823e-08, "logits/chosen": -2.090603828430176, "logits/rejected": -2.0525267124176025, "logps/chosen": -226.4547119140625, "logps/rejected": -286.3438720703125, "loss": 0.5721, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.696892499923706, "rewards/margins": 0.6256784796714783, "rewards/rejected": -2.322571039199829, "step": 14520 }, { "epoch": 2.503445899379738, "grad_norm": 30.314685821533203, "learning_rate": 1.623317717079328e-08, "logits/chosen": -2.1235809326171875, "logits/rejected": -2.0845789909362793, "logps/chosen": -225.544677734375, "logps/rejected": -293.58978271484375, "loss": 0.5188, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6724278926849365, "rewards/margins": 0.7151906490325928, "rewards/rejected": -2.3876185417175293, "step": 14530 }, { "epoch": 2.505168849069607, "grad_norm": 35.82331848144531, "learning_rate": 1.6123845067359676e-08, "logits/chosen": -2.0943241119384766, "logits/rejected": -2.0486671924591064, "logps/chosen": -215.40835571289062, "logps/rejected": -291.39263916015625, "loss": 0.5088, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6005795001983643, "rewards/margins": 0.7979137301445007, "rewards/rejected": -2.3984932899475098, "step": 14540 }, { "epoch": 2.5068917987594763, "grad_norm": 33.8138427734375, "learning_rate": 1.6014850096037304e-08, "logits/chosen": -2.1238183975219727, "logits/rejected": -2.071692705154419, "logps/chosen": -212.27365112304688, "logps/rejected": -289.61138916015625, "loss": 0.5027, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5402295589447021, "rewards/margins": 0.8287081718444824, "rewards/rejected": -2.3689382076263428, "step": 14550 }, { "epoch": 2.5086147484493453, "grad_norm": 37.8752555847168, "learning_rate": 1.5906192694920883e-08, "logits/chosen": -2.078948736190796, "logits/rejected": -2.0395030975341797, "logps/chosen": -222.35928344726562, "logps/rejected": -295.3717346191406, "loss": 0.5384, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6899096965789795, "rewards/margins": 0.7251261472702026, "rewards/rejected": -2.4150357246398926, "step": 14560 }, { "epoch": 2.5103376981392143, "grad_norm": 32.014007568359375, "learning_rate": 1.5797873300748355e-08, "logits/chosen": -2.035325527191162, "logits/rejected": -2.0111773014068604, "logps/chosen": -216.21414184570312, "logps/rejected": -283.3451232910156, "loss": 0.5642, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6279312372207642, "rewards/margins": 0.6702073812484741, "rewards/rejected": -2.2981388568878174, "step": 14570 }, { "epoch": 2.5120606478290832, "grad_norm": 43.30143737792969, "learning_rate": 1.5689892348899103e-08, "logits/chosen": -2.128683567047119, "logits/rejected": -2.0904922485351562, "logps/chosen": -214.5843963623047, "logps/rejected": -279.8626403808594, "loss": 0.5395, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6395725011825562, "rewards/margins": 0.657935619354248, "rewards/rejected": -2.2975080013275146, "step": 14580 }, { "epoch": 2.5137835975189526, "grad_norm": 37.98158645629883, "learning_rate": 1.5582250273392107e-08, "logits/chosen": -2.0742502212524414, "logits/rejected": -2.0438551902770996, "logps/chosen": -211.3285369873047, "logps/rejected": -276.2047424316406, "loss": 0.5419, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.574963927268982, "rewards/margins": 0.6697554588317871, "rewards/rejected": -2.2447195053100586, "step": 14590 }, { "epoch": 2.5155065472088216, "grad_norm": 52.75736618041992, "learning_rate": 1.547494750688435e-08, "logits/chosen": -2.091930627822876, "logits/rejected": -2.038133144378662, "logps/chosen": -220.406494140625, "logps/rejected": -295.84893798828125, "loss": 0.4755, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6465896368026733, "rewards/margins": 0.8058226704597473, "rewards/rejected": -2.4524121284484863, "step": 14600 }, { "epoch": 2.5172294968986906, "grad_norm": 71.89414978027344, "learning_rate": 1.5367984480668884e-08, "logits/chosen": -2.066631317138672, "logits/rejected": -2.019594669342041, "logps/chosen": -223.1024169921875, "logps/rejected": -288.41741943359375, "loss": 0.5025, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6547962427139282, "rewards/margins": 0.7386652827262878, "rewards/rejected": -2.3934614658355713, "step": 14610 }, { "epoch": 2.5189524465885595, "grad_norm": 40.280296325683594, "learning_rate": 1.526136162467333e-08, "logits/chosen": -2.042661190032959, "logits/rejected": -2.0123565196990967, "logps/chosen": -239.93679809570312, "logps/rejected": -309.1552429199219, "loss": 0.5739, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8766295909881592, "rewards/margins": 0.6878591775894165, "rewards/rejected": -2.564488649368286, "step": 14620 }, { "epoch": 2.5206753962784285, "grad_norm": 38.62200164794922, "learning_rate": 1.5155079367457925e-08, "logits/chosen": -2.0123703479766846, "logits/rejected": -1.9841398000717163, "logps/chosen": -232.4508056640625, "logps/rejected": -293.60650634765625, "loss": 0.5437, "rewards/accuracies": 0.75, "rewards/chosen": -1.777007818222046, "rewards/margins": 0.6375226974487305, "rewards/rejected": -2.4145302772521973, "step": 14630 }, { "epoch": 2.5223983459682975, "grad_norm": 28.682510375976562, "learning_rate": 1.5049138136213968e-08, "logits/chosen": -2.040053606033325, "logits/rejected": -2.0036227703094482, "logps/chosen": -226.60018920898438, "logps/rejected": -306.9270324707031, "loss": 0.5403, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7411445379257202, "rewards/margins": 0.8110846281051636, "rewards/rejected": -2.552229642868042, "step": 14640 }, { "epoch": 2.524121295658167, "grad_norm": 99.67330169677734, "learning_rate": 1.4943538356762065e-08, "logits/chosen": -2.0883889198303223, "logits/rejected": -2.0557003021240234, "logps/chosen": -248.0118408203125, "logps/rejected": -296.2115783691406, "loss": 0.6158, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.9309829473495483, "rewards/margins": 0.5404241681098938, "rewards/rejected": -2.471407413482666, "step": 14650 }, { "epoch": 2.525844245348036, "grad_norm": 43.54148483276367, "learning_rate": 1.4838280453550234e-08, "logits/chosen": -2.0526440143585205, "logits/rejected": -1.996791124343872, "logps/chosen": -229.3895263671875, "logps/rejected": -313.713134765625, "loss": 0.46, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7467634677886963, "rewards/margins": 0.8816523551940918, "rewards/rejected": -2.628415822982788, "step": 14660 }, { "epoch": 2.527567195037905, "grad_norm": 32.807762145996094, "learning_rate": 1.4733364849652518e-08, "logits/chosen": -2.005056619644165, "logits/rejected": -1.9649536609649658, "logps/chosen": -218.852783203125, "logps/rejected": -297.0928039550781, "loss": 0.4897, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6662321090698242, "rewards/margins": 0.7909852266311646, "rewards/rejected": -2.4572174549102783, "step": 14670 }, { "epoch": 2.529290144727774, "grad_norm": 53.55386734008789, "learning_rate": 1.4628791966767095e-08, "logits/chosen": -2.063568353652954, "logits/rejected": -2.027937173843384, "logps/chosen": -231.7452392578125, "logps/rejected": -297.2411804199219, "loss": 0.5546, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.8246181011199951, "rewards/margins": 0.6291089057922363, "rewards/rejected": -2.4537270069122314, "step": 14680 }, { "epoch": 2.531013094417643, "grad_norm": 68.92058563232422, "learning_rate": 1.4524562225214532e-08, "logits/chosen": -2.069715976715088, "logits/rejected": -2.0306973457336426, "logps/chosen": -253.0674285888672, "logps/rejected": -330.63787841796875, "loss": 0.5521, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0037357807159424, "rewards/margins": 0.7751157283782959, "rewards/rejected": -2.7788515090942383, "step": 14690 }, { "epoch": 2.532736044107512, "grad_norm": 36.464759826660156, "learning_rate": 1.4420676043936198e-08, "logits/chosen": -2.1349639892578125, "logits/rejected": -2.0926318168640137, "logps/chosen": -251.04220581054688, "logps/rejected": -341.8655090332031, "loss": 0.5306, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.9554615020751953, "rewards/margins": 0.9042149782180786, "rewards/rejected": -2.8596763610839844, "step": 14700 }, { "epoch": 2.534458993797381, "grad_norm": 45.3137321472168, "learning_rate": 1.4317133840492612e-08, "logits/chosen": -2.088095188140869, "logits/rejected": -2.06190824508667, "logps/chosen": -229.5307159423828, "logps/rejected": -297.6787109375, "loss": 0.5367, "rewards/accuracies": 0.71875, "rewards/chosen": -1.749803900718689, "rewards/margins": 0.6973880529403687, "rewards/rejected": -2.4471919536590576, "step": 14710 }, { "epoch": 2.53618194348725, "grad_norm": 46.88576889038086, "learning_rate": 1.4213936031061691e-08, "logits/chosen": -2.0573620796203613, "logits/rejected": -2.0041470527648926, "logps/chosen": -236.66488647460938, "logps/rejected": -316.1903381347656, "loss": 0.5133, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7825686931610107, "rewards/margins": 0.8490725755691528, "rewards/rejected": -2.631641387939453, "step": 14720 }, { "epoch": 2.537904893177119, "grad_norm": 50.461631774902344, "learning_rate": 1.411108303043701e-08, "logits/chosen": -2.126465082168579, "logits/rejected": -2.077259063720703, "logps/chosen": -227.8856964111328, "logps/rejected": -303.0856018066406, "loss": 0.5361, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.73442804813385, "rewards/margins": 0.7996103167533875, "rewards/rejected": -2.5340380668640137, "step": 14730 }, { "epoch": 2.539627842866988, "grad_norm": 49.08579635620117, "learning_rate": 1.4008575252026334e-08, "logits/chosen": -2.050816059112549, "logits/rejected": -2.024536371231079, "logps/chosen": -242.36447143554688, "logps/rejected": -312.283935546875, "loss": 0.5423, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8535900115966797, "rewards/margins": 0.7273507714271545, "rewards/rejected": -2.5809407234191895, "step": 14740 }, { "epoch": 2.5413507925568575, "grad_norm": 43.996585845947266, "learning_rate": 1.3906413107849757e-08, "logits/chosen": -2.079751491546631, "logits/rejected": -2.0369937419891357, "logps/chosen": -225.7006378173828, "logps/rejected": -294.62030029296875, "loss": 0.5134, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.721297264099121, "rewards/margins": 0.7374666333198547, "rewards/rejected": -2.458763837814331, "step": 14750 }, { "epoch": 2.5430737422467264, "grad_norm": 33.08069610595703, "learning_rate": 1.3804597008538177e-08, "logits/chosen": -2.1115968227386475, "logits/rejected": -2.0671963691711426, "logps/chosen": -226.8126983642578, "logps/rejected": -305.046142578125, "loss": 0.5026, "rewards/accuracies": 0.75, "rewards/chosen": -1.7297484874725342, "rewards/margins": 0.7960547208786011, "rewards/rejected": -2.525803327560425, "step": 14760 }, { "epoch": 2.5447966919365954, "grad_norm": 64.5079116821289, "learning_rate": 1.3703127363331556e-08, "logits/chosen": -2.100311756134033, "logits/rejected": -2.0607922077178955, "logps/chosen": -237.541259765625, "logps/rejected": -309.08929443359375, "loss": 0.5523, "rewards/accuracies": 0.75, "rewards/chosen": -1.808807134628296, "rewards/margins": 0.7249511480331421, "rewards/rejected": -2.5337581634521484, "step": 14770 }, { "epoch": 2.5465196416264644, "grad_norm": 39.559322357177734, "learning_rate": 1.3602004580077375e-08, "logits/chosen": -2.064465045928955, "logits/rejected": -2.0404582023620605, "logps/chosen": -228.71432495117188, "logps/rejected": -295.16485595703125, "loss": 0.5589, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7813472747802734, "rewards/margins": 0.6820805072784424, "rewards/rejected": -2.463427782058716, "step": 14780 }, { "epoch": 2.548242591316334, "grad_norm": 34.208404541015625, "learning_rate": 1.3501229065228892e-08, "logits/chosen": -2.1109213829040527, "logits/rejected": -2.066493034362793, "logps/chosen": -252.40908813476562, "logps/rejected": -317.8335266113281, "loss": 0.5796, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9827115535736084, "rewards/margins": 0.6900705099105835, "rewards/rejected": -2.6727821826934814, "step": 14790 }, { "epoch": 2.5499655410062028, "grad_norm": 47.914310455322266, "learning_rate": 1.3400801223843539e-08, "logits/chosen": -2.076321840286255, "logits/rejected": -2.044887065887451, "logps/chosen": -236.44992065429688, "logps/rejected": -312.14312744140625, "loss": 0.5348, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8138248920440674, "rewards/margins": 0.7549554705619812, "rewards/rejected": -2.5687801837921143, "step": 14800 }, { "epoch": 2.5499655410062028, "eval_logits/chosen": -2.157832384109497, "eval_logits/rejected": -2.1374824047088623, "eval_logps/chosen": -227.1384735107422, "eval_logps/rejected": -263.64813232421875, "eval_loss": 0.6417858004570007, "eval_rewards/accuracies": 0.627555787563324, "eval_rewards/chosen": -1.681229829788208, "eval_rewards/margins": 0.32775557041168213, "eval_rewards/rejected": -2.0089855194091797, "eval_runtime": 383.3252, "eval_samples_per_second": 11.228, "eval_steps_per_second": 1.404, "step": 14800 }, { "epoch": 2.5516884906960717, "grad_norm": 30.952611923217773, "learning_rate": 1.3300721459581355e-08, "logits/chosen": -2.113208293914795, "logits/rejected": -2.0605287551879883, "logps/chosen": -245.7742462158203, "logps/rejected": -312.1253356933594, "loss": 0.5285, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8606784343719482, "rewards/margins": 0.78578120470047, "rewards/rejected": -2.6464600563049316, "step": 14810 }, { "epoch": 2.5534114403859407, "grad_norm": 67.501953125, "learning_rate": 1.3200990174703308e-08, "logits/chosen": -2.2114782333374023, "logits/rejected": -2.159426689147949, "logps/chosen": -232.00704956054688, "logps/rejected": -317.8869934082031, "loss": 0.4664, "rewards/accuracies": 0.75, "rewards/chosen": -1.7742869853973389, "rewards/margins": 0.8930938839912415, "rewards/rejected": -2.6673808097839355, "step": 14820 }, { "epoch": 2.5551343900758097, "grad_norm": 41.97214126586914, "learning_rate": 1.3101607770069667e-08, "logits/chosen": -2.0992329120635986, "logits/rejected": -2.055812358856201, "logps/chosen": -231.4944305419922, "logps/rejected": -303.2246398925781, "loss": 0.5352, "rewards/accuracies": 0.75, "rewards/chosen": -1.7502925395965576, "rewards/margins": 0.7767872214317322, "rewards/rejected": -2.5270798206329346, "step": 14830 }, { "epoch": 2.5568573397656786, "grad_norm": 35.21275329589844, "learning_rate": 1.3002574645138375e-08, "logits/chosen": -2.1365883350372314, "logits/rejected": -2.090890407562256, "logps/chosen": -246.3368682861328, "logps/rejected": -325.1856994628906, "loss": 0.51, "rewards/accuracies": 0.75, "rewards/chosen": -1.894221305847168, "rewards/margins": 0.8003508448600769, "rewards/rejected": -2.6945719718933105, "step": 14840 }, { "epoch": 2.558580289455548, "grad_norm": 32.376625061035156, "learning_rate": 1.2903891197963568e-08, "logits/chosen": -2.076740026473999, "logits/rejected": -2.0330796241760254, "logps/chosen": -245.0225067138672, "logps/rejected": -323.209228515625, "loss": 0.5266, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9002012014389038, "rewards/margins": 0.8007373809814453, "rewards/rejected": -2.7009384632110596, "step": 14850 }, { "epoch": 2.560303239145417, "grad_norm": 40.997806549072266, "learning_rate": 1.2805557825193857e-08, "logits/chosen": -2.054138660430908, "logits/rejected": -2.024360179901123, "logps/chosen": -238.13211059570312, "logps/rejected": -314.7568359375, "loss": 0.5651, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8250688314437866, "rewards/margins": 0.8130731582641602, "rewards/rejected": -2.6381421089172363, "step": 14860 }, { "epoch": 2.562026188835286, "grad_norm": 41.17534255981445, "learning_rate": 1.2707574922070708e-08, "logits/chosen": -2.1252520084381104, "logits/rejected": -2.0850586891174316, "logps/chosen": -237.7712860107422, "logps/rejected": -298.0438232421875, "loss": 0.6044, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7930355072021484, "rewards/margins": 0.6633931398391724, "rewards/rejected": -2.4564290046691895, "step": 14870 }, { "epoch": 2.563749138525155, "grad_norm": 32.69382858276367, "learning_rate": 1.2609942882426938e-08, "logits/chosen": -2.0632481575012207, "logits/rejected": -2.039973020553589, "logps/chosen": -220.024658203125, "logps/rejected": -292.401611328125, "loss": 0.5123, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6605151891708374, "rewards/margins": 0.7392647862434387, "rewards/rejected": -2.399779796600342, "step": 14880 }, { "epoch": 2.5654720882150244, "grad_norm": 50.100215911865234, "learning_rate": 1.2512662098685144e-08, "logits/chosen": -2.028979778289795, "logits/rejected": -1.9955250024795532, "logps/chosen": -230.2629852294922, "logps/rejected": -301.06414794921875, "loss": 0.5213, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7187559604644775, "rewards/margins": 0.7263022661209106, "rewards/rejected": -2.4450583457946777, "step": 14890 }, { "epoch": 2.5671950379048933, "grad_norm": 57.26528549194336, "learning_rate": 1.2415732961856006e-08, "logits/chosen": -2.018132209777832, "logits/rejected": -1.9759390354156494, "logps/chosen": -217.76461791992188, "logps/rejected": -286.56805419921875, "loss": 0.5304, "rewards/accuracies": 0.75, "rewards/chosen": -1.6334377527236938, "rewards/margins": 0.7253769636154175, "rewards/rejected": -2.3588147163391113, "step": 14900 }, { "epoch": 2.5689179875947623, "grad_norm": 34.72880935668945, "learning_rate": 1.2319155861536867e-08, "logits/chosen": -2.086937665939331, "logits/rejected": -2.051175594329834, "logps/chosen": -209.20523071289062, "logps/rejected": -286.2922668457031, "loss": 0.5023, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5768992900848389, "rewards/margins": 0.7545540928840637, "rewards/rejected": -2.331453561782837, "step": 14910 }, { "epoch": 2.5706409372846313, "grad_norm": 30.486162185668945, "learning_rate": 1.222293118591008e-08, "logits/chosen": -2.0713205337524414, "logits/rejected": -2.0423014163970947, "logps/chosen": -224.8202667236328, "logps/rejected": -304.6694641113281, "loss": 0.5278, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7100989818572998, "rewards/margins": 0.7935729026794434, "rewards/rejected": -2.5036721229553223, "step": 14920 }, { "epoch": 2.5723638869745002, "grad_norm": 51.449337005615234, "learning_rate": 1.2127059321741417e-08, "logits/chosen": -2.169572591781616, "logits/rejected": -2.124962568283081, "logps/chosen": -212.64041137695312, "logps/rejected": -297.39984130859375, "loss": 0.479, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5795478820800781, "rewards/margins": 0.8730304837226868, "rewards/rejected": -2.45257830619812, "step": 14930 }, { "epoch": 2.574086836664369, "grad_norm": 55.8746452331543, "learning_rate": 1.203154065437857e-08, "logits/chosen": -2.09590482711792, "logits/rejected": -2.0522971153259277, "logps/chosen": -219.5468292236328, "logps/rejected": -285.1246337890625, "loss": 0.527, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6562507152557373, "rewards/margins": 0.7033320665359497, "rewards/rejected": -2.3595831394195557, "step": 14940 }, { "epoch": 2.575809786354238, "grad_norm": 56.473838806152344, "learning_rate": 1.1936375567749612e-08, "logits/chosen": -2.1660001277923584, "logits/rejected": -2.1302969455718994, "logps/chosen": -234.9818572998047, "logps/rejected": -291.95697021484375, "loss": 0.5774, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7821323871612549, "rewards/margins": 0.6441555619239807, "rewards/rejected": -2.426287889480591, "step": 14950 }, { "epoch": 2.5775327360441076, "grad_norm": 39.60293197631836, "learning_rate": 1.1841564444361496e-08, "logits/chosen": -2.076197624206543, "logits/rejected": -2.035750150680542, "logps/chosen": -225.7178955078125, "logps/rejected": -292.7468566894531, "loss": 0.5525, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6837339401245117, "rewards/margins": 0.704573392868042, "rewards/rejected": -2.388307571411133, "step": 14960 }, { "epoch": 2.5792556857339766, "grad_norm": 44.428306579589844, "learning_rate": 1.1747107665298273e-08, "logits/chosen": -2.1206603050231934, "logits/rejected": -2.073085308074951, "logps/chosen": -215.6715087890625, "logps/rejected": -288.7075500488281, "loss": 0.5348, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.607478380203247, "rewards/margins": 0.7512115240097046, "rewards/rejected": -2.358689785003662, "step": 14970 }, { "epoch": 2.5809786354238455, "grad_norm": 42.954856872558594, "learning_rate": 1.1653005610219913e-08, "logits/chosen": -2.142565965652466, "logits/rejected": -2.084108352661133, "logps/chosen": -220.28713989257812, "logps/rejected": -305.529541015625, "loss": 0.4796, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6533963680267334, "rewards/margins": 0.8964015245437622, "rewards/rejected": -2.549797773361206, "step": 14980 }, { "epoch": 2.582701585113715, "grad_norm": 36.77607345581055, "learning_rate": 1.155925865736055e-08, "logits/chosen": -2.141984224319458, "logits/rejected": -2.114633798599243, "logps/chosen": -212.90451049804688, "logps/rejected": -295.39056396484375, "loss": 0.5129, "rewards/accuracies": 0.75, "rewards/chosen": -1.60614812374115, "rewards/margins": 0.8274170756340027, "rewards/rejected": -2.4335649013519287, "step": 14990 }, { "epoch": 2.584424534803584, "grad_norm": 40.01217269897461, "learning_rate": 1.146586718352699e-08, "logits/chosen": -2.1471335887908936, "logits/rejected": -2.1165921688079834, "logps/chosen": -217.9677734375, "logps/rejected": -282.8397521972656, "loss": 0.5354, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6128219366073608, "rewards/margins": 0.6820386648178101, "rewards/rejected": -2.294860601425171, "step": 15000 }, { "epoch": 2.586147484493453, "grad_norm": 43.856868743896484, "learning_rate": 1.1372831564097286e-08, "logits/chosen": -2.146790027618408, "logits/rejected": -2.109755754470825, "logps/chosen": -217.1037139892578, "logps/rejected": -278.0974426269531, "loss": 0.5797, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6128923892974854, "rewards/margins": 0.6384140253067017, "rewards/rejected": -2.2513060569763184, "step": 15010 }, { "epoch": 2.587870434183322, "grad_norm": 60.50875473022461, "learning_rate": 1.1280152173019075e-08, "logits/chosen": -2.067186117172241, "logits/rejected": -2.046182155609131, "logps/chosen": -217.5106201171875, "logps/rejected": -281.8955993652344, "loss": 0.5814, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.663536787033081, "rewards/margins": 0.6339327096939087, "rewards/rejected": -2.2974696159362793, "step": 15020 }, { "epoch": 2.589593383873191, "grad_norm": 43.150291442871094, "learning_rate": 1.118782938280829e-08, "logits/chosen": -2.0811879634857178, "logits/rejected": -2.0418200492858887, "logps/chosen": -218.2391357421875, "logps/rejected": -283.5480651855469, "loss": 0.5568, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6285641193389893, "rewards/margins": 0.6660295724868774, "rewards/rejected": -2.2945938110351562, "step": 15030 }, { "epoch": 2.59131633356306, "grad_norm": 61.47371292114258, "learning_rate": 1.1095863564547436e-08, "logits/chosen": -2.1220862865448, "logits/rejected": -2.0855085849761963, "logps/chosen": -216.48977661132812, "logps/rejected": -276.1427307128906, "loss": 0.5495, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6123539209365845, "rewards/margins": 0.6356294751167297, "rewards/rejected": -2.247983455657959, "step": 15040 }, { "epoch": 2.5930392832529288, "grad_norm": 32.4649772644043, "learning_rate": 1.1004255087884273e-08, "logits/chosen": -2.1372647285461426, "logits/rejected": -2.0886662006378174, "logps/chosen": -213.27688598632812, "logps/rejected": -276.65118408203125, "loss": 0.5256, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5744832754135132, "rewards/margins": 0.7090197801589966, "rewards/rejected": -2.2835030555725098, "step": 15050 }, { "epoch": 2.594762232942798, "grad_norm": 49.57844161987305, "learning_rate": 1.0913004321030195e-08, "logits/chosen": -2.0831291675567627, "logits/rejected": -2.048346996307373, "logps/chosen": -208.37832641601562, "logps/rejected": -278.1231994628906, "loss": 0.5475, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5380138158798218, "rewards/margins": 0.7039236426353455, "rewards/rejected": -2.2419376373291016, "step": 15060 }, { "epoch": 2.596485182632667, "grad_norm": 40.79827117919922, "learning_rate": 1.0822111630758901e-08, "logits/chosen": -2.1562612056732178, "logits/rejected": -2.1056265830993652, "logps/chosen": -210.0137176513672, "logps/rejected": -265.4194030761719, "loss": 0.5531, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5162451267242432, "rewards/margins": 0.6312421560287476, "rewards/rejected": -2.147487163543701, "step": 15070 }, { "epoch": 2.598208132322536, "grad_norm": 57.13422775268555, "learning_rate": 1.0731577382404744e-08, "logits/chosen": -2.1240782737731934, "logits/rejected": -2.078268527984619, "logps/chosen": -200.83511352539062, "logps/rejected": -282.7082824707031, "loss": 0.4758, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4630084037780762, "rewards/margins": 0.8306447863578796, "rewards/rejected": -2.2936532497406006, "step": 15080 }, { "epoch": 2.599931082012405, "grad_norm": 37.599464416503906, "learning_rate": 1.0641401939861417e-08, "logits/chosen": -2.13403058052063, "logits/rejected": -2.0908923149108887, "logps/chosen": -209.10098266601562, "logps/rejected": -272.2961730957031, "loss": 0.5502, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5389658212661743, "rewards/margins": 0.6597687602043152, "rewards/rejected": -2.1987345218658447, "step": 15090 }, { "epoch": 2.6016540317022745, "grad_norm": 39.21672439575195, "learning_rate": 1.0551585665580465e-08, "logits/chosen": -2.0761544704437256, "logits/rejected": -2.0411019325256348, "logps/chosen": -205.38424682617188, "logps/rejected": -269.87884521484375, "loss": 0.549, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.536199927330017, "rewards/margins": 0.6629042029380798, "rewards/rejected": -2.1991045475006104, "step": 15100 }, { "epoch": 2.6033769813921435, "grad_norm": 38.53199768066406, "learning_rate": 1.0462128920569635e-08, "logits/chosen": -2.109217643737793, "logits/rejected": -2.07594633102417, "logps/chosen": -216.8025360107422, "logps/rejected": -277.08612060546875, "loss": 0.5523, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6238981485366821, "rewards/margins": 0.6392677426338196, "rewards/rejected": -2.2631657123565674, "step": 15110 }, { "epoch": 2.6050999310820124, "grad_norm": 44.340145111083984, "learning_rate": 1.0373032064391729e-08, "logits/chosen": -2.0932981967926025, "logits/rejected": -2.0627665519714355, "logps/chosen": -231.3092498779297, "logps/rejected": -290.2330017089844, "loss": 0.5546, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7282911539077759, "rewards/margins": 0.6158181428909302, "rewards/rejected": -2.344109296798706, "step": 15120 }, { "epoch": 2.6068228807718814, "grad_norm": 41.10717010498047, "learning_rate": 1.0284295455162995e-08, "logits/chosen": -2.077404260635376, "logits/rejected": -2.0305936336517334, "logps/chosen": -207.697265625, "logps/rejected": -279.94598388671875, "loss": 0.5005, "rewards/accuracies": 0.75, "rewards/chosen": -1.5323796272277832, "rewards/margins": 0.7784202694892883, "rewards/rejected": -2.310800075531006, "step": 15130 }, { "epoch": 2.6085458304617504, "grad_norm": 32.70948028564453, "learning_rate": 1.0195919449551637e-08, "logits/chosen": -2.106325149536133, "logits/rejected": -2.063744068145752, "logps/chosen": -221.25039672851562, "logps/rejected": -297.72052001953125, "loss": 0.489, "rewards/accuracies": 0.75, "rewards/chosen": -1.6539459228515625, "rewards/margins": 0.7634718418121338, "rewards/rejected": -2.4174180030822754, "step": 15140 }, { "epoch": 2.6102687801516193, "grad_norm": 43.813720703125, "learning_rate": 1.0107904402776468e-08, "logits/chosen": -2.2083098888397217, "logits/rejected": -2.1639564037323, "logps/chosen": -216.8330841064453, "logps/rejected": -282.2804260253906, "loss": 0.5509, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6166563034057617, "rewards/margins": 0.6878889203071594, "rewards/rejected": -2.3045449256896973, "step": 15150 }, { "epoch": 2.6119917298414888, "grad_norm": 42.778221130371094, "learning_rate": 1.002025066860549e-08, "logits/chosen": -2.06974196434021, "logits/rejected": -2.034329414367676, "logps/chosen": -215.4232940673828, "logps/rejected": -290.89227294921875, "loss": 0.5088, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5965074300765991, "rewards/margins": 0.7750741243362427, "rewards/rejected": -2.371581792831421, "step": 15160 }, { "epoch": 2.6137146795313577, "grad_norm": 59.19876480102539, "learning_rate": 9.932958599354457e-09, "logits/chosen": -2.065034866333008, "logits/rejected": -2.02778959274292, "logps/chosen": -206.2749786376953, "logps/rejected": -277.1195068359375, "loss": 0.5321, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5284637212753296, "rewards/margins": 0.7130771279335022, "rewards/rejected": -2.2415406703948975, "step": 15170 }, { "epoch": 2.6154376292212267, "grad_norm": 34.03282928466797, "learning_rate": 9.846028545885376e-09, "logits/chosen": -2.122490644454956, "logits/rejected": -2.095641613006592, "logps/chosen": -231.06039428710938, "logps/rejected": -298.1627197265625, "loss": 0.5581, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7421585321426392, "rewards/margins": 0.6905821561813354, "rewards/rejected": -2.4327406883239746, "step": 15180 }, { "epoch": 2.6171605789110957, "grad_norm": 34.73793029785156, "learning_rate": 9.75946085760524e-09, "logits/chosen": -2.071645736694336, "logits/rejected": -2.0422959327697754, "logps/chosen": -218.0347442626953, "logps/rejected": -283.3576965332031, "loss": 0.528, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.642773985862732, "rewards/margins": 0.6757218241691589, "rewards/rejected": -2.318495988845825, "step": 15190 }, { "epoch": 2.618883528600965, "grad_norm": 31.4705867767334, "learning_rate": 9.673255882464504e-09, "logits/chosen": -2.120964765548706, "logits/rejected": -2.073119878768921, "logps/chosen": -226.7511444091797, "logps/rejected": -296.59942626953125, "loss": 0.5114, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7025848627090454, "rewards/margins": 0.7379928231239319, "rewards/rejected": -2.440577507019043, "step": 15200 }, { "epoch": 2.618883528600965, "eval_logits/chosen": -2.1731746196746826, "eval_logits/rejected": -2.153785228729248, "eval_logps/chosen": -214.88101196289062, "eval_logps/rejected": -249.07342529296875, "eval_loss": 0.6408128142356873, "eval_rewards/accuracies": 0.6310408711433411, "eval_rewards/chosen": -1.5586552619934082, "eval_rewards/margins": 0.3045828640460968, "eval_rewards/rejected": -1.8632382154464722, "eval_runtime": 382.9379, "eval_samples_per_second": 11.239, "eval_steps_per_second": 1.405, "step": 15200 }, { "epoch": 2.620606478290834, "grad_norm": 51.31040954589844, "learning_rate": 9.587413966955737e-09, "logits/chosen": -2.033310651779175, "logits/rejected": -1.9806716442108154, "logps/chosen": -232.90646362304688, "logps/rejected": -294.5772399902344, "loss": 0.5784, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7613096237182617, "rewards/margins": 0.6706287860870361, "rewards/rejected": -2.431938648223877, "step": 15210 }, { "epoch": 2.622329427980703, "grad_norm": 41.79688262939453, "learning_rate": 9.501935456112254e-09, "logits/chosen": -2.0468971729278564, "logits/rejected": -1.9976829290390015, "logps/chosen": -209.7871856689453, "logps/rejected": -284.422607421875, "loss": 0.476, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5661617517471313, "rewards/margins": 0.7923303842544556, "rewards/rejected": -2.358492136001587, "step": 15220 }, { "epoch": 2.624052377670572, "grad_norm": 40.47127914428711, "learning_rate": 9.416820693506677e-09, "logits/chosen": -2.074531316757202, "logits/rejected": -2.032819986343384, "logps/chosen": -220.6643829345703, "logps/rejected": -293.26055908203125, "loss": 0.5271, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6469504833221436, "rewards/margins": 0.7576435804367065, "rewards/rejected": -2.4045939445495605, "step": 15230 }, { "epoch": 2.625775327360441, "grad_norm": 42.0208740234375, "learning_rate": 9.332070021249595e-09, "logits/chosen": -2.068018913269043, "logits/rejected": -2.01499605178833, "logps/chosen": -222.3048095703125, "logps/rejected": -291.1240234375, "loss": 0.5095, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.622515082359314, "rewards/margins": 0.7594237923622131, "rewards/rejected": -2.3819386959075928, "step": 15240 }, { "epoch": 2.62749827705031, "grad_norm": 45.18162536621094, "learning_rate": 9.247683779988113e-09, "logits/chosen": -2.1060614585876465, "logits/rejected": -2.070847272872925, "logps/chosen": -216.2559356689453, "logps/rejected": -289.8507385253906, "loss": 0.5203, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6200711727142334, "rewards/margins": 0.742321789264679, "rewards/rejected": -2.3623929023742676, "step": 15250 }, { "epoch": 2.6292212267401793, "grad_norm": 42.50625228881836, "learning_rate": 9.163662308904608e-09, "logits/chosen": -2.0640547275543213, "logits/rejected": -2.033785820007324, "logps/chosen": -234.31454467773438, "logps/rejected": -286.94061279296875, "loss": 0.6048, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7885749340057373, "rewards/margins": 0.5866588354110718, "rewards/rejected": -2.3752341270446777, "step": 15260 }, { "epoch": 2.6309441764300483, "grad_norm": 43.62517166137695, "learning_rate": 9.080005945715307e-09, "logits/chosen": -2.1171374320983887, "logits/rejected": -2.050475597381592, "logps/chosen": -238.31082153320312, "logps/rejected": -316.4501037597656, "loss": 0.5307, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.825531244277954, "rewards/margins": 0.8491969108581543, "rewards/rejected": -2.6747279167175293, "step": 15270 }, { "epoch": 2.6326671261199173, "grad_norm": 38.742164611816406, "learning_rate": 8.996715026668867e-09, "logits/chosen": -2.1770644187927246, "logits/rejected": -2.140118360519409, "logps/chosen": -222.07058715820312, "logps/rejected": -297.3765869140625, "loss": 0.4889, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.683081865310669, "rewards/margins": 0.7598232626914978, "rewards/rejected": -2.4429049491882324, "step": 15280 }, { "epoch": 2.6343900758097862, "grad_norm": 29.044748306274414, "learning_rate": 8.913789886545064e-09, "logits/chosen": -2.0895426273345947, "logits/rejected": -2.0382912158966064, "logps/chosen": -234.68301391601562, "logps/rejected": -315.882080078125, "loss": 0.5222, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7909761667251587, "rewards/margins": 0.8641980886459351, "rewards/rejected": -2.6551737785339355, "step": 15290 }, { "epoch": 2.6361130254996556, "grad_norm": 48.1595344543457, "learning_rate": 8.831230858653538e-09, "logits/chosen": -2.0058329105377197, "logits/rejected": -1.9574228525161743, "logps/chosen": -230.89028930664062, "logps/rejected": -308.9717712402344, "loss": 0.5445, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7803550958633423, "rewards/margins": 0.8227502107620239, "rewards/rejected": -2.6031055450439453, "step": 15300 }, { "epoch": 2.6378359751895246, "grad_norm": 37.70459747314453, "learning_rate": 8.749038274832343e-09, "logits/chosen": -2.135277271270752, "logits/rejected": -2.0876753330230713, "logps/chosen": -224.98989868164062, "logps/rejected": -303.2028503417969, "loss": 0.4922, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6934198141098022, "rewards/margins": 0.8129827380180359, "rewards/rejected": -2.5064024925231934, "step": 15310 }, { "epoch": 2.6395589248793936, "grad_norm": 53.39246368408203, "learning_rate": 8.667212465446617e-09, "logits/chosen": -2.063955307006836, "logits/rejected": -2.0377583503723145, "logps/chosen": -230.70285034179688, "logps/rejected": -303.0435485839844, "loss": 0.5394, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7863218784332275, "rewards/margins": 0.7197625637054443, "rewards/rejected": -2.506084680557251, "step": 15320 }, { "epoch": 2.6412818745692626, "grad_norm": 40.4294319152832, "learning_rate": 8.585753759387292e-09, "logits/chosen": -2.0826597213745117, "logits/rejected": -2.036132335662842, "logps/chosen": -235.56198120117188, "logps/rejected": -313.53350830078125, "loss": 0.4952, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7901794910430908, "rewards/margins": 0.8129813075065613, "rewards/rejected": -2.6031606197357178, "step": 15330 }, { "epoch": 2.6430048242591315, "grad_norm": 37.36764144897461, "learning_rate": 8.504662484069824e-09, "logits/chosen": -2.0842349529266357, "logits/rejected": -2.0482351779937744, "logps/chosen": -238.4183349609375, "logps/rejected": -314.86151123046875, "loss": 0.5193, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.83328115940094, "rewards/margins": 0.7736489772796631, "rewards/rejected": -2.6069302558898926, "step": 15340 }, { "epoch": 2.6447277739490005, "grad_norm": 53.557281494140625, "learning_rate": 8.423938965432708e-09, "logits/chosen": -2.002223491668701, "logits/rejected": -1.9665918350219727, "logps/chosen": -234.70602416992188, "logps/rejected": -310.17669677734375, "loss": 0.525, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7966200113296509, "rewards/margins": 0.7793971300125122, "rewards/rejected": -2.576017379760742, "step": 15350 }, { "epoch": 2.64645072363887, "grad_norm": 33.29266357421875, "learning_rate": 8.343583527936382e-09, "logits/chosen": -2.0914101600646973, "logits/rejected": -2.0616629123687744, "logps/chosen": -233.9851837158203, "logps/rejected": -300.9095764160156, "loss": 0.5747, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.8075244426727295, "rewards/margins": 0.6563832759857178, "rewards/rejected": -2.4639077186584473, "step": 15360 }, { "epoch": 2.648173673328739, "grad_norm": 37.487945556640625, "learning_rate": 8.263596494561765e-09, "logits/chosen": -2.104043483734131, "logits/rejected": -2.0622448921203613, "logps/chosen": -238.8284454345703, "logps/rejected": -303.08538818359375, "loss": 0.5598, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8186184167861938, "rewards/margins": 0.7071665525436401, "rewards/rejected": -2.525784969329834, "step": 15370 }, { "epoch": 2.649896623018608, "grad_norm": 37.66122055053711, "learning_rate": 8.183978186809026e-09, "logits/chosen": -2.0884664058685303, "logits/rejected": -2.0510292053222656, "logps/chosen": -233.5215301513672, "logps/rejected": -307.7198181152344, "loss": 0.5061, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7857778072357178, "rewards/margins": 0.778823971748352, "rewards/rejected": -2.5646018981933594, "step": 15380 }, { "epoch": 2.651619572708477, "grad_norm": 36.3622932434082, "learning_rate": 8.104728924696237e-09, "logits/chosen": -2.157322645187378, "logits/rejected": -2.1230530738830566, "logps/chosen": -227.74887084960938, "logps/rejected": -309.10894775390625, "loss": 0.5155, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7101014852523804, "rewards/margins": 0.8325842022895813, "rewards/rejected": -2.5426855087280273, "step": 15390 }, { "epoch": 2.6533425223983462, "grad_norm": 29.958967208862305, "learning_rate": 8.02584902675818e-09, "logits/chosen": -2.1099212169647217, "logits/rejected": -2.0682926177978516, "logps/chosen": -246.22549438476562, "logps/rejected": -295.62115478515625, "loss": 0.6045, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.8742916584014893, "rewards/margins": 0.5758206844329834, "rewards/rejected": -2.4501121044158936, "step": 15400 }, { "epoch": 2.655065472088215, "grad_norm": 34.939613342285156, "learning_rate": 7.947338810045035e-09, "logits/chosen": -2.099294900894165, "logits/rejected": -2.0514397621154785, "logps/chosen": -238.8804931640625, "logps/rejected": -285.91949462890625, "loss": 0.591, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7824329137802124, "rewards/margins": 0.5694050192832947, "rewards/rejected": -2.351837635040283, "step": 15410 }, { "epoch": 2.656788421778084, "grad_norm": 54.7791748046875, "learning_rate": 7.869198590120962e-09, "logits/chosen": -2.0966858863830566, "logits/rejected": -2.054964542388916, "logps/chosen": -214.153076171875, "logps/rejected": -305.690673828125, "loss": 0.4777, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6057186126708984, "rewards/margins": 0.926251232624054, "rewards/rejected": -2.5319697856903076, "step": 15420 }, { "epoch": 2.658511371467953, "grad_norm": 29.094106674194336, "learning_rate": 7.791428681063084e-09, "logits/chosen": -2.1707558631896973, "logits/rejected": -2.1257083415985107, "logps/chosen": -227.0717315673828, "logps/rejected": -300.9056396484375, "loss": 0.4866, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6737381219863892, "rewards/margins": 0.8080810308456421, "rewards/rejected": -2.4818191528320312, "step": 15430 }, { "epoch": 2.660234321157822, "grad_norm": 26.97381019592285, "learning_rate": 7.714029395460054e-09, "logits/chosen": -2.1956639289855957, "logits/rejected": -2.1574459075927734, "logps/chosen": -215.98043823242188, "logps/rejected": -278.1003112792969, "loss": 0.5109, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6043450832366943, "rewards/margins": 0.6456824541091919, "rewards/rejected": -2.2500274181365967, "step": 15440 }, { "epoch": 2.661957270847691, "grad_norm": 41.384124755859375, "learning_rate": 7.637001044410784e-09, "logits/chosen": -1.992272138595581, "logits/rejected": -1.9533510208129883, "logps/chosen": -219.4346923828125, "logps/rejected": -281.28619384765625, "loss": 0.5541, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6634387969970703, "rewards/margins": 0.6239207983016968, "rewards/rejected": -2.2873597145080566, "step": 15450 }, { "epoch": 2.66368022053756, "grad_norm": 44.49235534667969, "learning_rate": 7.560343937523361e-09, "logits/chosen": -2.1659507751464844, "logits/rejected": -2.131117582321167, "logps/chosen": -215.7431182861328, "logps/rejected": -285.8184509277344, "loss": 0.5275, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5413073301315308, "rewards/margins": 0.7475422024726868, "rewards/rejected": -2.288849353790283, "step": 15460 }, { "epoch": 2.6654031702274295, "grad_norm": 48.974029541015625, "learning_rate": 7.484058382913583e-09, "logits/chosen": -2.153371572494507, "logits/rejected": -2.1122617721557617, "logps/chosen": -233.70059204101562, "logps/rejected": -304.9097900390625, "loss": 0.5193, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7240705490112305, "rewards/margins": 0.7797734141349792, "rewards/rejected": -2.5038444995880127, "step": 15470 }, { "epoch": 2.6671261199172984, "grad_norm": 39.822750091552734, "learning_rate": 7.40814468720391e-09, "logits/chosen": -2.125124454498291, "logits/rejected": -2.0769131183624268, "logps/chosen": -212.4561309814453, "logps/rejected": -281.7795715332031, "loss": 0.5407, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5578019618988037, "rewards/margins": 0.7482360601425171, "rewards/rejected": -2.3060381412506104, "step": 15480 }, { "epoch": 2.6688490696071674, "grad_norm": 49.14643859863281, "learning_rate": 7.332603155522066e-09, "logits/chosen": -2.102980136871338, "logits/rejected": -2.0779075622558594, "logps/chosen": -230.55142211914062, "logps/rejected": -287.64202880859375, "loss": 0.5676, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7493207454681396, "rewards/margins": 0.612503170967102, "rewards/rejected": -2.361823558807373, "step": 15490 }, { "epoch": 2.670572019297037, "grad_norm": 58.49702835083008, "learning_rate": 7.257434091500014e-09, "logits/chosen": -2.0715060234069824, "logits/rejected": -2.048126697540283, "logps/chosen": -241.28201293945312, "logps/rejected": -292.69305419921875, "loss": 0.6058, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.863378882408142, "rewards/margins": 0.5241729021072388, "rewards/rejected": -2.387551784515381, "step": 15500 }, { "epoch": 2.6722949689869058, "grad_norm": 42.66554260253906, "learning_rate": 7.182637797272506e-09, "logits/chosen": -2.052743434906006, "logits/rejected": -2.0031511783599854, "logps/chosen": -219.3328857421875, "logps/rejected": -296.0882568359375, "loss": 0.5025, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6018941402435303, "rewards/margins": 0.827610969543457, "rewards/rejected": -2.4295051097869873, "step": 15510 }, { "epoch": 2.6740179186767747, "grad_norm": 40.335811614990234, "learning_rate": 7.108214573476035e-09, "logits/chosen": -2.0336544513702393, "logits/rejected": -2.001194715499878, "logps/chosen": -218.55990600585938, "logps/rejected": -279.3645935058594, "loss": 0.5463, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6355009078979492, "rewards/margins": 0.6691679358482361, "rewards/rejected": -2.30466890335083, "step": 15520 }, { "epoch": 2.6757408683666437, "grad_norm": 37.94197463989258, "learning_rate": 7.0341647192475704e-09, "logits/chosen": -2.0494165420532227, "logits/rejected": -2.0060877799987793, "logps/chosen": -205.8288116455078, "logps/rejected": -279.2801513671875, "loss": 0.5031, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5091311931610107, "rewards/margins": 0.7558835744857788, "rewards/rejected": -2.265014886856079, "step": 15530 }, { "epoch": 2.6774638180565127, "grad_norm": 39.58177947998047, "learning_rate": 6.960488532223374e-09, "logits/chosen": -2.0769643783569336, "logits/rejected": -2.033782482147217, "logps/chosen": -227.2656707763672, "logps/rejected": -295.30426025390625, "loss": 0.5561, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.739140510559082, "rewards/margins": 0.7042891383171082, "rewards/rejected": -2.443429708480835, "step": 15540 }, { "epoch": 2.6791867677463816, "grad_norm": 55.12656021118164, "learning_rate": 6.887186308537763e-09, "logits/chosen": -2.1615262031555176, "logits/rejected": -2.121224880218506, "logps/chosen": -227.6425018310547, "logps/rejected": -294.74786376953125, "loss": 0.5313, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7184242010116577, "rewards/margins": 0.7132667899131775, "rewards/rejected": -2.4316909313201904, "step": 15550 }, { "epoch": 2.6809097174362506, "grad_norm": 46.9069938659668, "learning_rate": 6.814258342821932e-09, "logits/chosen": -2.100739002227783, "logits/rejected": -2.078354597091675, "logps/chosen": -223.0258026123047, "logps/rejected": -281.591064453125, "loss": 0.5781, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7119252681732178, "rewards/margins": 0.5784019231796265, "rewards/rejected": -2.2903270721435547, "step": 15560 }, { "epoch": 2.68263266712612, "grad_norm": 37.50832748413086, "learning_rate": 6.741704928202807e-09, "logits/chosen": -2.142457962036133, "logits/rejected": -2.100450038909912, "logps/chosen": -228.854248046875, "logps/rejected": -301.9892883300781, "loss": 0.5141, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.751019835472107, "rewards/margins": 0.7723337411880493, "rewards/rejected": -2.5233535766601562, "step": 15570 }, { "epoch": 2.684355616815989, "grad_norm": 57.68934631347656, "learning_rate": 6.669526356301869e-09, "logits/chosen": -2.1601061820983887, "logits/rejected": -2.1275477409362793, "logps/chosen": -223.7733917236328, "logps/rejected": -289.5685729980469, "loss": 0.5404, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6700990200042725, "rewards/margins": 0.6639571785926819, "rewards/rejected": -2.3340563774108887, "step": 15580 }, { "epoch": 2.686078566505858, "grad_norm": 45.8087272644043, "learning_rate": 6.597722917233894e-09, "logits/chosen": -2.0941264629364014, "logits/rejected": -2.056966781616211, "logps/chosen": -208.35531616210938, "logps/rejected": -270.4266662597656, "loss": 0.5518, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5251692533493042, "rewards/margins": 0.6424340605735779, "rewards/rejected": -2.1676034927368164, "step": 15590 }, { "epoch": 2.687801516195727, "grad_norm": 41.391231536865234, "learning_rate": 6.526294899605878e-09, "logits/chosen": -2.1049602031707764, "logits/rejected": -2.0679843425750732, "logps/chosen": -226.60043334960938, "logps/rejected": -299.9786682128906, "loss": 0.5356, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7385717630386353, "rewards/margins": 0.7576156854629517, "rewards/rejected": -2.496187448501587, "step": 15600 }, { "epoch": 2.687801516195727, "eval_logits/chosen": -2.174306631088257, "eval_logits/rejected": -2.1549601554870605, "eval_logps/chosen": -213.9473419189453, "eval_logps/rejected": -248.0917510986328, "eval_loss": 0.6404752731323242, "eval_rewards/accuracies": 0.6266263723373413, "eval_rewards/chosen": -1.549318552017212, "eval_rewards/margins": 0.3041025400161743, "eval_rewards/rejected": -1.8534212112426758, "eval_runtime": 383.0657, "eval_samples_per_second": 11.236, "eval_steps_per_second": 1.404, "step": 15600 }, { "epoch": 2.6895244658855963, "grad_norm": 41.80103302001953, "learning_rate": 6.455242590515842e-09, "logits/chosen": -2.163078784942627, "logits/rejected": -2.1271393299102783, "logps/chosen": -224.98046875, "logps/rejected": -290.39678955078125, "loss": 0.547, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.679848074913025, "rewards/margins": 0.6862354278564453, "rewards/rejected": -2.3660836219787598, "step": 15610 }, { "epoch": 2.6912474155754653, "grad_norm": 39.715118408203125, "learning_rate": 6.384566275551717e-09, "logits/chosen": -2.0747292041778564, "logits/rejected": -2.0420749187469482, "logps/chosen": -199.78384399414062, "logps/rejected": -281.0851745605469, "loss": 0.4806, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.448569893836975, "rewards/margins": 0.8093862533569336, "rewards/rejected": -2.257956027984619, "step": 15620 }, { "epoch": 2.6929703652653343, "grad_norm": 37.04435348510742, "learning_rate": 6.314266238790089e-09, "logits/chosen": -2.0782577991485596, "logits/rejected": -2.0221097469329834, "logps/chosen": -231.4824676513672, "logps/rejected": -305.20196533203125, "loss": 0.5169, "rewards/accuracies": 0.75, "rewards/chosen": -1.756324052810669, "rewards/margins": 0.8029062151908875, "rewards/rejected": -2.559230089187622, "step": 15630 }, { "epoch": 2.6946933149552033, "grad_norm": 32.966758728027344, "learning_rate": 6.244342762795207e-09, "logits/chosen": -2.085794687271118, "logits/rejected": -2.042360544204712, "logps/chosen": -221.9350128173828, "logps/rejected": -310.47705078125, "loss": 0.4609, "rewards/accuracies": 0.75, "rewards/chosen": -1.68125319480896, "rewards/margins": 0.8988713026046753, "rewards/rejected": -2.5801243782043457, "step": 15640 }, { "epoch": 2.6964162646450722, "grad_norm": 55.33198928833008, "learning_rate": 6.1747961286177205e-09, "logits/chosen": -2.070162296295166, "logits/rejected": -2.031337022781372, "logps/chosen": -219.86752319335938, "logps/rejected": -287.378662109375, "loss": 0.5562, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6482536792755127, "rewards/margins": 0.6931672096252441, "rewards/rejected": -2.341421127319336, "step": 15650 }, { "epoch": 2.698139214334941, "grad_norm": 30.554706573486328, "learning_rate": 6.105626615793602e-09, "logits/chosen": -2.145211696624756, "logits/rejected": -2.114421844482422, "logps/chosen": -226.2273406982422, "logps/rejected": -297.3497314453125, "loss": 0.5082, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.723120093345642, "rewards/margins": 0.7179322242736816, "rewards/rejected": -2.4410524368286133, "step": 15660 }, { "epoch": 2.6998621640248106, "grad_norm": 28.581279754638672, "learning_rate": 6.036834502343058e-09, "logits/chosen": -2.0493581295013428, "logits/rejected": -1.996788740158081, "logps/chosen": -217.5170440673828, "logps/rejected": -293.3517761230469, "loss": 0.476, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.623051643371582, "rewards/margins": 0.8018845319747925, "rewards/rejected": -2.424935817718506, "step": 15670 }, { "epoch": 2.7015851137146796, "grad_norm": 31.51927947998047, "learning_rate": 5.968420064769342e-09, "logits/chosen": -2.0571978092193604, "logits/rejected": -2.023253917694092, "logps/chosen": -238.87527465820312, "logps/rejected": -311.1076965332031, "loss": 0.5185, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8466196060180664, "rewards/margins": 0.7387115359306335, "rewards/rejected": -2.5853309631347656, "step": 15680 }, { "epoch": 2.7033080634045485, "grad_norm": 27.066898345947266, "learning_rate": 5.9003835780576774e-09, "logits/chosen": -2.0971992015838623, "logits/rejected": -2.064260482788086, "logps/chosen": -218.11105346679688, "logps/rejected": -283.69268798828125, "loss": 0.538, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6466944217681885, "rewards/margins": 0.6604923605918884, "rewards/rejected": -2.3071868419647217, "step": 15690 }, { "epoch": 2.7050310130944175, "grad_norm": 34.388179779052734, "learning_rate": 5.832725315674147e-09, "logits/chosen": -2.102985382080078, "logits/rejected": -2.0637826919555664, "logps/chosen": -230.23574829101562, "logps/rejected": -303.936767578125, "loss": 0.5343, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7688966989517212, "rewards/margins": 0.7512655854225159, "rewards/rejected": -2.520162582397461, "step": 15700 }, { "epoch": 2.706753962784287, "grad_norm": 39.30438995361328, "learning_rate": 5.76544554956463e-09, "logits/chosen": -2.105860710144043, "logits/rejected": -2.063655138015747, "logps/chosen": -236.8208770751953, "logps/rejected": -317.7994079589844, "loss": 0.5073, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7796757221221924, "rewards/margins": 0.8442095518112183, "rewards/rejected": -2.6238853931427, "step": 15710 }, { "epoch": 2.708476912474156, "grad_norm": 58.20681381225586, "learning_rate": 5.698544550153661e-09, "logits/chosen": -2.0977749824523926, "logits/rejected": -2.0756781101226807, "logps/chosen": -229.4589080810547, "logps/rejected": -288.4719543457031, "loss": 0.5559, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.753771424293518, "rewards/margins": 0.6107560396194458, "rewards/rejected": -2.364527463912964, "step": 15720 }, { "epoch": 2.710199862164025, "grad_norm": 40.557525634765625, "learning_rate": 5.632022586343333e-09, "logits/chosen": -2.1652374267578125, "logits/rejected": -2.1265182495117188, "logps/chosen": -218.47988891601562, "logps/rejected": -295.4986877441406, "loss": 0.5085, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6443450450897217, "rewards/margins": 0.7873646020889282, "rewards/rejected": -2.4317097663879395, "step": 15730 }, { "epoch": 2.711922811853894, "grad_norm": 36.61780548095703, "learning_rate": 5.565879925512252e-09, "logits/chosen": -2.1096248626708984, "logits/rejected": -2.0603795051574707, "logps/chosen": -225.2184600830078, "logps/rejected": -292.0648498535156, "loss": 0.5589, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7009508609771729, "rewards/margins": 0.7264993190765381, "rewards/rejected": -2.427450180053711, "step": 15740 }, { "epoch": 2.713645761543763, "grad_norm": 39.6802864074707, "learning_rate": 5.50011683351449e-09, "logits/chosen": -2.1140971183776855, "logits/rejected": -2.071514368057251, "logps/chosen": -237.9486846923828, "logps/rejected": -314.0099792480469, "loss": 0.5053, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8496208190917969, "rewards/margins": 0.7466347217559814, "rewards/rejected": -2.5962555408477783, "step": 15750 }, { "epoch": 2.7153687112336318, "grad_norm": 38.177574157714844, "learning_rate": 5.434733574678418e-09, "logits/chosen": -2.029038429260254, "logits/rejected": -1.992246389389038, "logps/chosen": -224.46682739257812, "logps/rejected": -286.83203125, "loss": 0.5738, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7270536422729492, "rewards/margins": 0.6463114619255066, "rewards/rejected": -2.3733649253845215, "step": 15760 }, { "epoch": 2.717091660923501, "grad_norm": 29.399215698242188, "learning_rate": 5.369730411805762e-09, "logits/chosen": -2.0837364196777344, "logits/rejected": -2.045706272125244, "logps/chosen": -214.2775115966797, "logps/rejected": -298.97247314453125, "loss": 0.4496, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6066551208496094, "rewards/margins": 0.8585401773452759, "rewards/rejected": -2.4651951789855957, "step": 15770 }, { "epoch": 2.71881461061337, "grad_norm": 40.45827865600586, "learning_rate": 5.3051076061704445e-09, "logits/chosen": -2.1905288696289062, "logits/rejected": -2.159872531890869, "logps/chosen": -237.99563598632812, "logps/rejected": -286.5212707519531, "loss": 0.6026, "rewards/accuracies": 0.6875, "rewards/chosen": -1.83493971824646, "rewards/margins": 0.5101853609085083, "rewards/rejected": -2.3451249599456787, "step": 15780 }, { "epoch": 2.720537560303239, "grad_norm": 37.64627456665039, "learning_rate": 5.240865417517604e-09, "logits/chosen": -2.0218024253845215, "logits/rejected": -1.9925413131713867, "logps/chosen": -232.785888671875, "logps/rejected": -302.86700439453125, "loss": 0.5064, "rewards/accuracies": 0.75, "rewards/chosen": -1.7826083898544312, "rewards/margins": 0.7289341688156128, "rewards/rejected": -2.511542797088623, "step": 15790 }, { "epoch": 2.722260509993108, "grad_norm": 26.595001220703125, "learning_rate": 5.177004104062521e-09, "logits/chosen": -2.160280704498291, "logits/rejected": -2.1041154861450195, "logps/chosen": -219.24887084960938, "logps/rejected": -291.0208435058594, "loss": 0.482, "rewards/accuracies": 0.75, "rewards/chosen": -1.5983530282974243, "rewards/margins": 0.8090564608573914, "rewards/rejected": -2.407409429550171, "step": 15800 }, { "epoch": 2.7239834596829775, "grad_norm": 57.61701202392578, "learning_rate": 5.113523922489571e-09, "logits/chosen": -2.13946270942688, "logits/rejected": -2.113898992538452, "logps/chosen": -230.5516357421875, "logps/rejected": -298.66229248046875, "loss": 0.5488, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.816119909286499, "rewards/margins": 0.6525536775588989, "rewards/rejected": -2.4686739444732666, "step": 15810 }, { "epoch": 2.7257064093728465, "grad_norm": 36.64439010620117, "learning_rate": 5.0504251279512415e-09, "logits/chosen": -2.0380873680114746, "logits/rejected": -1.9975385665893555, "logps/chosen": -224.701904296875, "logps/rejected": -303.8479919433594, "loss": 0.5166, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7237030267715454, "rewards/margins": 0.8035634756088257, "rewards/rejected": -2.527266025543213, "step": 15820 }, { "epoch": 2.7274293590627154, "grad_norm": 51.537010192871094, "learning_rate": 4.987707974067046e-09, "logits/chosen": -2.1332333087921143, "logits/rejected": -2.1054179668426514, "logps/chosen": -218.86459350585938, "logps/rejected": -286.1872863769531, "loss": 0.5528, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.67132568359375, "rewards/margins": 0.6859435439109802, "rewards/rejected": -2.357269048690796, "step": 15830 }, { "epoch": 2.7291523087525844, "grad_norm": 44.762908935546875, "learning_rate": 4.9253727129224934e-09, "logits/chosen": -2.1452794075012207, "logits/rejected": -2.1109366416931152, "logps/chosen": -240.15103149414062, "logps/rejected": -313.2534484863281, "loss": 0.5432, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8571033477783203, "rewards/margins": 0.7278814315795898, "rewards/rejected": -2.58498477935791, "step": 15840 }, { "epoch": 2.7308752584424534, "grad_norm": 67.00555419921875, "learning_rate": 4.863419595068197e-09, "logits/chosen": -2.116338014602661, "logits/rejected": -2.079974889755249, "logps/chosen": -224.298583984375, "logps/rejected": -290.0685119628906, "loss": 0.5594, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6982589960098267, "rewards/margins": 0.6676750183105469, "rewards/rejected": -2.365933895111084, "step": 15850 }, { "epoch": 2.7325982081323223, "grad_norm": 59.24932861328125, "learning_rate": 4.801848869518721e-09, "logits/chosen": -2.1024158000946045, "logits/rejected": -2.0658373832702637, "logps/chosen": -227.84585571289062, "logps/rejected": -278.639892578125, "loss": 0.5878, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.735727071762085, "rewards/margins": 0.5497328042984009, "rewards/rejected": -2.2854597568511963, "step": 15860 }, { "epoch": 2.7343211578221913, "grad_norm": 41.53136444091797, "learning_rate": 4.740660783751638e-09, "logits/chosen": -2.085195779800415, "logits/rejected": -2.038940191268921, "logps/chosen": -231.2070770263672, "logps/rejected": -308.63604736328125, "loss": 0.5134, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7543834447860718, "rewards/margins": 0.8147646188735962, "rewards/rejected": -2.569148302078247, "step": 15870 }, { "epoch": 2.7360441075120607, "grad_norm": 39.869197845458984, "learning_rate": 4.679855583706571e-09, "logits/chosen": -2.0924925804138184, "logits/rejected": -2.057692050933838, "logps/chosen": -220.9092254638672, "logps/rejected": -302.0570068359375, "loss": 0.4935, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6684633493423462, "rewards/margins": 0.8464488983154297, "rewards/rejected": -2.5149121284484863, "step": 15880 }, { "epoch": 2.7377670572019297, "grad_norm": 39.32370376586914, "learning_rate": 4.619433513784166e-09, "logits/chosen": -2.1212639808654785, "logits/rejected": -2.0752553939819336, "logps/chosen": -225.4252166748047, "logps/rejected": -288.95513916015625, "loss": 0.5521, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7253128290176392, "rewards/margins": 0.6845265626907349, "rewards/rejected": -2.409839153289795, "step": 15890 }, { "epoch": 2.7394900068917987, "grad_norm": 52.2964973449707, "learning_rate": 4.559394816845075e-09, "logits/chosen": -2.1233177185058594, "logits/rejected": -2.0619730949401855, "logps/chosen": -233.4136505126953, "logps/rejected": -299.57403564453125, "loss": 0.5222, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7194812297821045, "rewards/margins": 0.7652706503868103, "rewards/rejected": -2.4847517013549805, "step": 15900 }, { "epoch": 2.741212956581668, "grad_norm": 44.44611740112305, "learning_rate": 4.499739734209074e-09, "logits/chosen": -2.05899977684021, "logits/rejected": -2.015986204147339, "logps/chosen": -208.9771728515625, "logps/rejected": -275.2792053222656, "loss": 0.5347, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5446122884750366, "rewards/margins": 0.7179552316665649, "rewards/rejected": -2.2625672817230225, "step": 15910 }, { "epoch": 2.742935906271537, "grad_norm": 39.67881393432617, "learning_rate": 4.440468505653982e-09, "logits/chosen": -2.0713725090026855, "logits/rejected": -2.0402581691741943, "logps/chosen": -231.6560516357422, "logps/rejected": -300.25970458984375, "loss": 0.5462, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.78274667263031, "rewards/margins": 0.6845306158065796, "rewards/rejected": -2.4672775268554688, "step": 15920 }, { "epoch": 2.744658855961406, "grad_norm": 35.616493225097656, "learning_rate": 4.381581369414822e-09, "logits/chosen": -2.025343418121338, "logits/rejected": -1.9802414178848267, "logps/chosen": -212.61770629882812, "logps/rejected": -276.72784423828125, "loss": 0.51, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5602275133132935, "rewards/margins": 0.7136334180831909, "rewards/rejected": -2.2738606929779053, "step": 15930 }, { "epoch": 2.746381805651275, "grad_norm": 29.387271881103516, "learning_rate": 4.323078562182702e-09, "logits/chosen": -2.0832173824310303, "logits/rejected": -2.031151533126831, "logps/chosen": -224.5006561279297, "logps/rejected": -316.13031005859375, "loss": 0.4689, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6402543783187866, "rewards/margins": 0.9359604120254517, "rewards/rejected": -2.5762152671813965, "step": 15940 }, { "epoch": 2.748104755341144, "grad_norm": 55.440773010253906, "learning_rate": 4.2649603191040715e-09, "logits/chosen": -2.155172824859619, "logits/rejected": -2.1123874187469482, "logps/chosen": -219.11181640625, "logps/rejected": -279.84783935546875, "loss": 0.5402, "rewards/accuracies": 0.75, "rewards/chosen": -1.5970779657363892, "rewards/margins": 0.68474942445755, "rewards/rejected": -2.281827449798584, "step": 15950 }, { "epoch": 2.749827705031013, "grad_norm": 27.0786075592041, "learning_rate": 4.207226873779557e-09, "logits/chosen": -2.1281867027282715, "logits/rejected": -2.085352659225464, "logps/chosen": -228.1505126953125, "logps/rejected": -296.83099365234375, "loss": 0.5425, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7110164165496826, "rewards/margins": 0.7366132736206055, "rewards/rejected": -2.447629451751709, "step": 15960 }, { "epoch": 2.751550654720882, "grad_norm": 47.35169982910156, "learning_rate": 4.149878458263179e-09, "logits/chosen": -2.1108779907226562, "logits/rejected": -2.073906421661377, "logps/chosen": -223.5197296142578, "logps/rejected": -297.11297607421875, "loss": 0.5184, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6669301986694336, "rewards/margins": 0.7547533512115479, "rewards/rejected": -2.4216837882995605, "step": 15970 }, { "epoch": 2.7532736044107513, "grad_norm": 55.57887268066406, "learning_rate": 4.092915303061372e-09, "logits/chosen": -2.08697772026062, "logits/rejected": -2.0546367168426514, "logps/chosen": -229.6186981201172, "logps/rejected": -289.40289306640625, "loss": 0.5517, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7267612218856812, "rewards/margins": 0.6117097735404968, "rewards/rejected": -2.3384711742401123, "step": 15980 }, { "epoch": 2.7549965541006203, "grad_norm": 42.30339431762695, "learning_rate": 4.0363376371320366e-09, "logits/chosen": -2.1910502910614014, "logits/rejected": -2.1740078926086426, "logps/chosen": -221.6997528076172, "logps/rejected": -283.23016357421875, "loss": 0.5523, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6591275930404663, "rewards/margins": 0.6236189603805542, "rewards/rejected": -2.2827465534210205, "step": 15990 }, { "epoch": 2.7567195037904892, "grad_norm": 33.60514450073242, "learning_rate": 3.98014568788364e-09, "logits/chosen": -2.065885543823242, "logits/rejected": -2.0202903747558594, "logps/chosen": -219.5049285888672, "logps/rejected": -292.70697021484375, "loss": 0.4885, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6498653888702393, "rewards/margins": 0.7682273983955383, "rewards/rejected": -2.4180924892425537, "step": 16000 }, { "epoch": 2.7567195037904892, "eval_logits/chosen": -2.170712947845459, "eval_logits/rejected": -2.1511991024017334, "eval_logps/chosen": -217.2328338623047, "eval_logps/rejected": -251.9056396484375, "eval_loss": 0.6406311392784119, "eval_rewards/accuracies": 0.6268587112426758, "eval_rewards/chosen": -1.5821737051010132, "eval_rewards/margins": 0.3093867003917694, "eval_rewards/rejected": -1.891560435295105, "eval_runtime": 383.3325, "eval_samples_per_second": 11.228, "eval_steps_per_second": 1.403, "step": 16000 }, { "epoch": 2.758442453480358, "grad_norm": 60.14817428588867, "learning_rate": 3.924339681174293e-09, "logits/chosen": -2.1418251991271973, "logits/rejected": -2.115861177444458, "logps/chosen": -228.0880584716797, "logps/rejected": -287.68463134765625, "loss": 0.5822, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.720970869064331, "rewards/margins": 0.6308675408363342, "rewards/rejected": -2.3518383502960205, "step": 16010 }, { "epoch": 2.7601654031702276, "grad_norm": 23.835336685180664, "learning_rate": 3.868919841310858e-09, "logits/chosen": -2.176820993423462, "logits/rejected": -2.137303590774536, "logps/chosen": -223.884521484375, "logps/rejected": -293.23089599609375, "loss": 0.5382, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6952816247940063, "rewards/margins": 0.7307706475257874, "rewards/rejected": -2.4260523319244385, "step": 16020 }, { "epoch": 2.7618883528600966, "grad_norm": 41.4140625, "learning_rate": 3.81388639104806e-09, "logits/chosen": -2.159712314605713, "logits/rejected": -2.121702194213867, "logps/chosen": -223.4437713623047, "logps/rejected": -291.7823181152344, "loss": 0.5451, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6980524063110352, "rewards/margins": 0.7046465873718262, "rewards/rejected": -2.4026989936828613, "step": 16030 }, { "epoch": 2.7636113025499656, "grad_norm": 48.69667434692383, "learning_rate": 3.759239551587512e-09, "logits/chosen": -2.1217103004455566, "logits/rejected": -2.083436965942383, "logps/chosen": -231.5419464111328, "logps/rejected": -299.17242431640625, "loss": 0.546, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7604268789291382, "rewards/margins": 0.7160990834236145, "rewards/rejected": -2.4765260219573975, "step": 16040 }, { "epoch": 2.7653342522398345, "grad_norm": 35.25212860107422, "learning_rate": 3.7049795425769027e-09, "logits/chosen": -2.1005589962005615, "logits/rejected": -2.057460308074951, "logps/chosen": -217.8822021484375, "logps/rejected": -303.2470397949219, "loss": 0.4636, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.626956582069397, "rewards/margins": 0.8781922459602356, "rewards/rejected": -2.5051486492156982, "step": 16050 }, { "epoch": 2.7670572019297035, "grad_norm": 38.03778076171875, "learning_rate": 3.6511065821091314e-09, "logits/chosen": -2.1131396293640137, "logits/rejected": -2.079918384552002, "logps/chosen": -217.0569305419922, "logps/rejected": -286.45526123046875, "loss": 0.5129, "rewards/accuracies": 0.75, "rewards/chosen": -1.6031198501586914, "rewards/margins": 0.7212660908699036, "rewards/rejected": -2.3243861198425293, "step": 16060 }, { "epoch": 2.7687801516195725, "grad_norm": 36.92191696166992, "learning_rate": 3.597620886721342e-09, "logits/chosen": -2.0656356811523438, "logits/rejected": -2.0290164947509766, "logps/chosen": -225.89096069335938, "logps/rejected": -298.1449279785156, "loss": 0.487, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6773040294647217, "rewards/margins": 0.7722429633140564, "rewards/rejected": -2.449547290802002, "step": 16070 }, { "epoch": 2.770503101309442, "grad_norm": 32.893951416015625, "learning_rate": 3.5445226713941457e-09, "logits/chosen": -2.1103484630584717, "logits/rejected": -2.0507454872131348, "logps/chosen": -230.32687377929688, "logps/rejected": -302.87579345703125, "loss": 0.5217, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7214205265045166, "rewards/margins": 0.7978491187095642, "rewards/rejected": -2.5192697048187256, "step": 16080 }, { "epoch": 2.772226050999311, "grad_norm": 48.20229721069336, "learning_rate": 3.491812149550688e-09, "logits/chosen": -2.106853485107422, "logits/rejected": -2.065788745880127, "logps/chosen": -221.03012084960938, "logps/rejected": -291.2431335449219, "loss": 0.5219, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6745634078979492, "rewards/margins": 0.7341934442520142, "rewards/rejected": -2.408756971359253, "step": 16090 }, { "epoch": 2.77394900068918, "grad_norm": 40.7789192199707, "learning_rate": 3.4394895330558284e-09, "logits/chosen": -2.1259613037109375, "logits/rejected": -2.0911924839019775, "logps/chosen": -222.92385864257812, "logps/rejected": -309.33013916015625, "loss": 0.4922, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6919066905975342, "rewards/margins": 0.8644245862960815, "rewards/rejected": -2.5563313961029053, "step": 16100 }, { "epoch": 2.775671950379049, "grad_norm": 42.12590026855469, "learning_rate": 3.3875550322152503e-09, "logits/chosen": -2.0365333557128906, "logits/rejected": -1.9861524105072021, "logps/chosen": -221.8624725341797, "logps/rejected": -294.39288330078125, "loss": 0.5231, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.638580083847046, "rewards/margins": 0.7712908983230591, "rewards/rejected": -2.4098711013793945, "step": 16110 }, { "epoch": 2.777394900068918, "grad_norm": 52.53358459472656, "learning_rate": 3.3360088557746856e-09, "logits/chosen": -2.1154911518096924, "logits/rejected": -2.0882134437561035, "logps/chosen": -217.0634765625, "logps/rejected": -280.804443359375, "loss": 0.5567, "rewards/accuracies": 0.71875, "rewards/chosen": -1.660840630531311, "rewards/margins": 0.6266318559646606, "rewards/rejected": -2.2874722480773926, "step": 16120 }, { "epoch": 2.779117849758787, "grad_norm": 50.5311279296875, "learning_rate": 3.2848512109190375e-09, "logits/chosen": -2.087770700454712, "logits/rejected": -2.048919916152954, "logps/chosen": -232.2989959716797, "logps/rejected": -297.85614013671875, "loss": 0.5416, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.789602518081665, "rewards/margins": 0.6839150190353394, "rewards/rejected": -2.473517417907715, "step": 16130 }, { "epoch": 2.780840799448656, "grad_norm": 43.92984390258789, "learning_rate": 3.2340823032715125e-09, "logits/chosen": -2.1730029582977295, "logits/rejected": -2.129218578338623, "logps/chosen": -229.45938110351562, "logps/rejected": -296.0975036621094, "loss": 0.5441, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7570350170135498, "rewards/margins": 0.6896018981933594, "rewards/rejected": -2.44663667678833, "step": 16140 }, { "epoch": 2.782563749138525, "grad_norm": 42.87978744506836, "learning_rate": 3.1837023368928017e-09, "logits/chosen": -2.144951343536377, "logits/rejected": -2.1099157333374023, "logps/chosen": -230.6343536376953, "logps/rejected": -295.3407897949219, "loss": 0.542, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7498550415039062, "rewards/margins": 0.6905009150505066, "rewards/rejected": -2.4403560161590576, "step": 16150 }, { "epoch": 2.784286698828394, "grad_norm": 44.73198318481445, "learning_rate": 3.133711514280357e-09, "logits/chosen": -2.131856679916382, "logits/rejected": -2.0838818550109863, "logps/chosen": -218.6786346435547, "logps/rejected": -304.34088134765625, "loss": 0.4743, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6389833688735962, "rewards/margins": 0.8746803402900696, "rewards/rejected": -2.5136635303497314, "step": 16160 }, { "epoch": 2.786009648518263, "grad_norm": 32.14342498779297, "learning_rate": 3.084110036367449e-09, "logits/chosen": -2.046508312225342, "logits/rejected": -2.0131518840789795, "logps/chosen": -235.529541015625, "logps/rejected": -296.33636474609375, "loss": 0.5556, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8099933862686157, "rewards/margins": 0.6219733953475952, "rewards/rejected": -2.43196702003479, "step": 16170 }, { "epoch": 2.7877325982081325, "grad_norm": 40.33845520019531, "learning_rate": 3.034898102522454e-09, "logits/chosen": -2.088554859161377, "logits/rejected": -2.026787281036377, "logps/chosen": -236.4054718017578, "logps/rejected": -316.6522521972656, "loss": 0.5058, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7643744945526123, "rewards/margins": 0.889947235584259, "rewards/rejected": -2.6543216705322266, "step": 16180 }, { "epoch": 2.7894555478980014, "grad_norm": 48.48818588256836, "learning_rate": 2.9860759105479582e-09, "logits/chosen": -2.0988450050354004, "logits/rejected": -2.0645952224731445, "logps/chosen": -227.4365234375, "logps/rejected": -295.9173889160156, "loss": 0.5447, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7268270254135132, "rewards/margins": 0.6935473680496216, "rewards/rejected": -2.4203743934631348, "step": 16190 }, { "epoch": 2.7911784975878704, "grad_norm": 45.35523223876953, "learning_rate": 2.9376436566800667e-09, "logits/chosen": -2.0436596870422363, "logits/rejected": -1.9899866580963135, "logps/chosen": -226.82186889648438, "logps/rejected": -291.56561279296875, "loss": 0.5485, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7067333459854126, "rewards/margins": 0.7155084609985352, "rewards/rejected": -2.422241687774658, "step": 16200 }, { "epoch": 2.7929014472777394, "grad_norm": 38.39668655395508, "learning_rate": 2.8896015355875492e-09, "logits/chosen": -2.01147198677063, "logits/rejected": -1.9818751811981201, "logps/chosen": -226.8378448486328, "logps/rejected": -289.5968322753906, "loss": 0.5536, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7334760427474976, "rewards/margins": 0.6313256025314331, "rewards/rejected": -2.3648018836975098, "step": 16210 }, { "epoch": 2.794624396967609, "grad_norm": 37.38385009765625, "learning_rate": 2.841949740371086e-09, "logits/chosen": -2.089500665664673, "logits/rejected": -2.0475239753723145, "logps/chosen": -218.9881591796875, "logps/rejected": -295.2696228027344, "loss": 0.5192, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6111987829208374, "rewards/margins": 0.7985876798629761, "rewards/rejected": -2.4097864627838135, "step": 16220 }, { "epoch": 2.7963473466574778, "grad_norm": 32.81590270996094, "learning_rate": 2.7946884625624556e-09, "logits/chosen": -2.110447406768799, "logits/rejected": -2.0691370964050293, "logps/chosen": -223.9718475341797, "logps/rejected": -296.3196716308594, "loss": 0.5205, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7176158428192139, "rewards/margins": 0.7397913932800293, "rewards/rejected": -2.4574074745178223, "step": 16230 }, { "epoch": 2.7980702963473467, "grad_norm": 37.23353958129883, "learning_rate": 2.747817892123816e-09, "logits/chosen": -2.0878543853759766, "logits/rejected": -2.0457749366760254, "logps/chosen": -232.74758911132812, "logps/rejected": -308.37872314453125, "loss": 0.5228, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7941334247589111, "rewards/margins": 0.7627833485603333, "rewards/rejected": -2.5569167137145996, "step": 16240 }, { "epoch": 2.7997932460372157, "grad_norm": 40.1833381652832, "learning_rate": 2.7013382174468914e-09, "logits/chosen": -2.112239360809326, "logits/rejected": -2.079160213470459, "logps/chosen": -226.0060272216797, "logps/rejected": -285.4903564453125, "loss": 0.5518, "rewards/accuracies": 0.75, "rewards/chosen": -1.6794464588165283, "rewards/margins": 0.6362031102180481, "rewards/rejected": -2.3156495094299316, "step": 16250 }, { "epoch": 2.8015161957270847, "grad_norm": 49.72036361694336, "learning_rate": 2.6552496253522518e-09, "logits/chosen": -2.1082348823547363, "logits/rejected": -2.0651748180389404, "logps/chosen": -235.5941619873047, "logps/rejected": -320.23138427734375, "loss": 0.5462, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.8211790323257446, "rewards/margins": 0.833513617515564, "rewards/rejected": -2.654693126678467, "step": 16260 }, { "epoch": 2.8032391454169536, "grad_norm": 45.086212158203125, "learning_rate": 2.609552301088558e-09, "logits/chosen": -2.1189451217651367, "logits/rejected": -2.0799121856689453, "logps/chosen": -230.2737579345703, "logps/rejected": -297.52001953125, "loss": 0.5645, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7419335842132568, "rewards/margins": 0.7099277377128601, "rewards/rejected": -2.4518613815307617, "step": 16270 }, { "epoch": 2.804962095106823, "grad_norm": 43.261688232421875, "learning_rate": 2.5642464283317733e-09, "logits/chosen": -2.1716601848602295, "logits/rejected": -2.1354892253875732, "logps/chosen": -235.68460083007812, "logps/rejected": -300.2950744628906, "loss": 0.5466, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7859855890274048, "rewards/margins": 0.6980710625648499, "rewards/rejected": -2.4840569496154785, "step": 16280 }, { "epoch": 2.806685044796692, "grad_norm": 29.19462776184082, "learning_rate": 2.5193321891844866e-09, "logits/chosen": -2.1258790493011475, "logits/rejected": -2.0978095531463623, "logps/chosen": -223.1262664794922, "logps/rejected": -291.4661560058594, "loss": 0.5461, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6972472667694092, "rewards/margins": 0.6799039244651794, "rewards/rejected": -2.3771510124206543, "step": 16290 }, { "epoch": 2.808407994486561, "grad_norm": 45.25839614868164, "learning_rate": 2.4748097641751787e-09, "logits/chosen": -2.1373369693756104, "logits/rejected": -2.0963287353515625, "logps/chosen": -239.76138305664062, "logps/rejected": -298.3421936035156, "loss": 0.5828, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.8480463027954102, "rewards/margins": 0.6322572231292725, "rewards/rejected": -2.4803037643432617, "step": 16300 }, { "epoch": 2.81013094417643, "grad_norm": 41.874542236328125, "learning_rate": 2.4306793322574014e-09, "logits/chosen": -2.1024575233459473, "logits/rejected": -2.0653140544891357, "logps/chosen": -226.78759765625, "logps/rejected": -290.859130859375, "loss": 0.5652, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7299995422363281, "rewards/margins": 0.6347140669822693, "rewards/rejected": -2.364713430404663, "step": 16310 }, { "epoch": 2.8118538938662994, "grad_norm": 43.39019012451172, "learning_rate": 2.3869410708091787e-09, "logits/chosen": -2.108158588409424, "logits/rejected": -2.0729689598083496, "logps/chosen": -235.5603485107422, "logps/rejected": -313.83453369140625, "loss": 0.5132, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.79485285282135, "rewards/margins": 0.8003241419792175, "rewards/rejected": -2.595176935195923, "step": 16320 }, { "epoch": 2.8135768435561683, "grad_norm": 30.487186431884766, "learning_rate": 2.3435951556322386e-09, "logits/chosen": -2.1272568702697754, "logits/rejected": -2.086873769760132, "logps/chosen": -223.5836944580078, "logps/rejected": -285.99444580078125, "loss": 0.5445, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.643500566482544, "rewards/margins": 0.6845242381095886, "rewards/rejected": -2.3280248641967773, "step": 16330 }, { "epoch": 2.8152997932460373, "grad_norm": 36.673702239990234, "learning_rate": 2.3006417609513053e-09, "logits/chosen": -2.0551676750183105, "logits/rejected": -2.0148515701293945, "logps/chosen": -215.37350463867188, "logps/rejected": -295.75323486328125, "loss": 0.4746, "rewards/accuracies": 0.75, "rewards/chosen": -1.5917526483535767, "rewards/margins": 0.8517443537712097, "rewards/rejected": -2.4434971809387207, "step": 16340 }, { "epoch": 2.8170227429359063, "grad_norm": 32.942054748535156, "learning_rate": 2.258081059413397e-09, "logits/chosen": -2.192537546157837, "logits/rejected": -2.144319534301758, "logps/chosen": -220.494384765625, "logps/rejected": -292.1282653808594, "loss": 0.493, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6330026388168335, "rewards/margins": 0.7516492605209351, "rewards/rejected": -2.3846521377563477, "step": 16350 }, { "epoch": 2.8187456926257752, "grad_norm": 42.28948211669922, "learning_rate": 2.2159132220871623e-09, "logits/chosen": -2.1154656410217285, "logits/rejected": -2.0709640979766846, "logps/chosen": -223.71414184570312, "logps/rejected": -295.94488525390625, "loss": 0.5153, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6985929012298584, "rewards/margins": 0.7562544345855713, "rewards/rejected": -2.454847574234009, "step": 16360 }, { "epoch": 2.820468642315644, "grad_norm": 44.34054183959961, "learning_rate": 2.174138418462135e-09, "logits/chosen": -2.087660789489746, "logits/rejected": -2.0548341274261475, "logps/chosen": -232.76956176757812, "logps/rejected": -287.1643371582031, "loss": 0.5742, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7786098718643188, "rewards/margins": 0.5987129807472229, "rewards/rejected": -2.3773229122161865, "step": 16370 }, { "epoch": 2.822191592005513, "grad_norm": 48.648616790771484, "learning_rate": 2.132756816448111e-09, "logits/chosen": -2.1305344104766846, "logits/rejected": -2.101365089416504, "logps/chosen": -225.6343994140625, "logps/rejected": -291.4571533203125, "loss": 0.5371, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.702154517173767, "rewards/margins": 0.6686417460441589, "rewards/rejected": -2.3707962036132812, "step": 16380 }, { "epoch": 2.8239145416953826, "grad_norm": 39.413551330566406, "learning_rate": 2.0917685823744426e-09, "logits/chosen": -2.03899884223938, "logits/rejected": -2.005887508392334, "logps/chosen": -212.76266479492188, "logps/rejected": -286.5272216796875, "loss": 0.5432, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.587390422821045, "rewards/margins": 0.7614668011665344, "rewards/rejected": -2.3488574028015137, "step": 16390 }, { "epoch": 2.8256374913852516, "grad_norm": 41.09767532348633, "learning_rate": 2.0511738809894097e-09, "logits/chosen": -2.0512211322784424, "logits/rejected": -2.0137734413146973, "logps/chosen": -222.48031616210938, "logps/rejected": -295.8672790527344, "loss": 0.5057, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6593711376190186, "rewards/margins": 0.7600074410438538, "rewards/rejected": -2.4193787574768066, "step": 16400 }, { "epoch": 2.8256374913852516, "eval_logits/chosen": -2.1720340251922607, "eval_logits/rejected": -2.1526761054992676, "eval_logps/chosen": -217.00506591796875, "eval_logps/rejected": -251.57510375976562, "eval_loss": 0.6409673094749451, "eval_rewards/accuracies": 0.6305761933326721, "eval_rewards/chosen": -1.5798962116241455, "eval_rewards/margins": 0.30835920572280884, "eval_rewards/rejected": -1.8882551193237305, "eval_runtime": 383.5047, "eval_samples_per_second": 11.223, "eval_steps_per_second": 1.403, "step": 16400 }, { "epoch": 2.8273604410751205, "grad_norm": 37.83739471435547, "learning_rate": 2.0109728754594713e-09, "logits/chosen": -2.1093385219573975, "logits/rejected": -2.056161403656006, "logps/chosen": -235.99423217773438, "logps/rejected": -306.68701171875, "loss": 0.5049, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7986196279525757, "rewards/margins": 0.7931631803512573, "rewards/rejected": -2.591783046722412, "step": 16410 }, { "epoch": 2.82908339076499, "grad_norm": 51.42935562133789, "learning_rate": 1.9711657273686844e-09, "logits/chosen": -2.0189051628112793, "logits/rejected": -1.9867775440216064, "logps/chosen": -228.15390014648438, "logps/rejected": -289.6495666503906, "loss": 0.5659, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7289907932281494, "rewards/margins": 0.6051722764968872, "rewards/rejected": -2.334162950515747, "step": 16420 }, { "epoch": 2.830806340454859, "grad_norm": 60.047271728515625, "learning_rate": 1.93175259671805e-09, "logits/chosen": -2.024369716644287, "logits/rejected": -1.9945716857910156, "logps/chosen": -232.27267456054688, "logps/rejected": -299.2837829589844, "loss": 0.5415, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7775561809539795, "rewards/margins": 0.6843501329421997, "rewards/rejected": -2.4619064331054688, "step": 16430 }, { "epoch": 2.832529290144728, "grad_norm": 29.9412841796875, "learning_rate": 1.8927336419248596e-09, "logits/chosen": -2.0732693672180176, "logits/rejected": -2.032212734222412, "logps/chosen": -221.9842987060547, "logps/rejected": -298.64361572265625, "loss": 0.532, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.699705719947815, "rewards/margins": 0.7766598463058472, "rewards/rejected": -2.476365327835083, "step": 16440 }, { "epoch": 2.834252239834597, "grad_norm": 48.490699768066406, "learning_rate": 1.8541090198220144e-09, "logits/chosen": -2.151984691619873, "logits/rejected": -2.109513521194458, "logps/chosen": -230.19943237304688, "logps/rejected": -307.9920959472656, "loss": 0.4905, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7183396816253662, "rewards/margins": 0.8219423294067383, "rewards/rejected": -2.5402817726135254, "step": 16450 }, { "epoch": 2.835975189524466, "grad_norm": 61.01959228515625, "learning_rate": 1.8158788856574624e-09, "logits/chosen": -2.005765438079834, "logits/rejected": -1.9797627925872803, "logps/chosen": -213.1536865234375, "logps/rejected": -289.0273742675781, "loss": 0.5169, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6353687047958374, "rewards/margins": 0.7584549188613892, "rewards/rejected": -2.3938233852386475, "step": 16460 }, { "epoch": 2.837698139214335, "grad_norm": 49.8837890625, "learning_rate": 1.7780433930935312e-09, "logits/chosen": -2.124091148376465, "logits/rejected": -2.0797622203826904, "logps/chosen": -222.4127197265625, "logps/rejected": -310.6213684082031, "loss": 0.5079, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7021658420562744, "rewards/margins": 0.8872678875923157, "rewards/rejected": -2.5894336700439453, "step": 16470 }, { "epoch": 2.8394210889042037, "grad_norm": 52.482948303222656, "learning_rate": 1.74060269420635e-09, "logits/chosen": -2.023437738418579, "logits/rejected": -1.982782006263733, "logps/chosen": -220.4035186767578, "logps/rejected": -297.6961669921875, "loss": 0.502, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.668707251548767, "rewards/margins": 0.7751259803771973, "rewards/rejected": -2.443833112716675, "step": 16480 }, { "epoch": 2.841144038594073, "grad_norm": 48.38195037841797, "learning_rate": 1.7035569394851955e-09, "logits/chosen": -2.1020920276641846, "logits/rejected": -2.0738072395324707, "logps/chosen": -222.8717041015625, "logps/rejected": -270.82470703125, "loss": 0.5943, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7007564306259155, "rewards/margins": 0.4976697862148285, "rewards/rejected": -2.1984260082244873, "step": 16490 }, { "epoch": 2.842866988283942, "grad_norm": 29.190269470214844, "learning_rate": 1.6669062778318698e-09, "logits/chosen": -2.1296000480651855, "logits/rejected": -2.0699143409729004, "logps/chosen": -217.5924530029297, "logps/rejected": -268.601806640625, "loss": 0.5326, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5834438800811768, "rewards/margins": 0.6127041578292847, "rewards/rejected": -2.196147918701172, "step": 16500 }, { "epoch": 2.844589937973811, "grad_norm": 63.935707092285156, "learning_rate": 1.6306508565602228e-09, "logits/chosen": -2.133659839630127, "logits/rejected": -2.0986266136169434, "logps/chosen": -220.5809326171875, "logps/rejected": -289.7440490722656, "loss": 0.5213, "rewards/accuracies": 0.75, "rewards/chosen": -1.6280450820922852, "rewards/margins": 0.7229426503181458, "rewards/rejected": -2.350987672805786, "step": 16510 }, { "epoch": 2.84631288766368, "grad_norm": 50.458160400390625, "learning_rate": 1.5947908213953753e-09, "logits/chosen": -2.1781907081604004, "logits/rejected": -2.126542568206787, "logps/chosen": -232.2783660888672, "logps/rejected": -316.50927734375, "loss": 0.4762, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7517235279083252, "rewards/margins": 0.8916566967964172, "rewards/rejected": -2.6433801651000977, "step": 16520 }, { "epoch": 2.8480358373535495, "grad_norm": 53.46482467651367, "learning_rate": 1.5593263164732972e-09, "logits/chosen": -2.0542373657226562, "logits/rejected": -2.0234646797180176, "logps/chosen": -227.55838012695312, "logps/rejected": -275.9589538574219, "loss": 0.5728, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7072197198867798, "rewards/margins": 0.5339667201042175, "rewards/rejected": -2.2411863803863525, "step": 16530 }, { "epoch": 2.8497587870434185, "grad_norm": 53.02235794067383, "learning_rate": 1.5242574843401524e-09, "logits/chosen": -2.079049587249756, "logits/rejected": -2.04000186920166, "logps/chosen": -233.3113555908203, "logps/rejected": -297.20574951171875, "loss": 0.5702, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7984466552734375, "rewards/margins": 0.6766330003738403, "rewards/rejected": -2.4750795364379883, "step": 16540 }, { "epoch": 2.8514817367332874, "grad_norm": 33.67155456542969, "learning_rate": 1.489584465951721e-09, "logits/chosen": -2.0668838024139404, "logits/rejected": -2.0234920978546143, "logps/chosen": -229.3888702392578, "logps/rejected": -313.6064758300781, "loss": 0.4876, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7475191354751587, "rewards/margins": 0.8554231524467468, "rewards/rejected": -2.60294246673584, "step": 16550 }, { "epoch": 2.8532046864231564, "grad_norm": 34.016475677490234, "learning_rate": 1.455307400672845e-09, "logits/chosen": -2.0811784267425537, "logits/rejected": -2.0434277057647705, "logps/chosen": -222.74697875976562, "logps/rejected": -296.45184326171875, "loss": 0.5085, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.672705054283142, "rewards/margins": 0.7318105101585388, "rewards/rejected": -2.404515504837036, "step": 16560 }, { "epoch": 2.8549276361130254, "grad_norm": 43.50887680053711, "learning_rate": 1.421426426276895e-09, "logits/chosen": -2.14603590965271, "logits/rejected": -2.096391439437866, "logps/chosen": -219.7465057373047, "logps/rejected": -290.72857666015625, "loss": 0.4984, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6511335372924805, "rewards/margins": 0.7488675713539124, "rewards/rejected": -2.400001049041748, "step": 16570 }, { "epoch": 2.8566505858028943, "grad_norm": 35.695289611816406, "learning_rate": 1.3879416789451815e-09, "logits/chosen": -2.062807321548462, "logits/rejected": -2.0235047340393066, "logps/chosen": -222.1370849609375, "logps/rejected": -294.31915283203125, "loss": 0.5454, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7023674249649048, "rewards/margins": 0.7214884757995605, "rewards/rejected": -2.423856258392334, "step": 16580 }, { "epoch": 2.8583735354927637, "grad_norm": 71.88639068603516, "learning_rate": 1.3548532932663891e-09, "logits/chosen": -2.0466041564941406, "logits/rejected": -2.005546808242798, "logps/chosen": -237.32559204101562, "logps/rejected": -296.65850830078125, "loss": 0.5666, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.8258588314056396, "rewards/margins": 0.6459594368934631, "rewards/rejected": -2.471818208694458, "step": 16590 }, { "epoch": 2.8600964851826327, "grad_norm": 47.83307647705078, "learning_rate": 1.3221614022361105e-09, "logits/chosen": -2.078378200531006, "logits/rejected": -2.0499420166015625, "logps/chosen": -234.3688507080078, "logps/rejected": -295.27923583984375, "loss": 0.569, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7668821811676025, "rewards/margins": 0.641403079032898, "rewards/rejected": -2.408285617828369, "step": 16600 }, { "epoch": 2.8618194348725017, "grad_norm": 43.39389419555664, "learning_rate": 1.289866137256257e-09, "logits/chosen": -2.072134494781494, "logits/rejected": -2.011193037033081, "logps/chosen": -239.80465698242188, "logps/rejected": -318.4080505371094, "loss": 0.5168, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8393303155899048, "rewards/margins": 0.820911705493927, "rewards/rejected": -2.6602420806884766, "step": 16610 }, { "epoch": 2.8635423845623706, "grad_norm": 31.751678466796875, "learning_rate": 1.2579676281345042e-09, "logits/chosen": -2.0522119998931885, "logits/rejected": -2.009146213531494, "logps/chosen": -218.08444213867188, "logps/rejected": -292.8926086425781, "loss": 0.5054, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6057907342910767, "rewards/margins": 0.7817180752754211, "rewards/rejected": -2.3875088691711426, "step": 16620 }, { "epoch": 2.86526533425224, "grad_norm": 74.23681640625, "learning_rate": 1.2264660030838592e-09, "logits/chosen": -2.059389591217041, "logits/rejected": -2.0104565620422363, "logps/chosen": -231.3424072265625, "logps/rejected": -301.61322021484375, "loss": 0.5198, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.684125304222107, "rewards/margins": 0.8123534917831421, "rewards/rejected": -2.496478796005249, "step": 16630 }, { "epoch": 2.866988283942109, "grad_norm": 64.68147277832031, "learning_rate": 1.195361388722038e-09, "logits/chosen": -2.0717241764068604, "logits/rejected": -2.021415948867798, "logps/chosen": -247.53384399414062, "logps/rejected": -317.575439453125, "loss": 0.5592, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.9080226421356201, "rewards/margins": 0.7610594630241394, "rewards/rejected": -2.6690824031829834, "step": 16640 }, { "epoch": 2.868711233631978, "grad_norm": 42.12141418457031, "learning_rate": 1.1646539100710562e-09, "logits/chosen": -2.051790952682495, "logits/rejected": -2.012688636779785, "logps/chosen": -212.3316192626953, "logps/rejected": -288.4221496582031, "loss": 0.4891, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5662543773651123, "rewards/margins": 0.8092681765556335, "rewards/rejected": -2.3755226135253906, "step": 16650 }, { "epoch": 2.870434183321847, "grad_norm": 47.80633544921875, "learning_rate": 1.1343436905566495e-09, "logits/chosen": -2.088360548019409, "logits/rejected": -2.055044174194336, "logps/chosen": -228.2832794189453, "logps/rejected": -304.2309875488281, "loss": 0.5157, "rewards/accuracies": 0.75, "rewards/chosen": -1.7355735301971436, "rewards/margins": 0.7714985013008118, "rewards/rejected": -2.5070719718933105, "step": 16660 }, { "epoch": 2.872157133011716, "grad_norm": 32.09943389892578, "learning_rate": 1.1044308520078316e-09, "logits/chosen": -2.1001334190368652, "logits/rejected": -2.062692642211914, "logps/chosen": -211.157958984375, "logps/rejected": -298.08978271484375, "loss": 0.4738, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5801347494125366, "rewards/margins": 0.8807607889175415, "rewards/rejected": -2.460895538330078, "step": 16670 }, { "epoch": 2.873880082701585, "grad_norm": 47.860862731933594, "learning_rate": 1.0749155146563493e-09, "logits/chosen": -2.022371530532837, "logits/rejected": -1.9902015924453735, "logps/chosen": -225.13632202148438, "logps/rejected": -289.27484130859375, "loss": 0.5621, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7542200088500977, "rewards/margins": 0.6523141264915466, "rewards/rejected": -2.406534194946289, "step": 16680 }, { "epoch": 2.8756030323914543, "grad_norm": 37.298927307128906, "learning_rate": 1.0457977971362831e-09, "logits/chosen": -2.1136248111724854, "logits/rejected": -2.0815658569335938, "logps/chosen": -221.63546752929688, "logps/rejected": -277.52569580078125, "loss": 0.5695, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6617939472198486, "rewards/margins": 0.5853327512741089, "rewards/rejected": -2.247126579284668, "step": 16690 }, { "epoch": 2.8773259820813233, "grad_norm": 48.6547966003418, "learning_rate": 1.0170778164834581e-09, "logits/chosen": -2.182882308959961, "logits/rejected": -2.1483664512634277, "logps/chosen": -232.5152587890625, "logps/rejected": -292.2670593261719, "loss": 0.591, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7677936553955078, "rewards/margins": 0.6265477538108826, "rewards/rejected": -2.394341230392456, "step": 16700 }, { "epoch": 2.8790489317711923, "grad_norm": 47.388885498046875, "learning_rate": 9.887556881350901e-10, "logits/chosen": -2.0984857082366943, "logits/rejected": -2.055283784866333, "logps/chosen": -229.77487182617188, "logps/rejected": -302.63848876953125, "loss": 0.553, "rewards/accuracies": 0.75, "rewards/chosen": -1.7760871648788452, "rewards/margins": 0.75262051820755, "rewards/rejected": -2.528707504272461, "step": 16710 }, { "epoch": 2.8807718814610612, "grad_norm": 30.91242790222168, "learning_rate": 9.608315259292288e-10, "logits/chosen": -2.0606322288513184, "logits/rejected": -2.015148639678955, "logps/chosen": -229.14425659179688, "logps/rejected": -295.65289306640625, "loss": 0.5164, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.736588478088379, "rewards/margins": 0.7398849129676819, "rewards/rejected": -2.476473331451416, "step": 16720 }, { "epoch": 2.8824948311509306, "grad_norm": 32.950870513916016, "learning_rate": 9.333054421043484e-10, "logits/chosen": -2.050093173980713, "logits/rejected": -2.0072808265686035, "logps/chosen": -218.2460479736328, "logps/rejected": -304.90594482421875, "loss": 0.4845, "rewards/accuracies": 0.75, "rewards/chosen": -1.6574676036834717, "rewards/margins": 0.8856703042984009, "rewards/rejected": -2.543137788772583, "step": 16730 }, { "epoch": 2.8842177808407996, "grad_norm": 47.38127899169922, "learning_rate": 9.06177547298892e-10, "logits/chosen": -2.040806293487549, "logits/rejected": -2.002763271331787, "logps/chosen": -228.78292846679688, "logps/rejected": -291.5965881347656, "loss": 0.5487, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7215182781219482, "rewards/margins": 0.6703734993934631, "rewards/rejected": -2.3918917179107666, "step": 16740 }, { "epoch": 2.8859407305306686, "grad_norm": 37.412601470947266, "learning_rate": 8.794479505508268e-10, "logits/chosen": -2.125983476638794, "logits/rejected": -2.08128023147583, "logps/chosen": -225.7335968017578, "logps/rejected": -317.2071838378906, "loss": 0.4742, "rewards/accuracies": 0.78125, "rewards/chosen": -1.707411766052246, "rewards/margins": 0.8916788101196289, "rewards/rejected": -2.599090337753296, "step": 16750 }, { "epoch": 2.8876636802205375, "grad_norm": 39.6590461730957, "learning_rate": 8.531167592971566e-10, "logits/chosen": -2.0325124263763428, "logits/rejected": -1.992448091506958, "logps/chosen": -233.4219970703125, "logps/rejected": -304.00323486328125, "loss": 0.5327, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7760006189346313, "rewards/margins": 0.7474104166030884, "rewards/rejected": -2.5234107971191406, "step": 16760 }, { "epoch": 2.8893866299104065, "grad_norm": 34.32674789428711, "learning_rate": 8.271840793735884e-10, "logits/chosen": -2.150630474090576, "logits/rejected": -2.098395824432373, "logps/chosen": -233.03384399414062, "logps/rejected": -300.7596740722656, "loss": 0.5388, "rewards/accuracies": 0.75, "rewards/chosen": -1.7554289102554321, "rewards/margins": 0.7637473344802856, "rewards/rejected": -2.5191762447357178, "step": 16770 }, { "epoch": 2.8911095796002755, "grad_norm": 35.03683090209961, "learning_rate": 8.016500150140215e-10, "logits/chosen": -2.0818848609924316, "logits/rejected": -2.048121452331543, "logps/chosen": -225.534423828125, "logps/rejected": -297.86639404296875, "loss": 0.5285, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7005207538604736, "rewards/margins": 0.7076085805892944, "rewards/rejected": -2.4081294536590576, "step": 16780 }, { "epoch": 2.892832529290145, "grad_norm": 35.001564025878906, "learning_rate": 7.765146688501589e-10, "logits/chosen": -2.0988194942474365, "logits/rejected": -2.039665460586548, "logps/chosen": -226.2569122314453, "logps/rejected": -283.5757141113281, "loss": 0.5791, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6833999156951904, "rewards/margins": 0.6543527841567993, "rewards/rejected": -2.3377525806427, "step": 16790 }, { "epoch": 2.894555478980014, "grad_norm": 76.95634460449219, "learning_rate": 7.51778141911108e-10, "logits/chosen": -2.1269469261169434, "logits/rejected": -2.0994410514831543, "logps/chosen": -237.2293243408203, "logps/rejected": -302.3692932128906, "loss": 0.5731, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.8673900365829468, "rewards/margins": 0.6507739424705505, "rewards/rejected": -2.5181639194488525, "step": 16800 }, { "epoch": 2.894555478980014, "eval_logits/chosen": -2.170219898223877, "eval_logits/rejected": -2.150702953338623, "eval_logps/chosen": -218.18539428710938, "eval_logps/rejected": -252.9563751220703, "eval_loss": 0.6412459015846252, "eval_rewards/accuracies": 0.6270910501480103, "eval_rewards/chosen": -1.5916991233825684, "eval_rewards/margins": 0.3103685677051544, "eval_rewards/rejected": -1.90206778049469, "eval_runtime": 383.1692, "eval_samples_per_second": 11.233, "eval_steps_per_second": 1.404, "step": 16800 }, { "epoch": 2.896278428669883, "grad_norm": 39.302852630615234, "learning_rate": 7.274405336229361e-10, "logits/chosen": -2.073404550552368, "logits/rejected": -2.026705503463745, "logps/chosen": -217.42691040039062, "logps/rejected": -291.86859130859375, "loss": 0.5075, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6343815326690674, "rewards/margins": 0.7991583347320557, "rewards/rejected": -2.433539867401123, "step": 16810 }, { "epoch": 2.898001378359752, "grad_norm": 57.89147186279297, "learning_rate": 7.035019418083376e-10, "logits/chosen": -2.1040916442871094, "logits/rejected": -2.0590415000915527, "logps/chosen": -234.2236328125, "logps/rejected": -289.3756103515625, "loss": 0.5516, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.780346155166626, "rewards/margins": 0.5990556478500366, "rewards/rejected": -2.379401683807373, "step": 16820 }, { "epoch": 2.899724328049621, "grad_norm": 45.09634780883789, "learning_rate": 6.799624626861456e-10, "logits/chosen": -2.165893316268921, "logits/rejected": -2.1156418323516846, "logps/chosen": -242.53964233398438, "logps/rejected": -330.9523010253906, "loss": 0.4985, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.869193434715271, "rewards/margins": 0.8991667032241821, "rewards/rejected": -2.768360137939453, "step": 16830 }, { "epoch": 2.90144727773949, "grad_norm": 64.08953857421875, "learning_rate": 6.568221908710314e-10, "logits/chosen": -2.063244104385376, "logits/rejected": -2.0177295207977295, "logps/chosen": -226.95565795898438, "logps/rejected": -297.2478332519531, "loss": 0.518, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.729138731956482, "rewards/margins": 0.7378978133201599, "rewards/rejected": -2.467036724090576, "step": 16840 }, { "epoch": 2.903170227429359, "grad_norm": 83.15585327148438, "learning_rate": 6.340812193730949e-10, "logits/chosen": -2.103969097137451, "logits/rejected": -2.0735132694244385, "logps/chosen": -242.5148162841797, "logps/rejected": -297.60333251953125, "loss": 0.546, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8600852489471436, "rewards/margins": 0.6247068643569946, "rewards/rejected": -2.4847922325134277, "step": 16850 }, { "epoch": 2.904893177119228, "grad_norm": 39.73136520385742, "learning_rate": 6.117396395974749e-10, "logits/chosen": -2.1144349575042725, "logits/rejected": -2.0644149780273438, "logps/chosen": -232.8963165283203, "logps/rejected": -293.119873046875, "loss": 0.5494, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7648794651031494, "rewards/margins": 0.6657959222793579, "rewards/rejected": -2.4306752681732178, "step": 16860 }, { "epoch": 2.906616126809097, "grad_norm": 60.453369140625, "learning_rate": 5.897975413439837e-10, "logits/chosen": -2.123142719268799, "logits/rejected": -2.086905002593994, "logps/chosen": -229.3736572265625, "logps/rejected": -286.8680114746094, "loss": 0.5826, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7463557720184326, "rewards/margins": 0.6210822463035583, "rewards/rejected": -2.3674380779266357, "step": 16870 }, { "epoch": 2.908339076498966, "grad_norm": 46.789649963378906, "learning_rate": 5.682550128067731e-10, "logits/chosen": -2.1383373737335205, "logits/rejected": -2.0988118648529053, "logps/chosen": -225.3925018310547, "logps/rejected": -306.0677490234375, "loss": 0.5104, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7493778467178345, "rewards/margins": 0.7984367609024048, "rewards/rejected": -2.54781436920166, "step": 16880 }, { "epoch": 2.910062026188835, "grad_norm": 48.62042999267578, "learning_rate": 5.471121405739687e-10, "logits/chosen": -2.0652871131896973, "logits/rejected": -2.0350475311279297, "logps/chosen": -239.32034301757812, "logps/rejected": -307.5757751464844, "loss": 0.5242, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8389384746551514, "rewards/margins": 0.7051053047180176, "rewards/rejected": -2.544044017791748, "step": 16890 }, { "epoch": 2.9117849758787044, "grad_norm": 49.12190246582031, "learning_rate": 5.263690096273033e-10, "logits/chosen": -2.131868362426758, "logits/rejected": -2.0974514484405518, "logps/chosen": -223.4816436767578, "logps/rejected": -291.1946716308594, "loss": 0.5153, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.69611394405365, "rewards/margins": 0.7257545590400696, "rewards/rejected": -2.4218688011169434, "step": 16900 }, { "epoch": 2.9135079255685734, "grad_norm": 31.582202911376953, "learning_rate": 5.060257033417725e-10, "logits/chosen": -2.151571273803711, "logits/rejected": -2.1119132041931152, "logps/chosen": -229.43637084960938, "logps/rejected": -301.1514587402344, "loss": 0.5362, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.769758939743042, "rewards/margins": 0.7082198858261108, "rewards/rejected": -2.4779789447784424, "step": 16910 }, { "epoch": 2.9152308752584424, "grad_norm": 43.54109191894531, "learning_rate": 4.860823034853468e-10, "logits/chosen": -2.0970876216888428, "logits/rejected": -2.0635154247283936, "logps/chosen": -226.046875, "logps/rejected": -284.4517517089844, "loss": 0.5692, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.70242178440094, "rewards/margins": 0.6119588613510132, "rewards/rejected": -2.314380168914795, "step": 16920 }, { "epoch": 2.9169538249483113, "grad_norm": 37.83869171142578, "learning_rate": 4.66538890218593e-10, "logits/chosen": -2.1397299766540527, "logits/rejected": -2.1106526851654053, "logps/chosen": -214.8348846435547, "logps/rejected": -270.4018249511719, "loss": 0.5509, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5738188028335571, "rewards/margins": 0.6245588660240173, "rewards/rejected": -2.1983776092529297, "step": 16930 }, { "epoch": 2.9186767746381808, "grad_norm": 38.34006881713867, "learning_rate": 4.4739554209437536e-10, "logits/chosen": -2.111955165863037, "logits/rejected": -2.0768473148345947, "logps/chosen": -220.9971923828125, "logps/rejected": -288.90179443359375, "loss": 0.521, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6153472661972046, "rewards/margins": 0.7338367104530334, "rewards/rejected": -2.349184036254883, "step": 16940 }, { "epoch": 2.9203997243280497, "grad_norm": 41.916229248046875, "learning_rate": 4.286523360575334e-10, "logits/chosen": -2.0766398906707764, "logits/rejected": -2.0473122596740723, "logps/chosen": -223.3889617919922, "logps/rejected": -304.4686584472656, "loss": 0.5363, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7231884002685547, "rewards/margins": 0.7785578370094299, "rewards/rejected": -2.50174617767334, "step": 16950 }, { "epoch": 2.9221226740179187, "grad_norm": 41.175865173339844, "learning_rate": 4.103093474445818e-10, "logits/chosen": -2.0784764289855957, "logits/rejected": -2.036428689956665, "logps/chosen": -229.8789825439453, "logps/rejected": -313.3496398925781, "loss": 0.5057, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.738173246383667, "rewards/margins": 0.8513944745063782, "rewards/rejected": -2.5895678997039795, "step": 16960 }, { "epoch": 2.9238456237077877, "grad_norm": 45.59201431274414, "learning_rate": 3.9236664998338885e-10, "logits/chosen": -2.162342071533203, "logits/rejected": -2.118924140930176, "logps/chosen": -231.0322265625, "logps/rejected": -301.0166931152344, "loss": 0.5236, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7455734014511108, "rewards/margins": 0.7369973063468933, "rewards/rejected": -2.4825704097747803, "step": 16970 }, { "epoch": 2.9255685733976566, "grad_norm": 42.4276123046875, "learning_rate": 3.7482431579289873e-10, "logits/chosen": -2.136894702911377, "logits/rejected": -2.0962953567504883, "logps/chosen": -219.96725463867188, "logps/rejected": -285.20452880859375, "loss": 0.5119, "rewards/accuracies": 0.78125, "rewards/chosen": -1.650986671447754, "rewards/margins": 0.6824150681495667, "rewards/rejected": -2.3334014415740967, "step": 16980 }, { "epoch": 2.9272915230875256, "grad_norm": 28.827068328857422, "learning_rate": 3.5768241538282064e-10, "logits/chosen": -2.1777596473693848, "logits/rejected": -2.1340672969818115, "logps/chosen": -210.70730590820312, "logps/rejected": -287.60089111328125, "loss": 0.5085, "rewards/accuracies": 0.75, "rewards/chosen": -1.5722601413726807, "rewards/margins": 0.786490797996521, "rewards/rejected": -2.358751058578491, "step": 16990 }, { "epoch": 2.929014472777395, "grad_norm": 70.47722625732422, "learning_rate": 3.4094101765338446e-10, "logits/chosen": -2.1753337383270264, "logits/rejected": -2.140491485595703, "logps/chosen": -220.97109985351562, "logps/rejected": -279.3697509765625, "loss": 0.5656, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6485884189605713, "rewards/margins": 0.6162117719650269, "rewards/rejected": -2.2647998332977295, "step": 17000 }, { "epoch": 2.930737422467264, "grad_norm": 33.92271041870117, "learning_rate": 3.24600189895019e-10, "logits/chosen": -2.0778861045837402, "logits/rejected": -2.0436019897460938, "logps/chosen": -242.9732666015625, "logps/rejected": -312.52880859375, "loss": 0.5511, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.89163339138031, "rewards/margins": 0.6904274225234985, "rewards/rejected": -2.5820608139038086, "step": 17010 }, { "epoch": 2.932460372157133, "grad_norm": 64.56465911865234, "learning_rate": 3.086599977880855e-10, "logits/chosen": -2.077587366104126, "logits/rejected": -2.061856746673584, "logps/chosen": -231.9762725830078, "logps/rejected": -284.2293701171875, "loss": 0.6059, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7796518802642822, "rewards/margins": 0.5387360453605652, "rewards/rejected": -2.318387985229492, "step": 17020 }, { "epoch": 2.934183321847002, "grad_norm": 44.50904083251953, "learning_rate": 2.931205054026775e-10, "logits/chosen": -2.123732566833496, "logits/rejected": -2.082094669342041, "logps/chosen": -231.95407104492188, "logps/rejected": -289.9576110839844, "loss": 0.5535, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7447305917739868, "rewards/margins": 0.6295886635780334, "rewards/rejected": -2.374319076538086, "step": 17030 }, { "epoch": 2.9359062715368713, "grad_norm": 35.688663482666016, "learning_rate": 2.7798177519826605e-10, "logits/chosen": -2.1549980640411377, "logits/rejected": -2.1072096824645996, "logps/chosen": -237.3199462890625, "logps/rejected": -303.9327392578125, "loss": 0.5179, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.8088245391845703, "rewards/margins": 0.7268606424331665, "rewards/rejected": -2.5356850624084473, "step": 17040 }, { "epoch": 2.9376292212267403, "grad_norm": 52.765228271484375, "learning_rate": 2.632438680235216e-10, "logits/chosen": -2.083176851272583, "logits/rejected": -2.050807476043701, "logps/chosen": -238.5670166015625, "logps/rejected": -295.58355712890625, "loss": 0.5802, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.8304694890975952, "rewards/margins": 0.5952358245849609, "rewards/rejected": -2.4257054328918457, "step": 17050 }, { "epoch": 2.9393521709166093, "grad_norm": 53.14019012451172, "learning_rate": 2.4890684311603683e-10, "logits/chosen": -2.1409833431243896, "logits/rejected": -2.0979952812194824, "logps/chosen": -231.754638671875, "logps/rejected": -292.4068298339844, "loss": 0.5759, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7674375772476196, "rewards/margins": 0.6122730374336243, "rewards/rejected": -2.3797104358673096, "step": 17060 }, { "epoch": 2.9410751206064782, "grad_norm": 63.452335357666016, "learning_rate": 2.3497075810210433e-10, "logits/chosen": -2.1018431186676025, "logits/rejected": -2.0571179389953613, "logps/chosen": -235.9232635498047, "logps/rejected": -294.25274658203125, "loss": 0.5765, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7991282939910889, "rewards/margins": 0.6322215795516968, "rewards/rejected": -2.431349992752075, "step": 17070 }, { "epoch": 2.942798070296347, "grad_norm": 39.4548454284668, "learning_rate": 2.2143566899647248e-10, "logits/chosen": -2.051210880279541, "logits/rejected": -2.0057883262634277, "logps/chosen": -226.5519256591797, "logps/rejected": -315.3668518066406, "loss": 0.4735, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.726647138595581, "rewards/margins": 0.9013614654541016, "rewards/rejected": -2.6280086040496826, "step": 17080 }, { "epoch": 2.944521019986216, "grad_norm": 38.06551742553711, "learning_rate": 2.0830163020212344e-10, "logits/chosen": -2.099445104598999, "logits/rejected": -2.0651302337646484, "logps/chosen": -226.014404296875, "logps/rejected": -304.4267272949219, "loss": 0.5062, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.72359299659729, "rewards/margins": 0.7966343760490417, "rewards/rejected": -2.5202274322509766, "step": 17090 }, { "epoch": 2.9462439696760856, "grad_norm": 42.14332580566406, "learning_rate": 1.955686945100621e-10, "logits/chosen": -2.061148166656494, "logits/rejected": -2.016936779022217, "logps/chosen": -233.59298706054688, "logps/rejected": -298.9555358886719, "loss": 0.5449, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7688153982162476, "rewards/margins": 0.700672447681427, "rewards/rejected": -2.4694876670837402, "step": 17100 }, { "epoch": 2.9479669193659546, "grad_norm": 39.180789947509766, "learning_rate": 1.8323691309909407e-10, "logits/chosen": -2.0598368644714355, "logits/rejected": -2.03139066696167, "logps/chosen": -250.33279418945312, "logps/rejected": -325.10748291015625, "loss": 0.5263, "rewards/accuracies": 0.71875, "rewards/chosen": -1.9446275234222412, "rewards/margins": 0.7775768041610718, "rewards/rejected": -2.7222044467926025, "step": 17110 }, { "epoch": 2.9496898690558235, "grad_norm": 37.56937026977539, "learning_rate": 1.7130633553561479e-10, "logits/chosen": -2.158722400665283, "logits/rejected": -2.103527545928955, "logps/chosen": -226.38259887695312, "logps/rejected": -308.0562438964844, "loss": 0.4577, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7288379669189453, "rewards/margins": 0.8727463483810425, "rewards/rejected": -2.6015844345092773, "step": 17120 }, { "epoch": 2.9514128187456925, "grad_norm": 47.376258850097656, "learning_rate": 1.597770097734541e-10, "logits/chosen": -2.040189266204834, "logits/rejected": -1.9927847385406494, "logps/chosen": -238.5029754638672, "logps/rejected": -307.3231201171875, "loss": 0.5081, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8275566101074219, "rewards/margins": 0.7151464223861694, "rewards/rejected": -2.542703151702881, "step": 17130 }, { "epoch": 2.953135768435562, "grad_norm": 30.941635131835938, "learning_rate": 1.4864898215359857e-10, "logits/chosen": -2.0431325435638428, "logits/rejected": -2.0120351314544678, "logps/chosen": -223.69985961914062, "logps/rejected": -299.73345947265625, "loss": 0.5041, "rewards/accuracies": 0.75, "rewards/chosen": -1.6792500019073486, "rewards/margins": 0.7977226972579956, "rewards/rejected": -2.476972818374634, "step": 17140 }, { "epoch": 2.954858718125431, "grad_norm": 46.703495025634766, "learning_rate": 1.3792229740409166e-10, "logits/chosen": -2.152099847793579, "logits/rejected": -2.1069183349609375, "logps/chosen": -233.14736938476562, "logps/rejected": -298.739501953125, "loss": 0.5626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7751268148422241, "rewards/margins": 0.6954813599586487, "rewards/rejected": -2.4706082344055176, "step": 17150 }, { "epoch": 2.9565816678153, "grad_norm": 30.80360221862793, "learning_rate": 1.2759699863980067e-10, "logits/chosen": -2.1575663089752197, "logits/rejected": -2.1177420616149902, "logps/chosen": -221.9051513671875, "logps/rejected": -321.9290771484375, "loss": 0.4635, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6959228515625, "rewards/margins": 1.0101712942123413, "rewards/rejected": -2.706094264984131, "step": 17160 }, { "epoch": 2.958304617505169, "grad_norm": 36.00048065185547, "learning_rate": 1.1767312736228329e-10, "logits/chosen": -2.1762290000915527, "logits/rejected": -2.138206958770752, "logps/chosen": -250.8097686767578, "logps/rejected": -305.70135498046875, "loss": 0.6168, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.907684087753296, "rewards/margins": 0.5790875554084778, "rewards/rejected": -2.486771583557129, "step": 17170 }, { "epoch": 2.960027567195038, "grad_norm": 45.304500579833984, "learning_rate": 1.0815072345957688e-10, "logits/chosen": -2.125030517578125, "logits/rejected": -2.088151216506958, "logps/chosen": -229.660888671875, "logps/rejected": -296.909912109375, "loss": 0.5473, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7812343835830688, "rewards/margins": 0.6809852719306946, "rewards/rejected": -2.462219476699829, "step": 17180 }, { "epoch": 2.9617505168849068, "grad_norm": 43.71578598022461, "learning_rate": 9.902982520605396e-11, "logits/chosen": -2.073464870452881, "logits/rejected": -2.042576313018799, "logps/chosen": -213.50186157226562, "logps/rejected": -281.359619140625, "loss": 0.5209, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6059232950210571, "rewards/margins": 0.6878377795219421, "rewards/rejected": -2.2937610149383545, "step": 17190 }, { "epoch": 2.963473466574776, "grad_norm": 33.20631790161133, "learning_rate": 9.031046926230024e-11, "logits/chosen": -2.134242534637451, "logits/rejected": -2.099299907684326, "logps/chosen": -210.3935546875, "logps/rejected": -288.7630310058594, "loss": 0.4958, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.551510214805603, "rewards/margins": 0.8001585006713867, "rewards/rejected": -2.3516688346862793, "step": 17200 }, { "epoch": 2.963473466574776, "eval_logits/chosen": -2.1701745986938477, "eval_logits/rejected": -2.150585651397705, "eval_logps/chosen": -218.34732055664062, "eval_logps/rejected": -253.1478271484375, "eval_loss": 0.6411592364311218, "eval_rewards/accuracies": 0.6296468377113342, "eval_rewards/chosen": -1.5933184623718262, "eval_rewards/margins": 0.31066349148750305, "eval_rewards/rejected": -1.903982162475586, "eval_runtime": 382.9775, "eval_samples_per_second": 11.238, "eval_steps_per_second": 1.405, "step": 17200 }, { "epoch": 2.965196416264645, "grad_norm": 45.74662780761719, "learning_rate": 8.199269067491466e-11, "logits/chosen": -2.063141345977783, "logits/rejected": -2.02927303314209, "logps/chosen": -233.63015747070312, "logps/rejected": -305.9888610839844, "loss": 0.5514, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.8087259531021118, "rewards/margins": 0.7234727144241333, "rewards/rejected": -2.532198905944824, "step": 17210 }, { "epoch": 2.966919365954514, "grad_norm": 30.98674201965332, "learning_rate": 7.407652287640953e-11, "logits/chosen": -2.1059508323669434, "logits/rejected": -2.0682930946350098, "logps/chosen": -230.4197998046875, "logps/rejected": -323.3200378417969, "loss": 0.4998, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7401487827301025, "rewards/margins": 0.9332057237625122, "rewards/rejected": -2.673354387283325, "step": 17220 }, { "epoch": 2.968642315644383, "grad_norm": 32.45381164550781, "learning_rate": 6.656199768505511e-11, "logits/chosen": -2.115161895751953, "logits/rejected": -2.0847389698028564, "logps/chosen": -229.85830688476562, "logps/rejected": -305.0896911621094, "loss": 0.5406, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7608146667480469, "rewards/margins": 0.7466955780982971, "rewards/rejected": -2.5075104236602783, "step": 17230 }, { "epoch": 2.9703652653342525, "grad_norm": 38.92058563232422, "learning_rate": 5.944914530475742e-11, "logits/chosen": -2.1152920722961426, "logits/rejected": -2.0800163745880127, "logps/chosen": -213.4129180908203, "logps/rejected": -281.83587646484375, "loss": 0.5245, "rewards/accuracies": 0.75, "rewards/chosen": -1.586476445198059, "rewards/margins": 0.6894331574440002, "rewards/rejected": -2.275909662246704, "step": 17240 }, { "epoch": 2.9720882150241215, "grad_norm": 39.71238327026367, "learning_rate": 5.2737994324958403e-11, "logits/chosen": -2.0901167392730713, "logits/rejected": -2.052403688430786, "logps/chosen": -223.3805389404297, "logps/rejected": -303.0497741699219, "loss": 0.4873, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.699758768081665, "rewards/margins": 0.8128054738044739, "rewards/rejected": -2.512564182281494, "step": 17250 }, { "epoch": 2.9738111647139904, "grad_norm": 41.23759078979492, "learning_rate": 4.642857172045822e-11, "logits/chosen": -2.098942279815674, "logits/rejected": -2.0471463203430176, "logps/chosen": -218.2930145263672, "logps/rejected": -304.4262390136719, "loss": 0.4787, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6233618259429932, "rewards/margins": 0.9090239405632019, "rewards/rejected": -2.53238582611084, "step": 17260 }, { "epoch": 2.9755341144038594, "grad_norm": 42.076927185058594, "learning_rate": 4.052090285138199e-11, "logits/chosen": -2.0997719764709473, "logits/rejected": -2.057018756866455, "logps/chosen": -238.2406463623047, "logps/rejected": -301.4361877441406, "loss": 0.5706, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8217058181762695, "rewards/margins": 0.6541695594787598, "rewards/rejected": -2.4758753776550293, "step": 17270 }, { "epoch": 2.9772570640937284, "grad_norm": 38.43862533569336, "learning_rate": 3.501501146304653e-11, "logits/chosen": -2.0579917430877686, "logits/rejected": -2.0086448192596436, "logps/chosen": -221.3511505126953, "logps/rejected": -305.3607482910156, "loss": 0.4742, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6612695455551147, "rewards/margins": 0.8794782757759094, "rewards/rejected": -2.540748119354248, "step": 17280 }, { "epoch": 2.9789800137835973, "grad_norm": 39.56004333496094, "learning_rate": 2.991091968582715e-11, "logits/chosen": -2.1151480674743652, "logits/rejected": -2.075232982635498, "logps/chosen": -237.7044219970703, "logps/rejected": -302.80474853515625, "loss": 0.5218, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7731300592422485, "rewards/margins": 0.6949617266654968, "rewards/rejected": -2.4680919647216797, "step": 17290 }, { "epoch": 2.9807029634734663, "grad_norm": 59.17507553100586, "learning_rate": 2.5208648035146553e-11, "logits/chosen": -2.1309123039245605, "logits/rejected": -2.1003451347351074, "logps/chosen": -228.07443237304688, "logps/rejected": -289.4764404296875, "loss": 0.5408, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7138761281967163, "rewards/margins": 0.6651451587677002, "rewards/rejected": -2.379021167755127, "step": 17300 }, { "epoch": 2.9824259131633357, "grad_norm": 59.69649124145508, "learning_rate": 2.0908215411330477e-11, "logits/chosen": -2.1257481575012207, "logits/rejected": -2.080110549926758, "logps/chosen": -232.7665557861328, "logps/rejected": -313.95428466796875, "loss": 0.5014, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.771287202835083, "rewards/margins": 0.8487504720687866, "rewards/rejected": -2.62003755569458, "step": 17310 }, { "epoch": 2.9841488628532047, "grad_norm": 39.24268341064453, "learning_rate": 1.7009639099541118e-11, "logits/chosen": -2.1227715015411377, "logits/rejected": -2.0875697135925293, "logps/chosen": -230.41921997070312, "logps/rejected": -290.3526916503906, "loss": 0.5541, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7490037679672241, "rewards/margins": 0.6406279802322388, "rewards/rejected": -2.389631986618042, "step": 17320 }, { "epoch": 2.9858718125430737, "grad_norm": 56.3477668762207, "learning_rate": 1.35129347697438e-11, "logits/chosen": -2.06532621383667, "logits/rejected": -2.0280654430389404, "logps/chosen": -231.87942504882812, "logps/rejected": -289.4935302734375, "loss": 0.5795, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7513792514801025, "rewards/margins": 0.6341745257377625, "rewards/rejected": -2.3855533599853516, "step": 17330 }, { "epoch": 2.987594762232943, "grad_norm": 27.002744674682617, "learning_rate": 1.0418116476584859e-11, "logits/chosen": -2.1330957412719727, "logits/rejected": -2.0874831676483154, "logps/chosen": -220.322265625, "logps/rejected": -295.8821105957031, "loss": 0.5003, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.665138840675354, "rewards/margins": 0.7804524302482605, "rewards/rejected": -2.445591449737549, "step": 17340 }, { "epoch": 2.989317711922812, "grad_norm": 39.57339096069336, "learning_rate": 7.725196659413847e-12, "logits/chosen": -2.1366100311279297, "logits/rejected": -2.088812828063965, "logps/chosen": -214.4608612060547, "logps/rejected": -283.12646484375, "loss": 0.5176, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6012340784072876, "rewards/margins": 0.71893709897995, "rewards/rejected": -2.3201708793640137, "step": 17350 }, { "epoch": 2.991040661612681, "grad_norm": 59.013671875, "learning_rate": 5.4341861421391965e-12, "logits/chosen": -2.170902729034424, "logits/rejected": -2.1363930702209473, "logps/chosen": -226.77188110351562, "logps/rejected": -299.11785888671875, "loss": 0.5488, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7157566547393799, "rewards/margins": 0.7337027192115784, "rewards/rejected": -2.4494593143463135, "step": 17360 }, { "epoch": 2.99276361130255, "grad_norm": 42.70999526977539, "learning_rate": 3.5450941332726415e-12, "logits/chosen": -2.0977907180786133, "logits/rejected": -2.068878650665283, "logps/chosen": -229.20639038085938, "logps/rejected": -288.0982666015625, "loss": 0.569, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.760547399520874, "rewards/margins": 0.5857253074645996, "rewards/rejected": -2.3462727069854736, "step": 17370 }, { "epoch": 2.994486560992419, "grad_norm": 69.87225341796875, "learning_rate": 2.0579282258292862e-12, "logits/chosen": -2.0864222049713135, "logits/rejected": -2.054324150085449, "logps/chosen": -231.4883270263672, "logps/rejected": -297.22149658203125, "loss": 0.537, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7789758443832397, "rewards/margins": 0.6550412178039551, "rewards/rejected": -2.4340169429779053, "step": 17380 }, { "epoch": 2.996209510682288, "grad_norm": 42.90975570678711, "learning_rate": 9.726943973387137e-13, "logits/chosen": -2.1187965869903564, "logits/rejected": -2.0839667320251465, "logps/chosen": -229.0812530517578, "logps/rejected": -299.0865173339844, "loss": 0.5169, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7364839315414429, "rewards/margins": 0.7252603769302368, "rewards/rejected": -2.4617440700531006, "step": 17390 }, { "epoch": 2.997932460372157, "grad_norm": 30.220075607299805, "learning_rate": 2.8939700977836934e-13, "logits/chosen": -2.106614589691162, "logits/rejected": -2.068258285522461, "logps/chosen": -235.975341796875, "logps/rejected": -305.28216552734375, "loss": 0.548, "rewards/accuracies": 0.75, "rewards/chosen": -1.796933889389038, "rewards/margins": 0.7672160267829895, "rewards/rejected": -2.564150094985962, "step": 17400 }, { "epoch": 2.9996554100620263, "grad_norm": 72.88008117675781, "learning_rate": 8.038809595767305e-15, "logits/chosen": -2.0713143348693848, "logits/rejected": -2.0295281410217285, "logps/chosen": -219.08755493164062, "logps/rejected": -300.3836364746094, "loss": 0.4683, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6422719955444336, "rewards/margins": 0.818253219127655, "rewards/rejected": -2.4605250358581543, "step": 17410 }, { "epoch": 3.0, "step": 17412, "total_flos": 0.0, "train_loss": 0.5850592939272546, "train_runtime": 88547.231, "train_samples_per_second": 3.146, "train_steps_per_second": 0.197 } ], "logging_steps": 10, "max_steps": 17412, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }