diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,26860 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 400, + "global_step": 17412, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017229496898690558, + "grad_norm": 2.0902597904205322, + "learning_rate": 1.148105625717566e-10, + "logits/chosen": -2.8080272674560547, + "logits/rejected": -2.785019874572754, + "logps/chosen": -44.8405876159668, + "logps/rejected": -39.36625671386719, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0017229496898690559, + "grad_norm": 2.0985445976257324, + "learning_rate": 1.148105625717566e-09, + "logits/chosen": -2.9043519496917725, + "logits/rejected": -2.881565570831299, + "logps/chosen": -51.813934326171875, + "logps/rejected": -49.24929428100586, + "loss": 0.6931, + "rewards/accuracies": 0.4652777910232544, + "rewards/chosen": -5.939431503065862e-05, + "rewards/margins": 6.864402530482039e-05, + "rewards/rejected": -0.00012803831486962736, + "step": 10 + }, + { + "epoch": 0.0034458993797381117, + "grad_norm": 2.1032633781433105, + "learning_rate": 2.296211251435132e-09, + "logits/chosen": -2.9463086128234863, + "logits/rejected": -2.941572427749634, + "logps/chosen": -53.832359313964844, + "logps/rejected": -52.88805389404297, + "loss": 0.6931, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.0001046212637447752, + "rewards/margins": 2.1037729311501607e-05, + "rewards/rejected": -0.000125658989418298, + "step": 20 + }, + { + "epoch": 0.005168849069607168, + "grad_norm": 2.233656406402588, + "learning_rate": 3.4443168771526976e-09, + "logits/chosen": -2.9105770587921143, + "logits/rejected": -2.8925628662109375, + "logps/chosen": -57.676544189453125, + "logps/rejected": -57.83379364013672, + "loss": 0.6931, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 2.8576847398653626e-05, + "rewards/margins": 5.704880550183589e-06, + "rewards/rejected": 2.2871969122206792e-05, + "step": 30 + }, + { + "epoch": 0.006891798759476223, + "grad_norm": 1.8444969654083252, + "learning_rate": 4.592422502870264e-09, + "logits/chosen": -2.926358938217163, + "logits/rejected": -2.9026577472686768, + "logps/chosen": -56.067359924316406, + "logps/rejected": -50.16572189331055, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 4.615211582859047e-05, + "rewards/margins": 0.00012403741129674017, + "rewards/rejected": -7.788527000229806e-05, + "step": 40 + }, + { + "epoch": 0.00861474844934528, + "grad_norm": 1.9836063385009766, + "learning_rate": 5.74052812858783e-09, + "logits/chosen": -2.9309310913085938, + "logits/rejected": -2.9199869632720947, + "logps/chosen": -53.15720748901367, + "logps/rejected": -50.464534759521484, + "loss": 0.6931, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": 0.00011061689292546362, + "rewards/margins": 5.0750844820868224e-05, + "rewards/rejected": 5.986605538055301e-05, + "step": 50 + }, + { + "epoch": 0.010337698139214336, + "grad_norm": 2.349973678588867, + "learning_rate": 6.888633754305395e-09, + "logits/chosen": -2.9500718116760254, + "logits/rejected": -2.9269402027130127, + "logps/chosen": -58.4092903137207, + "logps/rejected": -53.900306701660156, + "loss": 0.6932, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0001145638307207264, + "rewards/margins": -0.00019984866958111525, + "rewards/rejected": 8.528483886038885e-05, + "step": 60 + }, + { + "epoch": 0.012060647829083391, + "grad_norm": 2.0366384983062744, + "learning_rate": 8.036739380022962e-09, + "logits/chosen": -2.906822681427002, + "logits/rejected": -2.894425868988037, + "logps/chosen": -54.81329345703125, + "logps/rejected": -52.41325759887695, + "loss": 0.693, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.00014264558558352292, + "rewards/margins": 0.00025373551761731505, + "rewards/rejected": -0.00011108988110208884, + "step": 70 + }, + { + "epoch": 0.013783597518952447, + "grad_norm": 2.247792959213257, + "learning_rate": 9.184845005740529e-09, + "logits/chosen": -2.9640133380889893, + "logits/rejected": -2.9426205158233643, + "logps/chosen": -60.194786071777344, + "logps/rejected": -53.240272521972656, + "loss": 0.6932, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": 3.302104232716374e-05, + "rewards/margins": -5.603021782008e-05, + "rewards/rejected": 8.905124559532851e-05, + "step": 80 + }, + { + "epoch": 0.015506547208821502, + "grad_norm": 2.1423492431640625, + "learning_rate": 1.0332950631458094e-08, + "logits/chosen": -2.868622303009033, + "logits/rejected": -2.861255407333374, + "logps/chosen": -54.9310417175293, + "logps/rejected": -51.796836853027344, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.1550308954610955e-05, + "rewards/margins": -1.8355758584220894e-05, + "rewards/rejected": 2.9906092095188797e-05, + "step": 90 + }, + { + "epoch": 0.01722949689869056, + "grad_norm": 2.204350471496582, + "learning_rate": 1.148105625717566e-08, + "logits/chosen": -2.968977451324463, + "logits/rejected": -2.9209835529327393, + "logps/chosen": -57.40143966674805, + "logps/rejected": -48.77809524536133, + "loss": 0.693, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": 8.09053672128357e-05, + "rewards/margins": 0.00019563671958167106, + "rewards/rejected": -0.00011473130143713206, + "step": 100 + }, + { + "epoch": 0.018952446588559616, + "grad_norm": 2.2462244033813477, + "learning_rate": 1.2629161882893224e-08, + "logits/chosen": -2.9463882446289062, + "logits/rejected": -2.927314281463623, + "logps/chosen": -56.65543746948242, + "logps/rejected": -51.98151779174805, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.00016289362974930555, + "rewards/margins": 0.0002862837864086032, + "rewards/rejected": -0.00012339015665929765, + "step": 110 + }, + { + "epoch": 0.02067539627842867, + "grad_norm": 2.3142030239105225, + "learning_rate": 1.377726750861079e-08, + "logits/chosen": -2.885002613067627, + "logits/rejected": -2.8734192848205566, + "logps/chosen": -53.67094802856445, + "logps/rejected": -54.91790771484375, + "loss": 0.693, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0002903799177147448, + "rewards/margins": 0.0002786249096971005, + "rewards/rejected": 1.1754990737244952e-05, + "step": 120 + }, + { + "epoch": 0.022398345968297727, + "grad_norm": 1.963403582572937, + "learning_rate": 1.4925373134328357e-08, + "logits/chosen": -2.932080030441284, + "logits/rejected": -2.92702054977417, + "logps/chosen": -56.63043212890625, + "logps/rejected": -53.09601974487305, + "loss": 0.6931, + "rewards/accuracies": 0.53125, + "rewards/chosen": -6.796454545110464e-05, + "rewards/margins": 2.1783866941404995e-06, + "rewards/rejected": -7.014292350504547e-05, + "step": 130 + }, + { + "epoch": 0.024121295658166782, + "grad_norm": 2.389951467514038, + "learning_rate": 1.6073478760045924e-08, + "logits/chosen": -2.9416565895080566, + "logits/rejected": -2.931546449661255, + "logps/chosen": -54.46699905395508, + "logps/rejected": -52.611183166503906, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 7.823264604667202e-05, + "rewards/margins": 0.00021567563817370683, + "rewards/rejected": -0.00013744299940299243, + "step": 140 + }, + { + "epoch": 0.025844245348035838, + "grad_norm": 2.066187620162964, + "learning_rate": 1.722158438576349e-08, + "logits/chosen": -2.8867287635803223, + "logits/rejected": -2.8737969398498535, + "logps/chosen": -53.074363708496094, + "logps/rejected": -51.20355224609375, + "loss": 0.6932, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.00011352014553267509, + "rewards/margins": -0.0001107621137634851, + "rewards/rejected": -2.7580172172747552e-06, + "step": 150 + }, + { + "epoch": 0.027567195037904894, + "grad_norm": 1.8884066343307495, + "learning_rate": 1.8369690011481057e-08, + "logits/chosen": -2.9306740760803223, + "logits/rejected": -2.916821002960205, + "logps/chosen": -54.592872619628906, + "logps/rejected": -54.249855041503906, + "loss": 0.6932, + "rewards/accuracies": 0.46875, + "rewards/chosen": 3.338105670991354e-05, + "rewards/margins": -9.989602403948084e-05, + "rewards/rejected": 0.00013327706255950034, + "step": 160 + }, + { + "epoch": 0.02929014472777395, + "grad_norm": 2.0824570655822754, + "learning_rate": 1.9517795637198624e-08, + "logits/chosen": -2.915168285369873, + "logits/rejected": -2.900991439819336, + "logps/chosen": -56.46404266357422, + "logps/rejected": -50.75846481323242, + "loss": 0.6932, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -9.851455979514867e-05, + "rewards/margins": -1.5921646991046146e-05, + "rewards/rejected": -8.259294554591179e-05, + "step": 170 + }, + { + "epoch": 0.031013094417643005, + "grad_norm": 2.266218423843384, + "learning_rate": 2.0665901262916187e-08, + "logits/chosen": -2.914151906967163, + "logits/rejected": -2.898178815841675, + "logps/chosen": -57.04075241088867, + "logps/rejected": -52.43286895751953, + "loss": 0.6931, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -6.253592346183723e-06, + "rewards/margins": 0.00015795855142641813, + "rewards/rejected": -0.00016421216423623264, + "step": 180 + }, + { + "epoch": 0.03273604410751206, + "grad_norm": 2.5385985374450684, + "learning_rate": 2.1814006888633754e-08, + "logits/chosen": -2.9488868713378906, + "logits/rejected": -2.9160306453704834, + "logps/chosen": -59.6348876953125, + "logps/rejected": -51.66065216064453, + "loss": 0.6931, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.00011362945951987058, + "rewards/margins": 2.3863278784119757e-06, + "rewards/rejected": -0.00011601579899434, + "step": 190 + }, + { + "epoch": 0.03445899379738112, + "grad_norm": 2.239220380783081, + "learning_rate": 2.296211251435132e-08, + "logits/chosen": -2.9068408012390137, + "logits/rejected": -2.897510051727295, + "logps/chosen": -54.77439498901367, + "logps/rejected": -53.70017623901367, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00012153792340541258, + "rewards/margins": 2.215528002125211e-05, + "rewards/rejected": -0.00014369319251272827, + "step": 200 + }, + { + "epoch": 0.03618194348725017, + "grad_norm": 2.1656925678253174, + "learning_rate": 2.4110218140068887e-08, + "logits/chosen": -2.864589214324951, + "logits/rejected": -2.8619420528411865, + "logps/chosen": -54.096771240234375, + "logps/rejected": -56.413795471191406, + "loss": 0.6932, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -7.368248770944774e-05, + "rewards/margins": -8.52059674798511e-05, + "rewards/rejected": 1.1523479770403355e-05, + "step": 210 + }, + { + "epoch": 0.03790489317711923, + "grad_norm": 2.0480666160583496, + "learning_rate": 2.5258323765786448e-08, + "logits/chosen": -2.904480457305908, + "logits/rejected": -2.8816940784454346, + "logps/chosen": -53.449119567871094, + "logps/rejected": -50.03138732910156, + "loss": 0.6932, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.0002139102143701166, + "rewards/margins": -3.339463728480041e-05, + "rewards/rejected": -0.00018051560618914664, + "step": 220 + }, + { + "epoch": 0.03962784286698828, + "grad_norm": 2.2359554767608643, + "learning_rate": 2.6406429391504014e-08, + "logits/chosen": -2.9000978469848633, + "logits/rejected": -2.8889379501342773, + "logps/chosen": -49.822628021240234, + "logps/rejected": -49.380130767822266, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.0002377505588810891, + "rewards/margins": 8.301097113871947e-05, + "rewards/rejected": -0.0003207615518476814, + "step": 230 + }, + { + "epoch": 0.04135079255685734, + "grad_norm": 1.985116958618164, + "learning_rate": 2.755453501722158e-08, + "logits/chosen": -2.8757596015930176, + "logits/rejected": -2.8462939262390137, + "logps/chosen": -56.7255859375, + "logps/rejected": -51.6666145324707, + "loss": 0.6931, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -9.991443948820233e-05, + "rewards/margins": 0.00014959466352593154, + "rewards/rejected": -0.00024950908846221864, + "step": 240 + }, + { + "epoch": 0.043073742246726394, + "grad_norm": 2.0716514587402344, + "learning_rate": 2.8702640642939148e-08, + "logits/chosen": -2.948909044265747, + "logits/rejected": -2.931750535964966, + "logps/chosen": -53.45426559448242, + "logps/rejected": -50.105186462402344, + "loss": 0.6931, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.00011698435992002487, + "rewards/margins": 6.186254177009687e-05, + "rewards/rejected": -0.00017884690896607935, + "step": 250 + }, + { + "epoch": 0.044796691936595454, + "grad_norm": 1.9955936670303345, + "learning_rate": 2.9850746268656714e-08, + "logits/chosen": -2.9301047325134277, + "logits/rejected": -2.922619342803955, + "logps/chosen": -55.73781204223633, + "logps/rejected": -55.22661209106445, + "loss": 0.6929, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.683334944071248e-05, + "rewards/margins": 0.0005753434961661696, + "rewards/rejected": -0.0006121768383309245, + "step": 260 + }, + { + "epoch": 0.046519641626464506, + "grad_norm": 2.1792263984680176, + "learning_rate": 3.099885189437428e-08, + "logits/chosen": -2.8962621688842773, + "logits/rejected": -2.888561487197876, + "logps/chosen": -53.812705993652344, + "logps/rejected": -53.4767951965332, + "loss": 0.693, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.00036246198578737676, + "rewards/margins": 0.00020978061365894973, + "rewards/rejected": -0.0005722425994463265, + "step": 270 + }, + { + "epoch": 0.048242591316333565, + "grad_norm": 2.0500059127807617, + "learning_rate": 3.214695752009185e-08, + "logits/chosen": -2.9554762840270996, + "logits/rejected": -2.9338696002960205, + "logps/chosen": -58.78718948364258, + "logps/rejected": -52.61870193481445, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.00021337093494366854, + "rewards/margins": 0.0003163707733619958, + "rewards/rejected": -0.00052974175196141, + "step": 280 + }, + { + "epoch": 0.04996554100620262, + "grad_norm": 1.9906699657440186, + "learning_rate": 3.3295063145809414e-08, + "logits/chosen": -2.9041569232940674, + "logits/rejected": -2.895073175430298, + "logps/chosen": -56.95964431762695, + "logps/rejected": -53.340545654296875, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.00025733906659297645, + "rewards/margins": 0.00026806717505678535, + "rewards/rejected": -0.0005254062125459313, + "step": 290 + }, + { + "epoch": 0.051688490696071676, + "grad_norm": 2.0617942810058594, + "learning_rate": 3.444316877152698e-08, + "logits/chosen": -2.8528239727020264, + "logits/rejected": -2.8550000190734863, + "logps/chosen": -55.023704528808594, + "logps/rejected": -53.3245735168457, + "loss": 0.693, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.00032503431430086493, + "rewards/margins": 0.00029767598607577384, + "rewards/rejected": -0.0006227103294804692, + "step": 300 + }, + { + "epoch": 0.05341144038594073, + "grad_norm": 2.061204433441162, + "learning_rate": 3.559127439724455e-08, + "logits/chosen": -2.901181221008301, + "logits/rejected": -2.901825428009033, + "logps/chosen": -54.8587532043457, + "logps/rejected": -52.582496643066406, + "loss": 0.6929, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.0002555266546551138, + "rewards/margins": 0.0005454671336337924, + "rewards/rejected": -0.0008009938756003976, + "step": 310 + }, + { + "epoch": 0.05513439007580979, + "grad_norm": 2.3455982208251953, + "learning_rate": 3.6739380022962115e-08, + "logits/chosen": -2.887146472930908, + "logits/rejected": -2.8691606521606445, + "logps/chosen": -56.60396194458008, + "logps/rejected": -48.95844650268555, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0003960584872402251, + "rewards/margins": 0.0003540136094670743, + "rewards/rejected": -0.0007500721258111298, + "step": 320 + }, + { + "epoch": 0.05685733976567884, + "grad_norm": 2.033954381942749, + "learning_rate": 3.788748564867968e-08, + "logits/chosen": -2.9154181480407715, + "logits/rejected": -2.8979363441467285, + "logps/chosen": -56.212135314941406, + "logps/rejected": -51.129173278808594, + "loss": 0.6927, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.00016808233340270817, + "rewards/margins": 0.0009294062620028853, + "rewards/rejected": -0.001097488566301763, + "step": 330 + }, + { + "epoch": 0.0585802894555479, + "grad_norm": 2.0031909942626953, + "learning_rate": 3.903559127439725e-08, + "logits/chosen": -2.8824925422668457, + "logits/rejected": -2.869655132293701, + "logps/chosen": -52.89168167114258, + "logps/rejected": -51.91801834106445, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0004165670252405107, + "rewards/margins": 0.0003390835190657526, + "rewards/rejected": -0.0007556505734100938, + "step": 340 + }, + { + "epoch": 0.06030323914541695, + "grad_norm": 2.2410380840301514, + "learning_rate": 4.018369690011481e-08, + "logits/chosen": -2.855290412902832, + "logits/rejected": -2.822758436203003, + "logps/chosen": -57.14137649536133, + "logps/rejected": -53.933021545410156, + "loss": 0.6928, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0003419906715862453, + "rewards/margins": 0.0006515433778986335, + "rewards/rejected": -0.0009935342241078615, + "step": 350 + }, + { + "epoch": 0.06202618883528601, + "grad_norm": 2.1879637241363525, + "learning_rate": 4.1331802525832375e-08, + "logits/chosen": -2.9514498710632324, + "logits/rejected": -2.933030605316162, + "logps/chosen": -56.0881233215332, + "logps/rejected": -49.44280242919922, + "loss": 0.6929, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0008741362253203988, + "rewards/margins": 0.00042976863915100694, + "rewards/rejected": -0.0013039048062637448, + "step": 360 + }, + { + "epoch": 0.06374913852515507, + "grad_norm": 2.0574898719787598, + "learning_rate": 4.247990815154994e-08, + "logits/chosen": -2.9376654624938965, + "logits/rejected": -2.9129061698913574, + "logps/chosen": -54.38572311401367, + "logps/rejected": -51.12152099609375, + "loss": 0.6925, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.000479917653137818, + "rewards/margins": 0.0012370418990030885, + "rewards/rejected": -0.001716959523037076, + "step": 370 + }, + { + "epoch": 0.06547208821502412, + "grad_norm": 1.9364941120147705, + "learning_rate": 4.362801377726751e-08, + "logits/chosen": -2.9991958141326904, + "logits/rejected": -2.979586601257324, + "logps/chosen": -55.15120315551758, + "logps/rejected": -51.248802185058594, + "loss": 0.6926, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0007126034470275044, + "rewards/margins": 0.001187127665616572, + "rewards/rejected": -0.0018997311126440763, + "step": 380 + }, + { + "epoch": 0.06719503790489317, + "grad_norm": 2.235966920852661, + "learning_rate": 4.4776119402985075e-08, + "logits/chosen": -2.931478977203369, + "logits/rejected": -2.916945219039917, + "logps/chosen": -57.42863845825195, + "logps/rejected": -54.23737716674805, + "loss": 0.6927, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.0010502212680876255, + "rewards/margins": 0.0009564169449731708, + "rewards/rejected": -0.0020066378638148308, + "step": 390 + }, + { + "epoch": 0.06891798759476224, + "grad_norm": 1.7472114562988281, + "learning_rate": 4.592422502870264e-08, + "logits/chosen": -2.918935537338257, + "logits/rejected": -2.9072728157043457, + "logps/chosen": -54.609474182128906, + "logps/rejected": -51.67049026489258, + "loss": 0.6924, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0010759534779936075, + "rewards/margins": 0.0014685506466776133, + "rewards/rejected": -0.002544503891840577, + "step": 400 + }, + { + "epoch": 0.06891798759476224, + "eval_logits/chosen": -2.972291946411133, + "eval_logits/rejected": -2.968651294708252, + "eval_logps/chosen": -58.909366607666016, + "eval_logps/rejected": -62.675540924072266, + "eval_loss": 0.692988395690918, + "eval_rewards/accuracies": 0.5390334725379944, + "eval_rewards/chosen": 0.0010610398603603244, + "eval_rewards/margins": 0.000320415070746094, + "eval_rewards/rejected": 0.0007406247896142304, + "eval_runtime": 384.3708, + "eval_samples_per_second": 11.198, + "eval_steps_per_second": 1.4, + "step": 400 + }, + { + "epoch": 0.07064093728463129, + "grad_norm": 1.9456653594970703, + "learning_rate": 4.707233065442021e-08, + "logits/chosen": -2.9112446308135986, + "logits/rejected": -2.9126250743865967, + "logps/chosen": -51.31296920776367, + "logps/rejected": -54.56962203979492, + "loss": 0.6929, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0019060971681028605, + "rewards/margins": 0.0004303969908505678, + "rewards/rejected": -0.0023364939261227846, + "step": 410 + }, + { + "epoch": 0.07236388697450034, + "grad_norm": 2.3853089809417725, + "learning_rate": 4.8220436280137775e-08, + "logits/chosen": -2.8980908393859863, + "logits/rejected": -2.8948521614074707, + "logps/chosen": -55.48943328857422, + "logps/rejected": -53.71550369262695, + "loss": 0.6926, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0012784868013113737, + "rewards/margins": 0.0010058528278023005, + "rewards/rejected": -0.0022843393962830305, + "step": 420 + }, + { + "epoch": 0.0740868366643694, + "grad_norm": 2.0890214443206787, + "learning_rate": 4.9368541905855335e-08, + "logits/chosen": -2.936516284942627, + "logits/rejected": -2.9260849952697754, + "logps/chosen": -54.843605041503906, + "logps/rejected": -53.190086364746094, + "loss": 0.6926, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.0008548603509552777, + "rewards/margins": 0.0012039890279993415, + "rewards/rejected": -0.0020588492043316364, + "step": 430 + }, + { + "epoch": 0.07580978635423846, + "grad_norm": 2.3940982818603516, + "learning_rate": 5.0516647531572895e-08, + "logits/chosen": -2.9783670902252197, + "logits/rejected": -2.952179193496704, + "logps/chosen": -55.11848831176758, + "logps/rejected": -53.0138053894043, + "loss": 0.6918, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.001027547288686037, + "rewards/margins": 0.002638460136950016, + "rewards/rejected": -0.0036660078912973404, + "step": 440 + }, + { + "epoch": 0.07753273604410751, + "grad_norm": 2.0819737911224365, + "learning_rate": 5.166475315729046e-08, + "logits/chosen": -2.916598320007324, + "logits/rejected": -2.8953187465667725, + "logps/chosen": -57.37543869018555, + "logps/rejected": -54.627662658691406, + "loss": 0.6923, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0009044323232956231, + "rewards/margins": 0.0017119159456342459, + "rewards/rejected": -0.002616348210722208, + "step": 450 + }, + { + "epoch": 0.07925568573397657, + "grad_norm": 2.0121195316314697, + "learning_rate": 5.281285878300803e-08, + "logits/chosen": -2.8869118690490723, + "logits/rejected": -2.8755950927734375, + "logps/chosen": -57.134727478027344, + "logps/rejected": -52.3559455871582, + "loss": 0.6925, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.001536081894300878, + "rewards/margins": 0.001315793488174677, + "rewards/rejected": -0.0028518750332295895, + "step": 460 + }, + { + "epoch": 0.08097863542384562, + "grad_norm": 2.0747272968292236, + "learning_rate": 5.3960964408725595e-08, + "logits/chosen": -2.899627923965454, + "logits/rejected": -2.875185966491699, + "logps/chosen": -54.55614471435547, + "logps/rejected": -50.614784240722656, + "loss": 0.692, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.0010662561981007457, + "rewards/margins": 0.0022261999547481537, + "rewards/rejected": -0.003292456269264221, + "step": 470 + }, + { + "epoch": 0.08270158511371468, + "grad_norm": 2.3838884830474854, + "learning_rate": 5.510907003444316e-08, + "logits/chosen": -2.9103691577911377, + "logits/rejected": -2.902712345123291, + "logps/chosen": -54.980934143066406, + "logps/rejected": -58.52118682861328, + "loss": 0.6922, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.001894455635920167, + "rewards/margins": 0.0019050573464483023, + "rewards/rejected": -0.003799512516707182, + "step": 480 + }, + { + "epoch": 0.08442453480358374, + "grad_norm": 2.2515811920166016, + "learning_rate": 5.625717566016073e-08, + "logits/chosen": -2.858151912689209, + "logits/rejected": -2.8224198818206787, + "logps/chosen": -61.32958221435547, + "logps/rejected": -50.735389709472656, + "loss": 0.6915, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.002147951629012823, + "rewards/margins": 0.0033252262510359287, + "rewards/rejected": -0.005473177880048752, + "step": 490 + }, + { + "epoch": 0.08614748449345279, + "grad_norm": 1.9670840501785278, + "learning_rate": 5.7405281285878295e-08, + "logits/chosen": -2.8843283653259277, + "logits/rejected": -2.8672218322753906, + "logps/chosen": -56.59968185424805, + "logps/rejected": -51.90899658203125, + "loss": 0.6922, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0031538717448711395, + "rewards/margins": 0.00200037844479084, + "rewards/rejected": -0.005154250655323267, + "step": 500 + }, + { + "epoch": 0.08787043418332184, + "grad_norm": 1.9930695295333862, + "learning_rate": 5.855338691159586e-08, + "logits/chosen": -2.857835292816162, + "logits/rejected": -2.846858501434326, + "logps/chosen": -59.034812927246094, + "logps/rejected": -52.214622497558594, + "loss": 0.6928, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.0035990376491099596, + "rewards/margins": 0.0006619172054342926, + "rewards/rejected": -0.004260954912751913, + "step": 510 + }, + { + "epoch": 0.08959338387319091, + "grad_norm": 1.9633930921554565, + "learning_rate": 5.970149253731343e-08, + "logits/chosen": -2.9089746475219727, + "logits/rejected": -2.8926620483398438, + "logps/chosen": -57.21832275390625, + "logps/rejected": -51.88628005981445, + "loss": 0.6917, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.004442922305315733, + "rewards/margins": 0.0029019941575825214, + "rewards/rejected": -0.007344916462898254, + "step": 520 + }, + { + "epoch": 0.09131633356305996, + "grad_norm": 1.8883012533187866, + "learning_rate": 6.084959816303099e-08, + "logits/chosen": -2.9070680141448975, + "logits/rejected": -2.8766965866088867, + "logps/chosen": -56.9815673828125, + "logps/rejected": -50.6800651550293, + "loss": 0.6909, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0032037501223385334, + "rewards/margins": 0.004527016542851925, + "rewards/rejected": -0.007730766199529171, + "step": 530 + }, + { + "epoch": 0.09303928325292901, + "grad_norm": 1.9685890674591064, + "learning_rate": 6.199770378874856e-08, + "logits/chosen": -2.898916482925415, + "logits/rejected": -2.8864660263061523, + "logps/chosen": -54.44707489013672, + "logps/rejected": -53.032958984375, + "loss": 0.6917, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.0050659701228141785, + "rewards/margins": 0.002865965710952878, + "rewards/rejected": -0.007931936532258987, + "step": 540 + }, + { + "epoch": 0.09476223294279806, + "grad_norm": 2.0686702728271484, + "learning_rate": 6.314580941446614e-08, + "logits/chosen": -2.9271841049194336, + "logits/rejected": -2.9111077785491943, + "logps/chosen": -55.30144119262695, + "logps/rejected": -51.95891571044922, + "loss": 0.691, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.005565012339502573, + "rewards/margins": 0.004354250617325306, + "rewards/rejected": -0.009919262491166592, + "step": 550 + }, + { + "epoch": 0.09648518263266713, + "grad_norm": 2.248969078063965, + "learning_rate": 6.42939150401837e-08, + "logits/chosen": -2.9135735034942627, + "logits/rejected": -2.903707981109619, + "logps/chosen": -54.01261520385742, + "logps/rejected": -54.91992950439453, + "loss": 0.6914, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.006114738993346691, + "rewards/margins": 0.0035691908560693264, + "rewards/rejected": -0.00968393124639988, + "step": 560 + }, + { + "epoch": 0.09820813232253618, + "grad_norm": 2.045034170150757, + "learning_rate": 6.544202066590127e-08, + "logits/chosen": -2.886688709259033, + "logits/rejected": -2.8828177452087402, + "logps/chosen": -53.0010986328125, + "logps/rejected": -54.485321044921875, + "loss": 0.6916, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0064199878834187984, + "rewards/margins": 0.0031796726398169994, + "rewards/rejected": -0.009599661454558372, + "step": 570 + }, + { + "epoch": 0.09993108201240523, + "grad_norm": 1.7068687677383423, + "learning_rate": 6.659012629161883e-08, + "logits/chosen": -2.9013490676879883, + "logits/rejected": -2.895749568939209, + "logps/chosen": -52.609703063964844, + "logps/rejected": -52.71235275268555, + "loss": 0.6919, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0066453502513468266, + "rewards/margins": 0.0026767298113554716, + "rewards/rejected": -0.009322079829871655, + "step": 580 + }, + { + "epoch": 0.1016540317022743, + "grad_norm": 1.9736390113830566, + "learning_rate": 6.77382319173364e-08, + "logits/chosen": -2.897890567779541, + "logits/rejected": -2.8842339515686035, + "logps/chosen": -55.642784118652344, + "logps/rejected": -55.381492614746094, + "loss": 0.692, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.006683549843728542, + "rewards/margins": 0.002430492080748081, + "rewards/rejected": -0.009114041924476624, + "step": 590 + }, + { + "epoch": 0.10337698139214335, + "grad_norm": 2.4153261184692383, + "learning_rate": 6.888633754305396e-08, + "logits/chosen": -2.898934841156006, + "logits/rejected": -2.879272937774658, + "logps/chosen": -56.03900146484375, + "logps/rejected": -56.54254150390625, + "loss": 0.689, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004332136828452349, + "rewards/margins": 0.008359143510460854, + "rewards/rejected": -0.012691279873251915, + "step": 600 + }, + { + "epoch": 0.1050999310820124, + "grad_norm": 2.184461832046509, + "learning_rate": 7.003444316877152e-08, + "logits/chosen": -2.8567090034484863, + "logits/rejected": -2.855968952178955, + "logps/chosen": -55.145233154296875, + "logps/rejected": -53.976341247558594, + "loss": 0.6927, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.008213085122406483, + "rewards/margins": 0.0010757189011201262, + "rewards/rejected": -0.009288804605603218, + "step": 610 + }, + { + "epoch": 0.10682288077188146, + "grad_norm": 2.221895694732666, + "learning_rate": 7.11825487944891e-08, + "logits/chosen": -2.945406675338745, + "logits/rejected": -2.923915386199951, + "logps/chosen": -57.062896728515625, + "logps/rejected": -53.8463020324707, + "loss": 0.6899, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.007125864736735821, + "rewards/margins": 0.006631826050579548, + "rewards/rejected": -0.013757690787315369, + "step": 620 + }, + { + "epoch": 0.10854583046175052, + "grad_norm": 2.301266670227051, + "learning_rate": 7.233065442020666e-08, + "logits/chosen": -2.939603328704834, + "logits/rejected": -2.91625714302063, + "logps/chosen": -56.117332458496094, + "logps/rejected": -51.174259185791016, + "loss": 0.6909, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.009119195863604546, + "rewards/margins": 0.004569724667817354, + "rewards/rejected": -0.013688920065760612, + "step": 630 + }, + { + "epoch": 0.11026878015161957, + "grad_norm": 2.2985427379608154, + "learning_rate": 7.347876004592423e-08, + "logits/chosen": -2.9217209815979004, + "logits/rejected": -2.9191370010375977, + "logps/chosen": -54.47275924682617, + "logps/rejected": -54.68788528442383, + "loss": 0.6922, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.010385403409600258, + "rewards/margins": 0.002089323475956917, + "rewards/rejected": -0.012474726885557175, + "step": 640 + }, + { + "epoch": 0.11199172984148863, + "grad_norm": 2.41143536567688, + "learning_rate": 7.462686567164179e-08, + "logits/chosen": -2.9253361225128174, + "logits/rejected": -2.928819179534912, + "logps/chosen": -54.176780700683594, + "logps/rejected": -55.32817459106445, + "loss": 0.6932, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.012147116474807262, + "rewards/margins": 0.00017380015924572945, + "rewards/rejected": -0.01232091709971428, + "step": 650 + }, + { + "epoch": 0.11371467953135768, + "grad_norm": 2.1162109375, + "learning_rate": 7.577497129735936e-08, + "logits/chosen": -2.869983196258545, + "logits/rejected": -2.8705716133117676, + "logps/chosen": -56.631378173828125, + "logps/rejected": -53.262786865234375, + "loss": 0.6915, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.009453224018216133, + "rewards/margins": 0.003567267907783389, + "rewards/rejected": -0.013020491227507591, + "step": 660 + }, + { + "epoch": 0.11543762922122675, + "grad_norm": 2.0193326473236084, + "learning_rate": 7.692307692307692e-08, + "logits/chosen": -2.8858590126037598, + "logits/rejected": -2.881812572479248, + "logps/chosen": -55.13932418823242, + "logps/rejected": -58.1707649230957, + "loss": 0.6916, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.008889088407158852, + "rewards/margins": 0.0032468761783093214, + "rewards/rejected": -0.012135963886976242, + "step": 670 + }, + { + "epoch": 0.1171605789110958, + "grad_norm": 2.262868881225586, + "learning_rate": 7.80711825487945e-08, + "logits/chosen": -2.8544554710388184, + "logits/rejected": -2.831420660018921, + "logps/chosen": -56.3204231262207, + "logps/rejected": -52.126670837402344, + "loss": 0.6905, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.012224559672176838, + "rewards/margins": 0.005443849600851536, + "rewards/rejected": -0.017668409273028374, + "step": 680 + }, + { + "epoch": 0.11888352860096485, + "grad_norm": 2.2813498973846436, + "learning_rate": 7.921928817451206e-08, + "logits/chosen": -2.943873643875122, + "logits/rejected": -2.9195990562438965, + "logps/chosen": -61.373748779296875, + "logps/rejected": -51.66704559326172, + "loss": 0.6906, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.008815468288958073, + "rewards/margins": 0.005301609635353088, + "rewards/rejected": -0.014117076992988586, + "step": 690 + }, + { + "epoch": 0.1206064782908339, + "grad_norm": 2.2850728034973145, + "learning_rate": 8.036739380022962e-08, + "logits/chosen": -2.9098591804504395, + "logits/rejected": -2.886373519897461, + "logps/chosen": -57.46692657470703, + "logps/rejected": -53.712921142578125, + "loss": 0.6898, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.009994572028517723, + "rewards/margins": 0.00689247390255332, + "rewards/rejected": -0.01688704639673233, + "step": 700 + }, + { + "epoch": 0.12232942798070297, + "grad_norm": 2.093459129333496, + "learning_rate": 8.151549942594719e-08, + "logits/chosen": -2.9022059440612793, + "logits/rejected": -2.885436534881592, + "logps/chosen": -56.37190628051758, + "logps/rejected": -55.75042724609375, + "loss": 0.6896, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.011153196915984154, + "rewards/margins": 0.007232338190078735, + "rewards/rejected": -0.01838553510606289, + "step": 710 + }, + { + "epoch": 0.12405237767057202, + "grad_norm": 2.2593727111816406, + "learning_rate": 8.266360505166475e-08, + "logits/chosen": -2.8753161430358887, + "logits/rejected": -2.871997833251953, + "logps/chosen": -55.58568572998047, + "logps/rejected": -55.225379943847656, + "loss": 0.6912, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.012562957592308521, + "rewards/margins": 0.004113520495593548, + "rewards/rejected": -0.01667647436261177, + "step": 720 + }, + { + "epoch": 0.12577532736044109, + "grad_norm": 2.259854316711426, + "learning_rate": 8.381171067738232e-08, + "logits/chosen": -2.9437055587768555, + "logits/rejected": -2.9245526790618896, + "logps/chosen": -58.900779724121094, + "logps/rejected": -54.2612190246582, + "loss": 0.689, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.014078138396143913, + "rewards/margins": 0.008628633804619312, + "rewards/rejected": -0.0227067731320858, + "step": 730 + }, + { + "epoch": 0.12749827705031014, + "grad_norm": 2.150418281555176, + "learning_rate": 8.495981630309988e-08, + "logits/chosen": -2.8817498683929443, + "logits/rejected": -2.8615224361419678, + "logps/chosen": -56.9492073059082, + "logps/rejected": -55.37824249267578, + "loss": 0.6894, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.012495382688939571, + "rewards/margins": 0.007908121682703495, + "rewards/rejected": -0.020403504371643066, + "step": 740 + }, + { + "epoch": 0.1292212267401792, + "grad_norm": 2.1469054222106934, + "learning_rate": 8.610792192881746e-08, + "logits/chosen": -2.9918925762176514, + "logits/rejected": -2.9728503227233887, + "logps/chosen": -58.38031005859375, + "logps/rejected": -55.4575080871582, + "loss": 0.6866, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.01179544534534216, + "rewards/margins": 0.013385293073952198, + "rewards/rejected": -0.025180738419294357, + "step": 750 + }, + { + "epoch": 0.13094417643004824, + "grad_norm": 2.4583590030670166, + "learning_rate": 8.725602755453502e-08, + "logits/chosen": -2.9008548259735107, + "logits/rejected": -2.8728179931640625, + "logps/chosen": -57.165260314941406, + "logps/rejected": -51.254608154296875, + "loss": 0.6873, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.016394078731536865, + "rewards/margins": 0.012260092422366142, + "rewards/rejected": -0.028654176741838455, + "step": 760 + }, + { + "epoch": 0.1326671261199173, + "grad_norm": 2.0270307064056396, + "learning_rate": 8.840413318025258e-08, + "logits/chosen": -2.9242424964904785, + "logits/rejected": -2.9103903770446777, + "logps/chosen": -56.168739318847656, + "logps/rejected": -54.60688018798828, + "loss": 0.6881, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.018161626532673836, + "rewards/margins": 0.010471840389072895, + "rewards/rejected": -0.028633465990424156, + "step": 770 + }, + { + "epoch": 0.13439007580978635, + "grad_norm": 2.446073055267334, + "learning_rate": 8.955223880597015e-08, + "logits/chosen": -2.922211170196533, + "logits/rejected": -2.8983256816864014, + "logps/chosen": -56.91460418701172, + "logps/rejected": -53.85261154174805, + "loss": 0.6874, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.023327644914388657, + "rewards/margins": 0.011989260092377663, + "rewards/rejected": -0.03531690686941147, + "step": 780 + }, + { + "epoch": 0.1361130254996554, + "grad_norm": 2.0871706008911133, + "learning_rate": 9.070034443168771e-08, + "logits/chosen": -2.9211132526397705, + "logits/rejected": -2.895343065261841, + "logps/chosen": -57.1553840637207, + "logps/rejected": -56.386634826660156, + "loss": 0.6869, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0219894852489233, + "rewards/margins": 0.013090623542666435, + "rewards/rejected": -0.03508010879158974, + "step": 790 + }, + { + "epoch": 0.13783597518952448, + "grad_norm": 2.458371639251709, + "learning_rate": 9.184845005740528e-08, + "logits/chosen": -2.858908176422119, + "logits/rejected": -2.839359998703003, + "logps/chosen": -58.53376007080078, + "logps/rejected": -58.27695846557617, + "loss": 0.6891, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.02825082466006279, + "rewards/margins": 0.008799943141639233, + "rewards/rejected": -0.0370507650077343, + "step": 800 + }, + { + "epoch": 0.13783597518952448, + "eval_logits/chosen": -2.9621548652648926, + "eval_logits/rejected": -2.9587860107421875, + "eval_logps/chosen": -59.62390899658203, + "eval_logps/rejected": -63.83048629760742, + "eval_loss": 0.690947413444519, + "eval_rewards/accuracies": 0.5748141407966614, + "eval_rewards/chosen": -0.006084338761866093, + "eval_rewards/margins": 0.004724523052573204, + "eval_rewards/rejected": -0.010808861814439297, + "eval_runtime": 384.4775, + "eval_samples_per_second": 11.194, + "eval_steps_per_second": 1.399, + "step": 800 + }, + { + "epoch": 0.13955892487939353, + "grad_norm": 2.213287591934204, + "learning_rate": 9.299655568312284e-08, + "logits/chosen": -2.8889384269714355, + "logits/rejected": -2.8673510551452637, + "logps/chosen": -59.30315017700195, + "logps/rejected": -57.970436096191406, + "loss": 0.6881, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.023655524477362633, + "rewards/margins": 0.01091140415519476, + "rewards/rejected": -0.034566931426525116, + "step": 810 + }, + { + "epoch": 0.14128187456926258, + "grad_norm": 2.1062045097351074, + "learning_rate": 9.414466130884042e-08, + "logits/chosen": -2.921374797821045, + "logits/rejected": -2.9040017127990723, + "logps/chosen": -55.840232849121094, + "logps/rejected": -54.697853088378906, + "loss": 0.6873, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.030127938836812973, + "rewards/margins": 0.012304485775530338, + "rewards/rejected": -0.04243241995573044, + "step": 820 + }, + { + "epoch": 0.14300482425913164, + "grad_norm": 2.369229555130005, + "learning_rate": 9.529276693455798e-08, + "logits/chosen": -2.9105210304260254, + "logits/rejected": -2.8936829566955566, + "logps/chosen": -59.06435012817383, + "logps/rejected": -58.14280319213867, + "loss": 0.6845, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03192334994673729, + "rewards/margins": 0.01835811696946621, + "rewards/rejected": -0.05028147250413895, + "step": 830 + }, + { + "epoch": 0.1447277739490007, + "grad_norm": 2.176154136657715, + "learning_rate": 9.644087256027555e-08, + "logits/chosen": -2.9422030448913574, + "logits/rejected": -2.92073917388916, + "logps/chosen": -58.568359375, + "logps/rejected": -54.570281982421875, + "loss": 0.6855, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.034468166530132294, + "rewards/margins": 0.016375292092561722, + "rewards/rejected": -0.050843458622694016, + "step": 840 + }, + { + "epoch": 0.14645072363886974, + "grad_norm": 2.1147189140319824, + "learning_rate": 9.758897818599311e-08, + "logits/chosen": -2.8813514709472656, + "logits/rejected": -2.872459888458252, + "logps/chosen": -55.68572998046875, + "logps/rejected": -59.400367736816406, + "loss": 0.6894, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.04169295355677605, + "rewards/margins": 0.008527868427336216, + "rewards/rejected": -0.05022081732749939, + "step": 850 + }, + { + "epoch": 0.1481736733287388, + "grad_norm": 2.189018726348877, + "learning_rate": 9.873708381171067e-08, + "logits/chosen": -2.906050205230713, + "logits/rejected": -2.891706705093384, + "logps/chosen": -58.0321044921875, + "logps/rejected": -56.58018112182617, + "loss": 0.686, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.04129540175199509, + "rewards/margins": 0.015181129798293114, + "rewards/rejected": -0.05647652596235275, + "step": 860 + }, + { + "epoch": 0.14989662301860784, + "grad_norm": 2.0911967754364014, + "learning_rate": 9.988518943742824e-08, + "logits/chosen": -2.9313414096832275, + "logits/rejected": -2.9336588382720947, + "logps/chosen": -56.133277893066406, + "logps/rejected": -58.191932678222656, + "loss": 0.6877, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.04252300783991814, + "rewards/margins": 0.01182483322918415, + "rewards/rejected": -0.054347842931747437, + "step": 870 + }, + { + "epoch": 0.15161957270847692, + "grad_norm": 2.3610286712646484, + "learning_rate": 1.0103329506314579e-07, + "logits/chosen": -2.8736624717712402, + "logits/rejected": -2.8538830280303955, + "logps/chosen": -57.096893310546875, + "logps/rejected": -55.92301559448242, + "loss": 0.6833, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.03677918389439583, + "rewards/margins": 0.020569270476698875, + "rewards/rejected": -0.057348452508449554, + "step": 880 + }, + { + "epoch": 0.15334252239834598, + "grad_norm": 2.0646069049835205, + "learning_rate": 1.0218140068886336e-07, + "logits/chosen": -2.897963047027588, + "logits/rejected": -2.871859312057495, + "logps/chosen": -62.93019485473633, + "logps/rejected": -58.28081512451172, + "loss": 0.6852, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.031163910403847694, + "rewards/margins": 0.016853151842951775, + "rewards/rejected": -0.04801706224679947, + "step": 890 + }, + { + "epoch": 0.15506547208821503, + "grad_norm": 2.542097806930542, + "learning_rate": 1.0332950631458092e-07, + "logits/chosen": -2.917391061782837, + "logits/rejected": -2.912045955657959, + "logps/chosen": -58.973899841308594, + "logps/rejected": -56.362701416015625, + "loss": 0.6872, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.03930676728487015, + "rewards/margins": 0.01295884232968092, + "rewards/rejected": -0.05226561427116394, + "step": 900 + }, + { + "epoch": 0.15678842177808408, + "grad_norm": 2.199962615966797, + "learning_rate": 1.044776119402985e-07, + "logits/chosen": -2.893681287765503, + "logits/rejected": -2.891852855682373, + "logps/chosen": -56.727622985839844, + "logps/rejected": -57.78075408935547, + "loss": 0.6875, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04665730893611908, + "rewards/margins": 0.01240667887032032, + "rewards/rejected": -0.05906399339437485, + "step": 910 + }, + { + "epoch": 0.15851137146795313, + "grad_norm": 2.394650936126709, + "learning_rate": 1.0562571756601606e-07, + "logits/chosen": -2.916598081588745, + "logits/rejected": -2.885108709335327, + "logps/chosen": -60.41682815551758, + "logps/rejected": -54.64247512817383, + "loss": 0.686, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.04918244481086731, + "rewards/margins": 0.015305593609809875, + "rewards/rejected": -0.06448803842067719, + "step": 920 + }, + { + "epoch": 0.16023432115782218, + "grad_norm": 2.60247540473938, + "learning_rate": 1.0677382319173363e-07, + "logits/chosen": -2.9422740936279297, + "logits/rejected": -2.930769681930542, + "logps/chosen": -58.148521423339844, + "logps/rejected": -59.06476593017578, + "loss": 0.6864, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.04540998488664627, + "rewards/margins": 0.014568351209163666, + "rewards/rejected": -0.05997832864522934, + "step": 930 + }, + { + "epoch": 0.16195727084769124, + "grad_norm": 2.2433910369873047, + "learning_rate": 1.0792192881745119e-07, + "logits/chosen": -2.9525837898254395, + "logits/rejected": -2.9253265857696533, + "logps/chosen": -65.92704010009766, + "logps/rejected": -60.785980224609375, + "loss": 0.6856, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.04621075838804245, + "rewards/margins": 0.01654568873345852, + "rewards/rejected": -0.06275644898414612, + "step": 940 + }, + { + "epoch": 0.16368022053756032, + "grad_norm": 2.3847455978393555, + "learning_rate": 1.0907003444316875e-07, + "logits/chosen": -2.8176932334899902, + "logits/rejected": -2.8024542331695557, + "logps/chosen": -61.851539611816406, + "logps/rejected": -61.24384689331055, + "loss": 0.6891, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.059170741587877274, + "rewards/margins": 0.009380336850881577, + "rewards/rejected": -0.06855107843875885, + "step": 950 + }, + { + "epoch": 0.16540317022742937, + "grad_norm": 2.2610373497009277, + "learning_rate": 1.1021814006888632e-07, + "logits/chosen": -2.808915615081787, + "logits/rejected": -2.811962604522705, + "logps/chosen": -57.79589080810547, + "logps/rejected": -60.714317321777344, + "loss": 0.6945, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.06247736141085625, + "rewards/margins": -0.0015118229202926159, + "rewards/rejected": -0.060965538024902344, + "step": 960 + }, + { + "epoch": 0.16712611991729842, + "grad_norm": 2.532683849334717, + "learning_rate": 1.1136624569460388e-07, + "logits/chosen": -2.8959765434265137, + "logits/rejected": -2.869158983230591, + "logps/chosen": -66.4780502319336, + "logps/rejected": -57.21562957763672, + "loss": 0.6874, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.052064068615436554, + "rewards/margins": 0.01277141459286213, + "rewards/rejected": -0.06483548879623413, + "step": 970 + }, + { + "epoch": 0.16884906960716747, + "grad_norm": 2.805853843688965, + "learning_rate": 1.1251435132032146e-07, + "logits/chosen": -2.9637677669525146, + "logits/rejected": -2.945610761642456, + "logps/chosen": -62.131736755371094, + "logps/rejected": -59.7706298828125, + "loss": 0.6864, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.04719981551170349, + "rewards/margins": 0.014899635687470436, + "rewards/rejected": -0.06209943816065788, + "step": 980 + }, + { + "epoch": 0.17057201929703653, + "grad_norm": 2.672757148742676, + "learning_rate": 1.1366245694603902e-07, + "logits/chosen": -2.8849339485168457, + "logits/rejected": -2.866464376449585, + "logps/chosen": -60.4627571105957, + "logps/rejected": -58.09014129638672, + "loss": 0.6881, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.05641549080610275, + "rewards/margins": 0.011244365945458412, + "rewards/rejected": -0.06765986233949661, + "step": 990 + }, + { + "epoch": 0.17229496898690558, + "grad_norm": 2.177389621734619, + "learning_rate": 1.1481056257175659e-07, + "logits/chosen": -2.851919651031494, + "logits/rejected": -2.82978892326355, + "logps/chosen": -63.827980041503906, + "logps/rejected": -58.105125427246094, + "loss": 0.6865, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.056848056614398956, + "rewards/margins": 0.014737958088517189, + "rewards/rejected": -0.071586012840271, + "step": 1000 + }, + { + "epoch": 0.17401791867677463, + "grad_norm": 2.3776516914367676, + "learning_rate": 1.1595866819747415e-07, + "logits/chosen": -2.8108181953430176, + "logits/rejected": -2.816577672958374, + "logps/chosen": -60.68506622314453, + "logps/rejected": -63.4400749206543, + "loss": 0.69, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06378235667943954, + "rewards/margins": 0.007416282780468464, + "rewards/rejected": -0.07119864225387573, + "step": 1010 + }, + { + "epoch": 0.17574086836664368, + "grad_norm": 2.2297322750091553, + "learning_rate": 1.1710677382319172e-07, + "logits/chosen": -2.9127297401428223, + "logits/rejected": -2.889315128326416, + "logps/chosen": -63.0562629699707, + "logps/rejected": -60.82661819458008, + "loss": 0.6802, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.048960812389850616, + "rewards/margins": 0.02726539969444275, + "rewards/rejected": -0.07622621953487396, + "step": 1020 + }, + { + "epoch": 0.17746381805651276, + "grad_norm": 2.291780471801758, + "learning_rate": 1.1825487944890928e-07, + "logits/chosen": -2.9396207332611084, + "logits/rejected": -2.9200801849365234, + "logps/chosen": -60.832740783691406, + "logps/rejected": -58.027183532714844, + "loss": 0.6856, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.05459435656666756, + "rewards/margins": 0.016271965578198433, + "rewards/rejected": -0.07086631655693054, + "step": 1030 + }, + { + "epoch": 0.17918676774638181, + "grad_norm": 2.8056833744049072, + "learning_rate": 1.1940298507462686e-07, + "logits/chosen": -2.8990871906280518, + "logits/rejected": -2.88954758644104, + "logps/chosen": -59.774559020996094, + "logps/rejected": -62.26892852783203, + "loss": 0.6898, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.06455157697200775, + "rewards/margins": 0.008438936434686184, + "rewards/rejected": -0.0729905217885971, + "step": 1040 + }, + { + "epoch": 0.18090971743625087, + "grad_norm": 2.4161949157714844, + "learning_rate": 1.205510907003444e-07, + "logits/chosen": -2.8405020236968994, + "logits/rejected": -2.8092803955078125, + "logps/chosen": -64.3838882446289, + "logps/rejected": -58.1729736328125, + "loss": 0.6834, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.06246421858668327, + "rewards/margins": 0.02075287327170372, + "rewards/rejected": -0.08321709930896759, + "step": 1050 + }, + { + "epoch": 0.18263266712611992, + "grad_norm": 2.3523647785186768, + "learning_rate": 1.2169919632606198e-07, + "logits/chosen": -2.858825206756592, + "logits/rejected": -2.8416731357574463, + "logps/chosen": -64.46825408935547, + "logps/rejected": -62.1015625, + "loss": 0.6884, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.06816236674785614, + "rewards/margins": 0.010872631333768368, + "rewards/rejected": -0.07903499901294708, + "step": 1060 + }, + { + "epoch": 0.18435561681598897, + "grad_norm": 2.313222646713257, + "learning_rate": 1.2284730195177955e-07, + "logits/chosen": -2.9378304481506348, + "logits/rejected": -2.9094176292419434, + "logps/chosen": -64.5782470703125, + "logps/rejected": -60.97203826904297, + "loss": 0.6826, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.06780969351530075, + "rewards/margins": 0.022828945890069008, + "rewards/rejected": -0.09063863754272461, + "step": 1070 + }, + { + "epoch": 0.18607856650585802, + "grad_norm": 2.3939578533172607, + "learning_rate": 1.2399540757749712e-07, + "logits/chosen": -2.936554431915283, + "logits/rejected": -2.918708086013794, + "logps/chosen": -64.82514190673828, + "logps/rejected": -59.743865966796875, + "loss": 0.6852, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.07702767848968506, + "rewards/margins": 0.01737195998430252, + "rewards/rejected": -0.09439964592456818, + "step": 1080 + }, + { + "epoch": 0.18780151619572708, + "grad_norm": 2.4799766540527344, + "learning_rate": 1.251435132032147e-07, + "logits/chosen": -2.8482329845428467, + "logits/rejected": -2.8464126586914062, + "logps/chosen": -61.432395935058594, + "logps/rejected": -60.96601104736328, + "loss": 0.6888, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.08055031299591064, + "rewards/margins": 0.010185223072767258, + "rewards/rejected": -0.0907355397939682, + "step": 1090 + }, + { + "epoch": 0.18952446588559613, + "grad_norm": 2.4442903995513916, + "learning_rate": 1.2629161882893227e-07, + "logits/chosen": -2.869385242462158, + "logits/rejected": -2.8703014850616455, + "logps/chosen": -59.78876876831055, + "logps/rejected": -63.30439376831055, + "loss": 0.6866, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0754605308175087, + "rewards/margins": 0.015025362372398376, + "rewards/rejected": -0.09048587828874588, + "step": 1100 + }, + { + "epoch": 0.1912474155754652, + "grad_norm": 2.785104990005493, + "learning_rate": 1.2743972445464984e-07, + "logits/chosen": -2.8878140449523926, + "logits/rejected": -2.897512912750244, + "logps/chosen": -61.808631896972656, + "logps/rejected": -63.763877868652344, + "loss": 0.6892, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.07973656803369522, + "rewards/margins": 0.009631333872675896, + "rewards/rejected": -0.08936790376901627, + "step": 1110 + }, + { + "epoch": 0.19297036526533426, + "grad_norm": 2.718078374862671, + "learning_rate": 1.285878300803674e-07, + "logits/chosen": -2.9095702171325684, + "logits/rejected": -2.892914056777954, + "logps/chosen": -64.25332641601562, + "logps/rejected": -61.17406463623047, + "loss": 0.6865, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0720595195889473, + "rewards/margins": 0.014846527948975563, + "rewards/rejected": -0.08690604567527771, + "step": 1120 + }, + { + "epoch": 0.1946933149552033, + "grad_norm": 2.2941906452178955, + "learning_rate": 1.2973593570608496e-07, + "logits/chosen": -2.9266607761383057, + "logits/rejected": -2.9116671085357666, + "logps/chosen": -59.73681640625, + "logps/rejected": -62.68779754638672, + "loss": 0.6803, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0724368542432785, + "rewards/margins": 0.02737646922469139, + "rewards/rejected": -0.09981332719326019, + "step": 1130 + }, + { + "epoch": 0.19641626464507236, + "grad_norm": 2.5265445709228516, + "learning_rate": 1.3088404133180254e-07, + "logits/chosen": -2.8809289932250977, + "logits/rejected": -2.860023021697998, + "logps/chosen": -64.78260040283203, + "logps/rejected": -60.81663131713867, + "loss": 0.6861, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.06992609798908234, + "rewards/margins": 0.016008172184228897, + "rewards/rejected": -0.08593426644802094, + "step": 1140 + }, + { + "epoch": 0.19813921433494142, + "grad_norm": 2.750117063522339, + "learning_rate": 1.3203214695752008e-07, + "logits/chosen": -2.8423831462860107, + "logits/rejected": -2.832681655883789, + "logps/chosen": -61.31705856323242, + "logps/rejected": -61.98966598510742, + "loss": 0.682, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0669320821762085, + "rewards/margins": 0.023900505155324936, + "rewards/rejected": -0.09083259105682373, + "step": 1150 + }, + { + "epoch": 0.19986216402481047, + "grad_norm": 2.309347152709961, + "learning_rate": 1.3318025258323766e-07, + "logits/chosen": -2.9073128700256348, + "logits/rejected": -2.900723695755005, + "logps/chosen": -61.77983856201172, + "logps/rejected": -61.5976676940918, + "loss": 0.6836, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.060506559908390045, + "rewards/margins": 0.020602624863386154, + "rewards/rejected": -0.0811091959476471, + "step": 1160 + }, + { + "epoch": 0.20158511371467952, + "grad_norm": 2.2978622913360596, + "learning_rate": 1.3432835820895523e-07, + "logits/chosen": -2.8373751640319824, + "logits/rejected": -2.828467845916748, + "logps/chosen": -59.83747482299805, + "logps/rejected": -61.81534957885742, + "loss": 0.6866, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0691220685839653, + "rewards/margins": 0.014754706993699074, + "rewards/rejected": -0.08387677371501923, + "step": 1170 + }, + { + "epoch": 0.2033080634045486, + "grad_norm": 2.4647703170776367, + "learning_rate": 1.354764638346728e-07, + "logits/chosen": -2.8165135383605957, + "logits/rejected": -2.794498920440674, + "logps/chosen": -61.49424362182617, + "logps/rejected": -59.40460205078125, + "loss": 0.6837, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.07810764014720917, + "rewards/margins": 0.020892778411507607, + "rewards/rejected": -0.09900043159723282, + "step": 1180 + }, + { + "epoch": 0.20503101309441765, + "grad_norm": 2.720895767211914, + "learning_rate": 1.3662456946039035e-07, + "logits/chosen": -2.922314405441284, + "logits/rejected": -2.900804042816162, + "logps/chosen": -66.13607788085938, + "logps/rejected": -58.975799560546875, + "loss": 0.6807, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.07079530507326126, + "rewards/margins": 0.02672487497329712, + "rewards/rejected": -0.09752018749713898, + "step": 1190 + }, + { + "epoch": 0.2067539627842867, + "grad_norm": 2.2189040184020996, + "learning_rate": 1.3777267508610792e-07, + "logits/chosen": -2.825812816619873, + "logits/rejected": -2.8158926963806152, + "logps/chosen": -61.317138671875, + "logps/rejected": -58.899383544921875, + "loss": 0.6874, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.07221807539463043, + "rewards/margins": 0.013589012436568737, + "rewards/rejected": -0.08580709993839264, + "step": 1200 + }, + { + "epoch": 0.2067539627842867, + "eval_logits/chosen": -2.9395346641540527, + "eval_logits/rejected": -2.9361255168914795, + "eval_logps/chosen": -62.038490295410156, + "eval_logps/rejected": -67.01734924316406, + "eval_loss": 0.687571108341217, + "eval_rewards/accuracies": 0.5871282815933228, + "eval_rewards/chosen": -0.030230168253183365, + "eval_rewards/margins": 0.012447311542928219, + "eval_rewards/rejected": -0.04267747700214386, + "eval_runtime": 384.82, + "eval_samples_per_second": 11.184, + "eval_steps_per_second": 1.398, + "step": 1200 + }, + { + "epoch": 0.20847691247415576, + "grad_norm": 3.0337154865264893, + "learning_rate": 1.389207807118255e-07, + "logits/chosen": -2.8906006813049316, + "logits/rejected": -2.865419387817383, + "logps/chosen": -62.349098205566406, + "logps/rejected": -62.536407470703125, + "loss": 0.6814, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.07368902117013931, + "rewards/margins": 0.025377903133630753, + "rewards/rejected": -0.09906691312789917, + "step": 1210 + }, + { + "epoch": 0.2101998621640248, + "grad_norm": 2.5097906589508057, + "learning_rate": 1.4006888633754304e-07, + "logits/chosen": -2.89139986038208, + "logits/rejected": -2.8693718910217285, + "logps/chosen": -60.9052619934082, + "logps/rejected": -61.641456604003906, + "loss": 0.6812, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.07215452194213867, + "rewards/margins": 0.025878047570586205, + "rewards/rejected": -0.09803257137537003, + "step": 1220 + }, + { + "epoch": 0.21192281185389386, + "grad_norm": 2.8678061962127686, + "learning_rate": 1.4121699196326062e-07, + "logits/chosen": -2.9567160606384277, + "logits/rejected": -2.925724506378174, + "logps/chosen": -64.60700988769531, + "logps/rejected": -61.93334197998047, + "loss": 0.6792, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.07604970782995224, + "rewards/margins": 0.0297552403062582, + "rewards/rejected": -0.10580495744943619, + "step": 1230 + }, + { + "epoch": 0.2136457615437629, + "grad_norm": 2.821640729904175, + "learning_rate": 1.423650975889782e-07, + "logits/chosen": -2.855620861053467, + "logits/rejected": -2.845649480819702, + "logps/chosen": -61.8939094543457, + "logps/rejected": -63.21649169921875, + "loss": 0.6837, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.088918536901474, + "rewards/margins": 0.020954841747879982, + "rewards/rejected": -0.10987337678670883, + "step": 1240 + }, + { + "epoch": 0.21536871123363197, + "grad_norm": 2.81172776222229, + "learning_rate": 1.4351320321469576e-07, + "logits/chosen": -2.9326090812683105, + "logits/rejected": -2.9026684761047363, + "logps/chosen": -63.4760627746582, + "logps/rejected": -61.67022705078125, + "loss": 0.6807, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.08308391273021698, + "rewards/margins": 0.027814438566565514, + "rewards/rejected": -0.11089835315942764, + "step": 1250 + }, + { + "epoch": 0.21709166092350105, + "grad_norm": 2.774033308029175, + "learning_rate": 1.446613088404133e-07, + "logits/chosen": -2.819913148880005, + "logits/rejected": -2.800523519515991, + "logps/chosen": -62.77690887451172, + "logps/rejected": -60.63389205932617, + "loss": 0.681, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.08326883614063263, + "rewards/margins": 0.026683837175369263, + "rewards/rejected": -0.10995267331600189, + "step": 1260 + }, + { + "epoch": 0.2188146106133701, + "grad_norm": 2.880520820617676, + "learning_rate": 1.4580941446613089e-07, + "logits/chosen": -2.856600522994995, + "logits/rejected": -2.8484253883361816, + "logps/chosen": -61.86387252807617, + "logps/rejected": -64.2399673461914, + "loss": 0.687, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.08681348711252213, + "rewards/margins": 0.014278444461524487, + "rewards/rejected": -0.10109193623065948, + "step": 1270 + }, + { + "epoch": 0.22053756030323915, + "grad_norm": 2.6990182399749756, + "learning_rate": 1.4695752009184846e-07, + "logits/chosen": -2.8809759616851807, + "logits/rejected": -2.8884129524230957, + "logps/chosen": -59.63513946533203, + "logps/rejected": -68.55680847167969, + "loss": 0.6853, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.07746431231498718, + "rewards/margins": 0.017880458384752274, + "rewards/rejected": -0.09534476697444916, + "step": 1280 + }, + { + "epoch": 0.2222605099931082, + "grad_norm": 3.019570827484131, + "learning_rate": 1.4810562571756603e-07, + "logits/chosen": -2.8340437412261963, + "logits/rejected": -2.8122403621673584, + "logps/chosen": -63.11928176879883, + "logps/rejected": -60.483482360839844, + "loss": 0.6792, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.07383199036121368, + "rewards/margins": 0.03013637661933899, + "rewards/rejected": -0.10396836698055267, + "step": 1290 + }, + { + "epoch": 0.22398345968297725, + "grad_norm": 2.6284210681915283, + "learning_rate": 1.4925373134328358e-07, + "logits/chosen": -2.8784289360046387, + "logits/rejected": -2.870450973510742, + "logps/chosen": -59.34076690673828, + "logps/rejected": -64.48880767822266, + "loss": 0.6847, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.09290703386068344, + "rewards/margins": 0.01890570856630802, + "rewards/rejected": -0.11181274801492691, + "step": 1300 + }, + { + "epoch": 0.2257064093728463, + "grad_norm": 3.0810391902923584, + "learning_rate": 1.5040183696900115e-07, + "logits/chosen": -2.8738865852355957, + "logits/rejected": -2.8507232666015625, + "logps/chosen": -62.38170623779297, + "logps/rejected": -60.55586624145508, + "loss": 0.6773, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.08380500972270966, + "rewards/margins": 0.03459765389561653, + "rewards/rejected": -0.11840268224477768, + "step": 1310 + }, + { + "epoch": 0.22742935906271536, + "grad_norm": 2.8459906578063965, + "learning_rate": 1.5154994259471873e-07, + "logits/chosen": -2.895685911178589, + "logits/rejected": -2.8805301189422607, + "logps/chosen": -64.91357421875, + "logps/rejected": -61.18379592895508, + "loss": 0.6817, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.08374587446451187, + "rewards/margins": 0.02516447938978672, + "rewards/rejected": -0.10891035944223404, + "step": 1320 + }, + { + "epoch": 0.22915230875258444, + "grad_norm": 2.7527568340301514, + "learning_rate": 1.5269804822043627e-07, + "logits/chosen": -2.9197354316711426, + "logits/rejected": -2.8971869945526123, + "logps/chosen": -65.04910278320312, + "logps/rejected": -61.24757766723633, + "loss": 0.6812, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0866512805223465, + "rewards/margins": 0.02625083550810814, + "rewards/rejected": -0.11290212720632553, + "step": 1330 + }, + { + "epoch": 0.2308752584424535, + "grad_norm": 2.576115846633911, + "learning_rate": 1.5384615384615385e-07, + "logits/chosen": -2.81272292137146, + "logits/rejected": -2.7970199584960938, + "logps/chosen": -64.03965759277344, + "logps/rejected": -62.298255920410156, + "loss": 0.6826, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.08078499883413315, + "rewards/margins": 0.02357543632388115, + "rewards/rejected": -0.10436044633388519, + "step": 1340 + }, + { + "epoch": 0.23259820813232254, + "grad_norm": 2.717984199523926, + "learning_rate": 1.5499425947187142e-07, + "logits/chosen": -2.855532169342041, + "logits/rejected": -2.8379480838775635, + "logps/chosen": -64.80758666992188, + "logps/rejected": -64.42263793945312, + "loss": 0.6885, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.10266206413507462, + "rewards/margins": 0.01211500447243452, + "rewards/rejected": -0.11477706581354141, + "step": 1350 + }, + { + "epoch": 0.2343211578221916, + "grad_norm": 2.949355125427246, + "learning_rate": 1.56142365097589e-07, + "logits/chosen": -2.814122200012207, + "logits/rejected": -2.802731990814209, + "logps/chosen": -65.5488510131836, + "logps/rejected": -65.61713409423828, + "loss": 0.6864, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.09135110676288605, + "rewards/margins": 0.015937041491270065, + "rewards/rejected": -0.10728814452886581, + "step": 1360 + }, + { + "epoch": 0.23604410751206065, + "grad_norm": 3.703526020050049, + "learning_rate": 1.5729047072330654e-07, + "logits/chosen": -2.8992276191711426, + "logits/rejected": -2.883058547973633, + "logps/chosen": -63.055274963378906, + "logps/rejected": -64.58672332763672, + "loss": 0.6793, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.09181781858205795, + "rewards/margins": 0.03024251200258732, + "rewards/rejected": -0.12206032127141953, + "step": 1370 + }, + { + "epoch": 0.2377670572019297, + "grad_norm": 3.2197022438049316, + "learning_rate": 1.584385763490241e-07, + "logits/chosen": -2.841649293899536, + "logits/rejected": -2.833360433578491, + "logps/chosen": -64.94597625732422, + "logps/rejected": -62.3499870300293, + "loss": 0.683, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0867023766040802, + "rewards/margins": 0.022678587585687637, + "rewards/rejected": -0.10938096046447754, + "step": 1380 + }, + { + "epoch": 0.23949000689179875, + "grad_norm": 3.200864553451538, + "learning_rate": 1.5958668197474169e-07, + "logits/chosen": -2.9327969551086426, + "logits/rejected": -2.9127845764160156, + "logps/chosen": -66.3973388671875, + "logps/rejected": -63.198211669921875, + "loss": 0.6827, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.08382676541805267, + "rewards/margins": 0.023857740685343742, + "rewards/rejected": -0.10768450796604156, + "step": 1390 + }, + { + "epoch": 0.2412129565816678, + "grad_norm": 2.830458641052246, + "learning_rate": 1.6073478760045923e-07, + "logits/chosen": -2.8190531730651855, + "logits/rejected": -2.8015060424804688, + "logps/chosen": -65.630859375, + "logps/rejected": -63.66929244995117, + "loss": 0.6794, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08757500350475311, + "rewards/margins": 0.03044615313410759, + "rewards/rejected": -0.1180211678147316, + "step": 1400 + }, + { + "epoch": 0.24293590627153688, + "grad_norm": 2.9669930934906006, + "learning_rate": 1.618828932261768e-07, + "logits/chosen": -2.8535845279693604, + "logits/rejected": -2.8465869426727295, + "logps/chosen": -63.831695556640625, + "logps/rejected": -64.20135498046875, + "loss": 0.6833, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09446011483669281, + "rewards/margins": 0.02210450917482376, + "rewards/rejected": -0.11656463146209717, + "step": 1410 + }, + { + "epoch": 0.24465885596140594, + "grad_norm": 3.1416923999786377, + "learning_rate": 1.6303099885189438e-07, + "logits/chosen": -2.804192304611206, + "logits/rejected": -2.807462692260742, + "logps/chosen": -61.50630569458008, + "logps/rejected": -65.51634216308594, + "loss": 0.6896, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.10375789552927017, + "rewards/margins": 0.009949756786227226, + "rewards/rejected": -0.11370766162872314, + "step": 1420 + }, + { + "epoch": 0.246381805651275, + "grad_norm": 3.045356512069702, + "learning_rate": 1.6417910447761195e-07, + "logits/chosen": -2.877074718475342, + "logits/rejected": -2.8679327964782715, + "logps/chosen": -64.40380859375, + "logps/rejected": -67.46661376953125, + "loss": 0.6804, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.09864847362041473, + "rewards/margins": 0.028175463899970055, + "rewards/rejected": -0.12682393193244934, + "step": 1430 + }, + { + "epoch": 0.24810475534114404, + "grad_norm": 2.733543634414673, + "learning_rate": 1.653272101033295e-07, + "logits/chosen": -2.8185031414031982, + "logits/rejected": -2.799570083618164, + "logps/chosen": -61.34563446044922, + "logps/rejected": -60.22956466674805, + "loss": 0.6799, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.09759248793125153, + "rewards/margins": 0.029250025749206543, + "rewards/rejected": -0.12684249877929688, + "step": 1440 + }, + { + "epoch": 0.2498277050310131, + "grad_norm": 3.470953941345215, + "learning_rate": 1.6647531572904707e-07, + "logits/chosen": -2.8219285011291504, + "logits/rejected": -2.7927727699279785, + "logps/chosen": -64.85969543457031, + "logps/rejected": -64.55895233154297, + "loss": 0.6786, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.09148148447275162, + "rewards/margins": 0.03249611333012581, + "rewards/rejected": -0.12397760152816772, + "step": 1450 + }, + { + "epoch": 0.25155065472088217, + "grad_norm": 3.3708927631378174, + "learning_rate": 1.6762342135476465e-07, + "logits/chosen": -2.840583324432373, + "logits/rejected": -2.8144376277923584, + "logps/chosen": -63.61417770385742, + "logps/rejected": -63.9477653503418, + "loss": 0.6756, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.10387835651636124, + "rewards/margins": 0.038820572197437286, + "rewards/rejected": -0.14269892871379852, + "step": 1460 + }, + { + "epoch": 0.2532736044107512, + "grad_norm": 3.427384614944458, + "learning_rate": 1.687715269804822e-07, + "logits/chosen": -2.9569153785705566, + "logits/rejected": -2.9273934364318848, + "logps/chosen": -67.2813720703125, + "logps/rejected": -66.46237182617188, + "loss": 0.6765, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08899648487567902, + "rewards/margins": 0.03610143065452576, + "rewards/rejected": -0.12509790062904358, + "step": 1470 + }, + { + "epoch": 0.2549965541006203, + "grad_norm": 3.171168565750122, + "learning_rate": 1.6991963260619977e-07, + "logits/chosen": -2.9081575870513916, + "logits/rejected": -2.8784821033477783, + "logps/chosen": -61.340660095214844, + "logps/rejected": -63.49712371826172, + "loss": 0.6797, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.09166257083415985, + "rewards/margins": 0.029424916952848434, + "rewards/rejected": -0.12108749151229858, + "step": 1480 + }, + { + "epoch": 0.2567195037904893, + "grad_norm": 3.0038366317749023, + "learning_rate": 1.7106773823191734e-07, + "logits/chosen": -2.890097141265869, + "logits/rejected": -2.8695075511932373, + "logps/chosen": -68.01859283447266, + "logps/rejected": -64.6750259399414, + "loss": 0.6807, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.09513401985168457, + "rewards/margins": 0.027182336896657944, + "rewards/rejected": -0.12231633812189102, + "step": 1490 + }, + { + "epoch": 0.2584424534803584, + "grad_norm": 3.3400094509124756, + "learning_rate": 1.722158438576349e-07, + "logits/chosen": -2.8165030479431152, + "logits/rejected": -2.815304756164551, + "logps/chosen": -62.07170486450195, + "logps/rejected": -65.22959899902344, + "loss": 0.6835, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.1098056212067604, + "rewards/margins": 0.02261367253959179, + "rewards/rejected": -0.13241930305957794, + "step": 1500 + }, + { + "epoch": 0.2601654031702274, + "grad_norm": 4.100131988525391, + "learning_rate": 1.7336394948335246e-07, + "logits/chosen": -2.81020188331604, + "logits/rejected": -2.7851126194000244, + "logps/chosen": -68.38099670410156, + "logps/rejected": -66.84229278564453, + "loss": 0.6716, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.09384194761514664, + "rewards/margins": 0.04618433117866516, + "rewards/rejected": -0.1400262713432312, + "step": 1510 + }, + { + "epoch": 0.2618883528600965, + "grad_norm": 3.5354981422424316, + "learning_rate": 1.7451205510907003e-07, + "logits/chosen": -2.865053653717041, + "logits/rejected": -2.862806558609009, + "logps/chosen": -64.65748596191406, + "logps/rejected": -63.5691032409668, + "loss": 0.6868, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.11419986188411713, + "rewards/margins": 0.01575290784239769, + "rewards/rejected": -0.12995277345180511, + "step": 1520 + }, + { + "epoch": 0.26361130254996556, + "grad_norm": 3.4934535026550293, + "learning_rate": 1.756601607347876e-07, + "logits/chosen": -2.794623851776123, + "logits/rejected": -2.7821030616760254, + "logps/chosen": -65.29815673828125, + "logps/rejected": -64.14790344238281, + "loss": 0.6832, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.12472577393054962, + "rewards/margins": 0.023620011284947395, + "rewards/rejected": -0.14834578335285187, + "step": 1530 + }, + { + "epoch": 0.2653342522398346, + "grad_norm": 3.742764711380005, + "learning_rate": 1.7680826636050515e-07, + "logits/chosen": -2.871792793273926, + "logits/rejected": -2.846169948577881, + "logps/chosen": -68.71684265136719, + "logps/rejected": -66.43431091308594, + "loss": 0.675, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.12237291038036346, + "rewards/margins": 0.040168728679418564, + "rewards/rejected": -0.16254164278507233, + "step": 1540 + }, + { + "epoch": 0.26705720192970367, + "grad_norm": 3.3786399364471436, + "learning_rate": 1.7795637198622273e-07, + "logits/chosen": -2.806138038635254, + "logits/rejected": -2.7949366569519043, + "logps/chosen": -67.20889282226562, + "logps/rejected": -65.76429748535156, + "loss": 0.6752, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.1240202784538269, + "rewards/margins": 0.03948847949504852, + "rewards/rejected": -0.16350875794887543, + "step": 1550 + }, + { + "epoch": 0.2687801516195727, + "grad_norm": 3.4467484951019287, + "learning_rate": 1.791044776119403e-07, + "logits/chosen": -2.817890167236328, + "logits/rejected": -2.80246639251709, + "logps/chosen": -65.50123596191406, + "logps/rejected": -66.32374572753906, + "loss": 0.6821, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.13890376687049866, + "rewards/margins": 0.02497437223792076, + "rewards/rejected": -0.16387812793254852, + "step": 1560 + }, + { + "epoch": 0.2705031013094418, + "grad_norm": 3.2813069820404053, + "learning_rate": 1.8025258323765787e-07, + "logits/chosen": -2.8562819957733154, + "logits/rejected": -2.861222982406616, + "logps/chosen": -67.8829116821289, + "logps/rejected": -72.48780822753906, + "loss": 0.6844, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.14664840698242188, + "rewards/margins": 0.021208012476563454, + "rewards/rejected": -0.16785642504692078, + "step": 1570 + }, + { + "epoch": 0.2722260509993108, + "grad_norm": 3.9598214626312256, + "learning_rate": 1.8140068886337542e-07, + "logits/chosen": -2.8063104152679443, + "logits/rejected": -2.801279306411743, + "logps/chosen": -67.23238372802734, + "logps/rejected": -71.10231018066406, + "loss": 0.6778, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.143377423286438, + "rewards/margins": 0.035317786037921906, + "rewards/rejected": -0.1786952167749405, + "step": 1580 + }, + { + "epoch": 0.2739490006891799, + "grad_norm": 5.066261291503906, + "learning_rate": 1.82548794489093e-07, + "logits/chosen": -2.838998317718506, + "logits/rejected": -2.8293375968933105, + "logps/chosen": -68.40142822265625, + "logps/rejected": -72.52519226074219, + "loss": 0.6739, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.1380554735660553, + "rewards/margins": 0.04200926795601845, + "rewards/rejected": -0.18006475269794464, + "step": 1590 + }, + { + "epoch": 0.27567195037904896, + "grad_norm": 3.795305013656616, + "learning_rate": 1.8369690011481057e-07, + "logits/chosen": -2.824371814727783, + "logits/rejected": -2.8006680011749268, + "logps/chosen": -68.08535766601562, + "logps/rejected": -69.51658630371094, + "loss": 0.676, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.14742091298103333, + "rewards/margins": 0.03879866749048233, + "rewards/rejected": -0.18621957302093506, + "step": 1600 + }, + { + "epoch": 0.27567195037904896, + "eval_logits/chosen": -2.8975741863250732, + "eval_logits/rejected": -2.8941969871520996, + "eval_logps/chosen": -69.5812759399414, + "eval_logps/rejected": -75.9064712524414, + "eval_loss": 0.6819967031478882, + "eval_rewards/accuracies": 0.5850371718406677, + "eval_rewards/chosen": -0.10565808415412903, + "eval_rewards/margins": 0.025910574942827225, + "eval_rewards/rejected": -0.13156867027282715, + "eval_runtime": 384.5219, + "eval_samples_per_second": 11.193, + "eval_steps_per_second": 1.399, + "step": 1600 + }, + { + "epoch": 0.277394900068918, + "grad_norm": 4.144330978393555, + "learning_rate": 1.848450057405281e-07, + "logits/chosen": -2.845106840133667, + "logits/rejected": -2.825932502746582, + "logps/chosen": -73.3796615600586, + "logps/rejected": -77.50773620605469, + "loss": 0.6769, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.17480914294719696, + "rewards/margins": 0.03731060028076172, + "rewards/rejected": -0.21211972832679749, + "step": 1610 + }, + { + "epoch": 0.27911784975878706, + "grad_norm": 3.920064926147461, + "learning_rate": 1.8599311136624569e-07, + "logits/chosen": -2.7817695140838623, + "logits/rejected": -2.765631914138794, + "logps/chosen": -71.98372650146484, + "logps/rejected": -76.48005676269531, + "loss": 0.6773, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.194097101688385, + "rewards/margins": 0.037283383309841156, + "rewards/rejected": -0.23138046264648438, + "step": 1620 + }, + { + "epoch": 0.2808407994486561, + "grad_norm": 5.102138996124268, + "learning_rate": 1.8714121699196326e-07, + "logits/chosen": -2.826125383377075, + "logits/rejected": -2.8114256858825684, + "logps/chosen": -73.12959289550781, + "logps/rejected": -71.62154388427734, + "loss": 0.6734, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.1605428159236908, + "rewards/margins": 0.04527025297284126, + "rewards/rejected": -0.20581302046775818, + "step": 1630 + }, + { + "epoch": 0.28256374913852517, + "grad_norm": 3.954345226287842, + "learning_rate": 1.8828932261768083e-07, + "logits/chosen": -2.8946547508239746, + "logits/rejected": -2.8683266639709473, + "logps/chosen": -75.14608001708984, + "logps/rejected": -70.92219543457031, + "loss": 0.6789, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.1698450744152069, + "rewards/margins": 0.034184712916612625, + "rewards/rejected": -0.20402979850769043, + "step": 1640 + }, + { + "epoch": 0.2842866988283942, + "grad_norm": 4.2861151695251465, + "learning_rate": 1.8943742824339838e-07, + "logits/chosen": -2.864513874053955, + "logits/rejected": -2.8481781482696533, + "logps/chosen": -74.4118423461914, + "logps/rejected": -76.10733795166016, + "loss": 0.6717, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.16274502873420715, + "rewards/margins": 0.04893755167722702, + "rewards/rejected": -0.21168258786201477, + "step": 1650 + }, + { + "epoch": 0.28600964851826327, + "grad_norm": 4.130656719207764, + "learning_rate": 1.9058553386911595e-07, + "logits/chosen": -2.8591482639312744, + "logits/rejected": -2.830831527709961, + "logps/chosen": -73.754638671875, + "logps/rejected": -70.54668426513672, + "loss": 0.6742, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.17435503005981445, + "rewards/margins": 0.04359282925724983, + "rewards/rejected": -0.21794787049293518, + "step": 1660 + }, + { + "epoch": 0.2877325982081323, + "grad_norm": 4.471282958984375, + "learning_rate": 1.9173363949483353e-07, + "logits/chosen": -2.8563740253448486, + "logits/rejected": -2.8407225608825684, + "logps/chosen": -72.65254974365234, + "logps/rejected": -74.63856506347656, + "loss": 0.6757, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.16366687417030334, + "rewards/margins": 0.039933811873197556, + "rewards/rejected": -0.2036006897687912, + "step": 1670 + }, + { + "epoch": 0.2894555478980014, + "grad_norm": 4.474172115325928, + "learning_rate": 1.928817451205511e-07, + "logits/chosen": -2.8281986713409424, + "logits/rejected": -2.8113129138946533, + "logps/chosen": -75.64710998535156, + "logps/rejected": -76.26982116699219, + "loss": 0.6753, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.18650095164775848, + "rewards/margins": 0.04180046170949936, + "rewards/rejected": -0.22830140590667725, + "step": 1680 + }, + { + "epoch": 0.29117849758787046, + "grad_norm": 4.739830493927002, + "learning_rate": 1.9402985074626865e-07, + "logits/chosen": -2.8134493827819824, + "logits/rejected": -2.8036468029022217, + "logps/chosen": -73.00105285644531, + "logps/rejected": -70.45274353027344, + "loss": 0.6853, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.17002905905246735, + "rewards/margins": 0.019979029893875122, + "rewards/rejected": -0.19000807404518127, + "step": 1690 + }, + { + "epoch": 0.2929014472777395, + "grad_norm": 4.144531726837158, + "learning_rate": 1.9517795637198622e-07, + "logits/chosen": -2.769505262374878, + "logits/rejected": -2.7662644386291504, + "logps/chosen": -69.11921691894531, + "logps/rejected": -75.20559692382812, + "loss": 0.685, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.18292875587940216, + "rewards/margins": 0.021692339330911636, + "rewards/rejected": -0.2046210765838623, + "step": 1700 + }, + { + "epoch": 0.29462439696760856, + "grad_norm": 4.729769706726074, + "learning_rate": 1.963260619977038e-07, + "logits/chosen": -2.8375561237335205, + "logits/rejected": -2.8075573444366455, + "logps/chosen": -76.96826171875, + "logps/rejected": -75.673583984375, + "loss": 0.6739, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.17701391875743866, + "rewards/margins": 0.043523646891117096, + "rewards/rejected": -0.22053758800029755, + "step": 1710 + }, + { + "epoch": 0.2963473466574776, + "grad_norm": 4.864485740661621, + "learning_rate": 1.9747416762342134e-07, + "logits/chosen": -2.818294048309326, + "logits/rejected": -2.7952685356140137, + "logps/chosen": -76.0293197631836, + "logps/rejected": -72.1862564086914, + "loss": 0.6797, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.18885084986686707, + "rewards/margins": 0.03140731528401375, + "rewards/rejected": -0.2202581912279129, + "step": 1720 + }, + { + "epoch": 0.29807029634734666, + "grad_norm": 4.814679145812988, + "learning_rate": 1.9862227324913891e-07, + "logits/chosen": -2.8460941314697266, + "logits/rejected": -2.8359787464141846, + "logps/chosen": -76.57044219970703, + "logps/rejected": -74.95271301269531, + "loss": 0.6882, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.21247780323028564, + "rewards/margins": 0.016523724421858788, + "rewards/rejected": -0.22900155186653137, + "step": 1730 + }, + { + "epoch": 0.2997932460372157, + "grad_norm": 4.04186487197876, + "learning_rate": 1.997703788748565e-07, + "logits/chosen": -2.8214850425720215, + "logits/rejected": -2.814258098602295, + "logps/chosen": -73.9696273803711, + "logps/rejected": -79.91903686523438, + "loss": 0.6719, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2130396068096161, + "rewards/margins": 0.049149997532367706, + "rewards/rejected": -0.2621895968914032, + "step": 1740 + }, + { + "epoch": 0.30151619572708477, + "grad_norm": 4.963952541351318, + "learning_rate": 1.999998713790723e-07, + "logits/chosen": -2.817506790161133, + "logits/rejected": -2.8111929893493652, + "logps/chosen": -76.08406066894531, + "logps/rejected": -80.0066909790039, + "loss": 0.6764, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.20121629536151886, + "rewards/margins": 0.03890758752822876, + "rewards/rejected": -0.24012386798858643, + "step": 1750 + }, + { + "epoch": 0.30323914541695385, + "grad_norm": 4.477365493774414, + "learning_rate": 1.999993488571206e-07, + "logits/chosen": -2.837259292602539, + "logits/rejected": -2.8106486797332764, + "logps/chosen": -76.87786865234375, + "logps/rejected": -76.6913070678711, + "loss": 0.6719, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.19860495626926422, + "rewards/margins": 0.04847431927919388, + "rewards/rejected": -0.2470792829990387, + "step": 1760 + }, + { + "epoch": 0.3049620951068229, + "grad_norm": 4.9479498863220215, + "learning_rate": 1.9999842439743547e-07, + "logits/chosen": -2.833933115005493, + "logits/rejected": -2.8048694133758545, + "logps/chosen": -75.09016418457031, + "logps/rejected": -72.16284942626953, + "loss": 0.6684, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.1871923804283142, + "rewards/margins": 0.05504288524389267, + "rewards/rejected": -0.2422352284193039, + "step": 1770 + }, + { + "epoch": 0.30668504479669195, + "grad_norm": 4.612185955047607, + "learning_rate": 1.999970980037328e-07, + "logits/chosen": -2.7986302375793457, + "logits/rejected": -2.7996907234191895, + "logps/chosen": -74.53868865966797, + "logps/rejected": -82.4207992553711, + "loss": 0.6697, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19617575407028198, + "rewards/margins": 0.05358383059501648, + "rewards/rejected": -0.24975958466529846, + "step": 1780 + }, + { + "epoch": 0.308407994486561, + "grad_norm": 5.077559471130371, + "learning_rate": 1.999953696813438e-07, + "logits/chosen": -2.8587779998779297, + "logits/rejected": -2.8454761505126953, + "logps/chosen": -75.32337188720703, + "logps/rejected": -79.9382553100586, + "loss": 0.6684, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.22314408421516418, + "rewards/margins": 0.05672816187143326, + "rewards/rejected": -0.27987223863601685, + "step": 1790 + }, + { + "epoch": 0.31013094417643006, + "grad_norm": 5.012825965881348, + "learning_rate": 1.9999323943721533e-07, + "logits/chosen": -2.857203960418701, + "logits/rejected": -2.8387646675109863, + "logps/chosen": -76.38417053222656, + "logps/rejected": -79.95955657958984, + "loss": 0.6739, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.24100014567375183, + "rewards/margins": 0.044186659157276154, + "rewards/rejected": -0.2851868271827698, + "step": 1800 + }, + { + "epoch": 0.3118538938662991, + "grad_norm": 6.740519046783447, + "learning_rate": 1.9999070727990972e-07, + "logits/chosen": -2.835402727127075, + "logits/rejected": -2.8102526664733887, + "logps/chosen": -81.8633041381836, + "logps/rejected": -81.54666137695312, + "loss": 0.6754, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.24266846477985382, + "rewards/margins": 0.04269016906619072, + "rewards/rejected": -0.28535860776901245, + "step": 1810 + }, + { + "epoch": 0.31357684355616816, + "grad_norm": 5.182940483093262, + "learning_rate": 1.999877732196047e-07, + "logits/chosen": -2.8230953216552734, + "logits/rejected": -2.799881935119629, + "logps/chosen": -81.13172912597656, + "logps/rejected": -79.09537506103516, + "loss": 0.6814, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.25399473309516907, + "rewards/margins": 0.031056974083185196, + "rewards/rejected": -0.28505173325538635, + "step": 1820 + }, + { + "epoch": 0.31529979324603724, + "grad_norm": 6.279678821563721, + "learning_rate": 1.9998443726809344e-07, + "logits/chosen": -2.760938882827759, + "logits/rejected": -2.7558131217956543, + "logps/chosen": -77.61692810058594, + "logps/rejected": -80.39018249511719, + "loss": 0.6739, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2132168710231781, + "rewards/margins": 0.04549407586455345, + "rewards/rejected": -0.25871092081069946, + "step": 1830 + }, + { + "epoch": 0.31702274293590627, + "grad_norm": 5.1286301612854, + "learning_rate": 1.9998069943878452e-07, + "logits/chosen": -2.885629415512085, + "logits/rejected": -2.880615472793579, + "logps/chosen": -81.87084197998047, + "logps/rejected": -84.01935577392578, + "loss": 0.6761, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.259524941444397, + "rewards/margins": 0.04153280705213547, + "rewards/rejected": -0.30105772614479065, + "step": 1840 + }, + { + "epoch": 0.31874569262577535, + "grad_norm": 6.253596782684326, + "learning_rate": 1.9997655974670177e-07, + "logits/chosen": -2.8121185302734375, + "logits/rejected": -2.8132262229919434, + "logps/chosen": -80.99190521240234, + "logps/rejected": -83.85938262939453, + "loss": 0.6805, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.2681719660758972, + "rewards/margins": 0.032716695219278336, + "rewards/rejected": -0.30088865756988525, + "step": 1850 + }, + { + "epoch": 0.32046864231564437, + "grad_norm": 4.878740310668945, + "learning_rate": 1.9997201820848421e-07, + "logits/chosen": -2.771169424057007, + "logits/rejected": -2.7482142448425293, + "logps/chosen": -80.3410415649414, + "logps/rejected": -78.98155975341797, + "loss": 0.6681, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2096785604953766, + "rewards/margins": 0.056961387395858765, + "rewards/rejected": -0.26663994789123535, + "step": 1860 + }, + { + "epoch": 0.32219159200551345, + "grad_norm": 8.953726768493652, + "learning_rate": 1.999670748423862e-07, + "logits/chosen": -2.773393154144287, + "logits/rejected": -2.7547454833984375, + "logps/chosen": -77.43568420410156, + "logps/rejected": -80.14945983886719, + "loss": 0.6631, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.20601901412010193, + "rewards/margins": 0.06925632059574127, + "rewards/rejected": -0.2752753496170044, + "step": 1870 + }, + { + "epoch": 0.3239145416953825, + "grad_norm": 5.518745422363281, + "learning_rate": 1.9996172966827712e-07, + "logits/chosen": -2.8194966316223145, + "logits/rejected": -2.7963428497314453, + "logps/chosen": -76.18521118164062, + "logps/rejected": -78.38603210449219, + "loss": 0.668, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21804073452949524, + "rewards/margins": 0.0592648908495903, + "rewards/rejected": -0.27730563282966614, + "step": 1880 + }, + { + "epoch": 0.32563749138525155, + "grad_norm": 5.660772323608398, + "learning_rate": 1.9995598270764132e-07, + "logits/chosen": -2.8152401447296143, + "logits/rejected": -2.809969902038574, + "logps/chosen": -75.25090789794922, + "logps/rejected": -80.28077697753906, + "loss": 0.67, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.21531884372234344, + "rewards/margins": 0.05318418890237808, + "rewards/rejected": -0.2685030400753021, + "step": 1890 + }, + { + "epoch": 0.32736044107512063, + "grad_norm": 5.434048175811768, + "learning_rate": 1.9994983398357822e-07, + "logits/chosen": -2.803793430328369, + "logits/rejected": -2.7829272747039795, + "logps/chosen": -82.60762786865234, + "logps/rejected": -79.85516357421875, + "loss": 0.675, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.24345216155052185, + "rewards/margins": 0.043585650622844696, + "rewards/rejected": -0.28703781962394714, + "step": 1900 + }, + { + "epoch": 0.32908339076498966, + "grad_norm": 5.854109764099121, + "learning_rate": 1.9994328352080197e-07, + "logits/chosen": -2.7232251167297363, + "logits/rejected": -2.6980268955230713, + "logps/chosen": -81.93873596191406, + "logps/rejected": -87.40576171875, + "loss": 0.6598, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.27153441309928894, + "rewards/margins": 0.07509879767894745, + "rewards/rejected": -0.3466332256793976, + "step": 1910 + }, + { + "epoch": 0.33080634045485874, + "grad_norm": 7.13450288772583, + "learning_rate": 1.9993633134564157e-07, + "logits/chosen": -2.7804644107818604, + "logits/rejected": -2.7603726387023926, + "logps/chosen": -85.59014892578125, + "logps/rejected": -87.0826187133789, + "loss": 0.6706, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.2912139296531677, + "rewards/margins": 0.054294537752866745, + "rewards/rejected": -0.34550851583480835, + "step": 1920 + }, + { + "epoch": 0.33252929014472776, + "grad_norm": 7.671220779418945, + "learning_rate": 1.9992897748604057e-07, + "logits/chosen": -2.740054130554199, + "logits/rejected": -2.715630054473877, + "logps/chosen": -86.39445495605469, + "logps/rejected": -88.7936019897461, + "loss": 0.6712, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.3088904023170471, + "rewards/margins": 0.0533197820186615, + "rewards/rejected": -0.36221015453338623, + "step": 1930 + }, + { + "epoch": 0.33425223983459684, + "grad_norm": 6.598614692687988, + "learning_rate": 1.9992122197155713e-07, + "logits/chosen": -2.7623841762542725, + "logits/rejected": -2.752276659011841, + "logps/chosen": -79.55069732666016, + "logps/rejected": -82.56233215332031, + "loss": 0.6708, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27555832266807556, + "rewards/margins": 0.05399967357516289, + "rewards/rejected": -0.32955801486968994, + "step": 1940 + }, + { + "epoch": 0.33597518952446587, + "grad_norm": 8.005949020385742, + "learning_rate": 1.9991306483336379e-07, + "logits/chosen": -2.789032459259033, + "logits/rejected": -2.787119150161743, + "logps/chosen": -82.3895034790039, + "logps/rejected": -88.818603515625, + "loss": 0.6738, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.2936074137687683, + "rewards/margins": 0.04737875610589981, + "rewards/rejected": -0.3409861624240875, + "step": 1950 + }, + { + "epoch": 0.33769813921433495, + "grad_norm": 6.6296305656433105, + "learning_rate": 1.9990450610424739e-07, + "logits/chosen": -2.7821130752563477, + "logits/rejected": -2.7680463790893555, + "logps/chosen": -85.0245590209961, + "logps/rejected": -89.01289367675781, + "loss": 0.6709, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.29906749725341797, + "rewards/margins": 0.053921766579151154, + "rewards/rejected": -0.3529892563819885, + "step": 1960 + }, + { + "epoch": 0.33942108890420397, + "grad_norm": 5.420909881591797, + "learning_rate": 1.9989554581860885e-07, + "logits/chosen": -2.7975800037384033, + "logits/rejected": -2.7792773246765137, + "logps/chosen": -86.5243148803711, + "logps/rejected": -83.91753387451172, + "loss": 0.6821, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.29315876960754395, + "rewards/margins": 0.032392654567956924, + "rewards/rejected": -0.32555145025253296, + "step": 1970 + }, + { + "epoch": 0.34114403859407305, + "grad_norm": 5.429892063140869, + "learning_rate": 1.9988618401246327e-07, + "logits/chosen": -2.7424893379211426, + "logits/rejected": -2.7354159355163574, + "logps/chosen": -85.56155395507812, + "logps/rejected": -84.77156066894531, + "loss": 0.6859, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2920836806297302, + "rewards/margins": 0.022945603355765343, + "rewards/rejected": -0.31502923369407654, + "step": 1980 + }, + { + "epoch": 0.34286698828394213, + "grad_norm": 4.82956600189209, + "learning_rate": 1.9987642072343948e-07, + "logits/chosen": -2.8278892040252686, + "logits/rejected": -2.801731586456299, + "logps/chosen": -78.64983367919922, + "logps/rejected": -82.09465789794922, + "loss": 0.6565, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2664022147655487, + "rewards/margins": 0.08327129483222961, + "rewards/rejected": -0.34967344999313354, + "step": 1990 + }, + { + "epoch": 0.34458993797381116, + "grad_norm": 7.895046234130859, + "learning_rate": 1.9986625599078007e-07, + "logits/chosen": -2.777226686477661, + "logits/rejected": -2.7833473682403564, + "logps/chosen": -78.89508056640625, + "logps/rejected": -88.81587982177734, + "loss": 0.6751, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.2843848168849945, + "rewards/margins": 0.04450703412294388, + "rewards/rejected": -0.3288918137550354, + "step": 2000 + }, + { + "epoch": 0.34458993797381116, + "eval_logits/chosen": -2.8467745780944824, + "eval_logits/rejected": -2.843392848968506, + "eval_logps/chosen": -76.16111755371094, + "eval_logps/rejected": -83.7308349609375, + "eval_loss": 0.677007257938385, + "eval_rewards/accuracies": 0.5889869928359985, + "eval_rewards/chosen": -0.17145642638206482, + "eval_rewards/margins": 0.03835584595799446, + "eval_rewards/rejected": -0.20981229841709137, + "eval_runtime": 398.552, + "eval_samples_per_second": 10.799, + "eval_steps_per_second": 1.35, + "step": 2000 + }, + { + "epoch": 0.34631288766368024, + "grad_norm": 7.642477035522461, + "learning_rate": 1.9985568985534123e-07, + "logits/chosen": -2.80598521232605, + "logits/rejected": -2.784700393676758, + "logps/chosen": -80.71785736083984, + "logps/rejected": -80.69572448730469, + "loss": 0.6681, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.23994019627571106, + "rewards/margins": 0.05967951938509941, + "rewards/rejected": -0.29961973428726196, + "step": 2010 + }, + { + "epoch": 0.34803583735354926, + "grad_norm": 6.802818298339844, + "learning_rate": 1.9984472235959246e-07, + "logits/chosen": -2.7718281745910645, + "logits/rejected": -2.7579102516174316, + "logps/chosen": -77.16645812988281, + "logps/rejected": -87.4555435180664, + "loss": 0.6615, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.2625424563884735, + "rewards/margins": 0.07505720853805542, + "rewards/rejected": -0.33759966492652893, + "step": 2020 + }, + { + "epoch": 0.34975878704341834, + "grad_norm": 7.350240707397461, + "learning_rate": 1.9983335354761662e-07, + "logits/chosen": -2.8413634300231934, + "logits/rejected": -2.824218988418579, + "logps/chosen": -84.81204986572266, + "logps/rejected": -88.4823226928711, + "loss": 0.6693, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.27297335863113403, + "rewards/margins": 0.058346938341856, + "rewards/rejected": -0.33132028579711914, + "step": 2030 + }, + { + "epoch": 0.35148173673328736, + "grad_norm": 6.206594467163086, + "learning_rate": 1.9982158346510952e-07, + "logits/chosen": -2.740448474884033, + "logits/rejected": -2.735485076904297, + "logps/chosen": -81.70101165771484, + "logps/rejected": -87.45999908447266, + "loss": 0.6639, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2743123173713684, + "rewards/margins": 0.07054021954536438, + "rewards/rejected": -0.3448525071144104, + "step": 2040 + }, + { + "epoch": 0.35320468642315644, + "grad_norm": 6.593442440032959, + "learning_rate": 1.998094121593799e-07, + "logits/chosen": -2.806100845336914, + "logits/rejected": -2.7891507148742676, + "logps/chosen": -77.92963409423828, + "logps/rejected": -85.45118713378906, + "loss": 0.6689, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2585117518901825, + "rewards/margins": 0.058480918407440186, + "rewards/rejected": -0.31699270009994507, + "step": 2050 + }, + { + "epoch": 0.3549276361130255, + "grad_norm": 7.442837238311768, + "learning_rate": 1.9979683967934911e-07, + "logits/chosen": -2.809927463531494, + "logits/rejected": -2.784133195877075, + "logps/chosen": -80.66841125488281, + "logps/rejected": -82.72510528564453, + "loss": 0.6659, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.24928779900074005, + "rewards/margins": 0.06473737210035324, + "rewards/rejected": -0.3140251934528351, + "step": 2060 + }, + { + "epoch": 0.35665058580289455, + "grad_norm": 6.645026683807373, + "learning_rate": 1.9978386607555103e-07, + "logits/chosen": -2.825409412384033, + "logits/rejected": -2.8098556995391846, + "logps/chosen": -84.98794555664062, + "logps/rejected": -89.08955383300781, + "loss": 0.6676, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2859798073768616, + "rewards/margins": 0.06237654760479927, + "rewards/rejected": -0.34835633635520935, + "step": 2070 + }, + { + "epoch": 0.35837353549276363, + "grad_norm": 5.906277179718018, + "learning_rate": 1.9977049140013183e-07, + "logits/chosen": -2.758378744125366, + "logits/rejected": -2.7402453422546387, + "logps/chosen": -83.45210266113281, + "logps/rejected": -87.67256927490234, + "loss": 0.6625, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3064289689064026, + "rewards/margins": 0.0735897570848465, + "rewards/rejected": -0.3800187110900879, + "step": 2080 + }, + { + "epoch": 0.36009648518263265, + "grad_norm": 5.731607437133789, + "learning_rate": 1.997567157068497e-07, + "logits/chosen": -2.7923924922943115, + "logits/rejected": -2.7936103343963623, + "logps/chosen": -86.95465087890625, + "logps/rejected": -90.43916320800781, + "loss": 0.6733, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.31404390931129456, + "rewards/margins": 0.051842041313648224, + "rewards/rejected": -0.36588597297668457, + "step": 2090 + }, + { + "epoch": 0.36181943487250173, + "grad_norm": 5.9960479736328125, + "learning_rate": 1.997425390510747e-07, + "logits/chosen": -2.756856918334961, + "logits/rejected": -2.741403579711914, + "logps/chosen": -85.68098449707031, + "logps/rejected": -86.40949249267578, + "loss": 0.6662, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2941354513168335, + "rewards/margins": 0.0659971535205841, + "rewards/rejected": -0.3601325750350952, + "step": 2100 + }, + { + "epoch": 0.36354238456237076, + "grad_norm": 6.551288604736328, + "learning_rate": 1.9972796148978856e-07, + "logits/chosen": -2.7723071575164795, + "logits/rejected": -2.7736432552337646, + "logps/chosen": -80.19908905029297, + "logps/rejected": -90.48481750488281, + "loss": 0.6665, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.29734283685684204, + "rewards/margins": 0.06577859073877335, + "rewards/rejected": -0.3631214201450348, + "step": 2110 + }, + { + "epoch": 0.36526533425223984, + "grad_norm": 6.967728137969971, + "learning_rate": 1.9971298308158441e-07, + "logits/chosen": -2.7236945629119873, + "logits/rejected": -2.7059988975524902, + "logps/chosen": -80.61097717285156, + "logps/rejected": -84.89838409423828, + "loss": 0.6569, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.26060813665390015, + "rewards/margins": 0.08486412465572357, + "rewards/rejected": -0.3454722762107849, + "step": 2120 + }, + { + "epoch": 0.3669882839421089, + "grad_norm": 9.18038558959961, + "learning_rate": 1.9969760388666645e-07, + "logits/chosen": -2.7267282009124756, + "logits/rejected": -2.7112841606140137, + "logps/chosen": -85.85907745361328, + "logps/rejected": -92.39701843261719, + "loss": 0.6504, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3092494308948517, + "rewards/margins": 0.0988670364022255, + "rewards/rejected": -0.4081164300441742, + "step": 2130 + }, + { + "epoch": 0.36871123363197794, + "grad_norm": 8.384355545043945, + "learning_rate": 1.996818239668499e-07, + "logits/chosen": -2.7107224464416504, + "logits/rejected": -2.705289125442505, + "logps/chosen": -81.84632110595703, + "logps/rejected": -93.03125762939453, + "loss": 0.6584, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.32457980513572693, + "rewards/margins": 0.07963573187589645, + "rewards/rejected": -0.4042155146598816, + "step": 2140 + }, + { + "epoch": 0.370434183321847, + "grad_norm": 7.8044514656066895, + "learning_rate": 1.9966564338556065e-07, + "logits/chosen": -2.733471155166626, + "logits/rejected": -2.701479434967041, + "logps/chosen": -87.98902893066406, + "logps/rejected": -88.12610626220703, + "loss": 0.6534, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3127537965774536, + "rewards/margins": 0.09064503014087677, + "rewards/rejected": -0.4033988416194916, + "step": 2150 + }, + { + "epoch": 0.37215713301171605, + "grad_norm": 8.45705509185791, + "learning_rate": 1.9964906220783492e-07, + "logits/chosen": -2.716222047805786, + "logits/rejected": -2.705082893371582, + "logps/chosen": -94.84622955322266, + "logps/rejected": -92.92621612548828, + "loss": 0.6708, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3836338222026825, + "rewards/margins": 0.056928135454654694, + "rewards/rejected": -0.4405619502067566, + "step": 2160 + }, + { + "epoch": 0.3738800827015851, + "grad_norm": 8.92052936553955, + "learning_rate": 1.9963208050031922e-07, + "logits/chosen": -2.8072280883789062, + "logits/rejected": -2.7956135272979736, + "logps/chosen": -93.62696838378906, + "logps/rejected": -100.60514831542969, + "loss": 0.6451, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.386393666267395, + "rewards/margins": 0.11263259500265121, + "rewards/rejected": -0.49902623891830444, + "step": 2170 + }, + { + "epoch": 0.37560303239145415, + "grad_norm": 9.09363842010498, + "learning_rate": 1.9961469833126987e-07, + "logits/chosen": -2.8279776573181152, + "logits/rejected": -2.803985595703125, + "logps/chosen": -106.29798889160156, + "logps/rejected": -105.96732330322266, + "loss": 0.6672, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.46758827567100525, + "rewards/margins": 0.07012106478214264, + "rewards/rejected": -0.5377094149589539, + "step": 2180 + }, + { + "epoch": 0.37732598208132323, + "grad_norm": 7.870185375213623, + "learning_rate": 1.995969157705528e-07, + "logits/chosen": -2.871009588241577, + "logits/rejected": -2.8679919242858887, + "logps/chosen": -95.17460632324219, + "logps/rejected": -101.10989379882812, + "loss": 0.67, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4195151925086975, + "rewards/margins": 0.060843341052532196, + "rewards/rejected": -0.4803585410118103, + "step": 2190 + }, + { + "epoch": 0.37904893177119225, + "grad_norm": 8.457090377807617, + "learning_rate": 1.995787328896433e-07, + "logits/chosen": -2.7545523643493652, + "logits/rejected": -2.7420895099639893, + "logps/chosen": -90.79286193847656, + "logps/rejected": -100.09413146972656, + "loss": 0.659, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3792213797569275, + "rewards/margins": 0.08726942539215088, + "rewards/rejected": -0.4664907455444336, + "step": 2200 + }, + { + "epoch": 0.38077188146106133, + "grad_norm": 8.350848197937012, + "learning_rate": 1.9956014976162572e-07, + "logits/chosen": -2.796229839324951, + "logits/rejected": -2.7860822677612305, + "logps/chosen": -92.96107482910156, + "logps/rejected": -95.38506317138672, + "loss": 0.6708, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.37106844782829285, + "rewards/margins": 0.058512069284915924, + "rewards/rejected": -0.42958053946495056, + "step": 2210 + }, + { + "epoch": 0.3824948311509304, + "grad_norm": 8.519368171691895, + "learning_rate": 1.9954116646119315e-07, + "logits/chosen": -2.674560308456421, + "logits/rejected": -2.668351650238037, + "logps/chosen": -90.71675872802734, + "logps/rejected": -96.26646423339844, + "loss": 0.6669, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.35924988985061646, + "rewards/margins": 0.06914254277944565, + "rewards/rejected": -0.4283924698829651, + "step": 2220 + }, + { + "epoch": 0.38421778084079944, + "grad_norm": 8.899928092956543, + "learning_rate": 1.9952178306464708e-07, + "logits/chosen": -2.7807633876800537, + "logits/rejected": -2.7601094245910645, + "logps/chosen": -92.98501586914062, + "logps/rejected": -93.89694213867188, + "loss": 0.6722, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.38546445965766907, + "rewards/margins": 0.054776061326265335, + "rewards/rejected": -0.4402404725551605, + "step": 2230 + }, + { + "epoch": 0.3859407305306685, + "grad_norm": 7.316110610961914, + "learning_rate": 1.9950199964989728e-07, + "logits/chosen": -2.772390604019165, + "logits/rejected": -2.74361515045166, + "logps/chosen": -94.11860656738281, + "logps/rejected": -94.66634368896484, + "loss": 0.6754, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.367158979177475, + "rewards/margins": 0.050020165741443634, + "rewards/rejected": -0.4171791076660156, + "step": 2240 + }, + { + "epoch": 0.38766368022053754, + "grad_norm": 13.700274467468262, + "learning_rate": 1.9948181629646125e-07, + "logits/chosen": -2.707676410675049, + "logits/rejected": -2.6810202598571777, + "logps/chosen": -93.48914337158203, + "logps/rejected": -94.83229064941406, + "loss": 0.672, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3691152334213257, + "rewards/margins": 0.05425702780485153, + "rewards/rejected": -0.4233722686767578, + "step": 2250 + }, + { + "epoch": 0.3893866299104066, + "grad_norm": 7.7341108322143555, + "learning_rate": 1.99461233085464e-07, + "logits/chosen": -2.712756395339966, + "logits/rejected": -2.686300039291382, + "logps/chosen": -96.61649322509766, + "logps/rejected": -99.14938354492188, + "loss": 0.6654, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3552453815937042, + "rewards/margins": 0.07118260115385056, + "rewards/rejected": -0.4264279901981354, + "step": 2260 + }, + { + "epoch": 0.39110957960027565, + "grad_norm": 8.820086479187012, + "learning_rate": 1.9944025009963783e-07, + "logits/chosen": -2.701951265335083, + "logits/rejected": -2.6750802993774414, + "logps/chosen": -91.39891052246094, + "logps/rejected": -94.74635314941406, + "loss": 0.6633, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.364845335483551, + "rewards/margins": 0.07676877081394196, + "rewards/rejected": -0.4416140615940094, + "step": 2270 + }, + { + "epoch": 0.3928325292901447, + "grad_norm": 7.709187984466553, + "learning_rate": 1.9941886742332175e-07, + "logits/chosen": -2.724069833755493, + "logits/rejected": -2.720072031021118, + "logps/chosen": -89.43894958496094, + "logps/rejected": -98.11272430419922, + "loss": 0.6601, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.34747377038002014, + "rewards/margins": 0.08022281527519226, + "rewards/rejected": -0.42769655585289, + "step": 2280 + }, + { + "epoch": 0.3945554789800138, + "grad_norm": 7.868160247802734, + "learning_rate": 1.9939708514246143e-07, + "logits/chosen": -2.6722376346588135, + "logits/rejected": -2.6551549434661865, + "logps/chosen": -91.05497741699219, + "logps/rejected": -99.68929290771484, + "loss": 0.6515, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.376445472240448, + "rewards/margins": 0.09919819980859756, + "rewards/rejected": -0.47564369440078735, + "step": 2290 + }, + { + "epoch": 0.39627842866988283, + "grad_norm": 8.843903541564941, + "learning_rate": 1.9937490334460857e-07, + "logits/chosen": -2.7841269969940186, + "logits/rejected": -2.759320020675659, + "logps/chosen": -99.24239349365234, + "logps/rejected": -103.56510925292969, + "loss": 0.6549, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.41207608580589294, + "rewards/margins": 0.09394040703773499, + "rewards/rejected": -0.5060164928436279, + "step": 2300 + }, + { + "epoch": 0.3980013783597519, + "grad_norm": 7.929662704467773, + "learning_rate": 1.9935232211892083e-07, + "logits/chosen": -2.714627742767334, + "logits/rejected": -2.7024972438812256, + "logps/chosen": -96.11257934570312, + "logps/rejected": -103.85871887207031, + "loss": 0.655, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4413214325904846, + "rewards/margins": 0.09551285207271576, + "rewards/rejected": -0.5368342399597168, + "step": 2310 + }, + { + "epoch": 0.39972432804962094, + "grad_norm": 7.020380020141602, + "learning_rate": 1.9932934155616127e-07, + "logits/chosen": -2.7679104804992676, + "logits/rejected": -2.7354788780212402, + "logps/chosen": -104.21220397949219, + "logps/rejected": -106.46934509277344, + "loss": 0.6535, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.4562048316001892, + "rewards/margins": 0.09959544241428375, + "rewards/rejected": -0.5558002591133118, + "step": 2320 + }, + { + "epoch": 0.40144727773949, + "grad_norm": 8.07779598236084, + "learning_rate": 1.9930596174869797e-07, + "logits/chosen": -2.723388195037842, + "logits/rejected": -2.7130160331726074, + "logps/chosen": -99.20726013183594, + "logps/rejected": -105.5082015991211, + "loss": 0.642, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4184534549713135, + "rewards/margins": 0.12403968721628189, + "rewards/rejected": -0.5424931645393372, + "step": 2330 + }, + { + "epoch": 0.40317022742935904, + "grad_norm": 11.052251815795898, + "learning_rate": 1.992821827905039e-07, + "logits/chosen": -2.74686336517334, + "logits/rejected": -2.7400949001312256, + "logps/chosen": -95.10917663574219, + "logps/rejected": -102.4656982421875, + "loss": 0.6719, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.41894015669822693, + "rewards/margins": 0.05962671712040901, + "rewards/rejected": -0.47856688499450684, + "step": 2340 + }, + { + "epoch": 0.4048931771192281, + "grad_norm": 13.94660758972168, + "learning_rate": 1.9925800477715623e-07, + "logits/chosen": -2.7292239665985107, + "logits/rejected": -2.7181527614593506, + "logps/chosen": -97.06452941894531, + "logps/rejected": -103.667236328125, + "loss": 0.6482, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.3829440474510193, + "rewards/margins": 0.11175689846277237, + "rewards/rejected": -0.49470096826553345, + "step": 2350 + }, + { + "epoch": 0.4066161268090972, + "grad_norm": 9.01716136932373, + "learning_rate": 1.992334278058362e-07, + "logits/chosen": -2.7573258876800537, + "logits/rejected": -2.746138334274292, + "logps/chosen": -92.14654541015625, + "logps/rejected": -101.3851089477539, + "loss": 0.6496, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4030567705631256, + "rewards/margins": 0.10895220935344696, + "rewards/rejected": -0.5120089650154114, + "step": 2360 + }, + { + "epoch": 0.4083390764989662, + "grad_norm": 11.883419036865234, + "learning_rate": 1.9920845197532854e-07, + "logits/chosen": -2.75362229347229, + "logits/rejected": -2.7488625049591064, + "logps/chosen": -100.34485626220703, + "logps/rejected": -108.6867904663086, + "loss": 0.6528, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45487451553344727, + "rewards/margins": 0.10138414055109024, + "rewards/rejected": -0.5562586784362793, + "step": 2370 + }, + { + "epoch": 0.4100620261888353, + "grad_norm": 8.847671508789062, + "learning_rate": 1.991830773860212e-07, + "logits/chosen": -2.713022470474243, + "logits/rejected": -2.696528911590576, + "logps/chosen": -100.02056884765625, + "logps/rejected": -104.38777160644531, + "loss": 0.6686, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4528826177120209, + "rewards/margins": 0.0709812194108963, + "rewards/rejected": -0.5238637924194336, + "step": 2380 + }, + { + "epoch": 0.41178497587870433, + "grad_norm": 12.014663696289062, + "learning_rate": 1.9915730413990486e-07, + "logits/chosen": -2.75661301612854, + "logits/rejected": -2.738581895828247, + "logps/chosen": -106.23995208740234, + "logps/rejected": -110.62483978271484, + "loss": 0.6544, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4921490550041199, + "rewards/margins": 0.0999661311507225, + "rewards/rejected": -0.5921152830123901, + "step": 2390 + }, + { + "epoch": 0.4135079255685734, + "grad_norm": 10.028485298156738, + "learning_rate": 1.9913113234057264e-07, + "logits/chosen": -2.7949843406677246, + "logits/rejected": -2.7854666709899902, + "logps/chosen": -96.82019805908203, + "logps/rejected": -107.0666275024414, + "loss": 0.6518, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4390484690666199, + "rewards/margins": 0.10471247136592865, + "rewards/rejected": -0.5437608957290649, + "step": 2400 + }, + { + "epoch": 0.4135079255685734, + "eval_logits/chosen": -2.79258131980896, + "eval_logits/rejected": -2.7893073558807373, + "eval_logps/chosen": -96.29039764404297, + "eval_logps/rejected": -106.56369018554688, + "eval_loss": 0.6676347851753235, + "eval_rewards/accuracies": 0.606877326965332, + "eval_rewards/chosen": -0.37274929881095886, + "eval_rewards/margins": 0.06539163738489151, + "eval_rewards/rejected": -0.43814095854759216, + "eval_runtime": 384.7623, + "eval_samples_per_second": 11.186, + "eval_steps_per_second": 1.398, + "step": 2400 + }, + { + "epoch": 0.41523087525844243, + "grad_norm": 11.282934188842773, + "learning_rate": 1.9910456209321956e-07, + "logits/chosen": -2.7576167583465576, + "logits/rejected": -2.73296856880188, + "logps/chosen": -101.84904479980469, + "logps/rejected": -103.86643981933594, + "loss": 0.6567, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.4611765742301941, + "rewards/margins": 0.09233228862285614, + "rewards/rejected": -0.5535088777542114, + "step": 2410 + }, + { + "epoch": 0.4169538249483115, + "grad_norm": 13.180535316467285, + "learning_rate": 1.9907759350464212e-07, + "logits/chosen": -2.7671501636505127, + "logits/rejected": -2.7475426197052, + "logps/chosen": -102.40633392333984, + "logps/rejected": -110.27593994140625, + "loss": 0.649, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4542531371116638, + "rewards/margins": 0.1068098396062851, + "rewards/rejected": -0.5610629916191101, + "step": 2420 + }, + { + "epoch": 0.41867677463818054, + "grad_norm": 9.693948745727539, + "learning_rate": 1.9905022668323803e-07, + "logits/chosen": -2.7108891010284424, + "logits/rejected": -2.6946499347686768, + "logps/chosen": -100.01832580566406, + "logps/rejected": -101.14888000488281, + "loss": 0.6754, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4425184726715088, + "rewards/margins": 0.05519191175699234, + "rewards/rejected": -0.49771031737327576, + "step": 2430 + }, + { + "epoch": 0.4203997243280496, + "grad_norm": 13.568098068237305, + "learning_rate": 1.9902246173900554e-07, + "logits/chosen": -2.7337582111358643, + "logits/rejected": -2.724292755126953, + "logps/chosen": -96.25423431396484, + "logps/rejected": -104.6993179321289, + "loss": 0.6456, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.39634186029434204, + "rewards/margins": 0.1141355037689209, + "rewards/rejected": -0.5104773640632629, + "step": 2440 + }, + { + "epoch": 0.4221226740179187, + "grad_norm": 9.999527931213379, + "learning_rate": 1.9899429878354318e-07, + "logits/chosen": -2.710087776184082, + "logits/rejected": -2.6934750080108643, + "logps/chosen": -100.3898696899414, + "logps/rejected": -104.9481201171875, + "loss": 0.6662, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4584103524684906, + "rewards/margins": 0.07401929795742035, + "rewards/rejected": -0.532429575920105, + "step": 2450 + }, + { + "epoch": 0.4238456237077877, + "grad_norm": 6.6206536293029785, + "learning_rate": 1.989657379300492e-07, + "logits/chosen": -2.712404727935791, + "logits/rejected": -2.688204288482666, + "logps/chosen": -100.12140655517578, + "logps/rejected": -103.390625, + "loss": 0.6607, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.42770424485206604, + "rewards/margins": 0.09007831662893295, + "rewards/rejected": -0.5177825093269348, + "step": 2460 + }, + { + "epoch": 0.4255685733976568, + "grad_norm": 10.734334945678711, + "learning_rate": 1.9893677929332123e-07, + "logits/chosen": -2.7907164096832275, + "logits/rejected": -2.77321195602417, + "logps/chosen": -99.40889739990234, + "logps/rejected": -104.71065521240234, + "loss": 0.6583, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4294106364250183, + "rewards/margins": 0.09319963306188583, + "rewards/rejected": -0.5226103067398071, + "step": 2470 + }, + { + "epoch": 0.4272915230875258, + "grad_norm": 9.096755981445312, + "learning_rate": 1.9890742298975574e-07, + "logits/chosen": -2.7226462364196777, + "logits/rejected": -2.6995315551757812, + "logps/chosen": -99.27317810058594, + "logps/rejected": -102.8205795288086, + "loss": 0.6601, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4444698393344879, + "rewards/margins": 0.08868283033370972, + "rewards/rejected": -0.53315269947052, + "step": 2480 + }, + { + "epoch": 0.4290144727773949, + "grad_norm": 12.619534492492676, + "learning_rate": 1.9887766913734748e-07, + "logits/chosen": -2.735262870788574, + "logits/rejected": -2.7315163612365723, + "logps/chosen": -91.02969360351562, + "logps/rejected": -103.66337585449219, + "loss": 0.6457, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.407538503408432, + "rewards/margins": 0.1184179037809372, + "rewards/rejected": -0.5259564518928528, + "step": 2490 + }, + { + "epoch": 0.43073742246726393, + "grad_norm": 9.47433853149414, + "learning_rate": 1.9884751785568928e-07, + "logits/chosen": -2.763235569000244, + "logits/rejected": -2.744462490081787, + "logps/chosen": -107.6962890625, + "logps/rejected": -116.11775207519531, + "loss": 0.6578, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5090689063072205, + "rewards/margins": 0.09511864930391312, + "rewards/rejected": -0.6041876077651978, + "step": 2500 + }, + { + "epoch": 0.432460372157133, + "grad_norm": 9.720632553100586, + "learning_rate": 1.9881696926597125e-07, + "logits/chosen": -2.6765334606170654, + "logits/rejected": -2.6696765422821045, + "logps/chosen": -101.8257827758789, + "logps/rejected": -110.89205169677734, + "loss": 0.6486, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4863107204437256, + "rewards/margins": 0.11515744030475616, + "rewards/rejected": -0.6014681458473206, + "step": 2510 + }, + { + "epoch": 0.4341833218470021, + "grad_norm": 11.542259216308594, + "learning_rate": 1.987860234909805e-07, + "logits/chosen": -2.6656720638275146, + "logits/rejected": -2.6460516452789307, + "logps/chosen": -103.51522064208984, + "logps/rejected": -109.52400207519531, + "loss": 0.6524, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5028563737869263, + "rewards/margins": 0.10998809337615967, + "rewards/rejected": -0.6128444075584412, + "step": 2520 + }, + { + "epoch": 0.4359062715368711, + "grad_norm": 11.412103652954102, + "learning_rate": 1.987546806551006e-07, + "logits/chosen": -2.726616382598877, + "logits/rejected": -2.72737455368042, + "logps/chosen": -101.48871612548828, + "logps/rejected": -109.21578216552734, + "loss": 0.6715, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.4935535490512848, + "rewards/margins": 0.06767278909683228, + "rewards/rejected": -0.5612263083457947, + "step": 2530 + }, + { + "epoch": 0.4376292212267402, + "grad_norm": 11.56679630279541, + "learning_rate": 1.9872294088431105e-07, + "logits/chosen": -2.719217538833618, + "logits/rejected": -2.706432580947876, + "logps/chosen": -106.15666198730469, + "logps/rejected": -116.7001724243164, + "loss": 0.6531, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5228097438812256, + "rewards/margins": 0.11348120123147964, + "rewards/rejected": -0.6362909078598022, + "step": 2540 + }, + { + "epoch": 0.4393521709166092, + "grad_norm": 15.340383529663086, + "learning_rate": 1.9869080430618684e-07, + "logits/chosen": -2.716742515563965, + "logits/rejected": -2.690329074859619, + "logps/chosen": -109.13045501708984, + "logps/rejected": -112.8603744506836, + "loss": 0.6542, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.4905763566493988, + "rewards/margins": 0.1051291674375534, + "rewards/rejected": -0.595705509185791, + "step": 2550 + }, + { + "epoch": 0.4410751206064783, + "grad_norm": 13.995706558227539, + "learning_rate": 1.9865827104989774e-07, + "logits/chosen": -2.7623062133789062, + "logits/rejected": -2.743518352508545, + "logps/chosen": -105.69709777832031, + "logps/rejected": -112.81187438964844, + "loss": 0.6601, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.49545541405677795, + "rewards/margins": 0.09465087950229645, + "rewards/rejected": -0.5901063084602356, + "step": 2560 + }, + { + "epoch": 0.4427980702963473, + "grad_norm": 11.846534729003906, + "learning_rate": 1.9862534124620814e-07, + "logits/chosen": -2.704515218734741, + "logits/rejected": -2.6942319869995117, + "logps/chosen": -114.15962982177734, + "logps/rejected": -117.64332580566406, + "loss": 0.6697, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5424486398696899, + "rewards/margins": 0.07557938992977142, + "rewards/rejected": -0.6180279850959778, + "step": 2570 + }, + { + "epoch": 0.4445210199862164, + "grad_norm": 13.413008689880371, + "learning_rate": 1.9859201502747614e-07, + "logits/chosen": -2.722397804260254, + "logits/rejected": -2.7104969024658203, + "logps/chosen": -109.5277099609375, + "logps/rejected": -115.61268615722656, + "loss": 0.6698, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.5544854998588562, + "rewards/margins": 0.07775478065013885, + "rewards/rejected": -0.6322402954101562, + "step": 2580 + }, + { + "epoch": 0.4462439696760855, + "grad_norm": 10.059220314025879, + "learning_rate": 1.985582925276533e-07, + "logits/chosen": -2.694520950317383, + "logits/rejected": -2.6677818298339844, + "logps/chosen": -107.39048767089844, + "logps/rejected": -109.03306579589844, + "loss": 0.6586, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5027720332145691, + "rewards/margins": 0.08564041554927826, + "rewards/rejected": -0.5884124040603638, + "step": 2590 + }, + { + "epoch": 0.4479669193659545, + "grad_norm": 14.61945629119873, + "learning_rate": 1.9852417388228392e-07, + "logits/chosen": -2.7207679748535156, + "logits/rejected": -2.6922860145568848, + "logps/chosen": -110.1673355102539, + "logps/rejected": -108.0245361328125, + "loss": 0.6654, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.515986442565918, + "rewards/margins": 0.08189831674098969, + "rewards/rejected": -0.5978847742080688, + "step": 2600 + }, + { + "epoch": 0.4496898690558236, + "grad_norm": 10.718831062316895, + "learning_rate": 1.9848965922850464e-07, + "logits/chosen": -2.703439235687256, + "logits/rejected": -2.6759893894195557, + "logps/chosen": -105.53524017333984, + "logps/rejected": -105.3673324584961, + "loss": 0.6693, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.4681882858276367, + "rewards/margins": 0.07423902302980423, + "rewards/rejected": -0.5424273610115051, + "step": 2610 + }, + { + "epoch": 0.4514128187456926, + "grad_norm": 10.117892265319824, + "learning_rate": 1.9845474870504378e-07, + "logits/chosen": -2.7295031547546387, + "logits/rejected": -2.7077276706695557, + "logps/chosen": -93.91828918457031, + "logps/rejected": -104.33548736572266, + "loss": 0.6406, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.4194442331790924, + "rewards/margins": 0.12605123221874237, + "rewards/rejected": -0.5454954504966736, + "step": 2620 + }, + { + "epoch": 0.4531357684355617, + "grad_norm": 8.47807788848877, + "learning_rate": 1.984194424522208e-07, + "logits/chosen": -2.667053699493408, + "logits/rejected": -2.6478772163391113, + "logps/chosen": -96.94065856933594, + "logps/rejected": -108.85382080078125, + "loss": 0.6343, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.41304540634155273, + "rewards/margins": 0.14610633254051208, + "rewards/rejected": -0.5591517686843872, + "step": 2630 + }, + { + "epoch": 0.4548587181254307, + "grad_norm": 11.933856010437012, + "learning_rate": 1.9838374061194575e-07, + "logits/chosen": -2.683161497116089, + "logits/rejected": -2.6706173419952393, + "logps/chosen": -97.94457244873047, + "logps/rejected": -106.34178161621094, + "loss": 0.6477, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.46626657247543335, + "rewards/margins": 0.11536351591348648, + "rewards/rejected": -0.5816301107406616, + "step": 2640 + }, + { + "epoch": 0.4565816678152998, + "grad_norm": 10.261037826538086, + "learning_rate": 1.983476433277188e-07, + "logits/chosen": -2.6465036869049072, + "logits/rejected": -2.635824203491211, + "logps/chosen": -102.07220458984375, + "logps/rejected": -112.19637298583984, + "loss": 0.6536, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4834628701210022, + "rewards/margins": 0.10731105506420135, + "rewards/rejected": -0.59077388048172, + "step": 2650 + }, + { + "epoch": 0.4583046175051689, + "grad_norm": 11.558037757873535, + "learning_rate": 1.9831115074462944e-07, + "logits/chosen": -2.6853396892547607, + "logits/rejected": -2.6572957038879395, + "logps/chosen": -109.86177825927734, + "logps/rejected": -115.7259750366211, + "loss": 0.6476, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5141522288322449, + "rewards/margins": 0.12386395037174225, + "rewards/rejected": -0.6380161046981812, + "step": 2660 + }, + { + "epoch": 0.4600275671950379, + "grad_norm": 12.215116500854492, + "learning_rate": 1.982742630093561e-07, + "logits/chosen": -2.6593005657196045, + "logits/rejected": -2.6388514041900635, + "logps/chosen": -115.66468811035156, + "logps/rejected": -121.42977142333984, + "loss": 0.6646, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.6060430407524109, + "rewards/margins": 0.09283698350191116, + "rewards/rejected": -0.6988800764083862, + "step": 2670 + }, + { + "epoch": 0.461750516884907, + "grad_norm": 10.993180274963379, + "learning_rate": 1.9823698027016548e-07, + "logits/chosen": -2.7275149822235107, + "logits/rejected": -2.703504800796509, + "logps/chosen": -114.2564926147461, + "logps/rejected": -116.74766540527344, + "loss": 0.6615, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.546272337436676, + "rewards/margins": 0.09389664232730865, + "rewards/rejected": -0.6401689648628235, + "step": 2680 + }, + { + "epoch": 0.463473466574776, + "grad_norm": 11.470463752746582, + "learning_rate": 1.98199302676912e-07, + "logits/chosen": -2.690214157104492, + "logits/rejected": -2.6763110160827637, + "logps/chosen": -100.8199462890625, + "logps/rejected": -111.06422424316406, + "loss": 0.6436, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.49626702070236206, + "rewards/margins": 0.12201160192489624, + "rewards/rejected": -0.6182786226272583, + "step": 2690 + }, + { + "epoch": 0.4651964162646451, + "grad_norm": 9.260663032531738, + "learning_rate": 1.9816123038103701e-07, + "logits/chosen": -2.6932690143585205, + "logits/rejected": -2.6703147888183594, + "logps/chosen": -99.10530090332031, + "logps/rejected": -108.68851470947266, + "loss": 0.6524, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.45348605513572693, + "rewards/margins": 0.10938525199890137, + "rewards/rejected": -0.5628713369369507, + "step": 2700 + }, + { + "epoch": 0.4669193659545141, + "grad_norm": 11.823142051696777, + "learning_rate": 1.9812276353556852e-07, + "logits/chosen": -2.716191053390503, + "logits/rejected": -2.698429584503174, + "logps/chosen": -105.06886291503906, + "logps/rejected": -108.39021301269531, + "loss": 0.6578, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.44185343384742737, + "rewards/margins": 0.09492018818855286, + "rewards/rejected": -0.536773681640625, + "step": 2710 + }, + { + "epoch": 0.4686423156443832, + "grad_norm": 11.33167839050293, + "learning_rate": 1.9808390229512026e-07, + "logits/chosen": -2.686174154281616, + "logits/rejected": -2.6930418014526367, + "logps/chosen": -99.7154312133789, + "logps/rejected": -110.9734878540039, + "loss": 0.6506, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4637439250946045, + "rewards/margins": 0.11209309101104736, + "rewards/rejected": -0.5758370161056519, + "step": 2720 + }, + { + "epoch": 0.4703652653342522, + "grad_norm": 13.145691871643066, + "learning_rate": 1.980446468158912e-07, + "logits/chosen": -2.693547248840332, + "logits/rejected": -2.684006929397583, + "logps/chosen": -108.5434341430664, + "logps/rejected": -113.65617370605469, + "loss": 0.6567, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4874972403049469, + "rewards/margins": 0.10460158437490463, + "rewards/rejected": -0.5920988321304321, + "step": 2730 + }, + { + "epoch": 0.4720882150241213, + "grad_norm": 10.7879638671875, + "learning_rate": 1.9800499725566506e-07, + "logits/chosen": -2.676889657974243, + "logits/rejected": -2.6644585132598877, + "logps/chosen": -100.70921325683594, + "logps/rejected": -102.6191177368164, + "loss": 0.6691, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.4652387499809265, + "rewards/margins": 0.07463903725147247, + "rewards/rejected": -0.5398778319358826, + "step": 2740 + }, + { + "epoch": 0.4738111647139904, + "grad_norm": 12.473271369934082, + "learning_rate": 1.9796495377380933e-07, + "logits/chosen": -2.6230242252349854, + "logits/rejected": -2.6228199005126953, + "logps/chosen": -92.10359191894531, + "logps/rejected": -106.92274475097656, + "loss": 0.6424, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.42038828134536743, + "rewards/margins": 0.1267349123954773, + "rewards/rejected": -0.5471231341362, + "step": 2750 + }, + { + "epoch": 0.4755341144038594, + "grad_norm": 12.147249221801758, + "learning_rate": 1.9792451653127496e-07, + "logits/chosen": -2.6653361320495605, + "logits/rejected": -2.657927989959717, + "logps/chosen": -97.95817565917969, + "logps/rejected": -110.66314697265625, + "loss": 0.6398, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.42441844940185547, + "rewards/margins": 0.1387939751148224, + "rewards/rejected": -0.5632123947143555, + "step": 2760 + }, + { + "epoch": 0.4772570640937285, + "grad_norm": 10.381051063537598, + "learning_rate": 1.9788368569059551e-07, + "logits/chosen": -2.7360317707061768, + "logits/rejected": -2.7096409797668457, + "logps/chosen": -106.78315734863281, + "logps/rejected": -111.8558349609375, + "loss": 0.6596, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5055192112922668, + "rewards/margins": 0.0921645388007164, + "rewards/rejected": -0.5976837873458862, + "step": 2770 + }, + { + "epoch": 0.4789800137835975, + "grad_norm": 13.484970092773438, + "learning_rate": 1.9784246141588662e-07, + "logits/chosen": -2.6131300926208496, + "logits/rejected": -2.5922048091888428, + "logps/chosen": -112.4483642578125, + "logps/rejected": -119.95368957519531, + "loss": 0.6543, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5929507613182068, + "rewards/margins": 0.10826816409826279, + "rewards/rejected": -0.7012189030647278, + "step": 2780 + }, + { + "epoch": 0.4807029634734666, + "grad_norm": 12.457329750061035, + "learning_rate": 1.9780084387284535e-07, + "logits/chosen": -2.688363552093506, + "logits/rejected": -2.664832353591919, + "logps/chosen": -109.06170654296875, + "logps/rejected": -117.6565933227539, + "loss": 0.6457, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5575080513954163, + "rewards/margins": 0.12331205606460571, + "rewards/rejected": -0.680820107460022, + "step": 2790 + }, + { + "epoch": 0.4824259131633356, + "grad_norm": 13.478117942810059, + "learning_rate": 1.977588332287493e-07, + "logits/chosen": -2.707609176635742, + "logits/rejected": -2.6857008934020996, + "logps/chosen": -124.88829040527344, + "logps/rejected": -128.7651824951172, + "loss": 0.6695, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6486111283302307, + "rewards/margins": 0.08287011831998825, + "rewards/rejected": -0.7314812541007996, + "step": 2800 + }, + { + "epoch": 0.4824259131633356, + "eval_logits/chosen": -2.7450242042541504, + "eval_logits/rejected": -2.7414777278900146, + "eval_logps/chosen": -106.3523178100586, + "eval_logps/rejected": -118.35004425048828, + "eval_loss": 0.6630884408950806, + "eval_rewards/accuracies": 0.6140799522399902, + "eval_rewards/chosen": -0.47336843609809875, + "eval_rewards/margins": 0.0826360210776329, + "eval_rewards/rejected": -0.5560044646263123, + "eval_runtime": 384.301, + "eval_samples_per_second": 11.2, + "eval_steps_per_second": 1.4, + "step": 2800 + }, + { + "epoch": 0.4841488628532047, + "grad_norm": 16.544485092163086, + "learning_rate": 1.9771642965245623e-07, + "logits/chosen": -2.634906768798828, + "logits/rejected": -2.6151344776153564, + "logps/chosen": -112.07231140136719, + "logps/rejected": -121.56315612792969, + "loss": 0.656, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5526391863822937, + "rewards/margins": 0.10764841735363007, + "rewards/rejected": -0.6602876782417297, + "step": 2810 + }, + { + "epoch": 0.48587181254307377, + "grad_norm": 13.295195579528809, + "learning_rate": 1.9767363331440324e-07, + "logits/chosen": -2.7115139961242676, + "logits/rejected": -2.7019412517547607, + "logps/chosen": -111.91889953613281, + "logps/rejected": -114.33284759521484, + "loss": 0.6771, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.5611149072647095, + "rewards/margins": 0.06479386240243912, + "rewards/rejected": -0.6259086728096008, + "step": 2820 + }, + { + "epoch": 0.4875947622329428, + "grad_norm": 10.603714942932129, + "learning_rate": 1.9763044438660606e-07, + "logits/chosen": -2.591468334197998, + "logits/rejected": -2.5788750648498535, + "logps/chosen": -108.39720153808594, + "logps/rejected": -118.72285461425781, + "loss": 0.6448, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5164579153060913, + "rewards/margins": 0.12574324011802673, + "rewards/rejected": -0.6422011852264404, + "step": 2830 + }, + { + "epoch": 0.48931771192281187, + "grad_norm": 10.576966285705566, + "learning_rate": 1.9758686304265845e-07, + "logits/chosen": -2.697780132293701, + "logits/rejected": -2.6885037422180176, + "logps/chosen": -109.06404876708984, + "logps/rejected": -115.7732162475586, + "loss": 0.6539, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5489058494567871, + "rewards/margins": 0.10463793575763702, + "rewards/rejected": -0.6535437703132629, + "step": 2840 + }, + { + "epoch": 0.4910406616126809, + "grad_norm": 14.944794654846191, + "learning_rate": 1.975428894577314e-07, + "logits/chosen": -2.6671526432037354, + "logits/rejected": -2.653252124786377, + "logps/chosen": -110.45272064208984, + "logps/rejected": -124.46026611328125, + "loss": 0.6443, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.5722262263298035, + "rewards/margins": 0.13191375136375427, + "rewards/rejected": -0.7041400074958801, + "step": 2850 + }, + { + "epoch": 0.49276361130255, + "grad_norm": 11.296040534973145, + "learning_rate": 1.9749852380857247e-07, + "logits/chosen": -2.6553471088409424, + "logits/rejected": -2.637782573699951, + "logps/chosen": -112.59574890136719, + "logps/rejected": -122.50654602050781, + "loss": 0.6428, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5746721029281616, + "rewards/margins": 0.13814301788806915, + "rewards/rejected": -0.7128151655197144, + "step": 2860 + }, + { + "epoch": 0.494486560992419, + "grad_norm": 11.216682434082031, + "learning_rate": 1.9745376627350515e-07, + "logits/chosen": -2.7220160961151123, + "logits/rejected": -2.7088305950164795, + "logps/chosen": -113.31159973144531, + "logps/rejected": -120.91353607177734, + "loss": 0.6485, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5657340884208679, + "rewards/margins": 0.12379582971334457, + "rewards/rejected": -0.6895298957824707, + "step": 2870 + }, + { + "epoch": 0.4962095106822881, + "grad_norm": 11.167985916137695, + "learning_rate": 1.9740861703242797e-07, + "logits/chosen": -2.7385172843933105, + "logits/rejected": -2.715331554412842, + "logps/chosen": -111.76835632324219, + "logps/rejected": -119.62261962890625, + "loss": 0.6401, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5546174645423889, + "rewards/margins": 0.14337070286273956, + "rewards/rejected": -0.6979882121086121, + "step": 2880 + }, + { + "epoch": 0.49793246037215716, + "grad_norm": 14.719575881958008, + "learning_rate": 1.97363076266814e-07, + "logits/chosen": -2.737037181854248, + "logits/rejected": -2.729886531829834, + "logps/chosen": -112.28880310058594, + "logps/rejected": -123.53924560546875, + "loss": 0.6474, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5839313268661499, + "rewards/margins": 0.13259650766849518, + "rewards/rejected": -0.7165278196334839, + "step": 2890 + }, + { + "epoch": 0.4996554100620262, + "grad_norm": 15.115251541137695, + "learning_rate": 1.9731714415970998e-07, + "logits/chosen": -2.683711528778076, + "logits/rejected": -2.678250789642334, + "logps/chosen": -109.89457702636719, + "logps/rejected": -120.73185729980469, + "loss": 0.6465, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5626266598701477, + "rewards/margins": 0.13326212763786316, + "rewards/rejected": -0.6958888173103333, + "step": 2900 + }, + { + "epoch": 0.5013783597518953, + "grad_norm": 14.059300422668457, + "learning_rate": 1.9727082089573552e-07, + "logits/chosen": -2.7109973430633545, + "logits/rejected": -2.700655221939087, + "logps/chosen": -117.3915786743164, + "logps/rejected": -132.0679931640625, + "loss": 0.6301, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6010380983352661, + "rewards/margins": 0.16865003108978271, + "rewards/rejected": -0.7696880102157593, + "step": 2910 + }, + { + "epoch": 0.5031013094417643, + "grad_norm": 12.552085876464844, + "learning_rate": 1.9722410666108251e-07, + "logits/chosen": -2.6676573753356934, + "logits/rejected": -2.6615819931030273, + "logps/chosen": -117.3313980102539, + "logps/rejected": -135.73495483398438, + "loss": 0.6261, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6300705671310425, + "rewards/margins": 0.1892434060573578, + "rewards/rejected": -0.8193138837814331, + "step": 2920 + }, + { + "epoch": 0.5048242591316333, + "grad_norm": 9.844046592712402, + "learning_rate": 1.9717700164351435e-07, + "logits/chosen": -2.638333559036255, + "logits/rejected": -2.618190288543701, + "logps/chosen": -119.87908935546875, + "logps/rejected": -129.7265625, + "loss": 0.6452, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6576894521713257, + "rewards/margins": 0.1377858966588974, + "rewards/rejected": -0.7954753637313843, + "step": 2930 + }, + { + "epoch": 0.5065472088215024, + "grad_norm": 13.81851577758789, + "learning_rate": 1.9712950603236508e-07, + "logits/chosen": -2.6915862560272217, + "logits/rejected": -2.664580821990967, + "logps/chosen": -115.47206115722656, + "logps/rejected": -120.67753601074219, + "loss": 0.6709, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6299867630004883, + "rewards/margins": 0.07517345994710922, + "rewards/rejected": -0.7051601409912109, + "step": 2940 + }, + { + "epoch": 0.5082701585113715, + "grad_norm": 16.14995765686035, + "learning_rate": 1.9708162001853873e-07, + "logits/chosen": -2.679628372192383, + "logits/rejected": -2.6672160625457764, + "logps/chosen": -115.45826721191406, + "logps/rejected": -128.9546661376953, + "loss": 0.6392, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6021052002906799, + "rewards/margins": 0.150004580616951, + "rewards/rejected": -0.7521097660064697, + "step": 2950 + }, + { + "epoch": 0.5099931082012406, + "grad_norm": 14.37147045135498, + "learning_rate": 1.9703334379450855e-07, + "logits/chosen": -2.662416458129883, + "logits/rejected": -2.642178535461426, + "logps/chosen": -113.80577087402344, + "logps/rejected": -125.65077209472656, + "loss": 0.6401, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5899726152420044, + "rewards/margins": 0.1525414139032364, + "rewards/rejected": -0.7425141334533691, + "step": 2960 + }, + { + "epoch": 0.5117160578911096, + "grad_norm": 15.08690357208252, + "learning_rate": 1.969846775543161e-07, + "logits/chosen": -2.6417126655578613, + "logits/rejected": -2.6231141090393066, + "logps/chosen": -120.83221435546875, + "logps/rejected": -128.4465789794922, + "loss": 0.6525, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6522833704948425, + "rewards/margins": 0.11867114156484604, + "rewards/rejected": -0.7709546089172363, + "step": 2970 + }, + { + "epoch": 0.5134390075809786, + "grad_norm": 18.869640350341797, + "learning_rate": 1.9693562149357072e-07, + "logits/chosen": -2.600480794906616, + "logits/rejected": -2.5805163383483887, + "logps/chosen": -113.0972900390625, + "logps/rejected": -125.45975494384766, + "loss": 0.6352, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5786206126213074, + "rewards/margins": 0.1582469642162323, + "rewards/rejected": -0.7368675470352173, + "step": 2980 + }, + { + "epoch": 0.5151619572708477, + "grad_norm": 13.40176010131836, + "learning_rate": 1.9688617580944843e-07, + "logits/chosen": -2.647470712661743, + "logits/rejected": -2.635159969329834, + "logps/chosen": -123.00044250488281, + "logps/rejected": -128.8345489501953, + "loss": 0.6638, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6534823179244995, + "rewards/margins": 0.09893546253442764, + "rewards/rejected": -0.7524177432060242, + "step": 2990 + }, + { + "epoch": 0.5168849069607168, + "grad_norm": 11.170113563537598, + "learning_rate": 1.9683634070069143e-07, + "logits/chosen": -2.6602885723114014, + "logits/rejected": -2.654470920562744, + "logps/chosen": -114.0888671875, + "logps/rejected": -125.89949798583984, + "loss": 0.6579, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6134128570556641, + "rewards/margins": 0.10806174576282501, + "rewards/rejected": -0.7214745283126831, + "step": 3000 + }, + { + "epoch": 0.5186078566505858, + "grad_norm": 11.310858726501465, + "learning_rate": 1.967861163676071e-07, + "logits/chosen": -2.6761560440063477, + "logits/rejected": -2.6583657264709473, + "logps/chosen": -118.22630310058594, + "logps/rejected": -124.42864990234375, + "loss": 0.663, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6270531415939331, + "rewards/margins": 0.09819537401199341, + "rewards/rejected": -0.7252485156059265, + "step": 3010 + }, + { + "epoch": 0.5203308063404548, + "grad_norm": 16.713150024414062, + "learning_rate": 1.9673550301206733e-07, + "logits/chosen": -2.722543239593506, + "logits/rejected": -2.7021353244781494, + "logps/chosen": -119.0759048461914, + "logps/rejected": -125.45185852050781, + "loss": 0.6528, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6386953592300415, + "rewards/margins": 0.12587383389472961, + "rewards/rejected": -0.764569103717804, + "step": 3020 + }, + { + "epoch": 0.5220537560303239, + "grad_norm": 10.162747383117676, + "learning_rate": 1.9668450083750762e-07, + "logits/chosen": -2.6935548782348633, + "logits/rejected": -2.6769282817840576, + "logps/chosen": -115.16400146484375, + "logps/rejected": -122.29130554199219, + "loss": 0.6621, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5884724855422974, + "rewards/margins": 0.09111051261425018, + "rewards/rejected": -0.679582953453064, + "step": 3030 + }, + { + "epoch": 0.523776705720193, + "grad_norm": 10.428220748901367, + "learning_rate": 1.9663311004892628e-07, + "logits/chosen": -2.7106406688690186, + "logits/rejected": -2.707573175430298, + "logps/chosen": -106.23811340332031, + "logps/rejected": -118.0770492553711, + "loss": 0.6545, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5447341203689575, + "rewards/margins": 0.1072479635477066, + "rewards/rejected": -0.6519821286201477, + "step": 3040 + }, + { + "epoch": 0.525499655410062, + "grad_norm": 18.292776107788086, + "learning_rate": 1.9658133085288365e-07, + "logits/chosen": -2.650239944458008, + "logits/rejected": -2.6476333141326904, + "logps/chosen": -105.09783935546875, + "logps/rejected": -115.9225845336914, + "loss": 0.658, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5097786784172058, + "rewards/margins": 0.09773959219455719, + "rewards/rejected": -0.6075183153152466, + "step": 3050 + }, + { + "epoch": 0.5272226050999311, + "grad_norm": 10.919912338256836, + "learning_rate": 1.965291634575011e-07, + "logits/chosen": -2.6657092571258545, + "logits/rejected": -2.6515870094299316, + "logps/chosen": -107.94815826416016, + "logps/rejected": -117.40814208984375, + "loss": 0.6516, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5158464312553406, + "rewards/margins": 0.11876648664474487, + "rewards/rejected": -0.6346129179000854, + "step": 3060 + }, + { + "epoch": 0.5289455547898001, + "grad_norm": 12.27825927734375, + "learning_rate": 1.9647660807246063e-07, + "logits/chosen": -2.64638090133667, + "logits/rejected": -2.6269311904907227, + "logps/chosen": -112.88493347167969, + "logps/rejected": -117.7209243774414, + "loss": 0.6524, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5371401309967041, + "rewards/margins": 0.11090948432683945, + "rewards/rejected": -0.6480496525764465, + "step": 3070 + }, + { + "epoch": 0.5306685044796692, + "grad_norm": 12.138772010803223, + "learning_rate": 1.9642366490900337e-07, + "logits/chosen": -2.602574586868286, + "logits/rejected": -2.59029221534729, + "logps/chosen": -107.50993347167969, + "logps/rejected": -120.93980407714844, + "loss": 0.6581, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.566285252571106, + "rewards/margins": 0.10775377601385117, + "rewards/rejected": -0.6740390658378601, + "step": 3080 + }, + { + "epoch": 0.5323914541695383, + "grad_norm": 15.335344314575195, + "learning_rate": 1.9637033417992936e-07, + "logits/chosen": -2.634575605392456, + "logits/rejected": -2.6185824871063232, + "logps/chosen": -109.16261291503906, + "logps/rejected": -121.67292785644531, + "loss": 0.6343, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5395610928535461, + "rewards/margins": 0.1506732702255249, + "rewards/rejected": -0.6902343034744263, + "step": 3090 + }, + { + "epoch": 0.5341144038594073, + "grad_norm": 19.178672790527344, + "learning_rate": 1.9631661609959628e-07, + "logits/chosen": -2.633981943130493, + "logits/rejected": -2.6135735511779785, + "logps/chosen": -117.04927062988281, + "logps/rejected": -128.36834716796875, + "loss": 0.6363, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6114171743392944, + "rewards/margins": 0.1613766849040985, + "rewards/rejected": -0.7727938294410706, + "step": 3100 + }, + { + "epoch": 0.5358373535492763, + "grad_norm": 15.843265533447266, + "learning_rate": 1.9626251088391876e-07, + "logits/chosen": -2.607421398162842, + "logits/rejected": -2.6124348640441895, + "logps/chosen": -122.96217346191406, + "logps/rejected": -138.46749877929688, + "loss": 0.6534, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.7146931886672974, + "rewards/margins": 0.12729968130588531, + "rewards/rejected": -0.8419928550720215, + "step": 3110 + }, + { + "epoch": 0.5375603032391454, + "grad_norm": 13.338793754577637, + "learning_rate": 1.9620801875036753e-07, + "logits/chosen": -2.6482999324798584, + "logits/rejected": -2.6301398277282715, + "logps/chosen": -122.67982482910156, + "logps/rejected": -135.84725952148438, + "loss": 0.6384, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6544739603996277, + "rewards/margins": 0.1578807681798935, + "rewards/rejected": -0.8123547434806824, + "step": 3120 + }, + { + "epoch": 0.5392832529290145, + "grad_norm": 16.10958480834961, + "learning_rate": 1.9615313991796843e-07, + "logits/chosen": -2.5828936100006104, + "logits/rejected": -2.577544927597046, + "logps/chosen": -116.6275634765625, + "logps/rejected": -133.2178955078125, + "loss": 0.6392, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6371970772743225, + "rewards/margins": 0.1632428765296936, + "rewards/rejected": -0.8004400134086609, + "step": 3130 + }, + { + "epoch": 0.5410062026188835, + "grad_norm": 15.640116691589355, + "learning_rate": 1.960978746073016e-07, + "logits/chosen": -2.643578290939331, + "logits/rejected": -2.630753517150879, + "logps/chosen": -124.42008972167969, + "logps/rejected": -140.39816284179688, + "loss": 0.6407, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7190755009651184, + "rewards/margins": 0.15711475908756256, + "rewards/rejected": -0.8761903047561646, + "step": 3140 + }, + { + "epoch": 0.5427291523087526, + "grad_norm": 19.58635711669922, + "learning_rate": 1.9604222304050074e-07, + "logits/chosen": -2.6569530963897705, + "logits/rejected": -2.6379687786102295, + "logps/chosen": -127.451904296875, + "logps/rejected": -133.31912231445312, + "loss": 0.6666, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.7172044515609741, + "rewards/margins": 0.09000442177057266, + "rewards/rejected": -0.8072088360786438, + "step": 3150 + }, + { + "epoch": 0.5444521019986216, + "grad_norm": 16.635807037353516, + "learning_rate": 1.9598618544125184e-07, + "logits/chosen": -2.603647232055664, + "logits/rejected": -2.5840981006622314, + "logps/chosen": -119.03878021240234, + "logps/rejected": -128.95834350585938, + "loss": 0.6452, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6372925043106079, + "rewards/margins": 0.1405438780784607, + "rewards/rejected": -0.7778364419937134, + "step": 3160 + }, + { + "epoch": 0.5461750516884907, + "grad_norm": 12.24067211151123, + "learning_rate": 1.9592976203479266e-07, + "logits/chosen": -2.6436235904693604, + "logits/rejected": -2.6211209297180176, + "logps/chosen": -121.8818130493164, + "logps/rejected": -126.7547378540039, + "loss": 0.6423, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.647641658782959, + "rewards/margins": 0.14087779819965363, + "rewards/rejected": -0.7885195016860962, + "step": 3170 + }, + { + "epoch": 0.5478980013783598, + "grad_norm": 11.4162015914917, + "learning_rate": 1.9587295304791164e-07, + "logits/chosen": -2.676767349243164, + "logits/rejected": -2.6484227180480957, + "logps/chosen": -118.13658142089844, + "logps/rejected": -129.58218383789062, + "loss": 0.6398, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6407798528671265, + "rewards/margins": 0.14756932854652405, + "rewards/rejected": -0.7883491516113281, + "step": 3180 + }, + { + "epoch": 0.5496209510682288, + "grad_norm": 13.021843910217285, + "learning_rate": 1.95815758708947e-07, + "logits/chosen": -2.6646246910095215, + "logits/rejected": -2.6626439094543457, + "logps/chosen": -119.4345703125, + "logps/rejected": -142.91665649414062, + "loss": 0.615, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6598604321479797, + "rewards/margins": 0.20836925506591797, + "rewards/rejected": -0.8682296872138977, + "step": 3190 + }, + { + "epoch": 0.5513439007580979, + "grad_norm": 13.713115692138672, + "learning_rate": 1.957581792477859e-07, + "logits/chosen": -2.5963289737701416, + "logits/rejected": -2.579953670501709, + "logps/chosen": -133.90020751953125, + "logps/rejected": -143.35369873046875, + "loss": 0.6467, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7653565406799316, + "rewards/margins": 0.15095801651477814, + "rewards/rejected": -0.9163146018981934, + "step": 3200 + }, + { + "epoch": 0.5513439007580979, + "eval_logits/chosen": -2.6902225017547607, + "eval_logits/rejected": -2.686380386352539, + "eval_logps/chosen": -126.01993560791016, + "eval_logps/rejected": -140.8850555419922, + "eval_loss": 0.6583073735237122, + "eval_rewards/accuracies": 0.625, + "eval_rewards/chosen": -0.6700446605682373, + "eval_rewards/margins": 0.11130973696708679, + "eval_rewards/rejected": -0.7813544273376465, + "eval_runtime": 384.2204, + "eval_samples_per_second": 11.202, + "eval_steps_per_second": 1.4, + "step": 3200 + }, + { + "epoch": 0.5530668504479669, + "grad_norm": 14.34167194366455, + "learning_rate": 1.9570021489586344e-07, + "logits/chosen": -2.537349224090576, + "logits/rejected": -2.5172617435455322, + "logps/chosen": -136.31301879882812, + "logps/rejected": -147.375244140625, + "loss": 0.6499, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.8167101740837097, + "rewards/margins": 0.1374952495098114, + "rewards/rejected": -0.9542053937911987, + "step": 3210 + }, + { + "epoch": 0.554789800137836, + "grad_norm": 13.798596382141113, + "learning_rate": 1.956418658861617e-07, + "logits/chosen": -2.5830676555633545, + "logits/rejected": -2.579763889312744, + "logps/chosen": -126.21321105957031, + "logps/rejected": -140.56715393066406, + "loss": 0.6511, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.7518718242645264, + "rewards/margins": 0.13411535322666168, + "rewards/rejected": -0.8859871625900269, + "step": 3220 + }, + { + "epoch": 0.556512749827705, + "grad_norm": 21.124483108520508, + "learning_rate": 1.9558313245320888e-07, + "logits/chosen": -2.6222100257873535, + "logits/rejected": -2.610461950302124, + "logps/chosen": -123.21502685546875, + "logps/rejected": -138.0326690673828, + "loss": 0.6526, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6875979900360107, + "rewards/margins": 0.13721275329589844, + "rewards/rejected": -0.8248106241226196, + "step": 3230 + }, + { + "epoch": 0.5582356995175741, + "grad_norm": 16.622541427612305, + "learning_rate": 1.955240148330784e-07, + "logits/chosen": -2.7097229957580566, + "logits/rejected": -2.693171262741089, + "logps/chosen": -127.66766357421875, + "logps/rejected": -132.49832153320312, + "loss": 0.6612, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7031322717666626, + "rewards/margins": 0.1105247363448143, + "rewards/rejected": -0.8136569857597351, + "step": 3240 + }, + { + "epoch": 0.5599586492074431, + "grad_norm": 15.283352851867676, + "learning_rate": 1.954645132633878e-07, + "logits/chosen": -2.6242454051971436, + "logits/rejected": -2.6123194694519043, + "logps/chosen": -114.93122863769531, + "logps/rejected": -128.0443572998047, + "loss": 0.6458, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6113388538360596, + "rewards/margins": 0.14095903933048248, + "rewards/rejected": -0.7522978782653809, + "step": 3250 + }, + { + "epoch": 0.5616815988973122, + "grad_norm": 17.378856658935547, + "learning_rate": 1.9540462798329788e-07, + "logits/chosen": -2.6355347633361816, + "logits/rejected": -2.624072551727295, + "logps/chosen": -108.41911315917969, + "logps/rejected": -124.9243392944336, + "loss": 0.6285, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5605282783508301, + "rewards/margins": 0.1749248504638672, + "rewards/rejected": -0.7354531288146973, + "step": 3260 + }, + { + "epoch": 0.5634045485871813, + "grad_norm": 13.821595191955566, + "learning_rate": 1.953443592335118e-07, + "logits/chosen": -2.6720964908599854, + "logits/rejected": -2.6658260822296143, + "logps/chosen": -121.88392639160156, + "logps/rejected": -136.3887939453125, + "loss": 0.6467, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6581703424453735, + "rewards/margins": 0.14219583570957184, + "rewards/rejected": -0.800366222858429, + "step": 3270 + }, + { + "epoch": 0.5651274982770503, + "grad_norm": 15.801923751831055, + "learning_rate": 1.9528370725627393e-07, + "logits/chosen": -2.675248384475708, + "logits/rejected": -2.6648693084716797, + "logps/chosen": -117.72242736816406, + "logps/rejected": -132.8204803466797, + "loss": 0.6477, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6743533611297607, + "rewards/margins": 0.13626965880393982, + "rewards/rejected": -0.810623049736023, + "step": 3280 + }, + { + "epoch": 0.5668504479669194, + "grad_norm": 13.726994514465332, + "learning_rate": 1.9522267229536907e-07, + "logits/chosen": -2.698050022125244, + "logits/rejected": -2.6729812622070312, + "logps/chosen": -129.0258331298828, + "logps/rejected": -145.4852294921875, + "loss": 0.6327, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7511179447174072, + "rewards/margins": 0.1727326661348343, + "rewards/rejected": -0.9238505363464355, + "step": 3290 + }, + { + "epoch": 0.5685733976567884, + "grad_norm": 20.418867111206055, + "learning_rate": 1.9516125459612133e-07, + "logits/chosen": -2.6035115718841553, + "logits/rejected": -2.584538459777832, + "logps/chosen": -142.64402770996094, + "logps/rejected": -157.33480834960938, + "loss": 0.6359, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8705562353134155, + "rewards/margins": 0.1773010492324829, + "rewards/rejected": -1.0478572845458984, + "step": 3300 + }, + { + "epoch": 0.5702963473466575, + "grad_norm": 18.851089477539062, + "learning_rate": 1.9509945440539328e-07, + "logits/chosen": -2.546842098236084, + "logits/rejected": -2.5207438468933105, + "logps/chosen": -147.82745361328125, + "logps/rejected": -162.19891357421875, + "loss": 0.6334, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9319332838058472, + "rewards/margins": 0.18691737949848175, + "rewards/rejected": -1.1188507080078125, + "step": 3310 + }, + { + "epoch": 0.5720192970365265, + "grad_norm": 22.01388168334961, + "learning_rate": 1.9503727197158475e-07, + "logits/chosen": -2.5725531578063965, + "logits/rejected": -2.549002170562744, + "logps/chosen": -152.4097137451172, + "logps/rejected": -159.28634643554688, + "loss": 0.6629, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.9795411825180054, + "rewards/margins": 0.13039647042751312, + "rewards/rejected": -1.1099377870559692, + "step": 3320 + }, + { + "epoch": 0.5737422467263956, + "grad_norm": 20.20602798461914, + "learning_rate": 1.949747075446321e-07, + "logits/chosen": -2.658933162689209, + "logits/rejected": -2.6387689113616943, + "logps/chosen": -144.72409057617188, + "logps/rejected": -164.0241241455078, + "loss": 0.6238, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8950947523117065, + "rewards/margins": 0.21241405606269836, + "rewards/rejected": -1.107508897781372, + "step": 3330 + }, + { + "epoch": 0.5754651964162646, + "grad_norm": 22.419872283935547, + "learning_rate": 1.9491176137600695e-07, + "logits/chosen": -2.620138645172119, + "logits/rejected": -2.6018574237823486, + "logps/chosen": -146.81373596191406, + "logps/rejected": -160.92575073242188, + "loss": 0.633, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8906152844429016, + "rewards/margins": 0.18924424052238464, + "rewards/rejected": -1.0798594951629639, + "step": 3340 + }, + { + "epoch": 0.5771881461061337, + "grad_norm": 17.183467864990234, + "learning_rate": 1.9484843371871538e-07, + "logits/chosen": -2.5479228496551514, + "logits/rejected": -2.5361313819885254, + "logps/chosen": -137.95021057128906, + "logps/rejected": -154.76429748535156, + "loss": 0.635, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8102957010269165, + "rewards/margins": 0.1809847056865692, + "rewards/rejected": -0.9912804365158081, + "step": 3350 + }, + { + "epoch": 0.5789110957960028, + "grad_norm": 16.16291618347168, + "learning_rate": 1.9478472482729677e-07, + "logits/chosen": -2.6077377796173096, + "logits/rejected": -2.585120916366577, + "logps/chosen": -132.53025817871094, + "logps/rejected": -144.04335021972656, + "loss": 0.6483, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7691653966903687, + "rewards/margins": 0.15932944416999817, + "rewards/rejected": -0.9284948110580444, + "step": 3360 + }, + { + "epoch": 0.5806340454858718, + "grad_norm": 20.075923919677734, + "learning_rate": 1.947206349578229e-07, + "logits/chosen": -2.596151113510132, + "logits/rejected": -2.592029094696045, + "logps/chosen": -119.96165466308594, + "logps/rejected": -141.7924346923828, + "loss": 0.6138, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6749607920646667, + "rewards/margins": 0.2100691795349121, + "rewards/rejected": -0.8850299715995789, + "step": 3370 + }, + { + "epoch": 0.5823569951757409, + "grad_norm": 25.711334228515625, + "learning_rate": 1.9465616436789683e-07, + "logits/chosen": -2.6455085277557373, + "logits/rejected": -2.6214022636413574, + "logps/chosen": -127.19728088378906, + "logps/rejected": -135.57278442382812, + "loss": 0.6411, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7012184858322144, + "rewards/margins": 0.1534968614578247, + "rewards/rejected": -0.8547152280807495, + "step": 3380 + }, + { + "epoch": 0.5840799448656099, + "grad_norm": 14.334299087524414, + "learning_rate": 1.9459131331665183e-07, + "logits/chosen": -2.5958807468414307, + "logits/rejected": -2.5762410163879395, + "logps/chosen": -126.18077087402344, + "logps/rejected": -138.03382873535156, + "loss": 0.6477, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.7176672220230103, + "rewards/margins": 0.15622806549072266, + "rewards/rejected": -0.8738951683044434, + "step": 3390 + }, + { + "epoch": 0.585802894555479, + "grad_norm": 14.765299797058105, + "learning_rate": 1.9452608206475044e-07, + "logits/chosen": -2.6166718006134033, + "logits/rejected": -2.5864694118499756, + "logps/chosen": -129.44992065429688, + "logps/rejected": -144.64212036132812, + "loss": 0.6391, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7583649158477783, + "rewards/margins": 0.18269336223602295, + "rewards/rejected": -0.9410582780838013, + "step": 3400 + }, + { + "epoch": 0.587525844245348, + "grad_norm": 21.21045684814453, + "learning_rate": 1.9446047087438342e-07, + "logits/chosen": -2.541520833969116, + "logits/rejected": -2.521432399749756, + "logps/chosen": -121.6983871459961, + "logps/rejected": -129.38424682617188, + "loss": 0.6591, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.71497642993927, + "rewards/margins": 0.11594994366168976, + "rewards/rejected": -0.8309264183044434, + "step": 3410 + }, + { + "epoch": 0.5892487939352171, + "grad_norm": 16.52368927001953, + "learning_rate": 1.9439448000926859e-07, + "logits/chosen": -2.561218738555908, + "logits/rejected": -2.546726703643799, + "logps/chosen": -118.3755874633789, + "logps/rejected": -137.21578979492188, + "loss": 0.6195, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6452140808105469, + "rewards/margins": 0.19474345445632935, + "rewards/rejected": -0.8399575352668762, + "step": 3420 + }, + { + "epoch": 0.5909717436250862, + "grad_norm": 17.433828353881836, + "learning_rate": 1.9432810973464988e-07, + "logits/chosen": -2.6282083988189697, + "logits/rejected": -2.6128320693969727, + "logps/chosen": -123.78349304199219, + "logps/rejected": -144.3318328857422, + "loss": 0.6289, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.715265154838562, + "rewards/margins": 0.18964102864265442, + "rewards/rejected": -0.904906153678894, + "step": 3430 + }, + { + "epoch": 0.5926946933149552, + "grad_norm": 18.383983612060547, + "learning_rate": 1.942613603172962e-07, + "logits/chosen": -2.552217721939087, + "logits/rejected": -2.5373892784118652, + "logps/chosen": -138.661376953125, + "logps/rejected": -157.70814514160156, + "loss": 0.6307, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8593077659606934, + "rewards/margins": 0.19985555112361908, + "rewards/rejected": -1.0591633319854736, + "step": 3440 + }, + { + "epoch": 0.5944176430048242, + "grad_norm": 16.855152130126953, + "learning_rate": 1.9419423202550037e-07, + "logits/chosen": -2.723828077316284, + "logits/rejected": -2.694596767425537, + "logps/chosen": -149.76248168945312, + "logps/rejected": -157.1855926513672, + "loss": 0.6574, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.8911663293838501, + "rewards/margins": 0.14522525668144226, + "rewards/rejected": -1.0363914966583252, + "step": 3450 + }, + { + "epoch": 0.5961405926946933, + "grad_norm": 14.812005996704102, + "learning_rate": 1.9412672512907812e-07, + "logits/chosen": -2.566176414489746, + "logits/rejected": -2.560554265975952, + "logps/chosen": -126.41849517822266, + "logps/rejected": -144.072509765625, + "loss": 0.6337, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7349315881729126, + "rewards/margins": 0.17624779045581818, + "rewards/rejected": -0.9111794233322144, + "step": 3460 + }, + { + "epoch": 0.5978635423845624, + "grad_norm": 14.932448387145996, + "learning_rate": 1.940588398993669e-07, + "logits/chosen": -2.6209988594055176, + "logits/rejected": -2.6123223304748535, + "logps/chosen": -130.9401092529297, + "logps/rejected": -146.76885986328125, + "loss": 0.6326, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7369273900985718, + "rewards/margins": 0.19775813817977905, + "rewards/rejected": -0.9346855282783508, + "step": 3470 + }, + { + "epoch": 0.5995864920744314, + "grad_norm": 26.05785369873047, + "learning_rate": 1.9399057660922482e-07, + "logits/chosen": -2.5855112075805664, + "logits/rejected": -2.5643677711486816, + "logps/chosen": -133.5073699951172, + "logps/rejected": -150.279296875, + "loss": 0.629, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7769684195518494, + "rewards/margins": 0.21371857821941376, + "rewards/rejected": -0.9906870126724243, + "step": 3480 + }, + { + "epoch": 0.6013094417643005, + "grad_norm": 21.644672393798828, + "learning_rate": 1.939219355330296e-07, + "logits/chosen": -2.575160503387451, + "logits/rejected": -2.5529327392578125, + "logps/chosen": -138.62142944335938, + "logps/rejected": -158.4349365234375, + "loss": 0.6285, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8397635221481323, + "rewards/margins": 0.20524868369102478, + "rewards/rejected": -1.0450122356414795, + "step": 3490 + }, + { + "epoch": 0.6030323914541695, + "grad_norm": 18.445178985595703, + "learning_rate": 1.9385291694667742e-07, + "logits/chosen": -2.5392565727233887, + "logits/rejected": -2.5323662757873535, + "logps/chosen": -137.585693359375, + "logps/rejected": -152.7445526123047, + "loss": 0.6506, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8483207821846008, + "rewards/margins": 0.14356929063796997, + "rewards/rejected": -0.9918900728225708, + "step": 3500 + }, + { + "epoch": 0.6047553411440386, + "grad_norm": 14.520369529724121, + "learning_rate": 1.9378352112758182e-07, + "logits/chosen": -2.505514621734619, + "logits/rejected": -2.483396053314209, + "logps/chosen": -143.06138610839844, + "logps/rejected": -152.49606323242188, + "loss": 0.656, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8901029825210571, + "rewards/margins": 0.1305324286222458, + "rewards/rejected": -1.0206353664398193, + "step": 3510 + }, + { + "epoch": 0.6064782908339077, + "grad_norm": 21.934551239013672, + "learning_rate": 1.937137483546726e-07, + "logits/chosen": -2.5207297801971436, + "logits/rejected": -2.49786639213562, + "logps/chosen": -133.8035888671875, + "logps/rejected": -147.05685424804688, + "loss": 0.6334, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.7806012034416199, + "rewards/margins": 0.17642517387866974, + "rewards/rejected": -0.957026481628418, + "step": 3520 + }, + { + "epoch": 0.6082012405237767, + "grad_norm": 32.22636413574219, + "learning_rate": 1.936435989083947e-07, + "logits/chosen": -2.644346237182617, + "logits/rejected": -2.6069703102111816, + "logps/chosen": -137.42364501953125, + "logps/rejected": -151.6114501953125, + "loss": 0.6311, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.808983325958252, + "rewards/margins": 0.20544257760047913, + "rewards/rejected": -1.0144258737564087, + "step": 3530 + }, + { + "epoch": 0.6099241902136457, + "grad_norm": 18.672882080078125, + "learning_rate": 1.9357307307070706e-07, + "logits/chosen": -2.5201008319854736, + "logits/rejected": -2.499207019805908, + "logps/chosen": -138.7521209716797, + "logps/rejected": -151.4750213623047, + "loss": 0.649, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.8334717750549316, + "rewards/margins": 0.16167697310447693, + "rewards/rejected": -0.9951488375663757, + "step": 3540 + }, + { + "epoch": 0.6116471399035148, + "grad_norm": 15.886916160583496, + "learning_rate": 1.9350217112508145e-07, + "logits/chosen": -2.5792183876037598, + "logits/rejected": -2.5726521015167236, + "logps/chosen": -136.02572631835938, + "logps/rejected": -143.35610961914062, + "loss": 0.6642, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.8100021481513977, + "rewards/margins": 0.10618770122528076, + "rewards/rejected": -0.9161897897720337, + "step": 3550 + }, + { + "epoch": 0.6133700895933839, + "grad_norm": 16.117509841918945, + "learning_rate": 1.934308933565014e-07, + "logits/chosen": -2.546569585800171, + "logits/rejected": -2.5285823345184326, + "logps/chosen": -130.1741180419922, + "logps/rejected": -140.90634155273438, + "loss": 0.6484, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.7551542520523071, + "rewards/margins": 0.139734148979187, + "rewards/rejected": -0.8948885202407837, + "step": 3560 + }, + { + "epoch": 0.6150930392832529, + "grad_norm": 15.810790061950684, + "learning_rate": 1.9335924005146106e-07, + "logits/chosen": -2.640364170074463, + "logits/rejected": -2.613745927810669, + "logps/chosen": -141.25216674804688, + "logps/rejected": -148.0939178466797, + "loss": 0.6697, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8347728848457336, + "rewards/margins": 0.11924564838409424, + "rewards/rejected": -0.9540184736251831, + "step": 3570 + }, + { + "epoch": 0.616815988973122, + "grad_norm": 14.903305053710938, + "learning_rate": 1.9328721149796392e-07, + "logits/chosen": -2.613096237182617, + "logits/rejected": -2.596419095993042, + "logps/chosen": -138.2270050048828, + "logps/rejected": -153.5955047607422, + "loss": 0.6374, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.791134238243103, + "rewards/margins": 0.1797659695148468, + "rewards/rejected": -0.9709001779556274, + "step": 3580 + }, + { + "epoch": 0.618538938662991, + "grad_norm": 15.328279495239258, + "learning_rate": 1.9321480798552184e-07, + "logits/chosen": -2.5752346515655518, + "logits/rejected": -2.564344644546509, + "logps/chosen": -138.5938720703125, + "logps/rejected": -153.5648193359375, + "loss": 0.6433, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.8292818069458008, + "rewards/margins": 0.17741826176643372, + "rewards/rejected": -1.006700038909912, + "step": 3590 + }, + { + "epoch": 0.6202618883528601, + "grad_norm": 26.101503372192383, + "learning_rate": 1.9314202980515378e-07, + "logits/chosen": -2.572817325592041, + "logits/rejected": -2.5499234199523926, + "logps/chosen": -129.71156311035156, + "logps/rejected": -144.66526794433594, + "loss": 0.6264, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7399894595146179, + "rewards/margins": 0.19516396522521973, + "rewards/rejected": -0.9351534843444824, + "step": 3600 + }, + { + "epoch": 0.6202618883528601, + "eval_logits/chosen": -2.622478723526001, + "eval_logits/rejected": -2.6176483631134033, + "eval_logps/chosen": -122.61001586914062, + "eval_logps/rejected": -136.5857391357422, + "eval_loss": 0.6586455702781677, + "eval_rewards/accuracies": 0.6105948090553284, + "eval_rewards/chosen": -0.6359453797340393, + "eval_rewards/margins": 0.10241586714982986, + "eval_rewards/rejected": -0.7383612394332886, + "eval_runtime": 383.2298, + "eval_samples_per_second": 11.231, + "eval_steps_per_second": 1.404, + "step": 3600 + }, + { + "epoch": 0.6219848380427292, + "grad_norm": 22.015100479125977, + "learning_rate": 1.9306887724938452e-07, + "logits/chosen": -2.5326743125915527, + "logits/rejected": -2.522927761077881, + "logps/chosen": -143.52890014648438, + "logps/rejected": -151.68173217773438, + "loss": 0.6676, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.9094659686088562, + "rewards/margins": 0.12106283009052277, + "rewards/rejected": -1.0305287837982178, + "step": 3610 + }, + { + "epoch": 0.6237077877325982, + "grad_norm": 22.287555694580078, + "learning_rate": 1.929953506122438e-07, + "logits/chosen": -2.470320463180542, + "logits/rejected": -2.449979066848755, + "logps/chosen": -142.80337524414062, + "logps/rejected": -159.98110961914062, + "loss": 0.6196, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8603125810623169, + "rewards/margins": 0.2076341211795807, + "rewards/rejected": -1.0679466724395752, + "step": 3620 + }, + { + "epoch": 0.6254307374224672, + "grad_norm": 20.508764266967773, + "learning_rate": 1.9292145018926478e-07, + "logits/chosen": -2.556079626083374, + "logits/rejected": -2.5517899990081787, + "logps/chosen": -148.13914489746094, + "logps/rejected": -177.71499633789062, + "loss": 0.6036, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9577075839042664, + "rewards/margins": 0.2712717354297638, + "rewards/rejected": -1.228979468345642, + "step": 3630 + }, + { + "epoch": 0.6271536871123363, + "grad_norm": 25.0677547454834, + "learning_rate": 1.9284717627748308e-07, + "logits/chosen": -2.5317368507385254, + "logits/rejected": -2.520453453063965, + "logps/chosen": -155.36209106445312, + "logps/rejected": -174.81427001953125, + "loss": 0.6275, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.9892951250076294, + "rewards/margins": 0.20692935585975647, + "rewards/rejected": -1.196224570274353, + "step": 3640 + }, + { + "epoch": 0.6288766368022054, + "grad_norm": 23.427276611328125, + "learning_rate": 1.9277252917543557e-07, + "logits/chosen": -2.5060718059539795, + "logits/rejected": -2.5108015537261963, + "logps/chosen": -147.3946533203125, + "logps/rejected": -173.7878875732422, + "loss": 0.6163, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9550941586494446, + "rewards/margins": 0.2426663339138031, + "rewards/rejected": -1.1977603435516357, + "step": 3650 + }, + { + "epoch": 0.6305995864920745, + "grad_norm": 22.165502548217773, + "learning_rate": 1.92697509183159e-07, + "logits/chosen": -2.522451639175415, + "logits/rejected": -2.4953484535217285, + "logps/chosen": -160.2294464111328, + "logps/rejected": -180.65109252929688, + "loss": 0.6194, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.046242117881775, + "rewards/margins": 0.23893484473228455, + "rewards/rejected": -1.2851769924163818, + "step": 3660 + }, + { + "epoch": 0.6323225361819435, + "grad_norm": 16.42013931274414, + "learning_rate": 1.926221166021891e-07, + "logits/chosen": -2.520784854888916, + "logits/rejected": -2.5007686614990234, + "logps/chosen": -150.28506469726562, + "logps/rejected": -163.83880615234375, + "loss": 0.6523, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9631339311599731, + "rewards/margins": 0.15850678086280823, + "rewards/rejected": -1.1216405630111694, + "step": 3670 + }, + { + "epoch": 0.6340454858718125, + "grad_norm": 19.383630752563477, + "learning_rate": 1.9254635173555895e-07, + "logits/chosen": -2.568835496902466, + "logits/rejected": -2.5408787727355957, + "logps/chosen": -144.97140502929688, + "logps/rejected": -154.52496337890625, + "loss": 0.6447, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8866475820541382, + "rewards/margins": 0.16674400866031647, + "rewards/rejected": -1.053391695022583, + "step": 3680 + }, + { + "epoch": 0.6357684355616816, + "grad_norm": 19.868803024291992, + "learning_rate": 1.9247021488779817e-07, + "logits/chosen": -2.513503313064575, + "logits/rejected": -2.5117948055267334, + "logps/chosen": -134.81625366210938, + "logps/rejected": -166.44728088378906, + "loss": 0.6101, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8184541463851929, + "rewards/margins": 0.26437991857528687, + "rewards/rejected": -1.082834005355835, + "step": 3690 + }, + { + "epoch": 0.6374913852515507, + "grad_norm": 20.203310012817383, + "learning_rate": 1.923937063649315e-07, + "logits/chosen": -2.530961513519287, + "logits/rejected": -2.503265619277954, + "logps/chosen": -146.6129913330078, + "logps/rejected": -168.82098388671875, + "loss": 0.6264, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.8800510168075562, + "rewards/margins": 0.22863145172595978, + "rewards/rejected": -1.108682632446289, + "step": 3700 + }, + { + "epoch": 0.6392143349414197, + "grad_norm": 23.444467544555664, + "learning_rate": 1.9231682647447757e-07, + "logits/chosen": -2.5593762397766113, + "logits/rejected": -2.542349338531494, + "logps/chosen": -141.5807342529297, + "logps/rejected": -151.15762329101562, + "loss": 0.6617, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.8652147054672241, + "rewards/margins": 0.14454950392246246, + "rewards/rejected": -1.009764313697815, + "step": 3710 + }, + { + "epoch": 0.6409372846312887, + "grad_norm": 19.26589584350586, + "learning_rate": 1.9223957552544762e-07, + "logits/chosen": -2.5640718936920166, + "logits/rejected": -2.547266721725464, + "logps/chosen": -126.90428161621094, + "logps/rejected": -150.60821533203125, + "loss": 0.6083, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7400155067443848, + "rewards/margins": 0.23334841430187225, + "rewards/rejected": -0.973363995552063, + "step": 3720 + }, + { + "epoch": 0.6426602343211578, + "grad_norm": 17.89430809020996, + "learning_rate": 1.9216195382834445e-07, + "logits/chosen": -2.556276559829712, + "logits/rejected": -2.5290729999542236, + "logps/chosen": -137.04393005371094, + "logps/rejected": -157.21775817871094, + "loss": 0.6117, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8089156150817871, + "rewards/margins": 0.25906902551651, + "rewards/rejected": -1.067984700202942, + "step": 3730 + }, + { + "epoch": 0.6443831840110269, + "grad_norm": 15.389662742614746, + "learning_rate": 1.9208396169516092e-07, + "logits/chosen": -2.535585880279541, + "logits/rejected": -2.519484043121338, + "logps/chosen": -139.1349334716797, + "logps/rejected": -161.3406219482422, + "loss": 0.6303, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.8444369435310364, + "rewards/margins": 0.20586533844470978, + "rewards/rejected": -1.050302267074585, + "step": 3740 + }, + { + "epoch": 0.646106133700896, + "grad_norm": 17.75441551208496, + "learning_rate": 1.9200559943937895e-07, + "logits/chosen": -2.592435359954834, + "logits/rejected": -2.5743930339813232, + "logps/chosen": -141.97390747070312, + "logps/rejected": -160.23193359375, + "loss": 0.6287, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8718975186347961, + "rewards/margins": 0.2032506763935089, + "rewards/rejected": -1.0751482248306274, + "step": 3750 + }, + { + "epoch": 0.647829083390765, + "grad_norm": 22.844364166259766, + "learning_rate": 1.91926867375968e-07, + "logits/chosen": -2.547484874725342, + "logits/rejected": -2.5382332801818848, + "logps/chosen": -156.30105590820312, + "logps/rejected": -165.59693908691406, + "loss": 0.6719, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.9945210218429565, + "rewards/margins": 0.11567112058401108, + "rewards/rejected": -1.110192060470581, + "step": 3760 + }, + { + "epoch": 0.649552033080634, + "grad_norm": 19.29290199279785, + "learning_rate": 1.9184776582138408e-07, + "logits/chosen": -2.5390050411224365, + "logits/rejected": -2.5214526653289795, + "logps/chosen": -157.914794921875, + "logps/rejected": -172.47427368164062, + "loss": 0.6534, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.0316898822784424, + "rewards/margins": 0.15778304636478424, + "rewards/rejected": -1.1894729137420654, + "step": 3770 + }, + { + "epoch": 0.6512749827705031, + "grad_norm": 23.615245819091797, + "learning_rate": 1.9176829509356817e-07, + "logits/chosen": -2.54388427734375, + "logits/rejected": -2.512617826461792, + "logps/chosen": -164.77212524414062, + "logps/rejected": -178.50497436523438, + "loss": 0.6339, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0995312929153442, + "rewards/margins": 0.1998249590396881, + "rewards/rejected": -1.2993561029434204, + "step": 3780 + }, + { + "epoch": 0.6529979324603722, + "grad_norm": 22.931257247924805, + "learning_rate": 1.9168845551194526e-07, + "logits/chosen": -2.5178706645965576, + "logits/rejected": -2.4947054386138916, + "logps/chosen": -163.10385131835938, + "logps/rejected": -176.40982055664062, + "loss": 0.6431, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.081376552581787, + "rewards/margins": 0.18316730856895447, + "rewards/rejected": -1.2645437717437744, + "step": 3790 + }, + { + "epoch": 0.6547208821502413, + "grad_norm": 38.20081329345703, + "learning_rate": 1.916082473974228e-07, + "logits/chosen": -2.548003673553467, + "logits/rejected": -2.526517152786255, + "logps/chosen": -156.9646453857422, + "logps/rejected": -167.34617614746094, + "loss": 0.6432, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9847234487533569, + "rewards/margins": 0.16609299182891846, + "rewards/rejected": -1.150816559791565, + "step": 3800 + }, + { + "epoch": 0.6564438318401102, + "grad_norm": 21.577075958251953, + "learning_rate": 1.9152767107238957e-07, + "logits/chosen": -2.5703864097595215, + "logits/rejected": -2.5399067401885986, + "logps/chosen": -156.12286376953125, + "logps/rejected": -171.01058959960938, + "loss": 0.6165, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9712567329406738, + "rewards/margins": 0.22613871097564697, + "rewards/rejected": -1.1973953247070312, + "step": 3810 + }, + { + "epoch": 0.6581667815299793, + "grad_norm": 20.8365535736084, + "learning_rate": 1.9144672686071437e-07, + "logits/chosen": -2.5181126594543457, + "logits/rejected": -2.497028112411499, + "logps/chosen": -149.05239868164062, + "logps/rejected": -167.2176513671875, + "loss": 0.6232, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9377164840698242, + "rewards/margins": 0.2112659215927124, + "rewards/rejected": -1.148982286453247, + "step": 3820 + }, + { + "epoch": 0.6598897312198484, + "grad_norm": 25.174989700317383, + "learning_rate": 1.913654150877446e-07, + "logits/chosen": -2.547309398651123, + "logits/rejected": -2.5087881088256836, + "logps/chosen": -154.81350708007812, + "logps/rejected": -162.9886932373047, + "loss": 0.6444, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.9520630836486816, + "rewards/margins": 0.1712508648633957, + "rewards/rejected": -1.1233139038085938, + "step": 3830 + }, + { + "epoch": 0.6616126809097175, + "grad_norm": 15.482654571533203, + "learning_rate": 1.9128373608030513e-07, + "logits/chosen": -2.5088038444519043, + "logits/rejected": -2.500941038131714, + "logps/chosen": -147.49221801757812, + "logps/rejected": -174.9437255859375, + "loss": 0.6126, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9215294122695923, + "rewards/margins": 0.24697823822498322, + "rewards/rejected": -1.1685075759887695, + "step": 3840 + }, + { + "epoch": 0.6633356305995864, + "grad_norm": 20.026220321655273, + "learning_rate": 1.9120169016669683e-07, + "logits/chosen": -2.584136486053467, + "logits/rejected": -2.56913423538208, + "logps/chosen": -149.8631134033203, + "logps/rejected": -163.15603637695312, + "loss": 0.6351, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.9149333834648132, + "rewards/margins": 0.18745577335357666, + "rewards/rejected": -1.1023890972137451, + "step": 3850 + }, + { + "epoch": 0.6650585802894555, + "grad_norm": 17.616666793823242, + "learning_rate": 1.9111927767669531e-07, + "logits/chosen": -2.5894997119903564, + "logits/rejected": -2.5635571479797363, + "logps/chosen": -156.982421875, + "logps/rejected": -167.90719604492188, + "loss": 0.658, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.0112090110778809, + "rewards/margins": 0.14642414450645447, + "rewards/rejected": -1.1576330661773682, + "step": 3860 + }, + { + "epoch": 0.6667815299793246, + "grad_norm": 24.253337860107422, + "learning_rate": 1.9103649894154965e-07, + "logits/chosen": -2.5155365467071533, + "logits/rejected": -2.494205951690674, + "logps/chosen": -154.18704223632812, + "logps/rejected": -175.84947204589844, + "loss": 0.6094, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9422490000724792, + "rewards/margins": 0.27016550302505493, + "rewards/rejected": -1.2124145030975342, + "step": 3870 + }, + { + "epoch": 0.6685044796691937, + "grad_norm": 25.2159481048584, + "learning_rate": 1.90953354293981e-07, + "logits/chosen": -2.534515857696533, + "logits/rejected": -2.5364620685577393, + "logps/chosen": -149.01571655273438, + "logps/rejected": -162.34564208984375, + "loss": 0.6638, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.971118152141571, + "rewards/margins": 0.12453125417232513, + "rewards/rejected": -1.0956494808197021, + "step": 3880 + }, + { + "epoch": 0.6702274293590628, + "grad_norm": 16.787229537963867, + "learning_rate": 1.908698440681812e-07, + "logits/chosen": -2.60170316696167, + "logits/rejected": -2.582350254058838, + "logps/chosen": -134.91282653808594, + "logps/rejected": -150.58224487304688, + "loss": 0.6284, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7993661165237427, + "rewards/margins": 0.19520600140094757, + "rewards/rejected": -0.9945721626281738, + "step": 3890 + }, + { + "epoch": 0.6719503790489317, + "grad_norm": 15.2194185256958, + "learning_rate": 1.9078596859981163e-07, + "logits/chosen": -2.584822416305542, + "logits/rejected": -2.5468311309814453, + "logps/chosen": -134.71353149414062, + "logps/rejected": -147.935791015625, + "loss": 0.6201, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7654783725738525, + "rewards/margins": 0.21653859317302704, + "rewards/rejected": -0.982016921043396, + "step": 3900 + }, + { + "epoch": 0.6736733287388008, + "grad_norm": 15.335397720336914, + "learning_rate": 1.9070172822600152e-07, + "logits/chosen": -2.5756888389587402, + "logits/rejected": -2.558799982070923, + "logps/chosen": -140.26095581054688, + "logps/rejected": -163.974609375, + "loss": 0.6046, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8372836112976074, + "rewards/margins": 0.27128416299819946, + "rewards/rejected": -1.1085678339004517, + "step": 3910 + }, + { + "epoch": 0.6753962784286699, + "grad_norm": 16.354459762573242, + "learning_rate": 1.90617123285347e-07, + "logits/chosen": -2.594193458557129, + "logits/rejected": -2.5603878498077393, + "logps/chosen": -137.20114135742188, + "logps/rejected": -147.0207061767578, + "loss": 0.6273, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7684726715087891, + "rewards/margins": 0.20228557288646698, + "rewards/rejected": -0.9707581400871277, + "step": 3920 + }, + { + "epoch": 0.677119228118539, + "grad_norm": 22.57278060913086, + "learning_rate": 1.9053215411790945e-07, + "logits/chosen": -2.569798231124878, + "logits/rejected": -2.563342571258545, + "logps/chosen": -141.64752197265625, + "logps/rejected": -160.0135955810547, + "loss": 0.6395, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.889459490776062, + "rewards/margins": 0.19048067927360535, + "rewards/rejected": -1.0799401998519897, + "step": 3930 + }, + { + "epoch": 0.6788421778084079, + "grad_norm": 26.700321197509766, + "learning_rate": 1.9044682106521428e-07, + "logits/chosen": -2.454202890396118, + "logits/rejected": -2.4379258155822754, + "logps/chosen": -144.43515014648438, + "logps/rejected": -162.84532165527344, + "loss": 0.6277, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.8958985209465027, + "rewards/margins": 0.2047765702009201, + "rewards/rejected": -1.100675106048584, + "step": 3940 + }, + { + "epoch": 0.680565127498277, + "grad_norm": 24.479074478149414, + "learning_rate": 1.903611244702494e-07, + "logits/chosen": -2.4695637226104736, + "logits/rejected": -2.4349091053009033, + "logps/chosen": -139.02603149414062, + "logps/rejected": -158.69528198242188, + "loss": 0.6007, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8301140666007996, + "rewards/margins": 0.2578274607658386, + "rewards/rejected": -1.0879415273666382, + "step": 3950 + }, + { + "epoch": 0.6822880771881461, + "grad_norm": 24.88389015197754, + "learning_rate": 1.9027506467746404e-07, + "logits/chosen": -2.534788131713867, + "logits/rejected": -2.533944606781006, + "logps/chosen": -149.79391479492188, + "logps/rejected": -176.94375610351562, + "loss": 0.6237, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9553853869438171, + "rewards/margins": 0.2335664927959442, + "rewards/rejected": -1.188951849937439, + "step": 3960 + }, + { + "epoch": 0.6840110268780152, + "grad_norm": 21.85911750793457, + "learning_rate": 1.901886420327672e-07, + "logits/chosen": -2.505385637283325, + "logits/rejected": -2.48795747756958, + "logps/chosen": -161.1486053466797, + "logps/rejected": -184.07723999023438, + "loss": 0.6175, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.059510350227356, + "rewards/margins": 0.2523019015789032, + "rewards/rejected": -1.3118122816085815, + "step": 3970 + }, + { + "epoch": 0.6857339765678843, + "grad_norm": 20.922971725463867, + "learning_rate": 1.9010185688352643e-07, + "logits/chosen": -2.443387508392334, + "logits/rejected": -2.4325804710388184, + "logps/chosen": -154.60977172851562, + "logps/rejected": -184.83419799804688, + "loss": 0.5929, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.005619764328003, + "rewards/margins": 0.3168439269065857, + "rewards/rejected": -1.3224637508392334, + "step": 3980 + }, + { + "epoch": 0.6874569262577532, + "grad_norm": 17.087665557861328, + "learning_rate": 1.9001470957856615e-07, + "logits/chosen": -2.4980862140655518, + "logits/rejected": -2.4844629764556885, + "logps/chosen": -153.3562774658203, + "logps/rejected": -174.1827392578125, + "loss": 0.6393, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9912816882133484, + "rewards/margins": 0.21040551364421844, + "rewards/rejected": -1.201687216758728, + "step": 3990 + }, + { + "epoch": 0.6891798759476223, + "grad_norm": 26.346763610839844, + "learning_rate": 1.8992720046816664e-07, + "logits/chosen": -2.566901922225952, + "logits/rejected": -2.5389089584350586, + "logps/chosen": -149.53170776367188, + "logps/rejected": -166.23568725585938, + "loss": 0.6203, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.918075680732727, + "rewards/margins": 0.22810205817222595, + "rewards/rejected": -1.1461777687072754, + "step": 4000 + }, + { + "epoch": 0.6891798759476223, + "eval_logits/chosen": -2.5642409324645996, + "eval_logits/rejected": -2.558300256729126, + "eval_logps/chosen": -137.52479553222656, + "eval_logps/rejected": -154.57752990722656, + "eval_loss": 0.6523177623748779, + "eval_rewards/accuracies": 0.6166356801986694, + "eval_rewards/chosen": -0.7850932478904724, + "eval_rewards/margins": 0.1331859678030014, + "eval_rewards/rejected": -0.9182791709899902, + "eval_runtime": 382.9463, + "eval_samples_per_second": 11.239, + "eval_steps_per_second": 1.405, + "step": 4000 + }, + { + "epoch": 0.6909028256374914, + "grad_norm": 20.584022521972656, + "learning_rate": 1.8983932990406229e-07, + "logits/chosen": -2.4782555103302, + "logits/rejected": -2.4685940742492676, + "logps/chosen": -147.3070526123047, + "logps/rejected": -178.8524932861328, + "loss": 0.6062, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9803979992866516, + "rewards/margins": 0.2875971496105194, + "rewards/rejected": -1.2679951190948486, + "step": 4010 + }, + { + "epoch": 0.6926257753273605, + "grad_norm": 18.147808074951172, + "learning_rate": 1.8975109823944039e-07, + "logits/chosen": -2.484112501144409, + "logits/rejected": -2.479996919631958, + "logps/chosen": -154.5767822265625, + "logps/rejected": -175.74847412109375, + "loss": 0.6362, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0183777809143066, + "rewards/margins": 0.2127773016691208, + "rewards/rejected": -1.2311551570892334, + "step": 4020 + }, + { + "epoch": 0.6943487250172296, + "grad_norm": 16.098173141479492, + "learning_rate": 1.8966250582893953e-07, + "logits/chosen": -2.4954991340637207, + "logits/rejected": -2.477116346359253, + "logps/chosen": -149.50643920898438, + "logps/rejected": -165.57916259765625, + "loss": 0.6407, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.9233428835868835, + "rewards/margins": 0.1793515682220459, + "rewards/rejected": -1.1026945114135742, + "step": 4030 + }, + { + "epoch": 0.6960716747070985, + "grad_norm": 21.584396362304688, + "learning_rate": 1.8957355302864842e-07, + "logits/chosen": -2.548405885696411, + "logits/rejected": -2.535510301589966, + "logps/chosen": -143.77305603027344, + "logps/rejected": -166.7368621826172, + "loss": 0.6117, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8692083358764648, + "rewards/margins": 0.2582603096961975, + "rewards/rejected": -1.1274688243865967, + "step": 4040 + }, + { + "epoch": 0.6977946243969676, + "grad_norm": 19.66901206970215, + "learning_rate": 1.894842401961042e-07, + "logits/chosen": -2.4985928535461426, + "logits/rejected": -2.481311321258545, + "logps/chosen": -143.23513793945312, + "logps/rejected": -169.25555419921875, + "loss": 0.6129, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9076918363571167, + "rewards/margins": 0.2686161398887634, + "rewards/rejected": -1.1763079166412354, + "step": 4050 + }, + { + "epoch": 0.6995175740868367, + "grad_norm": 19.5911865234375, + "learning_rate": 1.8939456769029122e-07, + "logits/chosen": -2.475037097930908, + "logits/rejected": -2.4540417194366455, + "logps/chosen": -163.9054412841797, + "logps/rejected": -176.58795166015625, + "loss": 0.6458, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.0669587850570679, + "rewards/margins": 0.18664845824241638, + "rewards/rejected": -1.2536073923110962, + "step": 4060 + }, + { + "epoch": 0.7012405237767058, + "grad_norm": 21.787147521972656, + "learning_rate": 1.8930453587163949e-07, + "logits/chosen": -2.4452805519104004, + "logits/rejected": -2.424757719039917, + "logps/chosen": -152.8466033935547, + "logps/rejected": -182.3805694580078, + "loss": 0.5859, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9907079935073853, + "rewards/margins": 0.31428462266921997, + "rewards/rejected": -1.30499267578125, + "step": 4070 + }, + { + "epoch": 0.7029634734665747, + "grad_norm": 30.879796981811523, + "learning_rate": 1.8921414510202317e-07, + "logits/chosen": -2.441105604171753, + "logits/rejected": -2.4298994541168213, + "logps/chosen": -159.96908569335938, + "logps/rejected": -184.31655883789062, + "loss": 0.6232, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0666249990463257, + "rewards/margins": 0.24226295948028564, + "rewards/rejected": -1.3088879585266113, + "step": 4080 + }, + { + "epoch": 0.7046864231564438, + "grad_norm": 25.079172134399414, + "learning_rate": 1.8912339574475925e-07, + "logits/chosen": -2.4596691131591797, + "logits/rejected": -2.4346156120300293, + "logps/chosen": -163.6860809326172, + "logps/rejected": -190.64089965820312, + "loss": 0.6097, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0688049793243408, + "rewards/margins": 0.30191558599472046, + "rewards/rejected": -1.3707208633422852, + "step": 4090 + }, + { + "epoch": 0.7064093728463129, + "grad_norm": 31.3631534576416, + "learning_rate": 1.8903228816460598e-07, + "logits/chosen": -2.458258628845215, + "logits/rejected": -2.435795545578003, + "logps/chosen": -164.78323364257812, + "logps/rejected": -185.1956024169922, + "loss": 0.6139, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1078027486801147, + "rewards/margins": 0.2592393755912781, + "rewards/rejected": -1.3670421838760376, + "step": 4100 + }, + { + "epoch": 0.708132322536182, + "grad_norm": 21.405792236328125, + "learning_rate": 1.8894082272776156e-07, + "logits/chosen": -2.42065691947937, + "logits/rejected": -2.409475803375244, + "logps/chosen": -173.5276641845703, + "logps/rejected": -180.2970733642578, + "loss": 0.6803, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.1685502529144287, + "rewards/margins": 0.1270175725221634, + "rewards/rejected": -1.2955677509307861, + "step": 4110 + }, + { + "epoch": 0.709855272226051, + "grad_norm": 17.203845977783203, + "learning_rate": 1.8884899980186248e-07, + "logits/chosen": -2.423537254333496, + "logits/rejected": -2.426220417022705, + "logps/chosen": -153.18441772460938, + "logps/rejected": -176.77294921875, + "loss": 0.6317, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0196375846862793, + "rewards/margins": 0.21294169127941132, + "rewards/rejected": -1.2325794696807861, + "step": 4120 + }, + { + "epoch": 0.71157822191592, + "grad_norm": 19.912616729736328, + "learning_rate": 1.8875681975598207e-07, + "logits/chosen": -2.497934579849243, + "logits/rejected": -2.4791147708892822, + "logps/chosen": -146.62086486816406, + "logps/rejected": -164.03176879882812, + "loss": 0.6238, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9095829725265503, + "rewards/margins": 0.21481628715991974, + "rewards/rejected": -1.1243993043899536, + "step": 4130 + }, + { + "epoch": 0.7133011716057891, + "grad_norm": 18.456600189208984, + "learning_rate": 1.8866428296062916e-07, + "logits/chosen": -2.5012519359588623, + "logits/rejected": -2.4845516681671143, + "logps/chosen": -154.83172607421875, + "logps/rejected": -159.90524291992188, + "loss": 0.6716, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.9765934944152832, + "rewards/margins": 0.11888899654150009, + "rewards/rejected": -1.095482587814331, + "step": 4140 + }, + { + "epoch": 0.7150241212956582, + "grad_norm": 15.824003219604492, + "learning_rate": 1.8857138978774647e-07, + "logits/chosen": -2.5242793560028076, + "logits/rejected": -2.506749391555786, + "logps/chosen": -147.20632934570312, + "logps/rejected": -158.5218963623047, + "loss": 0.6329, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8848239183425903, + "rewards/margins": 0.19589009881019592, + "rewards/rejected": -1.0807139873504639, + "step": 4150 + }, + { + "epoch": 0.7167470709855273, + "grad_norm": 15.421948432922363, + "learning_rate": 1.8847814061070917e-07, + "logits/chosen": -2.468259334564209, + "logits/rejected": -2.4435718059539795, + "logps/chosen": -138.98873901367188, + "logps/rejected": -157.41748046875, + "loss": 0.6159, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.857384204864502, + "rewards/margins": 0.23457948863506317, + "rewards/rejected": -1.091963529586792, + "step": 4160 + }, + { + "epoch": 0.7184700206753962, + "grad_norm": 16.917064666748047, + "learning_rate": 1.8838453580432328e-07, + "logits/chosen": -2.503840208053589, + "logits/rejected": -2.495299816131592, + "logps/chosen": -145.8829345703125, + "logps/rejected": -167.2362060546875, + "loss": 0.6283, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9338346719741821, + "rewards/margins": 0.21018731594085693, + "rewards/rejected": -1.1440218687057495, + "step": 4170 + }, + { + "epoch": 0.7201929703652653, + "grad_norm": 21.029468536376953, + "learning_rate": 1.882905757448243e-07, + "logits/chosen": -2.4621734619140625, + "logits/rejected": -2.442660093307495, + "logps/chosen": -156.11219787597656, + "logps/rejected": -180.0277557373047, + "loss": 0.6166, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.992640495300293, + "rewards/margins": 0.24560074508190155, + "rewards/rejected": -1.238241195678711, + "step": 4180 + }, + { + "epoch": 0.7219159200551344, + "grad_norm": 21.128299713134766, + "learning_rate": 1.8819626080987567e-07, + "logits/chosen": -2.4580259323120117, + "logits/rejected": -2.445139169692993, + "logps/chosen": -163.23568725585938, + "logps/rejected": -186.8887176513672, + "loss": 0.6251, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.112467646598816, + "rewards/margins": 0.236087366938591, + "rewards/rejected": -1.3485549688339233, + "step": 4190 + }, + { + "epoch": 0.7236388697450035, + "grad_norm": 17.36494255065918, + "learning_rate": 1.881015913785671e-07, + "logits/chosen": -2.4943463802337646, + "logits/rejected": -2.4810996055603027, + "logps/chosen": -159.11692810058594, + "logps/rejected": -166.8761749267578, + "loss": 0.6656, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.0203524827957153, + "rewards/margins": 0.1343226134777069, + "rewards/rejected": -1.154675006866455, + "step": 4200 + }, + { + "epoch": 0.7253618194348725, + "grad_norm": 26.34899139404297, + "learning_rate": 1.880065678314133e-07, + "logits/chosen": -2.479694128036499, + "logits/rejected": -2.4616291522979736, + "logps/chosen": -154.9434051513672, + "logps/rejected": -165.20925903320312, + "loss": 0.6586, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.9922011494636536, + "rewards/margins": 0.1440073698759079, + "rewards/rejected": -1.1362085342407227, + "step": 4210 + }, + { + "epoch": 0.7270847691247415, + "grad_norm": 16.302705764770508, + "learning_rate": 1.8791119055035221e-07, + "logits/chosen": -2.394646167755127, + "logits/rejected": -2.3819780349731445, + "logps/chosen": -142.0032958984375, + "logps/rejected": -160.90249633789062, + "loss": 0.6331, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8893952369689941, + "rewards/margins": 0.19879098236560822, + "rewards/rejected": -1.088186264038086, + "step": 4220 + }, + { + "epoch": 0.7288077188146106, + "grad_norm": 19.511754989624023, + "learning_rate": 1.8781545991874362e-07, + "logits/chosen": -2.534529209136963, + "logits/rejected": -2.5153164863586426, + "logps/chosen": -151.25851440429688, + "logps/rejected": -167.86810302734375, + "loss": 0.6339, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9434124231338501, + "rewards/margins": 0.20468302071094513, + "rewards/rejected": -1.1480954885482788, + "step": 4230 + }, + { + "epoch": 0.7305306685044797, + "grad_norm": 17.765453338623047, + "learning_rate": 1.8771937632136753e-07, + "logits/chosen": -2.4428322315216064, + "logits/rejected": -2.4241092205047607, + "logps/chosen": -145.80564880371094, + "logps/rejected": -167.83908081054688, + "loss": 0.6246, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9024696350097656, + "rewards/margins": 0.22954091429710388, + "rewards/rejected": -1.1320104598999023, + "step": 4240 + }, + { + "epoch": 0.7322536181943488, + "grad_norm": 18.9899845123291, + "learning_rate": 1.8762294014442275e-07, + "logits/chosen": -2.456968307495117, + "logits/rejected": -2.4411208629608154, + "logps/chosen": -141.48629760742188, + "logps/rejected": -157.35751342773438, + "loss": 0.6276, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8507699966430664, + "rewards/margins": 0.20453086495399475, + "rewards/rejected": -1.0553009510040283, + "step": 4250 + }, + { + "epoch": 0.7339765678842178, + "grad_norm": 18.64900779724121, + "learning_rate": 1.8752615177552515e-07, + "logits/chosen": -2.440948724746704, + "logits/rejected": -2.423311710357666, + "logps/chosen": -142.525634765625, + "logps/rejected": -165.6720733642578, + "loss": 0.6281, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.866360068321228, + "rewards/margins": 0.2523943781852722, + "rewards/rejected": -1.118754506111145, + "step": 4260 + }, + { + "epoch": 0.7356995175740868, + "grad_norm": 26.834325790405273, + "learning_rate": 1.8742901160370629e-07, + "logits/chosen": -2.439136028289795, + "logits/rejected": -2.4189846515655518, + "logps/chosen": -136.86460876464844, + "logps/rejected": -156.23016357421875, + "loss": 0.6264, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8397220373153687, + "rewards/margins": 0.21862992644309998, + "rewards/rejected": -1.058351755142212, + "step": 4270 + }, + { + "epoch": 0.7374224672639559, + "grad_norm": 21.31093978881836, + "learning_rate": 1.8733152001941162e-07, + "logits/chosen": -2.433461904525757, + "logits/rejected": -2.416278600692749, + "logps/chosen": -143.94137573242188, + "logps/rejected": -168.7376251220703, + "loss": 0.6115, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9106764793395996, + "rewards/margins": 0.2685638964176178, + "rewards/rejected": -1.1792404651641846, + "step": 4280 + }, + { + "epoch": 0.739145416953825, + "grad_norm": 23.622499465942383, + "learning_rate": 1.872336774144992e-07, + "logits/chosen": -2.4091594219207764, + "logits/rejected": -2.389822006225586, + "logps/chosen": -163.57362365722656, + "logps/rejected": -182.37948608398438, + "loss": 0.6325, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0774534940719604, + "rewards/margins": 0.2432311475276947, + "rewards/rejected": -1.3206846714019775, + "step": 4290 + }, + { + "epoch": 0.740868366643694, + "grad_norm": 15.690317153930664, + "learning_rate": 1.8713548418223797e-07, + "logits/chosen": -2.4367196559906006, + "logits/rejected": -2.425534725189209, + "logps/chosen": -162.27609252929688, + "logps/rejected": -184.33816528320312, + "loss": 0.6228, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.085301160812378, + "rewards/margins": 0.24133506417274475, + "rewards/rejected": -1.3266363143920898, + "step": 4300 + }, + { + "epoch": 0.742591316333563, + "grad_norm": 22.461139678955078, + "learning_rate": 1.8703694071730612e-07, + "logits/chosen": -2.364335060119629, + "logits/rejected": -2.343078851699829, + "logps/chosen": -166.93553161621094, + "logps/rejected": -187.39309692382812, + "loss": 0.6172, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.115869164466858, + "rewards/margins": 0.24821288883686066, + "rewards/rejected": -1.3640820980072021, + "step": 4310 + }, + { + "epoch": 0.7443142660234321, + "grad_norm": 22.394622802734375, + "learning_rate": 1.8693804741578964e-07, + "logits/chosen": -2.4276387691497803, + "logits/rejected": -2.403562307357788, + "logps/chosen": -168.36390686035156, + "logps/rejected": -196.73269653320312, + "loss": 0.5796, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0858043432235718, + "rewards/margins": 0.34008654952049255, + "rewards/rejected": -1.4258909225463867, + "step": 4320 + }, + { + "epoch": 0.7460372157133012, + "grad_norm": 19.556331634521484, + "learning_rate": 1.8683880467518055e-07, + "logits/chosen": -2.3903796672821045, + "logits/rejected": -2.366075038909912, + "logps/chosen": -168.3106689453125, + "logps/rejected": -182.62008666992188, + "loss": 0.6442, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1263628005981445, + "rewards/margins": 0.20899710059165955, + "rewards/rejected": -1.3353599309921265, + "step": 4330 + }, + { + "epoch": 0.7477601654031703, + "grad_norm": 22.436267852783203, + "learning_rate": 1.8673921289437554e-07, + "logits/chosen": -2.3715744018554688, + "logits/rejected": -2.3499209880828857, + "logps/chosen": -154.8798828125, + "logps/rejected": -184.8426055908203, + "loss": 0.6044, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0422241687774658, + "rewards/margins": 0.3092927634716034, + "rewards/rejected": -1.3515169620513916, + "step": 4340 + }, + { + "epoch": 0.7494831150930393, + "grad_norm": 23.099063873291016, + "learning_rate": 1.8663927247367407e-07, + "logits/chosen": -2.3912670612335205, + "logits/rejected": -2.3795111179351807, + "logps/chosen": -151.63742065429688, + "logps/rejected": -180.13919067382812, + "loss": 0.6018, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9931322932243347, + "rewards/margins": 0.2769755423069, + "rewards/rejected": -1.2701078653335571, + "step": 4350 + }, + { + "epoch": 0.7512060647829083, + "grad_norm": 21.606122970581055, + "learning_rate": 1.865389838147771e-07, + "logits/chosen": -2.4218342304229736, + "logits/rejected": -2.4023971557617188, + "logps/chosen": -164.91665649414062, + "logps/rejected": -180.6358642578125, + "loss": 0.6554, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1022279262542725, + "rewards/margins": 0.19608908891677856, + "rewards/rejected": -1.2983169555664062, + "step": 4360 + }, + { + "epoch": 0.7529290144727774, + "grad_norm": 28.500917434692383, + "learning_rate": 1.864383473207852e-07, + "logits/chosen": -2.443657398223877, + "logits/rejected": -2.423034191131592, + "logps/chosen": -149.47891235351562, + "logps/rejected": -171.32008361816406, + "loss": 0.6229, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9359003901481628, + "rewards/margins": 0.23575489223003387, + "rewards/rejected": -1.171655535697937, + "step": 4370 + }, + { + "epoch": 0.7546519641626465, + "grad_norm": 25.91777992248535, + "learning_rate": 1.8633736339619702e-07, + "logits/chosen": -2.4580273628234863, + "logits/rejected": -2.4401497840881348, + "logps/chosen": -155.8708953857422, + "logps/rejected": -171.21290588378906, + "loss": 0.6473, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.0010490417480469, + "rewards/margins": 0.1942082643508911, + "rewards/rejected": -1.195257306098938, + "step": 4380 + }, + { + "epoch": 0.7563749138525155, + "grad_norm": 23.267330169677734, + "learning_rate": 1.8623603244690772e-07, + "logits/chosen": -2.4025485515594482, + "logits/rejected": -2.3881447315216064, + "logps/chosen": -147.8549041748047, + "logps/rejected": -173.3671875, + "loss": 0.6106, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9391400218009949, + "rewards/margins": 0.2640628516674042, + "rewards/rejected": -1.2032029628753662, + "step": 4390 + }, + { + "epoch": 0.7580978635423845, + "grad_norm": 33.47300338745117, + "learning_rate": 1.861343548802073e-07, + "logits/chosen": -2.4176034927368164, + "logits/rejected": -2.3994877338409424, + "logps/chosen": -165.08901977539062, + "logps/rejected": -180.73426818847656, + "loss": 0.6341, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0616172552108765, + "rewards/margins": 0.19609788060188293, + "rewards/rejected": -1.257715106010437, + "step": 4400 + }, + { + "epoch": 0.7580978635423845, + "eval_logits/chosen": -2.4722869396209717, + "eval_logits/rejected": -2.4643163681030273, + "eval_logps/chosen": -146.87518310546875, + "eval_logps/rejected": -165.33773803710938, + "eval_loss": 0.6487003564834595, + "eval_rewards/accuracies": 0.6129181981086731, + "eval_rewards/chosen": -0.8785969614982605, + "eval_rewards/margins": 0.14728423953056335, + "eval_rewards/rejected": -1.0258814096450806, + "eval_runtime": 382.734, + "eval_samples_per_second": 11.245, + "eval_steps_per_second": 1.406, + "step": 4400 + }, + { + "epoch": 0.7598208132322536, + "grad_norm": 26.31522560119629, + "learning_rate": 1.8603233110477884e-07, + "logits/chosen": -2.35225510597229, + "logits/rejected": -2.3370361328125, + "logps/chosen": -167.1627197265625, + "logps/rejected": -185.4983367919922, + "loss": 0.6481, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1053941249847412, + "rewards/margins": 0.20654484629631042, + "rewards/rejected": -1.311939001083374, + "step": 4410 + }, + { + "epoch": 0.7615437629221227, + "grad_norm": 26.59756851196289, + "learning_rate": 1.8592996153069715e-07, + "logits/chosen": -2.471127986907959, + "logits/rejected": -2.4345784187316895, + "logps/chosen": -166.3771209716797, + "logps/rejected": -178.87313842773438, + "loss": 0.6456, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.103097915649414, + "rewards/margins": 0.18923258781433105, + "rewards/rejected": -1.2923305034637451, + "step": 4420 + }, + { + "epoch": 0.7632667126119917, + "grad_norm": 21.480712890625, + "learning_rate": 1.8582724656942683e-07, + "logits/chosen": -2.392627477645874, + "logits/rejected": -2.3656272888183594, + "logps/chosen": -159.09445190429688, + "logps/rejected": -176.07310485839844, + "loss": 0.6416, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0466508865356445, + "rewards/margins": 0.21135036647319794, + "rewards/rejected": -1.2580013275146484, + "step": 4430 + }, + { + "epoch": 0.7649896623018608, + "grad_norm": 21.1779727935791, + "learning_rate": 1.8572418663382074e-07, + "logits/chosen": -2.434896945953369, + "logits/rejected": -2.4075655937194824, + "logps/chosen": -163.56765747070312, + "logps/rejected": -185.07579040527344, + "loss": 0.6057, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0352531671524048, + "rewards/margins": 0.2752748131752014, + "rewards/rejected": -1.310528039932251, + "step": 4440 + }, + { + "epoch": 0.7667126119917298, + "grad_norm": 17.894512176513672, + "learning_rate": 1.8562078213811833e-07, + "logits/chosen": -2.3739511966705322, + "logits/rejected": -2.3611817359924316, + "logps/chosen": -149.56271362304688, + "logps/rejected": -168.64669799804688, + "loss": 0.62, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9247000813484192, + "rewards/margins": 0.23195107281208038, + "rewards/rejected": -1.1566511392593384, + "step": 4450 + }, + { + "epoch": 0.7684355616815989, + "grad_norm": 20.049449920654297, + "learning_rate": 1.8551703349794406e-07, + "logits/chosen": -2.448498249053955, + "logits/rejected": -2.4351491928100586, + "logps/chosen": -145.53578186035156, + "logps/rejected": -174.42141723632812, + "loss": 0.6027, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9232245683670044, + "rewards/margins": 0.30191126465797424, + "rewards/rejected": -1.2251358032226562, + "step": 4460 + }, + { + "epoch": 0.770158511371468, + "grad_norm": 19.79228401184082, + "learning_rate": 1.854129411303055e-07, + "logits/chosen": -2.47465181350708, + "logits/rejected": -2.4493346214294434, + "logps/chosen": -153.1445770263672, + "logps/rejected": -166.31228637695312, + "loss": 0.6392, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.9666484594345093, + "rewards/margins": 0.20043261349201202, + "rewards/rejected": -1.1670811176300049, + "step": 4470 + }, + { + "epoch": 0.771881461061337, + "grad_norm": 24.481473922729492, + "learning_rate": 1.8530850545359193e-07, + "logits/chosen": -2.4872870445251465, + "logits/rejected": -2.4746615886688232, + "logps/chosen": -152.59925842285156, + "logps/rejected": -171.68936157226562, + "loss": 0.643, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.0053293704986572, + "rewards/margins": 0.20094509422779083, + "rewards/rejected": -1.2062745094299316, + "step": 4480 + }, + { + "epoch": 0.7736044107512061, + "grad_norm": 21.01685333251953, + "learning_rate": 1.8520372688757245e-07, + "logits/chosen": -2.4067769050598145, + "logits/rejected": -2.3791446685791016, + "logps/chosen": -146.48553466796875, + "logps/rejected": -164.83828735351562, + "loss": 0.6286, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9418350458145142, + "rewards/margins": 0.2116595208644867, + "rewards/rejected": -1.1534945964813232, + "step": 4490 + }, + { + "epoch": 0.7753273604410751, + "grad_norm": 21.07688331604004, + "learning_rate": 1.8509860585339446e-07, + "logits/chosen": -2.392287015914917, + "logits/rejected": -2.371561288833618, + "logps/chosen": -159.52886962890625, + "logps/rejected": -180.74732971191406, + "loss": 0.6216, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.017209768295288, + "rewards/margins": 0.2693372368812561, + "rewards/rejected": -1.286547064781189, + "step": 4500 + }, + { + "epoch": 0.7770503101309442, + "grad_norm": 23.083614349365234, + "learning_rate": 1.8499314277358167e-07, + "logits/chosen": -2.4393038749694824, + "logits/rejected": -2.4085822105407715, + "logps/chosen": -176.33653259277344, + "logps/rejected": -210.8615264892578, + "loss": 0.5921, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.209841012954712, + "rewards/margins": 0.34841373562812805, + "rewards/rejected": -1.558254599571228, + "step": 4510 + }, + { + "epoch": 0.7787732598208132, + "grad_norm": 26.21770477294922, + "learning_rate": 1.848873380720329e-07, + "logits/chosen": -2.3809621334075928, + "logits/rejected": -2.3610141277313232, + "logps/chosen": -198.77658081054688, + "logps/rejected": -216.4871826171875, + "loss": 0.6556, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4526500701904297, + "rewards/margins": 0.215979665517807, + "rewards/rejected": -1.6686298847198486, + "step": 4520 + }, + { + "epoch": 0.7804962095106823, + "grad_norm": 35.826942443847656, + "learning_rate": 1.8478119217401985e-07, + "logits/chosen": -2.3922924995422363, + "logits/rejected": -2.3780174255371094, + "logps/chosen": -169.76864624023438, + "logps/rejected": -187.02670288085938, + "loss": 0.6569, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1504439115524292, + "rewards/margins": 0.19426843523979187, + "rewards/rejected": -1.3447123765945435, + "step": 4530 + }, + { + "epoch": 0.7822191592005513, + "grad_norm": 36.92649841308594, + "learning_rate": 1.8467470550618574e-07, + "logits/chosen": -2.3660781383514404, + "logits/rejected": -2.3454301357269287, + "logps/chosen": -154.78317260742188, + "logps/rejected": -174.80319213867188, + "loss": 0.6292, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0085923671722412, + "rewards/margins": 0.23993799090385437, + "rewards/rejected": -1.248530387878418, + "step": 4540 + }, + { + "epoch": 0.7839421088904204, + "grad_norm": 25.094932556152344, + "learning_rate": 1.8456787849654347e-07, + "logits/chosen": -2.4405195713043213, + "logits/rejected": -2.4174976348876953, + "logps/chosen": -152.58892822265625, + "logps/rejected": -170.98582458496094, + "loss": 0.6282, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.946140468120575, + "rewards/margins": 0.23043744266033173, + "rewards/rejected": -1.1765779256820679, + "step": 4550 + }, + { + "epoch": 0.7856650585802895, + "grad_norm": 21.068693161010742, + "learning_rate": 1.844607115744739e-07, + "logits/chosen": -2.3593530654907227, + "logits/rejected": -2.326908826828003, + "logps/chosen": -164.55909729003906, + "logps/rejected": -188.50650024414062, + "loss": 0.6079, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0752851963043213, + "rewards/margins": 0.2878347933292389, + "rewards/rejected": -1.3631200790405273, + "step": 4560 + }, + { + "epoch": 0.7873880082701585, + "grad_norm": 30.99504852294922, + "learning_rate": 1.8435320517072408e-07, + "logits/chosen": -2.3525474071502686, + "logits/rejected": -2.3298683166503906, + "logps/chosen": -182.09889221191406, + "logps/rejected": -209.79025268554688, + "loss": 0.6312, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2944326400756836, + "rewards/margins": 0.2926630973815918, + "rewards/rejected": -1.5870954990386963, + "step": 4570 + }, + { + "epoch": 0.7891109579600276, + "grad_norm": 18.812456130981445, + "learning_rate": 1.842453597174057e-07, + "logits/chosen": -2.3480334281921387, + "logits/rejected": -2.322362184524536, + "logps/chosen": -163.5109405517578, + "logps/rejected": -184.9033203125, + "loss": 0.6169, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0780930519104004, + "rewards/margins": 0.25485703349113464, + "rewards/rejected": -1.3329499959945679, + "step": 4580 + }, + { + "epoch": 0.7908339076498966, + "grad_norm": 24.78331756591797, + "learning_rate": 1.841371756479931e-07, + "logits/chosen": -2.414440155029297, + "logits/rejected": -2.389694929122925, + "logps/chosen": -177.2312469482422, + "logps/rejected": -192.85855102539062, + "loss": 0.6596, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2167154550552368, + "rewards/margins": 0.19507446885108948, + "rewards/rejected": -1.4117896556854248, + "step": 4590 + }, + { + "epoch": 0.7925568573397657, + "grad_norm": 26.33585548400879, + "learning_rate": 1.8402865339732171e-07, + "logits/chosen": -2.340993642807007, + "logits/rejected": -2.3130643367767334, + "logps/chosen": -172.70472717285156, + "logps/rejected": -210.70401000976562, + "loss": 0.5706, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1847777366638184, + "rewards/margins": 0.3849853575229645, + "rewards/rejected": -1.5697630643844604, + "step": 4600 + }, + { + "epoch": 0.7942798070296347, + "grad_norm": 26.14021873474121, + "learning_rate": 1.8391979340158627e-07, + "logits/chosen": -2.32139253616333, + "logits/rejected": -2.3075783252716064, + "logps/chosen": -203.4938507080078, + "logps/rejected": -219.33523559570312, + "loss": 0.6653, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.5140093564987183, + "rewards/margins": 0.18047411739826202, + "rewards/rejected": -1.6944833993911743, + "step": 4610 + }, + { + "epoch": 0.7960027567195038, + "grad_norm": 27.7562255859375, + "learning_rate": 1.8381059609833904e-07, + "logits/chosen": -2.35945200920105, + "logits/rejected": -2.3298919200897217, + "logps/chosen": -203.76455688476562, + "logps/rejected": -238.9661102294922, + "loss": 0.5745, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.470356822013855, + "rewards/margins": 0.36763960123062134, + "rewards/rejected": -1.837996244430542, + "step": 4620 + }, + { + "epoch": 0.7977257064093728, + "grad_norm": 39.02242660522461, + "learning_rate": 1.83701061926488e-07, + "logits/chosen": -2.339899778366089, + "logits/rejected": -2.3107967376708984, + "logps/chosen": -224.0686798095703, + "logps/rejected": -250.54379272460938, + "loss": 0.6192, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6597421169281006, + "rewards/margins": 0.29094910621643066, + "rewards/rejected": -1.9506912231445312, + "step": 4630 + }, + { + "epoch": 0.7994486560992419, + "grad_norm": 22.518293380737305, + "learning_rate": 1.8359119132629522e-07, + "logits/chosen": -2.3632376194000244, + "logits/rejected": -2.351179838180542, + "logps/chosen": -203.08949279785156, + "logps/rejected": -226.6865692138672, + "loss": 0.6346, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.48097825050354, + "rewards/margins": 0.2385386973619461, + "rewards/rejected": -1.7195169925689697, + "step": 4640 + }, + { + "epoch": 0.801171605789111, + "grad_norm": 17.876625061035156, + "learning_rate": 1.8348098473937498e-07, + "logits/chosen": -2.40993595123291, + "logits/rejected": -2.384605884552002, + "logps/chosen": -165.33267211914062, + "logps/rejected": -181.77304077148438, + "loss": 0.6362, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.100512981414795, + "rewards/margins": 0.21383455395698547, + "rewards/rejected": -1.3143476247787476, + "step": 4650 + }, + { + "epoch": 0.80289455547898, + "grad_norm": 18.722620010375977, + "learning_rate": 1.8337044260869195e-07, + "logits/chosen": -2.4083523750305176, + "logits/rejected": -2.3905911445617676, + "logps/chosen": -142.04159545898438, + "logps/rejected": -158.0908660888672, + "loss": 0.6309, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.8668336868286133, + "rewards/margins": 0.1838662177324295, + "rewards/rejected": -1.0506998300552368, + "step": 4660 + }, + { + "epoch": 0.8046175051688491, + "grad_norm": 16.902860641479492, + "learning_rate": 1.8325956537855964e-07, + "logits/chosen": -2.4333815574645996, + "logits/rejected": -2.4127719402313232, + "logps/chosen": -142.946533203125, + "logps/rejected": -163.60421752929688, + "loss": 0.6064, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.900127112865448, + "rewards/margins": 0.25424256920814514, + "rewards/rejected": -1.1543697118759155, + "step": 4670 + }, + { + "epoch": 0.8063404548587181, + "grad_norm": 17.532752990722656, + "learning_rate": 1.8314835349463834e-07, + "logits/chosen": -2.4113612174987793, + "logits/rejected": -2.3857650756835938, + "logps/chosen": -160.74105834960938, + "logps/rejected": -179.22518920898438, + "loss": 0.6465, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0686453580856323, + "rewards/margins": 0.20836098492145538, + "rewards/rejected": -1.2770063877105713, + "step": 4680 + }, + { + "epoch": 0.8080634045485872, + "grad_norm": 20.075408935546875, + "learning_rate": 1.8303680740393354e-07, + "logits/chosen": -2.432438373565674, + "logits/rejected": -2.411315441131592, + "logps/chosen": -167.2003936767578, + "logps/rejected": -194.3954620361328, + "loss": 0.6118, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1071606874465942, + "rewards/margins": 0.3014344871044159, + "rewards/rejected": -1.408595323562622, + "step": 4690 + }, + { + "epoch": 0.8097863542384562, + "grad_norm": 23.070444107055664, + "learning_rate": 1.829249275547939e-07, + "logits/chosen": -2.41184139251709, + "logits/rejected": -2.3968803882598877, + "logps/chosen": -161.8149871826172, + "logps/rejected": -190.91075134277344, + "loss": 0.6151, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0889852046966553, + "rewards/margins": 0.2751777768135071, + "rewards/rejected": -1.3641631603240967, + "step": 4700 + }, + { + "epoch": 0.8115093039283253, + "grad_norm": 22.357309341430664, + "learning_rate": 1.8281271439690972e-07, + "logits/chosen": -2.4170899391174316, + "logits/rejected": -2.387962818145752, + "logps/chosen": -178.35140991210938, + "logps/rejected": -203.06228637695312, + "loss": 0.5967, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1947414875030518, + "rewards/margins": 0.3194465637207031, + "rewards/rejected": -1.5141879320144653, + "step": 4710 + }, + { + "epoch": 0.8132322536181944, + "grad_norm": 21.492481231689453, + "learning_rate": 1.8270016838131098e-07, + "logits/chosen": -2.328721523284912, + "logits/rejected": -2.2992498874664307, + "logps/chosen": -178.8650360107422, + "logps/rejected": -209.41500854492188, + "loss": 0.5832, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.211004614830017, + "rewards/margins": 0.36220628023147583, + "rewards/rejected": -1.5732109546661377, + "step": 4720 + }, + { + "epoch": 0.8149552033080634, + "grad_norm": 27.255842208862305, + "learning_rate": 1.825872899603655e-07, + "logits/chosen": -2.312281370162964, + "logits/rejected": -2.2868740558624268, + "logps/chosen": -181.79415893554688, + "logps/rejected": -201.48757934570312, + "loss": 0.6487, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.268235445022583, + "rewards/margins": 0.2422148734331131, + "rewards/rejected": -1.5104503631591797, + "step": 4730 + }, + { + "epoch": 0.8166781529979324, + "grad_norm": 21.115175247192383, + "learning_rate": 1.824740795877772e-07, + "logits/chosen": -2.390545606613159, + "logits/rejected": -2.3746256828308105, + "logps/chosen": -158.5949249267578, + "logps/rejected": -193.73794555664062, + "loss": 0.5753, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0280969142913818, + "rewards/margins": 0.3749907612800598, + "rewards/rejected": -1.403087854385376, + "step": 4740 + }, + { + "epoch": 0.8184011026878015, + "grad_norm": 18.447275161743164, + "learning_rate": 1.8236053771858428e-07, + "logits/chosen": -2.355696678161621, + "logits/rejected": -2.348707914352417, + "logps/chosen": -179.67068481445312, + "logps/rejected": -197.00306701660156, + "loss": 0.6373, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.231192946434021, + "rewards/margins": 0.2000356912612915, + "rewards/rejected": -1.4312288761138916, + "step": 4750 + }, + { + "epoch": 0.8201240523776706, + "grad_norm": 22.534955978393555, + "learning_rate": 1.8224666480915732e-07, + "logits/chosen": -2.3500988483428955, + "logits/rejected": -2.3278253078460693, + "logps/chosen": -184.92636108398438, + "logps/rejected": -212.4356231689453, + "loss": 0.61, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.32185959815979, + "rewards/margins": 0.295890748500824, + "rewards/rejected": -1.6177504062652588, + "step": 4760 + }, + { + "epoch": 0.8218470020675396, + "grad_norm": 18.955162048339844, + "learning_rate": 1.8213246131719746e-07, + "logits/chosen": -2.377048969268799, + "logits/rejected": -2.357697010040283, + "logps/chosen": -209.9337615966797, + "logps/rejected": -226.17276000976562, + "loss": 0.6535, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.494385838508606, + "rewards/margins": 0.22841958701610565, + "rewards/rejected": -1.722805380821228, + "step": 4770 + }, + { + "epoch": 0.8235699517574087, + "grad_norm": 25.681936264038086, + "learning_rate": 1.8201792770173462e-07, + "logits/chosen": -2.277048110961914, + "logits/rejected": -2.257699728012085, + "logps/chosen": -206.84872436523438, + "logps/rejected": -243.0885772705078, + "loss": 0.5753, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5417685508728027, + "rewards/margins": 0.36321893334388733, + "rewards/rejected": -1.9049873352050781, + "step": 4780 + }, + { + "epoch": 0.8252929014472777, + "grad_norm": 19.8004093170166, + "learning_rate": 1.8190306442312565e-07, + "logits/chosen": -2.3256642818450928, + "logits/rejected": -2.3049635887145996, + "logps/chosen": -219.4740753173828, + "logps/rejected": -230.257568359375, + "loss": 0.6755, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6086790561676025, + "rewards/margins": 0.17308612167835236, + "rewards/rejected": -1.7817652225494385, + "step": 4790 + }, + { + "epoch": 0.8270158511371468, + "grad_norm": 21.14756202697754, + "learning_rate": 1.8178787194305239e-07, + "logits/chosen": -2.319889545440674, + "logits/rejected": -2.3040847778320312, + "logps/chosen": -186.53793334960938, + "logps/rejected": -211.5950927734375, + "loss": 0.6184, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3383642435073853, + "rewards/margins": 0.27897733449935913, + "rewards/rejected": -1.6173416376113892, + "step": 4800 + }, + { + "epoch": 0.8270158511371468, + "eval_logits/chosen": -2.424163818359375, + "eval_logits/rejected": -2.414050817489624, + "eval_logps/chosen": -166.67298889160156, + "eval_logps/rejected": -187.56295776367188, + "eval_loss": 0.6454273462295532, + "eval_rewards/accuracies": 0.6129181981086731, + "eval_rewards/chosen": -1.0765751600265503, + "eval_rewards/margins": 0.17155835032463074, + "eval_rewards/rejected": -1.2481335401535034, + "eval_runtime": 383.5899, + "eval_samples_per_second": 11.22, + "eval_steps_per_second": 1.403, + "step": 4800 + }, + { + "epoch": 0.8287388008270159, + "grad_norm": 24.968095779418945, + "learning_rate": 1.816723507245199e-07, + "logits/chosen": -2.3386826515197754, + "logits/rejected": -2.3073079586029053, + "logps/chosen": -182.45309448242188, + "logps/rejected": -206.2501220703125, + "loss": 0.6021, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2412832975387573, + "rewards/margins": 0.31198614835739136, + "rewards/rejected": -1.553269386291504, + "step": 4810 + }, + { + "epoch": 0.8304617505168849, + "grad_norm": 23.982210159301758, + "learning_rate": 1.8155650123185458e-07, + "logits/chosen": -2.3738133907318115, + "logits/rejected": -2.3586814403533936, + "logps/chosen": -171.57936096191406, + "logps/rejected": -196.97628784179688, + "loss": 0.6235, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1862354278564453, + "rewards/margins": 0.2727213203907013, + "rewards/rejected": -1.4589568376541138, + "step": 4820 + }, + { + "epoch": 0.832184700206754, + "grad_norm": 20.661731719970703, + "learning_rate": 1.8144032393070225e-07, + "logits/chosen": -2.35894513130188, + "logits/rejected": -2.3394012451171875, + "logps/chosen": -171.9304656982422, + "logps/rejected": -195.32716369628906, + "loss": 0.6235, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.168662190437317, + "rewards/margins": 0.26239484548568726, + "rewards/rejected": -1.4310569763183594, + "step": 4830 + }, + { + "epoch": 0.833907649896623, + "grad_norm": 20.392499923706055, + "learning_rate": 1.8132381928802643e-07, + "logits/chosen": -2.3282320499420166, + "logits/rejected": -2.2940051555633545, + "logps/chosen": -185.9119110107422, + "logps/rejected": -214.93667602539062, + "loss": 0.5982, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2838799953460693, + "rewards/margins": 0.3473908305168152, + "rewards/rejected": -1.6312707662582397, + "step": 4840 + }, + { + "epoch": 0.8356305995864921, + "grad_norm": 20.10554313659668, + "learning_rate": 1.8120698777210626e-07, + "logits/chosen": -2.3485913276672363, + "logits/rejected": -2.3278183937072754, + "logps/chosen": -195.07545471191406, + "logps/rejected": -225.80789184570312, + "loss": 0.5997, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.397279143333435, + "rewards/margins": 0.33234673738479614, + "rewards/rejected": -1.729625940322876, + "step": 4850 + }, + { + "epoch": 0.8373535492763611, + "grad_norm": 36.35572814941406, + "learning_rate": 1.8108982985253472e-07, + "logits/chosen": -2.330827474594116, + "logits/rejected": -2.2981231212615967, + "logps/chosen": -211.63613891601562, + "logps/rejected": -226.32748413085938, + "loss": 0.6464, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5219459533691406, + "rewards/margins": 0.24193425476551056, + "rewards/rejected": -1.7638801336288452, + "step": 4860 + }, + { + "epoch": 0.8390764989662302, + "grad_norm": 24.555248260498047, + "learning_rate": 1.8097234600021679e-07, + "logits/chosen": -2.361845016479492, + "logits/rejected": -2.331868886947632, + "logps/chosen": -208.45864868164062, + "logps/rejected": -239.99966430664062, + "loss": 0.5848, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4959218502044678, + "rewards/margins": 0.3945196568965912, + "rewards/rejected": -1.8904415369033813, + "step": 4870 + }, + { + "epoch": 0.8407994486560992, + "grad_norm": 31.07630157470703, + "learning_rate": 1.8085453668736745e-07, + "logits/chosen": -2.2912700176239014, + "logits/rejected": -2.261535406112671, + "logps/chosen": -196.89691162109375, + "logps/rejected": -217.8070831298828, + "loss": 0.6382, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.431493878364563, + "rewards/margins": 0.2531737685203552, + "rewards/rejected": -1.6846675872802734, + "step": 4880 + }, + { + "epoch": 0.8425223983459683, + "grad_norm": 27.161630630493164, + "learning_rate": 1.8073640238750988e-07, + "logits/chosen": -2.3718910217285156, + "logits/rejected": -2.345691680908203, + "logps/chosen": -184.5735321044922, + "logps/rejected": -217.81478881835938, + "loss": 0.5836, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2863454818725586, + "rewards/margins": 0.36429768800735474, + "rewards/rejected": -1.6506431102752686, + "step": 4890 + }, + { + "epoch": 0.8442453480358374, + "grad_norm": 24.7296199798584, + "learning_rate": 1.806179435754735e-07, + "logits/chosen": -2.3248345851898193, + "logits/rejected": -2.295557975769043, + "logps/chosen": -179.61318969726562, + "logps/rejected": -199.75857543945312, + "loss": 0.6453, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2582156658172607, + "rewards/margins": 0.245769664645195, + "rewards/rejected": -1.5039854049682617, + "step": 4900 + }, + { + "epoch": 0.8459682977257064, + "grad_norm": 26.73059844970703, + "learning_rate": 1.804991607273921e-07, + "logits/chosen": -2.432668924331665, + "logits/rejected": -2.3986926078796387, + "logps/chosen": -181.2235565185547, + "logps/rejected": -205.87939453125, + "loss": 0.611, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2396998405456543, + "rewards/margins": 0.3025253415107727, + "rewards/rejected": -1.5422253608703613, + "step": 4910 + }, + { + "epoch": 0.8476912474155754, + "grad_norm": 25.337461471557617, + "learning_rate": 1.8038005432070183e-07, + "logits/chosen": -2.291698455810547, + "logits/rejected": -2.2636170387268066, + "logps/chosen": -193.2530059814453, + "logps/rejected": -228.5478515625, + "loss": 0.5935, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3832341432571411, + "rewards/margins": 0.38054174184799194, + "rewards/rejected": -1.7637760639190674, + "step": 4920 + }, + { + "epoch": 0.8494141971054445, + "grad_norm": 30.94846534729004, + "learning_rate": 1.8026062483413943e-07, + "logits/chosen": -2.388110637664795, + "logits/rejected": -2.367668628692627, + "logps/chosen": -214.96780395507812, + "logps/rejected": -244.97134399414062, + "loss": 0.6433, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.6070770025253296, + "rewards/margins": 0.27405670285224915, + "rewards/rejected": -1.8811336755752563, + "step": 4930 + }, + { + "epoch": 0.8511371467953136, + "grad_norm": 26.88053321838379, + "learning_rate": 1.8014087274774018e-07, + "logits/chosen": -2.3821301460266113, + "logits/rejected": -2.354160785675049, + "logps/chosen": -198.7532958984375, + "logps/rejected": -225.3708038330078, + "loss": 0.6198, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4216960668563843, + "rewards/margins": 0.3176547884941101, + "rewards/rejected": -1.7393509149551392, + "step": 4940 + }, + { + "epoch": 0.8528600964851827, + "grad_norm": 22.982711791992188, + "learning_rate": 1.8002079854283605e-07, + "logits/chosen": -2.257017135620117, + "logits/rejected": -2.2330005168914795, + "logps/chosen": -185.67608642578125, + "logps/rejected": -210.94851684570312, + "loss": 0.6271, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.29585862159729, + "rewards/margins": 0.2753117084503174, + "rewards/rejected": -1.5711703300476074, + "step": 4950 + }, + { + "epoch": 0.8545830461750517, + "grad_norm": 33.373817443847656, + "learning_rate": 1.799004027020537e-07, + "logits/chosen": -2.32515025138855, + "logits/rejected": -2.3151698112487793, + "logps/chosen": -180.51144409179688, + "logps/rejected": -223.17764282226562, + "loss": 0.5729, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2766039371490479, + "rewards/margins": 0.4046260714530945, + "rewards/rejected": -1.6812299489974976, + "step": 4960 + }, + { + "epoch": 0.8563059958649207, + "grad_norm": 27.193952560424805, + "learning_rate": 1.7977968570931262e-07, + "logits/chosen": -2.282940149307251, + "logits/rejected": -2.2675819396972656, + "logps/chosen": -193.19253540039062, + "logps/rejected": -230.05044555664062, + "loss": 0.5858, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.404479742050171, + "rewards/margins": 0.3858822286128998, + "rewards/rejected": -1.790361762046814, + "step": 4970 + }, + { + "epoch": 0.8580289455547898, + "grad_norm": 28.381431579589844, + "learning_rate": 1.796586480498231e-07, + "logits/chosen": -2.3110270500183105, + "logits/rejected": -2.2937839031219482, + "logps/chosen": -199.82888793945312, + "logps/rejected": -233.05908203125, + "loss": 0.5956, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4531872272491455, + "rewards/margins": 0.3392183184623718, + "rewards/rejected": -1.7924054861068726, + "step": 4980 + }, + { + "epoch": 0.8597518952446589, + "grad_norm": 32.66222381591797, + "learning_rate": 1.7953729021008434e-07, + "logits/chosen": -2.265355348587036, + "logits/rejected": -2.245307445526123, + "logps/chosen": -211.9461669921875, + "logps/rejected": -252.12063598632812, + "loss": 0.5988, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5934269428253174, + "rewards/margins": 0.3837546706199646, + "rewards/rejected": -1.9771816730499268, + "step": 4990 + }, + { + "epoch": 0.8614748449345279, + "grad_norm": 29.829771041870117, + "learning_rate": 1.7941561267788245e-07, + "logits/chosen": -2.279102325439453, + "logits/rejected": -2.249908924102783, + "logps/chosen": -200.36363220214844, + "logps/rejected": -233.7318572998047, + "loss": 0.5876, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4332067966461182, + "rewards/margins": 0.3809499740600586, + "rewards/rejected": -1.8141567707061768, + "step": 5000 + }, + { + "epoch": 0.8631977946243969, + "grad_norm": 23.35953712463379, + "learning_rate": 1.7929361594228852e-07, + "logits/chosen": -2.2788968086242676, + "logits/rejected": -2.2506165504455566, + "logps/chosen": -198.2695770263672, + "logps/rejected": -231.7457733154297, + "loss": 0.5872, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4459125995635986, + "rewards/margins": 0.37506669759750366, + "rewards/rejected": -1.820979356765747, + "step": 5010 + }, + { + "epoch": 0.864920744314266, + "grad_norm": 23.324260711669922, + "learning_rate": 1.7917130049365672e-07, + "logits/chosen": -2.2702226638793945, + "logits/rejected": -2.237178325653076, + "logps/chosen": -186.34371948242188, + "logps/rejected": -223.6638946533203, + "loss": 0.59, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3197271823883057, + "rewards/margins": 0.4005703032016754, + "rewards/rejected": -1.7202975749969482, + "step": 5020 + }, + { + "epoch": 0.8666436940041351, + "grad_norm": 48.629798889160156, + "learning_rate": 1.7904866682362213e-07, + "logits/chosen": -2.2448463439941406, + "logits/rejected": -2.2259209156036377, + "logps/chosen": -208.24423217773438, + "logps/rejected": -233.9544677734375, + "loss": 0.6497, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5443228483200073, + "rewards/margins": 0.2657424807548523, + "rewards/rejected": -1.8100652694702148, + "step": 5030 + }, + { + "epoch": 0.8683666436940042, + "grad_norm": 37.488189697265625, + "learning_rate": 1.7892571542509896e-07, + "logits/chosen": -2.3680639266967773, + "logits/rejected": -2.33235239982605, + "logps/chosen": -208.89395141601562, + "logps/rejected": -230.7420196533203, + "loss": 0.6191, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5072259902954102, + "rewards/margins": 0.2992441654205322, + "rewards/rejected": -1.8064699172973633, + "step": 5040 + }, + { + "epoch": 0.8700895933838731, + "grad_norm": 18.682756423950195, + "learning_rate": 1.7880244679227853e-07, + "logits/chosen": -2.2868175506591797, + "logits/rejected": -2.2622358798980713, + "logps/chosen": -205.58200073242188, + "logps/rejected": -235.55917358398438, + "loss": 0.6223, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.48475182056427, + "rewards/margins": 0.3248194754123688, + "rewards/rejected": -1.8095712661743164, + "step": 5050 + }, + { + "epoch": 0.8718125430737422, + "grad_norm": 23.736186981201172, + "learning_rate": 1.7867886142062717e-07, + "logits/chosen": -2.2453980445861816, + "logits/rejected": -2.2342467308044434, + "logps/chosen": -182.3429412841797, + "logps/rejected": -209.33657836914062, + "loss": 0.6408, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3162187337875366, + "rewards/margins": 0.2594860792160034, + "rewards/rejected": -1.5757049322128296, + "step": 5060 + }, + { + "epoch": 0.8735354927636113, + "grad_norm": 20.68201446533203, + "learning_rate": 1.785549598068844e-07, + "logits/chosen": -2.3142597675323486, + "logits/rejected": -2.2893218994140625, + "logps/chosen": -162.87265014648438, + "logps/rejected": -180.55992126464844, + "loss": 0.6424, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0877201557159424, + "rewards/margins": 0.24460335075855255, + "rewards/rejected": -1.3323233127593994, + "step": 5070 + }, + { + "epoch": 0.8752584424534804, + "grad_norm": 25.388307571411133, + "learning_rate": 1.7843074244906075e-07, + "logits/chosen": -2.499162197113037, + "logits/rejected": -2.461242198944092, + "logps/chosen": -154.22413635253906, + "logps/rejected": -178.7101287841797, + "loss": 0.5933, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9620031118392944, + "rewards/margins": 0.3197912871837616, + "rewards/rejected": -1.2817943096160889, + "step": 5080 + }, + { + "epoch": 0.8769813921433495, + "grad_norm": 27.99173355102539, + "learning_rate": 1.7830620984643597e-07, + "logits/chosen": -2.353234052658081, + "logits/rejected": -2.3224358558654785, + "logps/chosen": -174.60372924804688, + "logps/rejected": -208.08639526367188, + "loss": 0.578, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1949115991592407, + "rewards/margins": 0.3782222867012024, + "rewards/rejected": -1.573133945465088, + "step": 5090 + }, + { + "epoch": 0.8787043418332184, + "grad_norm": 37.61249542236328, + "learning_rate": 1.7818136249955678e-07, + "logits/chosen": -2.229835271835327, + "logits/rejected": -2.1953840255737305, + "logps/chosen": -207.0838165283203, + "logps/rejected": -227.10610961914062, + "loss": 0.6509, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5098870992660522, + "rewards/margins": 0.2407897710800171, + "rewards/rejected": -1.7506768703460693, + "step": 5100 + }, + { + "epoch": 0.8804272915230875, + "grad_norm": 25.037174224853516, + "learning_rate": 1.7805620091023505e-07, + "logits/chosen": -2.3433327674865723, + "logits/rejected": -2.314168930053711, + "logps/chosen": -204.13002014160156, + "logps/rejected": -220.486572265625, + "loss": 0.6719, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4547873735427856, + "rewards/margins": 0.21724343299865723, + "rewards/rejected": -1.672031044960022, + "step": 5110 + }, + { + "epoch": 0.8821502412129566, + "grad_norm": 22.87247657775879, + "learning_rate": 1.7793072558154573e-07, + "logits/chosen": -2.378096103668213, + "logits/rejected": -2.348536252975464, + "logps/chosen": -164.9044647216797, + "logps/rejected": -187.47647094726562, + "loss": 0.6194, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1046850681304932, + "rewards/margins": 0.27192893624305725, + "rewards/rejected": -1.376613974571228, + "step": 5120 + }, + { + "epoch": 0.8838731909028257, + "grad_norm": 24.949138641357422, + "learning_rate": 1.778049370178248e-07, + "logits/chosen": -2.429076671600342, + "logits/rejected": -2.3960869312286377, + "logps/chosen": -163.94699096679688, + "logps/rejected": -187.69308471679688, + "loss": 0.5998, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0935906171798706, + "rewards/margins": 0.29233506321907043, + "rewards/rejected": -1.3859257698059082, + "step": 5130 + }, + { + "epoch": 0.8855961405926946, + "grad_norm": 23.759702682495117, + "learning_rate": 1.7767883572466726e-07, + "logits/chosen": -2.3825690746307373, + "logits/rejected": -2.352921724319458, + "logps/chosen": -161.80252075195312, + "logps/rejected": -172.0554656982422, + "loss": 0.6553, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.0280125141143799, + "rewards/margins": 0.17422954738140106, + "rewards/rejected": -1.2022418975830078, + "step": 5140 + }, + { + "epoch": 0.8873190902825637, + "grad_norm": 21.064332962036133, + "learning_rate": 1.7755242220892507e-07, + "logits/chosen": -2.3944315910339355, + "logits/rejected": -2.3759605884552, + "logps/chosen": -147.7733154296875, + "logps/rejected": -173.36581420898438, + "loss": 0.6133, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9367380142211914, + "rewards/margins": 0.258679062128067, + "rewards/rejected": -1.1954171657562256, + "step": 5150 + }, + { + "epoch": 0.8890420399724328, + "grad_norm": 28.148576736450195, + "learning_rate": 1.7742569697870512e-07, + "logits/chosen": -2.40571665763855, + "logits/rejected": -2.3768887519836426, + "logps/chosen": -152.49459838867188, + "logps/rejected": -167.98675537109375, + "loss": 0.6408, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.999171257019043, + "rewards/margins": 0.20267994701862335, + "rewards/rejected": -1.2018513679504395, + "step": 5160 + }, + { + "epoch": 0.8907649896623019, + "grad_norm": 19.74530792236328, + "learning_rate": 1.7729866054336734e-07, + "logits/chosen": -2.3742120265960693, + "logits/rejected": -2.345825672149658, + "logps/chosen": -163.65151977539062, + "logps/rejected": -194.41513061523438, + "loss": 0.5832, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0627447366714478, + "rewards/margins": 0.3682798743247986, + "rewards/rejected": -1.4310245513916016, + "step": 5170 + }, + { + "epoch": 0.892487939352171, + "grad_norm": 24.458284378051758, + "learning_rate": 1.7717131341352235e-07, + "logits/chosen": -2.4376699924468994, + "logits/rejected": -2.4229021072387695, + "logps/chosen": -189.02191162109375, + "logps/rejected": -206.2496795654297, + "loss": 0.6558, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3285324573516846, + "rewards/margins": 0.1917664110660553, + "rewards/rejected": -1.520298957824707, + "step": 5180 + }, + { + "epoch": 0.8942108890420399, + "grad_norm": 22.02104377746582, + "learning_rate": 1.770436561010297e-07, + "logits/chosen": -2.3305296897888184, + "logits/rejected": -2.3187999725341797, + "logps/chosen": -180.78086853027344, + "logps/rejected": -199.74057006835938, + "loss": 0.6602, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2771596908569336, + "rewards/margins": 0.18791182339191437, + "rewards/rejected": -1.465071439743042, + "step": 5190 + }, + { + "epoch": 0.895933838731909, + "grad_norm": 23.871702194213867, + "learning_rate": 1.7691568911899556e-07, + "logits/chosen": -2.412647008895874, + "logits/rejected": -2.384337902069092, + "logps/chosen": -174.80752563476562, + "logps/rejected": -202.4107208251953, + "loss": 0.609, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.219252347946167, + "rewards/margins": 0.27993494272232056, + "rewards/rejected": -1.4991872310638428, + "step": 5200 + }, + { + "epoch": 0.895933838731909, + "eval_logits/chosen": -2.4079904556274414, + "eval_logits/rejected": -2.3969969749450684, + "eval_logps/chosen": -158.2065887451172, + "eval_logps/rejected": -179.52780151367188, + "eval_loss": 0.6414390206336975, + "eval_rewards/accuracies": 0.616403341293335, + "eval_rewards/chosen": -0.9919112324714661, + "eval_rewards/margins": 0.1758705973625183, + "eval_rewards/rejected": -1.1677817106246948, + "eval_runtime": 382.9928, + "eval_samples_per_second": 11.238, + "eval_steps_per_second": 1.405, + "step": 5200 + }, + { + "epoch": 0.8976567884217781, + "grad_norm": 30.867982864379883, + "learning_rate": 1.7678741298177092e-07, + "logits/chosen": -2.3551595211029053, + "logits/rejected": -2.3347275257110596, + "logps/chosen": -173.14175415039062, + "logps/rejected": -192.95025634765625, + "loss": 0.6329, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2061526775360107, + "rewards/margins": 0.23082256317138672, + "rewards/rejected": -1.4369752407073975, + "step": 5210 + }, + { + "epoch": 0.8993797381116472, + "grad_norm": 17.44063377380371, + "learning_rate": 1.766588282049494e-07, + "logits/chosen": -2.377192258834839, + "logits/rejected": -2.361964702606201, + "logps/chosen": -160.16224670410156, + "logps/rejected": -179.97178649902344, + "loss": 0.6529, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0694485902786255, + "rewards/margins": 0.1831921637058258, + "rewards/rejected": -1.252640962600708, + "step": 5220 + }, + { + "epoch": 0.9011026878015161, + "grad_norm": 21.062334060668945, + "learning_rate": 1.7652993530536497e-07, + "logits/chosen": -2.3476855754852295, + "logits/rejected": -2.332172155380249, + "logps/chosen": -156.99844360351562, + "logps/rejected": -190.17465209960938, + "loss": 0.5777, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9969981908798218, + "rewards/margins": 0.3543012738227844, + "rewards/rejected": -1.3512994050979614, + "step": 5230 + }, + { + "epoch": 0.9028256374913852, + "grad_norm": 26.4909610748291, + "learning_rate": 1.764007348010903e-07, + "logits/chosen": -2.314365863800049, + "logits/rejected": -2.2813925743103027, + "logps/chosen": -164.72598266601562, + "logps/rejected": -209.2113800048828, + "loss": 0.5639, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1105834245681763, + "rewards/margins": 0.4559425413608551, + "rewards/rejected": -1.566525936126709, + "step": 5240 + }, + { + "epoch": 0.9045485871812543, + "grad_norm": 30.440879821777344, + "learning_rate": 1.762712272114343e-07, + "logits/chosen": -2.2364721298217773, + "logits/rejected": -2.218665599822998, + "logps/chosen": -184.82301330566406, + "logps/rejected": -216.07345581054688, + "loss": 0.5961, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3264563083648682, + "rewards/margins": 0.3302633464336395, + "rewards/rejected": -1.6567198038101196, + "step": 5250 + }, + { + "epoch": 0.9062715368711234, + "grad_norm": 35.71326446533203, + "learning_rate": 1.7614141305694029e-07, + "logits/chosen": -2.2694973945617676, + "logits/rejected": -2.2302098274230957, + "logps/chosen": -191.68019104003906, + "logps/rejected": -218.50619506835938, + "loss": 0.6129, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3447864055633545, + "rewards/margins": 0.33376210927963257, + "rewards/rejected": -1.6785485744476318, + "step": 5260 + }, + { + "epoch": 0.9079944865609925, + "grad_norm": 26.6447696685791, + "learning_rate": 1.7601129285938364e-07, + "logits/chosen": -2.3759639263153076, + "logits/rejected": -2.351966381072998, + "logps/chosen": -187.26132202148438, + "logps/rejected": -216.2740478515625, + "loss": 0.6218, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.343926191329956, + "rewards/margins": 0.3137759268283844, + "rewards/rejected": -1.657702088356018, + "step": 5270 + }, + { + "epoch": 0.9097174362508614, + "grad_norm": 29.428714752197266, + "learning_rate": 1.7588086714177003e-07, + "logits/chosen": -2.320923328399658, + "logits/rejected": -2.2858223915100098, + "logps/chosen": -187.89013671875, + "logps/rejected": -211.83566284179688, + "loss": 0.6178, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3284814357757568, + "rewards/margins": 0.2929961681365967, + "rewards/rejected": -1.621477484703064, + "step": 5280 + }, + { + "epoch": 0.9114403859407305, + "grad_norm": 25.15055274963379, + "learning_rate": 1.7575013642833295e-07, + "logits/chosen": -2.295776844024658, + "logits/rejected": -2.2582039833068848, + "logps/chosen": -174.5590057373047, + "logps/rejected": -198.55734252929688, + "loss": 0.6163, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1752634048461914, + "rewards/margins": 0.30027860403060913, + "rewards/rejected": -1.4755419492721558, + "step": 5290 + }, + { + "epoch": 0.9131633356305996, + "grad_norm": 22.768001556396484, + "learning_rate": 1.7561910124453195e-07, + "logits/chosen": -2.313915729522705, + "logits/rejected": -2.2940611839294434, + "logps/chosen": -169.04296875, + "logps/rejected": -199.2408447265625, + "loss": 0.5988, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1217864751815796, + "rewards/margins": 0.3264429271221161, + "rewards/rejected": -1.4482295513153076, + "step": 5300 + }, + { + "epoch": 0.9148862853204687, + "grad_norm": 25.457565307617188, + "learning_rate": 1.7548776211705034e-07, + "logits/chosen": -2.3798553943634033, + "logits/rejected": -2.3660120964050293, + "logps/chosen": -183.68081665039062, + "logps/rejected": -200.96652221679688, + "loss": 0.6349, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2447845935821533, + "rewards/margins": 0.24797436594963074, + "rewards/rejected": -1.4927589893341064, + "step": 5310 + }, + { + "epoch": 0.9166092350103378, + "grad_norm": 27.405920028686523, + "learning_rate": 1.7535611957379302e-07, + "logits/chosen": -2.317561149597168, + "logits/rejected": -2.263969898223877, + "logps/chosen": -192.5063018798828, + "logps/rejected": -217.94100952148438, + "loss": 0.5931, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3361905813217163, + "rewards/margins": 0.35336703062057495, + "rewards/rejected": -1.689557433128357, + "step": 5320 + }, + { + "epoch": 0.9183321847002067, + "grad_norm": 22.66073989868164, + "learning_rate": 1.7522417414388446e-07, + "logits/chosen": -2.248110294342041, + "logits/rejected": -2.2363169193267822, + "logps/chosen": -207.2706298828125, + "logps/rejected": -251.25704956054688, + "loss": 0.5868, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5597829818725586, + "rewards/margins": 0.4321991503238678, + "rewards/rejected": -1.9919822216033936, + "step": 5330 + }, + { + "epoch": 0.9200551343900758, + "grad_norm": 29.119892120361328, + "learning_rate": 1.7509192635766664e-07, + "logits/chosen": -2.262770414352417, + "logits/rejected": -2.2210538387298584, + "logps/chosen": -215.5501708984375, + "logps/rejected": -244.3115692138672, + "loss": 0.5908, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5742887258529663, + "rewards/margins": 0.3592612147331238, + "rewards/rejected": -1.9335498809814453, + "step": 5340 + }, + { + "epoch": 0.9217780840799449, + "grad_norm": 20.251754760742188, + "learning_rate": 1.7495937674669675e-07, + "logits/chosen": -2.272902011871338, + "logits/rejected": -2.245993137359619, + "logps/chosen": -193.52423095703125, + "logps/rejected": -218.2841339111328, + "loss": 0.6288, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3998287916183472, + "rewards/margins": 0.2747870087623596, + "rewards/rejected": -1.6746160984039307, + "step": 5350 + }, + { + "epoch": 0.923501033769814, + "grad_norm": 27.8018856048584, + "learning_rate": 1.7482652584374514e-07, + "logits/chosen": -2.339388847351074, + "logits/rejected": -2.3210878372192383, + "logps/chosen": -181.2473602294922, + "logps/rejected": -221.5663299560547, + "loss": 0.5801, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.266035556793213, + "rewards/margins": 0.38970786333084106, + "rewards/rejected": -1.6557432413101196, + "step": 5360 + }, + { + "epoch": 0.9252239834596829, + "grad_norm": 25.928937911987305, + "learning_rate": 1.7469337418279325e-07, + "logits/chosen": -2.283285140991211, + "logits/rejected": -2.267566680908203, + "logps/chosen": -182.66424560546875, + "logps/rejected": -206.1977996826172, + "loss": 0.6343, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2891303300857544, + "rewards/margins": 0.2453964203596115, + "rewards/rejected": -1.5345267057418823, + "step": 5370 + }, + { + "epoch": 0.926946933149552, + "grad_norm": 24.324359893798828, + "learning_rate": 1.7455992229903133e-07, + "logits/chosen": -2.3700790405273438, + "logits/rejected": -2.339428424835205, + "logps/chosen": -183.50149536132812, + "logps/rejected": -204.10726928710938, + "loss": 0.6165, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2439554929733276, + "rewards/margins": 0.2912697494029999, + "rewards/rejected": -1.53522527217865, + "step": 5380 + }, + { + "epoch": 0.9286698828394211, + "grad_norm": 35.17918395996094, + "learning_rate": 1.7442617072885627e-07, + "logits/chosen": -2.2917733192443848, + "logits/rejected": -2.2521700859069824, + "logps/chosen": -189.371826171875, + "logps/rejected": -213.36972045898438, + "loss": 0.6084, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3059356212615967, + "rewards/margins": 0.3214091360569, + "rewards/rejected": -1.6273447275161743, + "step": 5390 + }, + { + "epoch": 0.9303928325292902, + "grad_norm": 25.27411651611328, + "learning_rate": 1.7429212000986965e-07, + "logits/chosen": -2.3175926208496094, + "logits/rejected": -2.293071985244751, + "logps/chosen": -167.50717163085938, + "logps/rejected": -209.6229248046875, + "loss": 0.5747, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1563806533813477, + "rewards/margins": 0.40642791986465454, + "rewards/rejected": -1.562808632850647, + "step": 5400 + }, + { + "epoch": 0.9321157822191593, + "grad_norm": 40.84873962402344, + "learning_rate": 1.7415777068087545e-07, + "logits/chosen": -2.3327441215515137, + "logits/rejected": -2.3204197883605957, + "logps/chosen": -183.882080078125, + "logps/rejected": -197.12765502929688, + "loss": 0.6669, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.25808584690094, + "rewards/margins": 0.17094475030899048, + "rewards/rejected": -1.4290306568145752, + "step": 5410 + }, + { + "epoch": 0.9338387319090282, + "grad_norm": 31.94063949584961, + "learning_rate": 1.7402312328187776e-07, + "logits/chosen": -2.3866710662841797, + "logits/rejected": -2.3662643432617188, + "logps/chosen": -176.82510375976562, + "logps/rejected": -197.71966552734375, + "loss": 0.6343, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2160383462905884, + "rewards/margins": 0.23323245346546173, + "rewards/rejected": -1.4492708444595337, + "step": 5420 + }, + { + "epoch": 0.9355616815988973, + "grad_norm": 28.33039665222168, + "learning_rate": 1.7388817835407884e-07, + "logits/chosen": -2.353658676147461, + "logits/rejected": -2.3364737033843994, + "logps/chosen": -170.1305694580078, + "logps/rejected": -195.0408172607422, + "loss": 0.6107, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1594005823135376, + "rewards/margins": 0.2835468053817749, + "rewards/rejected": -1.4429473876953125, + "step": 5430 + }, + { + "epoch": 0.9372846312887664, + "grad_norm": 26.746353149414062, + "learning_rate": 1.737529364398768e-07, + "logits/chosen": -2.3374722003936768, + "logits/rejected": -2.3144500255584717, + "logps/chosen": -180.90878295898438, + "logps/rejected": -211.0740509033203, + "loss": 0.6003, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2434018850326538, + "rewards/margins": 0.32336291670799255, + "rewards/rejected": -1.5667648315429688, + "step": 5440 + }, + { + "epoch": 0.9390075809786355, + "grad_norm": 43.07821273803711, + "learning_rate": 1.7361739808286343e-07, + "logits/chosen": -2.2768020629882812, + "logits/rejected": -2.2596564292907715, + "logps/chosen": -194.68710327148438, + "logps/rejected": -224.31982421875, + "loss": 0.6064, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3785072565078735, + "rewards/margins": 0.33700355887413025, + "rewards/rejected": -1.7155107259750366, + "step": 5450 + }, + { + "epoch": 0.9407305306685044, + "grad_norm": 33.96110916137695, + "learning_rate": 1.7348156382782215e-07, + "logits/chosen": -2.274019956588745, + "logits/rejected": -2.2504825592041016, + "logps/chosen": -198.7681884765625, + "logps/rejected": -219.91854858398438, + "loss": 0.6386, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4427436590194702, + "rewards/margins": 0.24368536472320557, + "rewards/rejected": -1.6864287853240967, + "step": 5460 + }, + { + "epoch": 0.9424534803583735, + "grad_norm": 24.447280883789062, + "learning_rate": 1.733454342207256e-07, + "logits/chosen": -2.259782075881958, + "logits/rejected": -2.245398998260498, + "logps/chosen": -197.2213897705078, + "logps/rejected": -221.02444458007812, + "loss": 0.6565, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.4507030248641968, + "rewards/margins": 0.25823861360549927, + "rewards/rejected": -1.7089416980743408, + "step": 5470 + }, + { + "epoch": 0.9441764300482426, + "grad_norm": 32.96196746826172, + "learning_rate": 1.732090098087336e-07, + "logits/chosen": -2.270646572113037, + "logits/rejected": -2.243563652038574, + "logps/chosen": -195.88558959960938, + "logps/rejected": -227.06527709960938, + "loss": 0.5953, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4136393070220947, + "rewards/margins": 0.3479146361351013, + "rewards/rejected": -1.7615541219711304, + "step": 5480 + }, + { + "epoch": 0.9458993797381117, + "grad_norm": 19.306108474731445, + "learning_rate": 1.7307229114019091e-07, + "logits/chosen": -2.2577857971191406, + "logits/rejected": -2.2329039573669434, + "logps/chosen": -195.81765747070312, + "logps/rejected": -209.193359375, + "loss": 0.6551, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3897420167922974, + "rewards/margins": 0.19239407777786255, + "rewards/rejected": -1.5821361541748047, + "step": 5490 + }, + { + "epoch": 0.9476223294279807, + "grad_norm": 26.862762451171875, + "learning_rate": 1.7293527876462504e-07, + "logits/chosen": -2.3610310554504395, + "logits/rejected": -2.340768575668335, + "logps/chosen": -176.81341552734375, + "logps/rejected": -212.5503387451172, + "loss": 0.585, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2214715480804443, + "rewards/margins": 0.3627847731113434, + "rewards/rejected": -1.5842561721801758, + "step": 5500 + }, + { + "epoch": 0.9493452791178497, + "grad_norm": 23.03242301940918, + "learning_rate": 1.72797973232744e-07, + "logits/chosen": -2.288675308227539, + "logits/rejected": -2.272275447845459, + "logps/chosen": -187.26974487304688, + "logps/rejected": -210.4651336669922, + "loss": 0.6331, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.336241364479065, + "rewards/margins": 0.24365845322608948, + "rewards/rejected": -1.5798996686935425, + "step": 5510 + }, + { + "epoch": 0.9510682288077188, + "grad_norm": 24.83022117614746, + "learning_rate": 1.726603750964341e-07, + "logits/chosen": -2.2835960388183594, + "logits/rejected": -2.2611312866210938, + "logps/chosen": -185.1580810546875, + "logps/rejected": -211.9181365966797, + "loss": 0.5992, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2729332447052002, + "rewards/margins": 0.33776092529296875, + "rewards/rejected": -1.610694169998169, + "step": 5520 + }, + { + "epoch": 0.9527911784975879, + "grad_norm": 20.41211700439453, + "learning_rate": 1.725224849087578e-07, + "logits/chosen": -2.3336851596832275, + "logits/rejected": -2.2991302013397217, + "logps/chosen": -187.03628540039062, + "logps/rejected": -202.1573486328125, + "loss": 0.6331, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2588682174682617, + "rewards/margins": 0.24945969879627228, + "rewards/rejected": -1.508327841758728, + "step": 5530 + }, + { + "epoch": 0.954514128187457, + "grad_norm": 23.775251388549805, + "learning_rate": 1.723843032239514e-07, + "logits/chosen": -2.3340952396392822, + "logits/rejected": -2.3236658573150635, + "logps/chosen": -169.35516357421875, + "logps/rejected": -204.19137573242188, + "loss": 0.6068, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1559011936187744, + "rewards/margins": 0.32853391766548157, + "rewards/rejected": -1.4844350814819336, + "step": 5540 + }, + { + "epoch": 0.956237077877326, + "grad_norm": 23.238468170166016, + "learning_rate": 1.722458305974229e-07, + "logits/chosen": -2.251826763153076, + "logits/rejected": -2.2365410327911377, + "logps/chosen": -165.81344604492188, + "logps/rejected": -181.73863220214844, + "loss": 0.6747, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.1286054849624634, + "rewards/margins": 0.1507066935300827, + "rewards/rejected": -1.279312252998352, + "step": 5550 + }, + { + "epoch": 0.957960027567195, + "grad_norm": 27.57474708557129, + "learning_rate": 1.7210706758574957e-07, + "logits/chosen": -2.3349127769470215, + "logits/rejected": -2.307807207107544, + "logps/chosen": -145.54527282714844, + "logps/rejected": -165.34902954101562, + "loss": 0.6212, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9249517321586609, + "rewards/margins": 0.24045920372009277, + "rewards/rejected": -1.1654109954833984, + "step": 5560 + }, + { + "epoch": 0.9596829772570641, + "grad_norm": 18.97833824157715, + "learning_rate": 1.71968014746676e-07, + "logits/chosen": -2.3977248668670654, + "logits/rejected": -2.3879504203796387, + "logps/chosen": -142.7986297607422, + "logps/rejected": -164.1881866455078, + "loss": 0.647, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.908721923828125, + "rewards/margins": 0.1725355088710785, + "rewards/rejected": -1.0812573432922363, + "step": 5570 + }, + { + "epoch": 0.9614059269469332, + "grad_norm": 24.691076278686523, + "learning_rate": 1.7182867263911163e-07, + "logits/chosen": -2.3252456188201904, + "logits/rejected": -2.3095462322235107, + "logps/chosen": -144.26171875, + "logps/rejected": -170.215087890625, + "loss": 0.6079, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.901009738445282, + "rewards/margins": 0.26762694120407104, + "rewards/rejected": -1.1686367988586426, + "step": 5580 + }, + { + "epoch": 0.9631288766368022, + "grad_norm": 22.33702850341797, + "learning_rate": 1.7168904182312863e-07, + "logits/chosen": -2.4048874378204346, + "logits/rejected": -2.365469455718994, + "logps/chosen": -154.96484375, + "logps/rejected": -179.8811492919922, + "loss": 0.6156, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9779122471809387, + "rewards/margins": 0.27446627616882324, + "rewards/rejected": -1.2523784637451172, + "step": 5590 + }, + { + "epoch": 0.9648518263266712, + "grad_norm": 23.034870147705078, + "learning_rate": 1.715491228599596e-07, + "logits/chosen": -2.3783340454101562, + "logits/rejected": -2.3681511878967285, + "logps/chosen": -158.8350830078125, + "logps/rejected": -193.6468505859375, + "loss": 0.5977, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.056706190109253, + "rewards/margins": 0.31611010432243347, + "rewards/rejected": -1.3728163242340088, + "step": 5600 + }, + { + "epoch": 0.9648518263266712, + "eval_logits/chosen": -2.404153347015381, + "eval_logits/rejected": -2.3932902812957764, + "eval_logps/chosen": -150.67100524902344, + "eval_logps/rejected": -170.78880310058594, + "eval_loss": 0.643197238445282, + "eval_rewards/accuracies": 0.6273234486579895, + "eval_rewards/chosen": -0.9165552258491516, + "eval_rewards/margins": 0.1638367623090744, + "eval_rewards/rejected": -1.0803918838500977, + "eval_runtime": 383.0072, + "eval_samples_per_second": 11.237, + "eval_steps_per_second": 1.405, + "step": 5600 + }, + { + "epoch": 0.9665747760165403, + "grad_norm": 24.86272430419922, + "learning_rate": 1.7140891631199533e-07, + "logits/chosen": -2.353733777999878, + "logits/rejected": -2.333936929702759, + "logps/chosen": -175.52102661132812, + "logps/rejected": -216.1461181640625, + "loss": 0.5819, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.226787805557251, + "rewards/margins": 0.3806641697883606, + "rewards/rejected": -1.6074520349502563, + "step": 5610 + }, + { + "epoch": 0.9682977257064094, + "grad_norm": 36.44122314453125, + "learning_rate": 1.7126842274278245e-07, + "logits/chosen": -2.2590763568878174, + "logits/rejected": -2.2382168769836426, + "logps/chosen": -203.39913940429688, + "logps/rejected": -224.2606658935547, + "loss": 0.6405, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.479689598083496, + "rewards/margins": 0.26059383153915405, + "rewards/rejected": -1.7402836084365845, + "step": 5620 + }, + { + "epoch": 0.9700206753962785, + "grad_norm": 26.534252166748047, + "learning_rate": 1.7112764271702135e-07, + "logits/chosen": -2.3296680450439453, + "logits/rejected": -2.298461675643921, + "logps/chosen": -189.0676727294922, + "logps/rejected": -214.03378295898438, + "loss": 0.6184, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3230987787246704, + "rewards/margins": 0.3118281364440918, + "rewards/rejected": -1.6349267959594727, + "step": 5630 + }, + { + "epoch": 0.9717436250861475, + "grad_norm": 26.497467041015625, + "learning_rate": 1.7098657680056373e-07, + "logits/chosen": -2.3253607749938965, + "logits/rejected": -2.3052287101745605, + "logps/chosen": -165.99765014648438, + "logps/rejected": -201.12632751464844, + "loss": 0.6061, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1070231199264526, + "rewards/margins": 0.35045483708381653, + "rewards/rejected": -1.4574780464172363, + "step": 5640 + }, + { + "epoch": 0.9734665747760165, + "grad_norm": 16.514123916625977, + "learning_rate": 1.7084522556041049e-07, + "logits/chosen": -2.2654869556427, + "logits/rejected": -2.243253469467163, + "logps/chosen": -162.08592224121094, + "logps/rejected": -198.29437255859375, + "loss": 0.5992, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1030868291854858, + "rewards/margins": 0.36720800399780273, + "rewards/rejected": -1.4702950716018677, + "step": 5650 + }, + { + "epoch": 0.9751895244658856, + "grad_norm": 24.211523056030273, + "learning_rate": 1.7070358956470923e-07, + "logits/chosen": -2.2788543701171875, + "logits/rejected": -2.263421058654785, + "logps/chosen": -169.59774780273438, + "logps/rejected": -196.69061279296875, + "loss": 0.6203, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1747559309005737, + "rewards/margins": 0.2927972674369812, + "rewards/rejected": -1.4675531387329102, + "step": 5660 + }, + { + "epoch": 0.9769124741557547, + "grad_norm": 26.447824478149414, + "learning_rate": 1.705616693827522e-07, + "logits/chosen": -2.276493787765503, + "logits/rejected": -2.2538936138153076, + "logps/chosen": -184.60488891601562, + "logps/rejected": -212.2623748779297, + "loss": 0.6032, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.292311429977417, + "rewards/margins": 0.30952146649360657, + "rewards/rejected": -1.6018329858779907, + "step": 5670 + }, + { + "epoch": 0.9786354238456237, + "grad_norm": 48.45136260986328, + "learning_rate": 1.7041946558497388e-07, + "logits/chosen": -2.2704081535339355, + "logits/rejected": -2.2329323291778564, + "logps/chosen": -212.3938751220703, + "logps/rejected": -245.57284545898438, + "loss": 0.5841, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5729804039001465, + "rewards/margins": 0.3733724355697632, + "rewards/rejected": -1.9463527202606201, + "step": 5680 + }, + { + "epoch": 0.9803583735354927, + "grad_norm": 27.563390731811523, + "learning_rate": 1.7027697874294867e-07, + "logits/chosen": -2.284641742706299, + "logits/rejected": -2.244694471359253, + "logps/chosen": -222.4856414794922, + "logps/rejected": -256.3599853515625, + "loss": 0.5792, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6246564388275146, + "rewards/margins": 0.4309469759464264, + "rewards/rejected": -2.055603504180908, + "step": 5690 + }, + { + "epoch": 0.9820813232253618, + "grad_norm": 80.95060729980469, + "learning_rate": 1.7013420942938876e-07, + "logits/chosen": -2.1855034828186035, + "logits/rejected": -2.1641812324523926, + "logps/chosen": -216.79989624023438, + "logps/rejected": -245.7619171142578, + "loss": 0.6365, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5874277353286743, + "rewards/margins": 0.339304655790329, + "rewards/rejected": -1.9267324209213257, + "step": 5700 + }, + { + "epoch": 0.9838042729152309, + "grad_norm": 24.923404693603516, + "learning_rate": 1.6999115821814155e-07, + "logits/chosen": -2.312826633453369, + "logits/rejected": -2.2875611782073975, + "logps/chosen": -202.64010620117188, + "logps/rejected": -239.53475952148438, + "loss": 0.5897, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.464739441871643, + "rewards/margins": 0.41259440779685974, + "rewards/rejected": -1.8773338794708252, + "step": 5710 + }, + { + "epoch": 0.9855272226051, + "grad_norm": 25.859790802001953, + "learning_rate": 1.6984782568418766e-07, + "logits/chosen": -2.2918834686279297, + "logits/rejected": -2.259382963180542, + "logps/chosen": -179.59243774414062, + "logps/rejected": -216.98263549804688, + "loss": 0.5698, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2602999210357666, + "rewards/margins": 0.42483147978782654, + "rewards/rejected": -1.6851314306259155, + "step": 5720 + }, + { + "epoch": 0.987250172294969, + "grad_norm": 29.39626693725586, + "learning_rate": 1.697042124036383e-07, + "logits/chosen": -2.349903106689453, + "logits/rejected": -2.3303074836730957, + "logps/chosen": -182.60787963867188, + "logps/rejected": -214.63491821289062, + "loss": 0.6162, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2972140312194824, + "rewards/margins": 0.33766549825668335, + "rewards/rejected": -1.634879469871521, + "step": 5730 + }, + { + "epoch": 0.988973121984838, + "grad_norm": 26.878019332885742, + "learning_rate": 1.6956031895373327e-07, + "logits/chosen": -2.27702260017395, + "logits/rejected": -2.2376182079315186, + "logps/chosen": -189.97862243652344, + "logps/rejected": -228.1022186279297, + "loss": 0.5716, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.303370475769043, + "rewards/margins": 0.44443821907043457, + "rewards/rejected": -1.7478086948394775, + "step": 5740 + }, + { + "epoch": 0.9906960716747071, + "grad_norm": 26.576335906982422, + "learning_rate": 1.6941614591283834e-07, + "logits/chosen": -2.340158462524414, + "logits/rejected": -2.319542407989502, + "logps/chosen": -189.45004272460938, + "logps/rejected": -203.67274475097656, + "loss": 0.65, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3123247623443604, + "rewards/margins": 0.22555959224700928, + "rewards/rejected": -1.5378843545913696, + "step": 5750 + }, + { + "epoch": 0.9924190213645762, + "grad_norm": 21.105655670166016, + "learning_rate": 1.6927169386044313e-07, + "logits/chosen": -2.3314685821533203, + "logits/rejected": -2.3016769886016846, + "logps/chosen": -168.71255493164062, + "logps/rejected": -194.60256958007812, + "loss": 0.6264, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1167190074920654, + "rewards/margins": 0.2818935811519623, + "rewards/rejected": -1.398612380027771, + "step": 5760 + }, + { + "epoch": 0.9941419710544452, + "grad_norm": 26.614208221435547, + "learning_rate": 1.691269633771588e-07, + "logits/chosen": -2.2715003490448, + "logits/rejected": -2.2406039237976074, + "logps/chosen": -163.55667114257812, + "logps/rejected": -199.54388427734375, + "loss": 0.5905, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1084734201431274, + "rewards/margins": 0.35367265343666077, + "rewards/rejected": -1.4621461629867554, + "step": 5770 + }, + { + "epoch": 0.9958649207443143, + "grad_norm": 25.3371524810791, + "learning_rate": 1.6898195504471552e-07, + "logits/chosen": -2.252403497695923, + "logits/rejected": -2.225072145462036, + "logps/chosen": -195.75697326660156, + "logps/rejected": -231.71957397460938, + "loss": 0.6095, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.423097014427185, + "rewards/margins": 0.3620423674583435, + "rewards/rejected": -1.7851394414901733, + "step": 5780 + }, + { + "epoch": 0.9975878704341833, + "grad_norm": 25.647926330566406, + "learning_rate": 1.688366694459603e-07, + "logits/chosen": -2.201721668243408, + "logits/rejected": -2.1597495079040527, + "logps/chosen": -188.7086181640625, + "logps/rejected": -233.0748291015625, + "loss": 0.5789, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3738667964935303, + "rewards/margins": 0.41834861040115356, + "rewards/rejected": -1.792215347290039, + "step": 5790 + }, + { + "epoch": 0.9993108201240524, + "grad_norm": 22.43425941467285, + "learning_rate": 1.6869110716485456e-07, + "logits/chosen": -2.3050150871276855, + "logits/rejected": -2.269188165664673, + "logps/chosen": -186.7384033203125, + "logps/rejected": -226.17269897460938, + "loss": 0.5916, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.349612832069397, + "rewards/margins": 0.4007296562194824, + "rewards/rejected": -1.750342607498169, + "step": 5800 + }, + { + "epoch": 1.0010337698139213, + "grad_norm": 24.5869197845459, + "learning_rate": 1.6854526878647186e-07, + "logits/chosen": -2.322516918182373, + "logits/rejected": -2.297133207321167, + "logps/chosen": -185.1601104736328, + "logps/rejected": -216.9307098388672, + "loss": 0.612, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2970949411392212, + "rewards/margins": 0.3414715826511383, + "rewards/rejected": -1.6385663747787476, + "step": 5810 + }, + { + "epoch": 1.0027567195037905, + "grad_norm": 27.19829750061035, + "learning_rate": 1.6839915489699545e-07, + "logits/chosen": -2.316255807876587, + "logits/rejected": -2.269670009613037, + "logps/chosen": -188.31253051757812, + "logps/rejected": -236.2301483154297, + "loss": 0.5283, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.343603491783142, + "rewards/margins": 0.5325624346733093, + "rewards/rejected": -1.8761659860610962, + "step": 5820 + }, + { + "epoch": 1.0044796691936595, + "grad_norm": 46.75299835205078, + "learning_rate": 1.682527660837161e-07, + "logits/chosen": -2.220951557159424, + "logits/rejected": -2.1911635398864746, + "logps/chosen": -211.2012176513672, + "logps/rejected": -245.8602294921875, + "loss": 0.6076, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5650999546051025, + "rewards/margins": 0.377856969833374, + "rewards/rejected": -1.9429566860198975, + "step": 5830 + }, + { + "epoch": 1.0062026188835287, + "grad_norm": 21.179813385009766, + "learning_rate": 1.6810610293502944e-07, + "logits/chosen": -2.259906053543091, + "logits/rejected": -2.233856439590454, + "logps/chosen": -202.88821411132812, + "logps/rejected": -246.70016479492188, + "loss": 0.5882, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4954583644866943, + "rewards/margins": 0.4338740408420563, + "rewards/rejected": -1.9293327331542969, + "step": 5840 + }, + { + "epoch": 1.0079255685733977, + "grad_norm": 23.58659553527832, + "learning_rate": 1.679591660404339e-07, + "logits/chosen": -2.303954601287842, + "logits/rejected": -2.271392345428467, + "logps/chosen": -183.2528076171875, + "logps/rejected": -234.5919647216797, + "loss": 0.5451, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.316575050354004, + "rewards/margins": 0.4936788082122803, + "rewards/rejected": -1.8102538585662842, + "step": 5850 + }, + { + "epoch": 1.0096485182632666, + "grad_norm": 21.694034576416016, + "learning_rate": 1.6781195599052807e-07, + "logits/chosen": -2.206890106201172, + "logits/rejected": -2.181102991104126, + "logps/chosen": -190.34397888183594, + "logps/rejected": -238.74783325195312, + "loss": 0.5629, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.372124433517456, + "rewards/margins": 0.49931803345680237, + "rewards/rejected": -1.8714425563812256, + "step": 5860 + }, + { + "epoch": 1.0113714679531358, + "grad_norm": 26.660905838012695, + "learning_rate": 1.6766447337700865e-07, + "logits/chosen": -2.247291088104248, + "logits/rejected": -2.2175240516662598, + "logps/chosen": -198.14712524414062, + "logps/rejected": -241.3429412841797, + "loss": 0.5602, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.415946125984192, + "rewards/margins": 0.4595022201538086, + "rewards/rejected": -1.875448226928711, + "step": 5870 + }, + { + "epoch": 1.0130944176430048, + "grad_norm": 25.15273094177246, + "learning_rate": 1.6751671879266769e-07, + "logits/chosen": -2.2789111137390137, + "logits/rejected": -2.254117965698242, + "logps/chosen": -191.4722137451172, + "logps/rejected": -225.45297241210938, + "loss": 0.5961, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3582924604415894, + "rewards/margins": 0.37257465720176697, + "rewards/rejected": -1.7308671474456787, + "step": 5880 + }, + { + "epoch": 1.014817367332874, + "grad_norm": 32.9681510925293, + "learning_rate": 1.673686928313905e-07, + "logits/chosen": -2.307744264602661, + "logits/rejected": -2.2846286296844482, + "logps/chosen": -187.4040985107422, + "logps/rejected": -220.6470184326172, + "loss": 0.6146, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3102037906646729, + "rewards/margins": 0.32465046644210815, + "rewards/rejected": -1.6348543167114258, + "step": 5890 + }, + { + "epoch": 1.016540317022743, + "grad_norm": 26.85865020751953, + "learning_rate": 1.6722039608815315e-07, + "logits/chosen": -2.2252914905548096, + "logits/rejected": -2.1969614028930664, + "logps/chosen": -187.75491333007812, + "logps/rejected": -230.99462890625, + "loss": 0.5611, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3111248016357422, + "rewards/margins": 0.4657079577445984, + "rewards/rejected": -1.7768325805664062, + "step": 5900 + }, + { + "epoch": 1.018263266712612, + "grad_norm": 24.568252563476562, + "learning_rate": 1.670718291590201e-07, + "logits/chosen": -2.2330894470214844, + "logits/rejected": -2.2195515632629395, + "logps/chosen": -185.08482360839844, + "logps/rejected": -224.62344360351562, + "loss": 0.5979, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3206746578216553, + "rewards/margins": 0.35854750871658325, + "rewards/rejected": -1.6792221069335938, + "step": 5910 + }, + { + "epoch": 1.019986216402481, + "grad_norm": 35.23897171020508, + "learning_rate": 1.6692299264114178e-07, + "logits/chosen": -2.1981778144836426, + "logits/rejected": -2.1714072227478027, + "logps/chosen": -200.17672729492188, + "logps/rejected": -229.83621215820312, + "loss": 0.6394, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4837713241577148, + "rewards/margins": 0.28825488686561584, + "rewards/rejected": -1.7720263004302979, + "step": 5920 + }, + { + "epoch": 1.02170916609235, + "grad_norm": 22.59590721130371, + "learning_rate": 1.6677388713275224e-07, + "logits/chosen": -2.262528657913208, + "logits/rejected": -2.239509105682373, + "logps/chosen": -191.60922241210938, + "logps/rejected": -227.3905029296875, + "loss": 0.6215, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4099931716918945, + "rewards/margins": 0.3598439693450928, + "rewards/rejected": -1.7698370218276978, + "step": 5930 + }, + { + "epoch": 1.0234321157822193, + "grad_norm": 20.984752655029297, + "learning_rate": 1.6662451323316663e-07, + "logits/chosen": -2.261719226837158, + "logits/rejected": -2.2246999740600586, + "logps/chosen": -164.25592041015625, + "logps/rejected": -204.17506408691406, + "loss": 0.5654, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0919129848480225, + "rewards/margins": 0.4257412850856781, + "rewards/rejected": -1.5176541805267334, + "step": 5940 + }, + { + "epoch": 1.0251550654720882, + "grad_norm": 27.647113800048828, + "learning_rate": 1.6647487154277897e-07, + "logits/chosen": -2.2034451961517334, + "logits/rejected": -2.184757709503174, + "logps/chosen": -173.5892791748047, + "logps/rejected": -207.5119171142578, + "loss": 0.5837, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1839631795883179, + "rewards/margins": 0.3621217608451843, + "rewards/rejected": -1.546085000038147, + "step": 5950 + }, + { + "epoch": 1.0268780151619572, + "grad_norm": 24.661508560180664, + "learning_rate": 1.6632496266305958e-07, + "logits/chosen": -2.2062265872955322, + "logits/rejected": -2.164212703704834, + "logps/chosen": -191.03012084960938, + "logps/rejected": -208.45529174804688, + "loss": 0.6431, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3232829570770264, + "rewards/margins": 0.2540973126888275, + "rewards/rejected": -1.5773800611495972, + "step": 5960 + }, + { + "epoch": 1.0286009648518264, + "grad_norm": 24.430320739746094, + "learning_rate": 1.661747871965527e-07, + "logits/chosen": -2.2344512939453125, + "logits/rejected": -2.209048271179199, + "logps/chosen": -177.21939086914062, + "logps/rejected": -210.49755859375, + "loss": 0.5936, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2416160106658936, + "rewards/margins": 0.3695695698261261, + "rewards/rejected": -1.6111854314804077, + "step": 5970 + }, + { + "epoch": 1.0303239145416954, + "grad_norm": 25.474464416503906, + "learning_rate": 1.6602434574687417e-07, + "logits/chosen": -2.223212718963623, + "logits/rejected": -2.207329511642456, + "logps/chosen": -168.71603393554688, + "logps/rejected": -211.7668914794922, + "loss": 0.575, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1495542526245117, + "rewards/margins": 0.4093645215034485, + "rewards/rejected": -1.5589187145233154, + "step": 5980 + }, + { + "epoch": 1.0320468642315643, + "grad_norm": 24.24454689025879, + "learning_rate": 1.658736389187089e-07, + "logits/chosen": -2.284614086151123, + "logits/rejected": -2.246558666229248, + "logps/chosen": -173.7165069580078, + "logps/rejected": -209.68753051757812, + "loss": 0.5746, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1815001964569092, + "rewards/margins": 0.40414220094680786, + "rewards/rejected": -1.5856424570083618, + "step": 5990 + }, + { + "epoch": 1.0337698139214335, + "grad_norm": 35.86470031738281, + "learning_rate": 1.6572266731780842e-07, + "logits/chosen": -2.2233195304870605, + "logits/rejected": -2.196089744567871, + "logps/chosen": -201.31045532226562, + "logps/rejected": -242.53005981445312, + "loss": 0.5845, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.48953378200531, + "rewards/margins": 0.42532163858413696, + "rewards/rejected": -1.9148553609848022, + "step": 6000 + }, + { + "epoch": 1.0337698139214335, + "eval_logits/chosen": -2.2815613746643066, + "eval_logits/rejected": -2.2640297412872314, + "eval_logps/chosen": -195.87583923339844, + "eval_logps/rejected": -223.07241821289062, + "eval_loss": 0.6437787413597107, + "eval_rewards/accuracies": 0.624535322189331, + "eval_rewards/chosen": -1.3686038255691528, + "eval_rewards/margins": 0.23462428152561188, + "eval_rewards/rejected": -1.6032280921936035, + "eval_runtime": 383.8708, + "eval_samples_per_second": 11.212, + "eval_steps_per_second": 1.402, + "step": 6000 + }, + { + "epoch": 1.0354927636113025, + "grad_norm": 32.003517150878906, + "learning_rate": 1.655714315509885e-07, + "logits/chosen": -2.2215774059295654, + "logits/rejected": -2.2042527198791504, + "logps/chosen": -207.7661895751953, + "logps/rejected": -231.06192016601562, + "loss": 0.661, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.5392072200775146, + "rewards/margins": 0.268566370010376, + "rewards/rejected": -1.8077733516693115, + "step": 6010 + }, + { + "epoch": 1.0372157133011717, + "grad_norm": 22.12173843383789, + "learning_rate": 1.654199322261267e-07, + "logits/chosen": -2.349522352218628, + "logits/rejected": -2.317288875579834, + "logps/chosen": -193.19227600097656, + "logps/rejected": -214.86181640625, + "loss": 0.645, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3733975887298584, + "rewards/margins": 0.286784827709198, + "rewards/rejected": -1.6601823568344116, + "step": 6020 + }, + { + "epoch": 1.0389386629910407, + "grad_norm": 22.20186996459961, + "learning_rate": 1.6526816995215995e-07, + "logits/chosen": -2.1700615882873535, + "logits/rejected": -2.143908739089966, + "logps/chosen": -164.04891967773438, + "logps/rejected": -192.51254272460938, + "loss": 0.6245, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.1379988193511963, + "rewards/margins": 0.28763145208358765, + "rewards/rejected": -1.4256302118301392, + "step": 6030 + }, + { + "epoch": 1.0406616126809096, + "grad_norm": 30.76277732849121, + "learning_rate": 1.651161453390821e-07, + "logits/chosen": -2.309840202331543, + "logits/rejected": -2.2857794761657715, + "logps/chosen": -154.37887573242188, + "logps/rejected": -181.8328094482422, + "loss": 0.5997, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0026304721832275, + "rewards/margins": 0.30957844853401184, + "rewards/rejected": -1.3122087717056274, + "step": 6040 + }, + { + "epoch": 1.0423845623707788, + "grad_norm": 24.196666717529297, + "learning_rate": 1.6496385899794135e-07, + "logits/chosen": -2.2179017066955566, + "logits/rejected": -2.1860344409942627, + "logps/chosen": -193.98912048339844, + "logps/rejected": -227.5110626220703, + "loss": 0.5726, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.335597038269043, + "rewards/margins": 0.41937771439552307, + "rewards/rejected": -1.7549747228622437, + "step": 6050 + }, + { + "epoch": 1.0441075120606478, + "grad_norm": 28.210494995117188, + "learning_rate": 1.64811311540838e-07, + "logits/chosen": -2.2067856788635254, + "logits/rejected": -2.1841800212860107, + "logps/chosen": -208.82028198242188, + "logps/rejected": -239.56918334960938, + "loss": 0.6251, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.547461748123169, + "rewards/margins": 0.3352636694908142, + "rewards/rejected": -1.882725477218628, + "step": 6060 + }, + { + "epoch": 1.045830461750517, + "grad_norm": 42.14088439941406, + "learning_rate": 1.6465850358092184e-07, + "logits/chosen": -2.2099831104278564, + "logits/rejected": -2.1768100261688232, + "logps/chosen": -210.5271759033203, + "logps/rejected": -253.0932159423828, + "loss": 0.5568, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5555570125579834, + "rewards/margins": 0.4656600058078766, + "rewards/rejected": -2.021216869354248, + "step": 6070 + }, + { + "epoch": 1.047553411440386, + "grad_norm": 21.90705680847168, + "learning_rate": 1.645054357323897e-07, + "logits/chosen": -2.2115724086761475, + "logits/rejected": -2.1864047050476074, + "logps/chosen": -218.80557250976562, + "logps/rejected": -249.6724090576172, + "loss": 0.603, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5844274759292603, + "rewards/margins": 0.390811026096344, + "rewards/rejected": -1.9752384424209595, + "step": 6080 + }, + { + "epoch": 1.049276361130255, + "grad_norm": 23.00667381286621, + "learning_rate": 1.6435210861048302e-07, + "logits/chosen": -2.263542413711548, + "logits/rejected": -2.2285208702087402, + "logps/chosen": -197.87942504882812, + "logps/rejected": -247.3519287109375, + "loss": 0.5377, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4560437202453613, + "rewards/margins": 0.5109397768974304, + "rewards/rejected": -1.9669831991195679, + "step": 6090 + }, + { + "epoch": 1.050999310820124, + "grad_norm": 27.286256790161133, + "learning_rate": 1.6419852283148535e-07, + "logits/chosen": -2.259631872177124, + "logits/rejected": -2.231081485748291, + "logps/chosen": -197.79969787597656, + "logps/rejected": -248.64035034179688, + "loss": 0.5464, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3961166143417358, + "rewards/margins": 0.5630569458007812, + "rewards/rejected": -1.9591734409332275, + "step": 6100 + }, + { + "epoch": 1.052722260509993, + "grad_norm": 30.823551177978516, + "learning_rate": 1.6404467901271998e-07, + "logits/chosen": -2.211205244064331, + "logits/rejected": -2.180361032485962, + "logps/chosen": -212.4109344482422, + "logps/rejected": -263.77099609375, + "loss": 0.5624, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5987329483032227, + "rewards/margins": 0.5072905421257019, + "rewards/rejected": -2.1060233116149902, + "step": 6110 + }, + { + "epoch": 1.0544452101998623, + "grad_norm": 40.18345642089844, + "learning_rate": 1.6389057777254722e-07, + "logits/chosen": -2.2784905433654785, + "logits/rejected": -2.2230658531188965, + "logps/chosen": -229.3824462890625, + "logps/rejected": -293.1062316894531, + "loss": 0.5076, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7395166158676147, + "rewards/margins": 0.669109582901001, + "rewards/rejected": -2.408626079559326, + "step": 6120 + }, + { + "epoch": 1.0561681598897312, + "grad_norm": 25.442590713500977, + "learning_rate": 1.6373621973036224e-07, + "logits/chosen": -2.1934523582458496, + "logits/rejected": -2.1601486206054688, + "logps/chosen": -239.6211700439453, + "logps/rejected": -286.9929504394531, + "loss": 0.5719, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8426984548568726, + "rewards/margins": 0.4940398335456848, + "rewards/rejected": -2.336738109588623, + "step": 6130 + }, + { + "epoch": 1.0578911095796002, + "grad_norm": 49.917945861816406, + "learning_rate": 1.6358160550659213e-07, + "logits/chosen": -2.2199857234954834, + "logits/rejected": -2.187546968460083, + "logps/chosen": -225.39395141601562, + "logps/rejected": -270.90447998046875, + "loss": 0.5977, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.7169145345687866, + "rewards/margins": 0.4762873649597168, + "rewards/rejected": -2.193201780319214, + "step": 6140 + }, + { + "epoch": 1.0596140592694694, + "grad_norm": 22.244651794433594, + "learning_rate": 1.6342673572269398e-07, + "logits/chosen": -2.197420597076416, + "logits/rejected": -2.1729331016540527, + "logps/chosen": -213.5404052734375, + "logps/rejected": -249.8678741455078, + "loss": 0.6162, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6002448797225952, + "rewards/margins": 0.4091455340385437, + "rewards/rejected": -2.009390354156494, + "step": 6150 + }, + { + "epoch": 1.0613370089593384, + "grad_norm": 28.97780418395996, + "learning_rate": 1.632716110011519e-07, + "logits/chosen": -2.186614513397217, + "logits/rejected": -2.165234088897705, + "logps/chosen": -201.4184112548828, + "logps/rejected": -235.50375366210938, + "loss": 0.6203, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4898805618286133, + "rewards/margins": 0.3466905355453491, + "rewards/rejected": -1.8365710973739624, + "step": 6160 + }, + { + "epoch": 1.0630599586492075, + "grad_norm": 32.25000762939453, + "learning_rate": 1.6311623196547474e-07, + "logits/chosen": -2.2768733501434326, + "logits/rejected": -2.2490344047546387, + "logps/chosen": -219.40673828125, + "logps/rejected": -263.5992126464844, + "loss": 0.5747, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5952725410461426, + "rewards/margins": 0.47955289483070374, + "rewards/rejected": -2.0748257637023926, + "step": 6170 + }, + { + "epoch": 1.0647829083390765, + "grad_norm": 25.306297302246094, + "learning_rate": 1.6296059924019353e-07, + "logits/chosen": -2.275926351547241, + "logits/rejected": -2.2481603622436523, + "logps/chosen": -205.8504638671875, + "logps/rejected": -236.63137817382812, + "loss": 0.5982, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.46571946144104, + "rewards/margins": 0.3783527910709381, + "rewards/rejected": -1.8440719842910767, + "step": 6180 + }, + { + "epoch": 1.0665058580289455, + "grad_norm": 29.48764419555664, + "learning_rate": 1.6280471345085901e-07, + "logits/chosen": -2.2747912406921387, + "logits/rejected": -2.246176242828369, + "logps/chosen": -203.13949584960938, + "logps/rejected": -234.68856811523438, + "loss": 0.6018, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4325004816055298, + "rewards/margins": 0.39268478751182556, + "rewards/rejected": -1.8251851797103882, + "step": 6190 + }, + { + "epoch": 1.0682288077188147, + "grad_norm": 40.502254486083984, + "learning_rate": 1.6264857522403906e-07, + "logits/chosen": -2.1964752674102783, + "logits/rejected": -2.1596341133117676, + "logps/chosen": -186.2880859375, + "logps/rejected": -229.4107208251953, + "loss": 0.5771, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3087577819824219, + "rewards/margins": 0.44233304262161255, + "rewards/rejected": -1.7510907649993896, + "step": 6200 + }, + { + "epoch": 1.0699517574086836, + "grad_norm": 32.917842864990234, + "learning_rate": 1.6249218518731623e-07, + "logits/chosen": -2.2598016262054443, + "logits/rejected": -2.2300467491149902, + "logps/chosen": -195.76866149902344, + "logps/rejected": -231.24526977539062, + "loss": 0.5751, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.394277811050415, + "rewards/margins": 0.3902966380119324, + "rewards/rejected": -1.784574270248413, + "step": 6210 + }, + { + "epoch": 1.0716747070985528, + "grad_norm": 23.806379318237305, + "learning_rate": 1.6233554396928515e-07, + "logits/chosen": -2.2467989921569824, + "logits/rejected": -2.22261381149292, + "logps/chosen": -194.40060424804688, + "logps/rejected": -229.7987518310547, + "loss": 0.61, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4125274419784546, + "rewards/margins": 0.3693087697029114, + "rewards/rejected": -1.7818362712860107, + "step": 6220 + }, + { + "epoch": 1.0733976567884218, + "grad_norm": 26.85658073425293, + "learning_rate": 1.6217865219955008e-07, + "logits/chosen": -2.338665008544922, + "logits/rejected": -2.3019931316375732, + "logps/chosen": -192.6621551513672, + "logps/rejected": -246.43557739257812, + "loss": 0.5288, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3202307224273682, + "rewards/margins": 0.5889842510223389, + "rewards/rejected": -1.909214973449707, + "step": 6230 + }, + { + "epoch": 1.0751206064782908, + "grad_norm": 29.26054573059082, + "learning_rate": 1.6202151050872242e-07, + "logits/chosen": -2.1933677196502686, + "logits/rejected": -2.163956880569458, + "logps/chosen": -201.94302368164062, + "logps/rejected": -236.05807495117188, + "loss": 0.6181, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.449880599975586, + "rewards/margins": 0.38049185276031494, + "rewards/rejected": -1.8303724527359009, + "step": 6240 + }, + { + "epoch": 1.07684355616816, + "grad_norm": 32.749603271484375, + "learning_rate": 1.618641195284179e-07, + "logits/chosen": -2.305246353149414, + "logits/rejected": -2.2783963680267334, + "logps/chosen": -173.74647521972656, + "logps/rejected": -196.07456970214844, + "loss": 0.6333, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.168368935585022, + "rewards/margins": 0.26043254137039185, + "rewards/rejected": -1.428801417350769, + "step": 6250 + }, + { + "epoch": 1.078566505858029, + "grad_norm": 16.783361434936523, + "learning_rate": 1.6170647989125455e-07, + "logits/chosen": -2.2322545051574707, + "logits/rejected": -2.208922863006592, + "logps/chosen": -155.6553955078125, + "logps/rejected": -178.28318786621094, + "loss": 0.6247, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0038013458251953, + "rewards/margins": 0.2728382647037506, + "rewards/rejected": -1.2766395807266235, + "step": 6260 + }, + { + "epoch": 1.080289455547898, + "grad_norm": 15.635846138000488, + "learning_rate": 1.6154859223084953e-07, + "logits/chosen": -2.4459521770477295, + "logits/rejected": -2.439850330352783, + "logps/chosen": -144.04306030273438, + "logps/rejected": -171.6459197998047, + "loss": 0.6206, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9036776423454285, + "rewards/margins": 0.2726871371269226, + "rewards/rejected": -1.1763646602630615, + "step": 6270 + }, + { + "epoch": 1.082012405237767, + "grad_norm": 20.35181999206543, + "learning_rate": 1.613904571818171e-07, + "logits/chosen": -2.2114737033843994, + "logits/rejected": -2.184677839279175, + "logps/chosen": -146.02078247070312, + "logps/rejected": -174.31607055664062, + "loss": 0.5935, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9112964868545532, + "rewards/margins": 0.3144376873970032, + "rewards/rejected": -1.225733995437622, + "step": 6280 + }, + { + "epoch": 1.083735354927636, + "grad_norm": 19.11197853088379, + "learning_rate": 1.6123207537976588e-07, + "logits/chosen": -2.2452197074890137, + "logits/rejected": -2.2174201011657715, + "logps/chosen": -162.3245086669922, + "logps/rejected": -197.2689971923828, + "loss": 0.5868, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1011226177215576, + "rewards/margins": 0.35962381958961487, + "rewards/rejected": -1.4607465267181396, + "step": 6290 + }, + { + "epoch": 1.0854583046175053, + "grad_norm": 23.40118980407715, + "learning_rate": 1.6107344746129622e-07, + "logits/chosen": -2.261756658554077, + "logits/rejected": -2.2390553951263428, + "logps/chosen": -180.0264434814453, + "logps/rejected": -208.5572052001953, + "loss": 0.6152, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2631686925888062, + "rewards/margins": 0.3133259117603302, + "rewards/rejected": -1.5764944553375244, + "step": 6300 + }, + { + "epoch": 1.0871812543073742, + "grad_norm": 38.912540435791016, + "learning_rate": 1.609145740639977e-07, + "logits/chosen": -2.254859447479248, + "logits/rejected": -2.2240333557128906, + "logps/chosen": -165.6407470703125, + "logps/rejected": -196.86428833007812, + "loss": 0.6129, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1381008625030518, + "rewards/margins": 0.3193897306919098, + "rewards/rejected": -1.4574905633926392, + "step": 6310 + }, + { + "epoch": 1.0889042039972432, + "grad_norm": 22.782106399536133, + "learning_rate": 1.6075545582644663e-07, + "logits/chosen": -2.222576856613159, + "logits/rejected": -2.1984236240386963, + "logps/chosen": -162.3043670654297, + "logps/rejected": -195.146484375, + "loss": 0.5999, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.068186640739441, + "rewards/margins": 0.3570778965950012, + "rewards/rejected": -1.425264596939087, + "step": 6320 + }, + { + "epoch": 1.0906271536871124, + "grad_norm": 20.50153350830078, + "learning_rate": 1.6059609338820342e-07, + "logits/chosen": -2.2440178394317627, + "logits/rejected": -2.2133121490478516, + "logps/chosen": -166.8011016845703, + "logps/rejected": -211.500732421875, + "loss": 0.5555, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1194723844528198, + "rewards/margins": 0.47705450654029846, + "rewards/rejected": -1.5965269804000854, + "step": 6330 + }, + { + "epoch": 1.0923501033769814, + "grad_norm": 28.833908081054688, + "learning_rate": 1.6043648738981e-07, + "logits/chosen": -2.269195556640625, + "logits/rejected": -2.2403371334075928, + "logps/chosen": -185.642822265625, + "logps/rejected": -217.7083740234375, + "loss": 0.5996, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2849600315093994, + "rewards/margins": 0.36721131205558777, + "rewards/rejected": -1.6521714925765991, + "step": 6340 + }, + { + "epoch": 1.0940730530668505, + "grad_norm": 36.53247833251953, + "learning_rate": 1.6027663847278725e-07, + "logits/chosen": -2.1933300495147705, + "logits/rejected": -2.176132917404175, + "logps/chosen": -200.3306884765625, + "logps/rejected": -239.5429229736328, + "loss": 0.5821, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4361860752105713, + "rewards/margins": 0.4046160578727722, + "rewards/rejected": -1.8408019542694092, + "step": 6350 + }, + { + "epoch": 1.0957960027567195, + "grad_norm": 23.467296600341797, + "learning_rate": 1.6011654727963252e-07, + "logits/chosen": -2.1536242961883545, + "logits/rejected": -2.1328327655792236, + "logps/chosen": -211.01321411132812, + "logps/rejected": -261.79376220703125, + "loss": 0.5684, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5633403062820435, + "rewards/margins": 0.5159338712692261, + "rewards/rejected": -2.0792741775512695, + "step": 6360 + }, + { + "epoch": 1.0975189524465885, + "grad_norm": 28.605619430541992, + "learning_rate": 1.599562144538169e-07, + "logits/chosen": -2.1722331047058105, + "logits/rejected": -2.1576180458068848, + "logps/chosen": -221.28628540039062, + "logps/rejected": -250.8852996826172, + "loss": 0.6574, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.660936951637268, + "rewards/margins": 0.3105929493904114, + "rewards/rejected": -1.9715297222137451, + "step": 6370 + }, + { + "epoch": 1.0992419021364577, + "grad_norm": 31.780277252197266, + "learning_rate": 1.597956406397827e-07, + "logits/chosen": -2.2239651679992676, + "logits/rejected": -2.1921591758728027, + "logps/chosen": -212.7681121826172, + "logps/rejected": -263.39239501953125, + "loss": 0.5588, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5899945497512817, + "rewards/margins": 0.4881893992424011, + "rewards/rejected": -2.078183889389038, + "step": 6380 + }, + { + "epoch": 1.1009648518263266, + "grad_norm": 30.17275047302246, + "learning_rate": 1.5963482648294085e-07, + "logits/chosen": -2.259859085083008, + "logits/rejected": -2.2155401706695557, + "logps/chosen": -197.69004821777344, + "logps/rejected": -241.78903198242188, + "loss": 0.5508, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4024711847305298, + "rewards/margins": 0.5164901614189148, + "rewards/rejected": -1.9189612865447998, + "step": 6390 + }, + { + "epoch": 1.1026878015161956, + "grad_norm": 30.444303512573242, + "learning_rate": 1.5947377262966842e-07, + "logits/chosen": -2.223111629486084, + "logits/rejected": -2.1871323585510254, + "logps/chosen": -206.0340118408203, + "logps/rejected": -249.349365234375, + "loss": 0.5789, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4722647666931152, + "rewards/margins": 0.4635472893714905, + "rewards/rejected": -1.9358123540878296, + "step": 6400 + }, + { + "epoch": 1.1026878015161956, + "eval_logits/chosen": -2.2595252990722656, + "eval_logits/rejected": -2.242840528488159, + "eval_logps/chosen": -197.83056640625, + "eval_logps/rejected": -224.87249755859375, + "eval_loss": 0.6455348134040833, + "eval_rewards/accuracies": 0.616403341293335, + "eval_rewards/chosen": -1.388150691986084, + "eval_rewards/margins": 0.23307843506336212, + "eval_rewards/rejected": -1.6212290525436401, + "eval_runtime": 384.0489, + "eval_samples_per_second": 11.207, + "eval_steps_per_second": 1.401, + "step": 6400 + }, + { + "epoch": 1.1044107512060648, + "grad_norm": 39.10361862182617, + "learning_rate": 1.5931247972730572e-07, + "logits/chosen": -2.2497591972351074, + "logits/rejected": -2.220106601715088, + "logps/chosen": -230.1884765625, + "logps/rejected": -270.73980712890625, + "loss": 0.6077, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7419331073760986, + "rewards/margins": 0.4585306644439697, + "rewards/rejected": -2.2004637718200684, + "step": 6410 + }, + { + "epoch": 1.1061337008959338, + "grad_norm": 24.094772338867188, + "learning_rate": 1.591509484241541e-07, + "logits/chosen": -2.2276716232299805, + "logits/rejected": -2.200310230255127, + "logps/chosen": -219.2551727294922, + "logps/rejected": -252.8025665283203, + "loss": 0.6332, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6354185342788696, + "rewards/margins": 0.36073872447013855, + "rewards/rejected": -1.9961572885513306, + "step": 6420 + }, + { + "epoch": 1.107856650585803, + "grad_norm": 26.919458389282227, + "learning_rate": 1.5898917936947297e-07, + "logits/chosen": -2.1865882873535156, + "logits/rejected": -2.1732869148254395, + "logps/chosen": -189.06442260742188, + "logps/rejected": -213.3035125732422, + "loss": 0.6421, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3809840679168701, + "rewards/margins": 0.2711717486381531, + "rewards/rejected": -1.652155876159668, + "step": 6430 + }, + { + "epoch": 1.109579600275672, + "grad_norm": 21.60903549194336, + "learning_rate": 1.5882717321347752e-07, + "logits/chosen": -2.241295099258423, + "logits/rejected": -2.215834379196167, + "logps/chosen": -186.6502685546875, + "logps/rejected": -218.45486450195312, + "loss": 0.6103, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3116780519485474, + "rewards/margins": 0.34163540601730347, + "rewards/rejected": -1.653313398361206, + "step": 6440 + }, + { + "epoch": 1.111302549965541, + "grad_norm": 21.67346954345703, + "learning_rate": 1.5866493060733576e-07, + "logits/chosen": -2.2527382373809814, + "logits/rejected": -2.21574330329895, + "logps/chosen": -177.25755310058594, + "logps/rejected": -212.2408905029297, + "loss": 0.5797, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1989918947219849, + "rewards/margins": 0.4047037959098816, + "rewards/rejected": -1.6036956310272217, + "step": 6450 + }, + { + "epoch": 1.11302549965541, + "grad_norm": 26.42724609375, + "learning_rate": 1.585024522031663e-07, + "logits/chosen": -2.1832480430603027, + "logits/rejected": -2.1704015731811523, + "logps/chosen": -181.95785522460938, + "logps/rejected": -235.64987182617188, + "loss": 0.5557, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2918856143951416, + "rewards/margins": 0.498269259929657, + "rewards/rejected": -1.7901546955108643, + "step": 6460 + }, + { + "epoch": 1.114748449345279, + "grad_norm": 27.41206932067871, + "learning_rate": 1.5833973865403533e-07, + "logits/chosen": -2.124350070953369, + "logits/rejected": -2.0984079837799072, + "logps/chosen": -197.56777954101562, + "logps/rejected": -235.68283081054688, + "loss": 0.5946, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.433988332748413, + "rewards/margins": 0.4133404791355133, + "rewards/rejected": -1.8473289012908936, + "step": 6470 + }, + { + "epoch": 1.1164713990351482, + "grad_norm": 23.416603088378906, + "learning_rate": 1.5817679061395426e-07, + "logits/chosen": -2.2085471153259277, + "logits/rejected": -2.1690986156463623, + "logps/chosen": -193.58267211914062, + "logps/rejected": -229.81869506835938, + "loss": 0.5803, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3631269931793213, + "rewards/margins": 0.43150943517684937, + "rewards/rejected": -1.7946363687515259, + "step": 6480 + }, + { + "epoch": 1.1181943487250172, + "grad_norm": 27.429529190063477, + "learning_rate": 1.5801360873787704e-07, + "logits/chosen": -2.3445587158203125, + "logits/rejected": -2.3242859840393066, + "logps/chosen": -196.75357055664062, + "logps/rejected": -231.71450805664062, + "loss": 0.6102, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4204045534133911, + "rewards/margins": 0.36140257120132446, + "rewards/rejected": -1.78180730342865, + "step": 6490 + }, + { + "epoch": 1.1199172984148862, + "grad_norm": 25.771257400512695, + "learning_rate": 1.5785019368169748e-07, + "logits/chosen": -2.2539565563201904, + "logits/rejected": -2.233546733856201, + "logps/chosen": -189.86428833007812, + "logps/rejected": -215.73965454101562, + "loss": 0.6098, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3247458934783936, + "rewards/margins": 0.3122237026691437, + "rewards/rejected": -1.6369695663452148, + "step": 6500 + }, + { + "epoch": 1.1216402481047554, + "grad_norm": 27.679521560668945, + "learning_rate": 1.5768654610224664e-07, + "logits/chosen": -2.2240753173828125, + "logits/rejected": -2.1769585609436035, + "logps/chosen": -189.4766845703125, + "logps/rejected": -226.5435028076172, + "loss": 0.5855, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3213176727294922, + "rewards/margins": 0.42212343215942383, + "rewards/rejected": -1.7434409856796265, + "step": 6510 + }, + { + "epoch": 1.1233631977946243, + "grad_norm": 35.53666305541992, + "learning_rate": 1.575226666572901e-07, + "logits/chosen": -2.2380692958831787, + "logits/rejected": -2.206503391265869, + "logps/chosen": -176.08033752441406, + "logps/rejected": -212.88357543945312, + "loss": 0.5733, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2142693996429443, + "rewards/margins": 0.40591007471084595, + "rewards/rejected": -1.620179533958435, + "step": 6520 + }, + { + "epoch": 1.1250861474844935, + "grad_norm": 16.802722930908203, + "learning_rate": 1.573585560055256e-07, + "logits/chosen": -2.190563201904297, + "logits/rejected": -2.149428367614746, + "logps/chosen": -183.85194396972656, + "logps/rejected": -234.85830688476562, + "loss": 0.5242, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2935733795166016, + "rewards/margins": 0.5506505966186523, + "rewards/rejected": -1.8442237377166748, + "step": 6530 + }, + { + "epoch": 1.1268090971743625, + "grad_norm": 31.97213363647461, + "learning_rate": 1.5719421480657996e-07, + "logits/chosen": -2.1805355548858643, + "logits/rejected": -2.1448729038238525, + "logps/chosen": -213.38833618164062, + "logps/rejected": -248.4428253173828, + "loss": 0.6279, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.553464412689209, + "rewards/margins": 0.37057262659072876, + "rewards/rejected": -1.924036979675293, + "step": 6540 + }, + { + "epoch": 1.1285320468642315, + "grad_norm": 36.40608215332031, + "learning_rate": 1.570296437210068e-07, + "logits/chosen": -2.133267641067505, + "logits/rejected": -2.1039319038391113, + "logps/chosen": -206.63760375976562, + "logps/rejected": -245.5586395263672, + "loss": 0.6107, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5502650737762451, + "rewards/margins": 0.39422768354415894, + "rewards/rejected": -1.9444925785064697, + "step": 6550 + }, + { + "epoch": 1.1302549965541007, + "grad_norm": 33.15107727050781, + "learning_rate": 1.5686484341028374e-07, + "logits/chosen": -2.217454195022583, + "logits/rejected": -2.178605079650879, + "logps/chosen": -198.23617553710938, + "logps/rejected": -239.9998321533203, + "loss": 0.5656, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.414640188217163, + "rewards/margins": 0.4772668778896332, + "rewards/rejected": -1.8919073343276978, + "step": 6560 + }, + { + "epoch": 1.1319779462439696, + "grad_norm": 29.158788681030273, + "learning_rate": 1.566998145368097e-07, + "logits/chosen": -2.2073724269866943, + "logits/rejected": -2.1710896492004395, + "logps/chosen": -200.75865173339844, + "logps/rejected": -242.34756469726562, + "loss": 0.5686, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4528988599777222, + "rewards/margins": 0.4519721567630768, + "rewards/rejected": -1.9048709869384766, + "step": 6570 + }, + { + "epoch": 1.1337008959338388, + "grad_norm": 26.331056594848633, + "learning_rate": 1.5653455776390235e-07, + "logits/chosen": -2.22875714302063, + "logits/rejected": -2.187915802001953, + "logps/chosen": -197.57884216308594, + "logps/rejected": -227.84872436523438, + "loss": 0.6095, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4123129844665527, + "rewards/margins": 0.37162190675735474, + "rewards/rejected": -1.7839349508285522, + "step": 6580 + }, + { + "epoch": 1.1354238456237078, + "grad_norm": 31.246917724609375, + "learning_rate": 1.563690737557953e-07, + "logits/chosen": -2.20268177986145, + "logits/rejected": -2.16776180267334, + "logps/chosen": -184.05455017089844, + "logps/rejected": -226.29769897460938, + "loss": 0.5749, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3052704334259033, + "rewards/margins": 0.42799264192581177, + "rewards/rejected": -1.7332630157470703, + "step": 6590 + }, + { + "epoch": 1.1371467953135768, + "grad_norm": 33.766841888427734, + "learning_rate": 1.562033631776356e-07, + "logits/chosen": -2.2406680583953857, + "logits/rejected": -2.210420608520508, + "logps/chosen": -203.17628479003906, + "logps/rejected": -243.853515625, + "loss": 0.5912, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4638261795043945, + "rewards/margins": 0.44317826628685, + "rewards/rejected": -1.9070043563842773, + "step": 6600 + }, + { + "epoch": 1.138869745003446, + "grad_norm": 26.146657943725586, + "learning_rate": 1.560374266954809e-07, + "logits/chosen": -2.1933908462524414, + "logits/rejected": -2.166749954223633, + "logps/chosen": -219.3288116455078, + "logps/rejected": -266.52911376953125, + "loss": 0.5595, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.662623405456543, + "rewards/margins": 0.49311646819114685, + "rewards/rejected": -2.1557400226593018, + "step": 6610 + }, + { + "epoch": 1.140592694693315, + "grad_norm": 30.15679359436035, + "learning_rate": 1.5587126497629686e-07, + "logits/chosen": -2.1265695095062256, + "logits/rejected": -2.0984911918640137, + "logps/chosen": -235.67855834960938, + "logps/rejected": -269.5155944824219, + "loss": 0.6436, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8247787952423096, + "rewards/margins": 0.3574926257133484, + "rewards/rejected": -2.1822714805603027, + "step": 6620 + }, + { + "epoch": 1.1423156443831841, + "grad_norm": 29.55496597290039, + "learning_rate": 1.557048786879545e-07, + "logits/chosen": -2.1744980812072754, + "logits/rejected": -2.1456077098846436, + "logps/chosen": -190.76315307617188, + "logps/rejected": -218.13583374023438, + "loss": 0.6088, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3612579107284546, + "rewards/margins": 0.3343326449394226, + "rewards/rejected": -1.6955903768539429, + "step": 6630 + }, + { + "epoch": 1.144038594073053, + "grad_norm": 29.953752517700195, + "learning_rate": 1.5553826849922747e-07, + "logits/chosen": -2.2497270107269287, + "logits/rejected": -2.2161362171173096, + "logps/chosen": -180.73580932617188, + "logps/rejected": -204.81265258789062, + "loss": 0.6203, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.261747121810913, + "rewards/margins": 0.30038005113601685, + "rewards/rejected": -1.5621273517608643, + "step": 6640 + }, + { + "epoch": 1.145761543762922, + "grad_norm": 26.446489334106445, + "learning_rate": 1.553714350797893e-07, + "logits/chosen": -2.290693759918213, + "logits/rejected": -2.2610902786254883, + "logps/chosen": -167.90994262695312, + "logps/rejected": -211.5741424560547, + "loss": 0.5604, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1483453512191772, + "rewards/margins": 0.43969884514808655, + "rewards/rejected": -1.5880441665649414, + "step": 6650 + }, + { + "epoch": 1.1474844934527912, + "grad_norm": 29.325246810913086, + "learning_rate": 1.5520437910021084e-07, + "logits/chosen": -2.2955827713012695, + "logits/rejected": -2.276799440383911, + "logps/chosen": -176.8847198486328, + "logps/rejected": -213.06546020507812, + "loss": 0.5975, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2398980855941772, + "rewards/margins": 0.360373854637146, + "rewards/rejected": -1.6002719402313232, + "step": 6660 + }, + { + "epoch": 1.1492074431426602, + "grad_norm": 24.36836814880371, + "learning_rate": 1.550371012319575e-07, + "logits/chosen": -2.17775821685791, + "logits/rejected": -2.1535000801086426, + "logps/chosen": -192.9346160888672, + "logps/rejected": -252.8832244873047, + "loss": 0.5325, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4218976497650146, + "rewards/margins": 0.5638034343719482, + "rewards/rejected": -1.9857012033462524, + "step": 6670 + }, + { + "epoch": 1.1509303928325294, + "grad_norm": 31.592592239379883, + "learning_rate": 1.5486960214738648e-07, + "logits/chosen": -2.1380467414855957, + "logits/rejected": -2.1065773963928223, + "logps/chosen": -217.91091918945312, + "logps/rejected": -257.14508056640625, + "loss": 0.6023, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6398321390151978, + "rewards/margins": 0.4262255132198334, + "rewards/rejected": -2.0660576820373535, + "step": 6680 + }, + { + "epoch": 1.1526533425223984, + "grad_norm": 22.85630226135254, + "learning_rate": 1.547018825197443e-07, + "logits/chosen": -2.188157081604004, + "logits/rejected": -2.1534061431884766, + "logps/chosen": -216.86978149414062, + "logps/rejected": -265.63018798828125, + "loss": 0.5528, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5875884294509888, + "rewards/margins": 0.5234032869338989, + "rewards/rejected": -2.1109914779663086, + "step": 6690 + }, + { + "epoch": 1.1543762922122673, + "grad_norm": 44.523284912109375, + "learning_rate": 1.5453394302316366e-07, + "logits/chosen": -2.1419711112976074, + "logits/rejected": -2.1201653480529785, + "logps/chosen": -236.85684204101562, + "logps/rejected": -283.3230895996094, + "loss": 0.5803, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7883342504501343, + "rewards/margins": 0.4932478368282318, + "rewards/rejected": -2.2815825939178467, + "step": 6700 + }, + { + "epoch": 1.1560992419021365, + "grad_norm": 26.147733688354492, + "learning_rate": 1.5436578433266126e-07, + "logits/chosen": -2.1709694862365723, + "logits/rejected": -2.128147602081299, + "logps/chosen": -219.7293243408203, + "logps/rejected": -272.3164978027344, + "loss": 0.5688, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.617134690284729, + "rewards/margins": 0.5824311375617981, + "rewards/rejected": -2.1995654106140137, + "step": 6710 + }, + { + "epoch": 1.1578221915920055, + "grad_norm": 32.670108795166016, + "learning_rate": 1.5419740712413472e-07, + "logits/chosen": -2.177053451538086, + "logits/rejected": -2.1449925899505615, + "logps/chosen": -198.5723876953125, + "logps/rejected": -241.6822967529297, + "loss": 0.5817, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.459101915359497, + "rewards/margins": 0.44461125135421753, + "rewards/rejected": -1.9037132263183594, + "step": 6720 + }, + { + "epoch": 1.1595451412818747, + "grad_norm": 30.65315818786621, + "learning_rate": 1.5402881207436e-07, + "logits/chosen": -2.1471118927001953, + "logits/rejected": -2.1220784187316895, + "logps/chosen": -207.4105682373047, + "logps/rejected": -236.3886260986328, + "loss": 0.6307, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4984217882156372, + "rewards/margins": 0.32371002435684204, + "rewards/rejected": -1.8221315145492554, + "step": 6730 + }, + { + "epoch": 1.1612680909717437, + "grad_norm": 37.31703567504883, + "learning_rate": 1.5385999986098858e-07, + "logits/chosen": -2.18902850151062, + "logits/rejected": -2.1660685539245605, + "logps/chosen": -197.50247192382812, + "logps/rejected": -240.78140258789062, + "loss": 0.5585, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.421494960784912, + "rewards/margins": 0.43881043791770935, + "rewards/rejected": -1.8603054285049438, + "step": 6740 + }, + { + "epoch": 1.1629910406616126, + "grad_norm": 39.52729797363281, + "learning_rate": 1.5369097116254493e-07, + "logits/chosen": -2.223015785217285, + "logits/rejected": -2.1959691047668457, + "logps/chosen": -207.1807861328125, + "logps/rejected": -252.77059936523438, + "loss": 0.5831, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5373988151550293, + "rewards/margins": 0.4599340856075287, + "rewards/rejected": -1.9973331689834595, + "step": 6750 + }, + { + "epoch": 1.1647139903514818, + "grad_norm": 37.49037170410156, + "learning_rate": 1.5352172665842351e-07, + "logits/chosen": -2.1724252700805664, + "logits/rejected": -2.137951374053955, + "logps/chosen": -202.9478302001953, + "logps/rejected": -243.4915771484375, + "loss": 0.5801, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.4955527782440186, + "rewards/margins": 0.4567495882511139, + "rewards/rejected": -1.9523022174835205, + "step": 6760 + }, + { + "epoch": 1.1664369400413508, + "grad_norm": 33.874122619628906, + "learning_rate": 1.5335226702888636e-07, + "logits/chosen": -2.20847749710083, + "logits/rejected": -2.1878161430358887, + "logps/chosen": -202.27294921875, + "logps/rejected": -246.615478515625, + "loss": 0.581, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4698498249053955, + "rewards/margins": 0.4451657831668854, + "rewards/rejected": -1.9150155782699585, + "step": 6770 + }, + { + "epoch": 1.1681598897312198, + "grad_norm": 26.217775344848633, + "learning_rate": 1.5318259295506004e-07, + "logits/chosen": -2.2057294845581055, + "logits/rejected": -2.170468330383301, + "logps/chosen": -199.53536987304688, + "logps/rejected": -232.54922485351562, + "loss": 0.6079, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4228346347808838, + "rewards/margins": 0.37124061584472656, + "rewards/rejected": -1.7940753698349, + "step": 6780 + }, + { + "epoch": 1.169882839421089, + "grad_norm": 24.48525619506836, + "learning_rate": 1.5301270511893315e-07, + "logits/chosen": -2.247795820236206, + "logits/rejected": -2.213374614715576, + "logps/chosen": -176.34242248535156, + "logps/rejected": -228.22158813476562, + "loss": 0.5359, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.24251389503479, + "rewards/margins": 0.5219536423683167, + "rewards/rejected": -1.764467477798462, + "step": 6790 + }, + { + "epoch": 1.171605789110958, + "grad_norm": 20.723039627075195, + "learning_rate": 1.5284260420335345e-07, + "logits/chosen": -2.151221513748169, + "logits/rejected": -2.111957311630249, + "logps/chosen": -192.44325256347656, + "logps/rejected": -236.78494262695312, + "loss": 0.5681, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3916516304016113, + "rewards/margins": 0.4804829955101013, + "rewards/rejected": -1.872134804725647, + "step": 6800 + }, + { + "epoch": 1.171605789110958, + "eval_logits/chosen": -2.2592737674713135, + "eval_logits/rejected": -2.2435038089752197, + "eval_logps/chosen": -192.49172973632812, + "eval_logps/rejected": -217.7540283203125, + "eval_loss": 0.6433852314949036, + "eval_rewards/accuracies": 0.6129181981086731, + "eval_rewards/chosen": -1.3347625732421875, + "eval_rewards/margins": 0.21528159081935883, + "eval_rewards/rejected": -1.550044298171997, + "eval_runtime": 382.3053, + "eval_samples_per_second": 11.258, + "eval_steps_per_second": 1.407, + "step": 6800 + }, + { + "epoch": 1.173328738800827, + "grad_norm": 28.285385131835938, + "learning_rate": 1.5267229089202514e-07, + "logits/chosen": -2.159170627593994, + "logits/rejected": -2.125225067138672, + "logps/chosen": -220.8817138671875, + "logps/rejected": -257.7799987792969, + "loss": 0.5872, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6409006118774414, + "rewards/margins": 0.4281826913356781, + "rewards/rejected": -2.0690832138061523, + "step": 6810 + }, + { + "epoch": 1.175051688490696, + "grad_norm": 24.356565475463867, + "learning_rate": 1.5250176586950615e-07, + "logits/chosen": -2.2474682331085205, + "logits/rejected": -2.2131118774414062, + "logps/chosen": -224.6580047607422, + "logps/rejected": -268.55230712890625, + "loss": 0.5788, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6821105480194092, + "rewards/margins": 0.485291063785553, + "rewards/rejected": -2.1674017906188965, + "step": 6820 + }, + { + "epoch": 1.176774638180565, + "grad_norm": 26.990726470947266, + "learning_rate": 1.523310298212054e-07, + "logits/chosen": -2.242975950241089, + "logits/rejected": -2.226395606994629, + "logps/chosen": -213.24368286132812, + "logps/rejected": -257.6997985839844, + "loss": 0.5952, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5863196849822998, + "rewards/margins": 0.4677578806877136, + "rewards/rejected": -2.054077625274658, + "step": 6830 + }, + { + "epoch": 1.1784975878704342, + "grad_norm": 30.25794792175293, + "learning_rate": 1.5216008343337987e-07, + "logits/chosen": -2.2314229011535645, + "logits/rejected": -2.197105884552002, + "logps/chosen": -213.30313110351562, + "logps/rejected": -253.2093963623047, + "loss": 0.6025, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5536158084869385, + "rewards/margins": 0.43098410964012146, + "rewards/rejected": -1.9845998287200928, + "step": 6840 + }, + { + "epoch": 1.1802205375603032, + "grad_norm": 31.903627395629883, + "learning_rate": 1.5198892739313216e-07, + "logits/chosen": -2.160515546798706, + "logits/rejected": -2.1238436698913574, + "logps/chosen": -196.7727508544922, + "logps/rejected": -231.3065948486328, + "loss": 0.6101, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4143223762512207, + "rewards/margins": 0.37205883860588074, + "rewards/rejected": -1.7863811254501343, + "step": 6850 + }, + { + "epoch": 1.1819434872501722, + "grad_norm": 23.95057487487793, + "learning_rate": 1.518175623884074e-07, + "logits/chosen": -2.245159864425659, + "logits/rejected": -2.203448534011841, + "logps/chosen": -199.4086456298828, + "logps/rejected": -227.75155639648438, + "loss": 0.5858, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3946173191070557, + "rewards/margins": 0.3646080493927002, + "rewards/rejected": -1.7592252492904663, + "step": 6860 + }, + { + "epoch": 1.1836664369400414, + "grad_norm": 28.446277618408203, + "learning_rate": 1.516459891079907e-07, + "logits/chosen": -2.1487252712249756, + "logits/rejected": -2.1328117847442627, + "logps/chosen": -192.9253692626953, + "logps/rejected": -231.7952117919922, + "loss": 0.586, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4090341329574585, + "rewards/margins": 0.39750105142593384, + "rewards/rejected": -1.8065353631973267, + "step": 6870 + }, + { + "epoch": 1.1853893866299103, + "grad_norm": 26.053268432617188, + "learning_rate": 1.5147420824150435e-07, + "logits/chosen": -2.2122786045074463, + "logits/rejected": -2.1730782985687256, + "logps/chosen": -196.47982788085938, + "logps/rejected": -238.5743408203125, + "loss": 0.5652, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.40488862991333, + "rewards/margins": 0.4748099446296692, + "rewards/rejected": -1.879698395729065, + "step": 6880 + }, + { + "epoch": 1.1871123363197795, + "grad_norm": 24.240577697753906, + "learning_rate": 1.5130222047940492e-07, + "logits/chosen": -2.132542848587036, + "logits/rejected": -2.0986483097076416, + "logps/chosen": -203.53436279296875, + "logps/rejected": -254.85830688476562, + "loss": 0.5518, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5031158924102783, + "rewards/margins": 0.525459885597229, + "rewards/rejected": -2.0285756587982178, + "step": 6890 + }, + { + "epoch": 1.1888352860096485, + "grad_norm": 28.958574295043945, + "learning_rate": 1.5113002651298062e-07, + "logits/chosen": -2.17870831489563, + "logits/rejected": -2.1460628509521484, + "logps/chosen": -215.60385131835938, + "logps/rejected": -255.82870483398438, + "loss": 0.5897, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6127774715423584, + "rewards/margins": 0.41913384199142456, + "rewards/rejected": -2.0319113731384277, + "step": 6900 + }, + { + "epoch": 1.1905582356995175, + "grad_norm": 27.42066764831543, + "learning_rate": 1.509576270343485e-07, + "logits/chosen": -2.1975607872009277, + "logits/rejected": -2.165322780609131, + "logps/chosen": -222.44503784179688, + "logps/rejected": -268.72802734375, + "loss": 0.5683, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.646388292312622, + "rewards/margins": 0.4927579462528229, + "rewards/rejected": -2.139146327972412, + "step": 6910 + }, + { + "epoch": 1.1922811853893867, + "grad_norm": 31.671611785888672, + "learning_rate": 1.5078502273645164e-07, + "logits/chosen": -2.240344762802124, + "logits/rejected": -2.2021901607513428, + "logps/chosen": -225.1938018798828, + "logps/rejected": -262.28289794921875, + "loss": 0.6135, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.685520887374878, + "rewards/margins": 0.4017064571380615, + "rewards/rejected": -2.0872275829315186, + "step": 6920 + }, + { + "epoch": 1.1940041350792556, + "grad_norm": 27.858375549316406, + "learning_rate": 1.5061221431305632e-07, + "logits/chosen": -2.1529998779296875, + "logits/rejected": -2.108311414718628, + "logps/chosen": -202.53628540039062, + "logps/rejected": -253.01431274414062, + "loss": 0.5529, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4367836713790894, + "rewards/margins": 0.5604701638221741, + "rewards/rejected": -1.9972540140151978, + "step": 6930 + }, + { + "epoch": 1.1957270847691248, + "grad_norm": 33.72744369506836, + "learning_rate": 1.5043920245874937e-07, + "logits/chosen": -2.111140727996826, + "logits/rejected": -2.0591862201690674, + "logps/chosen": -204.8585205078125, + "logps/rejected": -251.541748046875, + "loss": 0.5423, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4479444026947021, + "rewards/margins": 0.5700526833534241, + "rewards/rejected": -2.0179970264434814, + "step": 6940 + }, + { + "epoch": 1.1974500344589938, + "grad_norm": 29.296106338500977, + "learning_rate": 1.5026598786893522e-07, + "logits/chosen": -2.1314244270324707, + "logits/rejected": -2.098526954650879, + "logps/chosen": -229.43649291992188, + "logps/rejected": -279.2596130371094, + "loss": 0.5683, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7518694400787354, + "rewards/margins": 0.513970136642456, + "rewards/rejected": -2.2658395767211914, + "step": 6950 + }, + { + "epoch": 1.1991729841488628, + "grad_norm": 26.940580368041992, + "learning_rate": 1.5009257123983322e-07, + "logits/chosen": -2.258476734161377, + "logits/rejected": -2.2213873863220215, + "logps/chosen": -235.5916748046875, + "logps/rejected": -263.60174560546875, + "loss": 0.6182, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.795373558998108, + "rewards/margins": 0.3696233332157135, + "rewards/rejected": -2.16499662399292, + "step": 6960 + }, + { + "epoch": 1.200895933838732, + "grad_norm": 26.772729873657227, + "learning_rate": 1.499189532684747e-07, + "logits/chosen": -2.210001230239868, + "logits/rejected": -2.1754508018493652, + "logps/chosen": -209.4125518798828, + "logps/rejected": -254.1207733154297, + "loss": 0.5482, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5401835441589355, + "rewards/margins": 0.5008086562156677, + "rewards/rejected": -2.040992259979248, + "step": 6970 + }, + { + "epoch": 1.202618883528601, + "grad_norm": 28.58230209350586, + "learning_rate": 1.4974513465270049e-07, + "logits/chosen": -2.1468191146850586, + "logits/rejected": -2.111788272857666, + "logps/chosen": -212.5028839111328, + "logps/rejected": -256.9225769042969, + "loss": 0.5604, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5714538097381592, + "rewards/margins": 0.47803419828414917, + "rewards/rejected": -2.049488067626953, + "step": 6980 + }, + { + "epoch": 1.20434183321847, + "grad_norm": 25.351545333862305, + "learning_rate": 1.4957111609115761e-07, + "logits/chosen": -2.1056313514709473, + "logits/rejected": -2.079068422317505, + "logps/chosen": -216.9329071044922, + "logps/rejected": -250.76547241210938, + "loss": 0.6179, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5887540578842163, + "rewards/margins": 0.3808908462524414, + "rewards/rejected": -1.9696447849273682, + "step": 6990 + }, + { + "epoch": 1.206064782908339, + "grad_norm": 23.871530532836914, + "learning_rate": 1.4939689828329694e-07, + "logits/chosen": -2.3162684440612793, + "logits/rejected": -2.2757205963134766, + "logps/chosen": -211.6276397705078, + "logps/rejected": -262.7303466796875, + "loss": 0.5402, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5442078113555908, + "rewards/margins": 0.5424421429634094, + "rewards/rejected": -2.0866498947143555, + "step": 7000 + }, + { + "epoch": 1.207787732598208, + "grad_norm": 19.846067428588867, + "learning_rate": 1.492224819293701e-07, + "logits/chosen": -2.2165017127990723, + "logits/rejected": -2.1862950325012207, + "logps/chosen": -192.9996337890625, + "logps/rejected": -227.0289764404297, + "loss": 0.5975, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3586089611053467, + "rewards/margins": 0.39048129320144653, + "rewards/rejected": -1.7490901947021484, + "step": 7010 + }, + { + "epoch": 1.2095106822880772, + "grad_norm": 30.123388290405273, + "learning_rate": 1.490478677304268e-07, + "logits/chosen": -2.1815481185913086, + "logits/rejected": -2.1474032402038574, + "logps/chosen": -179.94114685058594, + "logps/rejected": -219.311279296875, + "loss": 0.5872, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.279196858406067, + "rewards/margins": 0.3986092209815979, + "rewards/rejected": -1.6778061389923096, + "step": 7020 + }, + { + "epoch": 1.2112336319779462, + "grad_norm": 49.9505729675293, + "learning_rate": 1.4887305638831207e-07, + "logits/chosen": -2.226543426513672, + "logits/rejected": -2.192350149154663, + "logps/chosen": -198.51046752929688, + "logps/rejected": -240.5613555908203, + "loss": 0.588, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4425830841064453, + "rewards/margins": 0.4182146191596985, + "rewards/rejected": -1.860797643661499, + "step": 7030 + }, + { + "epoch": 1.2129565816678154, + "grad_norm": 74.18596649169922, + "learning_rate": 1.486980486056631e-07, + "logits/chosen": -2.1702017784118652, + "logits/rejected": -2.1381258964538574, + "logps/chosen": -212.8441619873047, + "logps/rejected": -259.13226318359375, + "loss": 0.5804, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5529807806015015, + "rewards/margins": 0.4791305959224701, + "rewards/rejected": -2.032111406326294, + "step": 7040 + }, + { + "epoch": 1.2146795313576844, + "grad_norm": 27.31275177001953, + "learning_rate": 1.4852284508590686e-07, + "logits/chosen": -2.1526901721954346, + "logits/rejected": -2.128535032272339, + "logps/chosen": -198.78216552734375, + "logps/rejected": -236.4357147216797, + "loss": 0.62, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4617741107940674, + "rewards/margins": 0.37819933891296387, + "rewards/rejected": -1.8399736881256104, + "step": 7050 + }, + { + "epoch": 1.2164024810475533, + "grad_norm": 28.094131469726562, + "learning_rate": 1.483474465332569e-07, + "logits/chosen": -2.2378499507904053, + "logits/rejected": -2.2252941131591797, + "logps/chosen": -185.1388397216797, + "logps/rejected": -211.9171600341797, + "loss": 0.6271, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3142409324645996, + "rewards/margins": 0.2868518829345703, + "rewards/rejected": -1.6010926961898804, + "step": 7060 + }, + { + "epoch": 1.2181254307374225, + "grad_norm": 29.921306610107422, + "learning_rate": 1.4817185365271092e-07, + "logits/chosen": -2.2271568775177, + "logits/rejected": -2.1991305351257324, + "logps/chosen": -167.1840057373047, + "logps/rejected": -190.1419677734375, + "loss": 0.6348, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.118652582168579, + "rewards/margins": 0.25607386231422424, + "rewards/rejected": -1.3747262954711914, + "step": 7070 + }, + { + "epoch": 1.2198483804272915, + "grad_norm": 31.323341369628906, + "learning_rate": 1.4799606715004744e-07, + "logits/chosen": -2.3093223571777344, + "logits/rejected": -2.2754738330841064, + "logps/chosen": -154.3928985595703, + "logps/rejected": -181.71890258789062, + "loss": 0.6099, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.999815821647644, + "rewards/margins": 0.28015637397766113, + "rewards/rejected": -1.2799723148345947, + "step": 7080 + }, + { + "epoch": 1.2215713301171607, + "grad_norm": 38.321563720703125, + "learning_rate": 1.4782008773182342e-07, + "logits/chosen": -2.3063206672668457, + "logits/rejected": -2.2809898853302, + "logps/chosen": -166.0218048095703, + "logps/rejected": -211.0521697998047, + "loss": 0.546, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1230088472366333, + "rewards/margins": 0.48068147897720337, + "rewards/rejected": -1.6036901473999023, + "step": 7090 + }, + { + "epoch": 1.2232942798070296, + "grad_norm": 29.58236312866211, + "learning_rate": 1.476439161053711e-07, + "logits/chosen": -2.159698009490967, + "logits/rejected": -2.1280694007873535, + "logps/chosen": -210.9506378173828, + "logps/rejected": -251.65866088867188, + "loss": 0.6058, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5356671810150146, + "rewards/margins": 0.42311763763427734, + "rewards/rejected": -1.958784818649292, + "step": 7100 + }, + { + "epoch": 1.2250172294968986, + "grad_norm": 21.98423957824707, + "learning_rate": 1.4746755297879535e-07, + "logits/chosen": -2.1789379119873047, + "logits/rejected": -2.1484053134918213, + "logps/chosen": -204.1139678955078, + "logps/rejected": -236.242919921875, + "loss": 0.6129, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4982563257217407, + "rewards/margins": 0.36157017946243286, + "rewards/rejected": -1.8598264455795288, + "step": 7110 + }, + { + "epoch": 1.2267401791867678, + "grad_norm": 38.57432174682617, + "learning_rate": 1.4729099906097074e-07, + "logits/chosen": -2.189185857772827, + "logits/rejected": -2.14705753326416, + "logps/chosen": -204.56578063964844, + "logps/rejected": -229.01296997070312, + "loss": 0.61, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4135963916778564, + "rewards/margins": 0.35852089524269104, + "rewards/rejected": -1.772117257118225, + "step": 7120 + }, + { + "epoch": 1.2284631288766368, + "grad_norm": 40.41454315185547, + "learning_rate": 1.4711425506153872e-07, + "logits/chosen": -2.1416923999786377, + "logits/rejected": -2.108492612838745, + "logps/chosen": -184.8870391845703, + "logps/rejected": -223.3743438720703, + "loss": 0.5817, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2780604362487793, + "rewards/margins": 0.42994004487991333, + "rewards/rejected": -1.7080005407333374, + "step": 7130 + }, + { + "epoch": 1.230186078566506, + "grad_norm": 29.750425338745117, + "learning_rate": 1.4693732169090472e-07, + "logits/chosen": -2.2249319553375244, + "logits/rejected": -2.2040207386016846, + "logps/chosen": -176.0301513671875, + "logps/rejected": -214.18447875976562, + "loss": 0.5941, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2371530532836914, + "rewards/margins": 0.3913215100765228, + "rewards/rejected": -1.6284745931625366, + "step": 7140 + }, + { + "epoch": 1.231909028256375, + "grad_norm": 37.294952392578125, + "learning_rate": 1.4676019966023537e-07, + "logits/chosen": -2.2079851627349854, + "logits/rejected": -2.1838433742523193, + "logps/chosen": -224.13235473632812, + "logps/rejected": -258.48931884765625, + "loss": 0.6243, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6731014251708984, + "rewards/margins": 0.3684059679508209, + "rewards/rejected": -2.0415074825286865, + "step": 7150 + }, + { + "epoch": 1.233631977946244, + "grad_norm": 42.03847885131836, + "learning_rate": 1.4658288968145556e-07, + "logits/chosen": -2.19736909866333, + "logits/rejected": -2.154069423675537, + "logps/chosen": -187.75115966796875, + "logps/rejected": -230.8140869140625, + "loss": 0.5753, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.335819959640503, + "rewards/margins": 0.44506731629371643, + "rewards/rejected": -1.7808876037597656, + "step": 7160 + }, + { + "epoch": 1.235354927636113, + "grad_norm": 45.9422721862793, + "learning_rate": 1.4640539246724565e-07, + "logits/chosen": -2.196071147918701, + "logits/rejected": -2.1541483402252197, + "logps/chosen": -178.31436157226562, + "logps/rejected": -224.1558380126953, + "loss": 0.5716, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2226836681365967, + "rewards/margins": 0.4886323809623718, + "rewards/rejected": -1.7113158702850342, + "step": 7170 + }, + { + "epoch": 1.237077877325982, + "grad_norm": 28.878841400146484, + "learning_rate": 1.4622770873103857e-07, + "logits/chosen": -2.2925209999084473, + "logits/rejected": -2.2665061950683594, + "logps/chosen": -177.89590454101562, + "logps/rejected": -213.99520874023438, + "loss": 0.5741, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2149291038513184, + "rewards/margins": 0.3878980576992035, + "rewards/rejected": -1.6028270721435547, + "step": 7180 + }, + { + "epoch": 1.2388008270158513, + "grad_norm": 29.684158325195312, + "learning_rate": 1.4604983918701692e-07, + "logits/chosen": -2.1275532245635986, + "logits/rejected": -2.08992338180542, + "logps/chosen": -190.8628692626953, + "logps/rejected": -236.9882049560547, + "loss": 0.5687, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3781225681304932, + "rewards/margins": 0.46524912118911743, + "rewards/rejected": -1.8433716297149658, + "step": 7190 + }, + { + "epoch": 1.2405237767057202, + "grad_norm": 30.833763122558594, + "learning_rate": 1.4587178455011021e-07, + "logits/chosen": -2.1466710567474365, + "logits/rejected": -2.109937906265259, + "logps/chosen": -215.5335693359375, + "logps/rejected": -269.4213562011719, + "loss": 0.5602, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.616548776626587, + "rewards/margins": 0.5279964208602905, + "rewards/rejected": -2.144545316696167, + "step": 7200 + }, + { + "epoch": 1.2405237767057202, + "eval_logits/chosen": -2.2377779483795166, + "eval_logits/rejected": -2.2210443019866943, + "eval_logps/chosen": -195.74278259277344, + "eval_logps/rejected": -222.339111328125, + "eval_loss": 0.6448310017585754, + "eval_rewards/accuracies": 0.6233736276626587, + "eval_rewards/chosen": -1.367273211479187, + "eval_rewards/margins": 0.22862191498279572, + "eval_rewards/rejected": -1.5958951711654663, + "eval_runtime": 383.222, + "eval_samples_per_second": 11.231, + "eval_steps_per_second": 1.404, + "step": 7200 + }, + { + "epoch": 1.2422467263955892, + "grad_norm": 39.25101852416992, + "learning_rate": 1.4569354553599186e-07, + "logits/chosen": -2.211578845977783, + "logits/rejected": -2.1857690811157227, + "logps/chosen": -227.9265899658203, + "logps/rejected": -249.36239624023438, + "loss": 0.6531, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6993329524993896, + "rewards/margins": 0.2680916488170624, + "rewards/rejected": -1.967424750328064, + "step": 7210 + }, + { + "epoch": 1.2439696760854584, + "grad_norm": 29.855588912963867, + "learning_rate": 1.4551512286107642e-07, + "logits/chosen": -2.142920970916748, + "logits/rejected": -2.0990686416625977, + "logps/chosen": -193.0585479736328, + "logps/rejected": -237.34970092773438, + "loss": 0.5624, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3795111179351807, + "rewards/margins": 0.4934348464012146, + "rewards/rejected": -1.872945785522461, + "step": 7220 + }, + { + "epoch": 1.2456926257753274, + "grad_norm": 28.475894927978516, + "learning_rate": 1.4533651724251654e-07, + "logits/chosen": -2.1553685665130615, + "logits/rejected": -2.127537250518799, + "logps/chosen": -188.8539581298828, + "logps/rejected": -226.0784912109375, + "loss": 0.5846, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.336523413658142, + "rewards/margins": 0.407946914434433, + "rewards/rejected": -1.7444703578948975, + "step": 7230 + }, + { + "epoch": 1.2474155754651963, + "grad_norm": 25.94778060913086, + "learning_rate": 1.4515772939820036e-07, + "logits/chosen": -2.198272228240967, + "logits/rejected": -2.176140308380127, + "logps/chosen": -200.8077392578125, + "logps/rejected": -236.87417602539062, + "loss": 0.5909, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4269249439239502, + "rewards/margins": 0.39614081382751465, + "rewards/rejected": -1.8230657577514648, + "step": 7240 + }, + { + "epoch": 1.2491385251550655, + "grad_norm": 29.854503631591797, + "learning_rate": 1.4497876004674824e-07, + "logits/chosen": -2.2051305770874023, + "logits/rejected": -2.167301893234253, + "logps/chosen": -193.99215698242188, + "logps/rejected": -232.569091796875, + "loss": 0.565, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.38264799118042, + "rewards/margins": 0.4309522211551666, + "rewards/rejected": -1.813599944114685, + "step": 7250 + }, + { + "epoch": 1.2508614748449345, + "grad_norm": 33.17638397216797, + "learning_rate": 1.4479960990751037e-07, + "logits/chosen": -2.19197154045105, + "logits/rejected": -2.1598422527313232, + "logps/chosen": -205.8832550048828, + "logps/rejected": -246.341552734375, + "loss": 0.5765, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.507854700088501, + "rewards/margins": 0.44655442237854004, + "rewards/rejected": -1.9544092416763306, + "step": 7260 + }, + { + "epoch": 1.2525844245348035, + "grad_norm": 33.7602424621582, + "learning_rate": 1.4462027970056336e-07, + "logits/chosen": -2.221886396408081, + "logits/rejected": -2.180298328399658, + "logps/chosen": -192.14157104492188, + "logps/rejected": -230.5829315185547, + "loss": 0.5885, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3673036098480225, + "rewards/margins": 0.4185425341129303, + "rewards/rejected": -1.7858461141586304, + "step": 7270 + }, + { + "epoch": 1.2543073742246726, + "grad_norm": 28.689376831054688, + "learning_rate": 1.4444077014670767e-07, + "logits/chosen": -2.2600386142730713, + "logits/rejected": -2.216193437576294, + "logps/chosen": -200.1356964111328, + "logps/rejected": -241.4607696533203, + "loss": 0.5815, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4219920635223389, + "rewards/margins": 0.4745899736881256, + "rewards/rejected": -1.896582007408142, + "step": 7280 + }, + { + "epoch": 1.2560303239145416, + "grad_norm": 27.535062789916992, + "learning_rate": 1.4426108196746465e-07, + "logits/chosen": -2.136287212371826, + "logits/rejected": -2.109666347503662, + "logps/chosen": -191.55276489257812, + "logps/rejected": -227.65219116210938, + "loss": 0.5933, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3572779893875122, + "rewards/margins": 0.367828905582428, + "rewards/rejected": -1.725106954574585, + "step": 7290 + }, + { + "epoch": 1.2577532736044108, + "grad_norm": 25.037973403930664, + "learning_rate": 1.4408121588507358e-07, + "logits/chosen": -2.101698398590088, + "logits/rejected": -2.072650194168091, + "logps/chosen": -185.10435485839844, + "logps/rejected": -224.55752563476562, + "loss": 0.6116, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3446990251541138, + "rewards/margins": 0.37932828068733215, + "rewards/rejected": -1.724027395248413, + "step": 7300 + }, + { + "epoch": 1.2594762232942798, + "grad_norm": 24.10384750366211, + "learning_rate": 1.4390117262248886e-07, + "logits/chosen": -2.235344409942627, + "logits/rejected": -2.1998746395111084, + "logps/chosen": -192.3137969970703, + "logps/rejected": -235.06784057617188, + "loss": 0.578, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3667701482772827, + "rewards/margins": 0.47494545578956604, + "rewards/rejected": -1.8417155742645264, + "step": 7310 + }, + { + "epoch": 1.2611991729841487, + "grad_norm": 21.041650772094727, + "learning_rate": 1.4372095290337697e-07, + "logits/chosen": -2.248053789138794, + "logits/rejected": -2.21608829498291, + "logps/chosen": -176.29461669921875, + "logps/rejected": -209.7880096435547, + "loss": 0.6027, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.205918788909912, + "rewards/margins": 0.38784992694854736, + "rewards/rejected": -1.5937687158584595, + "step": 7320 + }, + { + "epoch": 1.262922122674018, + "grad_norm": 43.863494873046875, + "learning_rate": 1.4354055745211372e-07, + "logits/chosen": -2.1404201984405518, + "logits/rejected": -2.103395938873291, + "logps/chosen": -191.5880584716797, + "logps/rejected": -242.8134002685547, + "loss": 0.5418, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3741697072982788, + "rewards/margins": 0.5526102185249329, + "rewards/rejected": -1.9267799854278564, + "step": 7330 + }, + { + "epoch": 1.264645072363887, + "grad_norm": 22.694883346557617, + "learning_rate": 1.4335998699378123e-07, + "logits/chosen": -2.2119076251983643, + "logits/rejected": -2.1771349906921387, + "logps/chosen": -206.36874389648438, + "logps/rejected": -241.12820434570312, + "loss": 0.6079, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.503580093383789, + "rewards/margins": 0.3770178556442261, + "rewards/rejected": -1.8805980682373047, + "step": 7340 + }, + { + "epoch": 1.266368022053756, + "grad_norm": 33.1309700012207, + "learning_rate": 1.4317924225416493e-07, + "logits/chosen": -2.2810635566711426, + "logits/rejected": -2.243312120437622, + "logps/chosen": -177.04080200195312, + "logps/rejected": -215.0124969482422, + "loss": 0.5747, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1863033771514893, + "rewards/margins": 0.45680537819862366, + "rewards/rejected": -1.6431087255477905, + "step": 7350 + }, + { + "epoch": 1.268090971743625, + "grad_norm": 36.34376525878906, + "learning_rate": 1.42998323959751e-07, + "logits/chosen": -2.171330213546753, + "logits/rejected": -2.145925998687744, + "logps/chosen": -194.1071319580078, + "logps/rejected": -231.82693481445312, + "loss": 0.5934, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.385913610458374, + "rewards/margins": 0.4217739701271057, + "rewards/rejected": -1.807687759399414, + "step": 7360 + }, + { + "epoch": 1.269813921433494, + "grad_norm": 44.59245681762695, + "learning_rate": 1.4281723283772297e-07, + "logits/chosen": -2.1258740425109863, + "logits/rejected": -2.0956592559814453, + "logps/chosen": -197.22552490234375, + "logps/rejected": -241.16964721679688, + "loss": 0.5757, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4306964874267578, + "rewards/margins": 0.47660359740257263, + "rewards/rejected": -1.9073002338409424, + "step": 7370 + }, + { + "epoch": 1.2715368711233632, + "grad_norm": 31.368579864501953, + "learning_rate": 1.4263596961595913e-07, + "logits/chosen": -2.1727757453918457, + "logits/rejected": -2.1414594650268555, + "logps/chosen": -196.63665771484375, + "logps/rejected": -243.6747589111328, + "loss": 0.5809, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4313442707061768, + "rewards/margins": 0.47405076026916504, + "rewards/rejected": -1.9053949117660522, + "step": 7380 + }, + { + "epoch": 1.2732598208132322, + "grad_norm": 47.17496871948242, + "learning_rate": 1.424545350230296e-07, + "logits/chosen": -2.1499431133270264, + "logits/rejected": -2.1169068813323975, + "logps/chosen": -197.3967742919922, + "logps/rejected": -244.5073699951172, + "loss": 0.5753, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4444047212600708, + "rewards/margins": 0.4816197454929352, + "rewards/rejected": -1.9260244369506836, + "step": 7390 + }, + { + "epoch": 1.2749827705031014, + "grad_norm": 23.355920791625977, + "learning_rate": 1.422729297881931e-07, + "logits/chosen": -2.127746343612671, + "logits/rejected": -2.0778279304504395, + "logps/chosen": -226.56710815429688, + "logps/rejected": -276.52569580078125, + "loss": 0.5395, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7020788192749023, + "rewards/margins": 0.5654906034469604, + "rewards/rejected": -2.267569065093994, + "step": 7400 + }, + { + "epoch": 1.2767057201929704, + "grad_norm": 45.91021728515625, + "learning_rate": 1.4209115464139445e-07, + "logits/chosen": -2.117419958114624, + "logits/rejected": -2.0680668354034424, + "logps/chosen": -225.8282470703125, + "logps/rejected": -279.32623291015625, + "loss": 0.5735, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7132211923599243, + "rewards/margins": 0.5609859824180603, + "rewards/rejected": -2.274207353591919, + "step": 7410 + }, + { + "epoch": 1.2784286698828393, + "grad_norm": 30.91571044921875, + "learning_rate": 1.419092103132612e-07, + "logits/chosen": -2.076505661010742, + "logits/rejected": -2.050628423690796, + "logps/chosen": -228.55966186523438, + "logps/rejected": -265.3526916503906, + "loss": 0.5955, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.7106529474258423, + "rewards/margins": 0.41547513008117676, + "rewards/rejected": -2.1261279582977295, + "step": 7420 + }, + { + "epoch": 1.2801516195727085, + "grad_norm": 39.247623443603516, + "learning_rate": 1.4172709753510117e-07, + "logits/chosen": -2.0895779132843018, + "logits/rejected": -2.051273822784424, + "logps/chosen": -219.62612915039062, + "logps/rejected": -268.3280334472656, + "loss": 0.5791, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6621767282485962, + "rewards/margins": 0.5093218088150024, + "rewards/rejected": -2.1714985370635986, + "step": 7430 + }, + { + "epoch": 1.2818745692625775, + "grad_norm": 26.95642852783203, + "learning_rate": 1.41544817038899e-07, + "logits/chosen": -2.2014544010162354, + "logits/rejected": -2.1671154499053955, + "logps/chosen": -202.35275268554688, + "logps/rejected": -237.54611206054688, + "loss": 0.61, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4587031602859497, + "rewards/margins": 0.3934493660926819, + "rewards/rejected": -1.8521524667739868, + "step": 7440 + }, + { + "epoch": 1.2835975189524467, + "grad_norm": 28.73160171508789, + "learning_rate": 1.4136236955731354e-07, + "logits/chosen": -2.317675828933716, + "logits/rejected": -2.2803618907928467, + "logps/chosen": -173.7164764404297, + "logps/rejected": -201.6647186279297, + "loss": 0.6046, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1620408296585083, + "rewards/margins": 0.33942776918411255, + "rewards/rejected": -1.5014686584472656, + "step": 7450 + }, + { + "epoch": 1.2853204686423156, + "grad_norm": 26.703969955444336, + "learning_rate": 1.4117975582367488e-07, + "logits/chosen": -2.168306350708008, + "logits/rejected": -2.148745536804199, + "logps/chosen": -176.16409301757812, + "logps/rejected": -217.79684448242188, + "loss": 0.5928, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.25123131275177, + "rewards/margins": 0.4050864577293396, + "rewards/rejected": -1.6563177108764648, + "step": 7460 + }, + { + "epoch": 1.2870434183321846, + "grad_norm": 33.33496856689453, + "learning_rate": 1.4099697657198128e-07, + "logits/chosen": -2.2102653980255127, + "logits/rejected": -2.189234733581543, + "logps/chosen": -196.8131103515625, + "logps/rejected": -221.9905548095703, + "loss": 0.65, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4170329570770264, + "rewards/margins": 0.29639095067977905, + "rewards/rejected": -1.7134240865707397, + "step": 7470 + }, + { + "epoch": 1.2887663680220538, + "grad_norm": 32.69403839111328, + "learning_rate": 1.4081403253689638e-07, + "logits/chosen": -2.1745803356170654, + "logits/rejected": -2.140530586242676, + "logps/chosen": -177.96560668945312, + "logps/rejected": -204.87222290039062, + "loss": 0.6119, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2249789237976074, + "rewards/margins": 0.3372560739517212, + "rewards/rejected": -1.5622351169586182, + "step": 7480 + }, + { + "epoch": 1.2904893177119228, + "grad_norm": 27.952274322509766, + "learning_rate": 1.4063092445374591e-07, + "logits/chosen": -2.1473286151885986, + "logits/rejected": -2.1264710426330566, + "logps/chosen": -187.06661987304688, + "logps/rejected": -220.3990936279297, + "loss": 0.6118, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3386499881744385, + "rewards/margins": 0.33736392855644226, + "rewards/rejected": -1.6760139465332031, + "step": 7490 + }, + { + "epoch": 1.292212267401792, + "grad_norm": 27.661653518676758, + "learning_rate": 1.404476530585153e-07, + "logits/chosen": -2.1809844970703125, + "logits/rejected": -2.1510908603668213, + "logps/chosen": -175.53463745117188, + "logps/rejected": -207.1257781982422, + "loss": 0.6244, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2388243675231934, + "rewards/margins": 0.32033151388168335, + "rewards/rejected": -1.559156060218811, + "step": 7500 + }, + { + "epoch": 1.293935217091661, + "grad_norm": 26.77812957763672, + "learning_rate": 1.402642190878462e-07, + "logits/chosen": -2.1876208782196045, + "logits/rejected": -2.161717414855957, + "logps/chosen": -178.3339080810547, + "logps/rejected": -207.5714111328125, + "loss": 0.5876, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1986923217773438, + "rewards/margins": 0.36776408553123474, + "rewards/rejected": -1.5664561986923218, + "step": 7510 + }, + { + "epoch": 1.29565816678153, + "grad_norm": 26.58428192138672, + "learning_rate": 1.4008062327903373e-07, + "logits/chosen": -2.1912448406219482, + "logits/rejected": -2.166442394256592, + "logps/chosen": -173.25840759277344, + "logps/rejected": -209.9363555908203, + "loss": 0.5816, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1874148845672607, + "rewards/margins": 0.38662785291671753, + "rewards/rejected": -1.574042797088623, + "step": 7520 + }, + { + "epoch": 1.297381116471399, + "grad_norm": 24.964679718017578, + "learning_rate": 1.398968663700235e-07, + "logits/chosen": -2.139922618865967, + "logits/rejected": -2.1178131103515625, + "logps/chosen": -172.1335906982422, + "logps/rejected": -214.1690216064453, + "loss": 0.5787, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.194275140762329, + "rewards/margins": 0.4243914484977722, + "rewards/rejected": -1.618666410446167, + "step": 7530 + }, + { + "epoch": 1.299104066161268, + "grad_norm": 24.18116569519043, + "learning_rate": 1.3971294909940872e-07, + "logits/chosen": -2.2325923442840576, + "logits/rejected": -2.204035758972168, + "logps/chosen": -176.44895935058594, + "logps/rejected": -231.82229614257812, + "loss": 0.5317, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2390239238739014, + "rewards/margins": 0.5569388270378113, + "rewards/rejected": -1.7959626913070679, + "step": 7540 + }, + { + "epoch": 1.3008270158511372, + "grad_norm": 33.15352249145508, + "learning_rate": 1.395288722064271e-07, + "logits/chosen": -2.1470162868499756, + "logits/rejected": -2.121925115585327, + "logps/chosen": -198.5567169189453, + "logps/rejected": -244.9143829345703, + "loss": 0.585, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4541960954666138, + "rewards/margins": 0.4865036606788635, + "rewards/rejected": -1.940699577331543, + "step": 7550 + }, + { + "epoch": 1.3025499655410062, + "grad_norm": 27.650455474853516, + "learning_rate": 1.39344636430958e-07, + "logits/chosen": -2.2052130699157715, + "logits/rejected": -2.1664042472839355, + "logps/chosen": -195.47647094726562, + "logps/rejected": -255.9792938232422, + "loss": 0.5353, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4195246696472168, + "rewards/margins": 0.5902754068374634, + "rewards/rejected": -2.0097999572753906, + "step": 7560 + }, + { + "epoch": 1.3042729152308752, + "grad_norm": 37.809913635253906, + "learning_rate": 1.3916024251351922e-07, + "logits/chosen": -2.1798274517059326, + "logits/rejected": -2.1409709453582764, + "logps/chosen": -233.6822052001953, + "logps/rejected": -287.028564453125, + "loss": 0.5546, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.7669155597686768, + "rewards/margins": 0.5862165093421936, + "rewards/rejected": -2.3531317710876465, + "step": 7570 + }, + { + "epoch": 1.3059958649207444, + "grad_norm": 38.519527435302734, + "learning_rate": 1.3897569119526442e-07, + "logits/chosen": -2.1253037452697754, + "logits/rejected": -2.095015287399292, + "logps/chosen": -234.3262939453125, + "logps/rejected": -282.79449462890625, + "loss": 0.5699, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7710899114608765, + "rewards/margins": 0.5180790424346924, + "rewards/rejected": -2.2891690731048584, + "step": 7580 + }, + { + "epoch": 1.3077188146106133, + "grad_norm": 24.080333709716797, + "learning_rate": 1.387909832179798e-07, + "logits/chosen": -2.1358370780944824, + "logits/rejected": -2.0884411334991455, + "logps/chosen": -253.19937133789062, + "logps/rejected": -307.0641784667969, + "loss": 0.5695, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9425344467163086, + "rewards/margins": 0.632226824760437, + "rewards/rejected": -2.574761390686035, + "step": 7590 + }, + { + "epoch": 1.3094417643004825, + "grad_norm": 45.20371627807617, + "learning_rate": 1.3860611932408118e-07, + "logits/chosen": -2.122115135192871, + "logits/rejected": -2.0939908027648926, + "logps/chosen": -235.76028442382812, + "logps/rejected": -265.4013366699219, + "loss": 0.6357, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8253326416015625, + "rewards/margins": 0.3514913022518158, + "rewards/rejected": -2.176823854446411, + "step": 7600 + }, + { + "epoch": 1.3094417643004825, + "eval_logits/chosen": -2.2208468914031982, + "eval_logits/rejected": -2.203432083129883, + "eval_logps/chosen": -198.77024841308594, + "eval_logps/rejected": -226.18759155273438, + "eval_loss": 0.6412925720214844, + "eval_rewards/accuracies": 0.6124535202980042, + "eval_rewards/chosen": -1.3975476026535034, + "eval_rewards/margins": 0.23683220148086548, + "eval_rewards/rejected": -1.6343798637390137, + "eval_runtime": 383.4213, + "eval_samples_per_second": 11.225, + "eval_steps_per_second": 1.403, + "step": 7600 + }, + { + "epoch": 1.3111647139903515, + "grad_norm": 22.482765197753906, + "learning_rate": 1.3842110025661126e-07, + "logits/chosen": -2.1169135570526123, + "logits/rejected": -2.0805201530456543, + "logps/chosen": -200.65127563476562, + "logps/rejected": -246.5351104736328, + "loss": 0.5702, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4636411666870117, + "rewards/margins": 0.5099278688430786, + "rewards/rejected": -1.9735692739486694, + "step": 7610 + }, + { + "epoch": 1.3128876636802205, + "grad_norm": 31.662145614624023, + "learning_rate": 1.3823592675923625e-07, + "logits/chosen": -2.166860342025757, + "logits/rejected": -2.135286569595337, + "logps/chosen": -199.06710815429688, + "logps/rejected": -237.6704864501953, + "loss": 0.5748, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4023758172988892, + "rewards/margins": 0.43845334649086, + "rewards/rejected": -1.8408292531967163, + "step": 7620 + }, + { + "epoch": 1.3146106133700897, + "grad_norm": 30.58859634399414, + "learning_rate": 1.3805059957624318e-07, + "logits/chosen": -2.1249918937683105, + "logits/rejected": -2.1092543601989746, + "logps/chosen": -196.4220733642578, + "logps/rejected": -248.9763946533203, + "loss": 0.5485, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4566963911056519, + "rewards/margins": 0.49759864807128906, + "rewards/rejected": -1.9542949199676514, + "step": 7630 + }, + { + "epoch": 1.3163335630599586, + "grad_norm": 38.346683502197266, + "learning_rate": 1.3786511945253675e-07, + "logits/chosen": -2.1035542488098145, + "logits/rejected": -2.066838502883911, + "logps/chosen": -229.5228271484375, + "logps/rejected": -273.35430908203125, + "loss": 0.5913, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7108421325683594, + "rewards/margins": 0.5063773989677429, + "rewards/rejected": -2.217219591140747, + "step": 7640 + }, + { + "epoch": 1.3180565127498278, + "grad_norm": 25.326038360595703, + "learning_rate": 1.3767948713363646e-07, + "logits/chosen": -2.168118476867676, + "logits/rejected": -2.138068199157715, + "logps/chosen": -226.3521270751953, + "logps/rejected": -265.44915771484375, + "loss": 0.6025, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6705223321914673, + "rewards/margins": 0.4374934136867523, + "rewards/rejected": -2.108015775680542, + "step": 7650 + }, + { + "epoch": 1.3197794624396968, + "grad_norm": 21.727615356445312, + "learning_rate": 1.374937033656735e-07, + "logits/chosen": -2.2056868076324463, + "logits/rejected": -2.164641857147217, + "logps/chosen": -193.8480682373047, + "logps/rejected": -246.1538848876953, + "loss": 0.5336, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3655202388763428, + "rewards/margins": 0.5554262399673462, + "rewards/rejected": -1.920946717262268, + "step": 7660 + }, + { + "epoch": 1.3215024121295658, + "grad_norm": 31.452966690063477, + "learning_rate": 1.3730776889538776e-07, + "logits/chosen": -2.1546008586883545, + "logits/rejected": -2.1246042251586914, + "logps/chosen": -194.42176818847656, + "logps/rejected": -227.3402557373047, + "loss": 0.6119, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3616440296173096, + "rewards/margins": 0.3867500424385071, + "rewards/rejected": -1.7483940124511719, + "step": 7670 + }, + { + "epoch": 1.323225361819435, + "grad_norm": 27.648405075073242, + "learning_rate": 1.3712168447012493e-07, + "logits/chosen": -2.192735195159912, + "logits/rejected": -2.1616322994232178, + "logps/chosen": -187.9457244873047, + "logps/rejected": -228.00625610351562, + "loss": 0.5635, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3604012727737427, + "rewards/margins": 0.43012505769729614, + "rewards/rejected": -1.7905261516571045, + "step": 7680 + }, + { + "epoch": 1.324948311509304, + "grad_norm": 24.951980590820312, + "learning_rate": 1.369354508378334e-07, + "logits/chosen": -2.2712864875793457, + "logits/rejected": -2.2198472023010254, + "logps/chosen": -194.9119873046875, + "logps/rejected": -235.7847137451172, + "loss": 0.5708, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.385386347770691, + "rewards/margins": 0.48633819818496704, + "rewards/rejected": -1.8717244863510132, + "step": 7690 + }, + { + "epoch": 1.3266712611991731, + "grad_norm": 22.935848236083984, + "learning_rate": 1.3674906874706129e-07, + "logits/chosen": -2.1742866039276123, + "logits/rejected": -2.1302480697631836, + "logps/chosen": -194.0531005859375, + "logps/rejected": -235.50125122070312, + "loss": 0.5815, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3940820693969727, + "rewards/margins": 0.42350903153419495, + "rewards/rejected": -1.8175909519195557, + "step": 7700 + }, + { + "epoch": 1.328394210889042, + "grad_norm": 31.301496505737305, + "learning_rate": 1.365625389469534e-07, + "logits/chosen": -2.1869258880615234, + "logits/rejected": -2.160074234008789, + "logps/chosen": -198.0553436279297, + "logps/rejected": -230.83023071289062, + "loss": 0.6123, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4490340948104858, + "rewards/margins": 0.3512323498725891, + "rewards/rejected": -1.8002665042877197, + "step": 7710 + }, + { + "epoch": 1.330117160578911, + "grad_norm": 32.13523864746094, + "learning_rate": 1.363758621872483e-07, + "logits/chosen": -2.193956136703491, + "logits/rejected": -2.156627893447876, + "logps/chosen": -196.5673370361328, + "logps/rejected": -227.8412628173828, + "loss": 0.5943, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3913943767547607, + "rewards/margins": 0.3918333947658539, + "rewards/rejected": -1.7832276821136475, + "step": 7720 + }, + { + "epoch": 1.33184011026878, + "grad_norm": 36.192283630371094, + "learning_rate": 1.361890392182752e-07, + "logits/chosen": -2.131290912628174, + "logits/rejected": -2.0990428924560547, + "logps/chosen": -181.3380584716797, + "logps/rejected": -223.23141479492188, + "loss": 0.5653, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2632769346237183, + "rewards/margins": 0.4550713002681732, + "rewards/rejected": -1.7183481454849243, + "step": 7730 + }, + { + "epoch": 1.3335630599586492, + "grad_norm": 26.759370803833008, + "learning_rate": 1.3600207079095097e-07, + "logits/chosen": -2.1518218517303467, + "logits/rejected": -2.1181464195251465, + "logps/chosen": -205.61929321289062, + "logps/rejected": -261.2309875488281, + "loss": 0.5487, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5240737199783325, + "rewards/margins": 0.5604895353317261, + "rewards/rejected": -2.0845632553100586, + "step": 7740 + }, + { + "epoch": 1.3352860096485184, + "grad_norm": 34.86008071899414, + "learning_rate": 1.3581495765677718e-07, + "logits/chosen": -2.1620683670043945, + "logits/rejected": -2.1198601722717285, + "logps/chosen": -224.58352661132812, + "logps/rejected": -274.17803955078125, + "loss": 0.5766, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.69365656375885, + "rewards/margins": 0.5385280847549438, + "rewards/rejected": -2.232184648513794, + "step": 7750 + }, + { + "epoch": 1.3370089593383874, + "grad_norm": 34.262413024902344, + "learning_rate": 1.3562770056783702e-07, + "logits/chosen": -2.0609984397888184, + "logits/rejected": -2.026923894882202, + "logps/chosen": -197.84727478027344, + "logps/rejected": -253.30307006835938, + "loss": 0.5368, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4634578227996826, + "rewards/margins": 0.5615237951278687, + "rewards/rejected": -2.0249814987182617, + "step": 7760 + }, + { + "epoch": 1.3387319090282563, + "grad_norm": 24.656667709350586, + "learning_rate": 1.3544030027679232e-07, + "logits/chosen": -2.1213603019714355, + "logits/rejected": -2.085960865020752, + "logps/chosen": -206.48831176757812, + "logps/rejected": -255.5338134765625, + "loss": 0.5678, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.532331109046936, + "rewards/margins": 0.5047855377197266, + "rewards/rejected": -2.037116527557373, + "step": 7770 + }, + { + "epoch": 1.3404548587181253, + "grad_norm": 29.939220428466797, + "learning_rate": 1.3525275753688042e-07, + "logits/chosen": -2.193809986114502, + "logits/rejected": -2.168917179107666, + "logps/chosen": -219.8052520751953, + "logps/rejected": -268.90814208984375, + "loss": 0.6032, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6730769872665405, + "rewards/margins": 0.4895497262477875, + "rewards/rejected": -2.1626267433166504, + "step": 7780 + }, + { + "epoch": 1.3421778084079945, + "grad_norm": 45.841880798339844, + "learning_rate": 1.350650731019113e-07, + "logits/chosen": -2.1801536083221436, + "logits/rejected": -2.1468937397003174, + "logps/chosen": -216.2582244873047, + "logps/rejected": -274.93438720703125, + "loss": 0.5399, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.641721487045288, + "rewards/margins": 0.5975975394248962, + "rewards/rejected": -2.239319324493408, + "step": 7790 + }, + { + "epoch": 1.3439007580978635, + "grad_norm": 32.21296691894531, + "learning_rate": 1.3487724772626439e-07, + "logits/chosen": -2.1707370281219482, + "logits/rejected": -2.138662338256836, + "logps/chosen": -228.29141235351562, + "logps/rejected": -283.7637939453125, + "loss": 0.5713, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7291542291641235, + "rewards/margins": 0.5668483972549438, + "rewards/rejected": -2.2960026264190674, + "step": 7800 + }, + { + "epoch": 1.3456237077877327, + "grad_norm": 50.75136184692383, + "learning_rate": 1.346892821648857e-07, + "logits/chosen": -2.175175428390503, + "logits/rejected": -2.130139112472534, + "logps/chosen": -235.88687133789062, + "logps/rejected": -280.6183776855469, + "loss": 0.5859, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7639777660369873, + "rewards/margins": 0.5005490779876709, + "rewards/rejected": -2.264526844024658, + "step": 7810 + }, + { + "epoch": 1.3473466574776016, + "grad_norm": 52.12438201904297, + "learning_rate": 1.3450117717328468e-07, + "logits/chosen": -2.166834592819214, + "logits/rejected": -2.1256909370422363, + "logps/chosen": -219.2531280517578, + "logps/rejected": -271.19378662109375, + "loss": 0.581, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.616594672203064, + "rewards/margins": 0.5541667342185974, + "rewards/rejected": -2.1707613468170166, + "step": 7820 + }, + { + "epoch": 1.3490696071674706, + "grad_norm": 31.646299362182617, + "learning_rate": 1.3431293350753115e-07, + "logits/chosen": -2.1095938682556152, + "logits/rejected": -2.0786352157592773, + "logps/chosen": -205.64035034179688, + "logps/rejected": -261.259521484375, + "loss": 0.5568, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5484790802001953, + "rewards/margins": 0.5396331548690796, + "rewards/rejected": -2.0881123542785645, + "step": 7830 + }, + { + "epoch": 1.3507925568573398, + "grad_norm": 28.87881851196289, + "learning_rate": 1.341245519242524e-07, + "logits/chosen": -2.1058099269866943, + "logits/rejected": -2.0774412155151367, + "logps/chosen": -197.8090362548828, + "logps/rejected": -233.66806030273438, + "loss": 0.6179, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4320785999298096, + "rewards/margins": 0.37303489446640015, + "rewards/rejected": -1.805113434791565, + "step": 7840 + }, + { + "epoch": 1.3525155065472088, + "grad_norm": 27.308841705322266, + "learning_rate": 1.3393603318063e-07, + "logits/chosen": -2.0920326709747314, + "logits/rejected": -2.0379960536956787, + "logps/chosen": -205.4201202392578, + "logps/rejected": -240.3004913330078, + "loss": 0.6017, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4918204545974731, + "rewards/margins": 0.4359316825866699, + "rewards/rejected": -1.927752137184143, + "step": 7850 + }, + { + "epoch": 1.354238456237078, + "grad_norm": 36.3019905090332, + "learning_rate": 1.3374737803439685e-07, + "logits/chosen": -2.107954502105713, + "logits/rejected": -2.058523416519165, + "logps/chosen": -224.4720916748047, + "logps/rejected": -279.4774169921875, + "loss": 0.5657, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.71206533908844, + "rewards/margins": 0.566798210144043, + "rewards/rejected": -2.2788634300231934, + "step": 7860 + }, + { + "epoch": 1.355961405926947, + "grad_norm": 34.743247985839844, + "learning_rate": 1.3355858724383415e-07, + "logits/chosen": -2.0951883792877197, + "logits/rejected": -2.0639822483062744, + "logps/chosen": -243.0995635986328, + "logps/rejected": -293.8856506347656, + "loss": 0.5744, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.863247275352478, + "rewards/margins": 0.5420467257499695, + "rewards/rejected": -2.405294179916382, + "step": 7870 + }, + { + "epoch": 1.3576843556168159, + "grad_norm": 42.764366149902344, + "learning_rate": 1.3336966156776822e-07, + "logits/chosen": -2.1480154991149902, + "logits/rejected": -2.1229541301727295, + "logps/chosen": -233.7257537841797, + "logps/rejected": -269.1768798828125, + "loss": 0.6355, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.7973674535751343, + "rewards/margins": 0.3781353533267975, + "rewards/rejected": -2.1755027770996094, + "step": 7880 + }, + { + "epoch": 1.359407305306685, + "grad_norm": 29.920547485351562, + "learning_rate": 1.3318060176556756e-07, + "logits/chosen": -2.1473171710968018, + "logits/rejected": -2.112473487854004, + "logps/chosen": -203.85462951660156, + "logps/rejected": -251.13766479492188, + "loss": 0.5794, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4900063276290894, + "rewards/margins": 0.47319668531417847, + "rewards/rejected": -1.9632028341293335, + "step": 7890 + }, + { + "epoch": 1.361130254996554, + "grad_norm": 35.13427734375, + "learning_rate": 1.3299140859713983e-07, + "logits/chosen": -2.195388078689575, + "logits/rejected": -2.1696789264678955, + "logps/chosen": -184.6055908203125, + "logps/rejected": -229.09329223632812, + "loss": 0.5793, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3148016929626465, + "rewards/margins": 0.45367079973220825, + "rewards/rejected": -1.768472671508789, + "step": 7900 + }, + { + "epoch": 1.3628532046864232, + "grad_norm": 42.53324890136719, + "learning_rate": 1.3280208282292878e-07, + "logits/chosen": -2.192570686340332, + "logits/rejected": -2.1600852012634277, + "logps/chosen": -189.4005889892578, + "logps/rejected": -221.49951171875, + "loss": 0.6167, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3212350606918335, + "rewards/margins": 0.36447951197624207, + "rewards/rejected": -1.6857147216796875, + "step": 7910 + }, + { + "epoch": 1.3645761543762922, + "grad_norm": 26.738149642944336, + "learning_rate": 1.3261262520391097e-07, + "logits/chosen": -2.1648764610290527, + "logits/rejected": -2.136169195175171, + "logps/chosen": -188.26707458496094, + "logps/rejected": -223.86648559570312, + "loss": 0.5765, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3203630447387695, + "rewards/margins": 0.4093676209449768, + "rewards/rejected": -1.7297306060791016, + "step": 7920 + }, + { + "epoch": 1.3662991040661612, + "grad_norm": 35.8544921875, + "learning_rate": 1.3242303650159313e-07, + "logits/chosen": -2.196377992630005, + "logits/rejected": -2.1608593463897705, + "logps/chosen": -200.53225708007812, + "logps/rejected": -236.4687957763672, + "loss": 0.5847, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.4146637916564941, + "rewards/margins": 0.43156057596206665, + "rewards/rejected": -1.8462244272232056, + "step": 7930 + }, + { + "epoch": 1.3680220537560304, + "grad_norm": 47.39460754394531, + "learning_rate": 1.3223331747800867e-07, + "logits/chosen": -2.1359665393829346, + "logits/rejected": -2.1010541915893555, + "logps/chosen": -199.37338256835938, + "logps/rejected": -249.5021209716797, + "loss": 0.5582, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4021588563919067, + "rewards/margins": 0.5483847856521606, + "rewards/rejected": -1.9505436420440674, + "step": 7940 + }, + { + "epoch": 1.3697450034458993, + "grad_norm": 25.43636703491211, + "learning_rate": 1.3204346889571494e-07, + "logits/chosen": -2.12013840675354, + "logits/rejected": -2.100837230682373, + "logps/chosen": -177.21873474121094, + "logps/rejected": -224.29898071289062, + "loss": 0.5749, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.251285195350647, + "rewards/margins": 0.47028857469558716, + "rewards/rejected": -1.721573829650879, + "step": 7950 + }, + { + "epoch": 1.3714679531357685, + "grad_norm": 24.467222213745117, + "learning_rate": 1.3185349151779e-07, + "logits/chosen": -2.1382768154144287, + "logits/rejected": -2.104619264602661, + "logps/chosen": -182.4629364013672, + "logps/rejected": -221.2128143310547, + "loss": 0.5913, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.31582510471344, + "rewards/margins": 0.38097429275512695, + "rewards/rejected": -1.6967992782592773, + "step": 7960 + }, + { + "epoch": 1.3731909028256375, + "grad_norm": 27.679777145385742, + "learning_rate": 1.3166338610782957e-07, + "logits/chosen": -2.211303234100342, + "logits/rejected": -2.1780335903167725, + "logps/chosen": -182.8100128173828, + "logps/rejected": -221.35983276367188, + "loss": 0.5754, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.26361083984375, + "rewards/margins": 0.43397146463394165, + "rewards/rejected": -1.6975822448730469, + "step": 7970 + }, + { + "epoch": 1.3749138525155065, + "grad_norm": 27.631725311279297, + "learning_rate": 1.31473153429944e-07, + "logits/chosen": -2.2298529148101807, + "logits/rejected": -2.2019245624542236, + "logps/chosen": -190.2526092529297, + "logps/rejected": -237.2704620361328, + "loss": 0.5636, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3224337100982666, + "rewards/margins": 0.49690714478492737, + "rewards/rejected": -1.819340705871582, + "step": 7980 + }, + { + "epoch": 1.3766368022053757, + "grad_norm": 36.38218688964844, + "learning_rate": 1.3128279424875523e-07, + "logits/chosen": -2.2090554237365723, + "logits/rejected": -2.1736884117126465, + "logps/chosen": -195.2666778564453, + "logps/rejected": -249.32376098632812, + "loss": 0.5378, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4387704133987427, + "rewards/margins": 0.5402941703796387, + "rewards/rejected": -1.9790645837783813, + "step": 7990 + }, + { + "epoch": 1.3783597518952446, + "grad_norm": 25.795717239379883, + "learning_rate": 1.3109230932939354e-07, + "logits/chosen": -2.0892562866210938, + "logits/rejected": -2.057300090789795, + "logps/chosen": -210.97891235351562, + "logps/rejected": -268.6546325683594, + "loss": 0.5491, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5498253107070923, + "rewards/margins": 0.5857113599777222, + "rewards/rejected": -2.1355366706848145, + "step": 8000 + }, + { + "epoch": 1.3783597518952446, + "eval_logits/chosen": -2.2084763050079346, + "eval_logits/rejected": -2.190638542175293, + "eval_logps/chosen": -205.56568908691406, + "eval_logps/rejected": -233.9599151611328, + "eval_loss": 0.6438331604003906, + "eval_rewards/accuracies": 0.6054832935333252, + "eval_rewards/chosen": -1.4655022621154785, + "eval_rewards/margins": 0.24660077691078186, + "eval_rewards/rejected": -1.712103009223938, + "eval_runtime": 382.7297, + "eval_samples_per_second": 11.246, + "eval_steps_per_second": 1.406, + "step": 8000 + }, + { + "epoch": 1.3800827015851138, + "grad_norm": 50.01524353027344, + "learning_rate": 1.3090169943749475e-07, + "logits/chosen": -2.1011762619018555, + "logits/rejected": -2.053696632385254, + "logps/chosen": -227.21572875976562, + "logps/rejected": -275.5449523925781, + "loss": 0.576, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6916106939315796, + "rewards/margins": 0.5524572134017944, + "rewards/rejected": -2.244067907333374, + "step": 8010 + }, + { + "epoch": 1.3818056512749828, + "grad_norm": 31.691274642944336, + "learning_rate": 1.307109653391969e-07, + "logits/chosen": -2.124875068664551, + "logits/rejected": -2.08036208152771, + "logps/chosen": -237.47872924804688, + "logps/rejected": -280.66436767578125, + "loss": 0.5605, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7674553394317627, + "rewards/margins": 0.5002973079681396, + "rewards/rejected": -2.2677524089813232, + "step": 8020 + }, + { + "epoch": 1.3835286009648518, + "grad_norm": 43.40495681762695, + "learning_rate": 1.3052010780113726e-07, + "logits/chosen": -2.1229870319366455, + "logits/rejected": -2.0922768115997314, + "logps/chosen": -206.2229461669922, + "logps/rejected": -257.94525146484375, + "loss": 0.5747, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5460470914840698, + "rewards/margins": 0.5066577196121216, + "rewards/rejected": -2.0527050495147705, + "step": 8030 + }, + { + "epoch": 1.385251550654721, + "grad_norm": 28.32925796508789, + "learning_rate": 1.3032912759044937e-07, + "logits/chosen": -2.1125435829162598, + "logits/rejected": -2.062844753265381, + "logps/chosen": -220.2933349609375, + "logps/rejected": -269.1329650878906, + "loss": 0.5803, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6463918685913086, + "rewards/margins": 0.5379444360733032, + "rewards/rejected": -2.1843366622924805, + "step": 8040 + }, + { + "epoch": 1.38697450034459, + "grad_norm": 26.020471572875977, + "learning_rate": 1.301380254747597e-07, + "logits/chosen": -2.1440205574035645, + "logits/rejected": -2.113903045654297, + "logps/chosen": -203.15582275390625, + "logps/rejected": -256.64044189453125, + "loss": 0.5576, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4739043712615967, + "rewards/margins": 0.5403920412063599, + "rewards/rejected": -2.014296531677246, + "step": 8050 + }, + { + "epoch": 1.388697450034459, + "grad_norm": 32.8132209777832, + "learning_rate": 1.2994680222218478e-07, + "logits/chosen": -2.1887154579162598, + "logits/rejected": -2.1529600620269775, + "logps/chosen": -211.48086547851562, + "logps/rejected": -248.77041625976562, + "loss": 0.598, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5500705242156982, + "rewards/margins": 0.44143232703208923, + "rewards/rejected": -1.9915030002593994, + "step": 8060 + }, + { + "epoch": 1.390420399724328, + "grad_norm": 39.48831558227539, + "learning_rate": 1.29755458601328e-07, + "logits/chosen": -2.1170506477355957, + "logits/rejected": -2.078484058380127, + "logps/chosen": -200.0701904296875, + "logps/rejected": -240.6426544189453, + "loss": 0.5641, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4589961767196655, + "rewards/margins": 0.45816683769226074, + "rewards/rejected": -1.9171631336212158, + "step": 8070 + }, + { + "epoch": 1.392143349414197, + "grad_norm": 33.201900482177734, + "learning_rate": 1.2956399538127665e-07, + "logits/chosen": -2.114403247833252, + "logits/rejected": -2.0826992988586426, + "logps/chosen": -205.1845245361328, + "logps/rejected": -267.910888671875, + "loss": 0.5387, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.513293981552124, + "rewards/margins": 0.5943515300750732, + "rewards/rejected": -2.1076455116271973, + "step": 8080 + }, + { + "epoch": 1.3938662991040662, + "grad_norm": 28.73116111755371, + "learning_rate": 1.2937241333159854e-07, + "logits/chosen": -2.089317798614502, + "logits/rejected": -2.0446722507476807, + "logps/chosen": -224.7921600341797, + "logps/rejected": -283.7295837402344, + "loss": 0.5499, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7055130004882812, + "rewards/margins": 0.6278866529464722, + "rewards/rejected": -2.333399772644043, + "step": 8090 + }, + { + "epoch": 1.3955892487939352, + "grad_norm": 36.38704299926758, + "learning_rate": 1.2918071322233933e-07, + "logits/chosen": -2.109844446182251, + "logits/rejected": -2.093400478363037, + "logps/chosen": -251.6876220703125, + "logps/rejected": -291.4291687011719, + "loss": 0.628, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.9619048833847046, + "rewards/margins": 0.37870702147483826, + "rewards/rejected": -2.3406119346618652, + "step": 8100 + }, + { + "epoch": 1.3973121984838044, + "grad_norm": 37.000370025634766, + "learning_rate": 1.2898889582401912e-07, + "logits/chosen": -2.089730739593506, + "logits/rejected": -2.046140193939209, + "logps/chosen": -251.7401580810547, + "logps/rejected": -300.5556640625, + "loss": 0.5798, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.9494653940200806, + "rewards/margins": 0.5562061071395874, + "rewards/rejected": -2.505671262741089, + "step": 8110 + }, + { + "epoch": 1.3990351481736734, + "grad_norm": 35.38205337524414, + "learning_rate": 1.287969619076294e-07, + "logits/chosen": -2.1034393310546875, + "logits/rejected": -2.068429470062256, + "logps/chosen": -234.1424102783203, + "logps/rejected": -277.00262451171875, + "loss": 0.5932, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.783160924911499, + "rewards/margins": 0.46717602014541626, + "rewards/rejected": -2.2503371238708496, + "step": 8120 + }, + { + "epoch": 1.4007580978635423, + "grad_norm": 43.41401672363281, + "learning_rate": 1.2860491224463003e-07, + "logits/chosen": -2.146254777908325, + "logits/rejected": -2.1182961463928223, + "logps/chosen": -212.2090606689453, + "logps/rejected": -252.13058471679688, + "loss": 0.5811, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5653047561645508, + "rewards/margins": 0.45889678597450256, + "rewards/rejected": -2.0242016315460205, + "step": 8130 + }, + { + "epoch": 1.4024810475534115, + "grad_norm": 36.6799430847168, + "learning_rate": 1.2841274760694607e-07, + "logits/chosen": -2.1333212852478027, + "logits/rejected": -2.093632936477661, + "logps/chosen": -203.4479217529297, + "logps/rejected": -258.7621154785156, + "loss": 0.553, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4967834949493408, + "rewards/margins": 0.5681794881820679, + "rewards/rejected": -2.064962863922119, + "step": 8140 + }, + { + "epoch": 1.4042039972432805, + "grad_norm": 24.73906135559082, + "learning_rate": 1.282204687669648e-07, + "logits/chosen": -2.1712634563446045, + "logits/rejected": -2.13863468170166, + "logps/chosen": -212.09573364257812, + "logps/rejected": -257.94891357421875, + "loss": 0.6103, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5868251323699951, + "rewards/margins": 0.47013989090919495, + "rewards/rejected": -2.0569651126861572, + "step": 8150 + }, + { + "epoch": 1.4059269469331497, + "grad_norm": 26.953794479370117, + "learning_rate": 1.280280764975324e-07, + "logits/chosen": -2.1399598121643066, + "logits/rejected": -2.097625732421875, + "logps/chosen": -194.59352111816406, + "logps/rejected": -259.80902099609375, + "loss": 0.4902, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.381890058517456, + "rewards/margins": 0.706649661064148, + "rewards/rejected": -2.0885396003723145, + "step": 8160 + }, + { + "epoch": 1.4076498966230186, + "grad_norm": 36.16432571411133, + "learning_rate": 1.278355715719511e-07, + "logits/chosen": -2.164865016937256, + "logits/rejected": -2.1224586963653564, + "logps/chosen": -215.82766723632812, + "logps/rejected": -252.830078125, + "loss": 0.5694, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5576438903808594, + "rewards/margins": 0.45905083417892456, + "rewards/rejected": -2.0166945457458496, + "step": 8170 + }, + { + "epoch": 1.4093728463128876, + "grad_norm": 37.798011779785156, + "learning_rate": 1.276429547639758e-07, + "logits/chosen": -2.149423837661743, + "logits/rejected": -2.1127476692199707, + "logps/chosen": -241.2819061279297, + "logps/rejected": -288.2625732421875, + "loss": 0.5852, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.8370189666748047, + "rewards/margins": 0.4997822344303131, + "rewards/rejected": -2.336801290512085, + "step": 8180 + }, + { + "epoch": 1.4110957960027566, + "grad_norm": 38.44746398925781, + "learning_rate": 1.274502268478112e-07, + "logits/chosen": -2.0557899475097656, + "logits/rejected": -2.0156021118164062, + "logps/chosen": -246.6701202392578, + "logps/rejected": -301.4805603027344, + "loss": 0.5658, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.8727941513061523, + "rewards/margins": 0.6068886518478394, + "rewards/rejected": -2.4796829223632812, + "step": 8190 + }, + { + "epoch": 1.4128187456926258, + "grad_norm": 46.51035690307617, + "learning_rate": 1.2725738859810862e-07, + "logits/chosen": -2.112189769744873, + "logits/rejected": -2.080653667449951, + "logps/chosen": -244.71798706054688, + "logps/rejected": -282.700439453125, + "loss": 0.6207, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.8888120651245117, + "rewards/margins": 0.4255140721797943, + "rewards/rejected": -2.314326286315918, + "step": 8200 + }, + { + "epoch": 1.414541695382495, + "grad_norm": 24.586009979248047, + "learning_rate": 1.270644407899627e-07, + "logits/chosen": -2.067437171936035, + "logits/rejected": -2.0337612628936768, + "logps/chosen": -203.90200805664062, + "logps/rejected": -251.04037475585938, + "loss": 0.5897, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4770008325576782, + "rewards/margins": 0.4779542088508606, + "rewards/rejected": -1.9549548625946045, + "step": 8210 + }, + { + "epoch": 1.416264645072364, + "grad_norm": 32.18049240112305, + "learning_rate": 1.2687138419890863e-07, + "logits/chosen": -2.1636199951171875, + "logits/rejected": -2.1185073852539062, + "logps/chosen": -184.97103881835938, + "logps/rejected": -235.7440948486328, + "loss": 0.5581, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3181358575820923, + "rewards/margins": 0.5168372392654419, + "rewards/rejected": -1.8349730968475342, + "step": 8220 + }, + { + "epoch": 1.417987594762233, + "grad_norm": 30.544574737548828, + "learning_rate": 1.2667821960091865e-07, + "logits/chosen": -2.14854097366333, + "logits/rejected": -2.1254589557647705, + "logps/chosen": -202.51792907714844, + "logps/rejected": -237.36862182617188, + "loss": 0.6038, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4582659006118774, + "rewards/margins": 0.41172361373901367, + "rewards/rejected": -1.8699896335601807, + "step": 8230 + }, + { + "epoch": 1.4197105444521019, + "grad_norm": 48.499568939208984, + "learning_rate": 1.2648494777239934e-07, + "logits/chosen": -2.19791579246521, + "logits/rejected": -2.153782844543457, + "logps/chosen": -221.105712890625, + "logps/rejected": -264.9075622558594, + "loss": 0.572, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6414047479629517, + "rewards/margins": 0.4974588453769684, + "rewards/rejected": -2.1388633251190186, + "step": 8240 + }, + { + "epoch": 1.421433494141971, + "grad_norm": 37.22239303588867, + "learning_rate": 1.2629156949018805e-07, + "logits/chosen": -2.154099464416504, + "logits/rejected": -2.1236770153045654, + "logps/chosen": -220.8912811279297, + "logps/rejected": -274.5480041503906, + "loss": 0.5441, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6146440505981445, + "rewards/margins": 0.589509129524231, + "rewards/rejected": -2.204153537750244, + "step": 8250 + }, + { + "epoch": 1.42315644383184, + "grad_norm": 28.23543930053711, + "learning_rate": 1.260980855315502e-07, + "logits/chosen": -2.190843105316162, + "logits/rejected": -2.158343553543091, + "logps/chosen": -228.07455444335938, + "logps/rejected": -278.42626953125, + "loss": 0.5663, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6998857259750366, + "rewards/margins": 0.5324597954750061, + "rewards/rejected": -2.2323453426361084, + "step": 8260 + }, + { + "epoch": 1.4248793935217092, + "grad_norm": 35.08918380737305, + "learning_rate": 1.2590449667417585e-07, + "logits/chosen": -2.1991262435913086, + "logits/rejected": -2.1735010147094727, + "logps/chosen": -216.458251953125, + "logps/rejected": -261.60736083984375, + "loss": 0.6025, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.6242341995239258, + "rewards/margins": 0.4514681398868561, + "rewards/rejected": -2.07570219039917, + "step": 8270 + }, + { + "epoch": 1.4266023432115782, + "grad_norm": 32.83218002319336, + "learning_rate": 1.2571080369617673e-07, + "logits/chosen": -2.139239549636841, + "logits/rejected": -2.1168651580810547, + "logps/chosen": -204.988525390625, + "logps/rejected": -241.57247924804688, + "loss": 0.615, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5196573734283447, + "rewards/margins": 0.3771902322769165, + "rewards/rejected": -1.8968474864959717, + "step": 8280 + }, + { + "epoch": 1.4283252929014472, + "grad_norm": 22.9846134185791, + "learning_rate": 1.2551700737608313e-07, + "logits/chosen": -2.14395809173584, + "logits/rejected": -2.0958104133605957, + "logps/chosen": -183.81094360351562, + "logps/rejected": -212.01416015625, + "loss": 0.5932, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.231366753578186, + "rewards/margins": 0.3542328476905823, + "rewards/rejected": -1.585599660873413, + "step": 8290 + }, + { + "epoch": 1.4300482425913164, + "grad_norm": 20.401521682739258, + "learning_rate": 1.253231084928406e-07, + "logits/chosen": -2.2636139392852783, + "logits/rejected": -2.2296109199523926, + "logps/chosen": -199.21340942382812, + "logps/rejected": -241.51412963867188, + "loss": 0.5865, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3988856077194214, + "rewards/margins": 0.4828069806098938, + "rewards/rejected": -1.8816925287246704, + "step": 8300 + }, + { + "epoch": 1.4317711922811853, + "grad_norm": 36.791229248046875, + "learning_rate": 1.2512910782580704e-07, + "logits/chosen": -2.1130568981170654, + "logits/rejected": -2.073232889175415, + "logps/chosen": -195.42286682128906, + "logps/rejected": -232.5188446044922, + "loss": 0.5896, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3795632123947144, + "rewards/margins": 0.4427759051322937, + "rewards/rejected": -1.8223390579223633, + "step": 8310 + }, + { + "epoch": 1.4334941419710545, + "grad_norm": 28.587932586669922, + "learning_rate": 1.2493500615474937e-07, + "logits/chosen": -2.128284215927124, + "logits/rejected": -2.10135817527771, + "logps/chosen": -190.30288696289062, + "logps/rejected": -242.4776611328125, + "loss": 0.5698, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.366477131843567, + "rewards/margins": 0.523558497428894, + "rewards/rejected": -1.89003586769104, + "step": 8320 + }, + { + "epoch": 1.4352170916609235, + "grad_norm": 30.92583656311035, + "learning_rate": 1.2474080425984056e-07, + "logits/chosen": -2.1785707473754883, + "logits/rejected": -2.1576743125915527, + "logps/chosen": -188.43667602539062, + "logps/rejected": -229.5851593017578, + "loss": 0.615, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3890917301177979, + "rewards/margins": 0.37919071316719055, + "rewards/rejected": -1.7682822942733765, + "step": 8330 + }, + { + "epoch": 1.4369400413507925, + "grad_norm": 24.35154151916504, + "learning_rate": 1.2454650292165634e-07, + "logits/chosen": -2.2455568313598633, + "logits/rejected": -2.2208244800567627, + "logps/chosen": -173.263427734375, + "logps/rejected": -210.4772491455078, + "loss": 0.5819, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1742428541183472, + "rewards/margins": 0.41545358300209045, + "rewards/rejected": -1.5896964073181152, + "step": 8340 + }, + { + "epoch": 1.4386629910406616, + "grad_norm": 25.503570556640625, + "learning_rate": 1.2435210292117223e-07, + "logits/chosen": -2.1394078731536865, + "logits/rejected": -2.112879753112793, + "logps/chosen": -168.69577026367188, + "logps/rejected": -195.05099487304688, + "loss": 0.615, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1397912502288818, + "rewards/margins": 0.28986939787864685, + "rewards/rejected": -1.4296607971191406, + "step": 8350 + }, + { + "epoch": 1.4403859407305306, + "grad_norm": 29.409194946289062, + "learning_rate": 1.2415760503976027e-07, + "logits/chosen": -2.1161611080169678, + "logits/rejected": -2.072409152984619, + "logps/chosen": -164.82449340820312, + "logps/rejected": -202.44180297851562, + "loss": 0.5674, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0871448516845703, + "rewards/margins": 0.4222991466522217, + "rewards/rejected": -1.509443998336792, + "step": 8360 + }, + { + "epoch": 1.4421088904203998, + "grad_norm": 31.52328109741211, + "learning_rate": 1.2396301005918592e-07, + "logits/chosen": -2.1593313217163086, + "logits/rejected": -2.1244845390319824, + "logps/chosen": -173.5267791748047, + "logps/rejected": -224.0096435546875, + "loss": 0.5643, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.214634656906128, + "rewards/margins": 0.5000126361846924, + "rewards/rejected": -1.7146470546722412, + "step": 8370 + }, + { + "epoch": 1.4438318401102688, + "grad_norm": 26.57062339782715, + "learning_rate": 1.2376831876160493e-07, + "logits/chosen": -2.207657814025879, + "logits/rejected": -2.174440860748291, + "logps/chosen": -182.63824462890625, + "logps/rejected": -229.58169555664062, + "loss": 0.5749, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2988611459732056, + "rewards/margins": 0.46366414427757263, + "rewards/rejected": -1.7625253200531006, + "step": 8380 + }, + { + "epoch": 1.4455547898001377, + "grad_norm": 33.905601501464844, + "learning_rate": 1.2357353192956015e-07, + "logits/chosen": -2.181260824203491, + "logits/rejected": -2.130100727081299, + "logps/chosen": -204.32766723632812, + "logps/rejected": -269.24566650390625, + "loss": 0.5149, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.469922423362732, + "rewards/margins": 0.6692734956741333, + "rewards/rejected": -2.1391959190368652, + "step": 8390 + }, + { + "epoch": 1.447277739490007, + "grad_norm": 32.535919189453125, + "learning_rate": 1.2337865034597853e-07, + "logits/chosen": -2.1397032737731934, + "logits/rejected": -2.1052467823028564, + "logps/chosen": -210.44638061523438, + "logps/rejected": -265.6088562011719, + "loss": 0.5537, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5950980186462402, + "rewards/margins": 0.5580428242683411, + "rewards/rejected": -2.1531407833099365, + "step": 8400 + }, + { + "epoch": 1.447277739490007, + "eval_logits/chosen": -2.198362350463867, + "eval_logits/rejected": -2.1797170639038086, + "eval_logps/chosen": -202.7633819580078, + "eval_logps/rejected": -230.68116760253906, + "eval_loss": 0.6445255875587463, + "eval_rewards/accuracies": 0.6259293556213379, + "eval_rewards/chosen": -1.4374792575836182, + "eval_rewards/margins": 0.24183642864227295, + "eval_rewards/rejected": -1.679315447807312, + "eval_runtime": 383.012, + "eval_samples_per_second": 11.237, + "eval_steps_per_second": 1.405, + "step": 8400 + }, + { + "epoch": 1.449000689179876, + "grad_norm": 31.82181167602539, + "learning_rate": 1.2318367479416772e-07, + "logits/chosen": -2.113138198852539, + "logits/rejected": -2.0723438262939453, + "logps/chosen": -223.67822265625, + "logps/rejected": -277.77703857421875, + "loss": 0.5478, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6481117010116577, + "rewards/margins": 0.6274040937423706, + "rewards/rejected": -2.2755157947540283, + "step": 8410 + }, + { + "epoch": 1.450723638869745, + "grad_norm": 31.47911262512207, + "learning_rate": 1.2298860605781317e-07, + "logits/chosen": -2.025463581085205, + "logits/rejected": -1.9937463998794556, + "logps/chosen": -209.2928466796875, + "logps/rejected": -263.2427673339844, + "loss": 0.5496, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5683337450027466, + "rewards/margins": 0.5581804513931274, + "rewards/rejected": -2.126513957977295, + "step": 8420 + }, + { + "epoch": 1.452446588559614, + "grad_norm": 36.388572692871094, + "learning_rate": 1.2279344492097482e-07, + "logits/chosen": -2.0982298851013184, + "logits/rejected": -2.066349744796753, + "logps/chosen": -224.09603881835938, + "logps/rejected": -277.3132629394531, + "loss": 0.5998, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.6971712112426758, + "rewards/margins": 0.5131348371505737, + "rewards/rejected": -2.21030592918396, + "step": 8430 + }, + { + "epoch": 1.454169538249483, + "grad_norm": 28.68044662475586, + "learning_rate": 1.2259819216808406e-07, + "logits/chosen": -2.1119582653045654, + "logits/rejected": -2.089153289794922, + "logps/chosen": -222.8911895751953, + "logps/rejected": -261.83636474609375, + "loss": 0.6012, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6672919988632202, + "rewards/margins": 0.4484184682369232, + "rewards/rejected": -2.115710496902466, + "step": 8440 + }, + { + "epoch": 1.4558924879393522, + "grad_norm": 44.86223220825195, + "learning_rate": 1.2240284858394048e-07, + "logits/chosen": -2.0294086933135986, + "logits/rejected": -2.006143093109131, + "logps/chosen": -217.24111938476562, + "logps/rejected": -272.67327880859375, + "loss": 0.5672, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.685194969177246, + "rewards/margins": 0.5192277431488037, + "rewards/rejected": -2.20442271232605, + "step": 8450 + }, + { + "epoch": 1.4576154376292212, + "grad_norm": 28.0307559967041, + "learning_rate": 1.2220741495370875e-07, + "logits/chosen": -2.113776683807373, + "logits/rejected": -2.079357147216797, + "logps/chosen": -225.5147247314453, + "logps/rejected": -274.3211669921875, + "loss": 0.5735, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7199233770370483, + "rewards/margins": 0.49351826310157776, + "rewards/rejected": -2.2134416103363037, + "step": 8460 + }, + { + "epoch": 1.4593383873190904, + "grad_norm": 45.38313293457031, + "learning_rate": 1.220118920629155e-07, + "logits/chosen": -2.125673770904541, + "logits/rejected": -2.0898425579071045, + "logps/chosen": -226.78396606445312, + "logps/rejected": -271.1668395996094, + "loss": 0.6055, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.7033987045288086, + "rewards/margins": 0.4875558316707611, + "rewards/rejected": -2.1909546852111816, + "step": 8470 + }, + { + "epoch": 1.4610613370089593, + "grad_norm": 34.2986946105957, + "learning_rate": 1.2181628069744613e-07, + "logits/chosen": -2.1393344402313232, + "logits/rejected": -2.0992624759674072, + "logps/chosen": -187.6902313232422, + "logps/rejected": -234.9958038330078, + "loss": 0.5692, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3351308107376099, + "rewards/margins": 0.5066913366317749, + "rewards/rejected": -1.8418220281600952, + "step": 8480 + }, + { + "epoch": 1.4627842866988283, + "grad_norm": 27.435794830322266, + "learning_rate": 1.216205816435416e-07, + "logits/chosen": -2.2194714546203613, + "logits/rejected": -2.195204257965088, + "logps/chosen": -175.31588745117188, + "logps/rejected": -224.5002899169922, + "loss": 0.5579, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.201210618019104, + "rewards/margins": 0.48213768005371094, + "rewards/rejected": -1.6833486557006836, + "step": 8490 + }, + { + "epoch": 1.4645072363886975, + "grad_norm": 32.16939926147461, + "learning_rate": 1.2142479568779545e-07, + "logits/chosen": -2.1218631267547607, + "logits/rejected": -2.0974326133728027, + "logps/chosen": -182.5852813720703, + "logps/rejected": -221.85195922851562, + "loss": 0.579, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2722389698028564, + "rewards/margins": 0.42209678888320923, + "rewards/rejected": -1.694335699081421, + "step": 8500 + }, + { + "epoch": 1.4662301860785665, + "grad_norm": 40.23246765136719, + "learning_rate": 1.2122892361715042e-07, + "logits/chosen": -2.1258957386016846, + "logits/rejected": -2.0866127014160156, + "logps/chosen": -197.31124877929688, + "logps/rejected": -246.4371795654297, + "loss": 0.5417, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3823117017745972, + "rewards/margins": 0.5729239583015442, + "rewards/rejected": -1.9552357196807861, + "step": 8510 + }, + { + "epoch": 1.4679531357684357, + "grad_norm": 26.580339431762695, + "learning_rate": 1.2103296621889531e-07, + "logits/chosen": -2.098947525024414, + "logits/rejected": -2.0675082206726074, + "logps/chosen": -206.1828155517578, + "logps/rejected": -247.58139038085938, + "loss": 0.5763, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5146195888519287, + "rewards/margins": 0.4820035994052887, + "rewards/rejected": -1.9966232776641846, + "step": 8520 + }, + { + "epoch": 1.4696760854583046, + "grad_norm": 24.382217407226562, + "learning_rate": 1.2083692428066207e-07, + "logits/chosen": -2.1002607345581055, + "logits/rejected": -2.0706868171691895, + "logps/chosen": -195.86622619628906, + "logps/rejected": -232.44155883789062, + "loss": 0.6081, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4051289558410645, + "rewards/margins": 0.4165209233760834, + "rewards/rejected": -1.8216501474380493, + "step": 8530 + }, + { + "epoch": 1.4713990351481736, + "grad_norm": 25.53142738342285, + "learning_rate": 1.2064079859042237e-07, + "logits/chosen": -2.24005126953125, + "logits/rejected": -2.216982126235962, + "logps/chosen": -178.56973266601562, + "logps/rejected": -211.1771240234375, + "loss": 0.6123, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2287400960922241, + "rewards/margins": 0.34191378951072693, + "rewards/rejected": -1.570654034614563, + "step": 8540 + }, + { + "epoch": 1.4731219848380428, + "grad_norm": 26.57423973083496, + "learning_rate": 1.204445899364844e-07, + "logits/chosen": -2.174487590789795, + "logits/rejected": -2.1445860862731934, + "logps/chosen": -174.37026977539062, + "logps/rejected": -221.49685668945312, + "loss": 0.5647, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1900184154510498, + "rewards/margins": 0.47659429907798767, + "rewards/rejected": -1.6666128635406494, + "step": 8550 + }, + { + "epoch": 1.4748449345279118, + "grad_norm": 26.368349075317383, + "learning_rate": 1.2024829910749e-07, + "logits/chosen": -2.24088978767395, + "logits/rejected": -2.1960439682006836, + "logps/chosen": -182.83432006835938, + "logps/rejected": -239.61111450195312, + "loss": 0.5278, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2800990343093872, + "rewards/margins": 0.5987238883972168, + "rewards/rejected": -1.878822684288025, + "step": 8560 + }, + { + "epoch": 1.476567884217781, + "grad_norm": 36.1506233215332, + "learning_rate": 1.2005192689241111e-07, + "logits/chosen": -2.098243236541748, + "logits/rejected": -2.067201614379883, + "logps/chosen": -207.9953155517578, + "logps/rejected": -254.19345092773438, + "loss": 0.5628, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5292612314224243, + "rewards/margins": 0.5006064176559448, + "rewards/rejected": -2.029867649078369, + "step": 8570 + }, + { + "epoch": 1.47829083390765, + "grad_norm": 36.61823272705078, + "learning_rate": 1.1985547408054707e-07, + "logits/chosen": -2.137028217315674, + "logits/rejected": -2.096841335296631, + "logps/chosen": -226.59426879882812, + "logps/rejected": -289.16400146484375, + "loss": 0.5243, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.712284803390503, + "rewards/margins": 0.639613926410675, + "rewards/rejected": -2.351898670196533, + "step": 8580 + }, + { + "epoch": 1.480013783597519, + "grad_norm": 27.322467803955078, + "learning_rate": 1.1965894146152083e-07, + "logits/chosen": -2.114105701446533, + "logits/rejected": -2.0724968910217285, + "logps/chosen": -236.0207977294922, + "logps/rejected": -287.5677185058594, + "loss": 0.5503, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7737798690795898, + "rewards/margins": 0.5756506323814392, + "rewards/rejected": -2.349430561065674, + "step": 8590 + }, + { + "epoch": 1.481736733287388, + "grad_norm": 39.7527961730957, + "learning_rate": 1.1946232982527637e-07, + "logits/chosen": -2.091597080230713, + "logits/rejected": -2.0650010108947754, + "logps/chosen": -234.6602020263672, + "logps/rejected": -267.87298583984375, + "loss": 0.647, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.8054077625274658, + "rewards/margins": 0.36355963349342346, + "rewards/rejected": -2.1689672470092773, + "step": 8600 + }, + { + "epoch": 1.483459682977257, + "grad_norm": 32.23507308959961, + "learning_rate": 1.1926563996207518e-07, + "logits/chosen": -2.141597270965576, + "logits/rejected": -2.107510566711426, + "logps/chosen": -205.731689453125, + "logps/rejected": -252.4749755859375, + "loss": 0.5796, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.503330945968628, + "rewards/margins": 0.518940806388855, + "rewards/rejected": -2.0222718715667725, + "step": 8610 + }, + { + "epoch": 1.4851826326671262, + "grad_norm": 25.941207885742188, + "learning_rate": 1.1906887266249317e-07, + "logits/chosen": -2.1043217182159424, + "logits/rejected": -2.088639736175537, + "logps/chosen": -187.4060821533203, + "logps/rejected": -212.50051879882812, + "loss": 0.6189, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3152862787246704, + "rewards/margins": 0.3075936436653137, + "rewards/rejected": -1.622880220413208, + "step": 8620 + }, + { + "epoch": 1.4869055823569952, + "grad_norm": 27.588563919067383, + "learning_rate": 1.1887202871741757e-07, + "logits/chosen": -2.0996415615081787, + "logits/rejected": -2.0707783699035645, + "logps/chosen": -168.55072021484375, + "logps/rejected": -217.21298217773438, + "loss": 0.5502, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1530864238739014, + "rewards/margins": 0.5020955204963684, + "rewards/rejected": -1.655182123184204, + "step": 8630 + }, + { + "epoch": 1.4886285320468642, + "grad_norm": 24.333717346191406, + "learning_rate": 1.1867510891804353e-07, + "logits/chosen": -2.1870296001434326, + "logits/rejected": -2.1533381938934326, + "logps/chosen": -199.0816650390625, + "logps/rejected": -234.541015625, + "loss": 0.6065, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4219980239868164, + "rewards/margins": 0.4048318862915039, + "rewards/rejected": -1.8268299102783203, + "step": 8640 + }, + { + "epoch": 1.4903514817367332, + "grad_norm": 39.82074737548828, + "learning_rate": 1.1847811405587127e-07, + "logits/chosen": -2.109865665435791, + "logits/rejected": -2.071108341217041, + "logps/chosen": -205.94320678710938, + "logps/rejected": -248.83682250976562, + "loss": 0.5983, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4994192123413086, + "rewards/margins": 0.4557233452796936, + "rewards/rejected": -1.955142617225647, + "step": 8650 + }, + { + "epoch": 1.4920744314266023, + "grad_norm": 42.07971954345703, + "learning_rate": 1.1828104492270254e-07, + "logits/chosen": -2.117985963821411, + "logits/rejected": -2.086888551712036, + "logps/chosen": -208.15737915039062, + "logps/rejected": -256.1336975097656, + "loss": 0.592, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5353782176971436, + "rewards/margins": 0.5151991248130798, + "rewards/rejected": -2.05057692527771, + "step": 8660 + }, + { + "epoch": 1.4937973811164715, + "grad_norm": 26.2050724029541, + "learning_rate": 1.1808390231063783e-07, + "logits/chosen": -2.1967854499816895, + "logits/rejected": -2.16111421585083, + "logps/chosen": -198.25198364257812, + "logps/rejected": -248.2962188720703, + "loss": 0.5703, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4539095163345337, + "rewards/margins": 0.5109778642654419, + "rewards/rejected": -1.964887261390686, + "step": 8670 + }, + { + "epoch": 1.4955203308063405, + "grad_norm": 33.02647399902344, + "learning_rate": 1.1788668701207274e-07, + "logits/chosen": -2.121121883392334, + "logits/rejected": -2.1066901683807373, + "logps/chosen": -204.23362731933594, + "logps/rejected": -239.91250610351562, + "loss": 0.6268, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5213901996612549, + "rewards/margins": 0.34961163997650146, + "rewards/rejected": -1.871001958847046, + "step": 8680 + }, + { + "epoch": 1.4972432804962095, + "grad_norm": 28.559772491455078, + "learning_rate": 1.1768939981969515e-07, + "logits/chosen": -2.1288161277770996, + "logits/rejected": -2.0944597721099854, + "logps/chosen": -211.2086181640625, + "logps/rejected": -243.73361206054688, + "loss": 0.6311, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.5508366823196411, + "rewards/margins": 0.41266554594039917, + "rewards/rejected": -1.9635021686553955, + "step": 8690 + }, + { + "epoch": 1.4989662301860784, + "grad_norm": 23.26927947998047, + "learning_rate": 1.1749204152648191e-07, + "logits/chosen": -2.1648428440093994, + "logits/rejected": -2.130405902862549, + "logps/chosen": -206.45309448242188, + "logps/rejected": -235.69580078125, + "loss": 0.6144, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.493087887763977, + "rewards/margins": 0.3440720736980438, + "rewards/rejected": -1.8371598720550537, + "step": 8700 + }, + { + "epoch": 1.5006891798759476, + "grad_norm": 21.11803436279297, + "learning_rate": 1.1729461292569563e-07, + "logits/chosen": -2.1213269233703613, + "logits/rejected": -2.1003379821777344, + "logps/chosen": -190.43356323242188, + "logps/rejected": -219.8393096923828, + "loss": 0.6394, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.364392876625061, + "rewards/margins": 0.30296334624290466, + "rewards/rejected": -1.6673561334609985, + "step": 8710 + }, + { + "epoch": 1.5024121295658168, + "grad_norm": 33.64053726196289, + "learning_rate": 1.1709711481088156e-07, + "logits/chosen": -2.1832005977630615, + "logits/rejected": -2.1413581371307373, + "logps/chosen": -174.26988220214844, + "logps/rejected": -220.2556610107422, + "loss": 0.541, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1718385219573975, + "rewards/margins": 0.49143847823143005, + "rewards/rejected": -1.66327702999115, + "step": 8720 + }, + { + "epoch": 1.5041350792556858, + "grad_norm": 27.05516242980957, + "learning_rate": 1.1689954797586422e-07, + "logits/chosen": -2.1730306148529053, + "logits/rejected": -2.127138614654541, + "logps/chosen": -193.1197052001953, + "logps/rejected": -243.0399169921875, + "loss": 0.5735, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3963727951049805, + "rewards/margins": 0.519677996635437, + "rewards/rejected": -1.916050672531128, + "step": 8730 + }, + { + "epoch": 1.5058580289455548, + "grad_norm": 57.3426628112793, + "learning_rate": 1.1670191321474457e-07, + "logits/chosen": -2.146721363067627, + "logits/rejected": -2.1224703788757324, + "logps/chosen": -215.717529296875, + "logps/rejected": -269.1415710449219, + "loss": 0.5754, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.623270034790039, + "rewards/margins": 0.546475887298584, + "rewards/rejected": -2.169745922088623, + "step": 8740 + }, + { + "epoch": 1.5075809786354237, + "grad_norm": 33.93220138549805, + "learning_rate": 1.1650421132189634e-07, + "logits/chosen": -2.145536422729492, + "logits/rejected": -2.1085126399993896, + "logps/chosen": -213.81161499023438, + "logps/rejected": -272.66522216796875, + "loss": 0.5375, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5925045013427734, + "rewards/margins": 0.6129032969474792, + "rewards/rejected": -2.2054076194763184, + "step": 8750 + }, + { + "epoch": 1.509303928325293, + "grad_norm": 26.928314208984375, + "learning_rate": 1.1630644309196327e-07, + "logits/chosen": -2.095851421356201, + "logits/rejected": -2.0776500701904297, + "logps/chosen": -219.29922485351562, + "logps/rejected": -266.6626892089844, + "loss": 0.5936, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6660619974136353, + "rewards/margins": 0.457836389541626, + "rewards/rejected": -2.1238980293273926, + "step": 8760 + }, + { + "epoch": 1.5110268780151621, + "grad_norm": 22.584171295166016, + "learning_rate": 1.1610860931985566e-07, + "logits/chosen": -2.140084743499756, + "logits/rejected": -2.1087329387664795, + "logps/chosen": -208.4669952392578, + "logps/rejected": -253.07095336914062, + "loss": 0.5809, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5258299112319946, + "rewards/margins": 0.5107992887496948, + "rewards/rejected": -2.0366291999816895, + "step": 8770 + }, + { + "epoch": 1.512749827705031, + "grad_norm": 30.782161712646484, + "learning_rate": 1.1591071080074727e-07, + "logits/chosen": -2.200284957885742, + "logits/rejected": -2.1839470863342285, + "logps/chosen": -196.86715698242188, + "logps/rejected": -236.0321044921875, + "loss": 0.6009, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4668762683868408, + "rewards/margins": 0.3684500455856323, + "rewards/rejected": -1.8353259563446045, + "step": 8780 + }, + { + "epoch": 1.5144727773949, + "grad_norm": 31.48596954345703, + "learning_rate": 1.1571274833007214e-07, + "logits/chosen": -2.2006895542144775, + "logits/rejected": -2.162903308868408, + "logps/chosen": -187.66943359375, + "logps/rejected": -231.6300048828125, + "loss": 0.5716, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3145688772201538, + "rewards/margins": 0.4905197024345398, + "rewards/rejected": -1.8050886392593384, + "step": 8790 + }, + { + "epoch": 1.516195727084769, + "grad_norm": 22.056011199951172, + "learning_rate": 1.1551472270352125e-07, + "logits/chosen": -2.150611162185669, + "logits/rejected": -2.116769790649414, + "logps/chosen": -187.0447540283203, + "logps/rejected": -218.8388214111328, + "loss": 0.61, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.307472586631775, + "rewards/margins": 0.36511510610580444, + "rewards/rejected": -1.6725876331329346, + "step": 8800 + }, + { + "epoch": 1.516195727084769, + "eval_logits/chosen": -2.2578678131103516, + "eval_logits/rejected": -2.242769479751587, + "eval_logps/chosen": -168.42660522460938, + "eval_logps/rejected": -192.21197509765625, + "eval_loss": 0.640521764755249, + "eval_rewards/accuracies": 0.616403341293335, + "eval_rewards/chosen": -1.094111442565918, + "eval_rewards/margins": 0.20051223039627075, + "eval_rewards/rejected": -1.294623613357544, + "eval_runtime": 383.423, + "eval_samples_per_second": 11.225, + "eval_steps_per_second": 1.403, + "step": 8800 + }, + { + "epoch": 1.5179186767746382, + "grad_norm": 37.015296936035156, + "learning_rate": 1.1531663471703956e-07, + "logits/chosen": -2.187131404876709, + "logits/rejected": -2.144275426864624, + "logps/chosen": -187.49765014648438, + "logps/rejected": -243.79248046875, + "loss": 0.5379, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3532283306121826, + "rewards/margins": 0.5681970119476318, + "rewards/rejected": -1.921425223350525, + "step": 8810 + }, + { + "epoch": 1.5196416264645074, + "grad_norm": 32.8203239440918, + "learning_rate": 1.1511848516682257e-07, + "logits/chosen": -2.192147970199585, + "logits/rejected": -2.1544651985168457, + "logps/chosen": -194.306640625, + "logps/rejected": -248.0606231689453, + "loss": 0.5455, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4270803928375244, + "rewards/margins": 0.564303994178772, + "rewards/rejected": -1.9913842678070068, + "step": 8820 + }, + { + "epoch": 1.5213645761543764, + "grad_norm": 38.81319808959961, + "learning_rate": 1.149202748493133e-07, + "logits/chosen": -2.034425735473633, + "logits/rejected": -1.9980943202972412, + "logps/chosen": -211.1022491455078, + "logps/rejected": -260.2487487792969, + "loss": 0.5638, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.557013750076294, + "rewards/margins": 0.5233016014099121, + "rewards/rejected": -2.080315351486206, + "step": 8830 + }, + { + "epoch": 1.5230875258442453, + "grad_norm": 29.696474075317383, + "learning_rate": 1.1472200456119901e-07, + "logits/chosen": -2.088050365447998, + "logits/rejected": -2.060323476791382, + "logps/chosen": -198.53713989257812, + "logps/rejected": -258.73529052734375, + "loss": 0.537, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.449790358543396, + "rewards/margins": 0.6091513633728027, + "rewards/rejected": -2.058941602706909, + "step": 8840 + }, + { + "epoch": 1.5248104755341143, + "grad_norm": 40.81111145019531, + "learning_rate": 1.1452367509940794e-07, + "logits/chosen": -2.1819236278533936, + "logits/rejected": -2.1462504863739014, + "logps/chosen": -206.32754516601562, + "logps/rejected": -267.1129455566406, + "loss": 0.5548, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5286524295806885, + "rewards/margins": 0.6038883924484253, + "rewards/rejected": -2.1325409412384033, + "step": 8850 + }, + { + "epoch": 1.5265334252239835, + "grad_norm": 37.93629455566406, + "learning_rate": 1.1432528726110628e-07, + "logits/chosen": -2.0919547080993652, + "logits/rejected": -2.056307315826416, + "logps/chosen": -230.38778686523438, + "logps/rejected": -283.297119140625, + "loss": 0.5624, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.7613922357559204, + "rewards/margins": 0.5530128479003906, + "rewards/rejected": -2.3144049644470215, + "step": 8860 + }, + { + "epoch": 1.5282563749138525, + "grad_norm": 28.398059844970703, + "learning_rate": 1.1412684184369478e-07, + "logits/chosen": -2.2037432193756104, + "logits/rejected": -2.154242753982544, + "logps/chosen": -233.8716583251953, + "logps/rejected": -292.74822998046875, + "loss": 0.5315, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7590751647949219, + "rewards/margins": 0.623824954032898, + "rewards/rejected": -2.3829002380371094, + "step": 8870 + }, + { + "epoch": 1.5299793246037217, + "grad_norm": 54.44942092895508, + "learning_rate": 1.1392833964480564e-07, + "logits/chosen": -2.0453591346740723, + "logits/rejected": -2.0088441371917725, + "logps/chosen": -233.88278198242188, + "logps/rejected": -281.6195373535156, + "loss": 0.6176, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.7956584692001343, + "rewards/margins": 0.515036940574646, + "rewards/rejected": -2.310695171356201, + "step": 8880 + }, + { + "epoch": 1.5317022742935906, + "grad_norm": 38.829612731933594, + "learning_rate": 1.137297814622993e-07, + "logits/chosen": -2.057898759841919, + "logits/rejected": -2.014880895614624, + "logps/chosen": -218.6936492919922, + "logps/rejected": -271.9684753417969, + "loss": 0.5398, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6280596256256104, + "rewards/margins": 0.591567873954773, + "rewards/rejected": -2.2196271419525146, + "step": 8890 + }, + { + "epoch": 1.5334252239834596, + "grad_norm": 34.26298904418945, + "learning_rate": 1.1353116809426121e-07, + "logits/chosen": -2.137526273727417, + "logits/rejected": -2.099030017852783, + "logps/chosen": -216.14047241210938, + "logps/rejected": -262.841796875, + "loss": 0.5809, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6013559103012085, + "rewards/margins": 0.5089315176010132, + "rewards/rejected": -2.110287666320801, + "step": 8900 + }, + { + "epoch": 1.5351481736733288, + "grad_norm": 41.93275451660156, + "learning_rate": 1.1333250033899867e-07, + "logits/chosen": -2.138963222503662, + "logits/rejected": -2.1128087043762207, + "logps/chosen": -210.839111328125, + "logps/rejected": -253.3659210205078, + "loss": 0.5888, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.537819504737854, + "rewards/margins": 0.4727010130882263, + "rewards/rejected": -2.0105204582214355, + "step": 8910 + }, + { + "epoch": 1.5368711233631978, + "grad_norm": 25.67527198791504, + "learning_rate": 1.131337789950375e-07, + "logits/chosen": -2.2184395790100098, + "logits/rejected": -2.170405149459839, + "logps/chosen": -203.72010803222656, + "logps/rejected": -259.0718688964844, + "loss": 0.5379, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.449211835861206, + "rewards/margins": 0.6281321048736572, + "rewards/rejected": -2.0773439407348633, + "step": 8920 + }, + { + "epoch": 1.538594073053067, + "grad_norm": 30.74202537536621, + "learning_rate": 1.12935004861119e-07, + "logits/chosen": -2.1468801498413086, + "logits/rejected": -2.103550434112549, + "logps/chosen": -206.2528533935547, + "logps/rejected": -254.24356079101562, + "loss": 0.5687, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.510958194732666, + "rewards/margins": 0.533153772354126, + "rewards/rejected": -2.044111967086792, + "step": 8930 + }, + { + "epoch": 1.540317022742936, + "grad_norm": 31.103778839111328, + "learning_rate": 1.1273617873619663e-07, + "logits/chosen": -2.155284881591797, + "logits/rejected": -2.1240036487579346, + "logps/chosen": -204.4677276611328, + "logps/rejected": -248.03125, + "loss": 0.5825, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4737517833709717, + "rewards/margins": 0.4530414044857025, + "rewards/rejected": -1.9267933368682861, + "step": 8940 + }, + { + "epoch": 1.5420399724328049, + "grad_norm": 33.27176284790039, + "learning_rate": 1.1253730141943276e-07, + "logits/chosen": -2.0801100730895996, + "logits/rejected": -2.0663437843322754, + "logps/chosen": -206.09115600585938, + "logps/rejected": -250.7277374267578, + "loss": 0.5877, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5113497972488403, + "rewards/margins": 0.43479982018470764, + "rewards/rejected": -1.946149230003357, + "step": 8950 + }, + { + "epoch": 1.5437629221226739, + "grad_norm": 57.94588088989258, + "learning_rate": 1.1233837371019566e-07, + "logits/chosen": -2.1334681510925293, + "logits/rejected": -2.0911877155303955, + "logps/chosen": -240.5612030029297, + "logps/rejected": -292.9984130859375, + "loss": 0.5788, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.8518180847167969, + "rewards/margins": 0.5720192193984985, + "rewards/rejected": -2.423837423324585, + "step": 8960 + }, + { + "epoch": 1.545485871812543, + "grad_norm": 31.889646530151367, + "learning_rate": 1.1213939640805594e-07, + "logits/chosen": -2.095510721206665, + "logits/rejected": -2.0452637672424316, + "logps/chosen": -237.1003875732422, + "logps/rejected": -292.24005126953125, + "loss": 0.52, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.8281110525131226, + "rewards/margins": 0.5949289798736572, + "rewards/rejected": -2.4230399131774902, + "step": 8970 + }, + { + "epoch": 1.5472088215024122, + "grad_norm": 40.414710998535156, + "learning_rate": 1.1194037031278378e-07, + "logits/chosen": -2.1494863033294678, + "logits/rejected": -2.118666172027588, + "logps/chosen": -257.7496337890625, + "logps/rejected": -294.08026123046875, + "loss": 0.6499, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.0259671211242676, + "rewards/margins": 0.3900921046733856, + "rewards/rejected": -2.4160590171813965, + "step": 8980 + }, + { + "epoch": 1.5489317711922812, + "grad_norm": 27.115970611572266, + "learning_rate": 1.1174129622434531e-07, + "logits/chosen": -2.084998607635498, + "logits/rejected": -2.046213388442993, + "logps/chosen": -217.5188446044922, + "logps/rejected": -272.76715087890625, + "loss": 0.5303, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6049985885620117, + "rewards/margins": 0.5865806341171265, + "rewards/rejected": -2.1915793418884277, + "step": 8990 + }, + { + "epoch": 1.5506547208821502, + "grad_norm": 38.3233528137207, + "learning_rate": 1.1154217494289966e-07, + "logits/chosen": -2.1537277698516846, + "logits/rejected": -2.1174731254577637, + "logps/chosen": -232.237060546875, + "logps/rejected": -275.6959228515625, + "loss": 0.6133, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.7371547222137451, + "rewards/margins": 0.5046462416648865, + "rewards/rejected": -2.2418007850646973, + "step": 9000 + }, + { + "epoch": 1.5523776705720191, + "grad_norm": 26.543476104736328, + "learning_rate": 1.1134300726879557e-07, + "logits/chosen": -2.1145331859588623, + "logits/rejected": -2.084182024002075, + "logps/chosen": -218.10641479492188, + "logps/rejected": -260.49365234375, + "loss": 0.5863, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6136271953582764, + "rewards/margins": 0.4704989790916443, + "rewards/rejected": -2.0841259956359863, + "step": 9010 + }, + { + "epoch": 1.5541006202618883, + "grad_norm": 32.8950080871582, + "learning_rate": 1.1114379400256828e-07, + "logits/chosen": -2.0746097564697266, + "logits/rejected": -2.040530204772949, + "logps/chosen": -196.68701171875, + "logps/rejected": -250.39822387695312, + "loss": 0.5431, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4095488786697388, + "rewards/margins": 0.5949562788009644, + "rewards/rejected": -2.004505157470703, + "step": 9020 + }, + { + "epoch": 1.5558235699517575, + "grad_norm": 34.005210876464844, + "learning_rate": 1.1094453594493634e-07, + "logits/chosen": -2.1117730140686035, + "logits/rejected": -2.1014599800109863, + "logps/chosen": -189.75453186035156, + "logps/rejected": -237.1101837158203, + "loss": 0.5811, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.373258113861084, + "rewards/margins": 0.4692157208919525, + "rewards/rejected": -1.8424737453460693, + "step": 9030 + }, + { + "epoch": 1.5575465196416265, + "grad_norm": 35.201900482177734, + "learning_rate": 1.107452338967982e-07, + "logits/chosen": -2.1079840660095215, + "logits/rejected": -2.0829663276672363, + "logps/chosen": -194.3500518798828, + "logps/rejected": -235.8008575439453, + "loss": 0.6012, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.430095911026001, + "rewards/margins": 0.4139363169670105, + "rewards/rejected": -1.8440319299697876, + "step": 9040 + }, + { + "epoch": 1.5592694693314955, + "grad_norm": 48.223419189453125, + "learning_rate": 1.1054588865922931e-07, + "logits/chosen": -2.161323308944702, + "logits/rejected": -2.1286869049072266, + "logps/chosen": -210.0124969482422, + "logps/rejected": -252.99826049804688, + "loss": 0.6013, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5600329637527466, + "rewards/margins": 0.4573109745979309, + "rewards/rejected": -2.0173439979553223, + "step": 9050 + }, + { + "epoch": 1.5609924190213644, + "grad_norm": 32.22039794921875, + "learning_rate": 1.1034650103347856e-07, + "logits/chosen": -2.17484712600708, + "logits/rejected": -2.1460723876953125, + "logps/chosen": -193.7638397216797, + "logps/rejected": -224.82772827148438, + "loss": 0.6186, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3822447061538696, + "rewards/margins": 0.34044378995895386, + "rewards/rejected": -1.7226884365081787, + "step": 9060 + }, + { + "epoch": 1.5627153687112336, + "grad_norm": 27.751741409301758, + "learning_rate": 1.1014707182096525e-07, + "logits/chosen": -2.160062313079834, + "logits/rejected": -2.1260383129119873, + "logps/chosen": -174.01168823242188, + "logps/rejected": -227.6369171142578, + "loss": 0.5207, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1757690906524658, + "rewards/margins": 0.5599743127822876, + "rewards/rejected": -1.735743522644043, + "step": 9070 + }, + { + "epoch": 1.5644383184011028, + "grad_norm": 29.899240493774414, + "learning_rate": 1.0994760182327593e-07, + "logits/chosen": -2.1407668590545654, + "logits/rejected": -2.116692304611206, + "logps/chosen": -184.6007080078125, + "logps/rejected": -230.424560546875, + "loss": 0.5897, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.315338134765625, + "rewards/margins": 0.438321053981781, + "rewards/rejected": -1.7536592483520508, + "step": 9080 + }, + { + "epoch": 1.5661612680909718, + "grad_norm": 29.706432342529297, + "learning_rate": 1.0974809184216094e-07, + "logits/chosen": -2.0893330574035645, + "logits/rejected": -2.04660701751709, + "logps/chosen": -212.39706420898438, + "logps/rejected": -258.37762451171875, + "loss": 0.5413, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5627620220184326, + "rewards/margins": 0.5210008025169373, + "rewards/rejected": -2.0837631225585938, + "step": 9090 + }, + { + "epoch": 1.5678842177808407, + "grad_norm": 31.138164520263672, + "learning_rate": 1.0954854267953146e-07, + "logits/chosen": -2.1368656158447266, + "logits/rejected": -2.1022555828094482, + "logps/chosen": -227.8143768310547, + "logps/rejected": -255.43417358398438, + "loss": 0.6453, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.7001497745513916, + "rewards/margins": 0.3254784941673279, + "rewards/rejected": -2.025627851486206, + "step": 9100 + }, + { + "epoch": 1.5696071674707097, + "grad_norm": 31.21167755126953, + "learning_rate": 1.0934895513745603e-07, + "logits/chosen": -2.1031768321990967, + "logits/rejected": -2.066413402557373, + "logps/chosen": -214.42800903320312, + "logps/rejected": -263.53326416015625, + "loss": 0.5665, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5775201320648193, + "rewards/margins": 0.5366379022598267, + "rewards/rejected": -2.1141581535339355, + "step": 9110 + }, + { + "epoch": 1.571330117160579, + "grad_norm": 32.25068283081055, + "learning_rate": 1.0914933001815754e-07, + "logits/chosen": -2.144286870956421, + "logits/rejected": -2.1097588539123535, + "logps/chosen": -217.8202362060547, + "logps/rejected": -258.30316162109375, + "loss": 0.5685, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.560901165008545, + "rewards/margins": 0.47228294610977173, + "rewards/rejected": -2.033184289932251, + "step": 9120 + }, + { + "epoch": 1.573053066850448, + "grad_norm": 32.76934051513672, + "learning_rate": 1.0894966812400992e-07, + "logits/chosen": -2.10341215133667, + "logits/rejected": -2.06626558303833, + "logps/chosen": -213.362548828125, + "logps/rejected": -258.8315734863281, + "loss": 0.5847, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.579810380935669, + "rewards/margins": 0.4671041965484619, + "rewards/rejected": -2.046914577484131, + "step": 9130 + }, + { + "epoch": 1.574776016540317, + "grad_norm": 21.085250854492188, + "learning_rate": 1.0874997025753482e-07, + "logits/chosen": -2.157405376434326, + "logits/rejected": -2.106078624725342, + "logps/chosen": -202.14071655273438, + "logps/rejected": -251.3083038330078, + "loss": 0.5309, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4339019060134888, + "rewards/margins": 0.5942884683609009, + "rewards/rejected": -2.0281903743743896, + "step": 9140 + }, + { + "epoch": 1.576498966230186, + "grad_norm": 29.505142211914062, + "learning_rate": 1.0855023722139864e-07, + "logits/chosen": -2.1527392864227295, + "logits/rejected": -2.1070356369018555, + "logps/chosen": -207.39437866210938, + "logps/rejected": -264.4066467285156, + "loss": 0.5427, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.478251576423645, + "rewards/margins": 0.6295614838600159, + "rewards/rejected": -2.1078131198883057, + "step": 9150 + }, + { + "epoch": 1.578221915920055, + "grad_norm": 29.595401763916016, + "learning_rate": 1.0835046981840896e-07, + "logits/chosen": -2.1166820526123047, + "logits/rejected": -2.0935609340667725, + "logps/chosen": -195.57730102539062, + "logps/rejected": -249.2331085205078, + "loss": 0.5487, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.456304907798767, + "rewards/margins": 0.5181986093521118, + "rewards/rejected": -1.974503517150879, + "step": 9160 + }, + { + "epoch": 1.5799448656099242, + "grad_norm": 29.17534637451172, + "learning_rate": 1.0815066885151165e-07, + "logits/chosen": -2.167940616607666, + "logits/rejected": -2.1178174018859863, + "logps/chosen": -214.2770233154297, + "logps/rejected": -270.4759521484375, + "loss": 0.5349, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5862025022506714, + "rewards/margins": 0.6339669227600098, + "rewards/rejected": -2.2201695442199707, + "step": 9170 + }, + { + "epoch": 1.5816678152997934, + "grad_norm": 25.466968536376953, + "learning_rate": 1.0795083512378738e-07, + "logits/chosen": -2.119816303253174, + "logits/rejected": -2.098113536834717, + "logps/chosen": -221.22421264648438, + "logps/rejected": -262.0200500488281, + "loss": 0.6017, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6510483026504517, + "rewards/margins": 0.4491577744483948, + "rewards/rejected": -2.100205898284912, + "step": 9180 + }, + { + "epoch": 1.5833907649896624, + "grad_norm": 29.182552337646484, + "learning_rate": 1.077509694384485e-07, + "logits/chosen": -2.2061610221862793, + "logits/rejected": -2.1876559257507324, + "logps/chosen": -216.8682403564453, + "logps/rejected": -272.07733154296875, + "loss": 0.5311, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6165103912353516, + "rewards/margins": 0.5453293919563293, + "rewards/rejected": -2.1618399620056152, + "step": 9190 + }, + { + "epoch": 1.5851137146795313, + "grad_norm": 29.112823486328125, + "learning_rate": 1.0755107259883591e-07, + "logits/chosen": -2.1319005489349365, + "logits/rejected": -2.094393491744995, + "logps/chosen": -216.9573974609375, + "logps/rejected": -276.7088317871094, + "loss": 0.523, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6260998249053955, + "rewards/margins": 0.6355565190315247, + "rewards/rejected": -2.2616562843322754, + "step": 9200 + }, + { + "epoch": 1.5851137146795313, + "eval_logits/chosen": -2.1756374835968018, + "eval_logits/rejected": -2.156975507736206, + "eval_logps/chosen": -204.97225952148438, + "eval_logps/rejected": -233.03977966308594, + "eval_loss": 0.6431333422660828, + "eval_rewards/accuracies": 0.6289498209953308, + "eval_rewards/chosen": -1.459567904472351, + "eval_rewards/margins": 0.24333389103412628, + "eval_rewards/rejected": -1.7029017210006714, + "eval_runtime": 383.0245, + "eval_samples_per_second": 11.237, + "eval_steps_per_second": 1.405, + "step": 9200 + }, + { + "epoch": 1.5868366643694003, + "grad_norm": 40.64500427246094, + "learning_rate": 1.0735114540841565e-07, + "logits/chosen": -2.053277015686035, + "logits/rejected": -2.010258913040161, + "logps/chosen": -233.3690948486328, + "logps/rejected": -277.2367248535156, + "loss": 0.5958, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7608321905136108, + "rewards/margins": 0.5039989352226257, + "rewards/rejected": -2.264831066131592, + "step": 9210 + }, + { + "epoch": 1.5885596140592695, + "grad_norm": 44.250858306884766, + "learning_rate": 1.0715118867077575e-07, + "logits/chosen": -2.101931095123291, + "logits/rejected": -2.070049285888672, + "logps/chosen": -214.3753204345703, + "logps/rejected": -270.01031494140625, + "loss": 0.5596, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6313447952270508, + "rewards/margins": 0.5771504640579224, + "rewards/rejected": -2.2084951400756836, + "step": 9220 + }, + { + "epoch": 1.5902825637491387, + "grad_norm": 30.64914894104004, + "learning_rate": 1.0695120318962305e-07, + "logits/chosen": -2.047006607055664, + "logits/rejected": -2.011451482772827, + "logps/chosen": -207.5570068359375, + "logps/rejected": -269.8608703613281, + "loss": 0.5311, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.573559045791626, + "rewards/margins": 0.5993128418922424, + "rewards/rejected": -2.1728720664978027, + "step": 9230 + }, + { + "epoch": 1.5920055134390076, + "grad_norm": 33.54546356201172, + "learning_rate": 1.0675118976877989e-07, + "logits/chosen": -2.0637457370758057, + "logits/rejected": -2.0366601943969727, + "logps/chosen": -238.0423583984375, + "logps/rejected": -293.1909484863281, + "loss": 0.569, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.8054208755493164, + "rewards/margins": 0.5892099738121033, + "rewards/rejected": -2.3946309089660645, + "step": 9240 + }, + { + "epoch": 1.5937284631288766, + "grad_norm": 25.22513198852539, + "learning_rate": 1.0655114921218086e-07, + "logits/chosen": -2.0545201301574707, + "logits/rejected": -2.0165328979492188, + "logps/chosen": -219.8867645263672, + "logps/rejected": -272.6210632324219, + "loss": 0.5682, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.635759711265564, + "rewards/margins": 0.5482776165008545, + "rewards/rejected": -2.184037208557129, + "step": 9250 + }, + { + "epoch": 1.5954514128187456, + "grad_norm": 27.668685913085938, + "learning_rate": 1.0635108232386976e-07, + "logits/chosen": -2.0562832355499268, + "logits/rejected": -2.0279722213745117, + "logps/chosen": -224.8714141845703, + "logps/rejected": -274.3570861816406, + "loss": 0.5967, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7631824016571045, + "rewards/margins": 0.5013735890388489, + "rewards/rejected": -2.264556407928467, + "step": 9260 + }, + { + "epoch": 1.5971743625086148, + "grad_norm": 26.909574508666992, + "learning_rate": 1.0615098990799607e-07, + "logits/chosen": -2.1258835792541504, + "logits/rejected": -2.084963798522949, + "logps/chosen": -222.63735961914062, + "logps/rejected": -271.89166259765625, + "loss": 0.5589, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.664276123046875, + "rewards/margins": 0.5433754920959473, + "rewards/rejected": -2.2076516151428223, + "step": 9270 + }, + { + "epoch": 1.598897312198484, + "grad_norm": 34.04690170288086, + "learning_rate": 1.05950872768812e-07, + "logits/chosen": -2.1449623107910156, + "logits/rejected": -2.1120221614837646, + "logps/chosen": -200.7971649169922, + "logps/rejected": -247.43359375, + "loss": 0.5702, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4581162929534912, + "rewards/margins": 0.4871610105037689, + "rewards/rejected": -1.945277214050293, + "step": 9280 + }, + { + "epoch": 1.600620261888353, + "grad_norm": 26.49469566345215, + "learning_rate": 1.0575073171066906e-07, + "logits/chosen": -2.07810640335083, + "logits/rejected": -2.04874324798584, + "logps/chosen": -191.39736938476562, + "logps/rejected": -225.0348663330078, + "loss": 0.6096, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.35977041721344, + "rewards/margins": 0.3903138041496277, + "rewards/rejected": -1.7500841617584229, + "step": 9290 + }, + { + "epoch": 1.602343211578222, + "grad_norm": 34.1944694519043, + "learning_rate": 1.0555056753801493e-07, + "logits/chosen": -2.1365926265716553, + "logits/rejected": -2.1075198650360107, + "logps/chosen": -190.500732421875, + "logps/rejected": -245.53759765625, + "loss": 0.5479, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3554012775421143, + "rewards/margins": 0.5518912672996521, + "rewards/rejected": -1.9072927236557007, + "step": 9300 + }, + { + "epoch": 1.6040661612680909, + "grad_norm": 26.409666061401367, + "learning_rate": 1.0535038105539014e-07, + "logits/chosen": -2.129552125930786, + "logits/rejected": -2.1025259494781494, + "logps/chosen": -191.71160888671875, + "logps/rejected": -234.30126953125, + "loss": 0.5647, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3693513870239258, + "rewards/margins": 0.47751206159591675, + "rewards/rejected": -1.8468633890151978, + "step": 9310 + }, + { + "epoch": 1.60578911095796, + "grad_norm": 28.054391860961914, + "learning_rate": 1.0515017306742504e-07, + "logits/chosen": -2.186974048614502, + "logits/rejected": -2.148070812225342, + "logps/chosen": -194.74349975585938, + "logps/rejected": -250.1185760498047, + "loss": 0.5573, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4252640008926392, + "rewards/margins": 0.5447186231613159, + "rewards/rejected": -1.9699825048446655, + "step": 9320 + }, + { + "epoch": 1.607512060647829, + "grad_norm": 34.07315444946289, + "learning_rate": 1.0494994437883619e-07, + "logits/chosen": -2.1339361667633057, + "logits/rejected": -2.0924479961395264, + "logps/chosen": -207.88687133789062, + "logps/rejected": -262.8285827636719, + "loss": 0.5524, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4944640398025513, + "rewards/margins": 0.5901376605033875, + "rewards/rejected": -2.084601879119873, + "step": 9330 + }, + { + "epoch": 1.6092350103376982, + "grad_norm": 26.45479393005371, + "learning_rate": 1.0474969579442356e-07, + "logits/chosen": -2.067122459411621, + "logits/rejected": -2.0326459407806396, + "logps/chosen": -214.92770385742188, + "logps/rejected": -278.4351806640625, + "loss": 0.5254, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6262165307998657, + "rewards/margins": 0.654121994972229, + "rewards/rejected": -2.2803382873535156, + "step": 9340 + }, + { + "epoch": 1.6109579600275672, + "grad_norm": 55.737125396728516, + "learning_rate": 1.0454942811906703e-07, + "logits/chosen": -2.0701682567596436, + "logits/rejected": -2.0222160816192627, + "logps/chosen": -223.65353393554688, + "logps/rejected": -282.44549560546875, + "loss": 0.5307, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7031676769256592, + "rewards/margins": 0.6135429739952087, + "rewards/rejected": -2.3167104721069336, + "step": 9350 + }, + { + "epoch": 1.6126809097174362, + "grad_norm": 42.1988639831543, + "learning_rate": 1.0434914215772318e-07, + "logits/chosen": -2.1071135997772217, + "logits/rejected": -2.058732509613037, + "logps/chosen": -254.6279296875, + "logps/rejected": -318.9466552734375, + "loss": 0.5517, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.9617589712142944, + "rewards/margins": 0.7104604244232178, + "rewards/rejected": -2.6722190380096436, + "step": 9360 + }, + { + "epoch": 1.6144038594073054, + "grad_norm": 55.07982635498047, + "learning_rate": 1.0414883871542208e-07, + "logits/chosen": -2.111906051635742, + "logits/rejected": -2.054779529571533, + "logps/chosen": -245.83413696289062, + "logps/rejected": -314.3738098144531, + "loss": 0.5225, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9080016613006592, + "rewards/margins": 0.7261230945587158, + "rewards/rejected": -2.634124994277954, + "step": 9370 + }, + { + "epoch": 1.6161268090971743, + "grad_norm": 30.782182693481445, + "learning_rate": 1.0394851859726408e-07, + "logits/chosen": -2.116983652114868, + "logits/rejected": -2.0886120796203613, + "logps/chosen": -228.97787475585938, + "logps/rejected": -272.6360168457031, + "loss": 0.6418, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.7381610870361328, + "rewards/margins": 0.4510871469974518, + "rewards/rejected": -2.189248561859131, + "step": 9380 + }, + { + "epoch": 1.6178497587870435, + "grad_norm": 28.4465389251709, + "learning_rate": 1.0374818260841663e-07, + "logits/chosen": -2.0388429164886475, + "logits/rejected": -2.00158953666687, + "logps/chosen": -199.9791717529297, + "logps/rejected": -260.9677734375, + "loss": 0.5261, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4609673023223877, + "rewards/margins": 0.6105450391769409, + "rewards/rejected": -2.071512460708618, + "step": 9390 + }, + { + "epoch": 1.6195727084769125, + "grad_norm": 29.523244857788086, + "learning_rate": 1.035478315541108e-07, + "logits/chosen": -2.10453462600708, + "logits/rejected": -2.0754971504211426, + "logps/chosen": -196.71092224121094, + "logps/rejected": -237.4402618408203, + "loss": 0.6086, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.41031813621521, + "rewards/margins": 0.42263931035995483, + "rewards/rejected": -1.8329575061798096, + "step": 9400 + }, + { + "epoch": 1.6212956581667815, + "grad_norm": 40.687496185302734, + "learning_rate": 1.0334746623963843e-07, + "logits/chosen": -2.087872266769409, + "logits/rejected": -2.050535202026367, + "logps/chosen": -208.1427764892578, + "logps/rejected": -262.15130615234375, + "loss": 0.5531, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5345708131790161, + "rewards/margins": 0.5761247277259827, + "rewards/rejected": -2.1106953620910645, + "step": 9410 + }, + { + "epoch": 1.6230186078566504, + "grad_norm": 50.26692581176758, + "learning_rate": 1.031470874703485e-07, + "logits/chosen": -2.109248161315918, + "logits/rejected": -2.0769641399383545, + "logps/chosen": -220.5753936767578, + "logps/rejected": -263.15814208984375, + "loss": 0.5914, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6488628387451172, + "rewards/margins": 0.47126445174217224, + "rewards/rejected": -2.1201274394989014, + "step": 9420 + }, + { + "epoch": 1.6247415575465196, + "grad_norm": 28.470251083374023, + "learning_rate": 1.0294669605164417e-07, + "logits/chosen": -2.0839807987213135, + "logits/rejected": -2.043217182159424, + "logps/chosen": -210.2068328857422, + "logps/rejected": -260.85699462890625, + "loss": 0.5825, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5591367483139038, + "rewards/margins": 0.5711666345596313, + "rewards/rejected": -2.130303144454956, + "step": 9430 + }, + { + "epoch": 1.6264645072363888, + "grad_norm": 39.39202117919922, + "learning_rate": 1.0274629278897941e-07, + "logits/chosen": -2.091012716293335, + "logits/rejected": -2.070128917694092, + "logps/chosen": -195.10609436035156, + "logps/rejected": -238.289306640625, + "loss": 0.5901, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4520760774612427, + "rewards/margins": 0.41639605164527893, + "rewards/rejected": -1.8684720993041992, + "step": 9440 + }, + { + "epoch": 1.6281874569262578, + "grad_norm": 40.777000427246094, + "learning_rate": 1.0254587848785574e-07, + "logits/chosen": -2.208874464035034, + "logits/rejected": -2.1783530712127686, + "logps/chosen": -209.24563598632812, + "logps/rejected": -247.0983123779297, + "loss": 0.6091, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5514070987701416, + "rewards/margins": 0.4226817190647125, + "rewards/rejected": -1.9740889072418213, + "step": 9450 + }, + { + "epoch": 1.6299104066161267, + "grad_norm": 30.348520278930664, + "learning_rate": 1.0234545395381922e-07, + "logits/chosen": -2.1149468421936035, + "logits/rejected": -2.081460475921631, + "logps/chosen": -191.42092895507812, + "logps/rejected": -270.44873046875, + "loss": 0.4516, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.3773084878921509, + "rewards/margins": 0.7978097796440125, + "rewards/rejected": -2.1751179695129395, + "step": 9460 + }, + { + "epoch": 1.6316333563059957, + "grad_norm": 31.95038414001465, + "learning_rate": 1.021450199924568e-07, + "logits/chosen": -1.9736360311508179, + "logits/rejected": -1.9359562397003174, + "logps/chosen": -214.228759765625, + "logps/rejected": -256.5456237792969, + "loss": 0.6069, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6125974655151367, + "rewards/margins": 0.4563944339752197, + "rewards/rejected": -2.0689916610717773, + "step": 9470 + }, + { + "epoch": 1.633356305995865, + "grad_norm": 37.818477630615234, + "learning_rate": 1.0194457740939353e-07, + "logits/chosen": -2.091583490371704, + "logits/rejected": -2.0452983379364014, + "logps/chosen": -221.3782958984375, + "logps/rejected": -272.5511474609375, + "loss": 0.5538, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6660854816436768, + "rewards/margins": 0.5488374829292297, + "rewards/rejected": -2.2149226665496826, + "step": 9480 + }, + { + "epoch": 1.635079255685734, + "grad_norm": 37.699649810791016, + "learning_rate": 1.0174412701028899e-07, + "logits/chosen": -2.0410289764404297, + "logits/rejected": -2.002805233001709, + "logps/chosen": -231.63192749023438, + "logps/rejected": -287.02618408203125, + "loss": 0.5512, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.7601169347763062, + "rewards/margins": 0.5951749682426453, + "rewards/rejected": -2.3552918434143066, + "step": 9490 + }, + { + "epoch": 1.636802205375603, + "grad_norm": 26.676788330078125, + "learning_rate": 1.0154366960083422e-07, + "logits/chosen": -2.0825207233428955, + "logits/rejected": -2.049968719482422, + "logps/chosen": -237.82357788085938, + "logps/rejected": -303.791259765625, + "loss": 0.5249, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8297290802001953, + "rewards/margins": 0.6732373237609863, + "rewards/rejected": -2.5029664039611816, + "step": 9500 + }, + { + "epoch": 1.638525155065472, + "grad_norm": 36.48911666870117, + "learning_rate": 1.0134320598674846e-07, + "logits/chosen": -1.985640287399292, + "logits/rejected": -1.9444854259490967, + "logps/chosen": -244.42471313476562, + "logps/rejected": -323.17071533203125, + "loss": 0.5188, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.922539472579956, + "rewards/margins": 0.7798765301704407, + "rewards/rejected": -2.70241641998291, + "step": 9510 + }, + { + "epoch": 1.640248104755341, + "grad_norm": 28.293848037719727, + "learning_rate": 1.0114273697377583e-07, + "logits/chosen": -2.153301239013672, + "logits/rejected": -2.1260287761688232, + "logps/chosen": -258.92950439453125, + "logps/rejected": -325.4205627441406, + "loss": 0.6037, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.056684970855713, + "rewards/margins": 0.6256363987922668, + "rewards/rejected": -2.682321786880493, + "step": 9520 + }, + { + "epoch": 1.6419710544452102, + "grad_norm": 31.049102783203125, + "learning_rate": 1.0094226336768224e-07, + "logits/chosen": -2.119263172149658, + "logits/rejected": -2.0707802772521973, + "logps/chosen": -223.88095092773438, + "logps/rejected": -281.79473876953125, + "loss": 0.5493, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6941604614257812, + "rewards/margins": 0.6301865577697754, + "rewards/rejected": -2.3243470191955566, + "step": 9530 + }, + { + "epoch": 1.6436940041350794, + "grad_norm": 27.43020248413086, + "learning_rate": 1.0074178597425194e-07, + "logits/chosen": -2.0635290145874023, + "logits/rejected": -2.0217814445495605, + "logps/chosen": -212.50009155273438, + "logps/rejected": -273.19610595703125, + "loss": 0.5317, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5750280618667603, + "rewards/margins": 0.6327553391456604, + "rewards/rejected": -2.2077832221984863, + "step": 9540 + }, + { + "epoch": 1.6454169538249483, + "grad_norm": 24.99987030029297, + "learning_rate": 1.0054130559928451e-07, + "logits/chosen": -2.129505157470703, + "logits/rejected": -2.1067185401916504, + "logps/chosen": -204.6131134033203, + "logps/rejected": -259.88726806640625, + "loss": 0.5647, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.504603624343872, + "rewards/margins": 0.5379934310913086, + "rewards/rejected": -2.0425968170166016, + "step": 9550 + }, + { + "epoch": 1.6471399035148173, + "grad_norm": 40.675254821777344, + "learning_rate": 1.0034082304859144e-07, + "logits/chosen": -2.1753644943237305, + "logits/rejected": -2.1537044048309326, + "logps/chosen": -207.90188598632812, + "logps/rejected": -250.1908416748047, + "loss": 0.5972, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5333625078201294, + "rewards/margins": 0.4467272162437439, + "rewards/rejected": -1.9800899028778076, + "step": 9560 + }, + { + "epoch": 1.6488628532046863, + "grad_norm": 35.79283905029297, + "learning_rate": 1.00140339127993e-07, + "logits/chosen": -2.075749397277832, + "logits/rejected": -2.0463900566101074, + "logps/chosen": -202.37844848632812, + "logps/rejected": -250.78274536132812, + "loss": 0.5997, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5012257099151611, + "rewards/margins": 0.5024011135101318, + "rewards/rejected": -2.003626823425293, + "step": 9570 + }, + { + "epoch": 1.6505858028945555, + "grad_norm": 35.26013946533203, + "learning_rate": 9.9939854643315e-08, + "logits/chosen": -2.158979654312134, + "logits/rejected": -2.122878313064575, + "logps/chosen": -195.38125610351562, + "logps/rejected": -243.55880737304688, + "loss": 0.576, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.401939868927002, + "rewards/margins": 0.46915024518966675, + "rewards/rejected": -1.8710901737213135, + "step": 9580 + }, + { + "epoch": 1.6523087525844247, + "grad_norm": 30.336355209350586, + "learning_rate": 9.973937040038544e-08, + "logits/chosen": -2.2403571605682373, + "logits/rejected": -2.203962802886963, + "logps/chosen": -197.30946350097656, + "logps/rejected": -245.12820434570312, + "loss": 0.564, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.399221420288086, + "rewards/margins": 0.49407368898391724, + "rewards/rejected": -1.8932949304580688, + "step": 9590 + }, + { + "epoch": 1.6540317022742936, + "grad_norm": 45.379878997802734, + "learning_rate": 9.953888720503145e-08, + "logits/chosen": -2.093346118927002, + "logits/rejected": -2.0450000762939453, + "logps/chosen": -209.98733520507812, + "logps/rejected": -269.8240966796875, + "loss": 0.5412, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5394213199615479, + "rewards/margins": 0.6430832147598267, + "rewards/rejected": -2.182504892349243, + "step": 9600 + }, + { + "epoch": 1.6540317022742936, + "eval_logits/chosen": -2.1707704067230225, + "eval_logits/rejected": -2.151291608810425, + "eval_logps/chosen": -201.29856872558594, + "eval_logps/rejected": -231.70973205566406, + "eval_loss": 0.6392949223518372, + "eval_rewards/accuracies": 0.63150554895401, + "eval_rewards/chosen": -1.4228310585021973, + "eval_rewards/margins": 0.26677021384239197, + "eval_rewards/rejected": -1.689601182937622, + "eval_runtime": 383.1623, + "eval_samples_per_second": 11.233, + "eval_steps_per_second": 1.404, + "step": 9600 + }, + { + "epoch": 1.6557546519641626, + "grad_norm": 27.446199417114258, + "learning_rate": 9.933840586307579e-08, + "logits/chosen": -2.0719919204711914, + "logits/rejected": -2.036891460418701, + "logps/chosen": -212.3845672607422, + "logps/rejected": -278.9921875, + "loss": 0.5134, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5430954694747925, + "rewards/margins": 0.7068275809288025, + "rewards/rejected": -2.2499232292175293, + "step": 9610 + }, + { + "epoch": 1.6574776016540316, + "grad_norm": 31.442110061645508, + "learning_rate": 9.913792718033396e-08, + "logits/chosen": -2.150604724884033, + "logits/rejected": -2.118518352508545, + "logps/chosen": -223.6409454345703, + "logps/rejected": -266.01214599609375, + "loss": 0.6095, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6785471439361572, + "rewards/margins": 0.43278011679649353, + "rewards/rejected": -2.1113271713256836, + "step": 9620 + }, + { + "epoch": 1.6592005513439008, + "grad_norm": 61.02971649169922, + "learning_rate": 9.893745196261062e-08, + "logits/chosen": -2.092367649078369, + "logits/rejected": -2.0572047233581543, + "logps/chosen": -236.3819580078125, + "logps/rejected": -284.91693115234375, + "loss": 0.6022, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8070704936981201, + "rewards/margins": 0.5147414207458496, + "rewards/rejected": -2.3218119144439697, + "step": 9630 + }, + { + "epoch": 1.66092350103377, + "grad_norm": 31.099504470825195, + "learning_rate": 9.873698101569657e-08, + "logits/chosen": -2.1342499256134033, + "logits/rejected": -2.097806692123413, + "logps/chosen": -211.4412384033203, + "logps/rejected": -256.29473876953125, + "loss": 0.56, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5404903888702393, + "rewards/margins": 0.5262420773506165, + "rewards/rejected": -2.066732406616211, + "step": 9640 + }, + { + "epoch": 1.662646450723639, + "grad_norm": 44.89314270019531, + "learning_rate": 9.853651514536552e-08, + "logits/chosen": -2.0789711475372314, + "logits/rejected": -2.0402519702911377, + "logps/chosen": -206.84707641601562, + "logps/rejected": -245.1453399658203, + "loss": 0.6071, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5060863494873047, + "rewards/margins": 0.4056490361690521, + "rewards/rejected": -1.9117352962493896, + "step": 9650 + }, + { + "epoch": 1.664369400413508, + "grad_norm": 31.32309341430664, + "learning_rate": 9.833605515737058e-08, + "logits/chosen": -2.0809390544891357, + "logits/rejected": -2.055072069168091, + "logps/chosen": -187.5325469970703, + "logps/rejected": -240.6793670654297, + "loss": 0.5671, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3576128482818604, + "rewards/margins": 0.5182755589485168, + "rewards/rejected": -1.875888466835022, + "step": 9660 + }, + { + "epoch": 1.6660923501033769, + "grad_norm": 33.44990539550781, + "learning_rate": 9.813560185744138e-08, + "logits/chosen": -2.1692049503326416, + "logits/rejected": -2.1284148693084717, + "logps/chosen": -202.41641235351562, + "logps/rejected": -259.0638122558594, + "loss": 0.5392, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.44061279296875, + "rewards/margins": 0.6232479214668274, + "rewards/rejected": -2.0638606548309326, + "step": 9670 + }, + { + "epoch": 1.667815299793246, + "grad_norm": 33.01234436035156, + "learning_rate": 9.79351560512806e-08, + "logits/chosen": -2.1055922508239746, + "logits/rejected": -2.0845696926116943, + "logps/chosen": -209.5716094970703, + "logps/rejected": -242.68685913085938, + "loss": 0.6386, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.539527177810669, + "rewards/margins": 0.371540904045105, + "rewards/rejected": -1.9110679626464844, + "step": 9680 + }, + { + "epoch": 1.6695382494831152, + "grad_norm": 42.91765594482422, + "learning_rate": 9.773471854456087e-08, + "logits/chosen": -2.057504177093506, + "logits/rejected": -2.0265281200408936, + "logps/chosen": -196.57354736328125, + "logps/rejected": -242.1881866455078, + "loss": 0.566, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4017528295516968, + "rewards/margins": 0.4889547824859619, + "rewards/rejected": -1.8907076120376587, + "step": 9690 + }, + { + "epoch": 1.6712611991729842, + "grad_norm": 59.99947738647461, + "learning_rate": 9.753429014292132e-08, + "logits/chosen": -2.0934269428253174, + "logits/rejected": -2.0551304817199707, + "logps/chosen": -189.7425537109375, + "logps/rejected": -230.2326202392578, + "loss": 0.6123, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3643513917922974, + "rewards/margins": 0.42653173208236694, + "rewards/rejected": -1.7908833026885986, + "step": 9700 + }, + { + "epoch": 1.6729841488628532, + "grad_norm": 31.751218795776367, + "learning_rate": 9.73338716519646e-08, + "logits/chosen": -2.0292251110076904, + "logits/rejected": -2.010593891143799, + "logps/chosen": -180.52139282226562, + "logps/rejected": -216.5252227783203, + "loss": 0.6059, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2649705410003662, + "rewards/margins": 0.3905658423900604, + "rewards/rejected": -1.6555362939834595, + "step": 9710 + }, + { + "epoch": 1.6747070985527222, + "grad_norm": 30.22354507446289, + "learning_rate": 9.713346387725355e-08, + "logits/chosen": -2.165675640106201, + "logits/rejected": -2.1414732933044434, + "logps/chosen": -177.21224975585938, + "logps/rejected": -205.3203887939453, + "loss": 0.6241, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2220103740692139, + "rewards/margins": 0.3224605917930603, + "rewards/rejected": -1.5444709062576294, + "step": 9720 + }, + { + "epoch": 1.6764300482425913, + "grad_norm": 29.720321655273438, + "learning_rate": 9.693306762430782e-08, + "logits/chosen": -2.1688475608825684, + "logits/rejected": -2.1419014930725098, + "logps/chosen": -172.5311279296875, + "logps/rejected": -221.6881561279297, + "loss": 0.5561, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1749764680862427, + "rewards/margins": 0.504775881767273, + "rewards/rejected": -1.6797523498535156, + "step": 9730 + }, + { + "epoch": 1.6781529979324605, + "grad_norm": 26.8512020111084, + "learning_rate": 9.673268369860086e-08, + "logits/chosen": -2.133647918701172, + "logits/rejected": -2.107252359390259, + "logps/chosen": -191.29776000976562, + "logps/rejected": -230.453369140625, + "loss": 0.5964, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.356992244720459, + "rewards/margins": 0.4239605963230133, + "rewards/rejected": -1.780953049659729, + "step": 9740 + }, + { + "epoch": 1.6798759476223295, + "grad_norm": 24.874778747558594, + "learning_rate": 9.653231290555647e-08, + "logits/chosen": -2.1468117237091064, + "logits/rejected": -2.0880696773529053, + "logps/chosen": -199.73204040527344, + "logps/rejected": -246.09619140625, + "loss": 0.5717, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.437829613685608, + "rewards/margins": 0.5362062454223633, + "rewards/rejected": -1.9740358591079712, + "step": 9750 + }, + { + "epoch": 1.6815988973121985, + "grad_norm": 23.847570419311523, + "learning_rate": 9.633195605054573e-08, + "logits/chosen": -2.125284194946289, + "logits/rejected": -2.079078197479248, + "logps/chosen": -195.61141967773438, + "logps/rejected": -247.21658325195312, + "loss": 0.5554, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4141006469726562, + "rewards/margins": 0.5550005435943604, + "rewards/rejected": -1.9691009521484375, + "step": 9760 + }, + { + "epoch": 1.6833218470020674, + "grad_norm": 37.763572692871094, + "learning_rate": 9.613161393888372e-08, + "logits/chosen": -2.0627262592315674, + "logits/rejected": -2.0276284217834473, + "logps/chosen": -195.24488830566406, + "logps/rejected": -244.3018341064453, + "loss": 0.5826, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4011054039001465, + "rewards/margins": 0.5025665163993835, + "rewards/rejected": -1.9036719799041748, + "step": 9770 + }, + { + "epoch": 1.6850447966919366, + "grad_norm": 24.46339225769043, + "learning_rate": 9.593128737582623e-08, + "logits/chosen": -2.1057629585266113, + "logits/rejected": -2.049747943878174, + "logps/chosen": -201.80274963378906, + "logps/rejected": -250.95144653320312, + "loss": 0.5648, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4022512435913086, + "rewards/margins": 0.5962937474250793, + "rewards/rejected": -1.9985450506210327, + "step": 9780 + }, + { + "epoch": 1.6867677463818056, + "grad_norm": 28.699012756347656, + "learning_rate": 9.57309771665665e-08, + "logits/chosen": -2.119412899017334, + "logits/rejected": -2.0996031761169434, + "logps/chosen": -199.32797241210938, + "logps/rejected": -255.63925170898438, + "loss": 0.582, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4494515657424927, + "rewards/margins": 0.5416026711463928, + "rewards/rejected": -1.9910542964935303, + "step": 9790 + }, + { + "epoch": 1.6884906960716748, + "grad_norm": 47.394649505615234, + "learning_rate": 9.553068411623211e-08, + "logits/chosen": -2.157869815826416, + "logits/rejected": -2.1125385761260986, + "logps/chosen": -199.79861450195312, + "logps/rejected": -255.11672973632812, + "loss": 0.562, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.412087321281433, + "rewards/margins": 0.6073322296142578, + "rewards/rejected": -2.0194194316864014, + "step": 9800 + }, + { + "epoch": 1.6902136457615438, + "grad_norm": 30.579559326171875, + "learning_rate": 9.533040902988164e-08, + "logits/chosen": -2.1238341331481934, + "logits/rejected": -2.079685688018799, + "logps/chosen": -200.06643676757812, + "logps/rejected": -249.8910675048828, + "loss": 0.5945, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4329220056533813, + "rewards/margins": 0.5416523218154907, + "rewards/rejected": -1.9745744466781616, + "step": 9810 + }, + { + "epoch": 1.6919365954514127, + "grad_norm": 33.1264533996582, + "learning_rate": 9.51301527125015e-08, + "logits/chosen": -2.131040573120117, + "logits/rejected": -2.09142804145813, + "logps/chosen": -185.69674682617188, + "logps/rejected": -241.4114227294922, + "loss": 0.5386, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.306038498878479, + "rewards/margins": 0.5765220522880554, + "rewards/rejected": -1.8825607299804688, + "step": 9820 + }, + { + "epoch": 1.693659545141282, + "grad_norm": 35.00567626953125, + "learning_rate": 9.492991596900265e-08, + "logits/chosen": -2.1659820079803467, + "logits/rejected": -2.143907070159912, + "logps/chosen": -197.84617614746094, + "logps/rejected": -236.71890258789062, + "loss": 0.6339, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.385030746459961, + "rewards/margins": 0.4190130829811096, + "rewards/rejected": -1.8040437698364258, + "step": 9830 + }, + { + "epoch": 1.6953824948311509, + "grad_norm": 31.27936363220215, + "learning_rate": 9.47296996042173e-08, + "logits/chosen": -2.1307952404022217, + "logits/rejected": -2.0879383087158203, + "logps/chosen": -185.07919311523438, + "logps/rejected": -237.0386962890625, + "loss": 0.5739, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2881401777267456, + "rewards/margins": 0.5533360242843628, + "rewards/rejected": -1.8414760828018188, + "step": 9840 + }, + { + "epoch": 1.69710544452102, + "grad_norm": 29.849496841430664, + "learning_rate": 9.452950442289582e-08, + "logits/chosen": -2.148524284362793, + "logits/rejected": -2.125277042388916, + "logps/chosen": -176.24105834960938, + "logps/rejected": -219.44589233398438, + "loss": 0.5566, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1986567974090576, + "rewards/margins": 0.4844595789909363, + "rewards/rejected": -1.6831163167953491, + "step": 9850 + }, + { + "epoch": 1.698828394210889, + "grad_norm": 28.343416213989258, + "learning_rate": 9.432933122970347e-08, + "logits/chosen": -2.17924165725708, + "logits/rejected": -2.159785509109497, + "logps/chosen": -195.09347534179688, + "logps/rejected": -228.8980712890625, + "loss": 0.6072, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3785400390625, + "rewards/margins": 0.3773375153541565, + "rewards/rejected": -1.7558774948120117, + "step": 9860 + }, + { + "epoch": 1.700551343900758, + "grad_norm": 30.498952865600586, + "learning_rate": 9.412918082921706e-08, + "logits/chosen": -2.1416916847229004, + "logits/rejected": -2.1032252311706543, + "logps/chosen": -188.11207580566406, + "logps/rejected": -235.4365997314453, + "loss": 0.5707, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3446654081344604, + "rewards/margins": 0.5133527517318726, + "rewards/rejected": -1.858018159866333, + "step": 9870 + }, + { + "epoch": 1.7022742935906272, + "grad_norm": 26.755144119262695, + "learning_rate": 9.39290540259218e-08, + "logits/chosen": -2.0809402465820312, + "logits/rejected": -2.058351993560791, + "logps/chosen": -189.55882263183594, + "logps/rejected": -243.50753784179688, + "loss": 0.5537, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3670248985290527, + "rewards/margins": 0.5220685005187988, + "rewards/rejected": -1.8890936374664307, + "step": 9880 + }, + { + "epoch": 1.7039972432804962, + "grad_norm": 26.49530029296875, + "learning_rate": 9.372895162420808e-08, + "logits/chosen": -2.071610689163208, + "logits/rejected": -2.044877052307129, + "logps/chosen": -198.6932373046875, + "logps/rejected": -251.8302001953125, + "loss": 0.5383, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.408941388130188, + "rewards/margins": 0.580849289894104, + "rewards/rejected": -1.9897905588150024, + "step": 9890 + }, + { + "epoch": 1.7057201929703654, + "grad_norm": 33.74192428588867, + "learning_rate": 9.352887442836816e-08, + "logits/chosen": -2.1049561500549316, + "logits/rejected": -2.0604281425476074, + "logps/chosen": -195.46896362304688, + "logps/rejected": -265.776123046875, + "loss": 0.4979, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4245421886444092, + "rewards/margins": 0.7445545792579651, + "rewards/rejected": -2.1690969467163086, + "step": 9900 + }, + { + "epoch": 1.7074431426602343, + "grad_norm": 26.481822967529297, + "learning_rate": 9.332882324259306e-08, + "logits/chosen": -2.1466808319091797, + "logits/rejected": -2.0927011966705322, + "logps/chosen": -213.91171264648438, + "logps/rejected": -260.396728515625, + "loss": 0.5752, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5574681758880615, + "rewards/margins": 0.5506702065467834, + "rewards/rejected": -2.1081383228302, + "step": 9910 + }, + { + "epoch": 1.7091660923501033, + "grad_norm": 35.15553283691406, + "learning_rate": 9.312879887096923e-08, + "logits/chosen": -2.167757272720337, + "logits/rejected": -2.1290271282196045, + "logps/chosen": -212.5194091796875, + "logps/rejected": -266.25323486328125, + "loss": 0.5405, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5846225023269653, + "rewards/margins": 0.5880954265594482, + "rewards/rejected": -2.172717809677124, + "step": 9920 + }, + { + "epoch": 1.7108890420399723, + "grad_norm": 38.19834518432617, + "learning_rate": 9.292880211747528e-08, + "logits/chosen": -2.097388744354248, + "logits/rejected": -2.0738513469696045, + "logps/chosen": -213.891357421875, + "logps/rejected": -275.8308410644531, + "loss": 0.555, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6055524349212646, + "rewards/margins": 0.6078485250473022, + "rewards/rejected": -2.2134010791778564, + "step": 9930 + }, + { + "epoch": 1.7126119917298415, + "grad_norm": 24.613040924072266, + "learning_rate": 9.27288337859789e-08, + "logits/chosen": -2.1469640731811523, + "logits/rejected": -2.1228179931640625, + "logps/chosen": -225.4186553955078, + "logps/rejected": -278.17218017578125, + "loss": 0.59, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6956018209457397, + "rewards/margins": 0.48676204681396484, + "rewards/rejected": -2.182363986968994, + "step": 9940 + }, + { + "epoch": 1.7143349414197107, + "grad_norm": 49.375003814697266, + "learning_rate": 9.252889468023348e-08, + "logits/chosen": -2.115926504135132, + "logits/rejected": -2.065687894821167, + "logps/chosen": -225.0879364013672, + "logps/rejected": -289.8050537109375, + "loss": 0.5322, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7030729055404663, + "rewards/margins": 0.6950501203536987, + "rewards/rejected": -2.398123264312744, + "step": 9950 + }, + { + "epoch": 1.7160578911095796, + "grad_norm": 36.60076904296875, + "learning_rate": 9.232898560387503e-08, + "logits/chosen": -2.1651546955108643, + "logits/rejected": -2.1324679851531982, + "logps/chosen": -222.7748260498047, + "logps/rejected": -272.51092529296875, + "loss": 0.589, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6857795715332031, + "rewards/margins": 0.4993601441383362, + "rewards/rejected": -2.1851398944854736, + "step": 9960 + }, + { + "epoch": 1.7177808407994486, + "grad_norm": 37.716888427734375, + "learning_rate": 9.212910736041868e-08, + "logits/chosen": -2.1696953773498535, + "logits/rejected": -2.1427950859069824, + "logps/chosen": -206.0946502685547, + "logps/rejected": -253.1869354248047, + "loss": 0.5856, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.536563754081726, + "rewards/margins": 0.4869001507759094, + "rewards/rejected": -2.0234639644622803, + "step": 9970 + }, + { + "epoch": 1.7195037904893176, + "grad_norm": 31.95079803466797, + "learning_rate": 9.19292607532558e-08, + "logits/chosen": -2.0850770473480225, + "logits/rejected": -2.0535874366760254, + "logps/chosen": -207.0938262939453, + "logps/rejected": -266.87335205078125, + "loss": 0.5404, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5402209758758545, + "rewards/margins": 0.6110003590583801, + "rewards/rejected": -2.151221513748169, + "step": 9980 + }, + { + "epoch": 1.7212267401791868, + "grad_norm": 48.57706832885742, + "learning_rate": 9.172944658565057e-08, + "logits/chosen": -2.1648459434509277, + "logits/rejected": -2.1238019466400146, + "logps/chosen": -201.11058044433594, + "logps/rejected": -244.0185089111328, + "loss": 0.5645, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4623303413391113, + "rewards/margins": 0.49238890409469604, + "rewards/rejected": -1.9547193050384521, + "step": 9990 + }, + { + "epoch": 1.722949689869056, + "grad_norm": 28.529874801635742, + "learning_rate": 9.15296656607367e-08, + "logits/chosen": -2.1567516326904297, + "logits/rejected": -2.124053478240967, + "logps/chosen": -200.27005004882812, + "logps/rejected": -253.435791015625, + "loss": 0.5368, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4336096048355103, + "rewards/margins": 0.5446447134017944, + "rewards/rejected": -1.9782540798187256, + "step": 10000 + }, + { + "epoch": 1.722949689869056, + "eval_logits/chosen": -2.191511869430542, + "eval_logits/rejected": -2.17295241355896, + "eval_logps/chosen": -192.59469604492188, + "eval_logps/rejected": -221.33299255371094, + "eval_loss": 0.6408481597900391, + "eval_rewards/accuracies": 0.6236059665679932, + "eval_rewards/chosen": -1.335792064666748, + "eval_rewards/margins": 0.2500417232513428, + "eval_rewards/rejected": -1.5858336687088013, + "eval_runtime": 382.9314, + "eval_samples_per_second": 11.24, + "eval_steps_per_second": 1.405, + "step": 10000 + }, + { + "epoch": 1.724672639558925, + "grad_norm": 28.102582931518555, + "learning_rate": 9.132991878151444e-08, + "logits/chosen": -2.148301362991333, + "logits/rejected": -2.11006760597229, + "logps/chosen": -207.0460662841797, + "logps/rejected": -269.1331787109375, + "loss": 0.5292, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.54012930393219, + "rewards/margins": 0.6273112297058105, + "rewards/rejected": -2.167440414428711, + "step": 10010 + }, + { + "epoch": 1.7263955892487939, + "grad_norm": 38.713417053222656, + "learning_rate": 9.113020675084693e-08, + "logits/chosen": -2.0745723247528076, + "logits/rejected": -2.028005599975586, + "logps/chosen": -222.94821166992188, + "logps/rejected": -279.4356384277344, + "loss": 0.5582, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6800209283828735, + "rewards/margins": 0.6077736020088196, + "rewards/rejected": -2.287794351577759, + "step": 10020 + }, + { + "epoch": 1.7281185389386629, + "grad_norm": 46.993404388427734, + "learning_rate": 9.093053037145756e-08, + "logits/chosen": -2.0698273181915283, + "logits/rejected": -2.0276541709899902, + "logps/chosen": -232.9654998779297, + "logps/rejected": -272.5043029785156, + "loss": 0.5962, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7468217611312866, + "rewards/margins": 0.464282751083374, + "rewards/rejected": -2.211104393005371, + "step": 10030 + }, + { + "epoch": 1.729841488628532, + "grad_norm": 24.55193519592285, + "learning_rate": 9.073089044592619e-08, + "logits/chosen": -2.2040719985961914, + "logits/rejected": -2.1704514026641846, + "logps/chosen": -221.79983520507812, + "logps/rejected": -280.9211730957031, + "loss": 0.5402, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6553106307983398, + "rewards/margins": 0.6160081624984741, + "rewards/rejected": -2.2713184356689453, + "step": 10040 + }, + { + "epoch": 1.7315644383184012, + "grad_norm": 35.085670471191406, + "learning_rate": 9.053128777668629e-08, + "logits/chosen": -2.1237728595733643, + "logits/rejected": -2.109290361404419, + "logps/chosen": -211.3069305419922, + "logps/rejected": -255.8780975341797, + "loss": 0.5977, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5639381408691406, + "rewards/margins": 0.44324740767478943, + "rewards/rejected": -2.007185697555542, + "step": 10050 + }, + { + "epoch": 1.7332873880082702, + "grad_norm": 41.619598388671875, + "learning_rate": 9.033172316602148e-08, + "logits/chosen": -2.1192288398742676, + "logits/rejected": -2.0995373725891113, + "logps/chosen": -197.01919555664062, + "logps/rejected": -245.8788299560547, + "loss": 0.5834, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4734445810317993, + "rewards/margins": 0.4632526934146881, + "rewards/rejected": -1.936697006225586, + "step": 10060 + }, + { + "epoch": 1.7350103376981392, + "grad_norm": 26.65183448791504, + "learning_rate": 9.013219741606244e-08, + "logits/chosen": -2.086177110671997, + "logits/rejected": -2.0609517097473145, + "logps/chosen": -180.7833709716797, + "logps/rejected": -239.9923858642578, + "loss": 0.5384, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2739650011062622, + "rewards/margins": 0.6015530824661255, + "rewards/rejected": -1.8755180835723877, + "step": 10070 + }, + { + "epoch": 1.7367332873880081, + "grad_norm": 41.86779022216797, + "learning_rate": 8.993271132878371e-08, + "logits/chosen": -2.152545690536499, + "logits/rejected": -2.116054058074951, + "logps/chosen": -191.3165283203125, + "logps/rejected": -240.53225708007812, + "loss": 0.5592, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3363940715789795, + "rewards/margins": 0.5229128003120422, + "rewards/rejected": -1.8593066930770874, + "step": 10080 + }, + { + "epoch": 1.7384562370778773, + "grad_norm": 36.689613342285156, + "learning_rate": 8.973326570600038e-08, + "logits/chosen": -2.187739849090576, + "logits/rejected": -2.1426002979278564, + "logps/chosen": -198.7881317138672, + "logps/rejected": -266.9776306152344, + "loss": 0.5284, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4309791326522827, + "rewards/margins": 0.6683201789855957, + "rewards/rejected": -2.099299192428589, + "step": 10090 + }, + { + "epoch": 1.7401791867677465, + "grad_norm": 51.00032424926758, + "learning_rate": 8.953386134936489e-08, + "logits/chosen": -2.1044974327087402, + "logits/rejected": -2.0780110359191895, + "logps/chosen": -231.7470245361328, + "logps/rejected": -280.9796447753906, + "loss": 0.5958, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.752642273902893, + "rewards/margins": 0.5219553709030151, + "rewards/rejected": -2.2745978832244873, + "step": 10100 + }, + { + "epoch": 1.7419021364576155, + "grad_norm": 37.555301666259766, + "learning_rate": 8.933449906036373e-08, + "logits/chosen": -2.141427516937256, + "logits/rejected": -2.114920139312744, + "logps/chosen": -227.4268035888672, + "logps/rejected": -294.4813537597656, + "loss": 0.5533, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7672802209854126, + "rewards/margins": 0.64067542552948, + "rewards/rejected": -2.4079554080963135, + "step": 10110 + }, + { + "epoch": 1.7436250861474845, + "grad_norm": 34.8802490234375, + "learning_rate": 8.913517964031447e-08, + "logits/chosen": -2.1124048233032227, + "logits/rejected": -2.0721170902252197, + "logps/chosen": -227.3213348388672, + "logps/rejected": -283.9159240722656, + "loss": 0.5579, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7284618616104126, + "rewards/margins": 0.6203962564468384, + "rewards/rejected": -2.348858118057251, + "step": 10120 + }, + { + "epoch": 1.7453480358373534, + "grad_norm": 28.73194694519043, + "learning_rate": 8.893590389036226e-08, + "logits/chosen": -2.1395440101623535, + "logits/rejected": -2.1035239696502686, + "logps/chosen": -214.451904296875, + "logps/rejected": -280.0576171875, + "loss": 0.5343, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5973799228668213, + "rewards/margins": 0.6699446439743042, + "rewards/rejected": -2.267324924468994, + "step": 10130 + }, + { + "epoch": 1.7470709855272226, + "grad_norm": 31.8477840423584, + "learning_rate": 8.873667261147673e-08, + "logits/chosen": -2.1350624561309814, + "logits/rejected": -2.0828769207000732, + "logps/chosen": -222.15798950195312, + "logps/rejected": -273.2109680175781, + "loss": 0.5767, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6050676107406616, + "rewards/margins": 0.5911908745765686, + "rewards/rejected": -2.196258544921875, + "step": 10140 + }, + { + "epoch": 1.7487939352170918, + "grad_norm": 26.414133071899414, + "learning_rate": 8.853748660444881e-08, + "logits/chosen": -2.127324342727661, + "logits/rejected": -2.086487293243408, + "logps/chosen": -187.64529418945312, + "logps/rejected": -249.9339141845703, + "loss": 0.5472, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3201484680175781, + "rewards/margins": 0.6274799108505249, + "rewards/rejected": -1.947628378868103, + "step": 10150 + }, + { + "epoch": 1.7505168849069608, + "grad_norm": 29.215452194213867, + "learning_rate": 8.833834666988738e-08, + "logits/chosen": -2.141927719116211, + "logits/rejected": -2.097440481185913, + "logps/chosen": -179.4386444091797, + "logps/rejected": -246.930419921875, + "loss": 0.4974, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2599875926971436, + "rewards/margins": 0.6941196322441101, + "rewards/rejected": -1.9541072845458984, + "step": 10160 + }, + { + "epoch": 1.7522398345968297, + "grad_norm": 35.651058197021484, + "learning_rate": 8.813925360821624e-08, + "logits/chosen": -2.0970935821533203, + "logits/rejected": -2.0645716190338135, + "logps/chosen": -207.51675415039062, + "logps/rejected": -269.4786071777344, + "loss": 0.5383, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5255186557769775, + "rewards/margins": 0.6351886987686157, + "rewards/rejected": -2.1607069969177246, + "step": 10170 + }, + { + "epoch": 1.7539627842866987, + "grad_norm": 24.785844802856445, + "learning_rate": 8.794020821967075e-08, + "logits/chosen": -2.038064479827881, + "logits/rejected": -1.9947633743286133, + "logps/chosen": -225.9365234375, + "logps/rejected": -299.15802001953125, + "loss": 0.5266, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.729060173034668, + "rewards/margins": 0.7350554466247559, + "rewards/rejected": -2.464115619659424, + "step": 10180 + }, + { + "epoch": 1.755685733976568, + "grad_norm": 44.277008056640625, + "learning_rate": 8.774121130429464e-08, + "logits/chosen": -2.0331788063049316, + "logits/rejected": -1.9945003986358643, + "logps/chosen": -229.5591583251953, + "logps/rejected": -300.8504333496094, + "loss": 0.537, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.771598219871521, + "rewards/margins": 0.7276459336280823, + "rewards/rejected": -2.499244213104248, + "step": 10190 + }, + { + "epoch": 1.757408683666437, + "grad_norm": 30.182825088500977, + "learning_rate": 8.754226366193677e-08, + "logits/chosen": -2.0969910621643066, + "logits/rejected": -2.0572707653045654, + "logps/chosen": -235.1160430908203, + "logps/rejected": -300.4604187011719, + "loss": 0.5356, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.8176597356796265, + "rewards/margins": 0.687166690826416, + "rewards/rejected": -2.504826545715332, + "step": 10200 + }, + { + "epoch": 1.759131633356306, + "grad_norm": 54.42823028564453, + "learning_rate": 8.734336609224798e-08, + "logits/chosen": -2.084446430206299, + "logits/rejected": -2.0502068996429443, + "logps/chosen": -246.3536834716797, + "logps/rejected": -325.4996643066406, + "loss": 0.5233, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.906688928604126, + "rewards/margins": 0.7914060354232788, + "rewards/rejected": -2.6980948448181152, + "step": 10210 + }, + { + "epoch": 1.760854583046175, + "grad_norm": 31.210098266601562, + "learning_rate": 8.714451939467793e-08, + "logits/chosen": -2.058612585067749, + "logits/rejected": -2.026947021484375, + "logps/chosen": -226.6744384765625, + "logps/rejected": -285.47247314453125, + "loss": 0.5576, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7720890045166016, + "rewards/margins": 0.5804320573806763, + "rewards/rejected": -2.3525211811065674, + "step": 10220 + }, + { + "epoch": 1.762577532736044, + "grad_norm": 36.709999084472656, + "learning_rate": 8.69457243684717e-08, + "logits/chosen": -2.0432868003845215, + "logits/rejected": -2.003864288330078, + "logps/chosen": -226.4574737548828, + "logps/rejected": -276.8308410644531, + "loss": 0.5898, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.7202298641204834, + "rewards/margins": 0.5365725755691528, + "rewards/rejected": -2.2568023204803467, + "step": 10230 + }, + { + "epoch": 1.7643004824259132, + "grad_norm": 31.065011978149414, + "learning_rate": 8.67469818126667e-08, + "logits/chosen": -2.069124460220337, + "logits/rejected": -2.023430109024048, + "logps/chosen": -216.22946166992188, + "logps/rejected": -298.06903076171875, + "loss": 0.4958, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.641461968421936, + "rewards/margins": 0.8219528198242188, + "rewards/rejected": -2.4634146690368652, + "step": 10240 + }, + { + "epoch": 1.7660234321157822, + "grad_norm": 45.54820251464844, + "learning_rate": 8.654829252608947e-08, + "logits/chosen": -2.118251085281372, + "logits/rejected": -2.0692851543426514, + "logps/chosen": -220.3714141845703, + "logps/rejected": -275.654296875, + "loss": 0.5211, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.627004623413086, + "rewards/margins": 0.6395336389541626, + "rewards/rejected": -2.266538619995117, + "step": 10250 + }, + { + "epoch": 1.7677463818056514, + "grad_norm": 44.912437438964844, + "learning_rate": 8.634965730735238e-08, + "logits/chosen": -2.0857231616973877, + "logits/rejected": -2.0690531730651855, + "logps/chosen": -224.17861938476562, + "logps/rejected": -287.48419189453125, + "loss": 0.5445, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7005201578140259, + "rewards/margins": 0.6115856170654297, + "rewards/rejected": -2.312105655670166, + "step": 10260 + }, + { + "epoch": 1.7694693314955203, + "grad_norm": 30.748966217041016, + "learning_rate": 8.615107695485059e-08, + "logits/chosen": -2.0707106590270996, + "logits/rejected": -2.039233684539795, + "logps/chosen": -220.36544799804688, + "logps/rejected": -282.8392639160156, + "loss": 0.5349, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.639102578163147, + "rewards/margins": 0.6370723247528076, + "rewards/rejected": -2.276174783706665, + "step": 10270 + }, + { + "epoch": 1.7711922811853893, + "grad_norm": 40.262969970703125, + "learning_rate": 8.595255226675867e-08, + "logits/chosen": -2.0837535858154297, + "logits/rejected": -2.056248426437378, + "logps/chosen": -224.76107788085938, + "logps/rejected": -264.0224609375, + "loss": 0.6194, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6971912384033203, + "rewards/margins": 0.42010498046875, + "rewards/rejected": -2.1172964572906494, + "step": 10280 + }, + { + "epoch": 1.7729152308752585, + "grad_norm": 33.58854675292969, + "learning_rate": 8.575408404102739e-08, + "logits/chosen": -2.0661511421203613, + "logits/rejected": -2.0259451866149902, + "logps/chosen": -199.9491424560547, + "logps/rejected": -268.76312255859375, + "loss": 0.5468, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4780738353729248, + "rewards/margins": 0.6608957052230835, + "rewards/rejected": -2.1389694213867188, + "step": 10290 + }, + { + "epoch": 1.7746381805651275, + "grad_norm": 53.497398376464844, + "learning_rate": 8.555567307538067e-08, + "logits/chosen": -2.1200575828552246, + "logits/rejected": -2.092484951019287, + "logps/chosen": -225.9880828857422, + "logps/rejected": -271.4896545410156, + "loss": 0.6079, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.6974165439605713, + "rewards/margins": 0.4778365194797516, + "rewards/rejected": -2.17525315284729, + "step": 10300 + }, + { + "epoch": 1.7763611302549966, + "grad_norm": 30.578693389892578, + "learning_rate": 8.53573201673122e-08, + "logits/chosen": -2.051030158996582, + "logits/rejected": -2.013582706451416, + "logps/chosen": -221.84011840820312, + "logps/rejected": -282.8453369140625, + "loss": 0.5257, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.641297698020935, + "rewards/margins": 0.686009407043457, + "rewards/rejected": -2.3273074626922607, + "step": 10310 + }, + { + "epoch": 1.7780840799448656, + "grad_norm": 39.207130432128906, + "learning_rate": 8.515902611408245e-08, + "logits/chosen": -2.1029365062713623, + "logits/rejected": -2.062652111053467, + "logps/chosen": -215.40438842773438, + "logps/rejected": -267.7357177734375, + "loss": 0.584, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5729243755340576, + "rewards/margins": 0.5734225511550903, + "rewards/rejected": -2.1463465690612793, + "step": 10320 + }, + { + "epoch": 1.7798070296347346, + "grad_norm": 35.41927719116211, + "learning_rate": 8.496079171271512e-08, + "logits/chosen": -2.1222665309906006, + "logits/rejected": -2.08558988571167, + "logps/chosen": -196.37164306640625, + "logps/rejected": -254.0879669189453, + "loss": 0.5371, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4327995777130127, + "rewards/margins": 0.5754380822181702, + "rewards/rejected": -2.008237600326538, + "step": 10330 + }, + { + "epoch": 1.7815299793246038, + "grad_norm": 27.103147506713867, + "learning_rate": 8.476261775999432e-08, + "logits/chosen": -2.156501293182373, + "logits/rejected": -2.117757558822632, + "logps/chosen": -194.71116638183594, + "logps/rejected": -252.72793579101562, + "loss": 0.5415, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3771531581878662, + "rewards/margins": 0.6131818890571594, + "rewards/rejected": -1.9903347492218018, + "step": 10340 + }, + { + "epoch": 1.7832529290144727, + "grad_norm": 41.56328582763672, + "learning_rate": 8.45645050524611e-08, + "logits/chosen": -2.1711769104003906, + "logits/rejected": -2.1348977088928223, + "logps/chosen": -204.80001831054688, + "logps/rejected": -252.4854736328125, + "loss": 0.5598, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4941831827163696, + "rewards/margins": 0.5315493941307068, + "rewards/rejected": -2.0257325172424316, + "step": 10350 + }, + { + "epoch": 1.784975878704342, + "grad_norm": 36.49805450439453, + "learning_rate": 8.436645438641038e-08, + "logits/chosen": -2.102998971939087, + "logits/rejected": -2.0662338733673096, + "logps/chosen": -203.11863708496094, + "logps/rejected": -242.20803833007812, + "loss": 0.5992, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.461118459701538, + "rewards/margins": 0.4449211061000824, + "rewards/rejected": -1.9060395956039429, + "step": 10360 + }, + { + "epoch": 1.786698828394211, + "grad_norm": 40.32637023925781, + "learning_rate": 8.416846655788774e-08, + "logits/chosen": -1.995539903640747, + "logits/rejected": -1.9595807790756226, + "logps/chosen": -184.28761291503906, + "logps/rejected": -232.712890625, + "loss": 0.5566, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.307677984237671, + "rewards/margins": 0.500927209854126, + "rewards/rejected": -1.8086051940917969, + "step": 10370 + }, + { + "epoch": 1.7884217780840799, + "grad_norm": 39.63434982299805, + "learning_rate": 8.397054236268611e-08, + "logits/chosen": -2.1119132041931152, + "logits/rejected": -2.092855215072632, + "logps/chosen": -197.7986602783203, + "logps/rejected": -221.83645629882812, + "loss": 0.6914, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4502270221710205, + "rewards/margins": 0.2483011782169342, + "rewards/rejected": -1.6985282897949219, + "step": 10380 + }, + { + "epoch": 1.7901447277739488, + "grad_norm": 23.693660736083984, + "learning_rate": 8.37726825963427e-08, + "logits/chosen": -2.164780378341675, + "logits/rejected": -2.127293825149536, + "logps/chosen": -174.3961639404297, + "logps/rejected": -218.28988647460938, + "loss": 0.5728, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1731741428375244, + "rewards/margins": 0.47819751501083374, + "rewards/rejected": -1.6513715982437134, + "step": 10390 + }, + { + "epoch": 1.791867677463818, + "grad_norm": 28.680999755859375, + "learning_rate": 8.357488805413576e-08, + "logits/chosen": -2.198361873626709, + "logits/rejected": -2.1695852279663086, + "logps/chosen": -170.17636108398438, + "logps/rejected": -233.5998077392578, + "loss": 0.5064, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1772061586380005, + "rewards/margins": 0.6171708106994629, + "rewards/rejected": -1.794377088546753, + "step": 10400 + }, + { + "epoch": 1.791867677463818, + "eval_logits/chosen": -2.2306599617004395, + "eval_logits/rejected": -2.215008497238159, + "eval_logps/chosen": -165.2631072998047, + "eval_logps/rejected": -188.94882202148438, + "eval_loss": 0.6423309445381165, + "eval_rewards/accuracies": 0.6215148568153381, + "eval_rewards/chosen": -1.062476396560669, + "eval_rewards/margins": 0.1995159536600113, + "eval_rewards/rejected": -1.2619922161102295, + "eval_runtime": 383.4393, + "eval_samples_per_second": 11.225, + "eval_steps_per_second": 1.403, + "step": 10400 + }, + { + "epoch": 1.7935906271536872, + "grad_norm": 32.13147735595703, + "learning_rate": 8.337715953108133e-08, + "logits/chosen": -2.1234824657440186, + "logits/rejected": -2.08290958404541, + "logps/chosen": -185.5501251220703, + "logps/rejected": -223.6185760498047, + "loss": 0.5874, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.280967354774475, + "rewards/margins": 0.43095582723617554, + "rewards/rejected": -1.7119232416152954, + "step": 10410 + }, + { + "epoch": 1.7953135768435562, + "grad_norm": 56.05282974243164, + "learning_rate": 8.317949782193021e-08, + "logits/chosen": -2.1323277950286865, + "logits/rejected": -2.0914833545684814, + "logps/chosen": -200.09124755859375, + "logps/rejected": -253.63876342773438, + "loss": 0.5699, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.452972412109375, + "rewards/margins": 0.5477496981620789, + "rewards/rejected": -2.0007221698760986, + "step": 10420 + }, + { + "epoch": 1.7970365265334252, + "grad_norm": 31.1216983795166, + "learning_rate": 8.298190372116449e-08, + "logits/chosen": -2.1449904441833496, + "logits/rejected": -2.109835386276245, + "logps/chosen": -215.15225219726562, + "logps/rejected": -261.8252258300781, + "loss": 0.6041, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5944058895111084, + "rewards/margins": 0.4992167055606842, + "rewards/rejected": -2.0936226844787598, + "step": 10430 + }, + { + "epoch": 1.7987594762232941, + "grad_norm": 31.59772491455078, + "learning_rate": 8.278437802299462e-08, + "logits/chosen": -2.1712965965270996, + "logits/rejected": -2.1471519470214844, + "logps/chosen": -227.6300506591797, + "logps/rejected": -270.65386962890625, + "loss": 0.5907, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6827600002288818, + "rewards/margins": 0.46045559644699097, + "rewards/rejected": -2.1432156562805176, + "step": 10440 + }, + { + "epoch": 1.8004824259131633, + "grad_norm": 35.741493225097656, + "learning_rate": 8.258692152135605e-08, + "logits/chosen": -2.1343047618865967, + "logits/rejected": -2.1071267127990723, + "logps/chosen": -214.2671661376953, + "logps/rejected": -278.6723937988281, + "loss": 0.5215, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5967563390731812, + "rewards/margins": 0.6269431710243225, + "rewards/rejected": -2.2236995697021484, + "step": 10450 + }, + { + "epoch": 1.8022053756030325, + "grad_norm": 28.693790435791016, + "learning_rate": 8.238953500990624e-08, + "logits/chosen": -2.1232166290283203, + "logits/rejected": -2.0772557258605957, + "logps/chosen": -214.5017547607422, + "logps/rejected": -265.8387145996094, + "loss": 0.5561, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5892353057861328, + "rewards/margins": 0.5419289469718933, + "rewards/rejected": -2.131164073944092, + "step": 10460 + }, + { + "epoch": 1.8039283252929015, + "grad_norm": 45.37855911254883, + "learning_rate": 8.219221928202108e-08, + "logits/chosen": -2.009894609451294, + "logits/rejected": -1.9749433994293213, + "logps/chosen": -222.81887817382812, + "logps/rejected": -276.101318359375, + "loss": 0.5717, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7024590969085693, + "rewards/margins": 0.5605179071426392, + "rewards/rejected": -2.262976884841919, + "step": 10470 + }, + { + "epoch": 1.8056512749827704, + "grad_norm": 36.52731704711914, + "learning_rate": 8.199497513079219e-08, + "logits/chosen": -2.1089510917663574, + "logits/rejected": -2.0611464977264404, + "logps/chosen": -226.2388458251953, + "logps/rejected": -295.7428283691406, + "loss": 0.5359, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6885255575180054, + "rewards/margins": 0.7537800073623657, + "rewards/rejected": -2.44230580329895, + "step": 10480 + }, + { + "epoch": 1.8073742246726394, + "grad_norm": 30.162227630615234, + "learning_rate": 8.179780334902338e-08, + "logits/chosen": -2.099175214767456, + "logits/rejected": -2.0624618530273438, + "logps/chosen": -213.5786895751953, + "logps/rejected": -270.34735107421875, + "loss": 0.5526, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5826948881149292, + "rewards/margins": 0.59993976354599, + "rewards/rejected": -2.1826348304748535, + "step": 10490 + }, + { + "epoch": 1.8090971743625086, + "grad_norm": 35.62689208984375, + "learning_rate": 8.16007047292276e-08, + "logits/chosen": -2.1074697971343994, + "logits/rejected": -2.058868885040283, + "logps/chosen": -213.86355590820312, + "logps/rejected": -284.5144958496094, + "loss": 0.5221, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.60076904296875, + "rewards/margins": 0.7032946944236755, + "rewards/rejected": -2.3040637969970703, + "step": 10500 + }, + { + "epoch": 1.8108201240523778, + "grad_norm": 35.58950424194336, + "learning_rate": 8.140368006362378e-08, + "logits/chosen": -2.111414909362793, + "logits/rejected": -2.0616517066955566, + "logps/chosen": -219.7614288330078, + "logps/rejected": -283.29779052734375, + "loss": 0.5341, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6361150741577148, + "rewards/margins": 0.6754859089851379, + "rewards/rejected": -2.311600923538208, + "step": 10510 + }, + { + "epoch": 1.8125430737422468, + "grad_norm": 54.72854232788086, + "learning_rate": 8.120673014413346e-08, + "logits/chosen": -2.1215085983276367, + "logits/rejected": -2.089259624481201, + "logps/chosen": -228.813232421875, + "logps/rejected": -306.3352966308594, + "loss": 0.5364, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.767823576927185, + "rewards/margins": 0.7603949904441833, + "rewards/rejected": -2.5282187461853027, + "step": 10520 + }, + { + "epoch": 1.8142660234321157, + "grad_norm": 41.22834014892578, + "learning_rate": 8.100985576237789e-08, + "logits/chosen": -2.053673505783081, + "logits/rejected": -2.018805503845215, + "logps/chosen": -238.222412109375, + "logps/rejected": -289.5137634277344, + "loss": 0.5878, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.8332278728485107, + "rewards/margins": 0.5466124415397644, + "rewards/rejected": -2.37984037399292, + "step": 10530 + }, + { + "epoch": 1.8159889731219847, + "grad_norm": 39.10302734375, + "learning_rate": 8.081305770967466e-08, + "logits/chosen": -2.0007946491241455, + "logits/rejected": -1.9645189046859741, + "logps/chosen": -222.1063690185547, + "logps/rejected": -276.51312255859375, + "loss": 0.5475, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.689347267150879, + "rewards/margins": 0.5932210683822632, + "rewards/rejected": -2.2825684547424316, + "step": 10540 + }, + { + "epoch": 1.817711922811854, + "grad_norm": 33.643516540527344, + "learning_rate": 8.061633677703457e-08, + "logits/chosen": -2.174844741821289, + "logits/rejected": -2.1463987827301025, + "logps/chosen": -237.1045379638672, + "logps/rejected": -293.6675109863281, + "loss": 0.5867, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.8306430578231812, + "rewards/margins": 0.5582226514816284, + "rewards/rejected": -2.3888659477233887, + "step": 10550 + }, + { + "epoch": 1.819434872501723, + "grad_norm": 35.75984191894531, + "learning_rate": 8.041969375515835e-08, + "logits/chosen": -2.065913438796997, + "logits/rejected": -2.016780138015747, + "logps/chosen": -222.2818145751953, + "logps/rejected": -298.7868347167969, + "loss": 0.5156, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6556761264801025, + "rewards/margins": 0.800155758857727, + "rewards/rejected": -2.455831527709961, + "step": 10560 + }, + { + "epoch": 1.821157822191592, + "grad_norm": 53.0742301940918, + "learning_rate": 8.022312943443369e-08, + "logits/chosen": -2.112668514251709, + "logits/rejected": -2.078002691268921, + "logps/chosen": -226.45956420898438, + "logps/rejected": -291.5998840332031, + "loss": 0.5494, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6997945308685303, + "rewards/margins": 0.6631422638893127, + "rewards/rejected": -2.3629367351531982, + "step": 10570 + }, + { + "epoch": 1.822880771881461, + "grad_norm": 41.303245544433594, + "learning_rate": 8.002664460493194e-08, + "logits/chosen": -2.1236252784729004, + "logits/rejected": -2.0799834728240967, + "logps/chosen": -203.3345947265625, + "logps/rejected": -271.3036804199219, + "loss": 0.5138, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4962794780731201, + "rewards/margins": 0.6943894624710083, + "rewards/rejected": -2.190668821334839, + "step": 10580 + }, + { + "epoch": 1.82460372157133, + "grad_norm": 28.450899124145508, + "learning_rate": 7.983024005640487e-08, + "logits/chosen": -2.090174913406372, + "logits/rejected": -2.045700788497925, + "logps/chosen": -210.66390991210938, + "logps/rejected": -261.7216796875, + "loss": 0.5587, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.540217399597168, + "rewards/margins": 0.5885982513427734, + "rewards/rejected": -2.1288156509399414, + "step": 10590 + }, + { + "epoch": 1.8263266712611992, + "grad_norm": 24.977964401245117, + "learning_rate": 7.963391657828167e-08, + "logits/chosen": -2.158010482788086, + "logits/rejected": -2.1370630264282227, + "logps/chosen": -195.0901641845703, + "logps/rejected": -249.91494750976562, + "loss": 0.5716, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4173429012298584, + "rewards/margins": 0.543049693107605, + "rewards/rejected": -1.960392713546753, + "step": 10600 + }, + { + "epoch": 1.8280496209510684, + "grad_norm": 44.63248825073242, + "learning_rate": 7.943767495966556e-08, + "logits/chosen": -2.094036102294922, + "logits/rejected": -2.066676139831543, + "logps/chosen": -208.56723022460938, + "logps/rejected": -263.9549560546875, + "loss": 0.5582, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5647661685943604, + "rewards/margins": 0.5567968487739563, + "rewards/rejected": -2.121562957763672, + "step": 10610 + }, + { + "epoch": 1.8297725706409373, + "grad_norm": 37.00849151611328, + "learning_rate": 7.924151598933077e-08, + "logits/chosen": -2.0145325660705566, + "logits/rejected": -1.9684088230133057, + "logps/chosen": -207.1978302001953, + "logps/rejected": -270.76348876953125, + "loss": 0.5188, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.523241400718689, + "rewards/margins": 0.6691671013832092, + "rewards/rejected": -2.192408561706543, + "step": 10620 + }, + { + "epoch": 1.8314955203308063, + "grad_norm": 29.079648971557617, + "learning_rate": 7.904544045571942e-08, + "logits/chosen": -2.102393627166748, + "logits/rejected": -2.0552027225494385, + "logps/chosen": -214.1923065185547, + "logps/rejected": -284.5091552734375, + "loss": 0.545, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.586716890335083, + "rewards/margins": 0.7270705103874207, + "rewards/rejected": -2.3137869834899902, + "step": 10630 + }, + { + "epoch": 1.8332184700206753, + "grad_norm": 61.565185546875, + "learning_rate": 7.884944914693819e-08, + "logits/chosen": -2.1358046531677246, + "logits/rejected": -2.094247341156006, + "logps/chosen": -218.451171875, + "logps/rejected": -275.0665588378906, + "loss": 0.5669, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.631365180015564, + "rewards/margins": 0.6042715907096863, + "rewards/rejected": -2.2356367111206055, + "step": 10640 + }, + { + "epoch": 1.8349414197105445, + "grad_norm": 30.415767669677734, + "learning_rate": 7.865354285075517e-08, + "logits/chosen": -2.135023593902588, + "logits/rejected": -2.0953783988952637, + "logps/chosen": -220.89950561523438, + "logps/rejected": -292.2164611816406, + "loss": 0.5329, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6609359979629517, + "rewards/margins": 0.7331883311271667, + "rewards/rejected": -2.3941242694854736, + "step": 10650 + }, + { + "epoch": 1.8366643694004137, + "grad_norm": 40.1267204284668, + "learning_rate": 7.845772235459687e-08, + "logits/chosen": -2.0598466396331787, + "logits/rejected": -2.025097608566284, + "logps/chosen": -226.88601684570312, + "logps/rejected": -285.40167236328125, + "loss": 0.5567, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7189706563949585, + "rewards/margins": 0.5910989046096802, + "rewards/rejected": -2.3100695610046387, + "step": 10660 + }, + { + "epoch": 1.8383873190902826, + "grad_norm": 33.5792236328125, + "learning_rate": 7.826198844554484e-08, + "logits/chosen": -2.0914905071258545, + "logits/rejected": -2.0463545322418213, + "logps/chosen": -229.7507781982422, + "logps/rejected": -295.3099670410156, + "loss": 0.5437, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7596862316131592, + "rewards/margins": 0.682495653629303, + "rewards/rejected": -2.4421820640563965, + "step": 10670 + }, + { + "epoch": 1.8401102687801516, + "grad_norm": 34.08005905151367, + "learning_rate": 7.806634191033268e-08, + "logits/chosen": -2.118601083755493, + "logits/rejected": -2.0770506858825684, + "logps/chosen": -219.3430633544922, + "logps/rejected": -275.48748779296875, + "loss": 0.5409, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6182947158813477, + "rewards/margins": 0.6389121413230896, + "rewards/rejected": -2.257207155227661, + "step": 10680 + }, + { + "epoch": 1.8418332184700206, + "grad_norm": 35.42080307006836, + "learning_rate": 7.787078353534276e-08, + "logits/chosen": -2.083477020263672, + "logits/rejected": -2.0510470867156982, + "logps/chosen": -218.898193359375, + "logps/rejected": -286.86431884765625, + "loss": 0.5394, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6395857334136963, + "rewards/margins": 0.6692296266555786, + "rewards/rejected": -2.3088154792785645, + "step": 10690 + }, + { + "epoch": 1.8435561681598898, + "grad_norm": 51.69084167480469, + "learning_rate": 7.767531410660307e-08, + "logits/chosen": -2.130699396133423, + "logits/rejected": -2.0828940868377686, + "logps/chosen": -219.0266876220703, + "logps/rejected": -268.32159423828125, + "loss": 0.5482, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.624758005142212, + "rewards/margins": 0.5619906187057495, + "rewards/rejected": -2.1867482662200928, + "step": 10700 + }, + { + "epoch": 1.8452791178497587, + "grad_norm": 41.16576385498047, + "learning_rate": 7.74799344097841e-08, + "logits/chosen": -2.089820146560669, + "logits/rejected": -2.045987367630005, + "logps/chosen": -209.64437866210938, + "logps/rejected": -264.726318359375, + "loss": 0.5433, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5587536096572876, + "rewards/margins": 0.6008284687995911, + "rewards/rejected": -2.1595821380615234, + "step": 10710 + }, + { + "epoch": 1.847002067539628, + "grad_norm": 39.79367446899414, + "learning_rate": 7.728464523019574e-08, + "logits/chosen": -2.0866949558258057, + "logits/rejected": -2.0479319095611572, + "logps/chosen": -215.8254852294922, + "logps/rejected": -266.1983642578125, + "loss": 0.597, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.625404953956604, + "rewards/margins": 0.522426426410675, + "rewards/rejected": -2.147831439971924, + "step": 10720 + }, + { + "epoch": 1.848725017229497, + "grad_norm": 31.813405990600586, + "learning_rate": 7.7089447352784e-08, + "logits/chosen": -2.203573226928711, + "logits/rejected": -2.150926113128662, + "logps/chosen": -206.60537719726562, + "logps/rejected": -267.6660461425781, + "loss": 0.5131, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5262092351913452, + "rewards/margins": 0.6517717242240906, + "rewards/rejected": -2.177980899810791, + "step": 10730 + }, + { + "epoch": 1.8504479669193659, + "grad_norm": 34.61793899536133, + "learning_rate": 7.689434156212788e-08, + "logits/chosen": -2.0917916297912598, + "logits/rejected": -2.0554144382476807, + "logps/chosen": -209.6357879638672, + "logps/rejected": -256.06610107421875, + "loss": 0.6046, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5490946769714355, + "rewards/margins": 0.466376930475235, + "rewards/rejected": -2.0154716968536377, + "step": 10740 + }, + { + "epoch": 1.852170916609235, + "grad_norm": 33.33896255493164, + "learning_rate": 7.669932864243627e-08, + "logits/chosen": -2.0860400199890137, + "logits/rejected": -2.032809019088745, + "logps/chosen": -192.8465576171875, + "logps/rejected": -261.32330322265625, + "loss": 0.5341, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4007537364959717, + "rewards/margins": 0.6914193630218506, + "rewards/rejected": -2.0921730995178223, + "step": 10750 + }, + { + "epoch": 1.853893866299104, + "grad_norm": 36.69129180908203, + "learning_rate": 7.65044093775448e-08, + "logits/chosen": -2.0872907638549805, + "logits/rejected": -2.049820899963379, + "logps/chosen": -206.46987915039062, + "logps/rejected": -257.88897705078125, + "loss": 0.5712, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5185004472732544, + "rewards/margins": 0.5392345190048218, + "rewards/rejected": -2.057734966278076, + "step": 10760 + }, + { + "epoch": 1.8556168159889732, + "grad_norm": 32.58485794067383, + "learning_rate": 7.630958455091266e-08, + "logits/chosen": -2.069591522216797, + "logits/rejected": -2.039121627807617, + "logps/chosen": -213.1195526123047, + "logps/rejected": -270.96746826171875, + "loss": 0.5377, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5476007461547852, + "rewards/margins": 0.6266247630119324, + "rewards/rejected": -2.174225330352783, + "step": 10770 + }, + { + "epoch": 1.8573397656788422, + "grad_norm": 27.233510971069336, + "learning_rate": 7.611485494561947e-08, + "logits/chosen": -2.190983772277832, + "logits/rejected": -2.161463737487793, + "logps/chosen": -208.59158325195312, + "logps/rejected": -264.21661376953125, + "loss": 0.5847, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5422141551971436, + "rewards/margins": 0.5495244264602661, + "rewards/rejected": -2.091738700866699, + "step": 10780 + }, + { + "epoch": 1.8590627153687111, + "grad_norm": 44.738311767578125, + "learning_rate": 7.592022134436201e-08, + "logits/chosen": -2.232229709625244, + "logits/rejected": -2.18972110748291, + "logps/chosen": -193.24270629882812, + "logps/rejected": -248.223876953125, + "loss": 0.5538, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3993558883666992, + "rewards/margins": 0.5599854588508606, + "rewards/rejected": -1.959341287612915, + "step": 10790 + }, + { + "epoch": 1.8607856650585803, + "grad_norm": 31.418119430541992, + "learning_rate": 7.57256845294513e-08, + "logits/chosen": -2.178560733795166, + "logits/rejected": -2.139875650405884, + "logps/chosen": -198.04063415527344, + "logps/rejected": -266.0334167480469, + "loss": 0.5268, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4451549053192139, + "rewards/margins": 0.6842938661575317, + "rewards/rejected": -2.129448652267456, + "step": 10800 + }, + { + "epoch": 1.8607856650585803, + "eval_logits/chosen": -2.183124542236328, + "eval_logits/rejected": -2.1644392013549805, + "eval_logps/chosen": -201.55577087402344, + "eval_logps/rejected": -231.0404052734375, + "eval_loss": 0.6405770778656006, + "eval_rewards/accuracies": 0.634061336517334, + "eval_rewards/chosen": -1.425403118133545, + "eval_rewards/margins": 0.2575048804283142, + "eval_rewards/rejected": -1.6829079389572144, + "eval_runtime": 382.9043, + "eval_samples_per_second": 11.24, + "eval_steps_per_second": 1.405, + "step": 10800 + }, + { + "epoch": 1.8625086147484493, + "grad_norm": 28.313385009765625, + "learning_rate": 7.553124528280928e-08, + "logits/chosen": -2.138568162918091, + "logits/rejected": -2.0898187160491943, + "logps/chosen": -221.3721466064453, + "logps/rejected": -265.95477294921875, + "loss": 0.5684, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6270099878311157, + "rewards/margins": 0.5401317477226257, + "rewards/rejected": -2.1671416759490967, + "step": 10810 + }, + { + "epoch": 1.8642315644383185, + "grad_norm": 31.738422393798828, + "learning_rate": 7.533690438596583e-08, + "logits/chosen": -2.091878890991211, + "logits/rejected": -2.0571346282958984, + "logps/chosen": -203.2316131591797, + "logps/rejected": -257.54656982421875, + "loss": 0.5733, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4928950071334839, + "rewards/margins": 0.5778385996818542, + "rewards/rejected": -2.0707337856292725, + "step": 10820 + }, + { + "epoch": 1.8659545141281875, + "grad_norm": 52.44428253173828, + "learning_rate": 7.514266262005528e-08, + "logits/chosen": -2.1045467853546143, + "logits/rejected": -2.069561004638672, + "logps/chosen": -223.9551544189453, + "logps/rejected": -274.0635681152344, + "loss": 0.5889, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.708439588546753, + "rewards/margins": 0.5089355111122131, + "rewards/rejected": -2.2173752784729004, + "step": 10830 + }, + { + "epoch": 1.8676774638180564, + "grad_norm": 28.92299461364746, + "learning_rate": 7.494852076581377e-08, + "logits/chosen": -2.120326042175293, + "logits/rejected": -2.091909408569336, + "logps/chosen": -208.5050506591797, + "logps/rejected": -250.04397583007812, + "loss": 0.6206, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5447773933410645, + "rewards/margins": 0.4490395188331604, + "rewards/rejected": -1.9938167333602905, + "step": 10840 + }, + { + "epoch": 1.8694004135079254, + "grad_norm": 32.259178161621094, + "learning_rate": 7.475447960357572e-08, + "logits/chosen": -2.1164214611053467, + "logits/rejected": -2.0852532386779785, + "logps/chosen": -182.91921997070312, + "logps/rejected": -235.68115234375, + "loss": 0.5502, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3145451545715332, + "rewards/margins": 0.5021630525588989, + "rewards/rejected": -1.8167082071304321, + "step": 10850 + }, + { + "epoch": 1.8711233631977946, + "grad_norm": 35.98172378540039, + "learning_rate": 7.456053991327083e-08, + "logits/chosen": -2.1797902584075928, + "logits/rejected": -2.1264162063598633, + "logps/chosen": -199.8368377685547, + "logps/rejected": -259.7995910644531, + "loss": 0.5373, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4367259740829468, + "rewards/margins": 0.6395610570907593, + "rewards/rejected": -2.076287269592285, + "step": 10860 + }, + { + "epoch": 1.8728463128876638, + "grad_norm": 29.174753189086914, + "learning_rate": 7.436670247442107e-08, + "logits/chosen": -2.0565848350524902, + "logits/rejected": -2.0258164405822754, + "logps/chosen": -196.3871612548828, + "logps/rejected": -266.41925048828125, + "loss": 0.5371, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4551265239715576, + "rewards/margins": 0.655053973197937, + "rewards/rejected": -2.110180377960205, + "step": 10870 + }, + { + "epoch": 1.8745692625775328, + "grad_norm": 37.872257232666016, + "learning_rate": 7.417296806613718e-08, + "logits/chosen": -2.1089484691619873, + "logits/rejected": -2.0674309730529785, + "logps/chosen": -224.12796020507812, + "logps/rejected": -283.5495300292969, + "loss": 0.545, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6908639669418335, + "rewards/margins": 0.6248513460159302, + "rewards/rejected": -2.3157153129577637, + "step": 10880 + }, + { + "epoch": 1.8762922122674017, + "grad_norm": 36.295127868652344, + "learning_rate": 7.397933746711603e-08, + "logits/chosen": -2.129281997680664, + "logits/rejected": -2.1052193641662598, + "logps/chosen": -233.7235870361328, + "logps/rejected": -286.4725341796875, + "loss": 0.5873, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7942317724227905, + "rewards/margins": 0.5287545919418335, + "rewards/rejected": -2.322986602783203, + "step": 10890 + }, + { + "epoch": 1.8780151619572707, + "grad_norm": 42.38193893432617, + "learning_rate": 7.378581145563709e-08, + "logits/chosen": -2.161618709564209, + "logits/rejected": -2.1121408939361572, + "logps/chosen": -225.397705078125, + "logps/rejected": -281.1845397949219, + "loss": 0.5398, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6846339702606201, + "rewards/margins": 0.6255574822425842, + "rewards/rejected": -2.3101913928985596, + "step": 10900 + }, + { + "epoch": 1.8797381116471399, + "grad_norm": 55.0797119140625, + "learning_rate": 7.35923908095595e-08, + "logits/chosen": -2.078524112701416, + "logits/rejected": -2.0375468730926514, + "logps/chosen": -220.0901641845703, + "logps/rejected": -267.397705078125, + "loss": 0.5922, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.6298377513885498, + "rewards/margins": 0.5042181015014648, + "rewards/rejected": -2.1340558528900146, + "step": 10910 + }, + { + "epoch": 1.881461061337009, + "grad_norm": 34.892669677734375, + "learning_rate": 7.339907630631886e-08, + "logits/chosen": -2.0853939056396484, + "logits/rejected": -2.045135736465454, + "logps/chosen": -213.90255737304688, + "logps/rejected": -266.9146728515625, + "loss": 0.5396, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5748045444488525, + "rewards/margins": 0.5764104723930359, + "rewards/rejected": -2.151215076446533, + "step": 10920 + }, + { + "epoch": 1.883184011026878, + "grad_norm": 35.38461685180664, + "learning_rate": 7.320586872292413e-08, + "logits/chosen": -2.049386739730835, + "logits/rejected": -2.015204429626465, + "logps/chosen": -212.7605743408203, + "logps/rejected": -256.4028015136719, + "loss": 0.5719, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5758640766143799, + "rewards/margins": 0.4759438931941986, + "rewards/rejected": -2.0518081188201904, + "step": 10930 + }, + { + "epoch": 1.884906960716747, + "grad_norm": 41.828216552734375, + "learning_rate": 7.301276883595463e-08, + "logits/chosen": -2.048062562942505, + "logits/rejected": -2.019521713256836, + "logps/chosen": -213.22213745117188, + "logps/rejected": -262.60064697265625, + "loss": 0.5598, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5625088214874268, + "rewards/margins": 0.5233115553855896, + "rewards/rejected": -2.085820436477661, + "step": 10940 + }, + { + "epoch": 1.886629910406616, + "grad_norm": 38.982906341552734, + "learning_rate": 7.281977742155669e-08, + "logits/chosen": -2.1038460731506348, + "logits/rejected": -2.074618101119995, + "logps/chosen": -204.53701782226562, + "logps/rejected": -253.8427276611328, + "loss": 0.553, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4922102689743042, + "rewards/margins": 0.5051702260971069, + "rewards/rejected": -1.9973804950714111, + "step": 10950 + }, + { + "epoch": 1.8883528600964852, + "grad_norm": 35.97544479370117, + "learning_rate": 7.262689525544067e-08, + "logits/chosen": -2.1165997982025146, + "logits/rejected": -2.082392692565918, + "logps/chosen": -208.37155151367188, + "logps/rejected": -262.97265625, + "loss": 0.5808, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.541710615158081, + "rewards/margins": 0.5694451332092285, + "rewards/rejected": -2.1111557483673096, + "step": 10960 + }, + { + "epoch": 1.8900758097863544, + "grad_norm": 35.33456802368164, + "learning_rate": 7.243412311287782e-08, + "logits/chosen": -2.138899087905884, + "logits/rejected": -2.094754457473755, + "logps/chosen": -197.99417114257812, + "logps/rejected": -256.3907775878906, + "loss": 0.5568, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4036202430725098, + "rewards/margins": 0.6245434880256653, + "rewards/rejected": -2.0281636714935303, + "step": 10970 + }, + { + "epoch": 1.8917987594762233, + "grad_norm": 47.53961181640625, + "learning_rate": 7.224146176869717e-08, + "logits/chosen": -2.1551897525787354, + "logits/rejected": -2.116971492767334, + "logps/chosen": -203.69459533691406, + "logps/rejected": -265.29327392578125, + "loss": 0.5628, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.477172613143921, + "rewards/margins": 0.6235934495925903, + "rewards/rejected": -2.100766181945801, + "step": 10980 + }, + { + "epoch": 1.8935217091660923, + "grad_norm": 31.756277084350586, + "learning_rate": 7.204891199728241e-08, + "logits/chosen": -2.0958075523376465, + "logits/rejected": -2.0669543743133545, + "logps/chosen": -186.14865112304688, + "logps/rejected": -236.04080200195312, + "loss": 0.5647, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3263320922851562, + "rewards/margins": 0.5096399188041687, + "rewards/rejected": -1.8359720706939697, + "step": 10990 + }, + { + "epoch": 1.8952446588559613, + "grad_norm": 30.142967224121094, + "learning_rate": 7.185647457256879e-08, + "logits/chosen": -2.2401175498962402, + "logits/rejected": -2.2189393043518066, + "logps/chosen": -189.11007690429688, + "logps/rejected": -246.180419921875, + "loss": 0.5703, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3603506088256836, + "rewards/margins": 0.542121410369873, + "rewards/rejected": -1.902471899986267, + "step": 11000 + }, + { + "epoch": 1.8969676085458305, + "grad_norm": 37.16350173950195, + "learning_rate": 7.166415026803991e-08, + "logits/chosen": -2.147042751312256, + "logits/rejected": -2.098593235015869, + "logps/chosen": -193.9458770751953, + "logps/rejected": -260.72247314453125, + "loss": 0.506, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3998019695281982, + "rewards/margins": 0.7031914591789246, + "rewards/rejected": -2.1029934883117676, + "step": 11010 + }, + { + "epoch": 1.8986905582356997, + "grad_norm": 25.575349807739258, + "learning_rate": 7.147193985672477e-08, + "logits/chosen": -2.162492275238037, + "logits/rejected": -2.124326467514038, + "logps/chosen": -208.29403686523438, + "logps/rejected": -263.257568359375, + "loss": 0.553, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.522696852684021, + "rewards/margins": 0.5898150205612183, + "rewards/rejected": -2.1125118732452393, + "step": 11020 + }, + { + "epoch": 1.9004135079255686, + "grad_norm": 32.54864501953125, + "learning_rate": 7.127984411119461e-08, + "logits/chosen": -2.151383876800537, + "logits/rejected": -2.1169238090515137, + "logps/chosen": -231.1976318359375, + "logps/rejected": -291.98944091796875, + "loss": 0.5982, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7518266439437866, + "rewards/margins": 0.6365460157394409, + "rewards/rejected": -2.3883728981018066, + "step": 11030 + }, + { + "epoch": 1.9021364576154376, + "grad_norm": 27.272384643554688, + "learning_rate": 7.108786380355971e-08, + "logits/chosen": -2.161572217941284, + "logits/rejected": -2.126248836517334, + "logps/chosen": -217.8511199951172, + "logps/rejected": -274.0556640625, + "loss": 0.5472, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5999891757965088, + "rewards/margins": 0.5932620167732239, + "rewards/rejected": -2.193251132965088, + "step": 11040 + }, + { + "epoch": 1.9038594073053066, + "grad_norm": 33.55147933959961, + "learning_rate": 7.089599970546642e-08, + "logits/chosen": -2.104125499725342, + "logits/rejected": -2.0692391395568848, + "logps/chosen": -203.26809692382812, + "logps/rejected": -269.63470458984375, + "loss": 0.5176, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5091619491577148, + "rewards/margins": 0.6466538906097412, + "rewards/rejected": -2.155815839767456, + "step": 11050 + }, + { + "epoch": 1.9055823569951758, + "grad_norm": 30.785818099975586, + "learning_rate": 7.070425258809394e-08, + "logits/chosen": -2.0970897674560547, + "logits/rejected": -2.031811237335205, + "logps/chosen": -225.00997924804688, + "logps/rejected": -298.5378723144531, + "loss": 0.5053, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6961663961410522, + "rewards/margins": 0.778182864189148, + "rewards/rejected": -2.474349021911621, + "step": 11060 + }, + { + "epoch": 1.907305306685045, + "grad_norm": 37.45442199707031, + "learning_rate": 7.051262322215128e-08, + "logits/chosen": -2.1341097354888916, + "logits/rejected": -2.072995662689209, + "logps/chosen": -230.6807098388672, + "logps/rejected": -307.604248046875, + "loss": 0.4812, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7601606845855713, + "rewards/margins": 0.8273337483406067, + "rewards/rejected": -2.587494373321533, + "step": 11070 + }, + { + "epoch": 1.909028256374914, + "grad_norm": 26.271894454956055, + "learning_rate": 7.032111237787424e-08, + "logits/chosen": -2.0926690101623535, + "logits/rejected": -2.053788661956787, + "logps/chosen": -246.2302703857422, + "logps/rejected": -309.3777160644531, + "loss": 0.5356, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8914282321929932, + "rewards/margins": 0.686504602432251, + "rewards/rejected": -2.577932834625244, + "step": 11080 + }, + { + "epoch": 1.9107512060647829, + "grad_norm": 36.41754913330078, + "learning_rate": 7.01297208250222e-08, + "logits/chosen": -2.080683946609497, + "logits/rejected": -2.040173053741455, + "logps/chosen": -246.9329833984375, + "logps/rejected": -303.186767578125, + "loss": 0.5431, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8879804611206055, + "rewards/margins": 0.6224623918533325, + "rewards/rejected": -2.5104429721832275, + "step": 11090 + }, + { + "epoch": 1.9124741557546519, + "grad_norm": 34.7519645690918, + "learning_rate": 6.993844933287496e-08, + "logits/chosen": -2.0179200172424316, + "logits/rejected": -1.9861838817596436, + "logps/chosen": -230.6690673828125, + "logps/rejected": -291.9774475097656, + "loss": 0.5703, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7871367931365967, + "rewards/margins": 0.6063834428787231, + "rewards/rejected": -2.3935203552246094, + "step": 11100 + }, + { + "epoch": 1.914197105444521, + "grad_norm": 34.446861267089844, + "learning_rate": 6.974729867022989e-08, + "logits/chosen": -2.06170392036438, + "logits/rejected": -2.01041841506958, + "logps/chosen": -250.8234100341797, + "logps/rejected": -332.9823303222656, + "loss": 0.5677, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.9623744487762451, + "rewards/margins": 0.8461829423904419, + "rewards/rejected": -2.8085572719573975, + "step": 11110 + }, + { + "epoch": 1.9159200551343902, + "grad_norm": 35.57859802246094, + "learning_rate": 6.955626960539855e-08, + "logits/chosen": -2.165560245513916, + "logits/rejected": -2.1220479011535645, + "logps/chosen": -229.8457794189453, + "logps/rejected": -289.033203125, + "loss": 0.5335, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6932449340820312, + "rewards/margins": 0.6686538457870483, + "rewards/rejected": -2.36189866065979, + "step": 11120 + }, + { + "epoch": 1.9176430048242592, + "grad_norm": 52.480201721191406, + "learning_rate": 6.936536290620393e-08, + "logits/chosen": -2.066098690032959, + "logits/rejected": -2.018115520477295, + "logps/chosen": -226.6170196533203, + "logps/rejected": -289.1579284667969, + "loss": 0.5292, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7019894123077393, + "rewards/margins": 0.6620699167251587, + "rewards/rejected": -2.3640592098236084, + "step": 11130 + }, + { + "epoch": 1.9193659545141282, + "grad_norm": 42.84165573120117, + "learning_rate": 6.917457933997706e-08, + "logits/chosen": -2.01469349861145, + "logits/rejected": -1.9801337718963623, + "logps/chosen": -226.46005249023438, + "logps/rejected": -282.3187255859375, + "loss": 0.5586, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7090473175048828, + "rewards/margins": 0.6025687456130981, + "rewards/rejected": -2.3116161823272705, + "step": 11140 + }, + { + "epoch": 1.9210889042039971, + "grad_norm": 39.90765380859375, + "learning_rate": 6.898391967355405e-08, + "logits/chosen": -2.1024584770202637, + "logits/rejected": -2.076080799102783, + "logps/chosen": -231.018310546875, + "logps/rejected": -283.4735412597656, + "loss": 0.5776, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.722633719444275, + "rewards/margins": 0.5566567778587341, + "rewards/rejected": -2.2792906761169434, + "step": 11150 + }, + { + "epoch": 1.9228118538938663, + "grad_norm": 52.43754577636719, + "learning_rate": 6.879338467327302e-08, + "logits/chosen": -2.14897084236145, + "logits/rejected": -2.1171956062316895, + "logps/chosen": -208.48263549804688, + "logps/rejected": -254.87289428710938, + "loss": 0.5868, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5512667894363403, + "rewards/margins": 0.4904497265815735, + "rewards/rejected": -2.0417165756225586, + "step": 11160 + }, + { + "epoch": 1.9245348035837355, + "grad_norm": 38.62773895263672, + "learning_rate": 6.860297510497104e-08, + "logits/chosen": -2.0566763877868652, + "logits/rejected": -2.0216500759124756, + "logps/chosen": -211.69384765625, + "logps/rejected": -248.56982421875, + "loss": 0.588, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5310242176055908, + "rewards/margins": 0.46797189116477966, + "rewards/rejected": -1.9989961385726929, + "step": 11170 + }, + { + "epoch": 1.9262577532736045, + "grad_norm": 32.62820053100586, + "learning_rate": 6.841269173398107e-08, + "logits/chosen": -2.1622769832611084, + "logits/rejected": -2.1254467964172363, + "logps/chosen": -216.33853149414062, + "logps/rejected": -272.8714294433594, + "loss": 0.5488, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.611806869506836, + "rewards/margins": 0.610049307346344, + "rewards/rejected": -2.221856117248535, + "step": 11180 + }, + { + "epoch": 1.9279807029634735, + "grad_norm": 48.73835372924805, + "learning_rate": 6.82225353251286e-08, + "logits/chosen": -2.1593518257141113, + "logits/rejected": -2.109025239944458, + "logps/chosen": -198.0034637451172, + "logps/rejected": -267.95458984375, + "loss": 0.4723, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.420243263244629, + "rewards/margins": 0.7498121857643127, + "rewards/rejected": -2.170055389404297, + "step": 11190 + }, + { + "epoch": 1.9297036526533424, + "grad_norm": 36.46852493286133, + "learning_rate": 6.80325066427291e-08, + "logits/chosen": -2.1360020637512207, + "logits/rejected": -2.0926783084869385, + "logps/chosen": -210.9085693359375, + "logps/rejected": -271.9500732421875, + "loss": 0.5384, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5630364418029785, + "rewards/margins": 0.622287929058075, + "rewards/rejected": -2.1853244304656982, + "step": 11200 + }, + { + "epoch": 1.9297036526533424, + "eval_logits/chosen": -2.150330066680908, + "eval_logits/rejected": -2.129934072494507, + "eval_logps/chosen": -223.87203979492188, + "eval_logps/rejected": -257.14404296875, + "eval_loss": 0.6418474912643433, + "eval_rewards/accuracies": 0.6363847851753235, + "eval_rewards/chosen": -1.648565649986267, + "eval_rewards/margins": 0.2953791320323944, + "eval_rewards/rejected": -1.9439448118209839, + "eval_runtime": 383.1567, + "eval_samples_per_second": 11.233, + "eval_steps_per_second": 1.404, + "step": 11200 + }, + { + "epoch": 1.9314266023432116, + "grad_norm": 38.86859130859375, + "learning_rate": 6.784260645058445e-08, + "logits/chosen": -2.0044658184051514, + "logits/rejected": -1.9595401287078857, + "logps/chosen": -239.3448028564453, + "logps/rejected": -320.5731506347656, + "loss": 0.5335, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.8804552555084229, + "rewards/margins": 0.7997304201126099, + "rewards/rejected": -2.6801857948303223, + "step": 11210 + }, + { + "epoch": 1.9331495520330806, + "grad_norm": 52.32000732421875, + "learning_rate": 6.765283551198016e-08, + "logits/chosen": -2.0517823696136475, + "logits/rejected": -2.023305892944336, + "logps/chosen": -250.20114135742188, + "logps/rejected": -312.5027770996094, + "loss": 0.6199, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.9770643711090088, + "rewards/margins": 0.5979080200195312, + "rewards/rejected": -2.574972629547119, + "step": 11220 + }, + { + "epoch": 1.9348725017229498, + "grad_norm": 30.262535095214844, + "learning_rate": 6.746319458968226e-08, + "logits/chosen": -2.0434021949768066, + "logits/rejected": -2.012618064880371, + "logps/chosen": -255.679931640625, + "logps/rejected": -310.25408935546875, + "loss": 0.5841, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.982190728187561, + "rewards/margins": 0.6203389167785645, + "rewards/rejected": -2.602529764175415, + "step": 11230 + }, + { + "epoch": 1.9365954514128187, + "grad_norm": 31.0468807220459, + "learning_rate": 6.727368444593408e-08, + "logits/chosen": -2.082054376602173, + "logits/rejected": -2.0466256141662598, + "logps/chosen": -224.1901397705078, + "logps/rejected": -274.36688232421875, + "loss": 0.5864, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6886026859283447, + "rewards/margins": 0.5320411920547485, + "rewards/rejected": -2.220643997192383, + "step": 11240 + }, + { + "epoch": 1.9383184011026877, + "grad_norm": 30.541744232177734, + "learning_rate": 6.708430584245337e-08, + "logits/chosen": -2.110844373703003, + "logits/rejected": -2.0703446865081787, + "logps/chosen": -216.69485473632812, + "logps/rejected": -277.73150634765625, + "loss": 0.5383, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6169925928115845, + "rewards/margins": 0.6295633316040039, + "rewards/rejected": -2.246555805206299, + "step": 11250 + }, + { + "epoch": 1.940041350792557, + "grad_norm": 31.27999496459961, + "learning_rate": 6.689505954042913e-08, + "logits/chosen": -2.0711936950683594, + "logits/rejected": -2.0286784172058105, + "logps/chosen": -218.43081665039062, + "logps/rejected": -267.02716064453125, + "loss": 0.5712, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5965648889541626, + "rewards/margins": 0.5688450932502747, + "rewards/rejected": -2.165409803390503, + "step": 11260 + }, + { + "epoch": 1.9417643004824259, + "grad_norm": 43.0255126953125, + "learning_rate": 6.67059463005187e-08, + "logits/chosen": -2.0696632862091064, + "logits/rejected": -2.0213420391082764, + "logps/chosen": -209.5532989501953, + "logps/rejected": -272.81805419921875, + "loss": 0.5049, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.532209038734436, + "rewards/margins": 0.69169682264328, + "rewards/rejected": -2.2239060401916504, + "step": 11270 + }, + { + "epoch": 1.943487250172295, + "grad_norm": 28.406047821044922, + "learning_rate": 6.651696688284438e-08, + "logits/chosen": -2.106395959854126, + "logits/rejected": -2.059577703475952, + "logps/chosen": -232.7011260986328, + "logps/rejected": -284.5666198730469, + "loss": 0.5597, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7567068338394165, + "rewards/margins": 0.5732653737068176, + "rewards/rejected": -2.329972505569458, + "step": 11280 + }, + { + "epoch": 1.945210199862164, + "grad_norm": 37.7677116394043, + "learning_rate": 6.632812204699077e-08, + "logits/chosen": -2.094848871231079, + "logits/rejected": -2.055690288543701, + "logps/chosen": -231.9086456298828, + "logps/rejected": -297.62689208984375, + "loss": 0.5308, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7121360301971436, + "rewards/margins": 0.7090049982070923, + "rewards/rejected": -2.4211411476135254, + "step": 11290 + }, + { + "epoch": 1.946933149552033, + "grad_norm": 43.34477233886719, + "learning_rate": 6.613941255200147e-08, + "logits/chosen": -2.0163540840148926, + "logits/rejected": -1.9858640432357788, + "logps/chosen": -264.2913513183594, + "logps/rejected": -301.52703857421875, + "loss": 0.6495, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.0428531169891357, + "rewards/margins": 0.442251056432724, + "rewards/rejected": -2.4851040840148926, + "step": 11300 + }, + { + "epoch": 1.948656099241902, + "grad_norm": 47.41953659057617, + "learning_rate": 6.595083915637602e-08, + "logits/chosen": -2.127748966217041, + "logits/rejected": -2.0928280353546143, + "logps/chosen": -237.9222412109375, + "logps/rejected": -310.1321716308594, + "loss": 0.5151, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.854783058166504, + "rewards/margins": 0.7411599159240723, + "rewards/rejected": -2.595942974090576, + "step": 11310 + }, + { + "epoch": 1.9503790489317712, + "grad_norm": 32.63195037841797, + "learning_rate": 6.576240261806711e-08, + "logits/chosen": -2.0840084552764893, + "logits/rejected": -2.041901111602783, + "logps/chosen": -236.044677734375, + "logps/rejected": -304.3868103027344, + "loss": 0.5322, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.7976115942001343, + "rewards/margins": 0.7138528227806091, + "rewards/rejected": -2.5114643573760986, + "step": 11320 + }, + { + "epoch": 1.9521019986216404, + "grad_norm": 81.51802062988281, + "learning_rate": 6.557410369447712e-08, + "logits/chosen": -2.0825531482696533, + "logits/rejected": -2.0385289192199707, + "logps/chosen": -242.2621307373047, + "logps/rejected": -308.0596008300781, + "loss": 0.5261, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8813930749893188, + "rewards/margins": 0.7215787172317505, + "rewards/rejected": -2.6029715538024902, + "step": 11330 + }, + { + "epoch": 1.9538249483115093, + "grad_norm": 43.0372200012207, + "learning_rate": 6.538594314245541e-08, + "logits/chosen": -2.0444424152374268, + "logits/rejected": -1.997079849243164, + "logps/chosen": -236.3446502685547, + "logps/rejected": -302.52606201171875, + "loss": 0.5421, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7998119592666626, + "rewards/margins": 0.6873337626457214, + "rewards/rejected": -2.4871456623077393, + "step": 11340 + }, + { + "epoch": 1.9555478980013783, + "grad_norm": 46.32229995727539, + "learning_rate": 6.51979217182952e-08, + "logits/chosen": -2.1301732063293457, + "logits/rejected": -2.0888075828552246, + "logps/chosen": -244.2246856689453, + "logps/rejected": -303.2362365722656, + "loss": 0.5816, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.858077049255371, + "rewards/margins": 0.6253143548965454, + "rewards/rejected": -2.483391284942627, + "step": 11350 + }, + { + "epoch": 1.9572708476912473, + "grad_norm": 35.97012710571289, + "learning_rate": 6.501004017773049e-08, + "logits/chosen": -2.0921897888183594, + "logits/rejected": -2.056212902069092, + "logps/chosen": -232.33438110351562, + "logps/rejected": -288.73779296875, + "loss": 0.5435, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7822563648223877, + "rewards/margins": 0.6243523955345154, + "rewards/rejected": -2.406608819961548, + "step": 11360 + }, + { + "epoch": 1.9589937973811165, + "grad_norm": 33.192787170410156, + "learning_rate": 6.482229927593292e-08, + "logits/chosen": -2.087029457092285, + "logits/rejected": -2.0539119243621826, + "logps/chosen": -225.0400848388672, + "logps/rejected": -280.0304260253906, + "loss": 0.5488, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.682896614074707, + "rewards/margins": 0.5910366177558899, + "rewards/rejected": -2.273932933807373, + "step": 11370 + }, + { + "epoch": 1.9607167470709856, + "grad_norm": 47.09540557861328, + "learning_rate": 6.463469976750894e-08, + "logits/chosen": -2.0447041988372803, + "logits/rejected": -2.00447154045105, + "logps/chosen": -225.29141235351562, + "logps/rejected": -284.82025146484375, + "loss": 0.5638, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7298924922943115, + "rewards/margins": 0.6187609434127808, + "rewards/rejected": -2.3486533164978027, + "step": 11380 + }, + { + "epoch": 1.9624396967608546, + "grad_norm": 35.9467887878418, + "learning_rate": 6.444724240649674e-08, + "logits/chosen": -2.1504344940185547, + "logits/rejected": -2.1013474464416504, + "logps/chosen": -203.40408325195312, + "logps/rejected": -275.85430908203125, + "loss": 0.5266, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4995545148849487, + "rewards/margins": 0.7154166102409363, + "rewards/rejected": -2.2149710655212402, + "step": 11390 + }, + { + "epoch": 1.9641626464507236, + "grad_norm": 34.19363784790039, + "learning_rate": 6.425992794636305e-08, + "logits/chosen": -2.1202802658081055, + "logits/rejected": -2.081855535507202, + "logps/chosen": -209.0074462890625, + "logps/rejected": -267.097900390625, + "loss": 0.5436, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.522707223892212, + "rewards/margins": 0.6264538764953613, + "rewards/rejected": -2.149160861968994, + "step": 11400 + }, + { + "epoch": 1.9658855961405926, + "grad_norm": 33.694339752197266, + "learning_rate": 6.407275714000029e-08, + "logits/chosen": -2.1229145526885986, + "logits/rejected": -2.084303140640259, + "logps/chosen": -214.90896606445312, + "logps/rejected": -272.10284423828125, + "loss": 0.5436, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5856306552886963, + "rewards/margins": 0.5950139164924622, + "rewards/rejected": -2.1806445121765137, + "step": 11410 + }, + { + "epoch": 1.9676085458304617, + "grad_norm": 25.552141189575195, + "learning_rate": 6.388573073972341e-08, + "logits/chosen": -2.104806661605835, + "logits/rejected": -2.069352626800537, + "logps/chosen": -215.0218505859375, + "logps/rejected": -257.0823669433594, + "loss": 0.5875, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6048667430877686, + "rewards/margins": 0.4714290201663971, + "rewards/rejected": -2.0762956142425537, + "step": 11420 + }, + { + "epoch": 1.969331495520331, + "grad_norm": 31.175952911376953, + "learning_rate": 6.3698849497267e-08, + "logits/chosen": -2.075159788131714, + "logits/rejected": -2.0389952659606934, + "logps/chosen": -196.71681213378906, + "logps/rejected": -245.8275909423828, + "loss": 0.5744, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4119598865509033, + "rewards/margins": 0.5342273712158203, + "rewards/rejected": -1.9461870193481445, + "step": 11430 + }, + { + "epoch": 1.9710544452102, + "grad_norm": 32.55466079711914, + "learning_rate": 6.351211416378221e-08, + "logits/chosen": -2.113795757293701, + "logits/rejected": -2.0957531929016113, + "logps/chosen": -195.7284393310547, + "logps/rejected": -235.590576171875, + "loss": 0.5862, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4285451173782349, + "rewards/margins": 0.40397801995277405, + "rewards/rejected": -1.8325231075286865, + "step": 11440 + }, + { + "epoch": 1.9727773949000689, + "grad_norm": 30.875164031982422, + "learning_rate": 6.332552548983368e-08, + "logits/chosen": -2.1119213104248047, + "logits/rejected": -2.0720133781433105, + "logps/chosen": -200.646484375, + "logps/rejected": -256.79559326171875, + "loss": 0.5535, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4703338146209717, + "rewards/margins": 0.5860790014266968, + "rewards/rejected": -2.056412935256958, + "step": 11450 + }, + { + "epoch": 1.9745003445899378, + "grad_norm": 30.871906280517578, + "learning_rate": 6.313908422539656e-08, + "logits/chosen": -2.1062095165252686, + "logits/rejected": -2.06668758392334, + "logps/chosen": -199.67001342773438, + "logps/rejected": -259.66522216796875, + "loss": 0.5361, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4358268976211548, + "rewards/margins": 0.6267709732055664, + "rewards/rejected": -2.0625977516174316, + "step": 11460 + }, + { + "epoch": 1.976223294279807, + "grad_norm": 28.920886993408203, + "learning_rate": 6.295279111985354e-08, + "logits/chosen": -2.1364591121673584, + "logits/rejected": -2.075352430343628, + "logps/chosen": -216.4774627685547, + "logps/rejected": -280.35980224609375, + "loss": 0.5025, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5842845439910889, + "rewards/margins": 0.7140352725982666, + "rewards/rejected": -2.2983195781707764, + "step": 11470 + }, + { + "epoch": 1.9779462439696762, + "grad_norm": 35.6077766418457, + "learning_rate": 6.276664692199175e-08, + "logits/chosen": -2.1571812629699707, + "logits/rejected": -2.110039234161377, + "logps/chosen": -202.0142364501953, + "logps/rejected": -247.08834838867188, + "loss": 0.5817, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.444929838180542, + "rewards/margins": 0.5276190042495728, + "rewards/rejected": -1.9725488424301147, + "step": 11480 + }, + { + "epoch": 1.9796691936595452, + "grad_norm": 29.205793380737305, + "learning_rate": 6.258065237999988e-08, + "logits/chosen": -2.1270751953125, + "logits/rejected": -2.0870296955108643, + "logps/chosen": -210.76516723632812, + "logps/rejected": -252.1230926513672, + "loss": 0.5994, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.548182487487793, + "rewards/margins": 0.4664430618286133, + "rewards/rejected": -2.0146255493164062, + "step": 11490 + }, + { + "epoch": 1.9813921433494142, + "grad_norm": 35.74567794799805, + "learning_rate": 6.239480824146503e-08, + "logits/chosen": -2.148181438446045, + "logits/rejected": -2.1178340911865234, + "logps/chosen": -193.29258728027344, + "logps/rejected": -232.23593139648438, + "loss": 0.596, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4171593189239502, + "rewards/margins": 0.4054163098335266, + "rewards/rejected": -1.822575569152832, + "step": 11500 + }, + { + "epoch": 1.9831150930392831, + "grad_norm": 41.26714324951172, + "learning_rate": 6.220911525336977e-08, + "logits/chosen": -2.104710578918457, + "logits/rejected": -2.073302745819092, + "logps/chosen": -203.44204711914062, + "logps/rejected": -262.4302062988281, + "loss": 0.5674, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.471699595451355, + "rewards/margins": 0.6327524781227112, + "rewards/rejected": -2.104451894760132, + "step": 11510 + }, + { + "epoch": 1.9848380427291523, + "grad_norm": 37.78926086425781, + "learning_rate": 6.202357416208911e-08, + "logits/chosen": -2.152705669403076, + "logits/rejected": -2.0834972858428955, + "logps/chosen": -195.06761169433594, + "logps/rejected": -245.7989959716797, + "loss": 0.5314, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3396432399749756, + "rewards/margins": 0.6214916706085205, + "rewards/rejected": -1.9611351490020752, + "step": 11520 + }, + { + "epoch": 1.9865609924190215, + "grad_norm": 24.8828067779541, + "learning_rate": 6.183818571338766e-08, + "logits/chosen": -2.119786024093628, + "logits/rejected": -2.073561191558838, + "logps/chosen": -188.87936401367188, + "logps/rejected": -236.6806640625, + "loss": 0.5444, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3401225805282593, + "rewards/margins": 0.5436755418777466, + "rewards/rejected": -1.8837982416152954, + "step": 11530 + }, + { + "epoch": 1.9882839421088905, + "grad_norm": 32.03831481933594, + "learning_rate": 6.165295065241633e-08, + "logits/chosen": -2.1403820514678955, + "logits/rejected": -2.1111044883728027, + "logps/chosen": -199.05844116210938, + "logps/rejected": -277.83038330078125, + "loss": 0.5031, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4719569683074951, + "rewards/margins": 0.7576948404312134, + "rewards/rejected": -2.229651689529419, + "step": 11540 + }, + { + "epoch": 1.9900068917987594, + "grad_norm": 36.63600158691406, + "learning_rate": 6.146786972370959e-08, + "logits/chosen": -2.069815158843994, + "logits/rejected": -2.041349411010742, + "logps/chosen": -222.26174926757812, + "logps/rejected": -275.2193908691406, + "loss": 0.5902, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.698169469833374, + "rewards/margins": 0.5312357544898987, + "rewards/rejected": -2.229405164718628, + "step": 11550 + }, + { + "epoch": 1.9917298414886284, + "grad_norm": 33.422481536865234, + "learning_rate": 6.128294367118237e-08, + "logits/chosen": -2.1030850410461426, + "logits/rejected": -2.0553946495056152, + "logps/chosen": -223.8405303955078, + "logps/rejected": -285.63922119140625, + "loss": 0.5378, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6755192279815674, + "rewards/margins": 0.676115870475769, + "rewards/rejected": -2.3516347408294678, + "step": 11560 + }, + { + "epoch": 1.9934527911784976, + "grad_norm": 37.54303741455078, + "learning_rate": 6.109817323812706e-08, + "logits/chosen": -2.1160786151885986, + "logits/rejected": -2.0797441005706787, + "logps/chosen": -232.2115936279297, + "logps/rejected": -290.12860107421875, + "loss": 0.5729, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7748934030532837, + "rewards/margins": 0.579551100730896, + "rewards/rejected": -2.3544440269470215, + "step": 11570 + }, + { + "epoch": 1.9951757408683668, + "grad_norm": 36.5296745300293, + "learning_rate": 6.091355916721064e-08, + "logits/chosen": -2.1580021381378174, + "logits/rejected": -2.118701219558716, + "logps/chosen": -219.04708862304688, + "logps/rejected": -277.0885314941406, + "loss": 0.5636, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.618523359298706, + "rewards/margins": 0.607399582862854, + "rewards/rejected": -2.2259230613708496, + "step": 11580 + }, + { + "epoch": 1.9968986905582358, + "grad_norm": 40.546112060546875, + "learning_rate": 6.072910220047159e-08, + "logits/chosen": -2.062598943710327, + "logits/rejected": -2.0163540840148926, + "logps/chosen": -213.2329864501953, + "logps/rejected": -254.9839324951172, + "loss": 0.5645, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5311720371246338, + "rewards/margins": 0.5165271759033203, + "rewards/rejected": -2.047699451446533, + "step": 11590 + }, + { + "epoch": 1.9986216402481047, + "grad_norm": 36.788360595703125, + "learning_rate": 6.054480307931678e-08, + "logits/chosen": -2.1328606605529785, + "logits/rejected": -2.103621244430542, + "logps/chosen": -197.7500457763672, + "logps/rejected": -250.41305541992188, + "loss": 0.5734, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4637445211410522, + "rewards/margins": 0.512778103351593, + "rewards/rejected": -1.97652268409729, + "step": 11600 + }, + { + "epoch": 1.9986216402481047, + "eval_logits/chosen": -2.1813366413116455, + "eval_logits/rejected": -2.162360429763794, + "eval_logps/chosen": -202.5782012939453, + "eval_logps/rejected": -233.75628662109375, + "eval_loss": 0.6378381252288818, + "eval_rewards/accuracies": 0.6361523866653442, + "eval_rewards/chosen": -1.4356271028518677, + "eval_rewards/margins": 0.27443984150886536, + "eval_rewards/rejected": -1.710066795349121, + "eval_runtime": 383.3871, + "eval_samples_per_second": 11.226, + "eval_steps_per_second": 1.403, + "step": 11600 + }, + { + "epoch": 2.0003445899379737, + "grad_norm": 38.874786376953125, + "learning_rate": 6.036066254451881e-08, + "logits/chosen": -2.080693244934082, + "logits/rejected": -2.040472984313965, + "logps/chosen": -205.5887908935547, + "logps/rejected": -267.6993713378906, + "loss": 0.5544, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5286204814910889, + "rewards/margins": 0.6358743906021118, + "rewards/rejected": -2.1644949913024902, + "step": 11610 + }, + { + "epoch": 2.0020675396278427, + "grad_norm": 42.31683349609375, + "learning_rate": 6.017668133621275e-08, + "logits/chosen": -2.1189987659454346, + "logits/rejected": -2.0837512016296387, + "logps/chosen": -216.97024536132812, + "logps/rejected": -276.1315002441406, + "loss": 0.5511, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5971972942352295, + "rewards/margins": 0.6041030883789062, + "rewards/rejected": -2.2013003826141357, + "step": 11620 + }, + { + "epoch": 2.003790489317712, + "grad_norm": 35.677406311035156, + "learning_rate": 5.999286019389342e-08, + "logits/chosen": -2.1324095726013184, + "logits/rejected": -2.083679676055908, + "logps/chosen": -208.1287384033203, + "logps/rejected": -270.8012390136719, + "loss": 0.5135, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.4944071769714355, + "rewards/margins": 0.689971923828125, + "rewards/rejected": -2.1843791007995605, + "step": 11630 + }, + { + "epoch": 2.005513439007581, + "grad_norm": 26.854320526123047, + "learning_rate": 5.980919985641202e-08, + "logits/chosen": -2.13673734664917, + "logits/rejected": -2.0990772247314453, + "logps/chosen": -206.71560668945312, + "logps/rejected": -269.2923889160156, + "loss": 0.5106, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5317871570587158, + "rewards/margins": 0.6630698442459106, + "rewards/rejected": -2.194857120513916, + "step": 11640 + }, + { + "epoch": 2.00723638869745, + "grad_norm": 45.685150146484375, + "learning_rate": 5.962570106197364e-08, + "logits/chosen": -2.0522255897521973, + "logits/rejected": -1.999353051185608, + "logps/chosen": -206.79458618164062, + "logps/rejected": -270.4117126464844, + "loss": 0.5254, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.514573574066162, + "rewards/margins": 0.6830571293830872, + "rewards/rejected": -2.1976304054260254, + "step": 11650 + }, + { + "epoch": 2.008959338387319, + "grad_norm": 28.480182647705078, + "learning_rate": 5.944236454813396e-08, + "logits/chosen": -2.123844623565674, + "logits/rejected": -2.067272186279297, + "logps/chosen": -233.8195037841797, + "logps/rejected": -292.9638366699219, + "loss": 0.5361, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.7744499444961548, + "rewards/margins": 0.665165364742279, + "rewards/rejected": -2.439615488052368, + "step": 11660 + }, + { + "epoch": 2.010682288077188, + "grad_norm": 39.119380950927734, + "learning_rate": 5.9259191051796375e-08, + "logits/chosen": -2.0889270305633545, + "logits/rejected": -2.028006076812744, + "logps/chosen": -230.15066528320312, + "logps/rejected": -327.9476623535156, + "loss": 0.4392, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7356784343719482, + "rewards/margins": 1.015694499015808, + "rewards/rejected": -2.751373291015625, + "step": 11670 + }, + { + "epoch": 2.0124052377670574, + "grad_norm": 42.146400451660156, + "learning_rate": 5.907618130920919e-08, + "logits/chosen": -2.0312490463256836, + "logits/rejected": -2.000492572784424, + "logps/chosen": -240.016357421875, + "logps/rejected": -307.0270690917969, + "loss": 0.558, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8243763446807861, + "rewards/margins": 0.6948267221450806, + "rewards/rejected": -2.519202947616577, + "step": 11680 + }, + { + "epoch": 2.0141281874569263, + "grad_norm": 42.349021911621094, + "learning_rate": 5.8893336055962254e-08, + "logits/chosen": -2.1421942710876465, + "logits/rejected": -2.0884110927581787, + "logps/chosen": -228.0297088623047, + "logps/rejected": -291.3267517089844, + "loss": 0.5139, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6707584857940674, + "rewards/margins": 0.7115265130996704, + "rewards/rejected": -2.3822848796844482, + "step": 11690 + }, + { + "epoch": 2.0158511371467953, + "grad_norm": 30.39948272705078, + "learning_rate": 5.871065602698451e-08, + "logits/chosen": -2.1972460746765137, + "logits/rejected": -2.1512832641601562, + "logps/chosen": -199.59202575683594, + "logps/rejected": -259.33209228515625, + "loss": 0.5424, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.422917127609253, + "rewards/margins": 0.6099113821983337, + "rewards/rejected": -2.0328285694122314, + "step": 11700 + }, + { + "epoch": 2.0175740868366643, + "grad_norm": 39.6872444152832, + "learning_rate": 5.852814195654068e-08, + "logits/chosen": -2.116191864013672, + "logits/rejected": -2.066092014312744, + "logps/chosen": -213.960205078125, + "logps/rejected": -273.5892028808594, + "loss": 0.5265, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5808203220367432, + "rewards/margins": 0.6322665810585022, + "rewards/rejected": -2.2130866050720215, + "step": 11710 + }, + { + "epoch": 2.0192970365265333, + "grad_norm": 34.95515823364258, + "learning_rate": 5.834579457822848e-08, + "logits/chosen": -2.1429433822631836, + "logits/rejected": -2.1109375953674316, + "logps/chosen": -223.36117553710938, + "logps/rejected": -280.72528076171875, + "loss": 0.5774, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.681788444519043, + "rewards/margins": 0.6002216935157776, + "rewards/rejected": -2.2820098400115967, + "step": 11720 + }, + { + "epoch": 2.0210199862164027, + "grad_norm": 32.278526306152344, + "learning_rate": 5.81636146249756e-08, + "logits/chosen": -2.0954790115356445, + "logits/rejected": -2.060276508331299, + "logps/chosen": -219.8001251220703, + "logps/rejected": -285.2901916503906, + "loss": 0.5431, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6424379348754883, + "rewards/margins": 0.6421239376068115, + "rewards/rejected": -2.2845618724823, + "step": 11730 + }, + { + "epoch": 2.0227429359062716, + "grad_norm": 37.31665802001953, + "learning_rate": 5.798160282903672e-08, + "logits/chosen": -2.1379740238189697, + "logits/rejected": -2.1029396057128906, + "logps/chosen": -211.76272583007812, + "logps/rejected": -270.9972839355469, + "loss": 0.5506, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5817822217941284, + "rewards/margins": 0.605307400226593, + "rewards/rejected": -2.187089681625366, + "step": 11740 + }, + { + "epoch": 2.0244658855961406, + "grad_norm": 27.076473236083984, + "learning_rate": 5.779975992199075e-08, + "logits/chosen": -2.137813091278076, + "logits/rejected": -2.105919122695923, + "logps/chosen": -214.4691925048828, + "logps/rejected": -299.862060546875, + "loss": 0.4774, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6512985229492188, + "rewards/margins": 0.8046330213546753, + "rewards/rejected": -2.4559314250946045, + "step": 11750 + }, + { + "epoch": 2.0261888352860096, + "grad_norm": 34.72412109375, + "learning_rate": 5.761808663473775e-08, + "logits/chosen": -2.1149559020996094, + "logits/rejected": -2.0760014057159424, + "logps/chosen": -237.0303192138672, + "logps/rejected": -312.3663635253906, + "loss": 0.5381, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.812771201133728, + "rewards/margins": 0.7852569222450256, + "rewards/rejected": -2.5980281829833984, + "step": 11760 + }, + { + "epoch": 2.0279117849758785, + "grad_norm": 30.311351776123047, + "learning_rate": 5.74365836974959e-08, + "logits/chosen": -2.0622406005859375, + "logits/rejected": -2.0181469917297363, + "logps/chosen": -227.7604217529297, + "logps/rejected": -292.30255126953125, + "loss": 0.5248, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6945585012435913, + "rewards/margins": 0.715693473815918, + "rewards/rejected": -2.410252094268799, + "step": 11770 + }, + { + "epoch": 2.029634734665748, + "grad_norm": 45.21958923339844, + "learning_rate": 5.7255251839798726e-08, + "logits/chosen": -2.038755178451538, + "logits/rejected": -2.0028738975524902, + "logps/chosen": -227.783203125, + "logps/rejected": -288.4049072265625, + "loss": 0.5598, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7753404378890991, + "rewards/margins": 0.5917962789535522, + "rewards/rejected": -2.3671367168426514, + "step": 11780 + }, + { + "epoch": 2.031357684355617, + "grad_norm": 36.97776412963867, + "learning_rate": 5.7074091790492206e-08, + "logits/chosen": -2.0641019344329834, + "logits/rejected": -2.0340189933776855, + "logps/chosen": -220.64047241210938, + "logps/rejected": -322.8470153808594, + "loss": 0.425, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7013603448867798, + "rewards/margins": 0.9990695714950562, + "rewards/rejected": -2.700429916381836, + "step": 11790 + }, + { + "epoch": 2.033080634045486, + "grad_norm": 34.968109130859375, + "learning_rate": 5.6893104277731594e-08, + "logits/chosen": -2.077822685241699, + "logits/rejected": -2.0449321269989014, + "logps/chosen": -257.3918762207031, + "logps/rejected": -310.2939453125, + "loss": 0.5721, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9884878396987915, + "rewards/margins": 0.5862862467765808, + "rewards/rejected": -2.5747742652893066, + "step": 11800 + }, + { + "epoch": 2.034803583735355, + "grad_norm": 46.57944869995117, + "learning_rate": 5.6712290028978815e-08, + "logits/chosen": -2.075214385986328, + "logits/rejected": -2.031886577606201, + "logps/chosen": -256.56695556640625, + "logps/rejected": -326.1233215332031, + "loss": 0.5368, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.995705008506775, + "rewards/margins": 0.7450501322746277, + "rewards/rejected": -2.740755558013916, + "step": 11810 + }, + { + "epoch": 2.036526533425224, + "grad_norm": 32.89664077758789, + "learning_rate": 5.653164977099921e-08, + "logits/chosen": -2.0899271965026855, + "logits/rejected": -2.056302309036255, + "logps/chosen": -241.9139862060547, + "logps/rejected": -305.611083984375, + "loss": 0.5702, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.9050085544586182, + "rewards/margins": 0.6445132493972778, + "rewards/rejected": -2.5495216846466064, + "step": 11820 + }, + { + "epoch": 2.0382494831150932, + "grad_norm": 49.7681999206543, + "learning_rate": 5.635118422985896e-08, + "logits/chosen": -2.0551645755767822, + "logits/rejected": -2.027526378631592, + "logps/chosen": -226.86361694335938, + "logps/rejected": -297.20831298828125, + "loss": 0.5262, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7415897846221924, + "rewards/margins": 0.6913633942604065, + "rewards/rejected": -2.432953119277954, + "step": 11830 + }, + { + "epoch": 2.039972432804962, + "grad_norm": 34.186309814453125, + "learning_rate": 5.61708941309218e-08, + "logits/chosen": -2.0613903999328613, + "logits/rejected": -2.0164942741394043, + "logps/chosen": -239.87210083007812, + "logps/rejected": -312.2968444824219, + "loss": 0.5158, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.8667516708374023, + "rewards/margins": 0.7428416609764099, + "rewards/rejected": -2.609593152999878, + "step": 11840 + }, + { + "epoch": 2.041695382494831, + "grad_norm": 39.7913703918457, + "learning_rate": 5.5990780198846435e-08, + "logits/chosen": -2.0198912620544434, + "logits/rejected": -1.9685665369033813, + "logps/chosen": -253.1337890625, + "logps/rejected": -317.5090026855469, + "loss": 0.5376, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.9773327112197876, + "rewards/margins": 0.7132784724235535, + "rewards/rejected": -2.6906113624572754, + "step": 11850 + }, + { + "epoch": 2.0434183321847, + "grad_norm": 56.37113571166992, + "learning_rate": 5.581084315758351e-08, + "logits/chosen": -2.093982696533203, + "logits/rejected": -2.075417995452881, + "logps/chosen": -268.6842346191406, + "logps/rejected": -316.57073974609375, + "loss": 0.6342, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.1388251781463623, + "rewards/margins": 0.4565099775791168, + "rewards/rejected": -2.595335006713867, + "step": 11860 + }, + { + "epoch": 2.045141281874569, + "grad_norm": 42.2559700012207, + "learning_rate": 5.563108373037243e-08, + "logits/chosen": -2.0415985584259033, + "logits/rejected": -2.0136475563049316, + "logps/chosen": -233.3759002685547, + "logps/rejected": -308.192626953125, + "loss": 0.5188, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8376801013946533, + "rewards/margins": 0.7106243371963501, + "rewards/rejected": -2.548304796218872, + "step": 11870 + }, + { + "epoch": 2.0468642315644385, + "grad_norm": 45.246246337890625, + "learning_rate": 5.545150263973897e-08, + "logits/chosen": -2.063602924346924, + "logits/rejected": -2.015317916870117, + "logps/chosen": -246.6908721923828, + "logps/rejected": -325.9163513183594, + "loss": 0.4574, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8842356204986572, + "rewards/margins": 0.8306552767753601, + "rewards/rejected": -2.714890718460083, + "step": 11880 + }, + { + "epoch": 2.0485871812543075, + "grad_norm": 43.25898742675781, + "learning_rate": 5.527210060749201e-08, + "logits/chosen": -2.0884766578674316, + "logits/rejected": -2.0456490516662598, + "logps/chosen": -256.9544982910156, + "logps/rejected": -329.82989501953125, + "loss": 0.5009, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9893547296524048, + "rewards/margins": 0.7906600832939148, + "rewards/rejected": -2.780014753341675, + "step": 11890 + }, + { + "epoch": 2.0503101309441765, + "grad_norm": 48.36300277709961, + "learning_rate": 5.509287835472067e-08, + "logits/chosen": -2.000892162322998, + "logits/rejected": -1.9563884735107422, + "logps/chosen": -238.4857177734375, + "logps/rejected": -315.6385192871094, + "loss": 0.5189, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8447710275650024, + "rewards/margins": 0.8057880401611328, + "rewards/rejected": -2.6505589485168457, + "step": 11900 + }, + { + "epoch": 2.0520330806340454, + "grad_norm": 30.14019012451172, + "learning_rate": 5.4913836601791497e-08, + "logits/chosen": -2.1544063091278076, + "logits/rejected": -2.138747215270996, + "logps/chosen": -231.66281127929688, + "logps/rejected": -307.263916015625, + "loss": 0.5387, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7832281589508057, + "rewards/margins": 0.7166194915771484, + "rewards/rejected": -2.499847888946533, + "step": 11910 + }, + { + "epoch": 2.0537560303239144, + "grad_norm": 38.02259063720703, + "learning_rate": 5.473497606834554e-08, + "logits/chosen": -2.119105100631714, + "logits/rejected": -2.089402437210083, + "logps/chosen": -230.0631866455078, + "logps/rejected": -290.7784423828125, + "loss": 0.5544, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7121751308441162, + "rewards/margins": 0.6320434212684631, + "rewards/rejected": -2.3442187309265137, + "step": 11920 + }, + { + "epoch": 2.055478980013784, + "grad_norm": 31.72987174987793, + "learning_rate": 5.4556297473295496e-08, + "logits/chosen": -2.1159067153930664, + "logits/rejected": -2.081695079803467, + "logps/chosen": -210.96792602539062, + "logps/rejected": -279.8138732910156, + "loss": 0.5045, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5741071701049805, + "rewards/margins": 0.67446368932724, + "rewards/rejected": -2.2485709190368652, + "step": 11930 + }, + { + "epoch": 2.057201929703653, + "grad_norm": 29.604206085205078, + "learning_rate": 5.4377801534822676e-08, + "logits/chosen": -2.078176975250244, + "logits/rejected": -2.032797336578369, + "logps/chosen": -203.94651794433594, + "logps/rejected": -282.87371826171875, + "loss": 0.4724, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4840799570083618, + "rewards/margins": 0.8179097175598145, + "rewards/rejected": -2.301989793777466, + "step": 11940 + }, + { + "epoch": 2.0589248793935218, + "grad_norm": 39.70060348510742, + "learning_rate": 5.419948897037436e-08, + "logits/chosen": -2.1387264728546143, + "logits/rejected": -2.099928617477417, + "logps/chosen": -237.4861602783203, + "logps/rejected": -289.05340576171875, + "loss": 0.5961, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7850711345672607, + "rewards/margins": 0.5495141744613647, + "rewards/rejected": -2.334585189819336, + "step": 11950 + }, + { + "epoch": 2.0606478290833907, + "grad_norm": 47.06940460205078, + "learning_rate": 5.4021360496660614e-08, + "logits/chosen": -2.024587869644165, + "logits/rejected": -1.9838438034057617, + "logps/chosen": -224.7746124267578, + "logps/rejected": -310.7027893066406, + "loss": 0.4805, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7129604816436768, + "rewards/margins": 0.8761369585990906, + "rewards/rejected": -2.589097499847412, + "step": 11960 + }, + { + "epoch": 2.0623707787732597, + "grad_norm": 31.619342803955078, + "learning_rate": 5.3843416829651713e-08, + "logits/chosen": -2.1144022941589355, + "logits/rejected": -2.0715739727020264, + "logps/chosen": -232.4583740234375, + "logps/rejected": -300.1246032714844, + "loss": 0.5358, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7532933950424194, + "rewards/margins": 0.72503262758255, + "rewards/rejected": -2.478325605392456, + "step": 11970 + }, + { + "epoch": 2.0640937284631287, + "grad_norm": 36.4967041015625, + "learning_rate": 5.3665658684574975e-08, + "logits/chosen": -2.0988223552703857, + "logits/rejected": -2.049405097961426, + "logps/chosen": -223.54940795898438, + "logps/rejected": -293.61090087890625, + "loss": 0.4992, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6739461421966553, + "rewards/margins": 0.7265938520431519, + "rewards/rejected": -2.4005398750305176, + "step": 11980 + }, + { + "epoch": 2.065816678152998, + "grad_norm": 32.14418029785156, + "learning_rate": 5.348808677591222e-08, + "logits/chosen": -2.153207302093506, + "logits/rejected": -2.098644495010376, + "logps/chosen": -228.08499145507812, + "logps/rejected": -290.0260009765625, + "loss": 0.5217, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6931899785995483, + "rewards/margins": 0.6990076303482056, + "rewards/rejected": -2.392197847366333, + "step": 11990 + }, + { + "epoch": 2.067539627842867, + "grad_norm": 30.61179542541504, + "learning_rate": 5.331070181739654e-08, + "logits/chosen": -2.0519518852233887, + "logits/rejected": -2.02451753616333, + "logps/chosen": -229.9219512939453, + "logps/rejected": -305.4267578125, + "loss": 0.5302, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.780832290649414, + "rewards/margins": 0.7345147728919983, + "rewards/rejected": -2.5153470039367676, + "step": 12000 + }, + { + "epoch": 2.067539627842867, + "eval_logits/chosen": -2.1466264724731445, + "eval_logits/rejected": -2.125704050064087, + "eval_logps/chosen": -229.6514892578125, + "eval_logps/rejected": -265.5969543457031, + "eval_loss": 0.641311764717102, + "eval_rewards/accuracies": 0.6291821599006653, + "eval_rewards/chosen": -1.706360101699829, + "eval_rewards/margins": 0.32211339473724365, + "eval_rewards/rejected": -2.028473377227783, + "eval_runtime": 382.9749, + "eval_samples_per_second": 11.238, + "eval_steps_per_second": 1.405, + "step": 12000 + }, + { + "epoch": 2.069262577532736, + "grad_norm": 38.363101959228516, + "learning_rate": 5.313350452200962e-08, + "logits/chosen": -2.1067209243774414, + "logits/rejected": -2.0558550357818604, + "logps/chosen": -243.1296844482422, + "logps/rejected": -316.4700622558594, + "loss": 0.5338, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8736377954483032, + "rewards/margins": 0.7471028566360474, + "rewards/rejected": -2.6207406520843506, + "step": 12010 + }, + { + "epoch": 2.070985527222605, + "grad_norm": 43.54338455200195, + "learning_rate": 5.295649560197895e-08, + "logits/chosen": -2.1059651374816895, + "logits/rejected": -2.0664753913879395, + "logps/chosen": -233.4269561767578, + "logps/rejected": -297.7746887207031, + "loss": 0.5522, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.8017711639404297, + "rewards/margins": 0.6565322279930115, + "rewards/rejected": -2.458303451538086, + "step": 12020 + }, + { + "epoch": 2.072708476912474, + "grad_norm": 54.19285583496094, + "learning_rate": 5.27796757687748e-08, + "logits/chosen": -2.1758646965026855, + "logits/rejected": -2.144629716873169, + "logps/chosen": -212.6887664794922, + "logps/rejected": -268.6290588378906, + "loss": 0.5845, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.625186562538147, + "rewards/margins": 0.5565309524536133, + "rewards/rejected": -2.1817173957824707, + "step": 12030 + }, + { + "epoch": 2.0744314266023434, + "grad_norm": 33.641727447509766, + "learning_rate": 5.260304573310743e-08, + "logits/chosen": -2.0929903984069824, + "logits/rejected": -2.04295015335083, + "logps/chosen": -227.9827117919922, + "logps/rejected": -301.3453063964844, + "loss": 0.5225, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7123521566390991, + "rewards/margins": 0.7900432348251343, + "rewards/rejected": -2.5023951530456543, + "step": 12040 + }, + { + "epoch": 2.0761543762922123, + "grad_norm": 41.476985931396484, + "learning_rate": 5.242660620492416e-08, + "logits/chosen": -2.1373560428619385, + "logits/rejected": -2.0753097534179688, + "logps/chosen": -226.6936492919922, + "logps/rejected": -305.44647216796875, + "loss": 0.4736, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7126344442367554, + "rewards/margins": 0.8372985124588013, + "rewards/rejected": -2.5499329566955566, + "step": 12050 + }, + { + "epoch": 2.0778773259820813, + "grad_norm": 41.41499328613281, + "learning_rate": 5.2250357893406703e-08, + "logits/chosen": -2.0900511741638184, + "logits/rejected": -2.05121111869812, + "logps/chosen": -239.5316162109375, + "logps/rejected": -307.7794494628906, + "loss": 0.5373, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8286758661270142, + "rewards/margins": 0.7210630178451538, + "rewards/rejected": -2.549738883972168, + "step": 12060 + }, + { + "epoch": 2.0796002756719503, + "grad_norm": 42.31587219238281, + "learning_rate": 5.2074301506968165e-08, + "logits/chosen": -2.0873608589172363, + "logits/rejected": -2.029822587966919, + "logps/chosen": -233.84033203125, + "logps/rejected": -314.54278564453125, + "loss": 0.4958, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7759023904800415, + "rewards/margins": 0.8784404993057251, + "rewards/rejected": -2.6543426513671875, + "step": 12070 + }, + { + "epoch": 2.0813232253618192, + "grad_norm": 66.47350311279297, + "learning_rate": 5.189843775325018e-08, + "logits/chosen": -2.0308492183685303, + "logits/rejected": -1.979935646057129, + "logps/chosen": -232.94021606445312, + "logps/rejected": -327.6007080078125, + "loss": 0.4672, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7581536769866943, + "rewards/margins": 0.9758221507072449, + "rewards/rejected": -2.733975887298584, + "step": 12080 + }, + { + "epoch": 2.0830461750516887, + "grad_norm": 53.56596374511719, + "learning_rate": 5.172276733912009e-08, + "logits/chosen": -2.0624167919158936, + "logits/rejected": -2.0192716121673584, + "logps/chosen": -261.79852294921875, + "logps/rejected": -325.7588806152344, + "loss": 0.5509, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0378787517547607, + "rewards/margins": 0.6984132528305054, + "rewards/rejected": -2.7362923622131348, + "step": 12090 + }, + { + "epoch": 2.0847691247415576, + "grad_norm": 48.26594543457031, + "learning_rate": 5.1547290970668243e-08, + "logits/chosen": -2.052945613861084, + "logits/rejected": -2.0171191692352295, + "logps/chosen": -244.5965118408203, + "logps/rejected": -308.4600830078125, + "loss": 0.5303, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.859833002090454, + "rewards/margins": 0.7063957452774048, + "rewards/rejected": -2.5662283897399902, + "step": 12100 + }, + { + "epoch": 2.0864920744314266, + "grad_norm": 33.33226013183594, + "learning_rate": 5.13720093532049e-08, + "logits/chosen": -2.085258960723877, + "logits/rejected": -2.0338473320007324, + "logps/chosen": -251.0377655029297, + "logps/rejected": -320.9676818847656, + "loss": 0.5262, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.9285335540771484, + "rewards/margins": 0.7524645328521729, + "rewards/rejected": -2.6809983253479004, + "step": 12110 + }, + { + "epoch": 2.0882150241212956, + "grad_norm": 51.34297561645508, + "learning_rate": 5.1196923191257654e-08, + "logits/chosen": -2.0758512020111084, + "logits/rejected": -2.0404000282287598, + "logps/chosen": -242.778076171875, + "logps/rejected": -309.3258056640625, + "loss": 0.5513, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.8604662418365479, + "rewards/margins": 0.7200610637664795, + "rewards/rejected": -2.5805275440216064, + "step": 12120 + }, + { + "epoch": 2.0899379738111645, + "grad_norm": 29.25992202758789, + "learning_rate": 5.102203318856847e-08, + "logits/chosen": -2.0763731002807617, + "logits/rejected": -2.0236387252807617, + "logps/chosen": -230.5070037841797, + "logps/rejected": -313.5376892089844, + "loss": 0.4763, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7576888799667358, + "rewards/margins": 0.850530743598938, + "rewards/rejected": -2.608219623565674, + "step": 12130 + }, + { + "epoch": 2.091660923501034, + "grad_norm": 49.207420349121094, + "learning_rate": 5.084734004809079e-08, + "logits/chosen": -2.1137688159942627, + "logits/rejected": -2.0833239555358887, + "logps/chosen": -229.6987762451172, + "logps/rejected": -289.147216796875, + "loss": 0.5833, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7667906284332275, + "rewards/margins": 0.5884928703308105, + "rewards/rejected": -2.355283498764038, + "step": 12140 + }, + { + "epoch": 2.093383873190903, + "grad_norm": 109.75863647460938, + "learning_rate": 5.0672844471986806e-08, + "logits/chosen": -2.1269781589508057, + "logits/rejected": -2.09053373336792, + "logps/chosen": -212.83584594726562, + "logps/rejected": -284.8175964355469, + "loss": 0.546, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5835081338882446, + "rewards/margins": 0.7448925971984863, + "rewards/rejected": -2.3284006118774414, + "step": 12150 + }, + { + "epoch": 2.095106822880772, + "grad_norm": 46.11418151855469, + "learning_rate": 5.049854716162469e-08, + "logits/chosen": -2.0724129676818848, + "logits/rejected": -2.0280184745788574, + "logps/chosen": -211.89907836914062, + "logps/rejected": -272.09698486328125, + "loss": 0.5479, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5520589351654053, + "rewards/margins": 0.6403037309646606, + "rewards/rejected": -2.1923625469207764, + "step": 12160 + }, + { + "epoch": 2.096829772570641, + "grad_norm": 30.395431518554688, + "learning_rate": 5.032444881757575e-08, + "logits/chosen": -2.0958447456359863, + "logits/rejected": -2.058408260345459, + "logps/chosen": -217.18734741210938, + "logps/rejected": -282.3731384277344, + "loss": 0.5525, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6155513525009155, + "rewards/margins": 0.6562166213989258, + "rewards/rejected": -2.271768093109131, + "step": 12170 + }, + { + "epoch": 2.09855272226051, + "grad_norm": 30.15849494934082, + "learning_rate": 5.015055013961129e-08, + "logits/chosen": -2.0267977714538574, + "logits/rejected": -1.9941260814666748, + "logps/chosen": -218.51113891601562, + "logps/rejected": -305.34735107421875, + "loss": 0.4984, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.692307472229004, + "rewards/margins": 0.7915050387382507, + "rewards/rejected": -2.4838123321533203, + "step": 12180 + }, + { + "epoch": 2.1002756719503792, + "grad_norm": 33.532596588134766, + "learning_rate": 4.9976851826700385e-08, + "logits/chosen": -2.068605661392212, + "logits/rejected": -2.0211539268493652, + "logps/chosen": -222.3265380859375, + "logps/rejected": -281.1471252441406, + "loss": 0.5434, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6454464197158813, + "rewards/margins": 0.6401292085647583, + "rewards/rejected": -2.2855758666992188, + "step": 12190 + }, + { + "epoch": 2.101998621640248, + "grad_norm": 51.70030212402344, + "learning_rate": 4.980335457700665e-08, + "logits/chosen": -2.1316757202148438, + "logits/rejected": -2.0937085151672363, + "logps/chosen": -236.2337646484375, + "logps/rejected": -304.72802734375, + "loss": 0.5122, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7878990173339844, + "rewards/margins": 0.7227575182914734, + "rewards/rejected": -2.5106565952301025, + "step": 12200 + }, + { + "epoch": 2.103721571330117, + "grad_norm": 41.22138595581055, + "learning_rate": 4.963005908788547e-08, + "logits/chosen": -2.0903918743133545, + "logits/rejected": -2.0646119117736816, + "logps/chosen": -237.90762329101562, + "logps/rejected": -295.3863830566406, + "loss": 0.5607, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.829699158668518, + "rewards/margins": 0.6166217923164368, + "rewards/rejected": -2.4463210105895996, + "step": 12210 + }, + { + "epoch": 2.105444521019986, + "grad_norm": 57.86821746826172, + "learning_rate": 4.945696605588143e-08, + "logits/chosen": -2.0225930213928223, + "logits/rejected": -1.9905112981796265, + "logps/chosen": -229.29971313476562, + "logps/rejected": -291.74102783203125, + "loss": 0.5468, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7564723491668701, + "rewards/margins": 0.6317520141601562, + "rewards/rejected": -2.3882243633270264, + "step": 12220 + }, + { + "epoch": 2.107167470709855, + "grad_norm": 55.78356170654297, + "learning_rate": 4.928407617672519e-08, + "logits/chosen": -2.09854793548584, + "logits/rejected": -2.0631284713745117, + "logps/chosen": -224.0506591796875, + "logps/rejected": -292.0597839355469, + "loss": 0.5564, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.7134735584259033, + "rewards/margins": 0.7068896293640137, + "rewards/rejected": -2.420363664627075, + "step": 12230 + }, + { + "epoch": 2.1088904203997245, + "grad_norm": 46.384521484375, + "learning_rate": 4.911139014533099e-08, + "logits/chosen": -2.0313241481781006, + "logits/rejected": -1.9866256713867188, + "logps/chosen": -211.84573364257812, + "logps/rejected": -284.09539794921875, + "loss": 0.5232, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5812326669692993, + "rewards/margins": 0.743860125541687, + "rewards/rejected": -2.3250927925109863, + "step": 12240 + }, + { + "epoch": 2.1106133700895935, + "grad_norm": 59.478694915771484, + "learning_rate": 4.893890865579362e-08, + "logits/chosen": -2.075040817260742, + "logits/rejected": -2.039846658706665, + "logps/chosen": -221.32901000976562, + "logps/rejected": -269.36419677734375, + "loss": 0.6039, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6808462142944336, + "rewards/margins": 0.5116840600967407, + "rewards/rejected": -2.1925301551818848, + "step": 12250 + }, + { + "epoch": 2.1123363197794625, + "grad_norm": 45.6484375, + "learning_rate": 4.8766632401385856e-08, + "logits/chosen": -2.0814313888549805, + "logits/rejected": -2.0419795513153076, + "logps/chosen": -213.54916381835938, + "logps/rejected": -276.7439880371094, + "loss": 0.5496, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.599725604057312, + "rewards/margins": 0.6187621355056763, + "rewards/rejected": -2.2184877395629883, + "step": 12260 + }, + { + "epoch": 2.1140592694693314, + "grad_norm": 48.90853500366211, + "learning_rate": 4.859456207455539e-08, + "logits/chosen": -2.153773069381714, + "logits/rejected": -2.1088504791259766, + "logps/chosen": -224.59121704101562, + "logps/rejected": -293.13031005859375, + "loss": 0.5484, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6861425638198853, + "rewards/margins": 0.7158703804016113, + "rewards/rejected": -2.402012825012207, + "step": 12270 + }, + { + "epoch": 2.1157822191592004, + "grad_norm": 32.83452606201172, + "learning_rate": 4.842269836692239e-08, + "logits/chosen": -2.0922152996063232, + "logits/rejected": -2.051940441131592, + "logps/chosen": -226.1486358642578, + "logps/rejected": -295.8097839355469, + "loss": 0.5137, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6863266229629517, + "rewards/margins": 0.7510272860527039, + "rewards/rejected": -2.4373538494110107, + "step": 12280 + }, + { + "epoch": 2.11750516884907, + "grad_norm": 58.551639556884766, + "learning_rate": 4.8251041969276355e-08, + "logits/chosen": -2.034183979034424, + "logits/rejected": -1.9952113628387451, + "logps/chosen": -225.10250854492188, + "logps/rejected": -283.5815124511719, + "loss": 0.5476, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.703881025314331, + "rewards/margins": 0.6042089462280273, + "rewards/rejected": -2.3080897331237793, + "step": 12290 + }, + { + "epoch": 2.1192281185389388, + "grad_norm": 46.86976623535156, + "learning_rate": 4.8079593571573654e-08, + "logits/chosen": -2.0990262031555176, + "logits/rejected": -2.052614688873291, + "logps/chosen": -230.6024169921875, + "logps/rejected": -292.49884033203125, + "loss": 0.5279, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.749632477760315, + "rewards/margins": 0.6554352045059204, + "rewards/rejected": -2.4050679206848145, + "step": 12300 + }, + { + "epoch": 2.1209510682288077, + "grad_norm": 36.16896438598633, + "learning_rate": 4.7908353862934645e-08, + "logits/chosen": -2.0593631267547607, + "logits/rejected": -2.006453037261963, + "logps/chosen": -230.8593292236328, + "logps/rejected": -307.69720458984375, + "loss": 0.5149, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.742922067642212, + "rewards/margins": 0.8167440295219421, + "rewards/rejected": -2.5596659183502197, + "step": 12310 + }, + { + "epoch": 2.1226740179186767, + "grad_norm": 37.408653259277344, + "learning_rate": 4.773732353164069e-08, + "logits/chosen": -2.054842472076416, + "logits/rejected": -2.0034537315368652, + "logps/chosen": -228.54055786132812, + "logps/rejected": -325.37860107421875, + "loss": 0.4833, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7277143001556396, + "rewards/margins": 0.9802853465080261, + "rewards/rejected": -2.7079997062683105, + "step": 12320 + }, + { + "epoch": 2.1243969676085457, + "grad_norm": 35.757171630859375, + "learning_rate": 4.756650326513175e-08, + "logits/chosen": -2.073939085006714, + "logits/rejected": -2.0456321239471436, + "logps/chosen": -239.0164031982422, + "logps/rejected": -308.36529541015625, + "loss": 0.5265, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8104842901229858, + "rewards/margins": 0.7307012677192688, + "rewards/rejected": -2.5411856174468994, + "step": 12330 + }, + { + "epoch": 2.126119917298415, + "grad_norm": 32.52260971069336, + "learning_rate": 4.739589375000345e-08, + "logits/chosen": -2.076432466506958, + "logits/rejected": -2.0272200107574463, + "logps/chosen": -246.3824920654297, + "logps/rejected": -331.07965087890625, + "loss": 0.4841, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.9130935668945312, + "rewards/margins": 0.8666901588439941, + "rewards/rejected": -2.7797837257385254, + "step": 12340 + }, + { + "epoch": 2.127842866988284, + "grad_norm": 44.25779342651367, + "learning_rate": 4.722549567200423e-08, + "logits/chosen": -1.996809959411621, + "logits/rejected": -1.9381519556045532, + "logps/chosen": -259.73126220703125, + "logps/rejected": -331.24725341796875, + "loss": 0.5365, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.027064561843872, + "rewards/margins": 0.7794581651687622, + "rewards/rejected": -2.806522846221924, + "step": 12350 + }, + { + "epoch": 2.129565816678153, + "grad_norm": 41.06666564941406, + "learning_rate": 4.70553097160327e-08, + "logits/chosen": -2.079986095428467, + "logits/rejected": -2.0405890941619873, + "logps/chosen": -229.076904296875, + "logps/rejected": -316.8277282714844, + "loss": 0.4616, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7490637302398682, + "rewards/margins": 0.9058129191398621, + "rewards/rejected": -2.654876708984375, + "step": 12360 + }, + { + "epoch": 2.131288766368022, + "grad_norm": 35.618194580078125, + "learning_rate": 4.6885336566134905e-08, + "logits/chosen": -2.060807466506958, + "logits/rejected": -2.0051631927490234, + "logps/chosen": -245.77291870117188, + "logps/rejected": -335.7901306152344, + "loss": 0.4906, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.921326994895935, + "rewards/margins": 0.9444554448127747, + "rewards/rejected": -2.8657822608947754, + "step": 12370 + }, + { + "epoch": 2.133011716057891, + "grad_norm": 43.511146545410156, + "learning_rate": 4.671557690550158e-08, + "logits/chosen": -2.0528554916381836, + "logits/rejected": -2.024606704711914, + "logps/chosen": -244.91891479492188, + "logps/rejected": -336.2535400390625, + "loss": 0.4881, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9351543188095093, + "rewards/margins": 0.8718324899673462, + "rewards/rejected": -2.8069872856140137, + "step": 12380 + }, + { + "epoch": 2.13473466574776, + "grad_norm": 38.06670379638672, + "learning_rate": 4.65460314164652e-08, + "logits/chosen": -2.140407085418701, + "logits/rejected": -2.1023502349853516, + "logps/chosen": -248.9733123779297, + "logps/rejected": -341.685546875, + "loss": 0.4843, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.9468520879745483, + "rewards/margins": 0.9342899322509766, + "rewards/rejected": -2.8811416625976562, + "step": 12390 + }, + { + "epoch": 2.1364576154376294, + "grad_norm": 31.036680221557617, + "learning_rate": 4.637670078049759e-08, + "logits/chosen": -2.056527614593506, + "logits/rejected": -2.023624897003174, + "logps/chosen": -274.3301086425781, + "logps/rejected": -358.85400390625, + "loss": 0.4961, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1827850341796875, + "rewards/margins": 0.8480589985847473, + "rewards/rejected": -3.03084397315979, + "step": 12400 + }, + { + "epoch": 2.1364576154376294, + "eval_logits/chosen": -2.11784029006958, + "eval_logits/rejected": -2.09578013420105, + "eval_logps/chosen": -259.76959228515625, + "eval_logps/rejected": -299.8690185546875, + "eval_loss": 0.6474471688270569, + "eval_rewards/accuracies": 0.6387081742286682, + "eval_rewards/chosen": -2.0075414180755615, + "eval_rewards/margins": 0.36365291476249695, + "eval_rewards/rejected": -2.3711941242218018, + "eval_runtime": 383.2126, + "eval_samples_per_second": 11.231, + "eval_steps_per_second": 1.404, + "step": 12400 + }, + { + "epoch": 2.1381805651274983, + "grad_norm": 44.605613708496094, + "learning_rate": 4.620758567820686e-08, + "logits/chosen": -2.03633975982666, + "logits/rejected": -1.9880211353302002, + "logps/chosen": -259.0669250488281, + "logps/rejected": -327.37298583984375, + "loss": 0.5481, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.0398917198181152, + "rewards/margins": 0.7153648138046265, + "rewards/rejected": -2.7552566528320312, + "step": 12410 + }, + { + "epoch": 2.1399035148173673, + "grad_norm": 69.91634368896484, + "learning_rate": 4.60386867893348e-08, + "logits/chosen": -1.9803155660629272, + "logits/rejected": -1.94023859500885, + "logps/chosen": -263.7940368652344, + "logps/rejected": -339.0141906738281, + "loss": 0.5273, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.0752742290496826, + "rewards/margins": 0.7945727109909058, + "rewards/rejected": -2.869846820831299, + "step": 12420 + }, + { + "epoch": 2.1416264645072363, + "grad_norm": 47.40889358520508, + "learning_rate": 4.5870004792754257e-08, + "logits/chosen": -2.044445514678955, + "logits/rejected": -2.0063881874084473, + "logps/chosen": -260.7670593261719, + "logps/rejected": -335.7140808105469, + "loss": 0.5264, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.047086477279663, + "rewards/margins": 0.7734764814376831, + "rewards/rejected": -2.8205630779266357, + "step": 12430 + }, + { + "epoch": 2.1433494141971057, + "grad_norm": 30.57667350769043, + "learning_rate": 4.570154036646625e-08, + "logits/chosen": -2.0202064514160156, + "logits/rejected": -1.9847602844238281, + "logps/chosen": -236.6773223876953, + "logps/rejected": -304.9550476074219, + "loss": 0.5395, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8218532800674438, + "rewards/margins": 0.7106314897537231, + "rewards/rejected": -2.532484531402588, + "step": 12440 + }, + { + "epoch": 2.1450723638869746, + "grad_norm": 53.94111251831055, + "learning_rate": 4.553329418759726e-08, + "logits/chosen": -2.0629711151123047, + "logits/rejected": -2.032637357711792, + "logps/chosen": -234.65023803710938, + "logps/rejected": -299.62969970703125, + "loss": 0.5676, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8310039043426514, + "rewards/margins": 0.6646952033042908, + "rewards/rejected": -2.495698928833008, + "step": 12450 + }, + { + "epoch": 2.1467953135768436, + "grad_norm": 44.45255661010742, + "learning_rate": 4.5365266932396526e-08, + "logits/chosen": -2.111095666885376, + "logits/rejected": -2.0713887214660645, + "logps/chosen": -234.33627319335938, + "logps/rejected": -308.1429443359375, + "loss": 0.5145, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7842357158660889, + "rewards/margins": 0.7793278694152832, + "rewards/rejected": -2.563563585281372, + "step": 12460 + }, + { + "epoch": 2.1485182632667126, + "grad_norm": 53.02629089355469, + "learning_rate": 4.519745927623344e-08, + "logits/chosen": -2.1693835258483887, + "logits/rejected": -2.1237571239471436, + "logps/chosen": -225.2732696533203, + "logps/rejected": -309.4500427246094, + "loss": 0.4799, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6921237707138062, + "rewards/margins": 0.84943026304245, + "rewards/rejected": -2.5415542125701904, + "step": 12470 + }, + { + "epoch": 2.1502412129565815, + "grad_norm": 42.24469757080078, + "learning_rate": 4.5029871893594695e-08, + "logits/chosen": -2.152308464050293, + "logits/rejected": -2.1171212196350098, + "logps/chosen": -230.6731414794922, + "logps/rejected": -296.44952392578125, + "loss": 0.528, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7500979900360107, + "rewards/margins": 0.6756486892700195, + "rewards/rejected": -2.4257466793060303, + "step": 12480 + }, + { + "epoch": 2.1519641626464505, + "grad_norm": 78.8714370727539, + "learning_rate": 4.486250545808159e-08, + "logits/chosen": -2.0349388122558594, + "logits/rejected": -1.998011827468872, + "logps/chosen": -241.18063354492188, + "logps/rejected": -319.050537109375, + "loss": 0.5447, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8526427745819092, + "rewards/margins": 0.8055623769760132, + "rewards/rejected": -2.6582047939300537, + "step": 12490 + }, + { + "epoch": 2.15368711233632, + "grad_norm": 48.69804000854492, + "learning_rate": 4.469536064240731e-08, + "logits/chosen": -2.075000286102295, + "logits/rejected": -2.0368382930755615, + "logps/chosen": -246.09445190429688, + "logps/rejected": -324.2549743652344, + "loss": 0.5345, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.915993332862854, + "rewards/margins": 0.801496148109436, + "rewards/rejected": -2.717489242553711, + "step": 12500 + }, + { + "epoch": 2.155410062026189, + "grad_norm": 35.36015701293945, + "learning_rate": 4.452843811839435e-08, + "logits/chosen": -2.120659112930298, + "logits/rejected": -2.087756872177124, + "logps/chosen": -231.48532104492188, + "logps/rejected": -295.49078369140625, + "loss": 0.5469, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7738559246063232, + "rewards/margins": 0.6695488691329956, + "rewards/rejected": -2.4434046745300293, + "step": 12510 + }, + { + "epoch": 2.157133011716058, + "grad_norm": 43.532100677490234, + "learning_rate": 4.436173855697174e-08, + "logits/chosen": -2.055793046951294, + "logits/rejected": -2.0177674293518066, + "logps/chosen": -232.77920532226562, + "logps/rejected": -294.95440673828125, + "loss": 0.5566, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7898495197296143, + "rewards/margins": 0.6242788434028625, + "rewards/rejected": -2.414128303527832, + "step": 12520 + }, + { + "epoch": 2.158855961405927, + "grad_norm": 47.047569274902344, + "learning_rate": 4.4195262628172224e-08, + "logits/chosen": -2.0752203464508057, + "logits/rejected": -2.0413734912872314, + "logps/chosen": -233.0418701171875, + "logps/rejected": -306.05621337890625, + "loss": 0.5037, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7798078060150146, + "rewards/margins": 0.7302197217941284, + "rewards/rejected": -2.5100274085998535, + "step": 12530 + }, + { + "epoch": 2.160578911095796, + "grad_norm": 35.58103561401367, + "learning_rate": 4.402901100112972e-08, + "logits/chosen": -2.072754144668579, + "logits/rejected": -2.0337443351745605, + "logps/chosen": -233.8501739501953, + "logps/rejected": -298.0557556152344, + "loss": 0.5718, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7930688858032227, + "rewards/margins": 0.6550329923629761, + "rewards/rejected": -2.448101758956909, + "step": 12540 + }, + { + "epoch": 2.162301860785665, + "grad_norm": 50.7521858215332, + "learning_rate": 4.386298434407666e-08, + "logits/chosen": -2.1599698066711426, + "logits/rejected": -2.133126735687256, + "logps/chosen": -221.6575469970703, + "logps/rejected": -277.86492919921875, + "loss": 0.5649, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6921859979629517, + "rewards/margins": 0.5701245069503784, + "rewards/rejected": -2.26231050491333, + "step": 12550 + }, + { + "epoch": 2.164024810475534, + "grad_norm": 44.69491195678711, + "learning_rate": 4.369718332434109e-08, + "logits/chosen": -2.110529661178589, + "logits/rejected": -2.0664124488830566, + "logps/chosen": -224.39920043945312, + "logps/rejected": -274.45062255859375, + "loss": 0.573, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6608068943023682, + "rewards/margins": 0.5945785045623779, + "rewards/rejected": -2.255385637283325, + "step": 12560 + }, + { + "epoch": 2.165747760165403, + "grad_norm": 38.576751708984375, + "learning_rate": 4.3531608608344274e-08, + "logits/chosen": -2.0868732929229736, + "logits/rejected": -2.0362396240234375, + "logps/chosen": -204.62542724609375, + "logps/rejected": -267.41387939453125, + "loss": 0.5145, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5071732997894287, + "rewards/margins": 0.6663482189178467, + "rewards/rejected": -2.1735215187072754, + "step": 12570 + }, + { + "epoch": 2.167470709855272, + "grad_norm": 44.46379470825195, + "learning_rate": 4.3366260861597814e-08, + "logits/chosen": -2.106191635131836, + "logits/rejected": -2.0512826442718506, + "logps/chosen": -219.61532592773438, + "logps/rejected": -283.9519348144531, + "loss": 0.4881, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5764575004577637, + "rewards/margins": 0.7457872629165649, + "rewards/rejected": -2.322244882583618, + "step": 12580 + }, + { + "epoch": 2.169193659545141, + "grad_norm": 44.6986083984375, + "learning_rate": 4.3201140748701e-08, + "logits/chosen": -2.077023983001709, + "logits/rejected": -2.0341241359710693, + "logps/chosen": -217.6835174560547, + "logps/rejected": -285.7940979003906, + "loss": 0.5134, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6051843166351318, + "rewards/margins": 0.731010913848877, + "rewards/rejected": -2.336195468902588, + "step": 12590 + }, + { + "epoch": 2.1709166092350105, + "grad_norm": 39.228145599365234, + "learning_rate": 4.303624893333816e-08, + "logits/chosen": -2.0710034370422363, + "logits/rejected": -2.0349724292755127, + "logps/chosen": -242.8468475341797, + "logps/rejected": -303.30194091796875, + "loss": 0.5498, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.833044409751892, + "rewards/margins": 0.6784363985061646, + "rewards/rejected": -2.5114808082580566, + "step": 12600 + }, + { + "epoch": 2.1726395589248795, + "grad_norm": 33.84947967529297, + "learning_rate": 4.287158607827607e-08, + "logits/chosen": -2.1545472145080566, + "logits/rejected": -2.1268463134765625, + "logps/chosen": -229.6652374267578, + "logps/rejected": -303.5273742675781, + "loss": 0.5085, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7480127811431885, + "rewards/margins": 0.7504814267158508, + "rewards/rejected": -2.4984943866729736, + "step": 12610 + }, + { + "epoch": 2.1743625086147484, + "grad_norm": 38.461978912353516, + "learning_rate": 4.270715284536124e-08, + "logits/chosen": -2.1819534301757812, + "logits/rejected": -2.143298387527466, + "logps/chosen": -260.98065185546875, + "logps/rejected": -328.08758544921875, + "loss": 0.5528, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.05389142036438, + "rewards/margins": 0.7068518996238708, + "rewards/rejected": -2.7607436180114746, + "step": 12620 + }, + { + "epoch": 2.1760854583046174, + "grad_norm": 48.95317077636719, + "learning_rate": 4.2542949895517066e-08, + "logits/chosen": -2.127363681793213, + "logits/rejected": -2.0970330238342285, + "logps/chosen": -235.3542022705078, + "logps/rejected": -309.9244079589844, + "loss": 0.5624, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.8453752994537354, + "rewards/margins": 0.7416688203811646, + "rewards/rejected": -2.5870440006256104, + "step": 12630 + }, + { + "epoch": 2.1778084079944864, + "grad_norm": 37.52322006225586, + "learning_rate": 4.2378977888741506e-08, + "logits/chosen": -2.1374919414520264, + "logits/rejected": -2.0961146354675293, + "logps/chosen": -223.96456909179688, + "logps/rejected": -295.3265380859375, + "loss": 0.4979, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6855684518814087, + "rewards/margins": 0.7392398118972778, + "rewards/rejected": -2.4248080253601074, + "step": 12640 + }, + { + "epoch": 2.179531357684356, + "grad_norm": 57.112144470214844, + "learning_rate": 4.221523748410428e-08, + "logits/chosen": -2.089409828186035, + "logits/rejected": -2.0558364391326904, + "logps/chosen": -222.017578125, + "logps/rejected": -278.6932067871094, + "loss": 0.5873, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6818082332611084, + "rewards/margins": 0.5971177220344543, + "rewards/rejected": -2.278926134109497, + "step": 12650 + }, + { + "epoch": 2.1812543073742248, + "grad_norm": 50.265281677246094, + "learning_rate": 4.2051729339744056e-08, + "logits/chosen": -2.027480363845825, + "logits/rejected": -1.9964735507965088, + "logps/chosen": -231.51260375976562, + "logps/rejected": -287.72509765625, + "loss": 0.5954, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.7851423025131226, + "rewards/margins": 0.5692311525344849, + "rewards/rejected": -2.3543732166290283, + "step": 12660 + }, + { + "epoch": 2.1829772570640937, + "grad_norm": 51.05306625366211, + "learning_rate": 4.1888454112866125e-08, + "logits/chosen": -2.2497687339782715, + "logits/rejected": -2.1945605278015137, + "logps/chosen": -206.6665802001953, + "logps/rejected": -275.145263671875, + "loss": 0.5233, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5245170593261719, + "rewards/margins": 0.7166666984558105, + "rewards/rejected": -2.2411837577819824, + "step": 12670 + }, + { + "epoch": 2.1847002067539627, + "grad_norm": 26.886734008789062, + "learning_rate": 4.172541245973943e-08, + "logits/chosen": -2.082343101501465, + "logits/rejected": -2.0427489280700684, + "logps/chosen": -213.1779022216797, + "logps/rejected": -282.32244873046875, + "loss": 0.5273, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.606439232826233, + "rewards/margins": 0.7072060108184814, + "rewards/rejected": -2.313645124435425, + "step": 12680 + }, + { + "epoch": 2.1864231564438317, + "grad_norm": 36.41101837158203, + "learning_rate": 4.156260503569423e-08, + "logits/chosen": -2.061903953552246, + "logits/rejected": -2.029452085494995, + "logps/chosen": -211.6961669921875, + "logps/rejected": -280.8975830078125, + "loss": 0.5686, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5883698463439941, + "rewards/margins": 0.6996433734893799, + "rewards/rejected": -2.288013219833374, + "step": 12690 + }, + { + "epoch": 2.188146106133701, + "grad_norm": 28.81426429748535, + "learning_rate": 4.1400032495119183e-08, + "logits/chosen": -2.0733580589294434, + "logits/rejected": -2.033250331878662, + "logps/chosen": -213.970947265625, + "logps/rejected": -279.64385986328125, + "loss": 0.52, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5556237697601318, + "rewards/margins": 0.7045634984970093, + "rewards/rejected": -2.2601871490478516, + "step": 12700 + }, + { + "epoch": 2.18986905582357, + "grad_norm": 45.23122787475586, + "learning_rate": 4.123769549145901e-08, + "logits/chosen": -2.1473305225372314, + "logits/rejected": -2.120551347732544, + "logps/chosen": -213.69515991210938, + "logps/rejected": -287.4021301269531, + "loss": 0.5262, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5914419889450073, + "rewards/margins": 0.7104102373123169, + "rewards/rejected": -2.3018524646759033, + "step": 12710 + }, + { + "epoch": 2.191592005513439, + "grad_norm": 32.552513122558594, + "learning_rate": 4.10755946772116e-08, + "logits/chosen": -2.1464271545410156, + "logits/rejected": -2.0934534072875977, + "logps/chosen": -211.2435302734375, + "logps/rejected": -282.8409423828125, + "loss": 0.48, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.53597092628479, + "rewards/margins": 0.7881087064743042, + "rewards/rejected": -2.324079751968384, + "step": 12720 + }, + { + "epoch": 2.193314955203308, + "grad_norm": 47.42426681518555, + "learning_rate": 4.0913730703925485e-08, + "logits/chosen": -2.0571787357330322, + "logits/rejected": -2.0159764289855957, + "logps/chosen": -223.61099243164062, + "logps/rejected": -297.05218505859375, + "loss": 0.5254, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7240705490112305, + "rewards/margins": 0.7499324083328247, + "rewards/rejected": -2.4740028381347656, + "step": 12730 + }, + { + "epoch": 2.195037904893177, + "grad_norm": 32.92298126220703, + "learning_rate": 4.075210422219732e-08, + "logits/chosen": -2.0610849857330322, + "logits/rejected": -2.031686305999756, + "logps/chosen": -247.9683380126953, + "logps/rejected": -316.2990417480469, + "loss": 0.5609, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.9526822566986084, + "rewards/margins": 0.6972087025642395, + "rewards/rejected": -2.6498911380767822, + "step": 12740 + }, + { + "epoch": 2.1967608545830464, + "grad_norm": 39.98879623413086, + "learning_rate": 4.059071588166921e-08, + "logits/chosen": -2.052370548248291, + "logits/rejected": -2.0056519508361816, + "logps/chosen": -234.57046508789062, + "logps/rejected": -312.9558410644531, + "loss": 0.4892, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7930113077163696, + "rewards/margins": 0.7980147004127502, + "rewards/rejected": -2.5910258293151855, + "step": 12750 + }, + { + "epoch": 2.1984838042729153, + "grad_norm": 47.036380767822266, + "learning_rate": 4.042956633102597e-08, + "logits/chosen": -2.0422403812408447, + "logits/rejected": -2.0114598274230957, + "logps/chosen": -244.37646484375, + "logps/rejected": -314.6592712402344, + "loss": 0.5813, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.915923833847046, + "rewards/margins": 0.7069417238235474, + "rewards/rejected": -2.622865676879883, + "step": 12760 + }, + { + "epoch": 2.2002067539627843, + "grad_norm": 40.41290283203125, + "learning_rate": 4.0268656217992615e-08, + "logits/chosen": -2.1366629600524902, + "logits/rejected": -2.0922679901123047, + "logps/chosen": -236.18856811523438, + "logps/rejected": -291.73162841796875, + "loss": 0.5701, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7758594751358032, + "rewards/margins": 0.6260801553726196, + "rewards/rejected": -2.401939868927002, + "step": 12770 + }, + { + "epoch": 2.2019297036526533, + "grad_norm": 29.852205276489258, + "learning_rate": 4.0107986189331875e-08, + "logits/chosen": -2.1107864379882812, + "logits/rejected": -2.072418689727783, + "logps/chosen": -237.52664184570312, + "logps/rejected": -301.23297119140625, + "loss": 0.5871, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.852663278579712, + "rewards/margins": 0.6307103037834167, + "rewards/rejected": -2.4833736419677734, + "step": 12780 + }, + { + "epoch": 2.2036526533425222, + "grad_norm": 46.336830139160156, + "learning_rate": 3.9947556890841464e-08, + "logits/chosen": -2.045558214187622, + "logits/rejected": -2.007340908050537, + "logps/chosen": -240.2716522216797, + "logps/rejected": -312.72625732421875, + "loss": 0.5075, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.8492214679718018, + "rewards/margins": 0.757246732711792, + "rewards/rejected": -2.6064682006835938, + "step": 12790 + }, + { + "epoch": 2.205375603032391, + "grad_norm": 44.32802963256836, + "learning_rate": 3.978736896735141e-08, + "logits/chosen": -2.11002516746521, + "logits/rejected": -2.076409101486206, + "logps/chosen": -226.1758270263672, + "logps/rejected": -287.878662109375, + "loss": 0.55, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.701133131980896, + "rewards/margins": 0.6254535913467407, + "rewards/rejected": -2.326586961746216, + "step": 12800 + }, + { + "epoch": 2.205375603032391, + "eval_logits/chosen": -2.176112651824951, + "eval_logits/rejected": -2.1573870182037354, + "eval_logps/chosen": -209.3660430908203, + "eval_logps/rejected": -241.4328155517578, + "eval_loss": 0.6414783596992493, + "eval_rewards/accuracies": 0.63150554895401, + "eval_rewards/chosen": -1.503505825996399, + "eval_rewards/margins": 0.2833262085914612, + "eval_rewards/rejected": -1.7868318557739258, + "eval_runtime": 383.2929, + "eval_samples_per_second": 11.229, + "eval_steps_per_second": 1.404, + "step": 12800 + }, + { + "epoch": 2.2070985527222606, + "grad_norm": 43.70280456542969, + "learning_rate": 3.96274230627216e-08, + "logits/chosen": -2.1460890769958496, + "logits/rejected": -2.1152615547180176, + "logps/chosen": -213.664306640625, + "logps/rejected": -281.9350280761719, + "loss": 0.5265, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6249897480010986, + "rewards/margins": 0.6531460285186768, + "rewards/rejected": -2.2781357765197754, + "step": 12810 + }, + { + "epoch": 2.2088215024121296, + "grad_norm": 47.759342193603516, + "learning_rate": 3.9467719819839186e-08, + "logits/chosen": -2.0540995597839355, + "logits/rejected": -2.0256972312927246, + "logps/chosen": -207.12734985351562, + "logps/rejected": -273.18743896484375, + "loss": 0.5398, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5232833623886108, + "rewards/margins": 0.6660471558570862, + "rewards/rejected": -2.1893303394317627, + "step": 12820 + }, + { + "epoch": 2.2105444521019986, + "grad_norm": 32.4177360534668, + "learning_rate": 3.930825988061599e-08, + "logits/chosen": -2.056870698928833, + "logits/rejected": -2.0277340412139893, + "logps/chosen": -216.51528930664062, + "logps/rejected": -269.9931640625, + "loss": 0.5782, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6514508724212646, + "rewards/margins": 0.5335295796394348, + "rewards/rejected": -2.1849803924560547, + "step": 12830 + }, + { + "epoch": 2.2122674017918675, + "grad_norm": 34.3258056640625, + "learning_rate": 3.914904388598577e-08, + "logits/chosen": -2.156665563583374, + "logits/rejected": -2.1270575523376465, + "logps/chosen": -213.1860809326172, + "logps/rejected": -286.87841796875, + "loss": 0.5084, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5715808868408203, + "rewards/margins": 0.7577574849128723, + "rewards/rejected": -2.329338312149048, + "step": 12840 + }, + { + "epoch": 2.213990351481737, + "grad_norm": 33.174949645996094, + "learning_rate": 3.899007247590191e-08, + "logits/chosen": -2.1505343914031982, + "logits/rejected": -2.1065900325775146, + "logps/chosen": -213.80368041992188, + "logps/rejected": -275.1285705566406, + "loss": 0.5279, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.598930835723877, + "rewards/margins": 0.6486135125160217, + "rewards/rejected": -2.247544288635254, + "step": 12850 + }, + { + "epoch": 2.215713301171606, + "grad_norm": 33.14176559448242, + "learning_rate": 3.883134628933465e-08, + "logits/chosen": -2.048759937286377, + "logits/rejected": -2.0071349143981934, + "logps/chosen": -229.22830200195312, + "logps/rejected": -300.01043701171875, + "loss": 0.5409, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7220948934555054, + "rewards/margins": 0.7704235315322876, + "rewards/rejected": -2.492518424987793, + "step": 12860 + }, + { + "epoch": 2.217436250861475, + "grad_norm": 58.30504608154297, + "learning_rate": 3.867286596426853e-08, + "logits/chosen": -2.0994811058044434, + "logits/rejected": -2.0638999938964844, + "logps/chosen": -221.7117919921875, + "logps/rejected": -286.0891418457031, + "loss": 0.5738, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6807664632797241, + "rewards/margins": 0.6800059676170349, + "rewards/rejected": -2.3607726097106934, + "step": 12870 + }, + { + "epoch": 2.219159200551344, + "grad_norm": 43.13029861450195, + "learning_rate": 3.851463213769996e-08, + "logits/chosen": -2.1190624237060547, + "logits/rejected": -2.084502935409546, + "logps/chosen": -210.6239013671875, + "logps/rejected": -275.19720458984375, + "loss": 0.5292, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5534844398498535, + "rewards/margins": 0.6436396837234497, + "rewards/rejected": -2.1971240043640137, + "step": 12880 + }, + { + "epoch": 2.220882150241213, + "grad_norm": 30.03142738342285, + "learning_rate": 3.8356645445634575e-08, + "logits/chosen": -2.149033308029175, + "logits/rejected": -2.120265483856201, + "logps/chosen": -218.79638671875, + "logps/rejected": -276.3510437011719, + "loss": 0.5302, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.601214051246643, + "rewards/margins": 0.6322570443153381, + "rewards/rejected": -2.233471155166626, + "step": 12890 + }, + { + "epoch": 2.222605099931082, + "grad_norm": 44.66122055053711, + "learning_rate": 3.8198906523084594e-08, + "logits/chosen": -2.11344313621521, + "logits/rejected": -2.062312602996826, + "logps/chosen": -233.74307250976562, + "logps/rejected": -312.79876708984375, + "loss": 0.4754, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.784369707107544, + "rewards/margins": 0.8080135583877563, + "rewards/rejected": -2.5923831462860107, + "step": 12900 + }, + { + "epoch": 2.224328049620951, + "grad_norm": 36.82771301269531, + "learning_rate": 3.8041416004066364e-08, + "logits/chosen": -2.130718231201172, + "logits/rejected": -2.0977675914764404, + "logps/chosen": -229.9229278564453, + "logps/rejected": -296.8409729003906, + "loss": 0.5508, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7804368734359741, + "rewards/margins": 0.6411340832710266, + "rewards/rejected": -2.4215707778930664, + "step": 12910 + }, + { + "epoch": 2.22605099931082, + "grad_norm": 39.755863189697266, + "learning_rate": 3.7884174521597866e-08, + "logits/chosen": -2.0777835845947266, + "logits/rejected": -2.0352044105529785, + "logps/chosen": -229.240478515625, + "logps/rejected": -318.3649597167969, + "loss": 0.499, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7551753520965576, + "rewards/margins": 0.9018391370773315, + "rewards/rejected": -2.6570143699645996, + "step": 12920 + }, + { + "epoch": 2.227773949000689, + "grad_norm": 32.51687240600586, + "learning_rate": 3.77271827076961e-08, + "logits/chosen": -2.1255993843078613, + "logits/rejected": -2.096824884414673, + "logps/chosen": -221.54824829101562, + "logps/rejected": -285.9750061035156, + "loss": 0.573, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7131789922714233, + "rewards/margins": 0.6252934336662292, + "rewards/rejected": -2.338472604751587, + "step": 12930 + }, + { + "epoch": 2.229496898690558, + "grad_norm": 44.66363525390625, + "learning_rate": 3.757044119337449e-08, + "logits/chosen": -2.0880866050720215, + "logits/rejected": -2.0467846393585205, + "logps/chosen": -228.26327514648438, + "logps/rejected": -289.1280212402344, + "loss": 0.5451, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7492284774780273, + "rewards/margins": 0.6592920422554016, + "rewards/rejected": -2.4085206985473633, + "step": 12940 + }, + { + "epoch": 2.231219848380427, + "grad_norm": 51.84122085571289, + "learning_rate": 3.741395060864038e-08, + "logits/chosen": -2.0882647037506104, + "logits/rejected": -2.0473856925964355, + "logps/chosen": -216.173583984375, + "logps/rejected": -298.5913391113281, + "loss": 0.474, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6181936264038086, + "rewards/margins": 0.8368695974349976, + "rewards/rejected": -2.4550633430480957, + "step": 12950 + }, + { + "epoch": 2.2329427980702965, + "grad_norm": 40.83433151245117, + "learning_rate": 3.7257711582492645e-08, + "logits/chosen": -2.043201208114624, + "logits/rejected": -2.009258508682251, + "logps/chosen": -233.2020263671875, + "logps/rejected": -281.8921813964844, + "loss": 0.5717, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7400344610214233, + "rewards/margins": 0.5515367388725281, + "rewards/rejected": -2.2915711402893066, + "step": 12960 + }, + { + "epoch": 2.2346657477601655, + "grad_norm": 33.69052505493164, + "learning_rate": 3.7101724742918915e-08, + "logits/chosen": -2.036499500274658, + "logits/rejected": -1.9901424646377563, + "logps/chosen": -220.5728759765625, + "logps/rejected": -300.4973449707031, + "loss": 0.4663, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.664202094078064, + "rewards/margins": 0.8676424026489258, + "rewards/rejected": -2.5318446159362793, + "step": 12970 + }, + { + "epoch": 2.2363886974500344, + "grad_norm": 53.17600631713867, + "learning_rate": 3.694599071689329e-08, + "logits/chosen": -2.0466625690460205, + "logits/rejected": -2.016874313354492, + "logps/chosen": -229.981201171875, + "logps/rejected": -301.1022033691406, + "loss": 0.5053, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7895774841308594, + "rewards/margins": 0.7001364827156067, + "rewards/rejected": -2.4897139072418213, + "step": 12980 + }, + { + "epoch": 2.2381116471399034, + "grad_norm": 39.544010162353516, + "learning_rate": 3.679051013037361e-08, + "logits/chosen": -2.1186952590942383, + "logits/rejected": -2.0678322315216064, + "logps/chosen": -251.4757080078125, + "logps/rejected": -324.4326171875, + "loss": 0.5166, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.916233777999878, + "rewards/margins": 0.8067552447319031, + "rewards/rejected": -2.722989320755005, + "step": 12990 + }, + { + "epoch": 2.2398345968297724, + "grad_norm": 37.2462158203125, + "learning_rate": 3.663528360829915e-08, + "logits/chosen": -2.0603792667388916, + "logits/rejected": -2.017498254776001, + "logps/chosen": -262.0799560546875, + "logps/rejected": -340.74285888671875, + "loss": 0.508, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.0763087272644043, + "rewards/margins": 0.8207263946533203, + "rewards/rejected": -2.8970351219177246, + "step": 13000 + }, + { + "epoch": 2.241557546519642, + "grad_norm": 39.30596160888672, + "learning_rate": 3.6480311774587877e-08, + "logits/chosen": -2.0265860557556152, + "logits/rejected": -1.9936189651489258, + "logps/chosen": -259.8675537109375, + "logps/rejected": -325.21942138671875, + "loss": 0.5646, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0876617431640625, + "rewards/margins": 0.672526478767395, + "rewards/rejected": -2.760188579559326, + "step": 13010 + }, + { + "epoch": 2.2432804962095108, + "grad_norm": 34.74739074707031, + "learning_rate": 3.6325595252134144e-08, + "logits/chosen": -2.0303027629852295, + "logits/rejected": -1.9870803356170654, + "logps/chosen": -241.1360321044922, + "logps/rejected": -316.6763610839844, + "loss": 0.5396, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.887058973312378, + "rewards/margins": 0.7892248034477234, + "rewards/rejected": -2.676283836364746, + "step": 13020 + }, + { + "epoch": 2.2450034458993797, + "grad_norm": 35.928749084472656, + "learning_rate": 3.617113466280612e-08, + "logits/chosen": -2.1021182537078857, + "logits/rejected": -2.066804885864258, + "logps/chosen": -241.94442749023438, + "logps/rejected": -299.8307189941406, + "loss": 0.5608, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.8644864559173584, + "rewards/margins": 0.6163435578346252, + "rewards/rejected": -2.480829954147339, + "step": 13030 + }, + { + "epoch": 2.2467263955892487, + "grad_norm": 52.62453842163086, + "learning_rate": 3.601693062744322e-08, + "logits/chosen": -2.1262030601501465, + "logits/rejected": -2.082648515701294, + "logps/chosen": -229.03598022460938, + "logps/rejected": -314.4962158203125, + "loss": 0.4906, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7390594482421875, + "rewards/margins": 0.8555986285209656, + "rewards/rejected": -2.594658136367798, + "step": 13040 + }, + { + "epoch": 2.2484493452791177, + "grad_norm": 52.518775939941406, + "learning_rate": 3.586298376585363e-08, + "logits/chosen": -2.1036646366119385, + "logits/rejected": -2.0692028999328613, + "logps/chosen": -240.4881134033203, + "logps/rejected": -314.6589050292969, + "loss": 0.5534, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8791265487670898, + "rewards/margins": 0.7353334426879883, + "rewards/rejected": -2.614459753036499, + "step": 13050 + }, + { + "epoch": 2.250172294968987, + "grad_norm": 43.86840057373047, + "learning_rate": 3.5709294696811985e-08, + "logits/chosen": -2.1041011810302734, + "logits/rejected": -2.069333553314209, + "logps/chosen": -238.55062866210938, + "logps/rejected": -315.8983459472656, + "loss": 0.5483, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.8438698053359985, + "rewards/margins": 0.7788059115409851, + "rewards/rejected": -2.6226754188537598, + "step": 13060 + }, + { + "epoch": 2.251895244658856, + "grad_norm": 58.724525451660156, + "learning_rate": 3.555586403805663e-08, + "logits/chosen": -2.0676779747009277, + "logits/rejected": -2.0228145122528076, + "logps/chosen": -215.6838836669922, + "logps/rejected": -275.9107360839844, + "loss": 0.5651, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5825284719467163, + "rewards/margins": 0.6505758166313171, + "rewards/rejected": -2.2331044673919678, + "step": 13070 + }, + { + "epoch": 2.253618194348725, + "grad_norm": 32.55445861816406, + "learning_rate": 3.540269240628726e-08, + "logits/chosen": -2.049349069595337, + "logits/rejected": -2.0212624073028564, + "logps/chosen": -225.2374725341797, + "logps/rejected": -285.44671630859375, + "loss": 0.5521, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7316248416900635, + "rewards/margins": 0.611584484577179, + "rewards/rejected": -2.3432092666625977, + "step": 13080 + }, + { + "epoch": 2.255341144038594, + "grad_norm": 37.775718688964844, + "learning_rate": 3.52497804171625e-08, + "logits/chosen": -2.148881196975708, + "logits/rejected": -2.1036956310272217, + "logps/chosen": -224.271484375, + "logps/rejected": -284.09429931640625, + "loss": 0.5724, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6970704793930054, + "rewards/margins": 0.6304658651351929, + "rewards/rejected": -2.327536106109619, + "step": 13090 + }, + { + "epoch": 2.257064093728463, + "grad_norm": 25.495990753173828, + "learning_rate": 3.509712868529738e-08, + "logits/chosen": -2.2044754028320312, + "logits/rejected": -2.1581830978393555, + "logps/chosen": -213.9280242919922, + "logps/rejected": -284.5126647949219, + "loss": 0.5051, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5838515758514404, + "rewards/margins": 0.7658999562263489, + "rewards/rejected": -2.3497514724731445, + "step": 13100 + }, + { + "epoch": 2.2587870434183324, + "grad_norm": 33.51130676269531, + "learning_rate": 3.494473782426073e-08, + "logits/chosen": -2.0514578819274902, + "logits/rejected": -2.0082719326019287, + "logps/chosen": -219.0296630859375, + "logps/rejected": -288.70654296875, + "loss": 0.5318, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6402019262313843, + "rewards/margins": 0.7379582524299622, + "rewards/rejected": -2.378160238265991, + "step": 13110 + }, + { + "epoch": 2.2605099931082013, + "grad_norm": 31.88288688659668, + "learning_rate": 3.479260844657297e-08, + "logits/chosen": -2.1669747829437256, + "logits/rejected": -2.1406798362731934, + "logps/chosen": -217.6679229736328, + "logps/rejected": -277.6645812988281, + "loss": 0.5686, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.622894048690796, + "rewards/margins": 0.6306630373001099, + "rewards/rejected": -2.253556728363037, + "step": 13120 + }, + { + "epoch": 2.2622329427980703, + "grad_norm": 31.417335510253906, + "learning_rate": 3.46407411637034e-08, + "logits/chosen": -2.207447052001953, + "logits/rejected": -2.1617865562438965, + "logps/chosen": -203.57965087890625, + "logps/rejected": -294.2021484375, + "loss": 0.4744, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4846456050872803, + "rewards/margins": 0.8848323822021484, + "rewards/rejected": -2.3694777488708496, + "step": 13130 + }, + { + "epoch": 2.2639558924879393, + "grad_norm": 27.322017669677734, + "learning_rate": 3.448913658606798e-08, + "logits/chosen": -2.078812837600708, + "logits/rejected": -2.040693759918213, + "logps/chosen": -211.0071258544922, + "logps/rejected": -282.39678955078125, + "loss": 0.4836, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5680955648422241, + "rewards/margins": 0.7172769904136658, + "rewards/rejected": -2.285372257232666, + "step": 13140 + }, + { + "epoch": 2.2656788421778082, + "grad_norm": 38.46590805053711, + "learning_rate": 3.43377953230266e-08, + "logits/chosen": -2.078317642211914, + "logits/rejected": -2.0354676246643066, + "logps/chosen": -227.1080780029297, + "logps/rejected": -309.87066650390625, + "loss": 0.4814, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7072769403457642, + "rewards/margins": 0.8658990859985352, + "rewards/rejected": -2.573176145553589, + "step": 13150 + }, + { + "epoch": 2.2674017918676777, + "grad_norm": 40.2093620300293, + "learning_rate": 3.418671798288093e-08, + "logits/chosen": -2.0533547401428223, + "logits/rejected": -2.008688449859619, + "logps/chosen": -251.79995727539062, + "logps/rejected": -322.4410705566406, + "loss": 0.5278, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9460399150848389, + "rewards/margins": 0.7493022084236145, + "rewards/rejected": -2.6953423023223877, + "step": 13160 + }, + { + "epoch": 2.2691247415575466, + "grad_norm": 48.867897033691406, + "learning_rate": 3.403590517287175e-08, + "logits/chosen": -2.0968425273895264, + "logits/rejected": -2.0625452995300293, + "logps/chosen": -235.0701446533203, + "logps/rejected": -298.63433837890625, + "loss": 0.5422, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.799288034439087, + "rewards/margins": 0.670975387096405, + "rewards/rejected": -2.4702632427215576, + "step": 13170 + }, + { + "epoch": 2.2708476912474156, + "grad_norm": 39.24733352661133, + "learning_rate": 3.388535749917653e-08, + "logits/chosen": -2.1031875610351562, + "logits/rejected": -2.054490566253662, + "logps/chosen": -225.1061553955078, + "logps/rejected": -303.8103332519531, + "loss": 0.4691, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7030404806137085, + "rewards/margins": 0.8334137201309204, + "rewards/rejected": -2.536454200744629, + "step": 13180 + }, + { + "epoch": 2.2725706409372846, + "grad_norm": 40.70262145996094, + "learning_rate": 3.373507556690718e-08, + "logits/chosen": -2.094082832336426, + "logits/rejected": -2.0526416301727295, + "logps/chosen": -237.23123168945312, + "logps/rejected": -295.06768798828125, + "loss": 0.5715, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.8039411306381226, + "rewards/margins": 0.6279858946800232, + "rewards/rejected": -2.43192720413208, + "step": 13190 + }, + { + "epoch": 2.2742935906271535, + "grad_norm": 35.791282653808594, + "learning_rate": 3.358505998010743e-08, + "logits/chosen": -2.080048084259033, + "logits/rejected": -2.0455188751220703, + "logps/chosen": -233.1878662109375, + "logps/rejected": -290.905029296875, + "loss": 0.5546, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7701551914215088, + "rewards/margins": 0.6163936257362366, + "rewards/rejected": -2.3865485191345215, + "step": 13200 + }, + { + "epoch": 2.2742935906271535, + "eval_logits/chosen": -2.161210298538208, + "eval_logits/rejected": -2.1413331031799316, + "eval_logps/chosen": -226.16151428222656, + "eval_logps/rejected": -261.48590087890625, + "eval_loss": 0.6425279974937439, + "eval_rewards/accuracies": 0.6303438544273376, + "eval_rewards/chosen": -1.6714603900909424, + "eval_rewards/margins": 0.31590259075164795, + "eval_rewards/rejected": -1.9873627424240112, + "eval_runtime": 383.1902, + "eval_samples_per_second": 11.232, + "eval_steps_per_second": 1.404, + "step": 13200 + }, + { + "epoch": 2.2760165403170225, + "grad_norm": 36.40027618408203, + "learning_rate": 3.343531134175046e-08, + "logits/chosen": -2.1576075553894043, + "logits/rejected": -2.1340811252593994, + "logps/chosen": -220.05160522460938, + "logps/rejected": -283.6050109863281, + "loss": 0.5517, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6668260097503662, + "rewards/margins": 0.6425694227218628, + "rewards/rejected": -2.3093955516815186, + "step": 13210 + }, + { + "epoch": 2.277739490006892, + "grad_norm": 35.08451843261719, + "learning_rate": 3.3285830253736405e-08, + "logits/chosen": -2.1245293617248535, + "logits/rejected": -2.092247724533081, + "logps/chosen": -225.3599853515625, + "logps/rejected": -289.3499755859375, + "loss": 0.5274, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.693889856338501, + "rewards/margins": 0.6483731865882874, + "rewards/rejected": -2.3422627449035645, + "step": 13220 + }, + { + "epoch": 2.279462439696761, + "grad_norm": 27.97449493408203, + "learning_rate": 3.313661731689013e-08, + "logits/chosen": -2.0879783630371094, + "logits/rejected": -2.0488433837890625, + "logps/chosen": -219.3804473876953, + "logps/rejected": -301.9391784667969, + "loss": 0.5051, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.670306921005249, + "rewards/margins": 0.8253267407417297, + "rewards/rejected": -2.495633602142334, + "step": 13230 + }, + { + "epoch": 2.28118538938663, + "grad_norm": 32.750732421875, + "learning_rate": 3.298767313095865e-08, + "logits/chosen": -2.1046676635742188, + "logits/rejected": -2.0787782669067383, + "logps/chosen": -233.48611450195312, + "logps/rejected": -293.97784423828125, + "loss": 0.5207, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.8176323175430298, + "rewards/margins": 0.5957657694816589, + "rewards/rejected": -2.413398265838623, + "step": 13240 + }, + { + "epoch": 2.282908339076499, + "grad_norm": 35.13943862915039, + "learning_rate": 3.283899829460873e-08, + "logits/chosen": -2.0670647621154785, + "logits/rejected": -2.035553455352783, + "logps/chosen": -227.38436889648438, + "logps/rejected": -314.468017578125, + "loss": 0.4911, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7445780038833618, + "rewards/margins": 0.8522817492485046, + "rewards/rejected": -2.596859931945801, + "step": 13250 + }, + { + "epoch": 2.2846312887663682, + "grad_norm": 41.03888702392578, + "learning_rate": 3.269059340542448e-08, + "logits/chosen": -2.161862373352051, + "logits/rejected": -2.1250216960906982, + "logps/chosen": -232.2884979248047, + "logps/rejected": -315.02960205078125, + "loss": 0.5266, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.828884482383728, + "rewards/margins": 0.8013812899589539, + "rewards/rejected": -2.630265712738037, + "step": 13260 + }, + { + "epoch": 2.286354238456237, + "grad_norm": 35.590736389160156, + "learning_rate": 3.2542459059905127e-08, + "logits/chosen": -2.0372190475463867, + "logits/rejected": -2.0048985481262207, + "logps/chosen": -238.0127716064453, + "logps/rejected": -311.5678405761719, + "loss": 0.5186, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8236106634140015, + "rewards/margins": 0.7575092315673828, + "rewards/rejected": -2.581120014190674, + "step": 13270 + }, + { + "epoch": 2.288077188146106, + "grad_norm": 38.78087615966797, + "learning_rate": 3.239459585346228e-08, + "logits/chosen": -2.093226909637451, + "logits/rejected": -2.046966791152954, + "logps/chosen": -222.5635528564453, + "logps/rejected": -292.07373046875, + "loss": 0.5405, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6759002208709717, + "rewards/margins": 0.7385443449020386, + "rewards/rejected": -2.4144444465637207, + "step": 13280 + }, + { + "epoch": 2.289800137835975, + "grad_norm": 46.30903244018555, + "learning_rate": 3.224700438041789e-08, + "logits/chosen": -2.080031156539917, + "logits/rejected": -2.040649652481079, + "logps/chosen": -222.87655639648438, + "logps/rejected": -294.4432678222656, + "loss": 0.5004, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6777915954589844, + "rewards/margins": 0.7428628206253052, + "rewards/rejected": -2.420654535293579, + "step": 13290 + }, + { + "epoch": 2.291523087525844, + "grad_norm": 36.877052307128906, + "learning_rate": 3.209968523400165e-08, + "logits/chosen": -2.088174343109131, + "logits/rejected": -2.0509660243988037, + "logps/chosen": -248.04592895507812, + "logps/rejected": -313.04864501953125, + "loss": 0.581, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.9629755020141602, + "rewards/margins": 0.662213146686554, + "rewards/rejected": -2.6251888275146484, + "step": 13300 + }, + { + "epoch": 2.293246037215713, + "grad_norm": 34.83769226074219, + "learning_rate": 3.195263900634863e-08, + "logits/chosen": -2.093043088912964, + "logits/rejected": -2.0511364936828613, + "logps/chosen": -244.8745880126953, + "logps/rejected": -321.04046630859375, + "loss": 0.5388, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.8718029260635376, + "rewards/margins": 0.8050811886787415, + "rewards/rejected": -2.676884174346924, + "step": 13310 + }, + { + "epoch": 2.2949689869055825, + "grad_norm": 50.702056884765625, + "learning_rate": 3.180586628849692e-08, + "logits/chosen": -2.1294782161712646, + "logits/rejected": -2.081721782684326, + "logps/chosen": -243.4737548828125, + "logps/rejected": -289.72357177734375, + "loss": 0.623, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.866064429283142, + "rewards/margins": 0.5558760166168213, + "rewards/rejected": -2.421940565109253, + "step": 13320 + }, + { + "epoch": 2.2966919365954515, + "grad_norm": 38.728511810302734, + "learning_rate": 3.165936767038534e-08, + "logits/chosen": -2.0673575401306152, + "logits/rejected": -2.025552988052368, + "logps/chosen": -212.02685546875, + "logps/rejected": -287.38671875, + "loss": 0.5166, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5518863201141357, + "rewards/margins": 0.8267132639884949, + "rewards/rejected": -2.3785996437072754, + "step": 13330 + }, + { + "epoch": 2.2984148862853204, + "grad_norm": 47.14889907836914, + "learning_rate": 3.151314374085097e-08, + "logits/chosen": -2.169553518295288, + "logits/rejected": -2.141165256500244, + "logps/chosen": -221.1737823486328, + "logps/rejected": -283.49029541015625, + "loss": 0.5646, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6549873352050781, + "rewards/margins": 0.6485617756843567, + "rewards/rejected": -2.30354905128479, + "step": 13340 + }, + { + "epoch": 2.3001378359751894, + "grad_norm": 35.23659133911133, + "learning_rate": 3.136719508762674e-08, + "logits/chosen": -2.1357874870300293, + "logits/rejected": -2.0790905952453613, + "logps/chosen": -203.52828979492188, + "logps/rejected": -281.9134826660156, + "loss": 0.4834, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.461071252822876, + "rewards/margins": 0.850469708442688, + "rewards/rejected": -2.3115413188934326, + "step": 13350 + }, + { + "epoch": 2.301860785665059, + "grad_norm": 40.01634216308594, + "learning_rate": 3.1221522297339177e-08, + "logits/chosen": -2.1334757804870605, + "logits/rejected": -2.0900192260742188, + "logps/chosen": -217.0267333984375, + "logps/rejected": -296.7123718261719, + "loss": 0.5045, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5906345844268799, + "rewards/margins": 0.8470972180366516, + "rewards/rejected": -2.4377317428588867, + "step": 13360 + }, + { + "epoch": 2.3035837353549278, + "grad_norm": 35.108001708984375, + "learning_rate": 3.1076125955506015e-08, + "logits/chosen": -2.093878746032715, + "logits/rejected": -2.039607524871826, + "logps/chosen": -217.65939331054688, + "logps/rejected": -293.3722229003906, + "loss": 0.5265, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6202102899551392, + "rewards/margins": 0.7910685539245605, + "rewards/rejected": -2.4112792015075684, + "step": 13370 + }, + { + "epoch": 2.3053066850447967, + "grad_norm": 45.69697952270508, + "learning_rate": 3.0931006646533866e-08, + "logits/chosen": -2.041187286376953, + "logits/rejected": -1.9984385967254639, + "logps/chosen": -230.24514770507812, + "logps/rejected": -290.89263916015625, + "loss": 0.5392, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7188899517059326, + "rewards/margins": 0.6583026647567749, + "rewards/rejected": -2.377192497253418, + "step": 13380 + }, + { + "epoch": 2.3070296347346657, + "grad_norm": 45.5197639465332, + "learning_rate": 3.078616495371574e-08, + "logits/chosen": -2.070643186569214, + "logits/rejected": -2.014615535736084, + "logps/chosen": -212.42764282226562, + "logps/rejected": -277.73175048828125, + "loss": 0.5393, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5495667457580566, + "rewards/margins": 0.7197413444519043, + "rewards/rejected": -2.269308090209961, + "step": 13390 + }, + { + "epoch": 2.3087525844245347, + "grad_norm": 27.229652404785156, + "learning_rate": 3.064160145922884e-08, + "logits/chosen": -2.0850396156311035, + "logits/rejected": -2.0378663539886475, + "logps/chosen": -216.22128295898438, + "logps/rejected": -297.5529479980469, + "loss": 0.4751, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6000601053237915, + "rewards/margins": 0.8747854232788086, + "rewards/rejected": -2.4748454093933105, + "step": 13400 + }, + { + "epoch": 2.3104755341144037, + "grad_norm": 36.54592514038086, + "learning_rate": 3.0497316744132215e-08, + "logits/chosen": -2.104112148284912, + "logits/rejected": -2.0560758113861084, + "logps/chosen": -241.180908203125, + "logps/rejected": -320.4126892089844, + "loss": 0.5309, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.864006757736206, + "rewards/margins": 0.8387478590011597, + "rewards/rejected": -2.702754497528076, + "step": 13410 + }, + { + "epoch": 2.312198483804273, + "grad_norm": 43.858726501464844, + "learning_rate": 3.035331138836431e-08, + "logits/chosen": -2.0926592350006104, + "logits/rejected": -2.0584444999694824, + "logps/chosen": -238.5738983154297, + "logps/rejected": -321.810546875, + "loss": 0.4983, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.844842553138733, + "rewards/margins": 0.8219674229621887, + "rewards/rejected": -2.6668102741241455, + "step": 13420 + }, + { + "epoch": 2.313921433494142, + "grad_norm": 47.705291748046875, + "learning_rate": 3.020958597074081e-08, + "logits/chosen": -2.166374683380127, + "logits/rejected": -2.1191868782043457, + "logps/chosen": -236.5503692626953, + "logps/rejected": -315.83587646484375, + "loss": 0.5115, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8055598735809326, + "rewards/margins": 0.7851177453994751, + "rewards/rejected": -2.590677499771118, + "step": 13430 + }, + { + "epoch": 2.315644383184011, + "grad_norm": 40.79521942138672, + "learning_rate": 3.006614106895211e-08, + "logits/chosen": -2.026750087738037, + "logits/rejected": -1.9977821111679077, + "logps/chosen": -223.0307159423828, + "logps/rejected": -284.9368896484375, + "loss": 0.5683, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6900018453598022, + "rewards/margins": 0.6437240839004517, + "rewards/rejected": -2.333725929260254, + "step": 13440 + }, + { + "epoch": 2.31736733287388, + "grad_norm": 32.59689712524414, + "learning_rate": 2.992297725956121e-08, + "logits/chosen": -2.061788558959961, + "logits/rejected": -2.020012378692627, + "logps/chosen": -218.10922241210938, + "logps/rejected": -291.716552734375, + "loss": 0.5251, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6696808338165283, + "rewards/margins": 0.7335036993026733, + "rewards/rejected": -2.403184413909912, + "step": 13450 + }, + { + "epoch": 2.3190902825637494, + "grad_norm": 41.41628646850586, + "learning_rate": 2.978009511800116e-08, + "logits/chosen": -2.0997917652130127, + "logits/rejected": -2.0537500381469727, + "logps/chosen": -220.21572875976562, + "logps/rejected": -301.9630126953125, + "loss": 0.4729, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.643982172012329, + "rewards/margins": 0.8779670596122742, + "rewards/rejected": -2.521949291229248, + "step": 13460 + }, + { + "epoch": 2.3208132322536184, + "grad_norm": 38.67691421508789, + "learning_rate": 2.9637495218572972e-08, + "logits/chosen": -2.0158824920654297, + "logits/rejected": -1.976339340209961, + "logps/chosen": -237.1468048095703, + "logps/rejected": -301.9319152832031, + "loss": 0.5574, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.8132215738296509, + "rewards/margins": 0.709286093711853, + "rewards/rejected": -2.522507429122925, + "step": 13470 + }, + { + "epoch": 2.3225361819434873, + "grad_norm": 44.51760482788086, + "learning_rate": 2.9495178134443254e-08, + "logits/chosen": -2.125648021697998, + "logits/rejected": -2.0716588497161865, + "logps/chosen": -220.8754425048828, + "logps/rejected": -296.6810607910156, + "loss": 0.4888, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6669012308120728, + "rewards/margins": 0.7847355008125305, + "rewards/rejected": -2.451636552810669, + "step": 13480 + }, + { + "epoch": 2.3242591316333563, + "grad_norm": 31.296419143676758, + "learning_rate": 2.9353144437641662e-08, + "logits/chosen": -2.1042752265930176, + "logits/rejected": -2.056389570236206, + "logps/chosen": -234.0042266845703, + "logps/rejected": -307.01837158203125, + "loss": 0.5252, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7978731393814087, + "rewards/margins": 0.7429569959640503, + "rewards/rejected": -2.540830135345459, + "step": 13490 + }, + { + "epoch": 2.3259820813232253, + "grad_norm": 41.06807327270508, + "learning_rate": 2.9211394699058987e-08, + "logits/chosen": -2.094754457473755, + "logits/rejected": -2.0499045848846436, + "logps/chosen": -236.43984985351562, + "logps/rejected": -315.6556396484375, + "loss": 0.473, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7972850799560547, + "rewards/margins": 0.8472954630851746, + "rewards/rejected": -2.644580602645874, + "step": 13500 + }, + { + "epoch": 2.3277050310130942, + "grad_norm": 31.174585342407227, + "learning_rate": 2.9069929488444678e-08, + "logits/chosen": -2.026517391204834, + "logits/rejected": -1.997696876525879, + "logps/chosen": -222.0631866455078, + "logps/rejected": -294.4779357910156, + "loss": 0.5431, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7136270999908447, + "rewards/margins": 0.6927502155303955, + "rewards/rejected": -2.4063773155212402, + "step": 13510 + }, + { + "epoch": 2.3294279807029636, + "grad_norm": 31.05910301208496, + "learning_rate": 2.8928749374404448e-08, + "logits/chosen": -1.9960353374481201, + "logits/rejected": -1.9593353271484375, + "logps/chosen": -234.98104858398438, + "logps/rejected": -313.2256164550781, + "loss": 0.5235, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8336776494979858, + "rewards/margins": 0.7925911545753479, + "rewards/rejected": -2.6262686252593994, + "step": 13520 + }, + { + "epoch": 2.3311509303928326, + "grad_norm": 54.28462219238281, + "learning_rate": 2.8787854924398123e-08, + "logits/chosen": -2.093662738800049, + "logits/rejected": -2.056663990020752, + "logps/chosen": -235.8678436279297, + "logps/rejected": -285.80438232421875, + "loss": 0.5827, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.8033876419067383, + "rewards/margins": 0.5410268306732178, + "rewards/rejected": -2.344414472579956, + "step": 13530 + }, + { + "epoch": 2.3328738800827016, + "grad_norm": 67.41534423828125, + "learning_rate": 2.8647246704737382e-08, + "logits/chosen": -2.041447401046753, + "logits/rejected": -1.9953224658966064, + "logps/chosen": -233.7212677001953, + "logps/rejected": -306.98822021484375, + "loss": 0.5158, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7728395462036133, + "rewards/margins": 0.8034697771072388, + "rewards/rejected": -2.5763089656829834, + "step": 13540 + }, + { + "epoch": 2.3345968297725705, + "grad_norm": 35.68818664550781, + "learning_rate": 2.8506925280583417e-08, + "logits/chosen": -2.074463367462158, + "logits/rejected": -2.0343222618103027, + "logps/chosen": -231.67538452148438, + "logps/rejected": -292.61614990234375, + "loss": 0.5818, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.8013169765472412, + "rewards/margins": 0.6350749135017395, + "rewards/rejected": -2.436391830444336, + "step": 13550 + }, + { + "epoch": 2.3363197794624395, + "grad_norm": 47.221675872802734, + "learning_rate": 2.8366891215944598e-08, + "logits/chosen": -2.104546308517456, + "logits/rejected": -2.0793075561523438, + "logps/chosen": -211.36093139648438, + "logps/rejected": -272.89996337890625, + "loss": 0.5679, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5778541564941406, + "rewards/margins": 0.6280261874198914, + "rewards/rejected": -2.205880641937256, + "step": 13560 + }, + { + "epoch": 2.338042729152309, + "grad_norm": 31.64224624633789, + "learning_rate": 2.8227145073674385e-08, + "logits/chosen": -2.0302841663360596, + "logits/rejected": -1.9922692775726318, + "logps/chosen": -227.22756958007812, + "logps/rejected": -306.2268981933594, + "loss": 0.4881, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7482671737670898, + "rewards/margins": 0.8114362955093384, + "rewards/rejected": -2.5597033500671387, + "step": 13570 + }, + { + "epoch": 2.339765678842178, + "grad_norm": 36.071170806884766, + "learning_rate": 2.8087687415468896e-08, + "logits/chosen": -2.077479600906372, + "logits/rejected": -2.052595615386963, + "logps/chosen": -217.6588592529297, + "logps/rejected": -281.9828796386719, + "loss": 0.5402, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.644518256187439, + "rewards/margins": 0.6487827897071838, + "rewards/rejected": -2.2933011054992676, + "step": 13580 + }, + { + "epoch": 2.341488628532047, + "grad_norm": 41.25293731689453, + "learning_rate": 2.7948518801864697e-08, + "logits/chosen": -2.041703939437866, + "logits/rejected": -2.0177359580993652, + "logps/chosen": -224.66085815429688, + "logps/rejected": -298.35089111328125, + "loss": 0.5183, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7158304452896118, + "rewards/margins": 0.7353793978691101, + "rewards/rejected": -2.451209545135498, + "step": 13590 + }, + { + "epoch": 2.343211578221916, + "grad_norm": 45.824951171875, + "learning_rate": 2.780963979223663e-08, + "logits/chosen": -2.097001552581787, + "logits/rejected": -2.0635221004486084, + "logps/chosen": -229.45321655273438, + "logps/rejected": -288.4214782714844, + "loss": 0.5639, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.695074439048767, + "rewards/margins": 0.6171914935112, + "rewards/rejected": -2.3122661113739014, + "step": 13600 + }, + { + "epoch": 2.343211578221916, + "eval_logits/chosen": -2.167468786239624, + "eval_logits/rejected": -2.1481270790100098, + "eval_logps/chosen": -218.10006713867188, + "eval_logps/rejected": -252.5519256591797, + "eval_loss": 0.6408571600914001, + "eval_rewards/accuracies": 0.6289498209953308, + "eval_rewards/chosen": -1.5908458232879639, + "eval_rewards/margins": 0.30717742443084717, + "eval_rewards/rejected": -1.8980233669281006, + "eval_runtime": 382.927, + "eval_samples_per_second": 11.24, + "eval_steps_per_second": 1.405, + "step": 13600 + }, + { + "epoch": 2.344934527911785, + "grad_norm": 56.389686584472656, + "learning_rate": 2.7671050944795494e-08, + "logits/chosen": -2.2281742095947266, + "logits/rejected": -2.187246561050415, + "logps/chosen": -214.6632537841797, + "logps/rejected": -280.9909362792969, + "loss": 0.5508, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6076910495758057, + "rewards/margins": 0.6706972122192383, + "rewards/rejected": -2.278388261795044, + "step": 13610 + }, + { + "epoch": 2.346657477601654, + "grad_norm": 27.47957420349121, + "learning_rate": 2.753275281658578e-08, + "logits/chosen": -2.1068856716156006, + "logits/rejected": -2.0513110160827637, + "logps/chosen": -219.8840789794922, + "logps/rejected": -288.0721740722656, + "loss": 0.4984, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6228272914886475, + "rewards/margins": 0.7473019361495972, + "rewards/rejected": -2.370129346847534, + "step": 13620 + }, + { + "epoch": 2.348380427291523, + "grad_norm": 36.49810791015625, + "learning_rate": 2.7394745963483414e-08, + "logits/chosen": -2.041313409805298, + "logits/rejected": -1.9859342575073242, + "logps/chosen": -224.9228973388672, + "logps/rejected": -303.43414306640625, + "loss": 0.4732, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6923065185546875, + "rewards/margins": 0.8486798405647278, + "rewards/rejected": -2.5409865379333496, + "step": 13630 + }, + { + "epoch": 2.350103376981392, + "grad_norm": 33.58571243286133, + "learning_rate": 2.725703094019368e-08, + "logits/chosen": -2.050534725189209, + "logits/rejected": -2.008087396621704, + "logps/chosen": -226.65194702148438, + "logps/rejected": -307.14324951171875, + "loss": 0.4801, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7088916301727295, + "rewards/margins": 0.8303858041763306, + "rewards/rejected": -2.5392775535583496, + "step": 13640 + }, + { + "epoch": 2.351826326671261, + "grad_norm": 39.161048889160156, + "learning_rate": 2.7119608300248842e-08, + "logits/chosen": -2.118753671646118, + "logits/rejected": -2.077744245529175, + "logps/chosen": -237.79550170898438, + "logps/rejected": -312.37933349609375, + "loss": 0.508, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.826664686203003, + "rewards/margins": 0.7895326018333435, + "rewards/rejected": -2.616197109222412, + "step": 13650 + }, + { + "epoch": 2.35354927636113, + "grad_norm": 46.64421463012695, + "learning_rate": 2.698247859600591e-08, + "logits/chosen": -2.0097432136535645, + "logits/rejected": -1.9697071313858032, + "logps/chosen": -232.4499969482422, + "logps/rejected": -302.5597229003906, + "loss": 0.5348, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7880842685699463, + "rewards/margins": 0.7070426940917969, + "rewards/rejected": -2.495126962661743, + "step": 13660 + }, + { + "epoch": 2.3552722260509995, + "grad_norm": 42.685482025146484, + "learning_rate": 2.6845642378644463e-08, + "logits/chosen": -2.0989060401916504, + "logits/rejected": -2.05808162689209, + "logps/chosen": -238.69668579101562, + "logps/rejected": -302.35272216796875, + "loss": 0.5426, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.8286670446395874, + "rewards/margins": 0.6564762592315674, + "rewards/rejected": -2.4851431846618652, + "step": 13670 + }, + { + "epoch": 2.3569951757408685, + "grad_norm": 40.78122329711914, + "learning_rate": 2.6709100198164513e-08, + "logits/chosen": -2.105421781539917, + "logits/rejected": -2.0629665851593018, + "logps/chosen": -236.335693359375, + "logps/rejected": -299.8169250488281, + "loss": 0.5282, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.8018970489501953, + "rewards/margins": 0.6912158727645874, + "rewards/rejected": -2.4931130409240723, + "step": 13680 + }, + { + "epoch": 2.3587181254307374, + "grad_norm": 33.260982513427734, + "learning_rate": 2.657285260338421e-08, + "logits/chosen": -2.078080892562866, + "logits/rejected": -2.0286943912506104, + "logps/chosen": -236.0628204345703, + "logps/rejected": -314.62322998046875, + "loss": 0.4971, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.814112901687622, + "rewards/margins": 0.7981823086738586, + "rewards/rejected": -2.612295150756836, + "step": 13690 + }, + { + "epoch": 2.3604410751206064, + "grad_norm": 58.762420654296875, + "learning_rate": 2.643690014193758e-08, + "logits/chosen": -2.072925090789795, + "logits/rejected": -2.030588150024414, + "logps/chosen": -245.0630340576172, + "logps/rejected": -304.95379638671875, + "loss": 0.5536, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.8534374237060547, + "rewards/margins": 0.6731254458427429, + "rewards/rejected": -2.5265629291534424, + "step": 13700 + }, + { + "epoch": 2.3621640248104754, + "grad_norm": 53.97275924682617, + "learning_rate": 2.6301243360272394e-08, + "logits/chosen": -2.022672176361084, + "logits/rejected": -1.9718831777572632, + "logps/chosen": -233.41122436523438, + "logps/rejected": -311.2149353027344, + "loss": 0.5107, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.787607192993164, + "rewards/margins": 0.790905773639679, + "rewards/rejected": -2.578512668609619, + "step": 13710 + }, + { + "epoch": 2.3638869745003444, + "grad_norm": 61.27710723876953, + "learning_rate": 2.6165882803648055e-08, + "logits/chosen": -2.044752597808838, + "logits/rejected": -2.001466751098633, + "logps/chosen": -232.3321533203125, + "logps/rejected": -300.99346923828125, + "loss": 0.5336, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7707017660140991, + "rewards/margins": 0.7224616408348083, + "rewards/rejected": -2.4931633472442627, + "step": 13720 + }, + { + "epoch": 2.3656099241902138, + "grad_norm": 73.16886901855469, + "learning_rate": 2.60308190161332e-08, + "logits/chosen": -2.161386489868164, + "logits/rejected": -2.1212000846862793, + "logps/chosen": -235.1083221435547, + "logps/rejected": -325.3297424316406, + "loss": 0.4772, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7882179021835327, + "rewards/margins": 0.8951144218444824, + "rewards/rejected": -2.6833324432373047, + "step": 13730 + }, + { + "epoch": 2.3673328738800827, + "grad_norm": 50.45926284790039, + "learning_rate": 2.5896052540603706e-08, + "logits/chosen": -2.141047477722168, + "logits/rejected": -2.095937490463257, + "logps/chosen": -246.432373046875, + "logps/rejected": -324.0269775390625, + "loss": 0.517, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8807487487792969, + "rewards/margins": 0.8315391540527344, + "rewards/rejected": -2.7122879028320312, + "step": 13740 + }, + { + "epoch": 2.3690558235699517, + "grad_norm": 41.474308013916016, + "learning_rate": 2.576158391874047e-08, + "logits/chosen": -2.1030590534210205, + "logits/rejected": -2.0579898357391357, + "logps/chosen": -246.07363891601562, + "logps/rejected": -331.3897399902344, + "loss": 0.5176, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8955215215682983, + "rewards/margins": 0.8814752697944641, + "rewards/rejected": -2.7769968509674072, + "step": 13750 + }, + { + "epoch": 2.3707787732598207, + "grad_norm": 52.19126510620117, + "learning_rate": 2.562741369102711e-08, + "logits/chosen": -2.1607160568237305, + "logits/rejected": -2.1174328327178955, + "logps/chosen": -229.7025146484375, + "logps/rejected": -289.3486328125, + "loss": 0.5784, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7426655292510986, + "rewards/margins": 0.6520506739616394, + "rewards/rejected": -2.394716262817383, + "step": 13760 + }, + { + "epoch": 2.37250172294969, + "grad_norm": 45.66209411621094, + "learning_rate": 2.549354239674786e-08, + "logits/chosen": -2.155529260635376, + "logits/rejected": -2.1266839504241943, + "logps/chosen": -226.15097045898438, + "logps/rejected": -297.9380798339844, + "loss": 0.5314, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.746569275856018, + "rewards/margins": 0.694832980632782, + "rewards/rejected": -2.441401958465576, + "step": 13770 + }, + { + "epoch": 2.374224672639559, + "grad_norm": 30.729534149169922, + "learning_rate": 2.5359970573985524e-08, + "logits/chosen": -2.2028872966766357, + "logits/rejected": -2.151646137237549, + "logps/chosen": -237.97409057617188, + "logps/rejected": -293.50445556640625, + "loss": 0.5552, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.779500961303711, + "rewards/margins": 0.6468724012374878, + "rewards/rejected": -2.4263734817504883, + "step": 13780 + }, + { + "epoch": 2.375947622329428, + "grad_norm": 33.8580207824707, + "learning_rate": 2.522669875961919e-08, + "logits/chosen": -2.11983060836792, + "logits/rejected": -2.080904722213745, + "logps/chosen": -215.71340942382812, + "logps/rejected": -279.94342041015625, + "loss": 0.5113, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5791537761688232, + "rewards/margins": 0.7006839513778687, + "rewards/rejected": -2.2798376083374023, + "step": 13790 + }, + { + "epoch": 2.377670572019297, + "grad_norm": 43.75486373901367, + "learning_rate": 2.509372748932195e-08, + "logits/chosen": -2.186190128326416, + "logits/rejected": -2.13720703125, + "logps/chosen": -213.9628448486328, + "logps/rejected": -287.0473937988281, + "loss": 0.4907, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5549368858337402, + "rewards/margins": 0.8049592971801758, + "rewards/rejected": -2.359896183013916, + "step": 13800 + }, + { + "epoch": 2.379393521709166, + "grad_norm": 49.993106842041016, + "learning_rate": 2.4961057297559064e-08, + "logits/chosen": -2.102524995803833, + "logits/rejected": -2.061375617980957, + "logps/chosen": -212.2505340576172, + "logps/rejected": -283.9283447265625, + "loss": 0.5364, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.581167459487915, + "rewards/margins": 0.7305034399032593, + "rewards/rejected": -2.3116707801818848, + "step": 13810 + }, + { + "epoch": 2.381116471399035, + "grad_norm": 39.11954879760742, + "learning_rate": 2.4828688717585567e-08, + "logits/chosen": -2.153839111328125, + "logits/rejected": -2.103940963745117, + "logps/chosen": -216.89474487304688, + "logps/rejected": -281.2050476074219, + "loss": 0.5263, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5603468418121338, + "rewards/margins": 0.7254433035850525, + "rewards/rejected": -2.285790205001831, + "step": 13820 + }, + { + "epoch": 2.3828394210889043, + "grad_norm": 41.82209396362305, + "learning_rate": 2.4696622281444158e-08, + "logits/chosen": -2.1807799339294434, + "logits/rejected": -2.154695987701416, + "logps/chosen": -209.79354858398438, + "logps/rejected": -268.2076416015625, + "loss": 0.5323, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5338456630706787, + "rewards/margins": 0.6318690180778503, + "rewards/rejected": -2.165714979171753, + "step": 13830 + }, + { + "epoch": 2.3845623707787733, + "grad_norm": 39.15401077270508, + "learning_rate": 2.4564858519963195e-08, + "logits/chosen": -2.1524453163146973, + "logits/rejected": -2.1096901893615723, + "logps/chosen": -218.0504150390625, + "logps/rejected": -271.6688537597656, + "loss": 0.5577, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6052839756011963, + "rewards/margins": 0.59880530834198, + "rewards/rejected": -2.204089641571045, + "step": 13840 + }, + { + "epoch": 2.3862853204686423, + "grad_norm": 38.87300109863281, + "learning_rate": 2.443339796275432e-08, + "logits/chosen": -2.078158140182495, + "logits/rejected": -2.0419223308563232, + "logps/chosen": -215.25033569335938, + "logps/rejected": -275.47967529296875, + "loss": 0.5611, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6021534204483032, + "rewards/margins": 0.6668301224708557, + "rewards/rejected": -2.2689836025238037, + "step": 13850 + }, + { + "epoch": 2.3880082701585112, + "grad_norm": 37.93467330932617, + "learning_rate": 2.4302241138210633e-08, + "logits/chosen": -2.0814151763916016, + "logits/rejected": -2.04856276512146, + "logps/chosen": -221.61465454101562, + "logps/rejected": -282.6617736816406, + "loss": 0.5406, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6941856145858765, + "rewards/margins": 0.6259894371032715, + "rewards/rejected": -2.3201751708984375, + "step": 13860 + }, + { + "epoch": 2.3897312198483807, + "grad_norm": 44.15261459350586, + "learning_rate": 2.417138857350428e-08, + "logits/chosen": -2.119511365890503, + "logits/rejected": -2.085003614425659, + "logps/chosen": -235.39614868164062, + "logps/rejected": -303.26483154296875, + "loss": 0.552, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7935514450073242, + "rewards/margins": 0.7201517820358276, + "rewards/rejected": -2.513702869415283, + "step": 13870 + }, + { + "epoch": 2.3914541695382496, + "grad_norm": 36.41903305053711, + "learning_rate": 2.404084079458457e-08, + "logits/chosen": -2.0601439476013184, + "logits/rejected": -2.0118136405944824, + "logps/chosen": -226.09365844726562, + "logps/rejected": -284.78387451171875, + "loss": 0.5702, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.706420660018921, + "rewards/margins": 0.6236202716827393, + "rewards/rejected": -2.3300411701202393, + "step": 13880 + }, + { + "epoch": 2.3931771192281186, + "grad_norm": 31.09129524230957, + "learning_rate": 2.3910598326175635e-08, + "logits/chosen": -2.155531644821167, + "logits/rejected": -2.1182315349578857, + "logps/chosen": -211.36474609375, + "logps/rejected": -272.09503173828125, + "loss": 0.5368, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5504828691482544, + "rewards/margins": 0.6465977430343628, + "rewards/rejected": -2.197080612182617, + "step": 13890 + }, + { + "epoch": 2.3949000689179876, + "grad_norm": 38.452693939208984, + "learning_rate": 2.3780661691774585e-08, + "logits/chosen": -2.0754923820495605, + "logits/rejected": -2.0319905281066895, + "logps/chosen": -206.372314453125, + "logps/rejected": -269.18389892578125, + "loss": 0.5489, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5245025157928467, + "rewards/margins": 0.6824948787689209, + "rewards/rejected": -2.2069973945617676, + "step": 13900 + }, + { + "epoch": 2.3966230186078565, + "grad_norm": 47.45440673828125, + "learning_rate": 2.3651031413649127e-08, + "logits/chosen": -2.1021831035614014, + "logits/rejected": -2.0657753944396973, + "logps/chosen": -202.1910400390625, + "logps/rejected": -255.5797576904297, + "loss": 0.5502, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.447701096534729, + "rewards/margins": 0.5892106890678406, + "rewards/rejected": -2.036911964416504, + "step": 13910 + }, + { + "epoch": 2.3983459682977255, + "grad_norm": 52.61918640136719, + "learning_rate": 2.3521708012835696e-08, + "logits/chosen": -2.1503491401672363, + "logits/rejected": -2.1002659797668457, + "logps/chosen": -216.20458984375, + "logps/rejected": -273.5358581542969, + "loss": 0.518, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5507729053497314, + "rewards/margins": 0.6722269654273987, + "rewards/rejected": -2.2229995727539062, + "step": 13920 + }, + { + "epoch": 2.400068917987595, + "grad_norm": 38.167327880859375, + "learning_rate": 2.3392692009137193e-08, + "logits/chosen": -2.1040003299713135, + "logits/rejected": -2.0707528591156006, + "logps/chosen": -205.02896118164062, + "logps/rejected": -252.3942413330078, + "loss": 0.603, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5288441181182861, + "rewards/margins": 0.4923536777496338, + "rewards/rejected": -2.02119779586792, + "step": 13930 + }, + { + "epoch": 2.401791867677464, + "grad_norm": 39.31071090698242, + "learning_rate": 2.3263983921120987e-08, + "logits/chosen": -2.0793838500976562, + "logits/rejected": -2.0337014198303223, + "logps/chosen": -198.89938354492188, + "logps/rejected": -282.0915832519531, + "loss": 0.5019, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.450226068496704, + "rewards/margins": 0.8140614628791809, + "rewards/rejected": -2.2642874717712402, + "step": 13940 + }, + { + "epoch": 2.403514817367333, + "grad_norm": 49.855323791503906, + "learning_rate": 2.3135584266116837e-08, + "logits/chosen": -2.140976667404175, + "logits/rejected": -2.103226661682129, + "logps/chosen": -216.03695678710938, + "logps/rejected": -277.74993896484375, + "loss": 0.5757, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6169942617416382, + "rewards/margins": 0.6376385688781738, + "rewards/rejected": -2.2546324729919434, + "step": 13950 + }, + { + "epoch": 2.405237767057202, + "grad_norm": 34.05168533325195, + "learning_rate": 2.3007493560214787e-08, + "logits/chosen": -2.008964776992798, + "logits/rejected": -1.994206428527832, + "logps/chosen": -208.091064453125, + "logps/rejected": -256.63726806640625, + "loss": 0.5781, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.53592848777771, + "rewards/margins": 0.5078008770942688, + "rewards/rejected": -2.043729305267334, + "step": 13960 + }, + { + "epoch": 2.406960716747071, + "grad_norm": 27.172494888305664, + "learning_rate": 2.2879712318263056e-08, + "logits/chosen": -2.1079671382904053, + "logits/rejected": -2.0630617141723633, + "logps/chosen": -207.2488250732422, + "logps/rejected": -272.19744873046875, + "loss": 0.5354, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4706203937530518, + "rewards/margins": 0.7252450585365295, + "rewards/rejected": -2.1958653926849365, + "step": 13970 + }, + { + "epoch": 2.40868366643694, + "grad_norm": 38.71217346191406, + "learning_rate": 2.2752241053865973e-08, + "logits/chosen": -2.0822973251342773, + "logits/rejected": -2.0517477989196777, + "logps/chosen": -209.283447265625, + "logps/rejected": -281.3548278808594, + "loss": 0.5157, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5362039804458618, + "rewards/margins": 0.7391260266304016, + "rewards/rejected": -2.275330066680908, + "step": 13980 + }, + { + "epoch": 2.410406616126809, + "grad_norm": 49.03131103515625, + "learning_rate": 2.2625080279382024e-08, + "logits/chosen": -2.108020067214966, + "logits/rejected": -2.0622098445892334, + "logps/chosen": -210.24856567382812, + "logps/rejected": -269.4635925292969, + "loss": 0.55, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5178747177124023, + "rewards/margins": 0.6414793729782104, + "rewards/rejected": -2.1593542098999023, + "step": 13990 + }, + { + "epoch": 2.412129565816678, + "grad_norm": 38.7979736328125, + "learning_rate": 2.249823050592169e-08, + "logits/chosen": -2.0372188091278076, + "logits/rejected": -1.9894564151763916, + "logps/chosen": -209.2309112548828, + "logps/rejected": -284.48480224609375, + "loss": 0.5055, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5764163732528687, + "rewards/margins": 0.7421399354934692, + "rewards/rejected": -2.318556070327759, + "step": 14000 + }, + { + "epoch": 2.412129565816678, + "eval_logits/chosen": -2.1856634616851807, + "eval_logits/rejected": -2.1664555072784424, + "eval_logps/chosen": -205.19786071777344, + "eval_logps/rejected": -239.03466796875, + "eval_loss": 0.6384242177009583, + "eval_rewards/accuracies": 0.6256970167160034, + "eval_rewards/chosen": -1.46182382106781, + "eval_rewards/margins": 0.3010266423225403, + "eval_rewards/rejected": -1.7628505229949951, + "eval_runtime": 383.0062, + "eval_samples_per_second": 11.237, + "eval_steps_per_second": 1.405, + "step": 14000 + }, + { + "epoch": 2.413852515506547, + "grad_norm": 31.980548858642578, + "learning_rate": 2.2371692243345354e-08, + "logits/chosen": -2.0730888843536377, + "logits/rejected": -2.044764280319214, + "logps/chosen": -214.8597869873047, + "logps/rejected": -277.0663146972656, + "loss": 0.5672, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5847432613372803, + "rewards/margins": 0.660110354423523, + "rewards/rejected": -2.2448537349700928, + "step": 14010 + }, + { + "epoch": 2.415575465196416, + "grad_norm": 36.432456970214844, + "learning_rate": 2.2245466000261394e-08, + "logits/chosen": -2.082345962524414, + "logits/rejected": -2.0584168434143066, + "logps/chosen": -218.6758575439453, + "logps/rejected": -274.13787841796875, + "loss": 0.5691, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.631087303161621, + "rewards/margins": 0.6045592427253723, + "rewards/rejected": -2.2356464862823486, + "step": 14020 + }, + { + "epoch": 2.4172984148862855, + "grad_norm": 40.95353698730469, + "learning_rate": 2.211955228402399e-08, + "logits/chosen": -2.0695528984069824, + "logits/rejected": -2.0302963256835938, + "logps/chosen": -220.57418823242188, + "logps/rejected": -279.3653259277344, + "loss": 0.5479, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6627451181411743, + "rewards/margins": 0.6441237926483154, + "rewards/rejected": -2.3068687915802, + "step": 14030 + }, + { + "epoch": 2.4190213645761545, + "grad_norm": 43.107215881347656, + "learning_rate": 2.1993951600731154e-08, + "logits/chosen": -2.075979709625244, + "logits/rejected": -2.0187344551086426, + "logps/chosen": -218.7676239013672, + "logps/rejected": -283.742431640625, + "loss": 0.512, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6136515140533447, + "rewards/margins": 0.7331534624099731, + "rewards/rejected": -2.3468048572540283, + "step": 14040 + }, + { + "epoch": 2.4207443142660234, + "grad_norm": 32.694793701171875, + "learning_rate": 2.186866445522273e-08, + "logits/chosen": -2.1185286045074463, + "logits/rejected": -2.072413682937622, + "logps/chosen": -202.73074340820312, + "logps/rejected": -259.3929443359375, + "loss": 0.5442, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4719576835632324, + "rewards/margins": 0.622622013092041, + "rewards/rejected": -2.0945794582366943, + "step": 14050 + }, + { + "epoch": 2.4224672639558924, + "grad_norm": 36.23963165283203, + "learning_rate": 2.1743691351078332e-08, + "logits/chosen": -2.129517078399658, + "logits/rejected": -2.070852756500244, + "logps/chosen": -210.60415649414062, + "logps/rejected": -298.68585205078125, + "loss": 0.4587, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.535704255104065, + "rewards/margins": 0.9426101446151733, + "rewards/rejected": -2.4783143997192383, + "step": 14060 + }, + { + "epoch": 2.4241902136457614, + "grad_norm": 43.657432556152344, + "learning_rate": 2.161903279061529e-08, + "logits/chosen": -2.0908563137054443, + "logits/rejected": -2.0465846061706543, + "logps/chosen": -226.6675262451172, + "logps/rejected": -300.82196044921875, + "loss": 0.5151, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7071689367294312, + "rewards/margins": 0.7603108882904053, + "rewards/rejected": -2.467479944229126, + "step": 14070 + }, + { + "epoch": 2.425913163335631, + "grad_norm": 37.620723724365234, + "learning_rate": 2.14946892748866e-08, + "logits/chosen": -2.040656805038452, + "logits/rejected": -1.9950001239776611, + "logps/chosen": -241.02963256835938, + "logps/rejected": -301.5330505371094, + "loss": 0.5648, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.8209514617919922, + "rewards/margins": 0.6740165948867798, + "rewards/rejected": -2.4949679374694824, + "step": 14080 + }, + { + "epoch": 2.4276361130254998, + "grad_norm": 43.765499114990234, + "learning_rate": 2.1370661303679084e-08, + "logits/chosen": -2.074293613433838, + "logits/rejected": -2.0272514820098877, + "logps/chosen": -216.4875946044922, + "logps/rejected": -272.84417724609375, + "loss": 0.5578, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.590613603591919, + "rewards/margins": 0.6262162327766418, + "rewards/rejected": -2.216829776763916, + "step": 14090 + }, + { + "epoch": 2.4293590627153687, + "grad_norm": 25.765214920043945, + "learning_rate": 2.1246949375511214e-08, + "logits/chosen": -2.1253323554992676, + "logits/rejected": -2.080301523208618, + "logps/chosen": -219.0335235595703, + "logps/rejected": -295.12152099609375, + "loss": 0.4907, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5946428775787354, + "rewards/margins": 0.8154427409172058, + "rewards/rejected": -2.4100852012634277, + "step": 14100 + }, + { + "epoch": 2.4310820124052377, + "grad_norm": 49.0091552734375, + "learning_rate": 2.1123553987631126e-08, + "logits/chosen": -2.126990795135498, + "logits/rejected": -2.101468801498413, + "logps/chosen": -217.9353790283203, + "logps/rejected": -278.48260498046875, + "loss": 0.5529, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6728681325912476, + "rewards/margins": 0.6070038676261902, + "rewards/rejected": -2.279871940612793, + "step": 14110 + }, + { + "epoch": 2.4328049620951067, + "grad_norm": 62.83906555175781, + "learning_rate": 2.1000475636014635e-08, + "logits/chosen": -2.1041083335876465, + "logits/rejected": -2.0625221729278564, + "logps/chosen": -219.5806427001953, + "logps/rejected": -277.6369934082031, + "loss": 0.5599, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.6661217212677002, + "rewards/margins": 0.5916668176651001, + "rewards/rejected": -2.2577884197235107, + "step": 14120 + }, + { + "epoch": 2.4345279117849756, + "grad_norm": 50.54969787597656, + "learning_rate": 2.0877714815363366e-08, + "logits/chosen": -2.13179349899292, + "logits/rejected": -2.097782611846924, + "logps/chosen": -213.8779296875, + "logps/rejected": -265.7869567871094, + "loss": 0.5524, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5833685398101807, + "rewards/margins": 0.5597018003463745, + "rewards/rejected": -2.1430704593658447, + "step": 14130 + }, + { + "epoch": 2.436250861474845, + "grad_norm": 48.95522689819336, + "learning_rate": 2.0755272019102542e-08, + "logits/chosen": -2.1710026264190674, + "logits/rejected": -2.132071018218994, + "logps/chosen": -228.41012573242188, + "logps/rejected": -292.615966796875, + "loss": 0.5522, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7536594867706299, + "rewards/margins": 0.6659385561943054, + "rewards/rejected": -2.419598340988159, + "step": 14140 + }, + { + "epoch": 2.437973811164714, + "grad_norm": 54.72209930419922, + "learning_rate": 2.063314773937921e-08, + "logits/chosen": -2.1553657054901123, + "logits/rejected": -2.120232343673706, + "logps/chosen": -224.70114135742188, + "logps/rejected": -288.44012451171875, + "loss": 0.5617, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6969101428985596, + "rewards/margins": 0.641015887260437, + "rewards/rejected": -2.337926149368286, + "step": 14150 + }, + { + "epoch": 2.439696760854583, + "grad_norm": 36.77251434326172, + "learning_rate": 2.051134246706008e-08, + "logits/chosen": -2.1020195484161377, + "logits/rejected": -2.0639472007751465, + "logps/chosen": -217.10897827148438, + "logps/rejected": -275.93084716796875, + "loss": 0.5907, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6270267963409424, + "rewards/margins": 0.5740610361099243, + "rewards/rejected": -2.2010879516601562, + "step": 14160 + }, + { + "epoch": 2.441419710544452, + "grad_norm": 38.20256423950195, + "learning_rate": 2.0389856691729734e-08, + "logits/chosen": -2.0549514293670654, + "logits/rejected": -2.01374888420105, + "logps/chosen": -209.72048950195312, + "logps/rejected": -270.4997863769531, + "loss": 0.5582, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5325407981872559, + "rewards/margins": 0.6379245519638062, + "rewards/rejected": -2.1704652309417725, + "step": 14170 + }, + { + "epoch": 2.4431426602343214, + "grad_norm": 58.34638595581055, + "learning_rate": 2.026869090168849e-08, + "logits/chosen": -2.117248058319092, + "logits/rejected": -2.068472385406494, + "logps/chosen": -220.41830444335938, + "logps/rejected": -269.4673767089844, + "loss": 0.5768, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6232048273086548, + "rewards/margins": 0.5637925267219543, + "rewards/rejected": -2.186997413635254, + "step": 14180 + }, + { + "epoch": 2.4448656099241903, + "grad_norm": 46.98957824707031, + "learning_rate": 2.0147845583950552e-08, + "logits/chosen": -2.1380128860473633, + "logits/rejected": -2.101867198944092, + "logps/chosen": -221.2222900390625, + "logps/rejected": -279.1551208496094, + "loss": 0.5456, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6172845363616943, + "rewards/margins": 0.6354303359985352, + "rewards/rejected": -2.2527148723602295, + "step": 14190 + }, + { + "epoch": 2.4465885596140593, + "grad_norm": 33.68207550048828, + "learning_rate": 2.0027321224242067e-08, + "logits/chosen": -2.0417656898498535, + "logits/rejected": -2.0019569396972656, + "logps/chosen": -204.66732788085938, + "logps/rejected": -285.6546936035156, + "loss": 0.4812, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.4978091716766357, + "rewards/margins": 0.8225471377372742, + "rewards/rejected": -2.3203561305999756, + "step": 14200 + }, + { + "epoch": 2.4483115093039283, + "grad_norm": 56.56424331665039, + "learning_rate": 1.9907118306999017e-08, + "logits/chosen": -2.112985610961914, + "logits/rejected": -2.0740582942962646, + "logps/chosen": -222.14852905273438, + "logps/rejected": -287.42132568359375, + "loss": 0.5402, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6827729940414429, + "rewards/margins": 0.6711798906326294, + "rewards/rejected": -2.3539528846740723, + "step": 14210 + }, + { + "epoch": 2.4500344589937972, + "grad_norm": 33.83453369140625, + "learning_rate": 1.9787237315365424e-08, + "logits/chosen": -2.164368152618408, + "logits/rejected": -2.113048791885376, + "logps/chosen": -226.94155883789062, + "logps/rejected": -297.2632751464844, + "loss": 0.5128, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6764634847640991, + "rewards/margins": 0.750978410243988, + "rewards/rejected": -2.4274418354034424, + "step": 14220 + }, + { + "epoch": 2.451757408683666, + "grad_norm": 39.73827362060547, + "learning_rate": 1.9667678731191373e-08, + "logits/chosen": -2.028599262237549, + "logits/rejected": -1.974353551864624, + "logps/chosen": -228.91439819335938, + "logps/rejected": -302.0907897949219, + "loss": 0.5124, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7156779766082764, + "rewards/margins": 0.795965313911438, + "rewards/rejected": -2.511643171310425, + "step": 14230 + }, + { + "epoch": 2.4534803583735356, + "grad_norm": 48.25334548950195, + "learning_rate": 1.9548443035031125e-08, + "logits/chosen": -2.028688430786133, + "logits/rejected": -1.9942216873168945, + "logps/chosen": -236.4214630126953, + "logps/rejected": -316.0946960449219, + "loss": 0.5161, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.8170276880264282, + "rewards/margins": 0.8088165521621704, + "rewards/rejected": -2.6258442401885986, + "step": 14240 + }, + { + "epoch": 2.4552033080634046, + "grad_norm": 44.34630584716797, + "learning_rate": 1.942953070614094e-08, + "logits/chosen": -2.0448529720306396, + "logits/rejected": -2.0084269046783447, + "logps/chosen": -234.31857299804688, + "logps/rejected": -294.1382141113281, + "loss": 0.551, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.7902448177337646, + "rewards/margins": 0.6250909566879272, + "rewards/rejected": -2.4153358936309814, + "step": 14250 + }, + { + "epoch": 2.4569262577532736, + "grad_norm": 63.74422073364258, + "learning_rate": 1.93109422224775e-08, + "logits/chosen": -2.1369385719299316, + "logits/rejected": -2.0834081172943115, + "logps/chosen": -230.72824096679688, + "logps/rejected": -293.1272888183594, + "loss": 0.5428, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7284284830093384, + "rewards/margins": 0.7105134725570679, + "rewards/rejected": -2.4389424324035645, + "step": 14260 + }, + { + "epoch": 2.4586492074431425, + "grad_norm": 38.61103057861328, + "learning_rate": 1.9192678060695812e-08, + "logits/chosen": -2.1000282764434814, + "logits/rejected": -2.0557782649993896, + "logps/chosen": -233.59414672851562, + "logps/rejected": -309.5748291015625, + "loss": 0.5062, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7813142538070679, + "rewards/margins": 0.8087684512138367, + "rewards/rejected": -2.590082883834839, + "step": 14270 + }, + { + "epoch": 2.460372157133012, + "grad_norm": 34.51493835449219, + "learning_rate": 1.9074738696147196e-08, + "logits/chosen": -2.0385901927948, + "logits/rejected": -2.0119054317474365, + "logps/chosen": -225.359619140625, + "logps/rejected": -284.545654296875, + "loss": 0.5792, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7285268306732178, + "rewards/margins": 0.5999918580055237, + "rewards/rejected": -2.3285186290740967, + "step": 14280 + }, + { + "epoch": 2.462095106822881, + "grad_norm": 35.47799301147461, + "learning_rate": 1.8957124602877618e-08, + "logits/chosen": -2.036486864089966, + "logits/rejected": -1.9845813512802124, + "logps/chosen": -228.3271942138672, + "logps/rejected": -290.8971862792969, + "loss": 0.5302, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6707541942596436, + "rewards/margins": 0.7225546836853027, + "rewards/rejected": -2.3933091163635254, + "step": 14290 + }, + { + "epoch": 2.46381805651275, + "grad_norm": 31.757747650146484, + "learning_rate": 1.8839836253625496e-08, + "logits/chosen": -2.1560089588165283, + "logits/rejected": -2.1177501678466797, + "logps/chosen": -210.2001190185547, + "logps/rejected": -289.1632385253906, + "loss": 0.4882, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5514459609985352, + "rewards/margins": 0.7843285202980042, + "rewards/rejected": -2.3357746601104736, + "step": 14300 + }, + { + "epoch": 2.465541006202619, + "grad_norm": 43.582733154296875, + "learning_rate": 1.872287411982011e-08, + "logits/chosen": -2.0736470222473145, + "logits/rejected": -2.032609701156616, + "logps/chosen": -228.1890106201172, + "logps/rejected": -295.87188720703125, + "loss": 0.5463, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7359241247177124, + "rewards/margins": 0.7118991613388062, + "rewards/rejected": -2.4478230476379395, + "step": 14310 + }, + { + "epoch": 2.467263955892488, + "grad_norm": 36.0502815246582, + "learning_rate": 1.860623867157941e-08, + "logits/chosen": -2.0633580684661865, + "logits/rejected": -2.0303797721862793, + "logps/chosen": -209.2544708251953, + "logps/rejected": -286.25189208984375, + "loss": 0.4936, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5319209098815918, + "rewards/margins": 0.784302294254303, + "rewards/rejected": -2.31622314453125, + "step": 14320 + }, + { + "epoch": 2.468986905582357, + "grad_norm": 39.22027587890625, + "learning_rate": 1.8489930377708372e-08, + "logits/chosen": -2.1764934062957764, + "logits/rejected": -2.118807315826416, + "logps/chosen": -228.25131225585938, + "logps/rejected": -319.8058776855469, + "loss": 0.4699, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.723565697669983, + "rewards/margins": 0.9397293329238892, + "rewards/rejected": -2.663294792175293, + "step": 14330 + }, + { + "epoch": 2.470709855272226, + "grad_norm": 49.67063903808594, + "learning_rate": 1.8373949705696934e-08, + "logits/chosen": -2.0703787803649902, + "logits/rejected": -2.028900384902954, + "logps/chosen": -231.64517211914062, + "logps/rejected": -319.1253662109375, + "loss": 0.4915, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.758962631225586, + "rewards/margins": 0.8747455477714539, + "rewards/rejected": -2.6337082386016846, + "step": 14340 + }, + { + "epoch": 2.472432804962095, + "grad_norm": 36.914669036865234, + "learning_rate": 1.8258297121718204e-08, + "logits/chosen": -2.1144473552703857, + "logits/rejected": -2.0762577056884766, + "logps/chosen": -224.0520782470703, + "logps/rejected": -297.4252014160156, + "loss": 0.5135, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6970233917236328, + "rewards/margins": 0.7429527044296265, + "rewards/rejected": -2.439976215362549, + "step": 14350 + }, + { + "epoch": 2.474155754651964, + "grad_norm": 52.473472595214844, + "learning_rate": 1.81429730906266e-08, + "logits/chosen": -2.0756735801696777, + "logits/rejected": -2.0378482341766357, + "logps/chosen": -248.0349578857422, + "logps/rejected": -300.68670654296875, + "loss": 0.6018, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.915371298789978, + "rewards/margins": 0.5735548734664917, + "rewards/rejected": -2.488926410675049, + "step": 14360 + }, + { + "epoch": 2.475878704341833, + "grad_norm": 40.73741912841797, + "learning_rate": 1.8027978075955953e-08, + "logits/chosen": -2.0995495319366455, + "logits/rejected": -2.0527689456939697, + "logps/chosen": -236.0800018310547, + "logps/rejected": -303.43731689453125, + "loss": 0.5193, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.79816472530365, + "rewards/margins": 0.7360619902610779, + "rewards/rejected": -2.534226417541504, + "step": 14370 + }, + { + "epoch": 2.4776016540317025, + "grad_norm": 36.18812561035156, + "learning_rate": 1.7913312539917624e-08, + "logits/chosen": -2.168733596801758, + "logits/rejected": -2.128675937652588, + "logps/chosen": -233.74838256835938, + "logps/rejected": -306.06292724609375, + "loss": 0.5068, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8042490482330322, + "rewards/margins": 0.7510305643081665, + "rewards/rejected": -2.555279493331909, + "step": 14380 + }, + { + "epoch": 2.4793246037215715, + "grad_norm": 52.489402770996094, + "learning_rate": 1.7798976943398623e-08, + "logits/chosen": -2.078742504119873, + "logits/rejected": -2.0347018241882324, + "logps/chosen": -230.8499755859375, + "logps/rejected": -311.3640441894531, + "loss": 0.5033, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7754895687103271, + "rewards/margins": 0.8343890309333801, + "rewards/rejected": -2.6098785400390625, + "step": 14390 + }, + { + "epoch": 2.4810475534114405, + "grad_norm": 52.65116882324219, + "learning_rate": 1.7684971745959887e-08, + "logits/chosen": -2.1193552017211914, + "logits/rejected": -2.0765717029571533, + "logps/chosen": -232.67263793945312, + "logps/rejected": -301.19561767578125, + "loss": 0.5404, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7463268041610718, + "rewards/margins": 0.7748914957046509, + "rewards/rejected": -2.5212185382843018, + "step": 14400 + }, + { + "epoch": 2.4810475534114405, + "eval_logits/chosen": -2.161259889602661, + "eval_logits/rejected": -2.141052007675171, + "eval_logps/chosen": -224.158935546875, + "eval_logps/rejected": -260.64886474609375, + "eval_loss": 0.6405364871025085, + "eval_rewards/accuracies": 0.6284851431846619, + "eval_rewards/chosen": -1.6514345407485962, + "eval_rewards/margins": 0.32755807042121887, + "eval_rewards/rejected": -1.9789925813674927, + "eval_runtime": 383.2726, + "eval_samples_per_second": 11.23, + "eval_steps_per_second": 1.404, + "step": 14400 + }, + { + "epoch": 2.4827705031013094, + "grad_norm": 35.988075256347656, + "learning_rate": 1.7571297405834328e-08, + "logits/chosen": -2.082426071166992, + "logits/rejected": -2.0395476818084717, + "logps/chosen": -219.3708953857422, + "logps/rejected": -296.60821533203125, + "loss": 0.5272, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6755558252334595, + "rewards/margins": 0.7709904909133911, + "rewards/rejected": -2.4465460777282715, + "step": 14410 + }, + { + "epoch": 2.4844934527911784, + "grad_norm": 33.371829986572266, + "learning_rate": 1.7457954379924967e-08, + "logits/chosen": -2.1155333518981934, + "logits/rejected": -2.0812180042266846, + "logps/chosen": -230.984130859375, + "logps/rejected": -296.2799377441406, + "loss": 0.571, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.782016396522522, + "rewards/margins": 0.6671894788742065, + "rewards/rejected": -2.4492058753967285, + "step": 14420 + }, + { + "epoch": 2.4862164024810474, + "grad_norm": 63.244773864746094, + "learning_rate": 1.7344943123803126e-08, + "logits/chosen": -2.0925819873809814, + "logits/rejected": -2.0567777156829834, + "logps/chosen": -221.43994140625, + "logps/rejected": -288.7841491699219, + "loss": 0.5606, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6841884851455688, + "rewards/margins": 0.7082483768463135, + "rewards/rejected": -2.3924367427825928, + "step": 14430 + }, + { + "epoch": 2.4879393521709168, + "grad_norm": 36.74225616455078, + "learning_rate": 1.7232264091706682e-08, + "logits/chosen": -2.0929274559020996, + "logits/rejected": -2.0496408939361572, + "logps/chosen": -210.4450225830078, + "logps/rejected": -288.16119384765625, + "loss": 0.4963, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5563026666641235, + "rewards/margins": 0.7918773889541626, + "rewards/rejected": -2.3481802940368652, + "step": 14440 + }, + { + "epoch": 2.4896623018607857, + "grad_norm": 35.94378662109375, + "learning_rate": 1.7119917736538115e-08, + "logits/chosen": -2.0818235874176025, + "logits/rejected": -2.0427815914154053, + "logps/chosen": -227.7500762939453, + "logps/rejected": -294.5572204589844, + "loss": 0.5141, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7150242328643799, + "rewards/margins": 0.7176655530929565, + "rewards/rejected": -2.432689666748047, + "step": 14450 + }, + { + "epoch": 2.4913852515506547, + "grad_norm": 31.527305603027344, + "learning_rate": 1.700790450986276e-08, + "logits/chosen": -2.0948452949523926, + "logits/rejected": -2.061279058456421, + "logps/chosen": -221.8003387451172, + "logps/rejected": -284.5101013183594, + "loss": 0.5474, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6733448505401611, + "rewards/margins": 0.6444646120071411, + "rewards/rejected": -2.3178091049194336, + "step": 14460 + }, + { + "epoch": 2.4931082012405237, + "grad_norm": 39.6318244934082, + "learning_rate": 1.6896224861907004e-08, + "logits/chosen": -2.1865909099578857, + "logits/rejected": -2.1406912803649902, + "logps/chosen": -232.9261474609375, + "logps/rejected": -295.4991760253906, + "loss": 0.5054, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7305749654769897, + "rewards/margins": 0.7335001230239868, + "rewards/rejected": -2.4640750885009766, + "step": 14470 + }, + { + "epoch": 2.4948311509303926, + "grad_norm": 31.915298461914062, + "learning_rate": 1.6784879241556395e-08, + "logits/chosen": -2.1030077934265137, + "logits/rejected": -2.082937002182007, + "logps/chosen": -222.57131958007812, + "logps/rejected": -295.8778991699219, + "loss": 0.5156, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6824105978012085, + "rewards/margins": 0.7339216470718384, + "rewards/rejected": -2.416332721710205, + "step": 14480 + }, + { + "epoch": 2.496554100620262, + "grad_norm": 39.143375396728516, + "learning_rate": 1.667386809635387e-08, + "logits/chosen": -2.0768282413482666, + "logits/rejected": -2.0459749698638916, + "logps/chosen": -228.5738983154297, + "logps/rejected": -297.850341796875, + "loss": 0.5533, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.7719007730484009, + "rewards/margins": 0.6979681849479675, + "rewards/rejected": -2.4698688983917236, + "step": 14490 + }, + { + "epoch": 2.498277050310131, + "grad_norm": 33.62594985961914, + "learning_rate": 1.6563191872498062e-08, + "logits/chosen": -2.097094774246216, + "logits/rejected": -2.038964033126831, + "logps/chosen": -218.2742462158203, + "logps/rejected": -294.4154968261719, + "loss": 0.4801, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6070648431777954, + "rewards/margins": 0.8043497204780579, + "rewards/rejected": -2.411414623260498, + "step": 14500 + }, + { + "epoch": 2.5, + "grad_norm": 41.214996337890625, + "learning_rate": 1.6452851014841374e-08, + "logits/chosen": -2.1380131244659424, + "logits/rejected": -2.1041531562805176, + "logps/chosen": -228.376220703125, + "logps/rejected": -275.10205078125, + "loss": 0.5938, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7192304134368896, + "rewards/margins": 0.5255845785140991, + "rewards/rejected": -2.2448153495788574, + "step": 14510 + }, + { + "epoch": 2.501722949689869, + "grad_norm": 46.394371032714844, + "learning_rate": 1.634284596688823e-08, + "logits/chosen": -2.090603828430176, + "logits/rejected": -2.0525267124176025, + "logps/chosen": -226.4547119140625, + "logps/rejected": -286.3438720703125, + "loss": 0.5721, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.696892499923706, + "rewards/margins": 0.6256784796714783, + "rewards/rejected": -2.322571039199829, + "step": 14520 + }, + { + "epoch": 2.503445899379738, + "grad_norm": 30.314685821533203, + "learning_rate": 1.623317717079328e-08, + "logits/chosen": -2.1235809326171875, + "logits/rejected": -2.0845789909362793, + "logps/chosen": -225.544677734375, + "logps/rejected": -293.58978271484375, + "loss": 0.5188, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6724278926849365, + "rewards/margins": 0.7151906490325928, + "rewards/rejected": -2.3876185417175293, + "step": 14530 + }, + { + "epoch": 2.505168849069607, + "grad_norm": 35.82331848144531, + "learning_rate": 1.6123845067359676e-08, + "logits/chosen": -2.0943241119384766, + "logits/rejected": -2.0486671924591064, + "logps/chosen": -215.40835571289062, + "logps/rejected": -291.39263916015625, + "loss": 0.5088, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6005795001983643, + "rewards/margins": 0.7979137301445007, + "rewards/rejected": -2.3984932899475098, + "step": 14540 + }, + { + "epoch": 2.5068917987594763, + "grad_norm": 33.8138427734375, + "learning_rate": 1.6014850096037304e-08, + "logits/chosen": -2.1238183975219727, + "logits/rejected": -2.071692705154419, + "logps/chosen": -212.27365112304688, + "logps/rejected": -289.61138916015625, + "loss": 0.5027, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5402295589447021, + "rewards/margins": 0.8287081718444824, + "rewards/rejected": -2.3689382076263428, + "step": 14550 + }, + { + "epoch": 2.5086147484493453, + "grad_norm": 37.8752555847168, + "learning_rate": 1.5906192694920883e-08, + "logits/chosen": -2.078948736190796, + "logits/rejected": -2.0395030975341797, + "logps/chosen": -222.35928344726562, + "logps/rejected": -295.3717346191406, + "loss": 0.5384, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6899096965789795, + "rewards/margins": 0.7251261472702026, + "rewards/rejected": -2.4150357246398926, + "step": 14560 + }, + { + "epoch": 2.5103376981392143, + "grad_norm": 32.014007568359375, + "learning_rate": 1.5797873300748355e-08, + "logits/chosen": -2.035325527191162, + "logits/rejected": -2.0111773014068604, + "logps/chosen": -216.21414184570312, + "logps/rejected": -283.3451232910156, + "loss": 0.5642, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6279312372207642, + "rewards/margins": 0.6702073812484741, + "rewards/rejected": -2.2981388568878174, + "step": 14570 + }, + { + "epoch": 2.5120606478290832, + "grad_norm": 43.30143737792969, + "learning_rate": 1.5689892348899103e-08, + "logits/chosen": -2.128683567047119, + "logits/rejected": -2.0904922485351562, + "logps/chosen": -214.5843963623047, + "logps/rejected": -279.8626403808594, + "loss": 0.5395, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6395725011825562, + "rewards/margins": 0.657935619354248, + "rewards/rejected": -2.2975080013275146, + "step": 14580 + }, + { + "epoch": 2.5137835975189526, + "grad_norm": 37.98158645629883, + "learning_rate": 1.5582250273392107e-08, + "logits/chosen": -2.0742502212524414, + "logits/rejected": -2.0438551902770996, + "logps/chosen": -211.3285369873047, + "logps/rejected": -276.2047424316406, + "loss": 0.5419, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.574963927268982, + "rewards/margins": 0.6697554588317871, + "rewards/rejected": -2.2447195053100586, + "step": 14590 + }, + { + "epoch": 2.5155065472088216, + "grad_norm": 52.75736618041992, + "learning_rate": 1.547494750688435e-08, + "logits/chosen": -2.091930627822876, + "logits/rejected": -2.038133144378662, + "logps/chosen": -220.406494140625, + "logps/rejected": -295.84893798828125, + "loss": 0.4755, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6465896368026733, + "rewards/margins": 0.8058226704597473, + "rewards/rejected": -2.4524121284484863, + "step": 14600 + }, + { + "epoch": 2.5172294968986906, + "grad_norm": 71.89414978027344, + "learning_rate": 1.5367984480668884e-08, + "logits/chosen": -2.066631317138672, + "logits/rejected": -2.019594669342041, + "logps/chosen": -223.1024169921875, + "logps/rejected": -288.41741943359375, + "loss": 0.5025, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6547962427139282, + "rewards/margins": 0.7386652827262878, + "rewards/rejected": -2.3934614658355713, + "step": 14610 + }, + { + "epoch": 2.5189524465885595, + "grad_norm": 40.280296325683594, + "learning_rate": 1.526136162467333e-08, + "logits/chosen": -2.042661190032959, + "logits/rejected": -2.0123565196990967, + "logps/chosen": -239.93679809570312, + "logps/rejected": -309.1552429199219, + "loss": 0.5739, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.8766295909881592, + "rewards/margins": 0.6878591775894165, + "rewards/rejected": -2.564488649368286, + "step": 14620 + }, + { + "epoch": 2.5206753962784285, + "grad_norm": 38.62200164794922, + "learning_rate": 1.5155079367457925e-08, + "logits/chosen": -2.0123703479766846, + "logits/rejected": -1.9841398000717163, + "logps/chosen": -232.4508056640625, + "logps/rejected": -293.60650634765625, + "loss": 0.5437, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.777007818222046, + "rewards/margins": 0.6375226974487305, + "rewards/rejected": -2.4145302772521973, + "step": 14630 + }, + { + "epoch": 2.5223983459682975, + "grad_norm": 28.682510375976562, + "learning_rate": 1.5049138136213968e-08, + "logits/chosen": -2.040053606033325, + "logits/rejected": -2.0036227703094482, + "logps/chosen": -226.60018920898438, + "logps/rejected": -306.9270324707031, + "loss": 0.5403, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7411445379257202, + "rewards/margins": 0.8110846281051636, + "rewards/rejected": -2.552229642868042, + "step": 14640 + }, + { + "epoch": 2.524121295658167, + "grad_norm": 99.67330169677734, + "learning_rate": 1.4943538356762065e-08, + "logits/chosen": -2.0883889198303223, + "logits/rejected": -2.0557003021240234, + "logps/chosen": -248.0118408203125, + "logps/rejected": -296.2115783691406, + "loss": 0.6158, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.9309829473495483, + "rewards/margins": 0.5404241681098938, + "rewards/rejected": -2.471407413482666, + "step": 14650 + }, + { + "epoch": 2.525844245348036, + "grad_norm": 43.54148483276367, + "learning_rate": 1.4838280453550234e-08, + "logits/chosen": -2.0526440143585205, + "logits/rejected": -1.996791124343872, + "logps/chosen": -229.3895263671875, + "logps/rejected": -313.713134765625, + "loss": 0.46, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.7467634677886963, + "rewards/margins": 0.8816523551940918, + "rewards/rejected": -2.628415822982788, + "step": 14660 + }, + { + "epoch": 2.527567195037905, + "grad_norm": 32.807762145996094, + "learning_rate": 1.4733364849652518e-08, + "logits/chosen": -2.005056619644165, + "logits/rejected": -1.9649536609649658, + "logps/chosen": -218.852783203125, + "logps/rejected": -297.0928039550781, + "loss": 0.4897, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6662321090698242, + "rewards/margins": 0.7909852266311646, + "rewards/rejected": -2.4572174549102783, + "step": 14670 + }, + { + "epoch": 2.529290144727774, + "grad_norm": 53.55386734008789, + "learning_rate": 1.4628791966767095e-08, + "logits/chosen": -2.063568353652954, + "logits/rejected": -2.027937173843384, + "logps/chosen": -231.7452392578125, + "logps/rejected": -297.2411804199219, + "loss": 0.5546, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.8246181011199951, + "rewards/margins": 0.6291089057922363, + "rewards/rejected": -2.4537270069122314, + "step": 14680 + }, + { + "epoch": 2.531013094417643, + "grad_norm": 68.92058563232422, + "learning_rate": 1.4524562225214532e-08, + "logits/chosen": -2.069715976715088, + "logits/rejected": -2.0306973457336426, + "logps/chosen": -253.0674285888672, + "logps/rejected": -330.63787841796875, + "loss": 0.5521, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.0037357807159424, + "rewards/margins": 0.7751157283782959, + "rewards/rejected": -2.7788515090942383, + "step": 14690 + }, + { + "epoch": 2.532736044107512, + "grad_norm": 36.464759826660156, + "learning_rate": 1.4420676043936198e-08, + "logits/chosen": -2.1349639892578125, + "logits/rejected": -2.0926318168640137, + "logps/chosen": -251.04220581054688, + "logps/rejected": -341.8655090332031, + "loss": 0.5306, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.9554615020751953, + "rewards/margins": 0.9042149782180786, + "rewards/rejected": -2.8596763610839844, + "step": 14700 + }, + { + "epoch": 2.534458993797381, + "grad_norm": 45.3137321472168, + "learning_rate": 1.4317133840492612e-08, + "logits/chosen": -2.088095188140869, + "logits/rejected": -2.06190824508667, + "logps/chosen": -229.5307159423828, + "logps/rejected": -297.6787109375, + "loss": 0.5367, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.749803900718689, + "rewards/margins": 0.6973880529403687, + "rewards/rejected": -2.4471919536590576, + "step": 14710 + }, + { + "epoch": 2.53618194348725, + "grad_norm": 46.88576889038086, + "learning_rate": 1.4213936031061691e-08, + "logits/chosen": -2.0573620796203613, + "logits/rejected": -2.0041470527648926, + "logps/chosen": -236.66488647460938, + "logps/rejected": -316.1903381347656, + "loss": 0.5133, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7825686931610107, + "rewards/margins": 0.8490725755691528, + "rewards/rejected": -2.631641387939453, + "step": 14720 + }, + { + "epoch": 2.537904893177119, + "grad_norm": 50.461631774902344, + "learning_rate": 1.411108303043701e-08, + "logits/chosen": -2.126465082168579, + "logits/rejected": -2.077259063720703, + "logps/chosen": -227.8856964111328, + "logps/rejected": -303.0856018066406, + "loss": 0.5361, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.73442804813385, + "rewards/margins": 0.7996103167533875, + "rewards/rejected": -2.5340380668640137, + "step": 14730 + }, + { + "epoch": 2.539627842866988, + "grad_norm": 49.08579635620117, + "learning_rate": 1.4008575252026334e-08, + "logits/chosen": -2.050816059112549, + "logits/rejected": -2.024536371231079, + "logps/chosen": -242.36447143554688, + "logps/rejected": -312.283935546875, + "loss": 0.5423, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8535900115966797, + "rewards/margins": 0.7273507714271545, + "rewards/rejected": -2.5809407234191895, + "step": 14740 + }, + { + "epoch": 2.5413507925568575, + "grad_norm": 43.996585845947266, + "learning_rate": 1.3906413107849757e-08, + "logits/chosen": -2.079751491546631, + "logits/rejected": -2.0369937419891357, + "logps/chosen": -225.7006378173828, + "logps/rejected": -294.62030029296875, + "loss": 0.5134, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.721297264099121, + "rewards/margins": 0.7374666333198547, + "rewards/rejected": -2.458763837814331, + "step": 14750 + }, + { + "epoch": 2.5430737422467264, + "grad_norm": 33.08069610595703, + "learning_rate": 1.3804597008538177e-08, + "logits/chosen": -2.1115968227386475, + "logits/rejected": -2.0671963691711426, + "logps/chosen": -226.8126983642578, + "logps/rejected": -305.046142578125, + "loss": 0.5026, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7297484874725342, + "rewards/margins": 0.7960547208786011, + "rewards/rejected": -2.525803327560425, + "step": 14760 + }, + { + "epoch": 2.5447966919365954, + "grad_norm": 64.5079116821289, + "learning_rate": 1.3703127363331556e-08, + "logits/chosen": -2.100311756134033, + "logits/rejected": -2.0607922077178955, + "logps/chosen": -237.541259765625, + "logps/rejected": -309.08929443359375, + "loss": 0.5523, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.808807134628296, + "rewards/margins": 0.7249511480331421, + "rewards/rejected": -2.5337581634521484, + "step": 14770 + }, + { + "epoch": 2.5465196416264644, + "grad_norm": 39.559322357177734, + "learning_rate": 1.3602004580077375e-08, + "logits/chosen": -2.064465045928955, + "logits/rejected": -2.0404582023620605, + "logps/chosen": -228.71432495117188, + "logps/rejected": -295.16485595703125, + "loss": 0.5589, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7813472747802734, + "rewards/margins": 0.6820805072784424, + "rewards/rejected": -2.463427782058716, + "step": 14780 + }, + { + "epoch": 2.548242591316334, + "grad_norm": 34.208404541015625, + "learning_rate": 1.3501229065228892e-08, + "logits/chosen": -2.1109213829040527, + "logits/rejected": -2.066493034362793, + "logps/chosen": -252.40908813476562, + "logps/rejected": -317.8335266113281, + "loss": 0.5796, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9827115535736084, + "rewards/margins": 0.6900705099105835, + "rewards/rejected": -2.6727821826934814, + "step": 14790 + }, + { + "epoch": 2.5499655410062028, + "grad_norm": 47.914310455322266, + "learning_rate": 1.3400801223843539e-08, + "logits/chosen": -2.076321840286255, + "logits/rejected": -2.044887065887451, + "logps/chosen": -236.44992065429688, + "logps/rejected": -312.14312744140625, + "loss": 0.5348, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8138248920440674, + "rewards/margins": 0.7549554705619812, + "rewards/rejected": -2.5687801837921143, + "step": 14800 + }, + { + "epoch": 2.5499655410062028, + "eval_logits/chosen": -2.157832384109497, + "eval_logits/rejected": -2.1374824047088623, + "eval_logps/chosen": -227.1384735107422, + "eval_logps/rejected": -263.64813232421875, + "eval_loss": 0.6417858004570007, + "eval_rewards/accuracies": 0.627555787563324, + "eval_rewards/chosen": -1.681229829788208, + "eval_rewards/margins": 0.32775557041168213, + "eval_rewards/rejected": -2.0089855194091797, + "eval_runtime": 383.3252, + "eval_samples_per_second": 11.228, + "eval_steps_per_second": 1.404, + "step": 14800 + }, + { + "epoch": 2.5516884906960717, + "grad_norm": 30.952611923217773, + "learning_rate": 1.3300721459581355e-08, + "logits/chosen": -2.113208293914795, + "logits/rejected": -2.0605287551879883, + "logps/chosen": -245.7742462158203, + "logps/rejected": -312.1253356933594, + "loss": 0.5285, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.8606784343719482, + "rewards/margins": 0.78578120470047, + "rewards/rejected": -2.6464600563049316, + "step": 14810 + }, + { + "epoch": 2.5534114403859407, + "grad_norm": 67.501953125, + "learning_rate": 1.3200990174703308e-08, + "logits/chosen": -2.2114782333374023, + "logits/rejected": -2.159426689147949, + "logps/chosen": -232.00704956054688, + "logps/rejected": -317.8869934082031, + "loss": 0.4664, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7742869853973389, + "rewards/margins": 0.8930938839912415, + "rewards/rejected": -2.6673808097839355, + "step": 14820 + }, + { + "epoch": 2.5551343900758097, + "grad_norm": 41.97214126586914, + "learning_rate": 1.3101607770069667e-08, + "logits/chosen": -2.0992329120635986, + "logits/rejected": -2.055812358856201, + "logps/chosen": -231.4944305419922, + "logps/rejected": -303.2246398925781, + "loss": 0.5352, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7502925395965576, + "rewards/margins": 0.7767872214317322, + "rewards/rejected": -2.5270798206329346, + "step": 14830 + }, + { + "epoch": 2.5568573397656786, + "grad_norm": 35.21275329589844, + "learning_rate": 1.3002574645138375e-08, + "logits/chosen": -2.1365883350372314, + "logits/rejected": -2.090890407562256, + "logps/chosen": -246.3368682861328, + "logps/rejected": -325.1856994628906, + "loss": 0.51, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.894221305847168, + "rewards/margins": 0.8003508448600769, + "rewards/rejected": -2.6945719718933105, + "step": 14840 + }, + { + "epoch": 2.558580289455548, + "grad_norm": 32.376625061035156, + "learning_rate": 1.2903891197963568e-08, + "logits/chosen": -2.076740026473999, + "logits/rejected": -2.0330796241760254, + "logps/chosen": -245.0225067138672, + "logps/rejected": -323.209228515625, + "loss": 0.5266, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9002012014389038, + "rewards/margins": 0.8007373809814453, + "rewards/rejected": -2.7009384632110596, + "step": 14850 + }, + { + "epoch": 2.560303239145417, + "grad_norm": 40.997806549072266, + "learning_rate": 1.2805557825193857e-08, + "logits/chosen": -2.054138660430908, + "logits/rejected": -2.024360179901123, + "logps/chosen": -238.13211059570312, + "logps/rejected": -314.7568359375, + "loss": 0.5651, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.8250688314437866, + "rewards/margins": 0.8130731582641602, + "rewards/rejected": -2.6381421089172363, + "step": 14860 + }, + { + "epoch": 2.562026188835286, + "grad_norm": 41.17534255981445, + "learning_rate": 1.2707574922070708e-08, + "logits/chosen": -2.1252520084381104, + "logits/rejected": -2.0850586891174316, + "logps/chosen": -237.7712860107422, + "logps/rejected": -298.0438232421875, + "loss": 0.6044, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7930355072021484, + "rewards/margins": 0.6633931398391724, + "rewards/rejected": -2.4564290046691895, + "step": 14870 + }, + { + "epoch": 2.563749138525155, + "grad_norm": 32.69382858276367, + "learning_rate": 1.2609942882426938e-08, + "logits/chosen": -2.0632481575012207, + "logits/rejected": -2.039973020553589, + "logps/chosen": -220.024658203125, + "logps/rejected": -292.401611328125, + "loss": 0.5123, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6605151891708374, + "rewards/margins": 0.7392647862434387, + "rewards/rejected": -2.399779796600342, + "step": 14880 + }, + { + "epoch": 2.5654720882150244, + "grad_norm": 50.100215911865234, + "learning_rate": 1.2512662098685144e-08, + "logits/chosen": -2.028979778289795, + "logits/rejected": -1.9955250024795532, + "logps/chosen": -230.2629852294922, + "logps/rejected": -301.06414794921875, + "loss": 0.5213, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7187559604644775, + "rewards/margins": 0.7263022661209106, + "rewards/rejected": -2.4450583457946777, + "step": 14890 + }, + { + "epoch": 2.5671950379048933, + "grad_norm": 57.26528549194336, + "learning_rate": 1.2415732961856006e-08, + "logits/chosen": -2.018132209777832, + "logits/rejected": -1.9759390354156494, + "logps/chosen": -217.76461791992188, + "logps/rejected": -286.56805419921875, + "loss": 0.5304, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6334377527236938, + "rewards/margins": 0.7253769636154175, + "rewards/rejected": -2.3588147163391113, + "step": 14900 + }, + { + "epoch": 2.5689179875947623, + "grad_norm": 34.72880935668945, + "learning_rate": 1.2319155861536867e-08, + "logits/chosen": -2.086937665939331, + "logits/rejected": -2.051175594329834, + "logps/chosen": -209.20523071289062, + "logps/rejected": -286.2922668457031, + "loss": 0.5023, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5768992900848389, + "rewards/margins": 0.7545540928840637, + "rewards/rejected": -2.331453561782837, + "step": 14910 + }, + { + "epoch": 2.5706409372846313, + "grad_norm": 30.486162185668945, + "learning_rate": 1.222293118591008e-08, + "logits/chosen": -2.0713205337524414, + "logits/rejected": -2.0423014163970947, + "logps/chosen": -224.8202667236328, + "logps/rejected": -304.6694641113281, + "loss": 0.5278, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7100989818572998, + "rewards/margins": 0.7935729026794434, + "rewards/rejected": -2.5036721229553223, + "step": 14920 + }, + { + "epoch": 2.5723638869745002, + "grad_norm": 51.449337005615234, + "learning_rate": 1.2127059321741417e-08, + "logits/chosen": -2.169572591781616, + "logits/rejected": -2.124962568283081, + "logps/chosen": -212.64041137695312, + "logps/rejected": -297.39984130859375, + "loss": 0.479, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5795478820800781, + "rewards/margins": 0.8730304837226868, + "rewards/rejected": -2.45257830619812, + "step": 14930 + }, + { + "epoch": 2.574086836664369, + "grad_norm": 55.8746452331543, + "learning_rate": 1.203154065437857e-08, + "logits/chosen": -2.09590482711792, + "logits/rejected": -2.0522971153259277, + "logps/chosen": -219.5468292236328, + "logps/rejected": -285.1246337890625, + "loss": 0.527, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6562507152557373, + "rewards/margins": 0.7033320665359497, + "rewards/rejected": -2.3595831394195557, + "step": 14940 + }, + { + "epoch": 2.575809786354238, + "grad_norm": 56.473838806152344, + "learning_rate": 1.1936375567749612e-08, + "logits/chosen": -2.1660001277923584, + "logits/rejected": -2.1302969455718994, + "logps/chosen": -234.9818572998047, + "logps/rejected": -291.95697021484375, + "loss": 0.5774, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.7821323871612549, + "rewards/margins": 0.6441555619239807, + "rewards/rejected": -2.426287889480591, + "step": 14950 + }, + { + "epoch": 2.5775327360441076, + "grad_norm": 39.60293197631836, + "learning_rate": 1.1841564444361496e-08, + "logits/chosen": -2.076197624206543, + "logits/rejected": -2.035750150680542, + "logps/chosen": -225.7178955078125, + "logps/rejected": -292.7468566894531, + "loss": 0.5525, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6837339401245117, + "rewards/margins": 0.704573392868042, + "rewards/rejected": -2.388307571411133, + "step": 14960 + }, + { + "epoch": 2.5792556857339766, + "grad_norm": 44.428306579589844, + "learning_rate": 1.1747107665298273e-08, + "logits/chosen": -2.1206603050231934, + "logits/rejected": -2.073085308074951, + "logps/chosen": -215.6715087890625, + "logps/rejected": -288.7075500488281, + "loss": 0.5348, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.607478380203247, + "rewards/margins": 0.7512115240097046, + "rewards/rejected": -2.358689785003662, + "step": 14970 + }, + { + "epoch": 2.5809786354238455, + "grad_norm": 42.954856872558594, + "learning_rate": 1.1653005610219913e-08, + "logits/chosen": -2.142565965652466, + "logits/rejected": -2.084108352661133, + "logps/chosen": -220.28713989257812, + "logps/rejected": -305.529541015625, + "loss": 0.4796, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6533963680267334, + "rewards/margins": 0.8964015245437622, + "rewards/rejected": -2.549797773361206, + "step": 14980 + }, + { + "epoch": 2.582701585113715, + "grad_norm": 36.77607345581055, + "learning_rate": 1.155925865736055e-08, + "logits/chosen": -2.141984224319458, + "logits/rejected": -2.114633798599243, + "logps/chosen": -212.90451049804688, + "logps/rejected": -295.39056396484375, + "loss": 0.5129, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.60614812374115, + "rewards/margins": 0.8274170756340027, + "rewards/rejected": -2.4335649013519287, + "step": 14990 + }, + { + "epoch": 2.584424534803584, + "grad_norm": 40.01217269897461, + "learning_rate": 1.146586718352699e-08, + "logits/chosen": -2.1471335887908936, + "logits/rejected": -2.1165921688079834, + "logps/chosen": -217.9677734375, + "logps/rejected": -282.8397521972656, + "loss": 0.5354, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6128219366073608, + "rewards/margins": 0.6820386648178101, + "rewards/rejected": -2.294860601425171, + "step": 15000 + }, + { + "epoch": 2.586147484493453, + "grad_norm": 43.856868743896484, + "learning_rate": 1.1372831564097286e-08, + "logits/chosen": -2.146790027618408, + "logits/rejected": -2.109755754470825, + "logps/chosen": -217.1037139892578, + "logps/rejected": -278.0974426269531, + "loss": 0.5797, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6128923892974854, + "rewards/margins": 0.6384140253067017, + "rewards/rejected": -2.2513060569763184, + "step": 15010 + }, + { + "epoch": 2.587870434183322, + "grad_norm": 60.50875473022461, + "learning_rate": 1.1280152173019075e-08, + "logits/chosen": -2.067186117172241, + "logits/rejected": -2.046182155609131, + "logps/chosen": -217.5106201171875, + "logps/rejected": -281.8955993652344, + "loss": 0.5814, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.663536787033081, + "rewards/margins": 0.6339327096939087, + "rewards/rejected": -2.2974696159362793, + "step": 15020 + }, + { + "epoch": 2.589593383873191, + "grad_norm": 43.150291442871094, + "learning_rate": 1.118782938280829e-08, + "logits/chosen": -2.0811879634857178, + "logits/rejected": -2.0418200492858887, + "logps/chosen": -218.2391357421875, + "logps/rejected": -283.5480651855469, + "loss": 0.5568, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6285641193389893, + "rewards/margins": 0.6660295724868774, + "rewards/rejected": -2.2945938110351562, + "step": 15030 + }, + { + "epoch": 2.59131633356306, + "grad_norm": 61.47371292114258, + "learning_rate": 1.1095863564547436e-08, + "logits/chosen": -2.1220862865448, + "logits/rejected": -2.0855085849761963, + "logps/chosen": -216.48977661132812, + "logps/rejected": -276.1427307128906, + "loss": 0.5495, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6123539209365845, + "rewards/margins": 0.6356294751167297, + "rewards/rejected": -2.247983455657959, + "step": 15040 + }, + { + "epoch": 2.5930392832529288, + "grad_norm": 32.4649772644043, + "learning_rate": 1.1004255087884273e-08, + "logits/chosen": -2.1372647285461426, + "logits/rejected": -2.0886662006378174, + "logps/chosen": -213.27688598632812, + "logps/rejected": -276.65118408203125, + "loss": 0.5256, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5744832754135132, + "rewards/margins": 0.7090197801589966, + "rewards/rejected": -2.2835030555725098, + "step": 15050 + }, + { + "epoch": 2.594762232942798, + "grad_norm": 49.57844161987305, + "learning_rate": 1.0913004321030195e-08, + "logits/chosen": -2.0831291675567627, + "logits/rejected": -2.048346996307373, + "logps/chosen": -208.37832641601562, + "logps/rejected": -278.1231994628906, + "loss": 0.5475, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5380138158798218, + "rewards/margins": 0.7039236426353455, + "rewards/rejected": -2.2419376373291016, + "step": 15060 + }, + { + "epoch": 2.596485182632667, + "grad_norm": 40.79827117919922, + "learning_rate": 1.0822111630758901e-08, + "logits/chosen": -2.1562612056732178, + "logits/rejected": -2.1056265830993652, + "logps/chosen": -210.0137176513672, + "logps/rejected": -265.4194030761719, + "loss": 0.5531, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5162451267242432, + "rewards/margins": 0.6312421560287476, + "rewards/rejected": -2.147487163543701, + "step": 15070 + }, + { + "epoch": 2.598208132322536, + "grad_norm": 57.13422775268555, + "learning_rate": 1.0731577382404744e-08, + "logits/chosen": -2.1240782737731934, + "logits/rejected": -2.078268527984619, + "logps/chosen": -200.83511352539062, + "logps/rejected": -282.7082824707031, + "loss": 0.4758, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4630084037780762, + "rewards/margins": 0.8306447863578796, + "rewards/rejected": -2.2936532497406006, + "step": 15080 + }, + { + "epoch": 2.599931082012405, + "grad_norm": 37.599464416503906, + "learning_rate": 1.0641401939861417e-08, + "logits/chosen": -2.13403058052063, + "logits/rejected": -2.0908923149108887, + "logps/chosen": -209.10098266601562, + "logps/rejected": -272.2961730957031, + "loss": 0.5502, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5389658212661743, + "rewards/margins": 0.6597687602043152, + "rewards/rejected": -2.1987345218658447, + "step": 15090 + }, + { + "epoch": 2.6016540317022745, + "grad_norm": 39.21672439575195, + "learning_rate": 1.0551585665580465e-08, + "logits/chosen": -2.0761544704437256, + "logits/rejected": -2.0411019325256348, + "logps/chosen": -205.38424682617188, + "logps/rejected": -269.87884521484375, + "loss": 0.549, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.536199927330017, + "rewards/margins": 0.6629042029380798, + "rewards/rejected": -2.1991045475006104, + "step": 15100 + }, + { + "epoch": 2.6033769813921435, + "grad_norm": 38.53199768066406, + "learning_rate": 1.0462128920569635e-08, + "logits/chosen": -2.109217643737793, + "logits/rejected": -2.07594633102417, + "logps/chosen": -216.8025360107422, + "logps/rejected": -277.08612060546875, + "loss": 0.5523, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6238981485366821, + "rewards/margins": 0.6392677426338196, + "rewards/rejected": -2.2631657123565674, + "step": 15110 + }, + { + "epoch": 2.6050999310820124, + "grad_norm": 44.340145111083984, + "learning_rate": 1.0373032064391729e-08, + "logits/chosen": -2.0932981967926025, + "logits/rejected": -2.0627665519714355, + "logps/chosen": -231.3092498779297, + "logps/rejected": -290.2330017089844, + "loss": 0.5546, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7282911539077759, + "rewards/margins": 0.6158181428909302, + "rewards/rejected": -2.344109296798706, + "step": 15120 + }, + { + "epoch": 2.6068228807718814, + "grad_norm": 41.10717010498047, + "learning_rate": 1.0284295455162995e-08, + "logits/chosen": -2.077404260635376, + "logits/rejected": -2.0305936336517334, + "logps/chosen": -207.697265625, + "logps/rejected": -279.94598388671875, + "loss": 0.5005, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5323796272277832, + "rewards/margins": 0.7784202694892883, + "rewards/rejected": -2.310800075531006, + "step": 15130 + }, + { + "epoch": 2.6085458304617504, + "grad_norm": 32.70948028564453, + "learning_rate": 1.0195919449551637e-08, + "logits/chosen": -2.106325149536133, + "logits/rejected": -2.063744068145752, + "logps/chosen": -221.25039672851562, + "logps/rejected": -297.72052001953125, + "loss": 0.489, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6539459228515625, + "rewards/margins": 0.7634718418121338, + "rewards/rejected": -2.4174180030822754, + "step": 15140 + }, + { + "epoch": 2.6102687801516193, + "grad_norm": 43.813720703125, + "learning_rate": 1.0107904402776468e-08, + "logits/chosen": -2.2083098888397217, + "logits/rejected": -2.1639564037323, + "logps/chosen": -216.8330841064453, + "logps/rejected": -282.2804260253906, + "loss": 0.5509, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6166563034057617, + "rewards/margins": 0.6878889203071594, + "rewards/rejected": -2.3045449256896973, + "step": 15150 + }, + { + "epoch": 2.6119917298414888, + "grad_norm": 42.778221130371094, + "learning_rate": 1.002025066860549e-08, + "logits/chosen": -2.06974196434021, + "logits/rejected": -2.034329414367676, + "logps/chosen": -215.4232940673828, + "logps/rejected": -290.89227294921875, + "loss": 0.5088, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5965074300765991, + "rewards/margins": 0.7750741243362427, + "rewards/rejected": -2.371581792831421, + "step": 15160 + }, + { + "epoch": 2.6137146795313577, + "grad_norm": 59.19876480102539, + "learning_rate": 9.932958599354457e-09, + "logits/chosen": -2.065034866333008, + "logits/rejected": -2.02778959274292, + "logps/chosen": -206.2749786376953, + "logps/rejected": -277.1195068359375, + "loss": 0.5321, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5284637212753296, + "rewards/margins": 0.7130771279335022, + "rewards/rejected": -2.2415406703948975, + "step": 15170 + }, + { + "epoch": 2.6154376292212267, + "grad_norm": 34.03282928466797, + "learning_rate": 9.846028545885376e-09, + "logits/chosen": -2.122490644454956, + "logits/rejected": -2.095641613006592, + "logps/chosen": -231.06039428710938, + "logps/rejected": -298.1627197265625, + "loss": 0.5581, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7421585321426392, + "rewards/margins": 0.6905821561813354, + "rewards/rejected": -2.4327406883239746, + "step": 15180 + }, + { + "epoch": 2.6171605789110957, + "grad_norm": 34.73793029785156, + "learning_rate": 9.75946085760524e-09, + "logits/chosen": -2.071645736694336, + "logits/rejected": -2.0422959327697754, + "logps/chosen": -218.0347442626953, + "logps/rejected": -283.3576965332031, + "loss": 0.528, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.642773985862732, + "rewards/margins": 0.6757218241691589, + "rewards/rejected": -2.318495988845825, + "step": 15190 + }, + { + "epoch": 2.618883528600965, + "grad_norm": 31.4705867767334, + "learning_rate": 9.673255882464504e-09, + "logits/chosen": -2.120964765548706, + "logits/rejected": -2.073119878768921, + "logps/chosen": -226.7511444091797, + "logps/rejected": -296.59942626953125, + "loss": 0.5114, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7025848627090454, + "rewards/margins": 0.7379928231239319, + "rewards/rejected": -2.440577507019043, + "step": 15200 + }, + { + "epoch": 2.618883528600965, + "eval_logits/chosen": -2.1731746196746826, + "eval_logits/rejected": -2.153785228729248, + "eval_logps/chosen": -214.88101196289062, + "eval_logps/rejected": -249.07342529296875, + "eval_loss": 0.6408128142356873, + "eval_rewards/accuracies": 0.6310408711433411, + "eval_rewards/chosen": -1.5586552619934082, + "eval_rewards/margins": 0.3045828640460968, + "eval_rewards/rejected": -1.8632382154464722, + "eval_runtime": 382.9379, + "eval_samples_per_second": 11.239, + "eval_steps_per_second": 1.405, + "step": 15200 + }, + { + "epoch": 2.620606478290834, + "grad_norm": 51.31040954589844, + "learning_rate": 9.587413966955737e-09, + "logits/chosen": -2.033310651779175, + "logits/rejected": -1.9806716442108154, + "logps/chosen": -232.90646362304688, + "logps/rejected": -294.5772399902344, + "loss": 0.5784, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7613096237182617, + "rewards/margins": 0.6706287860870361, + "rewards/rejected": -2.431938648223877, + "step": 15210 + }, + { + "epoch": 2.622329427980703, + "grad_norm": 41.79688262939453, + "learning_rate": 9.501935456112254e-09, + "logits/chosen": -2.0468971729278564, + "logits/rejected": -1.9976829290390015, + "logps/chosen": -209.7871856689453, + "logps/rejected": -284.422607421875, + "loss": 0.476, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5661617517471313, + "rewards/margins": 0.7923303842544556, + "rewards/rejected": -2.358492136001587, + "step": 15220 + }, + { + "epoch": 2.624052377670572, + "grad_norm": 40.47127914428711, + "learning_rate": 9.416820693506677e-09, + "logits/chosen": -2.074531316757202, + "logits/rejected": -2.032819986343384, + "logps/chosen": -220.6643829345703, + "logps/rejected": -293.26055908203125, + "loss": 0.5271, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6469504833221436, + "rewards/margins": 0.7576435804367065, + "rewards/rejected": -2.4045939445495605, + "step": 15230 + }, + { + "epoch": 2.625775327360441, + "grad_norm": 42.0208740234375, + "learning_rate": 9.332070021249595e-09, + "logits/chosen": -2.068018913269043, + "logits/rejected": -2.01499605178833, + "logps/chosen": -222.3048095703125, + "logps/rejected": -291.1240234375, + "loss": 0.5095, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.622515082359314, + "rewards/margins": 0.7594237923622131, + "rewards/rejected": -2.3819386959075928, + "step": 15240 + }, + { + "epoch": 2.62749827705031, + "grad_norm": 45.18162536621094, + "learning_rate": 9.247683779988113e-09, + "logits/chosen": -2.1060614585876465, + "logits/rejected": -2.070847272872925, + "logps/chosen": -216.2559356689453, + "logps/rejected": -289.8507385253906, + "loss": 0.5203, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6200711727142334, + "rewards/margins": 0.742321789264679, + "rewards/rejected": -2.3623929023742676, + "step": 15250 + }, + { + "epoch": 2.6292212267401793, + "grad_norm": 42.50625228881836, + "learning_rate": 9.163662308904608e-09, + "logits/chosen": -2.0640547275543213, + "logits/rejected": -2.033785820007324, + "logps/chosen": -234.31454467773438, + "logps/rejected": -286.94061279296875, + "loss": 0.6048, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7885749340057373, + "rewards/margins": 0.5866588354110718, + "rewards/rejected": -2.3752341270446777, + "step": 15260 + }, + { + "epoch": 2.6309441764300483, + "grad_norm": 43.62517166137695, + "learning_rate": 9.080005945715307e-09, + "logits/chosen": -2.1171374320983887, + "logits/rejected": -2.050475597381592, + "logps/chosen": -238.31082153320312, + "logps/rejected": -316.4501037597656, + "loss": 0.5307, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.825531244277954, + "rewards/margins": 0.8491969108581543, + "rewards/rejected": -2.6747279167175293, + "step": 15270 + }, + { + "epoch": 2.6326671261199173, + "grad_norm": 38.742164611816406, + "learning_rate": 8.996715026668867e-09, + "logits/chosen": -2.1770644187927246, + "logits/rejected": -2.140118360519409, + "logps/chosen": -222.07058715820312, + "logps/rejected": -297.3765869140625, + "loss": 0.4889, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.683081865310669, + "rewards/margins": 0.7598232626914978, + "rewards/rejected": -2.4429049491882324, + "step": 15280 + }, + { + "epoch": 2.6343900758097862, + "grad_norm": 29.044748306274414, + "learning_rate": 8.913789886545064e-09, + "logits/chosen": -2.0895426273345947, + "logits/rejected": -2.0382912158966064, + "logps/chosen": -234.68301391601562, + "logps/rejected": -315.882080078125, + "loss": 0.5222, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7909761667251587, + "rewards/margins": 0.8641980886459351, + "rewards/rejected": -2.6551737785339355, + "step": 15290 + }, + { + "epoch": 2.6361130254996556, + "grad_norm": 48.1595344543457, + "learning_rate": 8.831230858653538e-09, + "logits/chosen": -2.0058329105377197, + "logits/rejected": -1.9574228525161743, + "logps/chosen": -230.89028930664062, + "logps/rejected": -308.9717712402344, + "loss": 0.5445, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7803550958633423, + "rewards/margins": 0.8227502107620239, + "rewards/rejected": -2.6031055450439453, + "step": 15300 + }, + { + "epoch": 2.6378359751895246, + "grad_norm": 37.70459747314453, + "learning_rate": 8.749038274832343e-09, + "logits/chosen": -2.135277271270752, + "logits/rejected": -2.0876753330230713, + "logps/chosen": -224.98989868164062, + "logps/rejected": -303.2028503417969, + "loss": 0.4922, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6934198141098022, + "rewards/margins": 0.8129827380180359, + "rewards/rejected": -2.5064024925231934, + "step": 15310 + }, + { + "epoch": 2.6395589248793936, + "grad_norm": 53.39246368408203, + "learning_rate": 8.667212465446617e-09, + "logits/chosen": -2.063955307006836, + "logits/rejected": -2.0377583503723145, + "logps/chosen": -230.70285034179688, + "logps/rejected": -303.0435485839844, + "loss": 0.5394, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7863218784332275, + "rewards/margins": 0.7197625637054443, + "rewards/rejected": -2.506084680557251, + "step": 15320 + }, + { + "epoch": 2.6412818745692626, + "grad_norm": 40.4294319152832, + "learning_rate": 8.585753759387292e-09, + "logits/chosen": -2.0826597213745117, + "logits/rejected": -2.036132335662842, + "logps/chosen": -235.56198120117188, + "logps/rejected": -313.53350830078125, + "loss": 0.4952, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7901794910430908, + "rewards/margins": 0.8129813075065613, + "rewards/rejected": -2.6031606197357178, + "step": 15330 + }, + { + "epoch": 2.6430048242591315, + "grad_norm": 37.36764144897461, + "learning_rate": 8.504662484069824e-09, + "logits/chosen": -2.0842349529266357, + "logits/rejected": -2.0482351779937744, + "logps/chosen": -238.4183349609375, + "logps/rejected": -314.86151123046875, + "loss": 0.5193, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.83328115940094, + "rewards/margins": 0.7736489772796631, + "rewards/rejected": -2.6069302558898926, + "step": 15340 + }, + { + "epoch": 2.6447277739490005, + "grad_norm": 53.557281494140625, + "learning_rate": 8.423938965432708e-09, + "logits/chosen": -2.002223491668701, + "logits/rejected": -1.9665918350219727, + "logps/chosen": -234.70602416992188, + "logps/rejected": -310.17669677734375, + "loss": 0.525, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7966200113296509, + "rewards/margins": 0.7793971300125122, + "rewards/rejected": -2.576017379760742, + "step": 15350 + }, + { + "epoch": 2.64645072363887, + "grad_norm": 33.29266357421875, + "learning_rate": 8.343583527936382e-09, + "logits/chosen": -2.0914101600646973, + "logits/rejected": -2.0616629123687744, + "logps/chosen": -233.9851837158203, + "logps/rejected": -300.9095764160156, + "loss": 0.5747, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.8075244426727295, + "rewards/margins": 0.6563832759857178, + "rewards/rejected": -2.4639077186584473, + "step": 15360 + }, + { + "epoch": 2.648173673328739, + "grad_norm": 37.487945556640625, + "learning_rate": 8.263596494561765e-09, + "logits/chosen": -2.104043483734131, + "logits/rejected": -2.0622448921203613, + "logps/chosen": -238.8284454345703, + "logps/rejected": -303.08538818359375, + "loss": 0.5598, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8186184167861938, + "rewards/margins": 0.7071665525436401, + "rewards/rejected": -2.525784969329834, + "step": 15370 + }, + { + "epoch": 2.649896623018608, + "grad_norm": 37.66122055053711, + "learning_rate": 8.183978186809026e-09, + "logits/chosen": -2.0884664058685303, + "logits/rejected": -2.0510292053222656, + "logps/chosen": -233.5215301513672, + "logps/rejected": -307.7198181152344, + "loss": 0.5061, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7857778072357178, + "rewards/margins": 0.778823971748352, + "rewards/rejected": -2.5646018981933594, + "step": 15380 + }, + { + "epoch": 2.651619572708477, + "grad_norm": 36.3622932434082, + "learning_rate": 8.104728924696237e-09, + "logits/chosen": -2.157322645187378, + "logits/rejected": -2.1230530738830566, + "logps/chosen": -227.74887084960938, + "logps/rejected": -309.10894775390625, + "loss": 0.5155, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7101014852523804, + "rewards/margins": 0.8325842022895813, + "rewards/rejected": -2.5426855087280273, + "step": 15390 + }, + { + "epoch": 2.6533425223983462, + "grad_norm": 29.958967208862305, + "learning_rate": 8.02584902675818e-09, + "logits/chosen": -2.1099212169647217, + "logits/rejected": -2.0682926177978516, + "logps/chosen": -246.22549438476562, + "logps/rejected": -295.62115478515625, + "loss": 0.6045, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.8742916584014893, + "rewards/margins": 0.5758206844329834, + "rewards/rejected": -2.4501121044158936, + "step": 15400 + }, + { + "epoch": 2.655065472088215, + "grad_norm": 34.939613342285156, + "learning_rate": 7.947338810045035e-09, + "logits/chosen": -2.099294900894165, + "logits/rejected": -2.0514397621154785, + "logps/chosen": -238.8804931640625, + "logps/rejected": -285.91949462890625, + "loss": 0.591, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.7824329137802124, + "rewards/margins": 0.5694050192832947, + "rewards/rejected": -2.351837635040283, + "step": 15410 + }, + { + "epoch": 2.656788421778084, + "grad_norm": 54.7791748046875, + "learning_rate": 7.869198590120962e-09, + "logits/chosen": -2.0966858863830566, + "logits/rejected": -2.054964542388916, + "logps/chosen": -214.153076171875, + "logps/rejected": -305.690673828125, + "loss": 0.4777, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6057186126708984, + "rewards/margins": 0.926251232624054, + "rewards/rejected": -2.5319697856903076, + "step": 15420 + }, + { + "epoch": 2.658511371467953, + "grad_norm": 29.094106674194336, + "learning_rate": 7.791428681063084e-09, + "logits/chosen": -2.1707558631896973, + "logits/rejected": -2.1257083415985107, + "logps/chosen": -227.0717315673828, + "logps/rejected": -300.9056396484375, + "loss": 0.4866, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6737381219863892, + "rewards/margins": 0.8080810308456421, + "rewards/rejected": -2.4818191528320312, + "step": 15430 + }, + { + "epoch": 2.660234321157822, + "grad_norm": 26.97381019592285, + "learning_rate": 7.714029395460054e-09, + "logits/chosen": -2.1956639289855957, + "logits/rejected": -2.1574459075927734, + "logps/chosen": -215.98043823242188, + "logps/rejected": -278.1003112792969, + "loss": 0.5109, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6043450832366943, + "rewards/margins": 0.6456824541091919, + "rewards/rejected": -2.2500274181365967, + "step": 15440 + }, + { + "epoch": 2.661957270847691, + "grad_norm": 41.384124755859375, + "learning_rate": 7.637001044410784e-09, + "logits/chosen": -1.992272138595581, + "logits/rejected": -1.9533510208129883, + "logps/chosen": -219.4346923828125, + "logps/rejected": -281.28619384765625, + "loss": 0.5541, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6634387969970703, + "rewards/margins": 0.6239207983016968, + "rewards/rejected": -2.2873597145080566, + "step": 15450 + }, + { + "epoch": 2.66368022053756, + "grad_norm": 44.49235534667969, + "learning_rate": 7.560343937523361e-09, + "logits/chosen": -2.1659507751464844, + "logits/rejected": -2.131117582321167, + "logps/chosen": -215.7431182861328, + "logps/rejected": -285.8184509277344, + "loss": 0.5275, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5413073301315308, + "rewards/margins": 0.7475422024726868, + "rewards/rejected": -2.288849353790283, + "step": 15460 + }, + { + "epoch": 2.6654031702274295, + "grad_norm": 48.974029541015625, + "learning_rate": 7.484058382913583e-09, + "logits/chosen": -2.153371572494507, + "logits/rejected": -2.1122617721557617, + "logps/chosen": -233.70059204101562, + "logps/rejected": -304.9097900390625, + "loss": 0.5193, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7240705490112305, + "rewards/margins": 0.7797734141349792, + "rewards/rejected": -2.5038444995880127, + "step": 15470 + }, + { + "epoch": 2.6671261199172984, + "grad_norm": 39.822750091552734, + "learning_rate": 7.40814468720391e-09, + "logits/chosen": -2.125124454498291, + "logits/rejected": -2.0769131183624268, + "logps/chosen": -212.4561309814453, + "logps/rejected": -281.7795715332031, + "loss": 0.5407, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5578019618988037, + "rewards/margins": 0.7482360601425171, + "rewards/rejected": -2.3060381412506104, + "step": 15480 + }, + { + "epoch": 2.6688490696071674, + "grad_norm": 49.14643859863281, + "learning_rate": 7.332603155522066e-09, + "logits/chosen": -2.102980136871338, + "logits/rejected": -2.0779075622558594, + "logps/chosen": -230.55142211914062, + "logps/rejected": -287.64202880859375, + "loss": 0.5676, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7493207454681396, + "rewards/margins": 0.612503170967102, + "rewards/rejected": -2.361823558807373, + "step": 15490 + }, + { + "epoch": 2.670572019297037, + "grad_norm": 58.49702835083008, + "learning_rate": 7.257434091500014e-09, + "logits/chosen": -2.0715060234069824, + "logits/rejected": -2.048126697540283, + "logps/chosen": -241.28201293945312, + "logps/rejected": -292.69305419921875, + "loss": 0.6058, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.863378882408142, + "rewards/margins": 0.5241729021072388, + "rewards/rejected": -2.387551784515381, + "step": 15500 + }, + { + "epoch": 2.6722949689869058, + "grad_norm": 42.66554260253906, + "learning_rate": 7.182637797272506e-09, + "logits/chosen": -2.052743434906006, + "logits/rejected": -2.0031511783599854, + "logps/chosen": -219.3328857421875, + "logps/rejected": -296.0882568359375, + "loss": 0.5025, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6018941402435303, + "rewards/margins": 0.827610969543457, + "rewards/rejected": -2.4295051097869873, + "step": 15510 + }, + { + "epoch": 2.6740179186767747, + "grad_norm": 40.335811614990234, + "learning_rate": 7.108214573476035e-09, + "logits/chosen": -2.0336544513702393, + "logits/rejected": -2.001194715499878, + "logps/chosen": -218.55990600585938, + "logps/rejected": -279.3645935058594, + "loss": 0.5463, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6355009078979492, + "rewards/margins": 0.6691679358482361, + "rewards/rejected": -2.30466890335083, + "step": 15520 + }, + { + "epoch": 2.6757408683666437, + "grad_norm": 37.94197463989258, + "learning_rate": 7.0341647192475704e-09, + "logits/chosen": -2.0494165420532227, + "logits/rejected": -2.0060877799987793, + "logps/chosen": -205.8288116455078, + "logps/rejected": -279.2801513671875, + "loss": 0.5031, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5091311931610107, + "rewards/margins": 0.7558835744857788, + "rewards/rejected": -2.265014886856079, + "step": 15530 + }, + { + "epoch": 2.6774638180565127, + "grad_norm": 39.58177947998047, + "learning_rate": 6.960488532223374e-09, + "logits/chosen": -2.0769643783569336, + "logits/rejected": -2.033782482147217, + "logps/chosen": -227.2656707763672, + "logps/rejected": -295.30426025390625, + "loss": 0.5561, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.739140510559082, + "rewards/margins": 0.7042891383171082, + "rewards/rejected": -2.443429708480835, + "step": 15540 + }, + { + "epoch": 2.6791867677463816, + "grad_norm": 55.12656021118164, + "learning_rate": 6.887186308537763e-09, + "logits/chosen": -2.1615262031555176, + "logits/rejected": -2.121224880218506, + "logps/chosen": -227.6425018310547, + "logps/rejected": -294.74786376953125, + "loss": 0.5313, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7184242010116577, + "rewards/margins": 0.7132667899131775, + "rewards/rejected": -2.4316909313201904, + "step": 15550 + }, + { + "epoch": 2.6809097174362506, + "grad_norm": 46.9069938659668, + "learning_rate": 6.814258342821932e-09, + "logits/chosen": -2.100739002227783, + "logits/rejected": -2.078354597091675, + "logps/chosen": -223.0258026123047, + "logps/rejected": -281.591064453125, + "loss": 0.5781, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7119252681732178, + "rewards/margins": 0.5784019231796265, + "rewards/rejected": -2.2903270721435547, + "step": 15560 + }, + { + "epoch": 2.68263266712612, + "grad_norm": 37.50832748413086, + "learning_rate": 6.741704928202807e-09, + "logits/chosen": -2.142457962036133, + "logits/rejected": -2.100450038909912, + "logps/chosen": -228.854248046875, + "logps/rejected": -301.9892883300781, + "loss": 0.5141, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.751019835472107, + "rewards/margins": 0.7723337411880493, + "rewards/rejected": -2.5233535766601562, + "step": 15570 + }, + { + "epoch": 2.684355616815989, + "grad_norm": 57.68934631347656, + "learning_rate": 6.669526356301869e-09, + "logits/chosen": -2.1601061820983887, + "logits/rejected": -2.1275477409362793, + "logps/chosen": -223.7733917236328, + "logps/rejected": -289.5685729980469, + "loss": 0.5404, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6700990200042725, + "rewards/margins": 0.6639571785926819, + "rewards/rejected": -2.3340563774108887, + "step": 15580 + }, + { + "epoch": 2.686078566505858, + "grad_norm": 45.8087272644043, + "learning_rate": 6.597722917233894e-09, + "logits/chosen": -2.0941264629364014, + "logits/rejected": -2.056966781616211, + "logps/chosen": -208.35531616210938, + "logps/rejected": -270.4266662597656, + "loss": 0.5518, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5251692533493042, + "rewards/margins": 0.6424340605735779, + "rewards/rejected": -2.1676034927368164, + "step": 15590 + }, + { + "epoch": 2.687801516195727, + "grad_norm": 41.391231536865234, + "learning_rate": 6.526294899605878e-09, + "logits/chosen": -2.1049602031707764, + "logits/rejected": -2.0679843425750732, + "logps/chosen": -226.60043334960938, + "logps/rejected": -299.9786682128906, + "loss": 0.5356, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7385717630386353, + "rewards/margins": 0.7576156854629517, + "rewards/rejected": -2.496187448501587, + "step": 15600 + }, + { + "epoch": 2.687801516195727, + "eval_logits/chosen": -2.174306631088257, + "eval_logits/rejected": -2.1549601554870605, + "eval_logps/chosen": -213.9473419189453, + "eval_logps/rejected": -248.0917510986328, + "eval_loss": 0.6404752731323242, + "eval_rewards/accuracies": 0.6266263723373413, + "eval_rewards/chosen": -1.549318552017212, + "eval_rewards/margins": 0.3041025400161743, + "eval_rewards/rejected": -1.8534212112426758, + "eval_runtime": 383.0657, + "eval_samples_per_second": 11.236, + "eval_steps_per_second": 1.404, + "step": 15600 + }, + { + "epoch": 2.6895244658855963, + "grad_norm": 41.80103302001953, + "learning_rate": 6.455242590515842e-09, + "logits/chosen": -2.163078784942627, + "logits/rejected": -2.1271393299102783, + "logps/chosen": -224.98046875, + "logps/rejected": -290.39678955078125, + "loss": 0.547, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.679848074913025, + "rewards/margins": 0.6862354278564453, + "rewards/rejected": -2.3660836219787598, + "step": 15610 + }, + { + "epoch": 2.6912474155754653, + "grad_norm": 39.715118408203125, + "learning_rate": 6.384566275551717e-09, + "logits/chosen": -2.0747292041778564, + "logits/rejected": -2.0420749187469482, + "logps/chosen": -199.78384399414062, + "logps/rejected": -281.0851745605469, + "loss": 0.4806, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.448569893836975, + "rewards/margins": 0.8093862533569336, + "rewards/rejected": -2.257956027984619, + "step": 15620 + }, + { + "epoch": 2.6929703652653343, + "grad_norm": 37.04435348510742, + "learning_rate": 6.314266238790089e-09, + "logits/chosen": -2.0782577991485596, + "logits/rejected": -2.0221097469329834, + "logps/chosen": -231.4824676513672, + "logps/rejected": -305.20196533203125, + "loss": 0.5169, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.756324052810669, + "rewards/margins": 0.8029062151908875, + "rewards/rejected": -2.559230089187622, + "step": 15630 + }, + { + "epoch": 2.6946933149552033, + "grad_norm": 32.966758728027344, + "learning_rate": 6.244342762795207e-09, + "logits/chosen": -2.085794687271118, + "logits/rejected": -2.042360544204712, + "logps/chosen": -221.9350128173828, + "logps/rejected": -310.47705078125, + "loss": 0.4609, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.68125319480896, + "rewards/margins": 0.8988713026046753, + "rewards/rejected": -2.5801243782043457, + "step": 15640 + }, + { + "epoch": 2.6964162646450722, + "grad_norm": 55.33198928833008, + "learning_rate": 6.1747961286177205e-09, + "logits/chosen": -2.070162296295166, + "logits/rejected": -2.031337022781372, + "logps/chosen": -219.86752319335938, + "logps/rejected": -287.378662109375, + "loss": 0.5562, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6482536792755127, + "rewards/margins": 0.6931672096252441, + "rewards/rejected": -2.341421127319336, + "step": 15650 + }, + { + "epoch": 2.698139214334941, + "grad_norm": 30.554706573486328, + "learning_rate": 6.105626615793602e-09, + "logits/chosen": -2.145211696624756, + "logits/rejected": -2.114421844482422, + "logps/chosen": -226.2273406982422, + "logps/rejected": -297.3497314453125, + "loss": 0.5082, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.723120093345642, + "rewards/margins": 0.7179322242736816, + "rewards/rejected": -2.4410524368286133, + "step": 15660 + }, + { + "epoch": 2.6998621640248106, + "grad_norm": 28.581279754638672, + "learning_rate": 6.036834502343058e-09, + "logits/chosen": -2.0493581295013428, + "logits/rejected": -1.996788740158081, + "logps/chosen": -217.5170440673828, + "logps/rejected": -293.3517761230469, + "loss": 0.476, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.623051643371582, + "rewards/margins": 0.8018845319747925, + "rewards/rejected": -2.424935817718506, + "step": 15670 + }, + { + "epoch": 2.7015851137146796, + "grad_norm": 31.51927947998047, + "learning_rate": 5.968420064769342e-09, + "logits/chosen": -2.0571978092193604, + "logits/rejected": -2.023253917694092, + "logps/chosen": -238.87527465820312, + "logps/rejected": -311.1076965332031, + "loss": 0.5185, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.8466196060180664, + "rewards/margins": 0.7387115359306335, + "rewards/rejected": -2.5853309631347656, + "step": 15680 + }, + { + "epoch": 2.7033080634045485, + "grad_norm": 27.066898345947266, + "learning_rate": 5.9003835780576774e-09, + "logits/chosen": -2.0971992015838623, + "logits/rejected": -2.064260482788086, + "logps/chosen": -218.11105346679688, + "logps/rejected": -283.69268798828125, + "loss": 0.538, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6466944217681885, + "rewards/margins": 0.6604923605918884, + "rewards/rejected": -2.3071868419647217, + "step": 15690 + }, + { + "epoch": 2.7050310130944175, + "grad_norm": 34.388179779052734, + "learning_rate": 5.832725315674147e-09, + "logits/chosen": -2.102985382080078, + "logits/rejected": -2.0637826919555664, + "logps/chosen": -230.23574829101562, + "logps/rejected": -303.936767578125, + "loss": 0.5343, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7688966989517212, + "rewards/margins": 0.7512655854225159, + "rewards/rejected": -2.520162582397461, + "step": 15700 + }, + { + "epoch": 2.706753962784287, + "grad_norm": 39.30438995361328, + "learning_rate": 5.76544554956463e-09, + "logits/chosen": -2.105860710144043, + "logits/rejected": -2.063655138015747, + "logps/chosen": -236.8208770751953, + "logps/rejected": -317.7994079589844, + "loss": 0.5073, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7796757221221924, + "rewards/margins": 0.8442095518112183, + "rewards/rejected": -2.6238853931427, + "step": 15710 + }, + { + "epoch": 2.708476912474156, + "grad_norm": 58.20681381225586, + "learning_rate": 5.698544550153661e-09, + "logits/chosen": -2.0977749824523926, + "logits/rejected": -2.0756781101226807, + "logps/chosen": -229.4589080810547, + "logps/rejected": -288.4719543457031, + "loss": 0.5559, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.753771424293518, + "rewards/margins": 0.6107560396194458, + "rewards/rejected": -2.364527463912964, + "step": 15720 + }, + { + "epoch": 2.710199862164025, + "grad_norm": 40.557525634765625, + "learning_rate": 5.632022586343333e-09, + "logits/chosen": -2.1652374267578125, + "logits/rejected": -2.1265182495117188, + "logps/chosen": -218.47988891601562, + "logps/rejected": -295.4986877441406, + "loss": 0.5085, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6443450450897217, + "rewards/margins": 0.7873646020889282, + "rewards/rejected": -2.4317097663879395, + "step": 15730 + }, + { + "epoch": 2.711922811853894, + "grad_norm": 36.61780548095703, + "learning_rate": 5.565879925512252e-09, + "logits/chosen": -2.1096248626708984, + "logits/rejected": -2.0603795051574707, + "logps/chosen": -225.2184600830078, + "logps/rejected": -292.0648498535156, + "loss": 0.5589, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7009508609771729, + "rewards/margins": 0.7264993190765381, + "rewards/rejected": -2.427450180053711, + "step": 15740 + }, + { + "epoch": 2.713645761543763, + "grad_norm": 39.6802864074707, + "learning_rate": 5.50011683351449e-09, + "logits/chosen": -2.1140971183776855, + "logits/rejected": -2.071514368057251, + "logps/chosen": -237.9486846923828, + "logps/rejected": -314.0099792480469, + "loss": 0.5053, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8496208190917969, + "rewards/margins": 0.7466347217559814, + "rewards/rejected": -2.5962555408477783, + "step": 15750 + }, + { + "epoch": 2.7153687112336318, + "grad_norm": 38.177574157714844, + "learning_rate": 5.434733574678418e-09, + "logits/chosen": -2.029038429260254, + "logits/rejected": -1.992246389389038, + "logps/chosen": -224.46682739257812, + "logps/rejected": -286.83203125, + "loss": 0.5738, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.7270536422729492, + "rewards/margins": 0.6463114619255066, + "rewards/rejected": -2.3733649253845215, + "step": 15760 + }, + { + "epoch": 2.717091660923501, + "grad_norm": 29.399215698242188, + "learning_rate": 5.369730411805762e-09, + "logits/chosen": -2.0837364196777344, + "logits/rejected": -2.045706272125244, + "logps/chosen": -214.2775115966797, + "logps/rejected": -298.97247314453125, + "loss": 0.4496, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.6066551208496094, + "rewards/margins": 0.8585401773452759, + "rewards/rejected": -2.4651951789855957, + "step": 15770 + }, + { + "epoch": 2.71881461061337, + "grad_norm": 40.45827865600586, + "learning_rate": 5.3051076061704445e-09, + "logits/chosen": -2.1905288696289062, + "logits/rejected": -2.159872531890869, + "logps/chosen": -237.99563598632812, + "logps/rejected": -286.5212707519531, + "loss": 0.6026, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.83493971824646, + "rewards/margins": 0.5101853609085083, + "rewards/rejected": -2.3451249599456787, + "step": 15780 + }, + { + "epoch": 2.720537560303239, + "grad_norm": 37.64627456665039, + "learning_rate": 5.240865417517604e-09, + "logits/chosen": -2.0218024253845215, + "logits/rejected": -1.9925413131713867, + "logps/chosen": -232.785888671875, + "logps/rejected": -302.86700439453125, + "loss": 0.5064, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7826083898544312, + "rewards/margins": 0.7289341688156128, + "rewards/rejected": -2.511542797088623, + "step": 15790 + }, + { + "epoch": 2.722260509993108, + "grad_norm": 26.595001220703125, + "learning_rate": 5.177004104062521e-09, + "logits/chosen": -2.160280704498291, + "logits/rejected": -2.1041154861450195, + "logps/chosen": -219.24887084960938, + "logps/rejected": -291.0208435058594, + "loss": 0.482, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5983530282974243, + "rewards/margins": 0.8090564608573914, + "rewards/rejected": -2.407409429550171, + "step": 15800 + }, + { + "epoch": 2.7239834596829775, + "grad_norm": 57.61701202392578, + "learning_rate": 5.113523922489571e-09, + "logits/chosen": -2.13946270942688, + "logits/rejected": -2.113898992538452, + "logps/chosen": -230.5516357421875, + "logps/rejected": -298.66229248046875, + "loss": 0.5488, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.816119909286499, + "rewards/margins": 0.6525536775588989, + "rewards/rejected": -2.4686739444732666, + "step": 15810 + }, + { + "epoch": 2.7257064093728465, + "grad_norm": 36.64439010620117, + "learning_rate": 5.0504251279512415e-09, + "logits/chosen": -2.0380873680114746, + "logits/rejected": -1.9975385665893555, + "logps/chosen": -224.701904296875, + "logps/rejected": -303.8479919433594, + "loss": 0.5166, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7237030267715454, + "rewards/margins": 0.8035634756088257, + "rewards/rejected": -2.527266025543213, + "step": 15820 + }, + { + "epoch": 2.7274293590627154, + "grad_norm": 51.537010192871094, + "learning_rate": 4.987707974067046e-09, + "logits/chosen": -2.1332333087921143, + "logits/rejected": -2.1054179668426514, + "logps/chosen": -218.86459350585938, + "logps/rejected": -286.1872863769531, + "loss": 0.5528, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.67132568359375, + "rewards/margins": 0.6859435439109802, + "rewards/rejected": -2.357269048690796, + "step": 15830 + }, + { + "epoch": 2.7291523087525844, + "grad_norm": 44.762908935546875, + "learning_rate": 4.9253727129224934e-09, + "logits/chosen": -2.1452794075012207, + "logits/rejected": -2.1109366416931152, + "logps/chosen": -240.15103149414062, + "logps/rejected": -313.2534484863281, + "loss": 0.5432, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.8571033477783203, + "rewards/margins": 0.7278814315795898, + "rewards/rejected": -2.58498477935791, + "step": 15840 + }, + { + "epoch": 2.7308752584424534, + "grad_norm": 67.00555419921875, + "learning_rate": 4.863419595068197e-09, + "logits/chosen": -2.116338014602661, + "logits/rejected": -2.079974889755249, + "logps/chosen": -224.298583984375, + "logps/rejected": -290.0685119628906, + "loss": 0.5594, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6982589960098267, + "rewards/margins": 0.6676750183105469, + "rewards/rejected": -2.365933895111084, + "step": 15850 + }, + { + "epoch": 2.7325982081323223, + "grad_norm": 59.24932861328125, + "learning_rate": 4.801848869518721e-09, + "logits/chosen": -2.1024158000946045, + "logits/rejected": -2.0658373832702637, + "logps/chosen": -227.84585571289062, + "logps/rejected": -278.639892578125, + "loss": 0.5878, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.735727071762085, + "rewards/margins": 0.5497328042984009, + "rewards/rejected": -2.2854597568511963, + "step": 15860 + }, + { + "epoch": 2.7343211578221913, + "grad_norm": 41.53136444091797, + "learning_rate": 4.740660783751638e-09, + "logits/chosen": -2.085195779800415, + "logits/rejected": -2.038940191268921, + "logps/chosen": -231.2070770263672, + "logps/rejected": -308.63604736328125, + "loss": 0.5134, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7543834447860718, + "rewards/margins": 0.8147646188735962, + "rewards/rejected": -2.569148302078247, + "step": 15870 + }, + { + "epoch": 2.7360441075120607, + "grad_norm": 39.869197845458984, + "learning_rate": 4.679855583706571e-09, + "logits/chosen": -2.0924925804138184, + "logits/rejected": -2.057692050933838, + "logps/chosen": -220.9092254638672, + "logps/rejected": -302.0570068359375, + "loss": 0.4935, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6684633493423462, + "rewards/margins": 0.8464488983154297, + "rewards/rejected": -2.5149121284484863, + "step": 15880 + }, + { + "epoch": 2.7377670572019297, + "grad_norm": 39.32370376586914, + "learning_rate": 4.619433513784166e-09, + "logits/chosen": -2.1212639808654785, + "logits/rejected": -2.0752553939819336, + "logps/chosen": -225.4252166748047, + "logps/rejected": -288.95513916015625, + "loss": 0.5521, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7253128290176392, + "rewards/margins": 0.6845265626907349, + "rewards/rejected": -2.409839153289795, + "step": 15890 + }, + { + "epoch": 2.7394900068917987, + "grad_norm": 52.2964973449707, + "learning_rate": 4.559394816845075e-09, + "logits/chosen": -2.1233177185058594, + "logits/rejected": -2.0619730949401855, + "logps/chosen": -233.4136505126953, + "logps/rejected": -299.57403564453125, + "loss": 0.5222, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7194812297821045, + "rewards/margins": 0.7652706503868103, + "rewards/rejected": -2.4847517013549805, + "step": 15900 + }, + { + "epoch": 2.741212956581668, + "grad_norm": 44.44611740112305, + "learning_rate": 4.499739734209074e-09, + "logits/chosen": -2.05899977684021, + "logits/rejected": -2.015986204147339, + "logps/chosen": -208.9771728515625, + "logps/rejected": -275.2792053222656, + "loss": 0.5347, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5446122884750366, + "rewards/margins": 0.7179552316665649, + "rewards/rejected": -2.2625672817230225, + "step": 15910 + }, + { + "epoch": 2.742935906271537, + "grad_norm": 39.67881393432617, + "learning_rate": 4.440468505653982e-09, + "logits/chosen": -2.0713725090026855, + "logits/rejected": -2.0402581691741943, + "logps/chosen": -231.6560516357422, + "logps/rejected": -300.25970458984375, + "loss": 0.5462, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.78274667263031, + "rewards/margins": 0.6845306158065796, + "rewards/rejected": -2.4672775268554688, + "step": 15920 + }, + { + "epoch": 2.744658855961406, + "grad_norm": 35.616493225097656, + "learning_rate": 4.381581369414822e-09, + "logits/chosen": -2.025343418121338, + "logits/rejected": -1.9802414178848267, + "logps/chosen": -212.61770629882812, + "logps/rejected": -276.72784423828125, + "loss": 0.51, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5602275133132935, + "rewards/margins": 0.7136334180831909, + "rewards/rejected": -2.2738606929779053, + "step": 15930 + }, + { + "epoch": 2.746381805651275, + "grad_norm": 29.387271881103516, + "learning_rate": 4.323078562182702e-09, + "logits/chosen": -2.0832173824310303, + "logits/rejected": -2.031151533126831, + "logps/chosen": -224.5006561279297, + "logps/rejected": -316.13031005859375, + "loss": 0.4689, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6402543783187866, + "rewards/margins": 0.9359604120254517, + "rewards/rejected": -2.5762152671813965, + "step": 15940 + }, + { + "epoch": 2.748104755341144, + "grad_norm": 55.440773010253906, + "learning_rate": 4.2649603191040715e-09, + "logits/chosen": -2.155172824859619, + "logits/rejected": -2.1123874187469482, + "logps/chosen": -219.11181640625, + "logps/rejected": -279.84783935546875, + "loss": 0.5402, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5970779657363892, + "rewards/margins": 0.68474942445755, + "rewards/rejected": -2.281827449798584, + "step": 15950 + }, + { + "epoch": 2.749827705031013, + "grad_norm": 27.0786075592041, + "learning_rate": 4.207226873779557e-09, + "logits/chosen": -2.1281867027282715, + "logits/rejected": -2.085352659225464, + "logps/chosen": -228.1505126953125, + "logps/rejected": -296.83099365234375, + "loss": 0.5425, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7110164165496826, + "rewards/margins": 0.7366132736206055, + "rewards/rejected": -2.447629451751709, + "step": 15960 + }, + { + "epoch": 2.751550654720882, + "grad_norm": 47.35169982910156, + "learning_rate": 4.149878458263179e-09, + "logits/chosen": -2.1108779907226562, + "logits/rejected": -2.073906421661377, + "logps/chosen": -223.5197296142578, + "logps/rejected": -297.11297607421875, + "loss": 0.5184, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6669301986694336, + "rewards/margins": 0.7547533512115479, + "rewards/rejected": -2.4216837882995605, + "step": 15970 + }, + { + "epoch": 2.7532736044107513, + "grad_norm": 55.57887268066406, + "learning_rate": 4.092915303061372e-09, + "logits/chosen": -2.08697772026062, + "logits/rejected": -2.0546367168426514, + "logps/chosen": -229.6186981201172, + "logps/rejected": -289.40289306640625, + "loss": 0.5517, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7267612218856812, + "rewards/margins": 0.6117097735404968, + "rewards/rejected": -2.3384711742401123, + "step": 15980 + }, + { + "epoch": 2.7549965541006203, + "grad_norm": 42.30339431762695, + "learning_rate": 4.0363376371320366e-09, + "logits/chosen": -2.1910502910614014, + "logits/rejected": -2.1740078926086426, + "logps/chosen": -221.6997528076172, + "logps/rejected": -283.23016357421875, + "loss": 0.5523, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6591275930404663, + "rewards/margins": 0.6236189603805542, + "rewards/rejected": -2.2827465534210205, + "step": 15990 + }, + { + "epoch": 2.7567195037904892, + "grad_norm": 33.60514450073242, + "learning_rate": 3.98014568788364e-09, + "logits/chosen": -2.065885543823242, + "logits/rejected": -2.0202903747558594, + "logps/chosen": -219.5049285888672, + "logps/rejected": -292.70697021484375, + "loss": 0.4885, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6498653888702393, + "rewards/margins": 0.7682273983955383, + "rewards/rejected": -2.4180924892425537, + "step": 16000 + }, + { + "epoch": 2.7567195037904892, + "eval_logits/chosen": -2.170712947845459, + "eval_logits/rejected": -2.1511991024017334, + "eval_logps/chosen": -217.2328338623047, + "eval_logps/rejected": -251.9056396484375, + "eval_loss": 0.6406311392784119, + "eval_rewards/accuracies": 0.6268587112426758, + "eval_rewards/chosen": -1.5821737051010132, + "eval_rewards/margins": 0.3093867003917694, + "eval_rewards/rejected": -1.891560435295105, + "eval_runtime": 383.3325, + "eval_samples_per_second": 11.228, + "eval_steps_per_second": 1.403, + "step": 16000 + }, + { + "epoch": 2.758442453480358, + "grad_norm": 60.14817428588867, + "learning_rate": 3.924339681174293e-09, + "logits/chosen": -2.1418251991271973, + "logits/rejected": -2.115861177444458, + "logps/chosen": -228.0880584716797, + "logps/rejected": -287.68463134765625, + "loss": 0.5822, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.720970869064331, + "rewards/margins": 0.6308675408363342, + "rewards/rejected": -2.3518383502960205, + "step": 16010 + }, + { + "epoch": 2.7601654031702276, + "grad_norm": 23.835336685180664, + "learning_rate": 3.868919841310858e-09, + "logits/chosen": -2.176820993423462, + "logits/rejected": -2.137303590774536, + "logps/chosen": -223.884521484375, + "logps/rejected": -293.23089599609375, + "loss": 0.5382, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6952816247940063, + "rewards/margins": 0.7307706475257874, + "rewards/rejected": -2.4260523319244385, + "step": 16020 + }, + { + "epoch": 2.7618883528600966, + "grad_norm": 41.4140625, + "learning_rate": 3.81388639104806e-09, + "logits/chosen": -2.159712314605713, + "logits/rejected": -2.121702194213867, + "logps/chosen": -223.4437713623047, + "logps/rejected": -291.7823181152344, + "loss": 0.5451, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6980524063110352, + "rewards/margins": 0.7046465873718262, + "rewards/rejected": -2.4026989936828613, + "step": 16030 + }, + { + "epoch": 2.7636113025499656, + "grad_norm": 48.69667434692383, + "learning_rate": 3.759239551587512e-09, + "logits/chosen": -2.1217103004455566, + "logits/rejected": -2.083436965942383, + "logps/chosen": -231.5419464111328, + "logps/rejected": -299.17242431640625, + "loss": 0.546, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7604268789291382, + "rewards/margins": 0.7160990834236145, + "rewards/rejected": -2.4765260219573975, + "step": 16040 + }, + { + "epoch": 2.7653342522398345, + "grad_norm": 35.25212860107422, + "learning_rate": 3.7049795425769027e-09, + "logits/chosen": -2.1005589962005615, + "logits/rejected": -2.057460308074951, + "logps/chosen": -217.8822021484375, + "logps/rejected": -303.2470397949219, + "loss": 0.4636, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.626956582069397, + "rewards/margins": 0.8781922459602356, + "rewards/rejected": -2.5051486492156982, + "step": 16050 + }, + { + "epoch": 2.7670572019297035, + "grad_norm": 38.03778076171875, + "learning_rate": 3.6511065821091314e-09, + "logits/chosen": -2.1131396293640137, + "logits/rejected": -2.079918384552002, + "logps/chosen": -217.0569305419922, + "logps/rejected": -286.45526123046875, + "loss": 0.5129, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6031198501586914, + "rewards/margins": 0.7212660908699036, + "rewards/rejected": -2.3243861198425293, + "step": 16060 + }, + { + "epoch": 2.7687801516195725, + "grad_norm": 36.92191696166992, + "learning_rate": 3.597620886721342e-09, + "logits/chosen": -2.0656356811523438, + "logits/rejected": -2.0290164947509766, + "logps/chosen": -225.89096069335938, + "logps/rejected": -298.1449279785156, + "loss": 0.487, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6773040294647217, + "rewards/margins": 0.7722429633140564, + "rewards/rejected": -2.449547290802002, + "step": 16070 + }, + { + "epoch": 2.770503101309442, + "grad_norm": 32.893951416015625, + "learning_rate": 3.5445226713941457e-09, + "logits/chosen": -2.1103484630584717, + "logits/rejected": -2.0507454872131348, + "logps/chosen": -230.32687377929688, + "logps/rejected": -302.87579345703125, + "loss": 0.5217, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7214205265045166, + "rewards/margins": 0.7978491187095642, + "rewards/rejected": -2.5192697048187256, + "step": 16080 + }, + { + "epoch": 2.772226050999311, + "grad_norm": 48.20229721069336, + "learning_rate": 3.491812149550688e-09, + "logits/chosen": -2.106853485107422, + "logits/rejected": -2.065788745880127, + "logps/chosen": -221.03012084960938, + "logps/rejected": -291.2431335449219, + "loss": 0.5219, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6745634078979492, + "rewards/margins": 0.7341934442520142, + "rewards/rejected": -2.408756971359253, + "step": 16090 + }, + { + "epoch": 2.77394900068918, + "grad_norm": 40.7789192199707, + "learning_rate": 3.4394895330558284e-09, + "logits/chosen": -2.1259613037109375, + "logits/rejected": -2.0911924839019775, + "logps/chosen": -222.92385864257812, + "logps/rejected": -309.33013916015625, + "loss": 0.4922, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6919066905975342, + "rewards/margins": 0.8644245862960815, + "rewards/rejected": -2.5563313961029053, + "step": 16100 + }, + { + "epoch": 2.775671950379049, + "grad_norm": 42.12590026855469, + "learning_rate": 3.3875550322152503e-09, + "logits/chosen": -2.0365333557128906, + "logits/rejected": -1.9861524105072021, + "logps/chosen": -221.8624725341797, + "logps/rejected": -294.39288330078125, + "loss": 0.5231, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.638580083847046, + "rewards/margins": 0.7712908983230591, + "rewards/rejected": -2.4098711013793945, + "step": 16110 + }, + { + "epoch": 2.777394900068918, + "grad_norm": 52.53358459472656, + "learning_rate": 3.3360088557746856e-09, + "logits/chosen": -2.1154911518096924, + "logits/rejected": -2.0882134437561035, + "logps/chosen": -217.0634765625, + "logps/rejected": -280.804443359375, + "loss": 0.5567, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.660840630531311, + "rewards/margins": 0.6266318559646606, + "rewards/rejected": -2.2874722480773926, + "step": 16120 + }, + { + "epoch": 2.779117849758787, + "grad_norm": 50.5311279296875, + "learning_rate": 3.2848512109190375e-09, + "logits/chosen": -2.087770700454712, + "logits/rejected": -2.048919916152954, + "logps/chosen": -232.2989959716797, + "logps/rejected": -297.85614013671875, + "loss": 0.5416, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.789602518081665, + "rewards/margins": 0.6839150190353394, + "rewards/rejected": -2.473517417907715, + "step": 16130 + }, + { + "epoch": 2.780840799448656, + "grad_norm": 43.92984390258789, + "learning_rate": 3.2340823032715125e-09, + "logits/chosen": -2.1730029582977295, + "logits/rejected": -2.129218578338623, + "logps/chosen": -229.45938110351562, + "logps/rejected": -296.0975036621094, + "loss": 0.5441, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7570350170135498, + "rewards/margins": 0.6896018981933594, + "rewards/rejected": -2.44663667678833, + "step": 16140 + }, + { + "epoch": 2.782563749138525, + "grad_norm": 42.87978744506836, + "learning_rate": 3.1837023368928017e-09, + "logits/chosen": -2.144951343536377, + "logits/rejected": -2.1099157333374023, + "logps/chosen": -230.6343536376953, + "logps/rejected": -295.3407897949219, + "loss": 0.542, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7498550415039062, + "rewards/margins": 0.6905009150505066, + "rewards/rejected": -2.4403560161590576, + "step": 16150 + }, + { + "epoch": 2.784286698828394, + "grad_norm": 44.73198318481445, + "learning_rate": 3.133711514280357e-09, + "logits/chosen": -2.131856679916382, + "logits/rejected": -2.0838818550109863, + "logps/chosen": -218.6786346435547, + "logps/rejected": -304.34088134765625, + "loss": 0.4743, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6389833688735962, + "rewards/margins": 0.8746803402900696, + "rewards/rejected": -2.5136635303497314, + "step": 16160 + }, + { + "epoch": 2.786009648518263, + "grad_norm": 32.14342498779297, + "learning_rate": 3.084110036367449e-09, + "logits/chosen": -2.046508312225342, + "logits/rejected": -2.0131518840789795, + "logps/chosen": -235.529541015625, + "logps/rejected": -296.33636474609375, + "loss": 0.5556, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.8099933862686157, + "rewards/margins": 0.6219733953475952, + "rewards/rejected": -2.43196702003479, + "step": 16170 + }, + { + "epoch": 2.7877325982081325, + "grad_norm": 40.33845520019531, + "learning_rate": 3.034898102522454e-09, + "logits/chosen": -2.088554859161377, + "logits/rejected": -2.026787281036377, + "logps/chosen": -236.4054718017578, + "logps/rejected": -316.6522521972656, + "loss": 0.5058, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7643744945526123, + "rewards/margins": 0.889947235584259, + "rewards/rejected": -2.6543216705322266, + "step": 16180 + }, + { + "epoch": 2.7894555478980014, + "grad_norm": 48.48818588256836, + "learning_rate": 2.9860759105479582e-09, + "logits/chosen": -2.0988450050354004, + "logits/rejected": -2.0645952224731445, + "logps/chosen": -227.4365234375, + "logps/rejected": -295.9173889160156, + "loss": 0.5447, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7268270254135132, + "rewards/margins": 0.6935473680496216, + "rewards/rejected": -2.4203743934631348, + "step": 16190 + }, + { + "epoch": 2.7911784975878704, + "grad_norm": 45.35523223876953, + "learning_rate": 2.9376436566800667e-09, + "logits/chosen": -2.0436596870422363, + "logits/rejected": -1.9899866580963135, + "logps/chosen": -226.82186889648438, + "logps/rejected": -291.56561279296875, + "loss": 0.5485, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7067333459854126, + "rewards/margins": 0.7155084609985352, + "rewards/rejected": -2.422241687774658, + "step": 16200 + }, + { + "epoch": 2.7929014472777394, + "grad_norm": 38.39668655395508, + "learning_rate": 2.8896015355875492e-09, + "logits/chosen": -2.01147198677063, + "logits/rejected": -1.9818751811981201, + "logps/chosen": -226.8378448486328, + "logps/rejected": -289.5968322753906, + "loss": 0.5536, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7334760427474976, + "rewards/margins": 0.6313256025314331, + "rewards/rejected": -2.3648018836975098, + "step": 16210 + }, + { + "epoch": 2.794624396967609, + "grad_norm": 37.38385009765625, + "learning_rate": 2.841949740371086e-09, + "logits/chosen": -2.089500665664673, + "logits/rejected": -2.0475239753723145, + "logps/chosen": -218.9881591796875, + "logps/rejected": -295.2696228027344, + "loss": 0.5192, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6111987829208374, + "rewards/margins": 0.7985876798629761, + "rewards/rejected": -2.4097864627838135, + "step": 16220 + }, + { + "epoch": 2.7963473466574778, + "grad_norm": 32.81590270996094, + "learning_rate": 2.7946884625624556e-09, + "logits/chosen": -2.110447406768799, + "logits/rejected": -2.0691370964050293, + "logps/chosen": -223.9718475341797, + "logps/rejected": -296.3196716308594, + "loss": 0.5205, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7176158428192139, + "rewards/margins": 0.7397913932800293, + "rewards/rejected": -2.4574074745178223, + "step": 16230 + }, + { + "epoch": 2.7980702963473467, + "grad_norm": 37.23353958129883, + "learning_rate": 2.747817892123816e-09, + "logits/chosen": -2.0878543853759766, + "logits/rejected": -2.0457749366760254, + "logps/chosen": -232.74758911132812, + "logps/rejected": -308.37872314453125, + "loss": 0.5228, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7941334247589111, + "rewards/margins": 0.7627833485603333, + "rewards/rejected": -2.5569167137145996, + "step": 16240 + }, + { + "epoch": 2.7997932460372157, + "grad_norm": 40.1833381652832, + "learning_rate": 2.7013382174468914e-09, + "logits/chosen": -2.112239360809326, + "logits/rejected": -2.079160213470459, + "logps/chosen": -226.0060272216797, + "logps/rejected": -285.4903564453125, + "loss": 0.5518, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6794464588165283, + "rewards/margins": 0.6362031102180481, + "rewards/rejected": -2.3156495094299316, + "step": 16250 + }, + { + "epoch": 2.8015161957270847, + "grad_norm": 49.72036361694336, + "learning_rate": 2.6552496253522518e-09, + "logits/chosen": -2.1082348823547363, + "logits/rejected": -2.0651748180389404, + "logps/chosen": -235.5941619873047, + "logps/rejected": -320.23138427734375, + "loss": 0.5462, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.8211790323257446, + "rewards/margins": 0.833513617515564, + "rewards/rejected": -2.654693126678467, + "step": 16260 + }, + { + "epoch": 2.8032391454169536, + "grad_norm": 45.086212158203125, + "learning_rate": 2.609552301088558e-09, + "logits/chosen": -2.1189451217651367, + "logits/rejected": -2.0799121856689453, + "logps/chosen": -230.2737579345703, + "logps/rejected": -297.52001953125, + "loss": 0.5645, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7419335842132568, + "rewards/margins": 0.7099277377128601, + "rewards/rejected": -2.4518613815307617, + "step": 16270 + }, + { + "epoch": 2.804962095106823, + "grad_norm": 43.261688232421875, + "learning_rate": 2.5642464283317733e-09, + "logits/chosen": -2.1716601848602295, + "logits/rejected": -2.1354892253875732, + "logps/chosen": -235.68460083007812, + "logps/rejected": -300.2950744628906, + "loss": 0.5466, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7859855890274048, + "rewards/margins": 0.6980710625648499, + "rewards/rejected": -2.4840569496154785, + "step": 16280 + }, + { + "epoch": 2.806685044796692, + "grad_norm": 29.19462776184082, + "learning_rate": 2.5193321891844866e-09, + "logits/chosen": -2.1258790493011475, + "logits/rejected": -2.0978095531463623, + "logps/chosen": -223.1262664794922, + "logps/rejected": -291.4661560058594, + "loss": 0.5461, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6972472667694092, + "rewards/margins": 0.6799039244651794, + "rewards/rejected": -2.3771510124206543, + "step": 16290 + }, + { + "epoch": 2.808407994486561, + "grad_norm": 45.25839614868164, + "learning_rate": 2.4748097641751787e-09, + "logits/chosen": -2.1373369693756104, + "logits/rejected": -2.0963287353515625, + "logps/chosen": -239.76138305664062, + "logps/rejected": -298.3421936035156, + "loss": 0.5828, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.8480463027954102, + "rewards/margins": 0.6322572231292725, + "rewards/rejected": -2.4803037643432617, + "step": 16300 + }, + { + "epoch": 2.81013094417643, + "grad_norm": 41.874542236328125, + "learning_rate": 2.4306793322574014e-09, + "logits/chosen": -2.1024575233459473, + "logits/rejected": -2.0653140544891357, + "logps/chosen": -226.78759765625, + "logps/rejected": -290.859130859375, + "loss": 0.5652, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7299995422363281, + "rewards/margins": 0.6347140669822693, + "rewards/rejected": -2.364713430404663, + "step": 16310 + }, + { + "epoch": 2.8118538938662994, + "grad_norm": 43.39019012451172, + "learning_rate": 2.3869410708091787e-09, + "logits/chosen": -2.108158588409424, + "logits/rejected": -2.0729689598083496, + "logps/chosen": -235.5603485107422, + "logps/rejected": -313.83453369140625, + "loss": 0.5132, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.79485285282135, + "rewards/margins": 0.8003241419792175, + "rewards/rejected": -2.595176935195923, + "step": 16320 + }, + { + "epoch": 2.8135768435561683, + "grad_norm": 30.487186431884766, + "learning_rate": 2.3435951556322386e-09, + "logits/chosen": -2.1272568702697754, + "logits/rejected": -2.086873769760132, + "logps/chosen": -223.5836944580078, + "logps/rejected": -285.99444580078125, + "loss": 0.5445, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.643500566482544, + "rewards/margins": 0.6845242381095886, + "rewards/rejected": -2.3280248641967773, + "step": 16330 + }, + { + "epoch": 2.8152997932460373, + "grad_norm": 36.673702239990234, + "learning_rate": 2.3006417609513053e-09, + "logits/chosen": -2.0551676750183105, + "logits/rejected": -2.0148515701293945, + "logps/chosen": -215.37350463867188, + "logps/rejected": -295.75323486328125, + "loss": 0.4746, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5917526483535767, + "rewards/margins": 0.8517443537712097, + "rewards/rejected": -2.4434971809387207, + "step": 16340 + }, + { + "epoch": 2.8170227429359063, + "grad_norm": 32.942054748535156, + "learning_rate": 2.258081059413397e-09, + "logits/chosen": -2.192537546157837, + "logits/rejected": -2.144319534301758, + "logps/chosen": -220.494384765625, + "logps/rejected": -292.1282653808594, + "loss": 0.493, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6330026388168335, + "rewards/margins": 0.7516492605209351, + "rewards/rejected": -2.3846521377563477, + "step": 16350 + }, + { + "epoch": 2.8187456926257752, + "grad_norm": 42.28948211669922, + "learning_rate": 2.2159132220871623e-09, + "logits/chosen": -2.1154656410217285, + "logits/rejected": -2.0709640979766846, + "logps/chosen": -223.71414184570312, + "logps/rejected": -295.94488525390625, + "loss": 0.5153, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6985929012298584, + "rewards/margins": 0.7562544345855713, + "rewards/rejected": -2.454847574234009, + "step": 16360 + }, + { + "epoch": 2.820468642315644, + "grad_norm": 44.34054183959961, + "learning_rate": 2.174138418462135e-09, + "logits/chosen": -2.087660789489746, + "logits/rejected": -2.0548341274261475, + "logps/chosen": -232.76956176757812, + "logps/rejected": -287.1643371582031, + "loss": 0.5742, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7786098718643188, + "rewards/margins": 0.5987129807472229, + "rewards/rejected": -2.3773229122161865, + "step": 16370 + }, + { + "epoch": 2.822191592005513, + "grad_norm": 48.648616790771484, + "learning_rate": 2.132756816448111e-09, + "logits/chosen": -2.1305344104766846, + "logits/rejected": -2.101365089416504, + "logps/chosen": -225.6343994140625, + "logps/rejected": -291.4571533203125, + "loss": 0.5371, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.702154517173767, + "rewards/margins": 0.6686417460441589, + "rewards/rejected": -2.3707962036132812, + "step": 16380 + }, + { + "epoch": 2.8239145416953826, + "grad_norm": 39.413551330566406, + "learning_rate": 2.0917685823744426e-09, + "logits/chosen": -2.03899884223938, + "logits/rejected": -2.005887508392334, + "logps/chosen": -212.76266479492188, + "logps/rejected": -286.5272216796875, + "loss": 0.5432, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.587390422821045, + "rewards/margins": 0.7614668011665344, + "rewards/rejected": -2.3488574028015137, + "step": 16390 + }, + { + "epoch": 2.8256374913852516, + "grad_norm": 41.09767532348633, + "learning_rate": 2.0511738809894097e-09, + "logits/chosen": -2.0512211322784424, + "logits/rejected": -2.0137734413146973, + "logps/chosen": -222.48031616210938, + "logps/rejected": -295.8672790527344, + "loss": 0.5057, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6593711376190186, + "rewards/margins": 0.7600074410438538, + "rewards/rejected": -2.4193787574768066, + "step": 16400 + }, + { + "epoch": 2.8256374913852516, + "eval_logits/chosen": -2.1720340251922607, + "eval_logits/rejected": -2.1526761054992676, + "eval_logps/chosen": -217.00506591796875, + "eval_logps/rejected": -251.57510375976562, + "eval_loss": 0.6409673094749451, + "eval_rewards/accuracies": 0.6305761933326721, + "eval_rewards/chosen": -1.5798962116241455, + "eval_rewards/margins": 0.30835920572280884, + "eval_rewards/rejected": -1.8882551193237305, + "eval_runtime": 383.5047, + "eval_samples_per_second": 11.223, + "eval_steps_per_second": 1.403, + "step": 16400 + }, + { + "epoch": 2.8273604410751205, + "grad_norm": 37.83739471435547, + "learning_rate": 2.0109728754594713e-09, + "logits/chosen": -2.1093385219573975, + "logits/rejected": -2.056161403656006, + "logps/chosen": -235.99423217773438, + "logps/rejected": -306.68701171875, + "loss": 0.5049, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7986196279525757, + "rewards/margins": 0.7931631803512573, + "rewards/rejected": -2.591783046722412, + "step": 16410 + }, + { + "epoch": 2.82908339076499, + "grad_norm": 51.42935562133789, + "learning_rate": 1.9711657273686844e-09, + "logits/chosen": -2.0189051628112793, + "logits/rejected": -1.9867775440216064, + "logps/chosen": -228.15390014648438, + "logps/rejected": -289.6495666503906, + "loss": 0.5659, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7289907932281494, + "rewards/margins": 0.6051722764968872, + "rewards/rejected": -2.334162950515747, + "step": 16420 + }, + { + "epoch": 2.830806340454859, + "grad_norm": 60.047271728515625, + "learning_rate": 1.93175259671805e-09, + "logits/chosen": -2.024369716644287, + "logits/rejected": -1.9945716857910156, + "logps/chosen": -232.27267456054688, + "logps/rejected": -299.2837829589844, + "loss": 0.5415, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7775561809539795, + "rewards/margins": 0.6843501329421997, + "rewards/rejected": -2.4619064331054688, + "step": 16430 + }, + { + "epoch": 2.832529290144728, + "grad_norm": 29.9412841796875, + "learning_rate": 1.8927336419248596e-09, + "logits/chosen": -2.0732693672180176, + "logits/rejected": -2.032212734222412, + "logps/chosen": -221.9842987060547, + "logps/rejected": -298.64361572265625, + "loss": 0.532, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.699705719947815, + "rewards/margins": 0.7766598463058472, + "rewards/rejected": -2.476365327835083, + "step": 16440 + }, + { + "epoch": 2.834252239834597, + "grad_norm": 48.490699768066406, + "learning_rate": 1.8541090198220144e-09, + "logits/chosen": -2.151984691619873, + "logits/rejected": -2.109513521194458, + "logps/chosen": -230.19943237304688, + "logps/rejected": -307.9920959472656, + "loss": 0.4905, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7183396816253662, + "rewards/margins": 0.8219423294067383, + "rewards/rejected": -2.5402817726135254, + "step": 16450 + }, + { + "epoch": 2.835975189524466, + "grad_norm": 61.01959228515625, + "learning_rate": 1.8158788856574624e-09, + "logits/chosen": -2.005765438079834, + "logits/rejected": -1.9797627925872803, + "logps/chosen": -213.1536865234375, + "logps/rejected": -289.0273742675781, + "loss": 0.5169, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6353687047958374, + "rewards/margins": 0.7584549188613892, + "rewards/rejected": -2.3938233852386475, + "step": 16460 + }, + { + "epoch": 2.837698139214335, + "grad_norm": 49.8837890625, + "learning_rate": 1.7780433930935312e-09, + "logits/chosen": -2.124091148376465, + "logits/rejected": -2.0797622203826904, + "logps/chosen": -222.4127197265625, + "logps/rejected": -310.6213684082031, + "loss": 0.5079, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7021658420562744, + "rewards/margins": 0.8872678875923157, + "rewards/rejected": -2.5894336700439453, + "step": 16470 + }, + { + "epoch": 2.8394210889042037, + "grad_norm": 52.482948303222656, + "learning_rate": 1.74060269420635e-09, + "logits/chosen": -2.023437738418579, + "logits/rejected": -1.982782006263733, + "logps/chosen": -220.4035186767578, + "logps/rejected": -297.6961669921875, + "loss": 0.502, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.668707251548767, + "rewards/margins": 0.7751259803771973, + "rewards/rejected": -2.443833112716675, + "step": 16480 + }, + { + "epoch": 2.841144038594073, + "grad_norm": 48.38195037841797, + "learning_rate": 1.7035569394851955e-09, + "logits/chosen": -2.1020920276641846, + "logits/rejected": -2.0738072395324707, + "logps/chosen": -222.8717041015625, + "logps/rejected": -270.82470703125, + "loss": 0.5943, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7007564306259155, + "rewards/margins": 0.4976697862148285, + "rewards/rejected": -2.1984260082244873, + "step": 16490 + }, + { + "epoch": 2.842866988283942, + "grad_norm": 29.190269470214844, + "learning_rate": 1.6669062778318698e-09, + "logits/chosen": -2.1296000480651855, + "logits/rejected": -2.0699143409729004, + "logps/chosen": -217.5924530029297, + "logps/rejected": -268.601806640625, + "loss": 0.5326, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5834438800811768, + "rewards/margins": 0.6127041578292847, + "rewards/rejected": -2.196147918701172, + "step": 16500 + }, + { + "epoch": 2.844589937973811, + "grad_norm": 63.935707092285156, + "learning_rate": 1.6306508565602228e-09, + "logits/chosen": -2.133659839630127, + "logits/rejected": -2.0986266136169434, + "logps/chosen": -220.5809326171875, + "logps/rejected": -289.7440490722656, + "loss": 0.5213, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6280450820922852, + "rewards/margins": 0.7229426503181458, + "rewards/rejected": -2.350987672805786, + "step": 16510 + }, + { + "epoch": 2.84631288766368, + "grad_norm": 50.458160400390625, + "learning_rate": 1.5947908213953753e-09, + "logits/chosen": -2.1781907081604004, + "logits/rejected": -2.126542568206787, + "logps/chosen": -232.2783660888672, + "logps/rejected": -316.50927734375, + "loss": 0.4762, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7517235279083252, + "rewards/margins": 0.8916566967964172, + "rewards/rejected": -2.6433801651000977, + "step": 16520 + }, + { + "epoch": 2.8480358373535495, + "grad_norm": 53.46482467651367, + "learning_rate": 1.5593263164732972e-09, + "logits/chosen": -2.0542373657226562, + "logits/rejected": -2.0234646797180176, + "logps/chosen": -227.55838012695312, + "logps/rejected": -275.9589538574219, + "loss": 0.5728, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7072197198867798, + "rewards/margins": 0.5339667201042175, + "rewards/rejected": -2.2411863803863525, + "step": 16530 + }, + { + "epoch": 2.8497587870434185, + "grad_norm": 53.02235794067383, + "learning_rate": 1.5242574843401524e-09, + "logits/chosen": -2.079049587249756, + "logits/rejected": -2.04000186920166, + "logps/chosen": -233.3113555908203, + "logps/rejected": -297.20574951171875, + "loss": 0.5702, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7984466552734375, + "rewards/margins": 0.6766330003738403, + "rewards/rejected": -2.4750795364379883, + "step": 16540 + }, + { + "epoch": 2.8514817367332874, + "grad_norm": 33.67155456542969, + "learning_rate": 1.489584465951721e-09, + "logits/chosen": -2.0668838024139404, + "logits/rejected": -2.0234920978546143, + "logps/chosen": -229.3888702392578, + "logps/rejected": -313.6064758300781, + "loss": 0.4876, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7475191354751587, + "rewards/margins": 0.8554231524467468, + "rewards/rejected": -2.60294246673584, + "step": 16550 + }, + { + "epoch": 2.8532046864231564, + "grad_norm": 34.016475677490234, + "learning_rate": 1.455307400672845e-09, + "logits/chosen": -2.0811784267425537, + "logits/rejected": -2.0434277057647705, + "logps/chosen": -222.74697875976562, + "logps/rejected": -296.45184326171875, + "loss": 0.5085, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.672705054283142, + "rewards/margins": 0.7318105101585388, + "rewards/rejected": -2.404515504837036, + "step": 16560 + }, + { + "epoch": 2.8549276361130254, + "grad_norm": 43.50887680053711, + "learning_rate": 1.421426426276895e-09, + "logits/chosen": -2.14603590965271, + "logits/rejected": -2.096391439437866, + "logps/chosen": -219.7465057373047, + "logps/rejected": -290.72857666015625, + "loss": 0.4984, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6511335372924805, + "rewards/margins": 0.7488675713539124, + "rewards/rejected": -2.400001049041748, + "step": 16570 + }, + { + "epoch": 2.8566505858028943, + "grad_norm": 35.695289611816406, + "learning_rate": 1.3879416789451815e-09, + "logits/chosen": -2.062807321548462, + "logits/rejected": -2.0235047340393066, + "logps/chosen": -222.1370849609375, + "logps/rejected": -294.31915283203125, + "loss": 0.5454, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7023674249649048, + "rewards/margins": 0.7214884757995605, + "rewards/rejected": -2.423856258392334, + "step": 16580 + }, + { + "epoch": 2.8583735354927637, + "grad_norm": 71.88639068603516, + "learning_rate": 1.3548532932663891e-09, + "logits/chosen": -2.0466041564941406, + "logits/rejected": -2.005546808242798, + "logps/chosen": -237.32559204101562, + "logps/rejected": -296.65850830078125, + "loss": 0.5666, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.8258588314056396, + "rewards/margins": 0.6459594368934631, + "rewards/rejected": -2.471818208694458, + "step": 16590 + }, + { + "epoch": 2.8600964851826327, + "grad_norm": 47.83307647705078, + "learning_rate": 1.3221614022361105e-09, + "logits/chosen": -2.078378200531006, + "logits/rejected": -2.0499420166015625, + "logps/chosen": -234.3688507080078, + "logps/rejected": -295.27923583984375, + "loss": 0.569, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.7668821811676025, + "rewards/margins": 0.641403079032898, + "rewards/rejected": -2.408285617828369, + "step": 16600 + }, + { + "epoch": 2.8618194348725017, + "grad_norm": 43.39389419555664, + "learning_rate": 1.289866137256257e-09, + "logits/chosen": -2.072134494781494, + "logits/rejected": -2.011193037033081, + "logps/chosen": -239.80465698242188, + "logps/rejected": -318.4080505371094, + "loss": 0.5168, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8393303155899048, + "rewards/margins": 0.820911705493927, + "rewards/rejected": -2.6602420806884766, + "step": 16610 + }, + { + "epoch": 2.8635423845623706, + "grad_norm": 31.751678466796875, + "learning_rate": 1.2579676281345042e-09, + "logits/chosen": -2.0522119998931885, + "logits/rejected": -2.009146213531494, + "logps/chosen": -218.08444213867188, + "logps/rejected": -292.8926086425781, + "loss": 0.5054, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6057907342910767, + "rewards/margins": 0.7817180752754211, + "rewards/rejected": -2.3875088691711426, + "step": 16620 + }, + { + "epoch": 2.86526533425224, + "grad_norm": 74.23681640625, + "learning_rate": 1.2264660030838592e-09, + "logits/chosen": -2.059389591217041, + "logits/rejected": -2.0104565620422363, + "logps/chosen": -231.3424072265625, + "logps/rejected": -301.61322021484375, + "loss": 0.5198, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.684125304222107, + "rewards/margins": 0.8123534917831421, + "rewards/rejected": -2.496478796005249, + "step": 16630 + }, + { + "epoch": 2.866988283942109, + "grad_norm": 64.68147277832031, + "learning_rate": 1.195361388722038e-09, + "logits/chosen": -2.0717241764068604, + "logits/rejected": -2.021415948867798, + "logps/chosen": -247.53384399414062, + "logps/rejected": -317.575439453125, + "loss": 0.5592, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.9080226421356201, + "rewards/margins": 0.7610594630241394, + "rewards/rejected": -2.6690824031829834, + "step": 16640 + }, + { + "epoch": 2.868711233631978, + "grad_norm": 42.12141418457031, + "learning_rate": 1.1646539100710562e-09, + "logits/chosen": -2.051790952682495, + "logits/rejected": -2.012688636779785, + "logps/chosen": -212.3316192626953, + "logps/rejected": -288.4221496582031, + "loss": 0.4891, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5662543773651123, + "rewards/margins": 0.8092681765556335, + "rewards/rejected": -2.3755226135253906, + "step": 16650 + }, + { + "epoch": 2.870434183321847, + "grad_norm": 47.80633544921875, + "learning_rate": 1.1343436905566495e-09, + "logits/chosen": -2.088360548019409, + "logits/rejected": -2.055044174194336, + "logps/chosen": -228.2832794189453, + "logps/rejected": -304.2309875488281, + "loss": 0.5157, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7355735301971436, + "rewards/margins": 0.7714985013008118, + "rewards/rejected": -2.5070719718933105, + "step": 16660 + }, + { + "epoch": 2.872157133011716, + "grad_norm": 32.09943389892578, + "learning_rate": 1.1044308520078316e-09, + "logits/chosen": -2.1001334190368652, + "logits/rejected": -2.062692642211914, + "logps/chosen": -211.157958984375, + "logps/rejected": -298.08978271484375, + "loss": 0.4738, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5801347494125366, + "rewards/margins": 0.8807607889175415, + "rewards/rejected": -2.460895538330078, + "step": 16670 + }, + { + "epoch": 2.873880082701585, + "grad_norm": 47.860862731933594, + "learning_rate": 1.0749155146563493e-09, + "logits/chosen": -2.022371530532837, + "logits/rejected": -1.9902015924453735, + "logps/chosen": -225.13632202148438, + "logps/rejected": -289.27484130859375, + "loss": 0.5621, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7542200088500977, + "rewards/margins": 0.6523141264915466, + "rewards/rejected": -2.406534194946289, + "step": 16680 + }, + { + "epoch": 2.8756030323914543, + "grad_norm": 37.298927307128906, + "learning_rate": 1.0457977971362831e-09, + "logits/chosen": -2.1136248111724854, + "logits/rejected": -2.0815658569335938, + "logps/chosen": -221.63546752929688, + "logps/rejected": -277.52569580078125, + "loss": 0.5695, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6617939472198486, + "rewards/margins": 0.5853327512741089, + "rewards/rejected": -2.247126579284668, + "step": 16690 + }, + { + "epoch": 2.8773259820813233, + "grad_norm": 48.6547966003418, + "learning_rate": 1.0170778164834581e-09, + "logits/chosen": -2.182882308959961, + "logits/rejected": -2.1483664512634277, + "logps/chosen": -232.5152587890625, + "logps/rejected": -292.2670593261719, + "loss": 0.591, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7677936553955078, + "rewards/margins": 0.6265477538108826, + "rewards/rejected": -2.394341230392456, + "step": 16700 + }, + { + "epoch": 2.8790489317711923, + "grad_norm": 47.388885498046875, + "learning_rate": 9.887556881350901e-10, + "logits/chosen": -2.0984857082366943, + "logits/rejected": -2.055283784866333, + "logps/chosen": -229.77487182617188, + "logps/rejected": -302.63848876953125, + "loss": 0.553, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7760871648788452, + "rewards/margins": 0.75262051820755, + "rewards/rejected": -2.528707504272461, + "step": 16710 + }, + { + "epoch": 2.8807718814610612, + "grad_norm": 30.91242790222168, + "learning_rate": 9.608315259292288e-10, + "logits/chosen": -2.0606322288513184, + "logits/rejected": -2.015148639678955, + "logps/chosen": -229.14425659179688, + "logps/rejected": -295.65289306640625, + "loss": 0.5164, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.736588478088379, + "rewards/margins": 0.7398849129676819, + "rewards/rejected": -2.476473331451416, + "step": 16720 + }, + { + "epoch": 2.8824948311509306, + "grad_norm": 32.950870513916016, + "learning_rate": 9.333054421043484e-10, + "logits/chosen": -2.050093173980713, + "logits/rejected": -2.0072808265686035, + "logps/chosen": -218.2460479736328, + "logps/rejected": -304.90594482421875, + "loss": 0.4845, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6574676036834717, + "rewards/margins": 0.8856703042984009, + "rewards/rejected": -2.543137788772583, + "step": 16730 + }, + { + "epoch": 2.8842177808407996, + "grad_norm": 47.38127899169922, + "learning_rate": 9.06177547298892e-10, + "logits/chosen": -2.040806293487549, + "logits/rejected": -2.002763271331787, + "logps/chosen": -228.78292846679688, + "logps/rejected": -291.5965881347656, + "loss": 0.5487, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7215182781219482, + "rewards/margins": 0.6703734993934631, + "rewards/rejected": -2.3918917179107666, + "step": 16740 + }, + { + "epoch": 2.8859407305306686, + "grad_norm": 37.412601470947266, + "learning_rate": 8.794479505508268e-10, + "logits/chosen": -2.125983476638794, + "logits/rejected": -2.08128023147583, + "logps/chosen": -225.7335968017578, + "logps/rejected": -317.2071838378906, + "loss": 0.4742, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.707411766052246, + "rewards/margins": 0.8916788101196289, + "rewards/rejected": -2.599090337753296, + "step": 16750 + }, + { + "epoch": 2.8876636802205375, + "grad_norm": 39.6590461730957, + "learning_rate": 8.531167592971566e-10, + "logits/chosen": -2.0325124263763428, + "logits/rejected": -1.992448091506958, + "logps/chosen": -233.4219970703125, + "logps/rejected": -304.00323486328125, + "loss": 0.5327, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7760006189346313, + "rewards/margins": 0.7474104166030884, + "rewards/rejected": -2.5234107971191406, + "step": 16760 + }, + { + "epoch": 2.8893866299104065, + "grad_norm": 34.32674789428711, + "learning_rate": 8.271840793735884e-10, + "logits/chosen": -2.150630474090576, + "logits/rejected": -2.098395824432373, + "logps/chosen": -233.03384399414062, + "logps/rejected": -300.7596740722656, + "loss": 0.5388, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7554289102554321, + "rewards/margins": 0.7637473344802856, + "rewards/rejected": -2.5191762447357178, + "step": 16770 + }, + { + "epoch": 2.8911095796002755, + "grad_norm": 35.03683090209961, + "learning_rate": 8.016500150140215e-10, + "logits/chosen": -2.0818848609924316, + "logits/rejected": -2.048121452331543, + "logps/chosen": -225.534423828125, + "logps/rejected": -297.86639404296875, + "loss": 0.5285, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7005207538604736, + "rewards/margins": 0.7076085805892944, + "rewards/rejected": -2.4081294536590576, + "step": 16780 + }, + { + "epoch": 2.892832529290145, + "grad_norm": 35.001564025878906, + "learning_rate": 7.765146688501589e-10, + "logits/chosen": -2.0988194942474365, + "logits/rejected": -2.039665460586548, + "logps/chosen": -226.2569122314453, + "logps/rejected": -283.5757141113281, + "loss": 0.5791, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6833999156951904, + "rewards/margins": 0.6543527841567993, + "rewards/rejected": -2.3377525806427, + "step": 16790 + }, + { + "epoch": 2.894555478980014, + "grad_norm": 76.95634460449219, + "learning_rate": 7.51778141911108e-10, + "logits/chosen": -2.1269469261169434, + "logits/rejected": -2.0994410514831543, + "logps/chosen": -237.2293243408203, + "logps/rejected": -302.3692932128906, + "loss": 0.5731, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.8673900365829468, + "rewards/margins": 0.6507739424705505, + "rewards/rejected": -2.5181639194488525, + "step": 16800 + }, + { + "epoch": 2.894555478980014, + "eval_logits/chosen": -2.170219898223877, + "eval_logits/rejected": -2.150702953338623, + "eval_logps/chosen": -218.18539428710938, + "eval_logps/rejected": -252.9563751220703, + "eval_loss": 0.6412459015846252, + "eval_rewards/accuracies": 0.6270910501480103, + "eval_rewards/chosen": -1.5916991233825684, + "eval_rewards/margins": 0.3103685677051544, + "eval_rewards/rejected": -1.90206778049469, + "eval_runtime": 383.1692, + "eval_samples_per_second": 11.233, + "eval_steps_per_second": 1.404, + "step": 16800 + }, + { + "epoch": 2.896278428669883, + "grad_norm": 39.302852630615234, + "learning_rate": 7.274405336229361e-10, + "logits/chosen": -2.073404550552368, + "logits/rejected": -2.026705503463745, + "logps/chosen": -217.42691040039062, + "logps/rejected": -291.86859130859375, + "loss": 0.5075, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6343815326690674, + "rewards/margins": 0.7991583347320557, + "rewards/rejected": -2.433539867401123, + "step": 16810 + }, + { + "epoch": 2.898001378359752, + "grad_norm": 57.89147186279297, + "learning_rate": 7.035019418083376e-10, + "logits/chosen": -2.1040916442871094, + "logits/rejected": -2.0590415000915527, + "logps/chosen": -234.2236328125, + "logps/rejected": -289.3756103515625, + "loss": 0.5516, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.780346155166626, + "rewards/margins": 0.5990556478500366, + "rewards/rejected": -2.379401683807373, + "step": 16820 + }, + { + "epoch": 2.899724328049621, + "grad_norm": 45.09634780883789, + "learning_rate": 6.799624626861456e-10, + "logits/chosen": -2.165893316268921, + "logits/rejected": -2.1156418323516846, + "logps/chosen": -242.53964233398438, + "logps/rejected": -330.9523010253906, + "loss": 0.4985, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.869193434715271, + "rewards/margins": 0.8991667032241821, + "rewards/rejected": -2.768360137939453, + "step": 16830 + }, + { + "epoch": 2.90144727773949, + "grad_norm": 64.08953857421875, + "learning_rate": 6.568221908710314e-10, + "logits/chosen": -2.063244104385376, + "logits/rejected": -2.0177295207977295, + "logps/chosen": -226.95565795898438, + "logps/rejected": -297.2478332519531, + "loss": 0.518, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.729138731956482, + "rewards/margins": 0.7378978133201599, + "rewards/rejected": -2.467036724090576, + "step": 16840 + }, + { + "epoch": 2.903170227429359, + "grad_norm": 83.15585327148438, + "learning_rate": 6.340812193730949e-10, + "logits/chosen": -2.103969097137451, + "logits/rejected": -2.0735132694244385, + "logps/chosen": -242.5148162841797, + "logps/rejected": -297.60333251953125, + "loss": 0.546, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.8600852489471436, + "rewards/margins": 0.6247068643569946, + "rewards/rejected": -2.4847922325134277, + "step": 16850 + }, + { + "epoch": 2.904893177119228, + "grad_norm": 39.73136520385742, + "learning_rate": 6.117396395974749e-10, + "logits/chosen": -2.1144349575042725, + "logits/rejected": -2.0644149780273438, + "logps/chosen": -232.8963165283203, + "logps/rejected": -293.119873046875, + "loss": 0.5494, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7648794651031494, + "rewards/margins": 0.6657959222793579, + "rewards/rejected": -2.4306752681732178, + "step": 16860 + }, + { + "epoch": 2.906616126809097, + "grad_norm": 60.453369140625, + "learning_rate": 5.897975413439837e-10, + "logits/chosen": -2.123142719268799, + "logits/rejected": -2.086905002593994, + "logps/chosen": -229.3736572265625, + "logps/rejected": -286.8680114746094, + "loss": 0.5826, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7463557720184326, + "rewards/margins": 0.6210822463035583, + "rewards/rejected": -2.3674380779266357, + "step": 16870 + }, + { + "epoch": 2.908339076498966, + "grad_norm": 46.789649963378906, + "learning_rate": 5.682550128067731e-10, + "logits/chosen": -2.1383373737335205, + "logits/rejected": -2.0988118648529053, + "logps/chosen": -225.3925018310547, + "logps/rejected": -306.0677490234375, + "loss": 0.5104, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7493778467178345, + "rewards/margins": 0.7984367609024048, + "rewards/rejected": -2.54781436920166, + "step": 16880 + }, + { + "epoch": 2.910062026188835, + "grad_norm": 48.62042999267578, + "learning_rate": 5.471121405739687e-10, + "logits/chosen": -2.0652871131896973, + "logits/rejected": -2.0350475311279297, + "logps/chosen": -239.32034301757812, + "logps/rejected": -307.5757751464844, + "loss": 0.5242, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8389384746551514, + "rewards/margins": 0.7051053047180176, + "rewards/rejected": -2.544044017791748, + "step": 16890 + }, + { + "epoch": 2.9117849758787044, + "grad_norm": 49.12190246582031, + "learning_rate": 5.263690096273033e-10, + "logits/chosen": -2.131868362426758, + "logits/rejected": -2.0974514484405518, + "logps/chosen": -223.4816436767578, + "logps/rejected": -291.1946716308594, + "loss": 0.5153, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.69611394405365, + "rewards/margins": 0.7257545590400696, + "rewards/rejected": -2.4218688011169434, + "step": 16900 + }, + { + "epoch": 2.9135079255685734, + "grad_norm": 31.582202911376953, + "learning_rate": 5.060257033417725e-10, + "logits/chosen": -2.151571273803711, + "logits/rejected": -2.1119132041931152, + "logps/chosen": -229.43637084960938, + "logps/rejected": -301.1514587402344, + "loss": 0.5362, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.769758939743042, + "rewards/margins": 0.7082198858261108, + "rewards/rejected": -2.4779789447784424, + "step": 16910 + }, + { + "epoch": 2.9152308752584424, + "grad_norm": 43.54109191894531, + "learning_rate": 4.860823034853468e-10, + "logits/chosen": -2.0970876216888428, + "logits/rejected": -2.0635154247283936, + "logps/chosen": -226.046875, + "logps/rejected": -284.4517517089844, + "loss": 0.5692, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.70242178440094, + "rewards/margins": 0.6119588613510132, + "rewards/rejected": -2.314380168914795, + "step": 16920 + }, + { + "epoch": 2.9169538249483113, + "grad_norm": 37.83869171142578, + "learning_rate": 4.66538890218593e-10, + "logits/chosen": -2.1397299766540527, + "logits/rejected": -2.1106526851654053, + "logps/chosen": -214.8348846435547, + "logps/rejected": -270.4018249511719, + "loss": 0.5509, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5738188028335571, + "rewards/margins": 0.6245588660240173, + "rewards/rejected": -2.1983776092529297, + "step": 16930 + }, + { + "epoch": 2.9186767746381808, + "grad_norm": 38.34006881713867, + "learning_rate": 4.4739554209437536e-10, + "logits/chosen": -2.111955165863037, + "logits/rejected": -2.0768473148345947, + "logps/chosen": -220.9971923828125, + "logps/rejected": -288.90179443359375, + "loss": 0.521, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6153472661972046, + "rewards/margins": 0.7338367104530334, + "rewards/rejected": -2.349184036254883, + "step": 16940 + }, + { + "epoch": 2.9203997243280497, + "grad_norm": 41.916229248046875, + "learning_rate": 4.286523360575334e-10, + "logits/chosen": -2.0766398906707764, + "logits/rejected": -2.0473122596740723, + "logps/chosen": -223.3889617919922, + "logps/rejected": -304.4686584472656, + "loss": 0.5363, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7231884002685547, + "rewards/margins": 0.7785578370094299, + "rewards/rejected": -2.50174617767334, + "step": 16950 + }, + { + "epoch": 2.9221226740179187, + "grad_norm": 41.175865173339844, + "learning_rate": 4.103093474445818e-10, + "logits/chosen": -2.0784764289855957, + "logits/rejected": -2.036428689956665, + "logps/chosen": -229.8789825439453, + "logps/rejected": -313.3496398925781, + "loss": 0.5057, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.738173246383667, + "rewards/margins": 0.8513944745063782, + "rewards/rejected": -2.5895678997039795, + "step": 16960 + }, + { + "epoch": 2.9238456237077877, + "grad_norm": 45.59201431274414, + "learning_rate": 3.9236664998338885e-10, + "logits/chosen": -2.162342071533203, + "logits/rejected": -2.118924140930176, + "logps/chosen": -231.0322265625, + "logps/rejected": -301.0166931152344, + "loss": 0.5236, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7455734014511108, + "rewards/margins": 0.7369973063468933, + "rewards/rejected": -2.4825704097747803, + "step": 16970 + }, + { + "epoch": 2.9255685733976566, + "grad_norm": 42.4276123046875, + "learning_rate": 3.7482431579289873e-10, + "logits/chosen": -2.136894702911377, + "logits/rejected": -2.0962953567504883, + "logps/chosen": -219.96725463867188, + "logps/rejected": -285.20452880859375, + "loss": 0.5119, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.650986671447754, + "rewards/margins": 0.6824150681495667, + "rewards/rejected": -2.3334014415740967, + "step": 16980 + }, + { + "epoch": 2.9272915230875256, + "grad_norm": 28.827068328857422, + "learning_rate": 3.5768241538282064e-10, + "logits/chosen": -2.1777596473693848, + "logits/rejected": -2.1340672969818115, + "logps/chosen": -210.70730590820312, + "logps/rejected": -287.60089111328125, + "loss": 0.5085, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5722601413726807, + "rewards/margins": 0.786490797996521, + "rewards/rejected": -2.358751058578491, + "step": 16990 + }, + { + "epoch": 2.929014472777395, + "grad_norm": 70.47722625732422, + "learning_rate": 3.4094101765338446e-10, + "logits/chosen": -2.1753337383270264, + "logits/rejected": -2.140491485595703, + "logps/chosen": -220.97109985351562, + "logps/rejected": -279.3697509765625, + "loss": 0.5656, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6485884189605713, + "rewards/margins": 0.6162117719650269, + "rewards/rejected": -2.2647998332977295, + "step": 17000 + }, + { + "epoch": 2.930737422467264, + "grad_norm": 33.92271041870117, + "learning_rate": 3.24600189895019e-10, + "logits/chosen": -2.0778861045837402, + "logits/rejected": -2.0436019897460938, + "logps/chosen": -242.9732666015625, + "logps/rejected": -312.52880859375, + "loss": 0.5511, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.89163339138031, + "rewards/margins": 0.6904274225234985, + "rewards/rejected": -2.5820608139038086, + "step": 17010 + }, + { + "epoch": 2.932460372157133, + "grad_norm": 64.56465911865234, + "learning_rate": 3.086599977880855e-10, + "logits/chosen": -2.077587366104126, + "logits/rejected": -2.061856746673584, + "logps/chosen": -231.9762725830078, + "logps/rejected": -284.2293701171875, + "loss": 0.6059, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7796518802642822, + "rewards/margins": 0.5387360453605652, + "rewards/rejected": -2.318387985229492, + "step": 17020 + }, + { + "epoch": 2.934183321847002, + "grad_norm": 44.50904083251953, + "learning_rate": 2.931205054026775e-10, + "logits/chosen": -2.123732566833496, + "logits/rejected": -2.082094669342041, + "logps/chosen": -231.95407104492188, + "logps/rejected": -289.9576110839844, + "loss": 0.5535, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7447305917739868, + "rewards/margins": 0.6295886635780334, + "rewards/rejected": -2.374319076538086, + "step": 17030 + }, + { + "epoch": 2.9359062715368713, + "grad_norm": 35.688663482666016, + "learning_rate": 2.7798177519826605e-10, + "logits/chosen": -2.1549980640411377, + "logits/rejected": -2.1072096824645996, + "logps/chosen": -237.3199462890625, + "logps/rejected": -303.9327392578125, + "loss": 0.5179, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.8088245391845703, + "rewards/margins": 0.7268606424331665, + "rewards/rejected": -2.5356850624084473, + "step": 17040 + }, + { + "epoch": 2.9376292212267403, + "grad_norm": 52.765228271484375, + "learning_rate": 2.632438680235216e-10, + "logits/chosen": -2.083176851272583, + "logits/rejected": -2.050807476043701, + "logps/chosen": -238.5670166015625, + "logps/rejected": -295.58355712890625, + "loss": 0.5802, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.8304694890975952, + "rewards/margins": 0.5952358245849609, + "rewards/rejected": -2.4257054328918457, + "step": 17050 + }, + { + "epoch": 2.9393521709166093, + "grad_norm": 53.14019012451172, + "learning_rate": 2.4890684311603683e-10, + "logits/chosen": -2.1409833431243896, + "logits/rejected": -2.0979952812194824, + "logps/chosen": -231.754638671875, + "logps/rejected": -292.4068298339844, + "loss": 0.5759, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7674375772476196, + "rewards/margins": 0.6122730374336243, + "rewards/rejected": -2.3797104358673096, + "step": 17060 + }, + { + "epoch": 2.9410751206064782, + "grad_norm": 63.452335357666016, + "learning_rate": 2.3497075810210433e-10, + "logits/chosen": -2.1018431186676025, + "logits/rejected": -2.0571179389953613, + "logps/chosen": -235.9232635498047, + "logps/rejected": -294.25274658203125, + "loss": 0.5765, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7991282939910889, + "rewards/margins": 0.6322215795516968, + "rewards/rejected": -2.431349992752075, + "step": 17070 + }, + { + "epoch": 2.942798070296347, + "grad_norm": 39.4548454284668, + "learning_rate": 2.2143566899647248e-10, + "logits/chosen": -2.051210880279541, + "logits/rejected": -2.0057883262634277, + "logps/chosen": -226.5519256591797, + "logps/rejected": -315.3668518066406, + "loss": 0.4735, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.726647138595581, + "rewards/margins": 0.9013614654541016, + "rewards/rejected": -2.6280086040496826, + "step": 17080 + }, + { + "epoch": 2.944521019986216, + "grad_norm": 38.06551742553711, + "learning_rate": 2.0830163020212344e-10, + "logits/chosen": -2.099445104598999, + "logits/rejected": -2.0651302337646484, + "logps/chosen": -226.014404296875, + "logps/rejected": -304.4267272949219, + "loss": 0.5062, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.72359299659729, + "rewards/margins": 0.7966343760490417, + "rewards/rejected": -2.5202274322509766, + "step": 17090 + }, + { + "epoch": 2.9462439696760856, + "grad_norm": 42.14332580566406, + "learning_rate": 1.955686945100621e-10, + "logits/chosen": -2.061148166656494, + "logits/rejected": -2.016936779022217, + "logps/chosen": -233.59298706054688, + "logps/rejected": -298.9555358886719, + "loss": 0.5449, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7688153982162476, + "rewards/margins": 0.700672447681427, + "rewards/rejected": -2.4694876670837402, + "step": 17100 + }, + { + "epoch": 2.9479669193659546, + "grad_norm": 39.180789947509766, + "learning_rate": 1.8323691309909407e-10, + "logits/chosen": -2.0598368644714355, + "logits/rejected": -2.03139066696167, + "logps/chosen": -250.33279418945312, + "logps/rejected": -325.10748291015625, + "loss": 0.5263, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.9446275234222412, + "rewards/margins": 0.7775768041610718, + "rewards/rejected": -2.7222044467926025, + "step": 17110 + }, + { + "epoch": 2.9496898690558235, + "grad_norm": 37.56937026977539, + "learning_rate": 1.7130633553561479e-10, + "logits/chosen": -2.158722400665283, + "logits/rejected": -2.103527545928955, + "logps/chosen": -226.38259887695312, + "logps/rejected": -308.0562438964844, + "loss": 0.4577, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.7288379669189453, + "rewards/margins": 0.8727463483810425, + "rewards/rejected": -2.6015844345092773, + "step": 17120 + }, + { + "epoch": 2.9514128187456925, + "grad_norm": 47.376258850097656, + "learning_rate": 1.597770097734541e-10, + "logits/chosen": -2.040189266204834, + "logits/rejected": -1.9927847385406494, + "logps/chosen": -238.5029754638672, + "logps/rejected": -307.3231201171875, + "loss": 0.5081, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8275566101074219, + "rewards/margins": 0.7151464223861694, + "rewards/rejected": -2.542703151702881, + "step": 17130 + }, + { + "epoch": 2.953135768435562, + "grad_norm": 30.941635131835938, + "learning_rate": 1.4864898215359857e-10, + "logits/chosen": -2.0431325435638428, + "logits/rejected": -2.0120351314544678, + "logps/chosen": -223.69985961914062, + "logps/rejected": -299.73345947265625, + "loss": 0.5041, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6792500019073486, + "rewards/margins": 0.7977226972579956, + "rewards/rejected": -2.476972818374634, + "step": 17140 + }, + { + "epoch": 2.954858718125431, + "grad_norm": 46.703495025634766, + "learning_rate": 1.3792229740409166e-10, + "logits/chosen": -2.152099847793579, + "logits/rejected": -2.1069183349609375, + "logps/chosen": -233.14736938476562, + "logps/rejected": -298.739501953125, + "loss": 0.5626, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7751268148422241, + "rewards/margins": 0.6954813599586487, + "rewards/rejected": -2.4706082344055176, + "step": 17150 + }, + { + "epoch": 2.9565816678153, + "grad_norm": 30.80360221862793, + "learning_rate": 1.2759699863980067e-10, + "logits/chosen": -2.1575663089752197, + "logits/rejected": -2.1177420616149902, + "logps/chosen": -221.9051513671875, + "logps/rejected": -321.9290771484375, + "loss": 0.4635, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6959228515625, + "rewards/margins": 1.0101712942123413, + "rewards/rejected": -2.706094264984131, + "step": 17160 + }, + { + "epoch": 2.958304617505169, + "grad_norm": 36.00048065185547, + "learning_rate": 1.1767312736228329e-10, + "logits/chosen": -2.1762290000915527, + "logits/rejected": -2.138206958770752, + "logps/chosen": -250.8097686767578, + "logps/rejected": -305.70135498046875, + "loss": 0.6168, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.907684087753296, + "rewards/margins": 0.5790875554084778, + "rewards/rejected": -2.486771583557129, + "step": 17170 + }, + { + "epoch": 2.960027567195038, + "grad_norm": 45.304500579833984, + "learning_rate": 1.0815072345957688e-10, + "logits/chosen": -2.125030517578125, + "logits/rejected": -2.088151216506958, + "logps/chosen": -229.660888671875, + "logps/rejected": -296.909912109375, + "loss": 0.5473, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7812343835830688, + "rewards/margins": 0.6809852719306946, + "rewards/rejected": -2.462219476699829, + "step": 17180 + }, + { + "epoch": 2.9617505168849068, + "grad_norm": 43.71578598022461, + "learning_rate": 9.902982520605396e-11, + "logits/chosen": -2.073464870452881, + "logits/rejected": -2.042576313018799, + "logps/chosen": -213.50186157226562, + "logps/rejected": -281.359619140625, + "loss": 0.5209, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6059232950210571, + "rewards/margins": 0.6878377795219421, + "rewards/rejected": -2.2937610149383545, + "step": 17190 + }, + { + "epoch": 2.963473466574776, + "grad_norm": 33.20631790161133, + "learning_rate": 9.031046926230024e-11, + "logits/chosen": -2.134242534637451, + "logits/rejected": -2.099299907684326, + "logps/chosen": -210.3935546875, + "logps/rejected": -288.7630310058594, + "loss": 0.4958, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.551510214805603, + "rewards/margins": 0.8001585006713867, + "rewards/rejected": -2.3516688346862793, + "step": 17200 + }, + { + "epoch": 2.963473466574776, + "eval_logits/chosen": -2.1701745986938477, + "eval_logits/rejected": -2.150585651397705, + "eval_logps/chosen": -218.34732055664062, + "eval_logps/rejected": -253.1478271484375, + "eval_loss": 0.6411592364311218, + "eval_rewards/accuracies": 0.6296468377113342, + "eval_rewards/chosen": -1.5933184623718262, + "eval_rewards/margins": 0.31066349148750305, + "eval_rewards/rejected": -1.903982162475586, + "eval_runtime": 382.9775, + "eval_samples_per_second": 11.238, + "eval_steps_per_second": 1.405, + "step": 17200 + }, + { + "epoch": 2.965196416264645, + "grad_norm": 45.74662780761719, + "learning_rate": 8.199269067491466e-11, + "logits/chosen": -2.063141345977783, + "logits/rejected": -2.02927303314209, + "logps/chosen": -233.63015747070312, + "logps/rejected": -305.9888610839844, + "loss": 0.5514, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.8087259531021118, + "rewards/margins": 0.7234727144241333, + "rewards/rejected": -2.532198905944824, + "step": 17210 + }, + { + "epoch": 2.966919365954514, + "grad_norm": 30.98674201965332, + "learning_rate": 7.407652287640953e-11, + "logits/chosen": -2.1059508323669434, + "logits/rejected": -2.0682930946350098, + "logps/chosen": -230.4197998046875, + "logps/rejected": -323.3200378417969, + "loss": 0.4998, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7401487827301025, + "rewards/margins": 0.9332057237625122, + "rewards/rejected": -2.673354387283325, + "step": 17220 + }, + { + "epoch": 2.968642315644383, + "grad_norm": 32.45381164550781, + "learning_rate": 6.656199768505511e-11, + "logits/chosen": -2.115161895751953, + "logits/rejected": -2.0847389698028564, + "logps/chosen": -229.85830688476562, + "logps/rejected": -305.0896911621094, + "loss": 0.5406, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7608146667480469, + "rewards/margins": 0.7466955780982971, + "rewards/rejected": -2.5075104236602783, + "step": 17230 + }, + { + "epoch": 2.9703652653342525, + "grad_norm": 38.92058563232422, + "learning_rate": 5.944914530475742e-11, + "logits/chosen": -2.1152920722961426, + "logits/rejected": -2.0800163745880127, + "logps/chosen": -213.4129180908203, + "logps/rejected": -281.83587646484375, + "loss": 0.5245, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.586476445198059, + "rewards/margins": 0.6894331574440002, + "rewards/rejected": -2.275909662246704, + "step": 17240 + }, + { + "epoch": 2.9720882150241215, + "grad_norm": 39.71238327026367, + "learning_rate": 5.2737994324958403e-11, + "logits/chosen": -2.0901167392730713, + "logits/rejected": -2.052403688430786, + "logps/chosen": -223.3805389404297, + "logps/rejected": -303.0497741699219, + "loss": 0.4873, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.699758768081665, + "rewards/margins": 0.8128054738044739, + "rewards/rejected": -2.512564182281494, + "step": 17250 + }, + { + "epoch": 2.9738111647139904, + "grad_norm": 41.23759078979492, + "learning_rate": 4.642857172045822e-11, + "logits/chosen": -2.098942279815674, + "logits/rejected": -2.0471463203430176, + "logps/chosen": -218.2930145263672, + "logps/rejected": -304.4262390136719, + "loss": 0.4787, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6233618259429932, + "rewards/margins": 0.9090239405632019, + "rewards/rejected": -2.53238582611084, + "step": 17260 + }, + { + "epoch": 2.9755341144038594, + "grad_norm": 42.076927185058594, + "learning_rate": 4.052090285138199e-11, + "logits/chosen": -2.0997719764709473, + "logits/rejected": -2.057018756866455, + "logps/chosen": -238.2406463623047, + "logps/rejected": -301.4361877441406, + "loss": 0.5706, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8217058181762695, + "rewards/margins": 0.6541695594787598, + "rewards/rejected": -2.4758753776550293, + "step": 17270 + }, + { + "epoch": 2.9772570640937284, + "grad_norm": 38.43862533569336, + "learning_rate": 3.501501146304653e-11, + "logits/chosen": -2.0579917430877686, + "logits/rejected": -2.0086448192596436, + "logps/chosen": -221.3511505126953, + "logps/rejected": -305.3607482910156, + "loss": 0.4742, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6612695455551147, + "rewards/margins": 0.8794782757759094, + "rewards/rejected": -2.540748119354248, + "step": 17280 + }, + { + "epoch": 2.9789800137835973, + "grad_norm": 39.56004333496094, + "learning_rate": 2.991091968582715e-11, + "logits/chosen": -2.1151480674743652, + "logits/rejected": -2.075232982635498, + "logps/chosen": -237.7044219970703, + "logps/rejected": -302.80474853515625, + "loss": 0.5218, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7731300592422485, + "rewards/margins": 0.6949617266654968, + "rewards/rejected": -2.4680919647216797, + "step": 17290 + }, + { + "epoch": 2.9807029634734663, + "grad_norm": 59.17507553100586, + "learning_rate": 2.5208648035146553e-11, + "logits/chosen": -2.1309123039245605, + "logits/rejected": -2.1003451347351074, + "logps/chosen": -228.07443237304688, + "logps/rejected": -289.4764404296875, + "loss": 0.5408, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7138761281967163, + "rewards/margins": 0.6651451587677002, + "rewards/rejected": -2.379021167755127, + "step": 17300 + }, + { + "epoch": 2.9824259131633357, + "grad_norm": 59.69649124145508, + "learning_rate": 2.0908215411330477e-11, + "logits/chosen": -2.1257481575012207, + "logits/rejected": -2.080110549926758, + "logps/chosen": -232.7665557861328, + "logps/rejected": -313.95428466796875, + "loss": 0.5014, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.771287202835083, + "rewards/margins": 0.8487504720687866, + "rewards/rejected": -2.62003755569458, + "step": 17310 + }, + { + "epoch": 2.9841488628532047, + "grad_norm": 39.24268341064453, + "learning_rate": 1.7009639099541118e-11, + "logits/chosen": -2.1227715015411377, + "logits/rejected": -2.0875697135925293, + "logps/chosen": -230.41921997070312, + "logps/rejected": -290.3526916503906, + "loss": 0.5541, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7490037679672241, + "rewards/margins": 0.6406279802322388, + "rewards/rejected": -2.389631986618042, + "step": 17320 + }, + { + "epoch": 2.9858718125430737, + "grad_norm": 56.3477668762207, + "learning_rate": 1.35129347697438e-11, + "logits/chosen": -2.06532621383667, + "logits/rejected": -2.0280654430389404, + "logps/chosen": -231.87942504882812, + "logps/rejected": -289.4935302734375, + "loss": 0.5795, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.7513792514801025, + "rewards/margins": 0.6341745257377625, + "rewards/rejected": -2.3855533599853516, + "step": 17330 + }, + { + "epoch": 2.987594762232943, + "grad_norm": 27.002744674682617, + "learning_rate": 1.0418116476584859e-11, + "logits/chosen": -2.1330957412719727, + "logits/rejected": -2.0874831676483154, + "logps/chosen": -220.322265625, + "logps/rejected": -295.8821105957031, + "loss": 0.5003, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.665138840675354, + "rewards/margins": 0.7804524302482605, + "rewards/rejected": -2.445591449737549, + "step": 17340 + }, + { + "epoch": 2.989317711922812, + "grad_norm": 39.57339096069336, + "learning_rate": 7.725196659413847e-12, + "logits/chosen": -2.1366100311279297, + "logits/rejected": -2.088812828063965, + "logps/chosen": -214.4608612060547, + "logps/rejected": -283.12646484375, + "loss": 0.5176, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6012340784072876, + "rewards/margins": 0.71893709897995, + "rewards/rejected": -2.3201708793640137, + "step": 17350 + }, + { + "epoch": 2.991040661612681, + "grad_norm": 59.013671875, + "learning_rate": 5.4341861421391965e-12, + "logits/chosen": -2.170902729034424, + "logits/rejected": -2.1363930702209473, + "logps/chosen": -226.77188110351562, + "logps/rejected": -299.11785888671875, + "loss": 0.5488, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7157566547393799, + "rewards/margins": 0.7337027192115784, + "rewards/rejected": -2.4494593143463135, + "step": 17360 + }, + { + "epoch": 2.99276361130255, + "grad_norm": 42.70999526977539, + "learning_rate": 3.5450941332726415e-12, + "logits/chosen": -2.0977907180786133, + "logits/rejected": -2.068878650665283, + "logps/chosen": -229.20639038085938, + "logps/rejected": -288.0982666015625, + "loss": 0.569, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.760547399520874, + "rewards/margins": 0.5857253074645996, + "rewards/rejected": -2.3462727069854736, + "step": 17370 + }, + { + "epoch": 2.994486560992419, + "grad_norm": 69.87225341796875, + "learning_rate": 2.0579282258292862e-12, + "logits/chosen": -2.0864222049713135, + "logits/rejected": -2.054324150085449, + "logps/chosen": -231.4883270263672, + "logps/rejected": -297.22149658203125, + "loss": 0.537, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7789758443832397, + "rewards/margins": 0.6550412178039551, + "rewards/rejected": -2.4340169429779053, + "step": 17380 + }, + { + "epoch": 2.996209510682288, + "grad_norm": 42.90975570678711, + "learning_rate": 9.726943973387137e-13, + "logits/chosen": -2.1187965869903564, + "logits/rejected": -2.0839667320251465, + "logps/chosen": -229.0812530517578, + "logps/rejected": -299.0865173339844, + "loss": 0.5169, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7364839315414429, + "rewards/margins": 0.7252603769302368, + "rewards/rejected": -2.4617440700531006, + "step": 17390 + }, + { + "epoch": 2.997932460372157, + "grad_norm": 30.220075607299805, + "learning_rate": 2.8939700977836934e-13, + "logits/chosen": -2.106614589691162, + "logits/rejected": -2.068258285522461, + "logps/chosen": -235.975341796875, + "logps/rejected": -305.28216552734375, + "loss": 0.548, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.796933889389038, + "rewards/margins": 0.7672160267829895, + "rewards/rejected": -2.564150094985962, + "step": 17400 + }, + { + "epoch": 2.9996554100620263, + "grad_norm": 72.88008117675781, + "learning_rate": 8.038809595767305e-15, + "logits/chosen": -2.0713143348693848, + "logits/rejected": -2.0295281410217285, + "logps/chosen": -219.08755493164062, + "logps/rejected": -300.3836364746094, + "loss": 0.4683, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6422719955444336, + "rewards/margins": 0.818253219127655, + "rewards/rejected": -2.4605250358581543, + "step": 17410 + }, + { + "epoch": 3.0, + "step": 17412, + "total_flos": 0.0, + "train_loss": 0.5850592939272546, + "train_runtime": 88547.231, + "train_samples_per_second": 3.146, + "train_steps_per_second": 0.197 + } + ], + "logging_steps": 10, + "max_steps": 17412, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 400, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}