{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998691442030882, "eval_steps": 500, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002093692750588851, "grad_norm": 1.971431833221162, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -0.6187891364097595, "logits/rejected": -0.6613013744354248, "logps/chosen": -294.863037109375, "logps/rejected": -271.0777587890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.004187385501177702, "grad_norm": 2.1186559350686602, "learning_rate": 2.083333333333333e-08, "logits/chosen": -0.7680065631866455, "logits/rejected": -0.8290236592292786, "logps/chosen": -425.2915954589844, "logps/rejected": -331.84942626953125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.006281078251766554, "grad_norm": 2.045591842555687, "learning_rate": 3.125e-08, "logits/chosen": -0.6234244108200073, "logits/rejected": -0.6143776178359985, "logps/chosen": -274.429443359375, "logps/rejected": -218.20416259765625, "loss": 0.6933, "rewards/accuracies": 0.25, "rewards/chosen": -0.0013771996600553393, "rewards/margins": -0.002150287153199315, "rewards/rejected": 0.0007730877259746194, "step": 3 }, { "epoch": 0.008374771002355404, "grad_norm": 1.8966643812038764, "learning_rate": 4.166666666666666e-08, "logits/chosen": -0.8728774785995483, "logits/rejected": -0.9451797008514404, "logps/chosen": -263.6605224609375, "logps/rejected": -190.5877227783203, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": 0.0005489029572345316, "rewards/margins": 0.0011190040968358517, "rewards/rejected": -0.0005701010813936591, "step": 4 }, { "epoch": 0.010468463752944255, "grad_norm": 1.7437744300216154, "learning_rate": 5.208333333333333e-08, "logits/chosen": -0.5824593305587769, "logits/rejected": -0.5350952744483948, "logps/chosen": -203.246826171875, "logps/rejected": -258.6231689453125, "loss": 0.693, "rewards/accuracies": 0.6875, "rewards/chosen": -0.000317087717121467, "rewards/margins": 0.0010086469119414687, "rewards/rejected": -0.001325734774582088, "step": 5 }, { "epoch": 0.012562156503533107, "grad_norm": 2.0552091549547513, "learning_rate": 6.25e-08, "logits/chosen": -0.7059261202812195, "logits/rejected": -0.7078484296798706, "logps/chosen": -275.582275390625, "logps/rejected": -263.0954284667969, "loss": 0.6932, "rewards/accuracies": 0.375, "rewards/chosen": -0.00035199220292270184, "rewards/margins": -0.0005319597548805177, "rewards/rejected": 0.00017996736278291792, "step": 6 }, { "epoch": 0.014655849254121958, "grad_norm": 1.9828322491758705, "learning_rate": 7.291666666666667e-08, "logits/chosen": -0.5636578798294067, "logits/rejected": -0.7353062629699707, "logps/chosen": -390.6772155761719, "logps/rejected": -301.40106201171875, "loss": 0.6925, "rewards/accuracies": 0.53125, "rewards/chosen": 0.000250263576162979, "rewards/margins": 0.0001694792154012248, "rewards/rejected": 8.07843025540933e-05, "step": 7 }, { "epoch": 0.016749542004710807, "grad_norm": 2.024861828934124, "learning_rate": 8.333333333333333e-08, "logits/chosen": -0.8094220757484436, "logits/rejected": -0.7803943157196045, "logps/chosen": -260.3701171875, "logps/rejected": -223.9737091064453, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": 0.000325973320286721, "rewards/margins": 0.0005722225178033113, "rewards/rejected": -0.00024624925572425127, "step": 8 }, { "epoch": 0.01884323475529966, "grad_norm": 2.3661594537898574, "learning_rate": 9.375e-08, "logits/chosen": -0.594490647315979, "logits/rejected": -0.6972835659980774, "logps/chosen": -244.74142456054688, "logps/rejected": -204.0095672607422, "loss": 0.693, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0001926334953168407, "rewards/margins": -0.00010243598080705851, "rewards/rejected": -9.019754361361265e-05, "step": 9 }, { "epoch": 0.02093692750588851, "grad_norm": 1.9611664617984437, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -0.7222006320953369, "logits/rejected": -0.6490722894668579, "logps/chosen": -227.97613525390625, "logps/rejected": -265.9739990234375, "loss": 0.6932, "rewards/accuracies": 0.46875, "rewards/chosen": 0.00022130551224108785, "rewards/margins": -0.00011947308667004108, "rewards/rejected": 0.00034077855525538325, "step": 10 }, { "epoch": 0.023030620256477362, "grad_norm": 1.9570608010326949, "learning_rate": 1.1458333333333332e-07, "logits/chosen": -0.7479411363601685, "logits/rejected": -0.7732167840003967, "logps/chosen": -321.7281799316406, "logps/rejected": -369.2265625, "loss": 0.6932, "rewards/accuracies": 0.34375, "rewards/chosen": -0.0006872442318126559, "rewards/margins": -0.001559309079311788, "rewards/rejected": 0.0008720647892914712, "step": 11 }, { "epoch": 0.025124313007066214, "grad_norm": 2.0419157539534667, "learning_rate": 1.25e-07, "logits/chosen": -0.6329588294029236, "logits/rejected": -0.6775169968605042, "logps/chosen": -280.7995910644531, "logps/rejected": -235.86380004882812, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.0002437659422867, "rewards/margins": 7.095193723216653e-06, "rewards/rejected": 0.00023667074856348336, "step": 12 }, { "epoch": 0.027218005757655064, "grad_norm": 1.8981404490812435, "learning_rate": 1.3541666666666666e-07, "logits/chosen": -0.7640203833580017, "logits/rejected": -0.6846197843551636, "logps/chosen": -270.2711181640625, "logps/rejected": -178.0986328125, "loss": 0.6932, "rewards/accuracies": 0.59375, "rewards/chosen": 0.00126883655320853, "rewards/margins": 0.0007917883340269327, "rewards/rejected": 0.00047704821918159723, "step": 13 }, { "epoch": 0.029311698508243916, "grad_norm": 2.1973317734637234, "learning_rate": 1.4583333333333335e-07, "logits/chosen": -0.6896005868911743, "logits/rejected": -0.6153122186660767, "logps/chosen": -280.47760009765625, "logps/rejected": -271.60186767578125, "loss": 0.6934, "rewards/accuracies": 0.5, "rewards/chosen": -0.0011689214734360576, "rewards/margins": -0.0003053966211155057, "rewards/rejected": -0.0008635248523205519, "step": 14 }, { "epoch": 0.031405391258832765, "grad_norm": 2.0027312385261045, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -0.695854902267456, "logits/rejected": -0.710625410079956, "logps/chosen": -301.9278259277344, "logps/rejected": -299.91497802734375, "loss": 0.6929, "rewards/accuracies": 0.46875, "rewards/chosen": -0.00085478276014328, "rewards/margins": -0.0004967284039594233, "rewards/rejected": -0.0003580544434953481, "step": 15 }, { "epoch": 0.033499084009421615, "grad_norm": 1.9969335830269996, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -0.5501838326454163, "logits/rejected": -0.5407338738441467, "logps/chosen": -256.4944763183594, "logps/rejected": -254.85549926757812, "loss": 0.6929, "rewards/accuracies": 0.34375, "rewards/chosen": -0.0003643968375399709, "rewards/margins": -0.0012601130874827504, "rewards/rejected": 0.0008957161917351186, "step": 16 }, { "epoch": 0.03559277676001047, "grad_norm": 2.0392285843637095, "learning_rate": 1.7708333333333334e-07, "logits/chosen": -0.6044663190841675, "logits/rejected": -0.6867555975914001, "logps/chosen": -313.10906982421875, "logps/rejected": -218.1583251953125, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": 0.0009736107313074172, "rewards/margins": -9.640939242672175e-05, "rewards/rejected": 0.0010700201382860541, "step": 17 }, { "epoch": 0.03768646951059932, "grad_norm": 2.026619110787916, "learning_rate": 1.875e-07, "logits/chosen": -0.724123477935791, "logits/rejected": -0.6556088328361511, "logps/chosen": -244.83108520507812, "logps/rejected": -215.4698028564453, "loss": 0.6935, "rewards/accuracies": 0.375, "rewards/chosen": -0.0009035511175170541, "rewards/margins": -0.001362514216452837, "rewards/rejected": 0.000458963040728122, "step": 18 }, { "epoch": 0.03978016226118817, "grad_norm": 2.1802865194721837, "learning_rate": 1.9791666666666664e-07, "logits/chosen": -0.6750373244285583, "logits/rejected": -0.7176267504692078, "logps/chosen": -344.3795166015625, "logps/rejected": -325.2415466308594, "loss": 0.6928, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0016450565308332443, "rewards/margins": 0.0017648922512307763, "rewards/rejected": -0.00011983569129370153, "step": 19 }, { "epoch": 0.04187385501177702, "grad_norm": 2.0067586376854467, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -0.596210777759552, "logits/rejected": -0.6797239780426025, "logps/chosen": -250.85330200195312, "logps/rejected": -272.17266845703125, "loss": 0.6923, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0009612017311155796, "rewards/margins": 0.0008868351578712463, "rewards/rejected": 7.436659507220611e-05, "step": 20 }, { "epoch": 0.043967547762365874, "grad_norm": 1.6980184484368723, "learning_rate": 2.1875e-07, "logits/chosen": -0.7423078417778015, "logits/rejected": -0.6924868822097778, "logps/chosen": -190.86756896972656, "logps/rejected": -177.81466674804688, "loss": 0.6934, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0005013238987885416, "rewards/margins": 2.132986264768988e-05, "rewards/rejected": 0.00047999396338127553, "step": 21 }, { "epoch": 0.046061240512954724, "grad_norm": 2.1901835811413477, "learning_rate": 2.2916666666666663e-07, "logits/chosen": -0.6114894151687622, "logits/rejected": -0.6214694380760193, "logps/chosen": -231.86090087890625, "logps/rejected": -197.80889892578125, "loss": 0.6928, "rewards/accuracies": 0.625, "rewards/chosen": 0.0002058513491647318, "rewards/margins": 0.00013158630463294685, "rewards/rejected": 7.426508818753064e-05, "step": 22 }, { "epoch": 0.04815493326354357, "grad_norm": 1.9371983551203638, "learning_rate": 2.3958333333333335e-07, "logits/chosen": -0.7673167586326599, "logits/rejected": -0.7702030539512634, "logps/chosen": -246.82473754882812, "logps/rejected": -218.8922882080078, "loss": 0.6925, "rewards/accuracies": 0.5, "rewards/chosen": 0.0007793458644300699, "rewards/margins": 0.0011465366696938872, "rewards/rejected": -0.0003671907470561564, "step": 23 }, { "epoch": 0.05024862601413243, "grad_norm": 1.953116296257954, "learning_rate": 2.5e-07, "logits/chosen": -0.7620508670806885, "logits/rejected": -0.6211656928062439, "logps/chosen": -319.53662109375, "logps/rejected": -231.24728393554688, "loss": 0.6928, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0023938000667840242, "rewards/margins": 0.0013294587843120098, "rewards/rejected": 0.0010643412824720144, "step": 24 }, { "epoch": 0.05234231876472128, "grad_norm": 1.8034876266036606, "learning_rate": 2.604166666666667e-07, "logits/chosen": -0.5935025215148926, "logits/rejected": -0.6785202622413635, "logps/chosen": -195.31817626953125, "logps/rejected": -192.4014892578125, "loss": 0.6931, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0001576008362462744, "rewards/margins": 7.37607479095459e-07, "rewards/rejected": 0.00015686321421526372, "step": 25 }, { "epoch": 0.05443601151531013, "grad_norm": 2.0409030265438175, "learning_rate": 2.708333333333333e-07, "logits/chosen": -0.7819192409515381, "logits/rejected": -0.8378588557243347, "logps/chosen": -201.65638732910156, "logps/rejected": -132.8858642578125, "loss": 0.6933, "rewards/accuracies": 0.40625, "rewards/chosen": -0.000557247141841799, "rewards/margins": -0.001297741662710905, "rewards/rejected": 0.0007404944626614451, "step": 26 }, { "epoch": 0.056529704265898977, "grad_norm": 1.9505622662402318, "learning_rate": 2.8125e-07, "logits/chosen": -0.6634794473648071, "logits/rejected": -0.5730906128883362, "logps/chosen": -277.8077697753906, "logps/rejected": -251.2814178466797, "loss": 0.6924, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0009686153498478234, "rewards/margins": 0.001145586371421814, "rewards/rejected": -0.00017697090515866876, "step": 27 }, { "epoch": 0.05862339701648783, "grad_norm": 1.9837337802777917, "learning_rate": 2.916666666666667e-07, "logits/chosen": -0.5705346465110779, "logits/rejected": -0.6336209177970886, "logps/chosen": -261.667724609375, "logps/rejected": -226.7752227783203, "loss": 0.6928, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0008887735893949866, "rewards/margins": 0.0010174165945500135, "rewards/rejected": -0.00012864297605119646, "step": 28 }, { "epoch": 0.06071708976707668, "grad_norm": 2.08573176741824, "learning_rate": 3.020833333333333e-07, "logits/chosen": -0.6381535530090332, "logits/rejected": -0.789168119430542, "logps/chosen": -349.0322265625, "logps/rejected": -269.6571044921875, "loss": 0.6926, "rewards/accuracies": 0.5, "rewards/chosen": -0.000616913428530097, "rewards/margins": -0.0007017964962869883, "rewards/rejected": 8.488312596455216e-05, "step": 29 }, { "epoch": 0.06281078251766553, "grad_norm": 2.0067289899339227, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -0.601807713508606, "logits/rejected": -0.5770432353019714, "logps/chosen": -285.596435546875, "logps/rejected": -278.32427978515625, "loss": 0.6936, "rewards/accuracies": 0.40625, "rewards/chosen": -0.00028869439847767353, "rewards/margins": -0.002459221053868532, "rewards/rejected": 0.002170526422560215, "step": 30 }, { "epoch": 0.06490447526825438, "grad_norm": 2.0448764227610248, "learning_rate": 3.2291666666666666e-07, "logits/chosen": -0.7501438856124878, "logits/rejected": -0.7152121663093567, "logps/chosen": -195.2887420654297, "logps/rejected": -212.34326171875, "loss": 0.6924, "rewards/accuracies": 0.625, "rewards/chosen": 0.0013356053968891501, "rewards/margins": 0.001906500430777669, "rewards/rejected": -0.0005708950338885188, "step": 31 }, { "epoch": 0.06699816801884323, "grad_norm": 2.2617048466378207, "learning_rate": 3.333333333333333e-07, "logits/chosen": -0.743459165096283, "logits/rejected": -0.7608909010887146, "logps/chosen": -322.5064392089844, "logps/rejected": -267.0641784667969, "loss": 0.6925, "rewards/accuracies": 0.5625, "rewards/chosen": 0.003062322037294507, "rewards/margins": 0.002301964443176985, "rewards/rejected": 0.0007603574777022004, "step": 32 }, { "epoch": 0.06909186076943209, "grad_norm": 2.202241981854634, "learning_rate": 3.4375e-07, "logits/chosen": -0.6312966346740723, "logits/rejected": -0.5758035182952881, "logps/chosen": -315.8310241699219, "logps/rejected": -232.00596618652344, "loss": 0.6921, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0022236956283450127, "rewards/margins": 0.004034089855849743, "rewards/rejected": -0.0018103942275047302, "step": 33 }, { "epoch": 0.07118555352002094, "grad_norm": 1.9894506981248388, "learning_rate": 3.541666666666667e-07, "logits/chosen": -0.6452329754829407, "logits/rejected": -0.5804439783096313, "logps/chosen": -191.28460693359375, "logps/rejected": -246.96041870117188, "loss": 0.6917, "rewards/accuracies": 0.65625, "rewards/chosen": 0.00262818718329072, "rewards/margins": 0.004321876913309097, "rewards/rejected": -0.0016936898464336991, "step": 34 }, { "epoch": 0.07327924627060979, "grad_norm": 1.9481676253095956, "learning_rate": 3.645833333333333e-07, "logits/chosen": -0.7310526371002197, "logits/rejected": -0.5909218788146973, "logps/chosen": -261.432861328125, "logps/rejected": -267.4197692871094, "loss": 0.6916, "rewards/accuracies": 0.71875, "rewards/chosen": 0.002795224543660879, "rewards/margins": 0.0024727522395551205, "rewards/rejected": 0.00032247230410575867, "step": 35 }, { "epoch": 0.07537293902119864, "grad_norm": 2.377463948387139, "learning_rate": 3.75e-07, "logits/chosen": -0.671576201915741, "logits/rejected": -0.739955484867096, "logps/chosen": -290.39739990234375, "logps/rejected": -231.30343627929688, "loss": 0.6913, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0029074212070554495, "rewards/margins": 0.005217359866946936, "rewards/rejected": -0.00230993889272213, "step": 36 }, { "epoch": 0.07746663177178749, "grad_norm": 2.1192515265758294, "learning_rate": 3.8541666666666665e-07, "logits/chosen": -0.4991784691810608, "logits/rejected": -0.5041351914405823, "logps/chosen": -269.4438781738281, "logps/rejected": -250.40292358398438, "loss": 0.6919, "rewards/accuracies": 0.625, "rewards/chosen": 0.0039121531881392, "rewards/margins": 0.0033366698771715164, "rewards/rejected": 0.0005754834273830056, "step": 37 }, { "epoch": 0.07956032452237634, "grad_norm": 2.0427577682259432, "learning_rate": 3.958333333333333e-07, "logits/chosen": -0.7477656602859497, "logits/rejected": -0.6866115927696228, "logps/chosen": -279.0836181640625, "logps/rejected": -271.2953796386719, "loss": 0.6911, "rewards/accuracies": 0.6875, "rewards/chosen": 0.003979154862463474, "rewards/margins": 0.006106662098318338, "rewards/rejected": -0.0021275077015161514, "step": 38 }, { "epoch": 0.08165401727296519, "grad_norm": 2.3209408617260165, "learning_rate": 4.0625e-07, "logits/chosen": -0.733414888381958, "logits/rejected": -0.6295084357261658, "logps/chosen": -272.4120178222656, "logps/rejected": -210.99412536621094, "loss": 0.6892, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0030861867126077414, "rewards/margins": 0.005528694950044155, "rewards/rejected": -0.0024425082374364138, "step": 39 }, { "epoch": 0.08374771002355404, "grad_norm": 1.9598088524138277, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -0.6727321743965149, "logits/rejected": -0.8322780132293701, "logps/chosen": -226.13304138183594, "logps/rejected": -235.11257934570312, "loss": 0.6913, "rewards/accuracies": 0.6875, "rewards/chosen": 0.001920529524795711, "rewards/margins": 0.0045545585453510284, "rewards/rejected": -0.0026340289041399956, "step": 40 }, { "epoch": 0.0858414027741429, "grad_norm": 1.9587828300866148, "learning_rate": 4.270833333333333e-07, "logits/chosen": -0.5664694905281067, "logits/rejected": -0.5545387268066406, "logps/chosen": -244.36734008789062, "logps/rejected": -241.54855346679688, "loss": 0.6913, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0032111844047904015, "rewards/margins": 0.0038819422479718924, "rewards/rejected": -0.00067075778497383, "step": 41 }, { "epoch": 0.08793509552473175, "grad_norm": 1.996767619864239, "learning_rate": 4.375e-07, "logits/chosen": -0.6485401391983032, "logits/rejected": -0.6958127617835999, "logps/chosen": -200.083740234375, "logps/rejected": -192.50198364257812, "loss": 0.6905, "rewards/accuracies": 0.71875, "rewards/chosen": 0.004931747913360596, "rewards/margins": 0.005231370683759451, "rewards/rejected": -0.0002996218972839415, "step": 42 }, { "epoch": 0.0900287882753206, "grad_norm": 2.1459031082226927, "learning_rate": 4.479166666666667e-07, "logits/chosen": -0.6633652448654175, "logits/rejected": -0.7224881649017334, "logps/chosen": -348.4588623046875, "logps/rejected": -325.5594787597656, "loss": 0.6891, "rewards/accuracies": 0.65625, "rewards/chosen": 0.00901946984231472, "rewards/margins": 0.007597425486892462, "rewards/rejected": 0.001422043889760971, "step": 43 }, { "epoch": 0.09212248102590945, "grad_norm": 2.047219946467369, "learning_rate": 4.5833333333333327e-07, "logits/chosen": -0.6751187443733215, "logits/rejected": -0.5717964172363281, "logps/chosen": -345.685791015625, "logps/rejected": -293.78857421875, "loss": 0.6892, "rewards/accuracies": 0.6875, "rewards/chosen": 0.007948920130729675, "rewards/margins": 0.010839272290468216, "rewards/rejected": -0.0028903530910611153, "step": 44 }, { "epoch": 0.0942161737764983, "grad_norm": 1.8126548880707323, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -0.5376222133636475, "logits/rejected": -0.5230984687805176, "logps/chosen": -233.46946716308594, "logps/rejected": -219.74533081054688, "loss": 0.6902, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0034225727431476116, "rewards/margins": 0.005250619724392891, "rewards/rejected": -0.0018280467484146357, "step": 45 }, { "epoch": 0.09630986652708715, "grad_norm": 2.2798902452338443, "learning_rate": 4.791666666666667e-07, "logits/chosen": -0.6814889311790466, "logits/rejected": -0.6853336691856384, "logps/chosen": -346.9630432128906, "logps/rejected": -244.8706512451172, "loss": 0.6889, "rewards/accuracies": 0.65625, "rewards/chosen": 0.005478914827108383, "rewards/margins": 0.004464020021259785, "rewards/rejected": 0.0010148946894332767, "step": 46 }, { "epoch": 0.098403559277676, "grad_norm": 6.17425606210882, "learning_rate": 4.895833333333333e-07, "logits/chosen": -0.7842140793800354, "logits/rejected": -0.7353090643882751, "logps/chosen": -316.4680480957031, "logps/rejected": -291.9408874511719, "loss": 0.6884, "rewards/accuracies": 0.71875, "rewards/chosen": 0.009343018755316734, "rewards/margins": 0.010767575353384018, "rewards/rejected": -0.001424555666744709, "step": 47 }, { "epoch": 0.10049725202826486, "grad_norm": 2.2081223180078506, "learning_rate": 5e-07, "logits/chosen": -0.40676793456077576, "logits/rejected": -0.44390735030174255, "logps/chosen": -270.2117004394531, "logps/rejected": -277.14300537109375, "loss": 0.6899, "rewards/accuracies": 0.5, "rewards/chosen": -0.0008061178377829492, "rewards/margins": 0.0034307213500142097, "rewards/rejected": -0.004236839711666107, "step": 48 }, { "epoch": 0.10259094477885371, "grad_norm": 2.0825721793727, "learning_rate": 4.999932966293553e-07, "logits/chosen": -0.7100363373756409, "logits/rejected": -0.8712139129638672, "logps/chosen": -289.8148498535156, "logps/rejected": -282.93341064453125, "loss": 0.6894, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0048608481884002686, "rewards/margins": 0.009491749107837677, "rewards/rejected": -0.0046309009194374084, "step": 49 }, { "epoch": 0.10468463752944256, "grad_norm": 2.077750188377877, "learning_rate": 4.999731868769026e-07, "logits/chosen": -0.7161614894866943, "logits/rejected": -0.6530757546424866, "logps/chosen": -250.6453094482422, "logps/rejected": -266.81005859375, "loss": 0.6898, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0027735906187444925, "rewards/margins": 0.003304342972114682, "rewards/rejected": -0.0005307523533701897, "step": 50 }, { "epoch": 0.1067783302800314, "grad_norm": 1.9134663806619183, "learning_rate": 4.99939671821067e-07, "logits/chosen": -0.8551144599914551, "logits/rejected": -0.758982241153717, "logps/chosen": -227.65618896484375, "logps/rejected": -159.5537567138672, "loss": 0.6896, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0006121020414866507, "rewards/margins": 0.0013047237880527973, "rewards/rejected": -0.0006926218047738075, "step": 51 }, { "epoch": 0.10887202303062025, "grad_norm": 2.070815947577094, "learning_rate": 4.998927532591591e-07, "logits/chosen": -0.8150084018707275, "logits/rejected": -0.6602609753608704, "logps/chosen": -208.27174377441406, "logps/rejected": -219.9952392578125, "loss": 0.6888, "rewards/accuracies": 0.625, "rewards/chosen": 0.0021868993062525988, "rewards/margins": 0.008370739407837391, "rewards/rejected": -0.0061838398687541485, "step": 52 }, { "epoch": 0.1109657157812091, "grad_norm": 2.0247442623341376, "learning_rate": 4.998324337072792e-07, "logits/chosen": -0.740088939666748, "logits/rejected": -0.6923155784606934, "logps/chosen": -219.006591796875, "logps/rejected": -235.81396484375, "loss": 0.6879, "rewards/accuracies": 0.625, "rewards/chosen": 0.0023992201313376427, "rewards/margins": 0.007879557088017464, "rewards/rejected": -0.005480337422341108, "step": 53 }, { "epoch": 0.11305940853179795, "grad_norm": 2.1054725746070573, "learning_rate": 4.997587164001815e-07, "logits/chosen": -0.8336099982261658, "logits/rejected": -0.6551526188850403, "logps/chosen": -393.4664611816406, "logps/rejected": -233.5998992919922, "loss": 0.6879, "rewards/accuracies": 0.75, "rewards/chosen": 0.003564465092495084, "rewards/margins": 0.018503893166780472, "rewards/rejected": -0.014939428307116032, "step": 54 }, { "epoch": 0.11515310128238682, "grad_norm": 1.941385865397532, "learning_rate": 4.996716052911017e-07, "logits/chosen": -0.802671492099762, "logits/rejected": -0.6909992098808289, "logps/chosen": -225.2550048828125, "logps/rejected": -196.96612548828125, "loss": 0.6869, "rewards/accuracies": 0.71875, "rewards/chosen": 0.006678167264908552, "rewards/margins": 0.015130014158785343, "rewards/rejected": -0.008451846428215504, "step": 55 }, { "epoch": 0.11724679403297567, "grad_norm": 1.964335236633298, "learning_rate": 4.99571105051544e-07, "logits/chosen": -0.7504221200942993, "logits/rejected": -0.6486672759056091, "logps/chosen": -265.9771423339844, "logps/rejected": -252.11813354492188, "loss": 0.6856, "rewards/accuracies": 0.75, "rewards/chosen": 0.0025671645998954773, "rewards/margins": 0.01655544340610504, "rewards/rejected": -0.013988279737532139, "step": 56 }, { "epoch": 0.11934048678356451, "grad_norm": 2.1408594973954425, "learning_rate": 4.994572210710314e-07, "logits/chosen": -0.6982129812240601, "logits/rejected": -0.7081270813941956, "logps/chosen": -250.50503540039062, "logps/rejected": -260.558349609375, "loss": 0.6847, "rewards/accuracies": 0.71875, "rewards/chosen": 0.005940047092735767, "rewards/margins": 0.013509290292859077, "rewards/rejected": -0.00756924320012331, "step": 57 }, { "epoch": 0.12143417953415336, "grad_norm": 1.9900647186589018, "learning_rate": 4.993299594568162e-07, "logits/chosen": -0.5130844116210938, "logits/rejected": -0.6340294480323792, "logps/chosen": -222.11996459960938, "logps/rejected": -242.883056640625, "loss": 0.6847, "rewards/accuracies": 0.65625, "rewards/chosen": 0.007525183260440826, "rewards/margins": 0.02239028736948967, "rewards/rejected": -0.014865105971693993, "step": 58 }, { "epoch": 0.12352787228474221, "grad_norm": 1.9376996416941938, "learning_rate": 4.991893270335525e-07, "logits/chosen": -0.871770977973938, "logits/rejected": -0.7174229025840759, "logps/chosen": -323.6181640625, "logps/rejected": -286.60528564453125, "loss": 0.6877, "rewards/accuracies": 0.8125, "rewards/chosen": 0.002035085577517748, "rewards/margins": 0.01675288937985897, "rewards/rejected": -0.014717803336679935, "step": 59 }, { "epoch": 0.12562156503533106, "grad_norm": 2.573932160903455, "learning_rate": 4.990353313429303e-07, "logits/chosen": -0.5868269801139832, "logits/rejected": -0.6016286611557007, "logps/chosen": -219.28656005859375, "logps/rejected": -258.56231689453125, "loss": 0.6838, "rewards/accuracies": 0.6875, "rewards/chosen": 0.005587503779679537, "rewards/margins": 0.016260625794529915, "rewards/rejected": -0.010673120617866516, "step": 60 }, { "epoch": 0.1277152577859199, "grad_norm": 2.3770677094812664, "learning_rate": 4.988679806432711e-07, "logits/chosen": -0.6782679557800293, "logits/rejected": -0.5954487323760986, "logps/chosen": -368.7725830078125, "logps/rejected": -197.63526916503906, "loss": 0.6853, "rewards/accuracies": 0.65625, "rewards/chosen": 0.003363984636962414, "rewards/margins": 0.01563914678990841, "rewards/rejected": -0.01227516122162342, "step": 61 }, { "epoch": 0.12980895053650876, "grad_norm": 2.274534803471502, "learning_rate": 4.986872839090852e-07, "logits/chosen": -0.6938801407814026, "logits/rejected": -0.5450634360313416, "logps/chosen": -271.8917236328125, "logps/rejected": -265.5125732421875, "loss": 0.6837, "rewards/accuracies": 0.625, "rewards/chosen": 0.00043565803207457066, "rewards/margins": 0.00416939053684473, "rewards/rejected": -0.0037337325047701597, "step": 62 }, { "epoch": 0.1319026432870976, "grad_norm": 2.3111528490135345, "learning_rate": 4.9849325083059e-07, "logits/chosen": -0.5361589789390564, "logits/rejected": -0.4999515414237976, "logps/chosen": -251.64248657226562, "logps/rejected": -183.90286254882812, "loss": 0.6837, "rewards/accuracies": 0.8125, "rewards/chosen": 0.007958602160215378, "rewards/margins": 0.03242652490735054, "rewards/rejected": -0.024467922747135162, "step": 63 }, { "epoch": 0.13399633603768646, "grad_norm": 1.959384238779133, "learning_rate": 4.982858918131906e-07, "logits/chosen": -0.7587329745292664, "logits/rejected": -0.7490700483322144, "logps/chosen": -275.38116455078125, "logps/rejected": -231.76719665527344, "loss": 0.6862, "rewards/accuracies": 0.59375, "rewards/chosen": 4.9128313548862934e-05, "rewards/margins": 0.018238583579659462, "rewards/rejected": -0.018189454451203346, "step": 64 }, { "epoch": 0.1360900287882753, "grad_norm": 2.0443965427233226, "learning_rate": 4.980652179769217e-07, "logits/chosen": -0.6966503262519836, "logits/rejected": -0.7912587523460388, "logps/chosen": -396.6440124511719, "logps/rejected": -329.6874694824219, "loss": 0.6863, "rewards/accuracies": 0.59375, "rewards/chosen": -0.003138565458357334, "rewards/margins": 0.018537556752562523, "rewards/rejected": -0.02167612314224243, "step": 65 }, { "epoch": 0.13818372153886418, "grad_norm": 2.006231972376758, "learning_rate": 4.978312411558517e-07, "logits/chosen": -0.5625854134559631, "logits/rejected": -0.6061365008354187, "logps/chosen": -232.70681762695312, "logps/rejected": -250.01683044433594, "loss": 0.6849, "rewards/accuracies": 0.625, "rewards/chosen": -0.004839140921831131, "rewards/margins": 0.0109052499756217, "rewards/rejected": -0.015744389966130257, "step": 66 }, { "epoch": 0.14027741428945303, "grad_norm": 2.376956433563554, "learning_rate": 4.975839738974473e-07, "logits/chosen": -0.5915093421936035, "logits/rejected": -0.5767983198165894, "logps/chosen": -356.6428527832031, "logps/rejected": -355.6318664550781, "loss": 0.6756, "rewards/accuracies": 0.6875, "rewards/chosen": 0.007966393604874611, "rewards/margins": 0.03256059065461159, "rewards/rejected": -0.024594198912382126, "step": 67 }, { "epoch": 0.14237110704004188, "grad_norm": 2.0405831578430007, "learning_rate": 4.97323429461901e-07, "logits/chosen": -0.6072592735290527, "logits/rejected": -0.6102288961410522, "logps/chosen": -234.42662048339844, "logps/rejected": -248.77467346191406, "loss": 0.6823, "rewards/accuracies": 0.78125, "rewards/chosen": -0.006947504822164774, "rewards/margins": 0.02747585065662861, "rewards/rejected": -0.0344233512878418, "step": 68 }, { "epoch": 0.14446479979063073, "grad_norm": 2.1859681908655944, "learning_rate": 4.970496218214204e-07, "logits/chosen": -0.6759756803512573, "logits/rejected": -0.7359828948974609, "logps/chosen": -248.28138732910156, "logps/rejected": -281.7460632324219, "loss": 0.6853, "rewards/accuracies": 0.5, "rewards/chosen": -0.018444957211613655, "rewards/margins": 0.007142477668821812, "rewards/rejected": -0.02558743767440319, "step": 69 }, { "epoch": 0.14655849254121958, "grad_norm": 1.9582790956713594, "learning_rate": 4.967625656594781e-07, "logits/chosen": -0.6839518547058105, "logits/rejected": -0.5589462518692017, "logps/chosen": -296.99798583984375, "logps/rejected": -233.34228515625, "loss": 0.684, "rewards/accuracies": 0.6875, "rewards/chosen": -0.010661616921424866, "rewards/margins": 0.022116374224424362, "rewards/rejected": -0.032777994871139526, "step": 70 }, { "epoch": 0.14865218529180843, "grad_norm": 2.13237405552888, "learning_rate": 4.964622763700252e-07, "logits/chosen": -0.75205397605896, "logits/rejected": -0.6713017821311951, "logps/chosen": -264.08294677734375, "logps/rejected": -248.28883361816406, "loss": 0.6817, "rewards/accuracies": 0.625, "rewards/chosen": -0.012494457885622978, "rewards/margins": 0.01229042373597622, "rewards/rejected": -0.024784879758954048, "step": 71 }, { "epoch": 0.15074587804239728, "grad_norm": 2.143874653161483, "learning_rate": 4.961487700566646e-07, "logits/chosen": -0.7627979516983032, "logits/rejected": -0.6970850825309753, "logps/chosen": -327.4230041503906, "logps/rejected": -256.9564514160156, "loss": 0.6806, "rewards/accuracies": 0.65625, "rewards/chosen": -0.008451453410089016, "rewards/margins": 0.025298018008470535, "rewards/rejected": -0.03374946862459183, "step": 72 }, { "epoch": 0.15283957079298613, "grad_norm": 2.1391118093041657, "learning_rate": 4.958220635317885e-07, "logits/chosen": -0.6539451479911804, "logits/rejected": -0.5963329672813416, "logps/chosen": -284.647216796875, "logps/rejected": -200.07244873046875, "loss": 0.6819, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02597237192094326, "rewards/margins": 0.023477813228964806, "rewards/rejected": -0.04945018142461777, "step": 73 }, { "epoch": 0.15493326354357498, "grad_norm": 2.1024260152103147, "learning_rate": 4.954821743156767e-07, "logits/chosen": -0.6240401864051819, "logits/rejected": -0.6036374568939209, "logps/chosen": -269.7067565917969, "logps/rejected": -260.08782958984375, "loss": 0.6823, "rewards/accuracies": 0.71875, "rewards/chosen": -0.024506010115146637, "rewards/margins": 0.03602860867977142, "rewards/rejected": -0.06053461506962776, "step": 74 }, { "epoch": 0.15702695629416383, "grad_norm": 2.1409942967531492, "learning_rate": 4.951291206355559e-07, "logits/chosen": -0.7455810308456421, "logits/rejected": -0.7258849740028381, "logps/chosen": -319.21630859375, "logps/rejected": -289.306640625, "loss": 0.6774, "rewards/accuracies": 0.8125, "rewards/chosen": -0.02300322614610195, "rewards/margins": 0.04593230411410332, "rewards/rejected": -0.06893552839756012, "step": 75 }, { "epoch": 0.15912064904475268, "grad_norm": 1.9586106002336963, "learning_rate": 4.947629214246236e-07, "logits/chosen": -0.6800742745399475, "logits/rejected": -0.6047073602676392, "logps/chosen": -244.04946899414062, "logps/rejected": -278.0081787109375, "loss": 0.6812, "rewards/accuracies": 0.59375, "rewards/chosen": -0.012847788631916046, "rewards/margins": 0.021262703463435173, "rewards/rejected": -0.03411049023270607, "step": 76 }, { "epoch": 0.16121434179534153, "grad_norm": 2.0214681327100186, "learning_rate": 4.943835963210323e-07, "logits/chosen": -0.8359452486038208, "logits/rejected": -0.7800988554954529, "logps/chosen": -323.22955322265625, "logps/rejected": -228.22254943847656, "loss": 0.6822, "rewards/accuracies": 0.6875, "rewards/chosen": -0.015957625582814217, "rewards/margins": 0.036251623183488846, "rewards/rejected": -0.05220925062894821, "step": 77 }, { "epoch": 0.16330803454593038, "grad_norm": 2.0168507552202533, "learning_rate": 4.939911656668361e-07, "logits/chosen": -0.6332938075065613, "logits/rejected": -0.6258643865585327, "logps/chosen": -328.7702941894531, "logps/rejected": -251.2275848388672, "loss": 0.6788, "rewards/accuracies": 0.75, "rewards/chosen": -0.022621842101216316, "rewards/margins": 0.044935792684555054, "rewards/rejected": -0.06755763292312622, "step": 78 }, { "epoch": 0.16540172729651922, "grad_norm": 2.072227054690479, "learning_rate": 4.935856505068998e-07, "logits/chosen": -0.775145947933197, "logits/rejected": -0.6434796452522278, "logps/chosen": -341.0631408691406, "logps/rejected": -214.4259796142578, "loss": 0.6751, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02260693535208702, "rewards/margins": 0.04053226113319397, "rewards/rejected": -0.06313919275999069, "step": 79 }, { "epoch": 0.16749542004710807, "grad_norm": 2.244703676150944, "learning_rate": 4.93167072587771e-07, "logits/chosen": -0.5710870027542114, "logits/rejected": -0.4453847408294678, "logps/chosen": -247.19891357421875, "logps/rejected": -200.0994110107422, "loss": 0.6776, "rewards/accuracies": 0.65625, "rewards/chosen": -0.04803721606731415, "rewards/margins": 0.025850879028439522, "rewards/rejected": -0.07388809323310852, "step": 80 }, { "epoch": 0.16958911279769695, "grad_norm": 2.063665642835572, "learning_rate": 4.92735454356513e-07, "logits/chosen": -0.812736988067627, "logits/rejected": -0.5206312537193298, "logps/chosen": -276.9650573730469, "logps/rejected": -252.14804077148438, "loss": 0.6727, "rewards/accuracies": 0.71875, "rewards/chosen": -0.03331292048096657, "rewards/margins": 0.04902251809835434, "rewards/rejected": -0.08233543485403061, "step": 81 }, { "epoch": 0.1716828055482858, "grad_norm": 2.0155269028204232, "learning_rate": 4.922908189595017e-07, "logits/chosen": -0.4435555934906006, "logits/rejected": -0.38911890983581543, "logps/chosen": -264.0360107421875, "logps/rejected": -199.81163024902344, "loss": 0.6731, "rewards/accuracies": 0.625, "rewards/chosen": -0.04146213456988335, "rewards/margins": 0.016313647851347923, "rewards/rejected": -0.057775795459747314, "step": 82 }, { "epoch": 0.17377649829887465, "grad_norm": 2.0821907622714595, "learning_rate": 4.918331902411841e-07, "logits/chosen": -0.7084863185882568, "logits/rejected": -0.7467921376228333, "logps/chosen": -246.05174255371094, "logps/rejected": -243.80894470214844, "loss": 0.6797, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03674688935279846, "rewards/margins": 0.03050503320991993, "rewards/rejected": -0.06725192815065384, "step": 83 }, { "epoch": 0.1758701910494635, "grad_norm": 2.246354017699191, "learning_rate": 4.913625927427995e-07, "logits/chosen": -0.8339526653289795, "logits/rejected": -0.9025973081588745, "logps/chosen": -300.51068115234375, "logps/rejected": -379.3022766113281, "loss": 0.6712, "rewards/accuracies": 0.625, "rewards/chosen": -0.03902056813240051, "rewards/margins": 0.044128429144620895, "rewards/rejected": -0.0831490010023117, "step": 84 }, { "epoch": 0.17796388380005235, "grad_norm": 2.197927092804245, "learning_rate": 4.908790517010636e-07, "logits/chosen": -0.8077641129493713, "logits/rejected": -0.7555009126663208, "logps/chosen": -212.62904357910156, "logps/rejected": -230.84715270996094, "loss": 0.6748, "rewards/accuracies": 0.5, "rewards/chosen": -0.050597868859767914, "rewards/margins": 0.017507800832390785, "rewards/rejected": -0.06810566782951355, "step": 85 }, { "epoch": 0.1800575765506412, "grad_norm": 2.1015679960935705, "learning_rate": 4.903825930468148e-07, "logits/chosen": -0.7299667596817017, "logits/rejected": -0.7447741627693176, "logps/chosen": -209.83750915527344, "logps/rejected": -228.0347137451172, "loss": 0.6682, "rewards/accuracies": 0.78125, "rewards/chosen": -0.03236336633563042, "rewards/margins": 0.05685946345329285, "rewards/rejected": -0.08922282606363297, "step": 86 }, { "epoch": 0.18215126930123005, "grad_norm": 2.231692626717296, "learning_rate": 4.898732434036243e-07, "logits/chosen": -0.5808308124542236, "logits/rejected": -0.6936120986938477, "logps/chosen": -264.3621520996094, "logps/rejected": -264.56329345703125, "loss": 0.6698, "rewards/accuracies": 0.71875, "rewards/chosen": -0.033665310591459274, "rewards/margins": 0.06689146906137466, "rewards/rejected": -0.10055678337812424, "step": 87 }, { "epoch": 0.1842449620518189, "grad_norm": 2.0534389893315326, "learning_rate": 4.893510300863676e-07, "logits/chosen": -0.6730471849441528, "logits/rejected": -0.626852810382843, "logps/chosen": -213.390380859375, "logps/rejected": -256.2815856933594, "loss": 0.6801, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06772853434085846, "rewards/margins": 0.016132023185491562, "rewards/rejected": -0.08386056125164032, "step": 88 }, { "epoch": 0.18633865480240774, "grad_norm": 3.5774333565505825, "learning_rate": 4.8881598109976e-07, "logits/chosen": -0.8176313042640686, "logits/rejected": -0.6967725157737732, "logps/chosen": -332.6808166503906, "logps/rejected": -286.943359375, "loss": 0.6698, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06047438830137253, "rewards/margins": 0.017377573996782303, "rewards/rejected": -0.07785196602344513, "step": 89 }, { "epoch": 0.1884323475529966, "grad_norm": 2.2048402638996194, "learning_rate": 4.882681251368548e-07, "logits/chosen": -0.8499391078948975, "logits/rejected": -0.8084088563919067, "logps/chosen": -361.3448181152344, "logps/rejected": -268.6840515136719, "loss": 0.6665, "rewards/accuracies": 0.75, "rewards/chosen": -0.05152790620923042, "rewards/margins": 0.10043416172266006, "rewards/rejected": -0.15196208655834198, "step": 90 }, { "epoch": 0.19052604030358544, "grad_norm": 2.3037940028279214, "learning_rate": 4.877074915775048e-07, "logits/chosen": -0.8542874455451965, "logits/rejected": -0.7181276082992554, "logps/chosen": -296.7271728515625, "logps/rejected": -259.791015625, "loss": 0.67, "rewards/accuracies": 0.78125, "rewards/chosen": -0.07071080803871155, "rewards/margins": 0.06956136971712112, "rewards/rejected": -0.14027217030525208, "step": 91 }, { "epoch": 0.1926197330541743, "grad_norm": 2.2446193686453713, "learning_rate": 4.871341104867864e-07, "logits/chosen": -0.5744785666465759, "logits/rejected": -0.6070349216461182, "logps/chosen": -263.32318115234375, "logps/rejected": -236.64015197753906, "loss": 0.6631, "rewards/accuracies": 0.78125, "rewards/chosen": -0.06148665398359299, "rewards/margins": 0.07813382893800735, "rewards/rejected": -0.13962048292160034, "step": 92 }, { "epoch": 0.19471342580476314, "grad_norm": 2.3193848391311396, "learning_rate": 4.865480126133871e-07, "logits/chosen": -0.39636602997779846, "logits/rejected": -0.49825718998908997, "logps/chosen": -217.11976623535156, "logps/rejected": -285.2427978515625, "loss": 0.6702, "rewards/accuracies": 0.75, "rewards/chosen": -0.05208142101764679, "rewards/margins": 0.09379248321056366, "rewards/rejected": -0.14587390422821045, "step": 93 }, { "epoch": 0.196807118555352, "grad_norm": 2.088378313594801, "learning_rate": 4.859492293879573e-07, "logits/chosen": -0.7813756465911865, "logits/rejected": -0.7635777592658997, "logps/chosen": -235.960693359375, "logps/rejected": -252.4083709716797, "loss": 0.6766, "rewards/accuracies": 0.65625, "rewards/chosen": -0.08113241195678711, "rewards/margins": 0.041926972568035126, "rewards/rejected": -0.12305939197540283, "step": 94 }, { "epoch": 0.19890081130594087, "grad_norm": 2.324031303285473, "learning_rate": 4.853377929214243e-07, "logits/chosen": -0.8027840852737427, "logits/rejected": -0.8413727283477783, "logps/chosen": -300.5863037109375, "logps/rejected": -368.9302673339844, "loss": 0.6667, "rewards/accuracies": 0.59375, "rewards/chosen": -0.09205912798643112, "rewards/margins": 0.03071211650967598, "rewards/rejected": -0.1227712482213974, "step": 95 }, { "epoch": 0.20099450405652972, "grad_norm": 2.1952114016909823, "learning_rate": 4.847137360032699e-07, "logits/chosen": -0.7533310055732727, "logits/rejected": -0.5886276364326477, "logps/chosen": -321.7952880859375, "logps/rejected": -251.48423767089844, "loss": 0.6752, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07287660986185074, "rewards/margins": 0.06314495950937271, "rewards/rejected": -0.13602156937122345, "step": 96 }, { "epoch": 0.20308819680711857, "grad_norm": 2.352459159567149, "learning_rate": 4.84077092099773e-07, "logits/chosen": -0.8565846085548401, "logits/rejected": -0.7420638799667358, "logps/chosen": -336.0909423828125, "logps/rejected": -255.9292755126953, "loss": 0.6721, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0998564064502716, "rewards/margins": 0.07056114822626114, "rewards/rejected": -0.17041754722595215, "step": 97 }, { "epoch": 0.20518188955770741, "grad_norm": 2.2420448964778026, "learning_rate": 4.834278953522137e-07, "logits/chosen": -0.7887314558029175, "logits/rejected": -0.7282671928405762, "logps/chosen": -356.1842346191406, "logps/rejected": -255.9042510986328, "loss": 0.6678, "rewards/accuracies": 0.59375, "rewards/chosen": -0.09596212208271027, "rewards/margins": 0.08084248006343842, "rewards/rejected": -0.17680461704730988, "step": 98 }, { "epoch": 0.20727558230829626, "grad_norm": 2.1524092097128094, "learning_rate": 4.827661805750437e-07, "logits/chosen": -0.766761064529419, "logits/rejected": -0.8436205983161926, "logps/chosen": -269.53497314453125, "logps/rejected": -275.6004638671875, "loss": 0.6761, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11247224360704422, "rewards/margins": 0.06839179247617722, "rewards/rejected": -0.18086405098438263, "step": 99 }, { "epoch": 0.2093692750588851, "grad_norm": 2.2283585056044073, "learning_rate": 4.820919832540181e-07, "logits/chosen": -0.8120338916778564, "logits/rejected": -0.6906442642211914, "logps/chosen": -348.03997802734375, "logps/rejected": -272.8766784667969, "loss": 0.6548, "rewards/accuracies": 0.75, "rewards/chosen": -0.10157693922519684, "rewards/margins": 0.12684565782546997, "rewards/rejected": -0.22842258214950562, "step": 100 }, { "epoch": 0.21146296780947396, "grad_norm": 2.3253416214797618, "learning_rate": 4.814053395442932e-07, "logits/chosen": -0.7263871431350708, "logits/rejected": -0.6843710541725159, "logps/chosen": -309.49359130859375, "logps/rejected": -286.2685546875, "loss": 0.6742, "rewards/accuracies": 0.59375, "rewards/chosen": -0.14648452401161194, "rewards/margins": 0.04545987397432327, "rewards/rejected": -0.19194439053535461, "step": 101 }, { "epoch": 0.2135566605600628, "grad_norm": 2.2501981550420362, "learning_rate": 4.807062862684873e-07, "logits/chosen": -0.6538861989974976, "logits/rejected": -0.610673725605011, "logps/chosen": -283.59393310546875, "logps/rejected": -261.5007019042969, "loss": 0.6741, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1330275535583496, "rewards/margins": 0.03604242950677872, "rewards/rejected": -0.16906996071338654, "step": 102 }, { "epoch": 0.21565035331065166, "grad_norm": 2.4122065464919693, "learning_rate": 4.799948609147061e-07, "logits/chosen": -0.7424915432929993, "logits/rejected": -0.6838539838790894, "logps/chosen": -344.6163330078125, "logps/rejected": -375.2181701660156, "loss": 0.6763, "rewards/accuracies": 0.59375, "rewards/chosen": -0.15367819368839264, "rewards/margins": 0.00416429853066802, "rewards/rejected": -0.1578424870967865, "step": 103 }, { "epoch": 0.2177440460612405, "grad_norm": 2.202206572706639, "learning_rate": 4.792711016345321e-07, "logits/chosen": -0.9123300909996033, "logits/rejected": -0.5909267067909241, "logps/chosen": -296.15655517578125, "logps/rejected": -277.62091064453125, "loss": 0.6618, "rewards/accuracies": 0.59375, "rewards/chosen": -0.15593835711479187, "rewards/margins": 0.032225266098976135, "rewards/rejected": -0.188163623213768, "step": 104 }, { "epoch": 0.21983773881182936, "grad_norm": 2.4899405877102736, "learning_rate": 4.785350472409791e-07, "logits/chosen": -0.801268458366394, "logits/rejected": -0.693882167339325, "logps/chosen": -312.6393127441406, "logps/rejected": -302.8817443847656, "loss": 0.659, "rewards/accuracies": 0.59375, "rewards/chosen": -0.18543393909931183, "rewards/margins": 0.02549637109041214, "rewards/rejected": -0.21093033254146576, "step": 105 }, { "epoch": 0.2219314315624182, "grad_norm": 2.505438867299177, "learning_rate": 4.777867372064105e-07, "logits/chosen": -0.9436739087104797, "logits/rejected": -0.8665189743041992, "logps/chosen": -391.1027526855469, "logps/rejected": -311.3518981933594, "loss": 0.6681, "rewards/accuracies": 0.59375, "rewards/chosen": -0.18962696194648743, "rewards/margins": 0.061485111713409424, "rewards/rejected": -0.25111207365989685, "step": 106 }, { "epoch": 0.22402512431300706, "grad_norm": 2.4500215949755964, "learning_rate": 4.770262116604223e-07, "logits/chosen": -0.9061310291290283, "logits/rejected": -0.7668851613998413, "logps/chosen": -280.1986389160156, "logps/rejected": -248.912353515625, "loss": 0.6684, "rewards/accuracies": 0.625, "rewards/chosen": -0.16495895385742188, "rewards/margins": 0.0857238918542862, "rewards/rejected": -0.25068286061286926, "step": 107 }, { "epoch": 0.2261188170635959, "grad_norm": 2.363568349167871, "learning_rate": 4.7625351138769166e-07, "logits/chosen": -0.8817356824874878, "logits/rejected": -0.767876386642456, "logps/chosen": -286.58935546875, "logps/rejected": -216.1655731201172, "loss": 0.6581, "rewards/accuracies": 0.71875, "rewards/chosen": -0.16056066751480103, "rewards/margins": 0.15276899933815002, "rewards/rejected": -0.31332966685295105, "step": 108 }, { "epoch": 0.22821250981418476, "grad_norm": 2.3586814888932097, "learning_rate": 4.75468677825789e-07, "logits/chosen": -0.9033939838409424, "logits/rejected": -0.9127844572067261, "logps/chosen": -367.2960205078125, "logps/rejected": -316.00579833984375, "loss": 0.6661, "rewards/accuracies": 0.65625, "rewards/chosen": -0.16384515166282654, "rewards/margins": 0.11176474392414093, "rewards/rejected": -0.27560991048812866, "step": 109 }, { "epoch": 0.23030620256477363, "grad_norm": 2.3117137268190864, "learning_rate": 4.7467175306295647e-07, "logits/chosen": -0.7496669888496399, "logits/rejected": -0.7020635604858398, "logps/chosen": -236.1432647705078, "logps/rejected": -244.9922637939453, "loss": 0.6611, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17641478776931763, "rewards/margins": 0.0799957737326622, "rewards/rejected": -0.2564105987548828, "step": 110 }, { "epoch": 0.23239989531536248, "grad_norm": 2.4646751056166787, "learning_rate": 4.7386277983585053e-07, "logits/chosen": -0.5802682638168335, "logits/rejected": -0.5567517876625061, "logps/chosen": -277.6441345214844, "logps/rejected": -293.482177734375, "loss": 0.6567, "rewards/accuracies": 0.75, "rewards/chosen": -0.20104415714740753, "rewards/margins": 0.10905403643846512, "rewards/rejected": -0.31009820103645325, "step": 111 }, { "epoch": 0.23449358806595133, "grad_norm": 2.7630036820939328, "learning_rate": 4.7304180152725024e-07, "logits/chosen": -0.6056971549987793, "logits/rejected": -0.730692982673645, "logps/chosen": -220.58412170410156, "logps/rejected": -272.75457763671875, "loss": 0.637, "rewards/accuracies": 0.6875, "rewards/chosen": -0.16876310110092163, "rewards/margins": 0.0999322161078453, "rewards/rejected": -0.26869529485702515, "step": 112 }, { "epoch": 0.23658728081654018, "grad_norm": 2.5092473279499967, "learning_rate": 4.7220886216373085e-07, "logits/chosen": -0.9281202554702759, "logits/rejected": -0.8081488013267517, "logps/chosen": -253.2613067626953, "logps/rejected": -225.5787811279297, "loss": 0.6759, "rewards/accuracies": 0.53125, "rewards/chosen": -0.2524522542953491, "rewards/margins": 0.0012571308761835098, "rewards/rejected": -0.2537093758583069, "step": 113 }, { "epoch": 0.23868097356712903, "grad_norm": 2.5404189900386744, "learning_rate": 4.7136400641330245e-07, "logits/chosen": -0.8768197894096375, "logits/rejected": -0.7804550528526306, "logps/chosen": -356.69403076171875, "logps/rejected": -314.6672668457031, "loss": 0.6515, "rewards/accuracies": 0.625, "rewards/chosen": -0.18054646253585815, "rewards/margins": 0.08943217992782593, "rewards/rejected": -0.2699786424636841, "step": 114 }, { "epoch": 0.24077466631771788, "grad_norm": 3.557240631428595, "learning_rate": 4.70507279583015e-07, "logits/chosen": -0.8765246272087097, "logits/rejected": -0.7100471258163452, "logps/chosen": -297.8513488769531, "logps/rejected": -278.15692138671875, "loss": 0.653, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2482127547264099, "rewards/margins": 0.1277945637702942, "rewards/rejected": -0.3760073482990265, "step": 115 }, { "epoch": 0.24286835906830673, "grad_norm": 2.7121533201433397, "learning_rate": 4.6963872761652834e-07, "logits/chosen": -0.6465345621109009, "logits/rejected": -0.6419689655303955, "logps/chosen": -250.98475646972656, "logps/rejected": -236.76187133789062, "loss": 0.6455, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2517586946487427, "rewards/margins": 0.06587891280651093, "rewards/rejected": -0.3176375925540924, "step": 116 }, { "epoch": 0.24496205181889558, "grad_norm": 2.899570295960883, "learning_rate": 4.687583970916486e-07, "logits/chosen": -0.8432968258857727, "logits/rejected": -0.7110350728034973, "logps/chosen": -332.67950439453125, "logps/rejected": -276.92584228515625, "loss": 0.6596, "rewards/accuracies": 0.59375, "rewards/chosen": -0.289572149515152, "rewards/margins": 0.05645378679037094, "rewards/rejected": -0.3460259437561035, "step": 117 }, { "epoch": 0.24705574456948443, "grad_norm": 2.6590826737047206, "learning_rate": 4.6786633521783005e-07, "logits/chosen": -0.8244806528091431, "logits/rejected": -0.7650989294052124, "logps/chosen": -306.63909912109375, "logps/rejected": -286.13336181640625, "loss": 0.6481, "rewards/accuracies": 0.75, "rewards/chosen": -0.23957893252372742, "rewards/margins": 0.09444958716630936, "rewards/rejected": -0.3340285122394562, "step": 118 }, { "epoch": 0.24914943732007327, "grad_norm": 2.63508284285874, "learning_rate": 4.669625898336438e-07, "logits/chosen": -0.7436507940292358, "logits/rejected": -0.7288177013397217, "logps/chosen": -253.5546875, "logps/rejected": -248.28660583496094, "loss": 0.6474, "rewards/accuracies": 0.71875, "rewards/chosen": -0.24074944853782654, "rewards/margins": 0.12997063994407654, "rewards/rejected": -0.3707200884819031, "step": 119 }, { "epoch": 0.2512431300706621, "grad_norm": 2.6955107050229716, "learning_rate": 4.6604720940421207e-07, "logits/chosen": -0.6667100787162781, "logits/rejected": -0.6995654106140137, "logps/chosen": -315.94873046875, "logps/rejected": -275.4889831542969, "loss": 0.6513, "rewards/accuracies": 0.59375, "rewards/chosen": -0.28267398476600647, "rewards/margins": 0.12529906630516052, "rewards/rejected": -0.4079730212688446, "step": 120 }, { "epoch": 0.253336822821251, "grad_norm": 2.5556325455562146, "learning_rate": 4.651202430186092e-07, "logits/chosen": -0.8555119037628174, "logits/rejected": -0.8221947550773621, "logps/chosen": -314.2303466796875, "logps/rejected": -281.5426025390625, "loss": 0.6403, "rewards/accuracies": 0.65625, "rewards/chosen": -0.26283323764801025, "rewards/margins": 0.11305763572454453, "rewards/rejected": -0.3758908808231354, "step": 121 }, { "epoch": 0.2554305155718398, "grad_norm": 2.824190828267635, "learning_rate": 4.6418174038722924e-07, "logits/chosen": -0.8114756941795349, "logits/rejected": -0.8379485011100769, "logps/chosen": -297.67950439453125, "logps/rejected": -253.80145263671875, "loss": 0.6292, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2982485294342041, "rewards/margins": 0.15191057324409485, "rewards/rejected": -0.45015910267829895, "step": 122 }, { "epoch": 0.25752420832242867, "grad_norm": 2.6531488119711737, "learning_rate": 4.6323175183912023e-07, "logits/chosen": -0.7469431757926941, "logits/rejected": -0.8131128549575806, "logps/chosen": -303.617431640625, "logps/rejected": -276.1275939941406, "loss": 0.6502, "rewards/accuracies": 0.75, "rewards/chosen": -0.3178527355194092, "rewards/margins": 0.14342176914215088, "rewards/rejected": -0.46127447485923767, "step": 123 }, { "epoch": 0.2596179010730175, "grad_norm": 2.67634370124945, "learning_rate": 4.6227032831928483e-07, "logits/chosen": -0.7674790024757385, "logits/rejected": -0.6936861872673035, "logps/chosen": -346.21173095703125, "logps/rejected": -275.9278259277344, "loss": 0.625, "rewards/accuracies": 0.75, "rewards/chosen": -0.2985009551048279, "rewards/margins": 0.1785661280155182, "rewards/rejected": -0.4770670533180237, "step": 124 }, { "epoch": 0.26171159382360637, "grad_norm": 2.6613188985285863, "learning_rate": 4.612975213859487e-07, "logits/chosen": -0.8101502060890198, "logits/rejected": -0.8271470069885254, "logps/chosen": -257.361328125, "logps/rejected": -248.89871215820312, "loss": 0.658, "rewards/accuracies": 0.625, "rewards/chosen": -0.3189204931259155, "rewards/margins": 0.09418576955795288, "rewards/rejected": -0.4131062924861908, "step": 125 }, { "epoch": 0.2638052865741952, "grad_norm": 2.6264787904396334, "learning_rate": 4.603133832077953e-07, "logits/chosen": -1.0526741743087769, "logits/rejected": -0.8893198370933533, "logps/chosen": -330.3887939453125, "logps/rejected": -258.9415283203125, "loss": 0.6395, "rewards/accuracies": 0.59375, "rewards/chosen": -0.40112945437431335, "rewards/margins": 0.04682213068008423, "rewards/rejected": -0.4479515552520752, "step": 126 }, { "epoch": 0.26589897932478407, "grad_norm": 2.955758218852093, "learning_rate": 4.5931796656116837e-07, "logits/chosen": -1.027914047241211, "logits/rejected": -0.817064642906189, "logps/chosen": -285.3546447753906, "logps/rejected": -262.5631408691406, "loss": 0.6612, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3817771077156067, "rewards/margins": 0.13248004019260406, "rewards/rejected": -0.5142571926116943, "step": 127 }, { "epoch": 0.2679926720753729, "grad_norm": 2.8798732668842075, "learning_rate": 4.5831132482724193e-07, "logits/chosen": -0.9484893679618835, "logits/rejected": -0.9374022483825684, "logps/chosen": -304.084228515625, "logps/rejected": -315.7504577636719, "loss": 0.652, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3972603678703308, "rewards/margins": 0.11615100502967834, "rewards/rejected": -0.5134113430976868, "step": 128 }, { "epoch": 0.27008636482596177, "grad_norm": 2.726046827234988, "learning_rate": 4.5729351198915705e-07, "logits/chosen": -0.821631908416748, "logits/rejected": -0.881803035736084, "logps/chosen": -292.4264221191406, "logps/rejected": -327.34796142578125, "loss": 0.6356, "rewards/accuracies": 0.59375, "rewards/chosen": -0.44171586632728577, "rewards/margins": 0.06979832798242569, "rewards/rejected": -0.5115141868591309, "step": 129 }, { "epoch": 0.2721800575765506, "grad_norm": 3.082925552643973, "learning_rate": 4.5626458262912735e-07, "logits/chosen": -0.9053035378456116, "logits/rejected": -0.7657517790794373, "logps/chosen": -402.1451110839844, "logps/rejected": -342.5970764160156, "loss": 0.6291, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3552600145339966, "rewards/margins": 0.12334448099136353, "rewards/rejected": -0.4786044657230377, "step": 130 }, { "epoch": 0.27427375032713947, "grad_norm": 2.938721119624447, "learning_rate": 4.5522459192551166e-07, "logits/chosen": -0.9134140610694885, "logits/rejected": -0.9281446933746338, "logps/chosen": -281.65924072265625, "logps/rejected": -280.98150634765625, "loss": 0.612, "rewards/accuracies": 0.78125, "rewards/chosen": -0.32678863406181335, "rewards/margins": 0.20846635103225708, "rewards/rejected": -0.535254955291748, "step": 131 }, { "epoch": 0.27636744307772837, "grad_norm": 2.831973318930255, "learning_rate": 4.541735956498554e-07, "logits/chosen": -0.9787561893463135, "logits/rejected": -0.8255198001861572, "logps/chosen": -389.1434326171875, "logps/rejected": -409.82928466796875, "loss": 0.6461, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3761777877807617, "rewards/margins": 0.1663150042295456, "rewards/rejected": -0.5424928665161133, "step": 132 }, { "epoch": 0.2784611358283172, "grad_norm": 3.119418422383628, "learning_rate": 4.5311165016389914e-07, "logits/chosen": -0.9133492708206177, "logits/rejected": -0.7962071895599365, "logps/chosen": -330.25750732421875, "logps/rejected": -248.0698699951172, "loss": 0.6534, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3898540139198303, "rewards/margins": 0.11309933662414551, "rewards/rejected": -0.502953290939331, "step": 133 }, { "epoch": 0.28055482857890607, "grad_norm": 2.794780073382938, "learning_rate": 4.520388124165564e-07, "logits/chosen": -0.9228350520133972, "logits/rejected": -0.9974980354309082, "logps/chosen": -301.2253112792969, "logps/rejected": -332.54779052734375, "loss": 0.6285, "rewards/accuracies": 0.75, "rewards/chosen": -0.40412798523902893, "rewards/margins": 0.15353348851203918, "rewards/rejected": -0.5576615333557129, "step": 134 }, { "epoch": 0.2826485213294949, "grad_norm": 2.9941090454341244, "learning_rate": 4.5095513994085974e-07, "logits/chosen": -0.8510048985481262, "logits/rejected": -0.8742996454238892, "logps/chosen": -252.69781494140625, "logps/rejected": -226.66986083984375, "loss": 0.6264, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4069235920906067, "rewards/margins": 0.09905406832695007, "rewards/rejected": -0.5059777498245239, "step": 135 }, { "epoch": 0.28474221408008377, "grad_norm": 3.167148290638935, "learning_rate": 4.498606908508753e-07, "logits/chosen": -1.0063254833221436, "logits/rejected": -0.9629747271537781, "logps/chosen": -383.5644226074219, "logps/rejected": -343.880859375, "loss": 0.646, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4646209180355072, "rewards/margins": 0.1816629320383072, "rewards/rejected": -0.6462838053703308, "step": 136 }, { "epoch": 0.2868359068306726, "grad_norm": 2.8544650227179487, "learning_rate": 4.487555238385862e-07, "logits/chosen": -0.9133685827255249, "logits/rejected": -0.8350323438644409, "logps/chosen": -317.8162841796875, "logps/rejected": -285.8211669921875, "loss": 0.6623, "rewards/accuracies": 0.5625, "rewards/chosen": -0.42300063371658325, "rewards/margins": 0.12662599980831146, "rewards/rejected": -0.5496265888214111, "step": 137 }, { "epoch": 0.28892959958126146, "grad_norm": 3.304254539909629, "learning_rate": 4.476396981707453e-07, "logits/chosen": -0.91913902759552, "logits/rejected": -0.8310465812683105, "logps/chosen": -301.3184509277344, "logps/rejected": -277.730224609375, "loss": 0.6751, "rewards/accuracies": 0.71875, "rewards/chosen": -0.46760421991348267, "rewards/margins": 0.14180074632167816, "rewards/rejected": -0.6094049215316772, "step": 138 }, { "epoch": 0.2910232923318503, "grad_norm": 2.9628971123573566, "learning_rate": 4.4651327368569684e-07, "logits/chosen": -0.9257422685623169, "logits/rejected": -0.9691517353057861, "logps/chosen": -362.6524353027344, "logps/rejected": -326.07568359375, "loss": 0.6322, "rewards/accuracies": 0.6875, "rewards/chosen": -0.376315712928772, "rewards/margins": 0.2625727355480194, "rewards/rejected": -0.6388884782791138, "step": 139 }, { "epoch": 0.29311698508243916, "grad_norm": 3.3501655983344003, "learning_rate": 4.453763107901675e-07, "logits/chosen": -0.7563897967338562, "logits/rejected": -0.7877973318099976, "logps/chosen": -334.58599853515625, "logps/rejected": -297.6249694824219, "loss": 0.6226, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3671949803829193, "rewards/margins": 0.22407814860343933, "rewards/rejected": -0.5912730693817139, "step": 140 }, { "epoch": 0.295210677833028, "grad_norm": 3.2442618110820693, "learning_rate": 4.4422887045602674e-07, "logits/chosen": -1.0642178058624268, "logits/rejected": -0.9513648152351379, "logps/chosen": -318.6060485839844, "logps/rejected": -296.2124938964844, "loss": 0.6696, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4183207154273987, "rewards/margins": 0.14716774225234985, "rewards/rejected": -0.5654884576797485, "step": 141 }, { "epoch": 0.29730437058361686, "grad_norm": 2.9829977486412433, "learning_rate": 4.4307101421701755e-07, "logits/chosen": -0.9983013868331909, "logits/rejected": -0.8099947571754456, "logps/chosen": -346.5638427734375, "logps/rejected": -264.23040771484375, "loss": 0.6366, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4408864974975586, "rewards/margins": 0.1328776627779007, "rewards/rejected": -0.5737640857696533, "step": 142 }, { "epoch": 0.2993980633342057, "grad_norm": 3.2604591939144605, "learning_rate": 4.419028041654559e-07, "logits/chosen": -1.1009725332260132, "logits/rejected": -1.0175827741622925, "logps/chosen": -362.67767333984375, "logps/rejected": -375.475830078125, "loss": 0.6385, "rewards/accuracies": 0.75, "rewards/chosen": -0.46520116925239563, "rewards/margins": 0.23704932630062103, "rewards/rejected": -0.7022505402565002, "step": 143 }, { "epoch": 0.30149175608479456, "grad_norm": 2.902493900124081, "learning_rate": 4.4072430294890166e-07, "logits/chosen": -1.0160636901855469, "logits/rejected": -0.9776636362075806, "logps/chosen": -316.94683837890625, "logps/rejected": -316.5914306640625, "loss": 0.6356, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4207112193107605, "rewards/margins": 0.2683696746826172, "rewards/rejected": -0.6890809535980225, "step": 144 }, { "epoch": 0.3035854488353834, "grad_norm": 3.2624126203448194, "learning_rate": 4.395355737667985e-07, "logits/chosen": -0.9603081345558167, "logits/rejected": -0.8384232521057129, "logps/chosen": -333.023193359375, "logps/rejected": -283.44921875, "loss": 0.6424, "rewards/accuracies": 0.53125, "rewards/chosen": -0.5103600025177002, "rewards/margins": 0.0808807760477066, "rewards/rejected": -0.5912408232688904, "step": 145 }, { "epoch": 0.30567914158597226, "grad_norm": 3.506132940462953, "learning_rate": 4.3833668036708483e-07, "logits/chosen": -0.9286113381385803, "logits/rejected": -0.8193081617355347, "logps/chosen": -317.3082580566406, "logps/rejected": -263.2864990234375, "loss": 0.6677, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5821487903594971, "rewards/margins": 0.00119849294424057, "rewards/rejected": -0.5833473205566406, "step": 146 }, { "epoch": 0.3077728343365611, "grad_norm": 3.050145379356228, "learning_rate": 4.3712768704277524e-07, "logits/chosen": -0.8422614932060242, "logits/rejected": -0.951869785785675, "logps/chosen": -328.5840759277344, "logps/rejected": -313.745849609375, "loss": 0.636, "rewards/accuracies": 0.71875, "rewards/chosen": -0.42078661918640137, "rewards/margins": 0.1792883574962616, "rewards/rejected": -0.6000750064849854, "step": 147 }, { "epoch": 0.30986652708714996, "grad_norm": 3.0989574716052912, "learning_rate": 4.3590865862851263e-07, "logits/chosen": -0.7778136730194092, "logits/rejected": -0.8868573307991028, "logps/chosen": -258.16766357421875, "logps/rejected": -295.7030334472656, "loss": 0.635, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4501706063747406, "rewards/margins": 0.2438880056142807, "rewards/rejected": -0.6940585970878601, "step": 148 }, { "epoch": 0.3119602198377388, "grad_norm": 3.1709856744651916, "learning_rate": 4.346796604970912e-07, "logits/chosen": -0.930332362651825, "logits/rejected": -1.014702558517456, "logps/chosen": -298.2159729003906, "logps/rejected": -354.9796142578125, "loss": 0.6385, "rewards/accuracies": 0.59375, "rewards/chosen": -0.499337375164032, "rewards/margins": 0.12938818335533142, "rewards/rejected": -0.6287255883216858, "step": 149 }, { "epoch": 0.31405391258832765, "grad_norm": 3.2867193816853884, "learning_rate": 4.3344075855595097e-07, "logits/chosen": -1.0004366636276245, "logits/rejected": -0.9738812446594238, "logps/chosen": -287.9877014160156, "logps/rejected": -276.92138671875, "loss": 0.6606, "rewards/accuracies": 0.59375, "rewards/chosen": -0.38925284147262573, "rewards/margins": 0.06718149781227112, "rewards/rejected": -0.45643430948257446, "step": 150 }, { "epoch": 0.3161476053389165, "grad_norm": 2.7998389591139965, "learning_rate": 4.3219201924364323e-07, "logits/chosen": -0.8822636604309082, "logits/rejected": -0.8813071250915527, "logps/chosen": -336.11895751953125, "logps/rejected": -299.78082275390625, "loss": 0.6213, "rewards/accuracies": 0.6875, "rewards/chosen": -0.489565372467041, "rewards/margins": 0.1789228767156601, "rewards/rejected": -0.6684882044792175, "step": 151 }, { "epoch": 0.31824129808950535, "grad_norm": 3.086975439112919, "learning_rate": 4.309335095262675e-07, "logits/chosen": -1.074826955795288, "logits/rejected": -0.8874093890190125, "logps/chosen": -355.8520202636719, "logps/rejected": -374.1164245605469, "loss": 0.6279, "rewards/accuracies": 0.84375, "rewards/chosen": -0.46743249893188477, "rewards/margins": 0.29971230030059814, "rewards/rejected": -0.7671448588371277, "step": 152 }, { "epoch": 0.3203349908400942, "grad_norm": 6.714786625921316, "learning_rate": 4.2966529689388064e-07, "logits/chosen": -0.8606640100479126, "logits/rejected": -0.741177499294281, "logps/chosen": -312.07568359375, "logps/rejected": -280.4713439941406, "loss": 0.6104, "rewards/accuracies": 0.71875, "rewards/chosen": -0.42823731899261475, "rewards/margins": 0.14162594079971313, "rewards/rejected": -0.5698632597923279, "step": 153 }, { "epoch": 0.32242868359068305, "grad_norm": 3.1267962459357275, "learning_rate": 4.2838744935687716e-07, "logits/chosen": -0.9769381284713745, "logits/rejected": -0.946052610874176, "logps/chosen": -282.4183654785156, "logps/rejected": -242.77899169921875, "loss": 0.6318, "rewards/accuracies": 0.65625, "rewards/chosen": -0.46384236216545105, "rewards/margins": 0.16929683089256287, "rewards/rejected": -0.6331391930580139, "step": 154 }, { "epoch": 0.3245223763412719, "grad_norm": 3.3970609407511665, "learning_rate": 4.271000354423425e-07, "logits/chosen": -0.9080954194068909, "logits/rejected": -0.8809458017349243, "logps/chosen": -394.0725402832031, "logps/rejected": -333.3432312011719, "loss": 0.6077, "rewards/accuracies": 0.75, "rewards/chosen": -0.41418159008026123, "rewards/margins": 0.3541857898235321, "rewards/rejected": -0.7683672904968262, "step": 155 }, { "epoch": 0.32661606909186075, "grad_norm": 4.085656012275449, "learning_rate": 4.258031241903777e-07, "logits/chosen": -0.8592851758003235, "logits/rejected": -0.8081427216529846, "logps/chosen": -271.9188537597656, "logps/rejected": -351.8356628417969, "loss": 0.6485, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5018432140350342, "rewards/margins": 0.14107055962085724, "rewards/rejected": -0.6429137587547302, "step": 156 }, { "epoch": 0.3287097618424496, "grad_norm": 3.2776284234859867, "learning_rate": 4.2449678515039743e-07, "logits/chosen": -0.8017740249633789, "logits/rejected": -0.8596625328063965, "logps/chosen": -226.88877868652344, "logps/rejected": -240.0575408935547, "loss": 0.6364, "rewards/accuracies": 0.75, "rewards/chosen": -0.4924962520599365, "rewards/margins": 0.14559431374073029, "rewards/rejected": -0.6380904912948608, "step": 157 }, { "epoch": 0.33080345459303845, "grad_norm": 3.1496305893741137, "learning_rate": 4.2318108837739986e-07, "logits/chosen": -0.8733360171318054, "logits/rejected": -0.9309042692184448, "logps/chosen": -346.1365966796875, "logps/rejected": -333.12176513671875, "loss": 0.6375, "rewards/accuracies": 0.59375, "rewards/chosen": -0.47889524698257446, "rewards/margins": 0.2082909345626831, "rewards/rejected": -0.6871861219406128, "step": 158 }, { "epoch": 0.3328971473436273, "grad_norm": 3.6451097075083774, "learning_rate": 4.218561044282098e-07, "logits/chosen": -0.9299649000167847, "logits/rejected": -0.9889509677886963, "logps/chosen": -405.2691650390625, "logps/rejected": -362.0888977050781, "loss": 0.6414, "rewards/accuracies": 0.6875, "rewards/chosen": -0.49829256534576416, "rewards/margins": 0.14991487562656403, "rewards/rejected": -0.6482074856758118, "step": 159 }, { "epoch": 0.33499084009421615, "grad_norm": 3.49929798625805, "learning_rate": 4.2052190435769554e-07, "logits/chosen": -1.1320297718048096, "logits/rejected": -1.0407469272613525, "logps/chosen": -253.47024536132812, "logps/rejected": -276.7797546386719, "loss": 0.6313, "rewards/accuracies": 0.71875, "rewards/chosen": -0.48531901836395264, "rewards/margins": 0.23991630971431732, "rewards/rejected": -0.7252352833747864, "step": 160 }, { "epoch": 0.33708453284480505, "grad_norm": 3.5621057534930953, "learning_rate": 4.1917855971495763e-07, "logits/chosen": -1.0667507648468018, "logits/rejected": -1.0060434341430664, "logps/chosen": -325.3588562011719, "logps/rejected": -322.5223693847656, "loss": 0.6335, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5729315876960754, "rewards/margins": 0.16416212916374207, "rewards/rejected": -0.7370936870574951, "step": 161 }, { "epoch": 0.3391782255953939, "grad_norm": 3.656800542423542, "learning_rate": 4.1782614253949255e-07, "logits/chosen": -0.8102587461471558, "logits/rejected": -0.8433481454849243, "logps/chosen": -327.4964599609375, "logps/rejected": -299.7692565917969, "loss": 0.6116, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4140094220638275, "rewards/margins": 0.20236673951148987, "rewards/rejected": -0.6163761615753174, "step": 162 }, { "epoch": 0.34127191834598275, "grad_norm": 3.2172452898763244, "learning_rate": 4.164647253573289e-07, "logits/chosen": -1.1590189933776855, "logits/rejected": -1.1172659397125244, "logps/chosen": -294.23291015625, "logps/rejected": -263.67706298828125, "loss": 0.6155, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5377013087272644, "rewards/margins": 0.24606366455554962, "rewards/rejected": -0.7837649583816528, "step": 163 }, { "epoch": 0.3433656110965716, "grad_norm": 4.831981607581527, "learning_rate": 4.1509438117713863e-07, "logits/chosen": -1.0070375204086304, "logits/rejected": -0.969563364982605, "logps/chosen": -305.4855041503906, "logps/rejected": -347.95880126953125, "loss": 0.5986, "rewards/accuracies": 0.75, "rewards/chosen": -0.4911316931247711, "rewards/margins": 0.28633809089660645, "rewards/rejected": -0.7774698138237, "step": 164 }, { "epoch": 0.34545930384716045, "grad_norm": 3.738895673493201, "learning_rate": 4.137151834863213e-07, "logits/chosen": -1.1650652885437012, "logits/rejected": -1.1875463724136353, "logps/chosen": -318.5315856933594, "logps/rejected": -347.0186767578125, "loss": 0.6512, "rewards/accuracies": 0.40625, "rewards/chosen": -0.7454348802566528, "rewards/margins": -0.04513206705451012, "rewards/rejected": -0.70030277967453, "step": 165 }, { "epoch": 0.3475529965977493, "grad_norm": 3.677592967442104, "learning_rate": 4.123272062470633e-07, "logits/chosen": -0.8033360242843628, "logits/rejected": -0.860325813293457, "logps/chosen": -302.0838928222656, "logps/rejected": -330.8976135253906, "loss": 0.6455, "rewards/accuracies": 0.625, "rewards/chosen": -0.497585654258728, "rewards/margins": 0.17136982083320618, "rewards/rejected": -0.6689555048942566, "step": 166 }, { "epoch": 0.34964668934833815, "grad_norm": 3.5280288496363035, "learning_rate": 4.1093052389237174e-07, "logits/chosen": -1.118594765663147, "logits/rejected": -1.012195110321045, "logps/chosen": -341.0162353515625, "logps/rejected": -312.8038330078125, "loss": 0.6404, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6263291239738464, "rewards/margins": 0.12790195643901825, "rewards/rejected": -0.7542310953140259, "step": 167 }, { "epoch": 0.351740382098927, "grad_norm": 3.910799881193677, "learning_rate": 4.0952521132208267e-07, "logits/chosen": -0.7724168300628662, "logits/rejected": -0.7733991742134094, "logps/chosen": -243.98910522460938, "logps/rejected": -289.62164306640625, "loss": 0.6477, "rewards/accuracies": 0.625, "rewards/chosen": -0.5303620100021362, "rewards/margins": 0.14749875664710999, "rewards/rejected": -0.6778607368469238, "step": 168 }, { "epoch": 0.35383407484951584, "grad_norm": 5.9403809724099865, "learning_rate": 4.081113438988443e-07, "logits/chosen": -0.9893789887428284, "logits/rejected": -0.9568789601325989, "logps/chosen": -313.6084899902344, "logps/rejected": -286.12030029296875, "loss": 0.657, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7507687211036682, "rewards/margins": 0.1456027626991272, "rewards/rejected": -0.8963714838027954, "step": 169 }, { "epoch": 0.3559277676001047, "grad_norm": 3.838990979680008, "learning_rate": 4.0668899744407567e-07, "logits/chosen": -0.8731168508529663, "logits/rejected": -0.8513322472572327, "logps/chosen": -377.73931884765625, "logps/rejected": -293.05670166015625, "loss": 0.6267, "rewards/accuracies": 0.625, "rewards/chosen": -0.6137698292732239, "rewards/margins": 0.2576186954975128, "rewards/rejected": -0.8713886141777039, "step": 170 }, { "epoch": 0.35802146035069354, "grad_norm": 4.343837385170686, "learning_rate": 4.0525824823390043e-07, "logits/chosen": -1.0764936208724976, "logits/rejected": -0.9941959977149963, "logps/chosen": -311.1845397949219, "logps/rejected": -278.13653564453125, "loss": 0.6197, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6960826516151428, "rewards/margins": 0.17637191712856293, "rewards/rejected": -0.8724545240402222, "step": 171 }, { "epoch": 0.3601151531012824, "grad_norm": 4.936581171770645, "learning_rate": 4.0381917299505686e-07, "logits/chosen": -1.0785207748413086, "logits/rejected": -0.9231001734733582, "logps/chosen": -353.14019775390625, "logps/rejected": -302.4104919433594, "loss": 0.6336, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7409242391586304, "rewards/margins": 0.051116734743118286, "rewards/rejected": -0.792041003704071, "step": 172 }, { "epoch": 0.36220884585187124, "grad_norm": 3.544319974463203, "learning_rate": 4.0237184890078243e-07, "logits/chosen": -1.021276593208313, "logits/rejected": -0.9218824505805969, "logps/chosen": -361.08990478515625, "logps/rejected": -378.88232421875, "loss": 0.611, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7728280425071716, "rewards/margins": 0.19495846331119537, "rewards/rejected": -0.967786431312561, "step": 173 }, { "epoch": 0.3643025386024601, "grad_norm": 3.589809468662677, "learning_rate": 4.00916353566676e-07, "logits/chosen": -1.0353068113327026, "logits/rejected": -1.072916865348816, "logps/chosen": -331.47344970703125, "logps/rejected": -356.8760070800781, "loss": 0.6128, "rewards/accuracies": 0.625, "rewards/chosen": -0.7766618132591248, "rewards/margins": 0.23775213956832886, "rewards/rejected": -1.0144139528274536, "step": 174 }, { "epoch": 0.36639623135304894, "grad_norm": 4.066069782311012, "learning_rate": 3.994527650465352e-07, "logits/chosen": -1.0037815570831299, "logits/rejected": -0.9480621218681335, "logps/chosen": -363.3121643066406, "logps/rejected": -319.31817626953125, "loss": 0.6321, "rewards/accuracies": 0.625, "rewards/chosen": -0.6948989629745483, "rewards/margins": 0.15337017178535461, "rewards/rejected": -0.8482690453529358, "step": 175 }, { "epoch": 0.3684899241036378, "grad_norm": 4.6012253365610025, "learning_rate": 3.979811618281705e-07, "logits/chosen": -1.1139051914215088, "logits/rejected": -1.0669904947280884, "logps/chosen": -367.2355651855469, "logps/rejected": -348.9967346191406, "loss": 0.6126, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6754685044288635, "rewards/margins": 0.3092360496520996, "rewards/rejected": -0.9847044944763184, "step": 176 }, { "epoch": 0.37058361685422664, "grad_norm": 4.391471848980245, "learning_rate": 3.9650162282919654e-07, "logits/chosen": -1.0147432088851929, "logits/rejected": -0.9802550077438354, "logps/chosen": -421.4739990234375, "logps/rejected": -385.60943603515625, "loss": 0.6022, "rewards/accuracies": 0.75, "rewards/chosen": -0.6281643509864807, "rewards/margins": 0.20620116591453552, "rewards/rejected": -0.8343654870986938, "step": 177 }, { "epoch": 0.3726773096048155, "grad_norm": 4.0770810589563125, "learning_rate": 3.9501422739279953e-07, "logits/chosen": -0.8377776145935059, "logits/rejected": -0.8392380475997925, "logps/chosen": -218.2079620361328, "logps/rejected": -240.64779663085938, "loss": 0.6378, "rewards/accuracies": 0.6875, "rewards/chosen": -0.628129780292511, "rewards/margins": 0.18135149776935577, "rewards/rejected": -0.8094812631607056, "step": 178 }, { "epoch": 0.37477100235540434, "grad_norm": 3.5648473815723154, "learning_rate": 3.935190552834828e-07, "logits/chosen": -0.8449621200561523, "logits/rejected": -0.9233438372612, "logps/chosen": -360.1451721191406, "logps/rejected": -393.1477355957031, "loss": 0.6009, "rewards/accuracies": 0.75, "rewards/chosen": -0.619637131690979, "rewards/margins": 0.26286301016807556, "rewards/rejected": -0.882500171661377, "step": 179 }, { "epoch": 0.3768646951059932, "grad_norm": 3.9490433264410463, "learning_rate": 3.920161866827889e-07, "logits/chosen": -0.9300554990768433, "logits/rejected": -0.9424443244934082, "logps/chosen": -264.6137390136719, "logps/rejected": -279.89459228515625, "loss": 0.6077, "rewards/accuracies": 0.75, "rewards/chosen": -0.6152034997940063, "rewards/margins": 0.18732516467571259, "rewards/rejected": -0.8025285601615906, "step": 180 }, { "epoch": 0.37895838785658204, "grad_norm": 3.713630049915069, "learning_rate": 3.90505702185e-07, "logits/chosen": -1.064460039138794, "logits/rejected": -1.002063274383545, "logps/chosen": -344.5096435546875, "logps/rejected": -323.7726135253906, "loss": 0.6392, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7072369456291199, "rewards/margins": 0.4485825002193451, "rewards/rejected": -1.1558194160461426, "step": 181 }, { "epoch": 0.3810520806071709, "grad_norm": 3.621641726163966, "learning_rate": 3.889876827928156e-07, "logits/chosen": -0.9555776715278625, "logits/rejected": -0.9446895122528076, "logps/chosen": -278.3901062011719, "logps/rejected": -386.50567626953125, "loss": 0.6282, "rewards/accuracies": 0.625, "rewards/chosen": -0.6192256808280945, "rewards/margins": 0.3184245526790619, "rewards/rejected": -0.937650203704834, "step": 182 }, { "epoch": 0.38314577335775973, "grad_norm": 4.3409685713340025, "learning_rate": 3.874622099130087e-07, "logits/chosen": -1.0874221324920654, "logits/rejected": -1.0712660551071167, "logps/chosen": -382.76556396484375, "logps/rejected": -350.2206115722656, "loss": 0.6039, "rewards/accuracies": 0.625, "rewards/chosen": -0.7503204941749573, "rewards/margins": 0.1922486126422882, "rewards/rejected": -0.9425691962242126, "step": 183 }, { "epoch": 0.3852394661083486, "grad_norm": 3.7993958998789603, "learning_rate": 3.859293653520604e-07, "logits/chosen": -0.8046805262565613, "logits/rejected": -0.8877884149551392, "logps/chosen": -275.6742248535156, "logps/rejected": -306.5643310546875, "loss": 0.6008, "rewards/accuracies": 0.84375, "rewards/chosen": -0.46405842900276184, "rewards/margins": 0.2889067530632019, "rewards/rejected": -0.7529651522636414, "step": 184 }, { "epoch": 0.38733315885893743, "grad_norm": 3.292511318185781, "learning_rate": 3.8438923131177237e-07, "logits/chosen": -0.8814327716827393, "logits/rejected": -0.9647997617721558, "logps/chosen": -321.913818359375, "logps/rejected": -260.08203125, "loss": 0.6226, "rewards/accuracies": 0.625, "rewards/chosen": -0.5652605891227722, "rewards/margins": 0.16028377413749695, "rewards/rejected": -0.7255443930625916, "step": 185 }, { "epoch": 0.3894268516095263, "grad_norm": 3.872784050983529, "learning_rate": 3.828418903848593e-07, "logits/chosen": -1.0048834085464478, "logits/rejected": -0.945990800857544, "logps/chosen": -383.0162353515625, "logps/rejected": -349.11053466796875, "loss": 0.632, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7166603207588196, "rewards/margins": 0.1435239613056183, "rewards/rejected": -0.8601843118667603, "step": 186 }, { "epoch": 0.39152054436011513, "grad_norm": 4.410648603224739, "learning_rate": 3.812874255505191e-07, "logits/chosen": -1.0335661172866821, "logits/rejected": -0.9277456402778625, "logps/chosen": -363.1829528808594, "logps/rejected": -432.8406982421875, "loss": 0.6363, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7093329429626465, "rewards/margins": 0.2529588043689728, "rewards/rejected": -0.9622917175292969, "step": 187 }, { "epoch": 0.393614237110704, "grad_norm": 4.791496133787846, "learning_rate": 3.797259201699833e-07, "logits/chosen": -1.0139541625976562, "logits/rejected": -1.0900686979293823, "logps/chosen": -319.1822509765625, "logps/rejected": -341.49847412109375, "loss": 0.6789, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6620299220085144, "rewards/margins": 0.2886333465576172, "rewards/rejected": -0.9506633281707764, "step": 188 }, { "epoch": 0.39570792986129283, "grad_norm": 3.9215322142669597, "learning_rate": 3.781574579820464e-07, "logits/chosen": -0.8225542902946472, "logits/rejected": -0.8100268840789795, "logps/chosen": -210.12469482421875, "logps/rejected": -194.2835693359375, "loss": 0.6698, "rewards/accuracies": 0.53125, "rewards/chosen": -0.5047827959060669, "rewards/margins": 0.13596275448799133, "rewards/rejected": -0.6407455205917358, "step": 189 }, { "epoch": 0.39780162261188173, "grad_norm": 4.073407255706279, "learning_rate": 3.765821230985757e-07, "logits/chosen": -0.9837744832038879, "logits/rejected": -1.0709593296051025, "logps/chosen": -373.37628173828125, "logps/rejected": -442.47210693359375, "loss": 0.5834, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5400768518447876, "rewards/margins": 0.315401166677475, "rewards/rejected": -0.8554779887199402, "step": 190 }, { "epoch": 0.3998953153624706, "grad_norm": 4.159836202157624, "learning_rate": 3.75e-07, "logits/chosen": -1.01850426197052, "logits/rejected": -1.0318413972854614, "logps/chosen": -338.2696533203125, "logps/rejected": -355.7770080566406, "loss": 0.6583, "rewards/accuracies": 0.53125, "rewards/chosen": -0.7128810882568359, "rewards/margins": 0.07909521460533142, "rewards/rejected": -0.7919763922691345, "step": 191 }, { "epoch": 0.40198900811305943, "grad_norm": 4.023262219468805, "learning_rate": 3.734111735307796e-07, "logits/chosen": -0.8635963201522827, "logits/rejected": -0.8391140699386597, "logps/chosen": -352.0013427734375, "logps/rejected": -326.6541442871094, "loss": 0.6207, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5172160863876343, "rewards/margins": 0.19833798706531525, "rewards/rejected": -0.7155541181564331, "step": 192 }, { "epoch": 0.4040827008636483, "grad_norm": 4.2336545839962705, "learning_rate": 3.7181572889485623e-07, "logits/chosen": -0.9286049008369446, "logits/rejected": -0.8810209035873413, "logps/chosen": -283.4415588378906, "logps/rejected": -337.71905517578125, "loss": 0.6179, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6143100261688232, "rewards/margins": 0.197503924369812, "rewards/rejected": -0.8118139505386353, "step": 193 }, { "epoch": 0.40617639361423713, "grad_norm": 3.4555463741404604, "learning_rate": 3.7021375165108377e-07, "logits/chosen": -0.9236476421356201, "logits/rejected": -0.9367985725402832, "logps/chosen": -304.7623596191406, "logps/rejected": -261.16162109375, "loss": 0.602, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5301774144172668, "rewards/margins": 0.17531868815422058, "rewards/rejected": -0.7054961919784546, "step": 194 }, { "epoch": 0.408270086364826, "grad_norm": 3.9445519259265653, "learning_rate": 3.6860532770864005e-07, "logits/chosen": -1.0149463415145874, "logits/rejected": -1.0133212804794312, "logps/chosen": -368.0845642089844, "logps/rejected": -337.61224365234375, "loss": 0.6517, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7208981513977051, "rewards/margins": 0.2204747349023819, "rewards/rejected": -0.9413729310035706, "step": 195 }, { "epoch": 0.41036377911541483, "grad_norm": 3.9384184857327424, "learning_rate": 3.6699054332241985e-07, "logits/chosen": -1.0823029279708862, "logits/rejected": -1.0432735681533813, "logps/chosen": -332.11187744140625, "logps/rejected": -284.4735412597656, "loss": 0.606, "rewards/accuracies": 0.75, "rewards/chosen": -0.6617094278335571, "rewards/margins": 0.2158508449792862, "rewards/rejected": -0.8775602579116821, "step": 196 }, { "epoch": 0.4124574718660037, "grad_norm": 3.744153654110652, "learning_rate": 3.653694850884091e-07, "logits/chosen": -0.9696850180625916, "logits/rejected": -1.0235822200775146, "logps/chosen": -325.044921875, "logps/rejected": -380.5304870605469, "loss": 0.6225, "rewards/accuracies": 0.5, "rewards/chosen": -0.8089624643325806, "rewards/margins": -0.0897996723651886, "rewards/rejected": -0.7191627621650696, "step": 197 }, { "epoch": 0.4145511646165925, "grad_norm": 3.7844776811117296, "learning_rate": 3.6374223993904124e-07, "logits/chosen": -1.0175514221191406, "logits/rejected": -0.9938327670097351, "logps/chosen": -258.1676330566406, "logps/rejected": -343.2113037109375, "loss": 0.6224, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6013177633285522, "rewards/margins": 0.31439411640167236, "rewards/rejected": -0.9157118797302246, "step": 198 }, { "epoch": 0.4166448573671814, "grad_norm": 3.870926133879283, "learning_rate": 3.621088951385353e-07, "logits/chosen": -1.0541473627090454, "logits/rejected": -1.0209746360778809, "logps/chosen": -356.140869140625, "logps/rejected": -361.7309875488281, "loss": 0.6103, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6403514742851257, "rewards/margins": 0.12953613698482513, "rewards/rejected": -0.7698875665664673, "step": 199 }, { "epoch": 0.4187385501177702, "grad_norm": 4.342199077521211, "learning_rate": 3.604695382782159e-07, "logits/chosen": -0.830146074295044, "logits/rejected": -0.7588611245155334, "logps/chosen": -343.5314025878906, "logps/rejected": -285.595947265625, "loss": 0.6481, "rewards/accuracies": 0.625, "rewards/chosen": -0.5845489501953125, "rewards/margins": 0.11396576464176178, "rewards/rejected": -0.6985146999359131, "step": 200 }, { "epoch": 0.4208322428683591, "grad_norm": 4.017845874503845, "learning_rate": 3.588242572718162e-07, "logits/chosen": -0.9542269706726074, "logits/rejected": -0.9640634059906006, "logps/chosen": -287.4453430175781, "logps/rejected": -321.0850830078125, "loss": 0.6007, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5165853500366211, "rewards/margins": 0.3088611662387848, "rewards/rejected": -0.8254464864730835, "step": 201 }, { "epoch": 0.4229259356189479, "grad_norm": 3.9085443472781267, "learning_rate": 3.571731403507635e-07, "logits/chosen": -1.0116387605667114, "logits/rejected": -0.9675155282020569, "logps/chosen": -345.8041687011719, "logps/rejected": -382.7454833984375, "loss": 0.6328, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6035252809524536, "rewards/margins": 0.25514647364616394, "rewards/rejected": -0.8586717247962952, "step": 202 }, { "epoch": 0.4250196283695368, "grad_norm": 3.843124578642179, "learning_rate": 3.5551627605944746e-07, "logits/chosen": -0.8555247187614441, "logits/rejected": -0.7720202207565308, "logps/chosen": -356.9623107910156, "logps/rejected": -294.16107177734375, "loss": 0.6304, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7502315044403076, "rewards/margins": 0.25580644607543945, "rewards/rejected": -1.0060380697250366, "step": 203 }, { "epoch": 0.4271133211201256, "grad_norm": 3.9564818074559494, "learning_rate": 3.5385375325047163e-07, "logits/chosen": -1.043159008026123, "logits/rejected": -1.020918846130371, "logps/chosen": -295.48736572265625, "logps/rejected": -285.2925109863281, "loss": 0.6475, "rewards/accuracies": 0.6875, "rewards/chosen": -0.702950656414032, "rewards/margins": 0.1394089162349701, "rewards/rejected": -0.8423596620559692, "step": 204 }, { "epoch": 0.42920701387071447, "grad_norm": 3.7934149819311367, "learning_rate": 3.5218566107988867e-07, "logits/chosen": -1.14256751537323, "logits/rejected": -1.117805004119873, "logps/chosen": -373.2093505859375, "logps/rejected": -348.0001525878906, "loss": 0.6039, "rewards/accuracies": 0.875, "rewards/chosen": -0.5889281630516052, "rewards/margins": 0.44839006662368774, "rewards/rejected": -1.037318229675293, "step": 205 }, { "epoch": 0.4313007066213033, "grad_norm": 3.9857658278901553, "learning_rate": 3.505120890024195e-07, "logits/chosen": -0.9535520076751709, "logits/rejected": -0.8674717545509338, "logps/chosen": -290.2284240722656, "logps/rejected": -305.6851806640625, "loss": 0.6398, "rewards/accuracies": 0.75, "rewards/chosen": -0.5809500813484192, "rewards/margins": 0.23095953464508057, "rewards/rejected": -0.8119096159934998, "step": 206 }, { "epoch": 0.43339439937189217, "grad_norm": 4.104349143716103, "learning_rate": 3.4883312676665534e-07, "logits/chosen": -0.9587735533714294, "logits/rejected": -0.9456522464752197, "logps/chosen": -345.64520263671875, "logps/rejected": -315.80706787109375, "loss": 0.6538, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5532993078231812, "rewards/margins": 0.13507559895515442, "rewards/rejected": -0.6883749961853027, "step": 207 }, { "epoch": 0.435488092122481, "grad_norm": 3.7033850854773993, "learning_rate": 3.4714886441024573e-07, "logits/chosen": -0.9060953259468079, "logits/rejected": -0.8163396120071411, "logps/chosen": -372.0910339355469, "logps/rejected": -269.00567626953125, "loss": 0.6097, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5927330851554871, "rewards/margins": 0.16467399895191193, "rewards/rejected": -0.7574071288108826, "step": 208 }, { "epoch": 0.43758178487306987, "grad_norm": 3.4645622409797596, "learning_rate": 3.454593922550693e-07, "logits/chosen": -1.0778393745422363, "logits/rejected": -1.0153272151947021, "logps/chosen": -364.489013671875, "logps/rejected": -349.343017578125, "loss": 0.6323, "rewards/accuracies": 0.75, "rewards/chosen": -0.6318523287773132, "rewards/margins": 0.31523868441581726, "rewards/rejected": -0.9470909237861633, "step": 209 }, { "epoch": 0.4396754776236587, "grad_norm": 4.05541309485243, "learning_rate": 3.4376480090239047e-07, "logits/chosen": -1.1387578248977661, "logits/rejected": -1.0902557373046875, "logps/chosen": -450.9346008300781, "logps/rejected": -353.08984375, "loss": 0.6027, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7492223381996155, "rewards/margins": 0.12559929490089417, "rewards/rejected": -0.874821662902832, "step": 210 }, { "epoch": 0.44176917037424757, "grad_norm": 4.119638715515438, "learning_rate": 3.4206518122800055e-07, "logits/chosen": -0.8814012408256531, "logits/rejected": -0.9084731936454773, "logps/chosen": -290.82110595703125, "logps/rejected": -295.4637451171875, "loss": 0.6081, "rewards/accuracies": 0.71875, "rewards/chosen": -0.572927713394165, "rewards/margins": 0.2446538209915161, "rewards/rejected": -0.8175814747810364, "step": 211 }, { "epoch": 0.4438628631248364, "grad_norm": 4.77790727447983, "learning_rate": 3.403606243773448e-07, "logits/chosen": -0.9413045048713684, "logits/rejected": -0.8758677244186401, "logps/chosen": -306.1932067871094, "logps/rejected": -272.56787109375, "loss": 0.6546, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8097062110900879, "rewards/margins": 0.0024148859083652496, "rewards/rejected": -0.8121210932731628, "step": 212 }, { "epoch": 0.44595655587542526, "grad_norm": 3.9859575276001573, "learning_rate": 3.3865122176063385e-07, "logits/chosen": -0.802134096622467, "logits/rejected": -0.8242202401161194, "logps/chosen": -314.6690673828125, "logps/rejected": -399.07275390625, "loss": 0.6023, "rewards/accuracies": 0.75, "rewards/chosen": -0.6408030390739441, "rewards/margins": 0.2936129570007324, "rewards/rejected": -0.9344159960746765, "step": 213 }, { "epoch": 0.4480502486260141, "grad_norm": 4.054167387160397, "learning_rate": 3.3693706504794243e-07, "logits/chosen": -0.9933522939682007, "logits/rejected": -1.0025355815887451, "logps/chosen": -336.1624755859375, "logps/rejected": -356.69793701171875, "loss": 0.6077, "rewards/accuracies": 0.75, "rewards/chosen": -0.6096816658973694, "rewards/margins": 0.29134050011634827, "rewards/rejected": -0.9010221362113953, "step": 214 }, { "epoch": 0.45014394137660296, "grad_norm": 4.014374704086859, "learning_rate": 3.3521824616429284e-07, "logits/chosen": -1.1052436828613281, "logits/rejected": -0.9307059049606323, "logps/chosen": -276.3744201660156, "logps/rejected": -294.1133117675781, "loss": 0.6117, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5845311880111694, "rewards/margins": 0.2892147898674011, "rewards/rejected": -0.8737459778785706, "step": 215 }, { "epoch": 0.4522376341271918, "grad_norm": 4.12425566351273, "learning_rate": 3.334948572847253e-07, "logits/chosen": -0.8936595916748047, "logits/rejected": -0.8552988767623901, "logps/chosen": -223.66757202148438, "logps/rejected": -279.2814636230469, "loss": 0.5915, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7131213545799255, "rewards/margins": 0.213251531124115, "rewards/rejected": -0.9263728857040405, "step": 216 }, { "epoch": 0.45433132687778066, "grad_norm": 3.820544486483965, "learning_rate": 3.317669908293554e-07, "logits/chosen": -0.8404410481452942, "logits/rejected": -0.8817408084869385, "logps/chosen": -306.64813232421875, "logps/rejected": -294.3273010253906, "loss": 0.6454, "rewards/accuracies": 0.59375, "rewards/chosen": -0.607830286026001, "rewards/margins": 0.07771724462509155, "rewards/rejected": -0.6855475902557373, "step": 217 }, { "epoch": 0.4564250196283695, "grad_norm": 4.380665987787131, "learning_rate": 3.300347394584172e-07, "logits/chosen": -0.9957190155982971, "logits/rejected": -0.9547911882400513, "logps/chosen": -347.39599609375, "logps/rejected": -307.5665588378906, "loss": 0.5824, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7037020325660706, "rewards/margins": 0.27967917919158936, "rewards/rejected": -0.9833812713623047, "step": 218 }, { "epoch": 0.4585187123789584, "grad_norm": 4.572756653676887, "learning_rate": 3.2829819606729477e-07, "logits/chosen": -1.039097785949707, "logits/rejected": -1.0874122381210327, "logps/chosen": -344.16949462890625, "logps/rejected": -368.737548828125, "loss": 0.5793, "rewards/accuracies": 0.71875, "rewards/chosen": -0.611099362373352, "rewards/margins": 0.36952948570251465, "rewards/rejected": -0.9806288480758667, "step": 219 }, { "epoch": 0.46061240512954726, "grad_norm": 4.129541516381941, "learning_rate": 3.265574537815398e-07, "logits/chosen": -0.9006705284118652, "logits/rejected": -0.9682837128639221, "logps/chosen": -318.3325500488281, "logps/rejected": -286.1507263183594, "loss": 0.6273, "rewards/accuracies": 0.625, "rewards/chosen": -0.7855298519134521, "rewards/margins": 0.08917371928691864, "rewards/rejected": -0.8747035264968872, "step": 220 }, { "epoch": 0.4627060978801361, "grad_norm": 4.18617869768596, "learning_rate": 3.248126059518784e-07, "logits/chosen": -1.0923103094100952, "logits/rejected": -1.1423243284225464, "logps/chosen": -353.5962219238281, "logps/rejected": -356.5256042480469, "loss": 0.5851, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7234691977500916, "rewards/margins": 0.40291115641593933, "rewards/rejected": -1.1263803243637085, "step": 221 }, { "epoch": 0.46479979063072496, "grad_norm": 3.888803472283558, "learning_rate": 3.230637461492043e-07, "logits/chosen": -1.011338710784912, "logits/rejected": -1.0403305292129517, "logps/chosen": -339.11419677734375, "logps/rejected": -347.67218017578125, "loss": 0.6119, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6807050704956055, "rewards/margins": 0.15236997604370117, "rewards/rejected": -0.8330751657485962, "step": 222 }, { "epoch": 0.4668934833813138, "grad_norm": 4.085061017059798, "learning_rate": 3.213109681595612e-07, "logits/chosen": -1.108197808265686, "logits/rejected": -1.0261244773864746, "logps/chosen": -376.7074279785156, "logps/rejected": -432.5005187988281, "loss": 0.6017, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8131324648857117, "rewards/margins": 0.3406909108161926, "rewards/rejected": -1.1538234949111938, "step": 223 }, { "epoch": 0.46898717613190266, "grad_norm": 4.073534306860452, "learning_rate": 3.1955436597911315e-07, "logits/chosen": -0.8938140869140625, "logits/rejected": -0.9309756755828857, "logps/chosen": -274.193359375, "logps/rejected": -262.4495849609375, "loss": 0.6131, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6037336587905884, "rewards/margins": 0.2819165289402008, "rewards/rejected": -0.8856501579284668, "step": 224 }, { "epoch": 0.4710808688824915, "grad_norm": 4.405484814747871, "learning_rate": 3.1779403380910425e-07, "logits/chosen": -0.8677606582641602, "logits/rejected": -0.9438505172729492, "logps/chosen": -287.43084716796875, "logps/rejected": -398.9632263183594, "loss": 0.593, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6562673449516296, "rewards/margins": 0.34873324632644653, "rewards/rejected": -1.0050005912780762, "step": 225 }, { "epoch": 0.47317456163308036, "grad_norm": 4.061867331496559, "learning_rate": 3.160300660508064e-07, "logits/chosen": -0.9015076160430908, "logits/rejected": -0.9211095571517944, "logps/chosen": -283.0917663574219, "logps/rejected": -277.0173034667969, "loss": 0.615, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7114819288253784, "rewards/margins": 0.2215462028980255, "rewards/rejected": -0.9330282211303711, "step": 226 }, { "epoch": 0.4752682543836692, "grad_norm": 5.130156131210273, "learning_rate": 3.1426255730045695e-07, "logits/chosen": -0.9921966791152954, "logits/rejected": -1.0387006998062134, "logps/chosen": -312.1060485839844, "logps/rejected": -364.17913818359375, "loss": 0.6295, "rewards/accuracies": 0.71875, "rewards/chosen": -0.783414363861084, "rewards/margins": 0.36942586302757263, "rewards/rejected": -1.1528401374816895, "step": 227 }, { "epoch": 0.47736194713425806, "grad_norm": 4.604756377667637, "learning_rate": 3.1249160234418644e-07, "logits/chosen": -1.1062538623809814, "logits/rejected": -1.0098164081573486, "logps/chosen": -365.3369140625, "logps/rejected": -355.32086181640625, "loss": 0.6415, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7709512710571289, "rewards/margins": 0.19630008935928345, "rewards/rejected": -0.9672513604164124, "step": 228 }, { "epoch": 0.4794556398848469, "grad_norm": 5.555137415141404, "learning_rate": 3.1071729615293424e-07, "logits/chosen": -0.9563548564910889, "logits/rejected": -0.9007817506790161, "logps/chosen": -311.4552307128906, "logps/rejected": -299.1227111816406, "loss": 0.6236, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6697904467582703, "rewards/margins": 0.23303496837615967, "rewards/rejected": -0.9028254151344299, "step": 229 }, { "epoch": 0.48154933263543576, "grad_norm": 4.642808114882321, "learning_rate": 3.0893973387735683e-07, "logits/chosen": -1.2301511764526367, "logits/rejected": -1.1639983654022217, "logps/chosen": -394.49932861328125, "logps/rejected": -433.455322265625, "loss": 0.6109, "rewards/accuracies": 0.75, "rewards/chosen": -0.7546640634536743, "rewards/margins": 0.4264376759529114, "rewards/rejected": -1.181101679801941, "step": 230 }, { "epoch": 0.4836430253860246, "grad_norm": 4.3892796665907285, "learning_rate": 3.071590108427243e-07, "logits/chosen": -0.9773148894309998, "logits/rejected": -0.9659315347671509, "logps/chosen": -285.85943603515625, "logps/rejected": -322.55364990234375, "loss": 0.5925, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8015893697738647, "rewards/margins": 0.3736291229724884, "rewards/rejected": -1.1752185821533203, "step": 231 }, { "epoch": 0.48573671813661345, "grad_norm": 3.9299173167410846, "learning_rate": 3.05375222543809e-07, "logits/chosen": -1.1926710605621338, "logits/rejected": -1.1512194871902466, "logps/chosen": -353.67626953125, "logps/rejected": -389.539306640625, "loss": 0.5713, "rewards/accuracies": 0.75, "rewards/chosen": -0.7308621406555176, "rewards/margins": 0.20806965231895447, "rewards/rejected": -0.9389318227767944, "step": 232 }, { "epoch": 0.4878304108872023, "grad_norm": 4.616522731616022, "learning_rate": 3.035884646397637e-07, "logits/chosen": -1.0583614110946655, "logits/rejected": -1.076390027999878, "logps/chosen": -342.8740539550781, "logps/rejected": -314.76873779296875, "loss": 0.6306, "rewards/accuracies": 0.53125, "rewards/chosen": -0.7622975707054138, "rewards/margins": 0.06755637377500534, "rewards/rejected": -0.829853892326355, "step": 233 }, { "epoch": 0.48992410363779115, "grad_norm": 4.0242769118158215, "learning_rate": 3.017988329489923e-07, "logits/chosen": -1.0355958938598633, "logits/rejected": -1.06577467918396, "logps/chosen": -314.82666015625, "logps/rejected": -301.7773132324219, "loss": 0.5321, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6720834374427795, "rewards/margins": 0.32785531878471375, "rewards/rejected": -0.9999387264251709, "step": 234 }, { "epoch": 0.49201779638838, "grad_norm": 4.343403940596401, "learning_rate": 3.000064234440111e-07, "logits/chosen": -0.9155338406562805, "logits/rejected": -0.8366042375564575, "logps/chosen": -333.7397155761719, "logps/rejected": -288.471435546875, "loss": 0.6115, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6950631141662598, "rewards/margins": 0.2937913239002228, "rewards/rejected": -0.9888544082641602, "step": 235 }, { "epoch": 0.49411148913896885, "grad_norm": 4.623768252322274, "learning_rate": 2.9821133224630223e-07, "logits/chosen": -0.9640467762947083, "logits/rejected": -0.871576189994812, "logps/chosen": -370.4189758300781, "logps/rejected": -356.257568359375, "loss": 0.6095, "rewards/accuracies": 0.6875, "rewards/chosen": -0.716641902923584, "rewards/margins": 0.33154234290122986, "rewards/rejected": -1.0481841564178467, "step": 236 }, { "epoch": 0.4962051818895577, "grad_norm": 4.392410492832627, "learning_rate": 2.964136556211588e-07, "logits/chosen": -1.1828854084014893, "logits/rejected": -0.9942708015441895, "logps/chosen": -330.2734069824219, "logps/rejected": -379.3336181640625, "loss": 0.6028, "rewards/accuracies": 0.625, "rewards/chosen": -0.7781244516372681, "rewards/margins": 0.289692759513855, "rewards/rejected": -1.067817211151123, "step": 237 }, { "epoch": 0.49829887464014655, "grad_norm": 5.911837229541137, "learning_rate": 2.946134899725226e-07, "logits/chosen": -1.1307326555252075, "logits/rejected": -1.0896378755569458, "logps/chosen": -396.4427490234375, "logps/rejected": -300.9529724121094, "loss": 0.6517, "rewards/accuracies": 0.625, "rewards/chosen": -0.7453781962394714, "rewards/margins": 0.17206819355487823, "rewards/rejected": -0.917446494102478, "step": 238 }, { "epoch": 0.5003925673907355, "grad_norm": 3.728111860538933, "learning_rate": 2.9281093183781403e-07, "logits/chosen": -0.9374557137489319, "logits/rejected": -0.9252691268920898, "logps/chosen": -296.291015625, "logps/rejected": -343.64825439453125, "loss": 0.604, "rewards/accuracies": 0.84375, "rewards/chosen": -0.65325927734375, "rewards/margins": 0.3434145450592041, "rewards/rejected": -0.9966738224029541, "step": 239 }, { "epoch": 0.5024862601413242, "grad_norm": 4.0698921485095365, "learning_rate": 2.910060778827554e-07, "logits/chosen": -0.8723117113113403, "logits/rejected": -0.9028252959251404, "logps/chosen": -377.4078063964844, "logps/rejected": -383.0431823730469, "loss": 0.6095, "rewards/accuracies": 0.75, "rewards/chosen": -0.7764265537261963, "rewards/margins": 0.28098151087760925, "rewards/rejected": -1.057408094406128, "step": 240 }, { "epoch": 0.5045799528919132, "grad_norm": 4.181190167016197, "learning_rate": 2.891990248961871e-07, "logits/chosen": -0.8871288299560547, "logits/rejected": -0.8694429993629456, "logps/chosen": -348.8471984863281, "logps/rejected": -395.46636962890625, "loss": 0.5994, "rewards/accuracies": 0.75, "rewards/chosen": -0.6026455163955688, "rewards/margins": 0.3517201542854309, "rewards/rejected": -0.9543656706809998, "step": 241 }, { "epoch": 0.506673645642502, "grad_norm": 4.289794023518283, "learning_rate": 2.873898697848762e-07, "logits/chosen": -0.9010360836982727, "logits/rejected": -0.8979989886283875, "logps/chosen": -328.00994873046875, "logps/rejected": -342.6231994628906, "loss": 0.5712, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6888183951377869, "rewards/margins": 0.42270714044570923, "rewards/rejected": -1.1115254163742065, "step": 242 }, { "epoch": 0.5087673383930909, "grad_norm": 4.7150644940366355, "learning_rate": 2.8557870956832133e-07, "logits/chosen": -1.0637683868408203, "logits/rejected": -1.0127991437911987, "logps/chosen": -374.3121643066406, "logps/rejected": -409.6117248535156, "loss": 0.6362, "rewards/accuracies": 0.75, "rewards/chosen": -0.7771110534667969, "rewards/margins": 0.3026796579360962, "rewards/rejected": -1.079790711402893, "step": 243 }, { "epoch": 0.5108610311436796, "grad_norm": 4.016454290567565, "learning_rate": 2.837656413735479e-07, "logits/chosen": -1.0830886363983154, "logits/rejected": -1.003158450126648, "logps/chosen": -508.61737060546875, "logps/rejected": -455.4622497558594, "loss": 0.5798, "rewards/accuracies": 0.75, "rewards/chosen": -0.7906192541122437, "rewards/margins": 0.47089505195617676, "rewards/rejected": -1.26151442527771, "step": 244 }, { "epoch": 0.5129547238942685, "grad_norm": 5.222541158654268, "learning_rate": 2.8195076242990116e-07, "logits/chosen": -1.0686064958572388, "logits/rejected": -0.969489336013794, "logps/chosen": -314.2198486328125, "logps/rejected": -249.83529663085938, "loss": 0.6294, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7531698346138, "rewards/margins": 0.24897263944149017, "rewards/rejected": -1.0021424293518066, "step": 245 }, { "epoch": 0.5150484166448573, "grad_norm": 6.219852619432657, "learning_rate": 2.801341700638307e-07, "logits/chosen": -1.1121360063552856, "logits/rejected": -1.085639238357544, "logps/chosen": -351.9165344238281, "logps/rejected": -326.91357421875, "loss": 0.6493, "rewards/accuracies": 0.53125, "rewards/chosen": -0.7655088901519775, "rewards/margins": 0.19527119398117065, "rewards/rejected": -0.9607800841331482, "step": 246 }, { "epoch": 0.5171421093954462, "grad_norm": 4.990676228234231, "learning_rate": 2.7831596169367227e-07, "logits/chosen": -1.0086631774902344, "logits/rejected": -1.0574275255203247, "logps/chosen": -401.2663879394531, "logps/rejected": -329.6419982910156, "loss": 0.6166, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7250420451164246, "rewards/margins": 0.2370998114347458, "rewards/rejected": -0.9621418118476868, "step": 247 }, { "epoch": 0.519235802146035, "grad_norm": 4.377890153832782, "learning_rate": 2.7649623482442274e-07, "logits/chosen": -0.8921763896942139, "logits/rejected": -0.8677330017089844, "logps/chosen": -312.86846923828125, "logps/rejected": -321.2801818847656, "loss": 0.6054, "rewards/accuracies": 0.78125, "rewards/chosen": -0.806898832321167, "rewards/margins": 0.2529359459877014, "rewards/rejected": -1.0598348379135132, "step": 248 }, { "epoch": 0.521329494896624, "grad_norm": 4.23406757706141, "learning_rate": 2.7467508704251135e-07, "logits/chosen": -1.0979777574539185, "logits/rejected": -1.1933913230895996, "logps/chosen": -326.0444641113281, "logps/rejected": -349.9424133300781, "loss": 0.6213, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6599812507629395, "rewards/margins": 0.3665139377117157, "rewards/rejected": -1.0264949798583984, "step": 249 }, { "epoch": 0.5234231876472127, "grad_norm": 4.731548648931599, "learning_rate": 2.7285261601056697e-07, "logits/chosen": -0.9986559152603149, "logits/rejected": -0.9786302447319031, "logps/chosen": -354.235107421875, "logps/rejected": -345.6477355957031, "loss": 0.6149, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7075580954551697, "rewards/margins": 0.2955675423145294, "rewards/rejected": -1.003125548362732, "step": 250 }, { "epoch": 0.5255168803978016, "grad_norm": 4.3238721143310395, "learning_rate": 2.7102891946217994e-07, "logits/chosen": -0.9398631453514099, "logits/rejected": -0.9578242301940918, "logps/chosen": -286.37744140625, "logps/rejected": -307.2151184082031, "loss": 0.5846, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7264272570610046, "rewards/margins": 0.21789371967315674, "rewards/rejected": -0.9443210363388062, "step": 251 }, { "epoch": 0.5276105731483904, "grad_norm": 4.019224039656086, "learning_rate": 2.692040951966617e-07, "logits/chosen": -1.0304282903671265, "logits/rejected": -0.9507336020469666, "logps/chosen": -265.782470703125, "logps/rejected": -341.2494201660156, "loss": 0.5911, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7337517738342285, "rewards/margins": 0.35664138197898865, "rewards/rejected": -1.0903931856155396, "step": 252 }, { "epoch": 0.5297042658989793, "grad_norm": 4.4008346238725204, "learning_rate": 2.6737824107379947e-07, "logits/chosen": -1.0753947496414185, "logits/rejected": -1.1241540908813477, "logps/chosen": -408.1168212890625, "logps/rejected": -350.13104248046875, "loss": 0.6011, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7371947169303894, "rewards/margins": 0.3128925859928131, "rewards/rejected": -1.050087332725525, "step": 253 }, { "epoch": 0.5317979586495681, "grad_norm": 4.478612836134575, "learning_rate": 2.655514550086086e-07, "logits/chosen": -1.0122687816619873, "logits/rejected": -1.0811606645584106, "logps/chosen": -329.6254577636719, "logps/rejected": -314.5368957519531, "loss": 0.6173, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8279380202293396, "rewards/margins": 0.29747316241264343, "rewards/rejected": -1.1254112720489502, "step": 254 }, { "epoch": 0.533891651400157, "grad_norm": 4.592493486130326, "learning_rate": 2.6372383496608186e-07, "logits/chosen": -1.0779926776885986, "logits/rejected": -1.1338144540786743, "logps/chosen": -344.75738525390625, "logps/rejected": -346.9084777832031, "loss": 0.5915, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6491105556488037, "rewards/margins": 0.4319334030151367, "rewards/rejected": -1.0810439586639404, "step": 255 }, { "epoch": 0.5359853441507458, "grad_norm": 6.606187103284956, "learning_rate": 2.618954789559356e-07, "logits/chosen": -0.9693815112113953, "logits/rejected": -0.9025633931159973, "logps/chosen": -362.2373352050781, "logps/rejected": -278.95977783203125, "loss": 0.6188, "rewards/accuracies": 0.625, "rewards/chosen": -0.6773243546485901, "rewards/margins": 0.2625731825828552, "rewards/rejected": -0.9398975372314453, "step": 256 }, { "epoch": 0.5380790369013347, "grad_norm": 4.879374874714332, "learning_rate": 2.600664850273538e-07, "logits/chosen": -1.0554527044296265, "logits/rejected": -1.0014501810073853, "logps/chosen": -425.541015625, "logps/rejected": -349.0471496582031, "loss": 0.5812, "rewards/accuracies": 0.71875, "rewards/chosen": -0.774941086769104, "rewards/margins": 0.47778624296188354, "rewards/rejected": -1.2527272701263428, "step": 257 }, { "epoch": 0.5401727296519235, "grad_norm": 4.678135079731708, "learning_rate": 2.582369512637302e-07, "logits/chosen": -0.9576770663261414, "logits/rejected": -0.9998583793640137, "logps/chosen": -295.5948486328125, "logps/rejected": -340.5544128417969, "loss": 0.599, "rewards/accuracies": 0.59375, "rewards/chosen": -0.9188826084136963, "rewards/margins": 0.15708282589912415, "rewards/rejected": -1.075965404510498, "step": 258 }, { "epoch": 0.5422664224025124, "grad_norm": 4.362700286570303, "learning_rate": 2.5640697577740815e-07, "logits/chosen": -1.0147913694381714, "logits/rejected": -1.0017765760421753, "logps/chosen": -392.20428466796875, "logps/rejected": -401.2511901855469, "loss": 0.6061, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7479867935180664, "rewards/margins": 0.4889204800128937, "rewards/rejected": -1.2369073629379272, "step": 259 }, { "epoch": 0.5443601151531012, "grad_norm": 4.794875109043323, "learning_rate": 2.5457665670441937e-07, "logits/chosen": -1.0625568628311157, "logits/rejected": -1.0530543327331543, "logps/chosen": -336.122314453125, "logps/rejected": -343.56201171875, "loss": 0.6422, "rewards/accuracies": 0.59375, "rewards/chosen": -0.9290524125099182, "rewards/margins": 0.02330094762146473, "rewards/rejected": -0.9523533582687378, "step": 260 }, { "epoch": 0.5464538079036901, "grad_norm": 4.690519917570305, "learning_rate": 2.527460921992209e-07, "logits/chosen": -1.0372705459594727, "logits/rejected": -0.9663622379302979, "logps/chosen": -342.4421081542969, "logps/rejected": -418.2093811035156, "loss": 0.5958, "rewards/accuracies": 0.875, "rewards/chosen": -0.6581472158432007, "rewards/margins": 0.5107102394104004, "rewards/rejected": -1.1688575744628906, "step": 261 }, { "epoch": 0.5485475006542789, "grad_norm": 4.269984393704101, "learning_rate": 2.509153804294318e-07, "logits/chosen": -1.0221374034881592, "logits/rejected": -1.1132960319519043, "logps/chosen": -439.9560546875, "logps/rejected": -385.68682861328125, "loss": 0.5967, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8774237036705017, "rewards/margins": 0.33529484272003174, "rewards/rejected": -1.2127184867858887, "step": 262 }, { "epoch": 0.5506411934048678, "grad_norm": 4.559950825873929, "learning_rate": 2.4908461957056825e-07, "logits/chosen": -1.0179204940795898, "logits/rejected": -1.0103511810302734, "logps/chosen": -352.5959167480469, "logps/rejected": -321.5555114746094, "loss": 0.5981, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7945467829704285, "rewards/margins": 0.4398004412651062, "rewards/rejected": -1.2343473434448242, "step": 263 }, { "epoch": 0.5527348861554567, "grad_norm": 4.920306542640304, "learning_rate": 2.4725390780077905e-07, "logits/chosen": -1.1204428672790527, "logits/rejected": -1.100524663925171, "logps/chosen": -338.38031005859375, "logps/rejected": -386.240966796875, "loss": 0.6007, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6985955834388733, "rewards/margins": 0.5098826289176941, "rewards/rejected": -1.2084782123565674, "step": 264 }, { "epoch": 0.5548285789060455, "grad_norm": 4.377449352487147, "learning_rate": 2.454233432955807e-07, "logits/chosen": -0.9631950855255127, "logits/rejected": -0.9512149095535278, "logps/chosen": -301.6888427734375, "logps/rejected": -318.80352783203125, "loss": 0.633, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6564803123474121, "rewards/margins": 0.19614030420780182, "rewards/rejected": -0.8526206016540527, "step": 265 }, { "epoch": 0.5569222716566344, "grad_norm": 5.2605905033637255, "learning_rate": 2.435930242225919e-07, "logits/chosen": -1.015745759010315, "logits/rejected": -0.9821041822433472, "logps/chosen": -386.2413330078125, "logps/rejected": -379.1528015136719, "loss": 0.6088, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8659415245056152, "rewards/margins": 0.15608720481395721, "rewards/rejected": -1.0220288038253784, "step": 266 }, { "epoch": 0.5590159644072232, "grad_norm": 4.086597191724232, "learning_rate": 2.4176304873626984e-07, "logits/chosen": -1.038877010345459, "logits/rejected": -1.125354290008545, "logps/chosen": -317.1690673828125, "logps/rejected": -303.6486511230469, "loss": 0.621, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6402660608291626, "rewards/margins": 0.3106432557106018, "rewards/rejected": -0.9509092569351196, "step": 267 }, { "epoch": 0.5611096571578121, "grad_norm": 4.44877961649535, "learning_rate": 2.399335149726463e-07, "logits/chosen": -1.0465306043624878, "logits/rejected": -1.0398961305618286, "logps/chosen": -304.38079833984375, "logps/rejected": -302.5268859863281, "loss": 0.6046, "rewards/accuracies": 0.53125, "rewards/chosen": -0.9296619296073914, "rewards/margins": 0.23252925276756287, "rewards/rejected": -1.1621911525726318, "step": 268 }, { "epoch": 0.5632033499084009, "grad_norm": 4.185109199067693, "learning_rate": 2.381045210440644e-07, "logits/chosen": -1.031615138053894, "logits/rejected": -1.0007036924362183, "logps/chosen": -359.1358642578125, "logps/rejected": -288.4352111816406, "loss": 0.6201, "rewards/accuracies": 0.625, "rewards/chosen": -0.8603268265724182, "rewards/margins": 0.3853450417518616, "rewards/rejected": -1.2456719875335693, "step": 269 }, { "epoch": 0.5652970426589898, "grad_norm": 4.2824888231636224, "learning_rate": 2.3627616503391812e-07, "logits/chosen": -1.0695422887802124, "logits/rejected": -1.0857460498809814, "logps/chosen": -380.8077697753906, "logps/rejected": -323.8515625, "loss": 0.5837, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7763439416885376, "rewards/margins": 0.37218236923217773, "rewards/rejected": -1.1485263109207153, "step": 270 }, { "epoch": 0.5673907354095786, "grad_norm": 4.795438264904108, "learning_rate": 2.344485449913914e-07, "logits/chosen": -1.0134387016296387, "logits/rejected": -0.978182315826416, "logps/chosen": -330.4658508300781, "logps/rejected": -281.619384765625, "loss": 0.6056, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9220022559165955, "rewards/margins": 0.19838015735149384, "rewards/rejected": -1.1203824281692505, "step": 271 }, { "epoch": 0.5694844281601675, "grad_norm": 5.142718293986198, "learning_rate": 2.3262175892620062e-07, "logits/chosen": -0.9586224555969238, "logits/rejected": -0.9129223823547363, "logps/chosen": -335.9892883300781, "logps/rejected": -303.89398193359375, "loss": 0.6247, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7998490929603577, "rewards/margins": 0.33477380871772766, "rewards/rejected": -1.1346228122711182, "step": 272 }, { "epoch": 0.5715781209107563, "grad_norm": 4.936418936395716, "learning_rate": 2.3079590480333827e-07, "logits/chosen": -0.9717981815338135, "logits/rejected": -0.9322594404220581, "logps/chosen": -399.5758972167969, "logps/rejected": -365.136474609375, "loss": 0.6355, "rewards/accuracies": 0.53125, "rewards/chosen": -0.8702175617218018, "rewards/margins": 0.12933433055877686, "rewards/rejected": -0.9995518922805786, "step": 273 }, { "epoch": 0.5736718136613452, "grad_norm": 4.693935695214012, "learning_rate": 2.2897108053782e-07, "logits/chosen": -0.9046379923820496, "logits/rejected": -0.8849986791610718, "logps/chosen": -343.8055419921875, "logps/rejected": -348.1899108886719, "loss": 0.6382, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8294025659561157, "rewards/margins": 0.19483700394630432, "rewards/rejected": -1.0242396593093872, "step": 274 }, { "epoch": 0.575765506411934, "grad_norm": 4.607222899335075, "learning_rate": 2.2714738398943308e-07, "logits/chosen": -1.1204267740249634, "logits/rejected": -1.1009920835494995, "logps/chosen": -412.75653076171875, "logps/rejected": -401.6040344238281, "loss": 0.5915, "rewards/accuracies": 0.75, "rewards/chosen": -0.8488682508468628, "rewards/margins": 0.35305196046829224, "rewards/rejected": -1.2019202709197998, "step": 275 }, { "epoch": 0.5778591991625229, "grad_norm": 4.218947387266857, "learning_rate": 2.2532491295748865e-07, "logits/chosen": -1.0519425868988037, "logits/rejected": -1.0002329349517822, "logps/chosen": -265.54205322265625, "logps/rejected": -330.5592346191406, "loss": 0.6157, "rewards/accuracies": 0.6875, "rewards/chosen": -0.835458517074585, "rewards/margins": 0.28700703382492065, "rewards/rejected": -1.1224654912948608, "step": 276 }, { "epoch": 0.5799528919131117, "grad_norm": 5.468276821530396, "learning_rate": 2.2350376517557726e-07, "logits/chosen": -0.8847948312759399, "logits/rejected": -0.9408563375473022, "logps/chosen": -357.6964111328125, "logps/rejected": -352.73284912109375, "loss": 0.6197, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6024115085601807, "rewards/margins": 0.4576915204524994, "rewards/rejected": -1.0601030588150024, "step": 277 }, { "epoch": 0.5820465846637006, "grad_norm": 4.58956986890656, "learning_rate": 2.2168403830632769e-07, "logits/chosen": -0.9672602415084839, "logits/rejected": -1.074077844619751, "logps/chosen": -373.9370422363281, "logps/rejected": -367.44647216796875, "loss": 0.6135, "rewards/accuracies": 0.75, "rewards/chosen": -0.7608107924461365, "rewards/margins": 0.24896277487277985, "rewards/rejected": -1.0097734928131104, "step": 278 }, { "epoch": 0.5841402774142894, "grad_norm": 5.621876485242202, "learning_rate": 2.1986582993616925e-07, "logits/chosen": -1.0262584686279297, "logits/rejected": -1.006858468055725, "logps/chosen": -356.4263610839844, "logps/rejected": -342.4321594238281, "loss": 0.6099, "rewards/accuracies": 0.625, "rewards/chosen": -0.8674944043159485, "rewards/margins": 0.28997913002967834, "rewards/rejected": -1.1574735641479492, "step": 279 }, { "epoch": 0.5862339701648783, "grad_norm": 4.84561254478642, "learning_rate": 2.1804923757009882e-07, "logits/chosen": -1.1037640571594238, "logits/rejected": -1.2060396671295166, "logps/chosen": -471.8620910644531, "logps/rejected": -353.9450988769531, "loss": 0.6189, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8083339929580688, "rewards/margins": 0.2710271179676056, "rewards/rejected": -1.079361081123352, "step": 280 }, { "epoch": 0.5883276629154671, "grad_norm": 4.317743426334099, "learning_rate": 2.1623435862645205e-07, "logits/chosen": -0.8186547160148621, "logits/rejected": -0.7757397890090942, "logps/chosen": -275.0792236328125, "logps/rejected": -338.761962890625, "loss": 0.5923, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6970844864845276, "rewards/margins": 0.31228139996528625, "rewards/rejected": -1.0093659162521362, "step": 281 }, { "epoch": 0.590421355666056, "grad_norm": 4.850118952492567, "learning_rate": 2.1442129043167873e-07, "logits/chosen": -1.0171983242034912, "logits/rejected": -0.9386220574378967, "logps/chosen": -287.52899169921875, "logps/rejected": -314.5382385253906, "loss": 0.6266, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8988039493560791, "rewards/margins": 0.1374320536851883, "rewards/rejected": -1.036236047744751, "step": 282 }, { "epoch": 0.5925150484166448, "grad_norm": 4.994726372259734, "learning_rate": 2.1261013021512378e-07, "logits/chosen": -1.0433508157730103, "logits/rejected": -1.080753207206726, "logps/chosen": -406.97760009765625, "logps/rejected": -383.5968933105469, "loss": 0.5738, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7879077792167664, "rewards/margins": 0.35200175642967224, "rewards/rejected": -1.1399095058441162, "step": 283 }, { "epoch": 0.5946087411672337, "grad_norm": 4.218224431132751, "learning_rate": 2.1080097510381294e-07, "logits/chosen": -1.1848828792572021, "logits/rejected": -1.1189886331558228, "logps/chosen": -394.01995849609375, "logps/rejected": -337.9090270996094, "loss": 0.617, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7961695194244385, "rewards/margins": 0.2834487557411194, "rewards/rejected": -1.0796183347702026, "step": 284 }, { "epoch": 0.5967024339178225, "grad_norm": 4.359946177614127, "learning_rate": 2.089939221172446e-07, "logits/chosen": -1.1179041862487793, "logits/rejected": -1.0704071521759033, "logps/chosen": -379.2666931152344, "logps/rejected": -454.59765625, "loss": 0.5839, "rewards/accuracies": 0.71875, "rewards/chosen": -0.838219165802002, "rewards/margins": 0.37780043482780457, "rewards/rejected": -1.2160195112228394, "step": 285 }, { "epoch": 0.5987961266684114, "grad_norm": 4.362203057151049, "learning_rate": 2.0718906816218595e-07, "logits/chosen": -1.1546528339385986, "logits/rejected": -1.1141022443771362, "logps/chosen": -408.8633117675781, "logps/rejected": -321.8983154296875, "loss": 0.6025, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8321748971939087, "rewards/margins": 0.22147051990032196, "rewards/rejected": -1.0536454916000366, "step": 286 }, { "epoch": 0.6008898194190002, "grad_norm": 4.377715490479787, "learning_rate": 2.053865100274774e-07, "logits/chosen": -0.8707497119903564, "logits/rejected": -0.7416224479675293, "logps/chosen": -290.25872802734375, "logps/rejected": -320.2662658691406, "loss": 0.6198, "rewards/accuracies": 0.53125, "rewards/chosen": -0.886518120765686, "rewards/margins": 0.1348564326763153, "rewards/rejected": -1.0213744640350342, "step": 287 }, { "epoch": 0.6029835121695891, "grad_norm": 4.049171813840425, "learning_rate": 2.035863443788411e-07, "logits/chosen": -1.0420244932174683, "logits/rejected": -1.0206215381622314, "logps/chosen": -266.638427734375, "logps/rejected": -295.54052734375, "loss": 0.6061, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7523874044418335, "rewards/margins": 0.21901753544807434, "rewards/rejected": -0.9714049696922302, "step": 288 }, { "epoch": 0.6050772049201779, "grad_norm": 3.988927800356053, "learning_rate": 2.0178866775369774e-07, "logits/chosen": -0.6950379014015198, "logits/rejected": -0.8033851385116577, "logps/chosen": -238.45318603515625, "logps/rejected": -293.4297790527344, "loss": 0.5979, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6485308408737183, "rewards/margins": 0.3279150426387787, "rewards/rejected": -0.9764459133148193, "step": 289 }, { "epoch": 0.6071708976707668, "grad_norm": 3.9733190922046484, "learning_rate": 1.9999357655598891e-07, "logits/chosen": -1.0313737392425537, "logits/rejected": -1.0281918048858643, "logps/chosen": -348.35040283203125, "logps/rejected": -319.60919189453125, "loss": 0.6033, "rewards/accuracies": 0.6875, "rewards/chosen": -0.794158935546875, "rewards/margins": 0.311911404132843, "rewards/rejected": -1.1060703992843628, "step": 290 }, { "epoch": 0.6092645904213556, "grad_norm": 4.6620921892698455, "learning_rate": 1.9820116705100775e-07, "logits/chosen": -1.0011757612228394, "logits/rejected": -1.1109023094177246, "logps/chosen": -307.2973937988281, "logps/rejected": -410.60797119140625, "loss": 0.6157, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7315064072608948, "rewards/margins": 0.3776722252368927, "rewards/rejected": -1.1091786623001099, "step": 291 }, { "epoch": 0.6113582831719445, "grad_norm": 4.248003741330825, "learning_rate": 1.9641153536023642e-07, "logits/chosen": -0.9552513360977173, "logits/rejected": -0.9853905439376831, "logps/chosen": -417.6492004394531, "logps/rejected": -367.10369873046875, "loss": 0.5846, "rewards/accuracies": 0.75, "rewards/chosen": -0.7497143149375916, "rewards/margins": 0.4044032692909241, "rewards/rejected": -1.1541175842285156, "step": 292 }, { "epoch": 0.6134519759225334, "grad_norm": 4.8527444441441325, "learning_rate": 1.9462477745619106e-07, "logits/chosen": -1.0434277057647705, "logits/rejected": -1.0358684062957764, "logps/chosen": -295.6050720214844, "logps/rejected": -371.32568359375, "loss": 0.6329, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6973177194595337, "rewards/margins": 0.40979140996932983, "rewards/rejected": -1.1071091890335083, "step": 293 }, { "epoch": 0.6155456686731222, "grad_norm": 4.382971107252221, "learning_rate": 1.928409891572757e-07, "logits/chosen": -0.9290079474449158, "logits/rejected": -1.0360666513442993, "logps/chosen": -324.1712646484375, "logps/rejected": -336.0502624511719, "loss": 0.5681, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7327239513397217, "rewards/margins": 0.21111561357975006, "rewards/rejected": -0.9438395500183105, "step": 294 }, { "epoch": 0.6176393614237111, "grad_norm": 4.170285692393253, "learning_rate": 1.9106026612264315e-07, "logits/chosen": -1.0163110494613647, "logits/rejected": -0.9899307489395142, "logps/chosen": -289.9325256347656, "logps/rejected": -330.5543518066406, "loss": 0.5927, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5417231321334839, "rewards/margins": 0.33823326230049133, "rewards/rejected": -0.8799563050270081, "step": 295 }, { "epoch": 0.6197330541742999, "grad_norm": 4.86142868880124, "learning_rate": 1.8928270384706582e-07, "logits/chosen": -1.0178709030151367, "logits/rejected": -1.1050466299057007, "logps/chosen": -370.6733093261719, "logps/rejected": -367.6428527832031, "loss": 0.6008, "rewards/accuracies": 0.8125, "rewards/chosen": -0.70284104347229, "rewards/margins": 0.43484029173851013, "rewards/rejected": -1.137681245803833, "step": 296 }, { "epoch": 0.6218267469248888, "grad_norm": 3.9718768417441286, "learning_rate": 1.875083976558136e-07, "logits/chosen": -1.0082323551177979, "logits/rejected": -0.9717690348625183, "logps/chosen": -470.0350341796875, "logps/rejected": -431.5830993652344, "loss": 0.6047, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7216346263885498, "rewards/margins": 0.34156107902526855, "rewards/rejected": -1.0631957054138184, "step": 297 }, { "epoch": 0.6239204396754776, "grad_norm": 3.8567631410728556, "learning_rate": 1.8573744269954297e-07, "logits/chosen": -0.8579521775245667, "logits/rejected": -0.8760216236114502, "logps/chosen": -393.8965148925781, "logps/rejected": -329.06884765625, "loss": 0.6153, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7873497009277344, "rewards/margins": 0.13453426957130432, "rewards/rejected": -0.9218839406967163, "step": 298 }, { "epoch": 0.6260141324260665, "grad_norm": 4.395813970282501, "learning_rate": 1.839699339491937e-07, "logits/chosen": -0.8673360347747803, "logits/rejected": -0.6946086287498474, "logps/chosen": -316.01434326171875, "logps/rejected": -326.8657531738281, "loss": 0.5857, "rewards/accuracies": 0.65625, "rewards/chosen": -0.686599612236023, "rewards/margins": 0.22971749305725098, "rewards/rejected": -0.9163171052932739, "step": 299 }, { "epoch": 0.6281078251766553, "grad_norm": 4.712875710259624, "learning_rate": 1.8220596619089573e-07, "logits/chosen": -0.9354798197746277, "logits/rejected": -0.882164716720581, "logps/chosen": -341.39495849609375, "logps/rejected": -408.0284729003906, "loss": 0.5811, "rewards/accuracies": 0.625, "rewards/chosen": -0.8664036989212036, "rewards/margins": 0.16512806713581085, "rewards/rejected": -1.0315316915512085, "step": 300 }, { "epoch": 0.6302015179272442, "grad_norm": 3.932746110863379, "learning_rate": 1.8044563402088682e-07, "logits/chosen": -1.081218957901001, "logits/rejected": -1.0549954175949097, "logps/chosen": -365.10137939453125, "logps/rejected": -355.4117736816406, "loss": 0.5966, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7783509492874146, "rewards/margins": 0.39208564162254333, "rewards/rejected": -1.1704365015029907, "step": 301 }, { "epoch": 0.632295210677833, "grad_norm": 3.9470810504569873, "learning_rate": 1.7868903184043885e-07, "logits/chosen": -0.928838849067688, "logits/rejected": -0.8947494626045227, "logps/chosen": -326.4456787109375, "logps/rejected": -332.5851745605469, "loss": 0.5944, "rewards/accuracies": 0.75, "rewards/chosen": -0.7215900421142578, "rewards/margins": 0.33560532331466675, "rewards/rejected": -1.0571954250335693, "step": 302 }, { "epoch": 0.6343889034284219, "grad_norm": 4.298143353895178, "learning_rate": 1.7693625385079574e-07, "logits/chosen": -0.8463895916938782, "logits/rejected": -0.8464228510856628, "logps/chosen": -359.86669921875, "logps/rejected": -347.84619140625, "loss": 0.5991, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8105530738830566, "rewards/margins": 0.21008439362049103, "rewards/rejected": -1.0206375122070312, "step": 303 }, { "epoch": 0.6364825961790107, "grad_norm": 4.165207861252242, "learning_rate": 1.7518739404812155e-07, "logits/chosen": -1.0326260328292847, "logits/rejected": -1.0655555725097656, "logps/chosen": -274.3062744140625, "logps/rejected": -373.55035400390625, "loss": 0.6011, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7011924386024475, "rewards/margins": 0.5916566252708435, "rewards/rejected": -1.292849063873291, "step": 304 }, { "epoch": 0.6385762889295996, "grad_norm": 4.256529652545562, "learning_rate": 1.7344254621846017e-07, "logits/chosen": -0.8909204602241516, "logits/rejected": -0.8685436248779297, "logps/chosen": -378.1118469238281, "logps/rejected": -323.2134704589844, "loss": 0.5764, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7002477645874023, "rewards/margins": 0.42042091488838196, "rewards/rejected": -1.1206685304641724, "step": 305 }, { "epoch": 0.6406699816801884, "grad_norm": 4.257338244648936, "learning_rate": 1.717018039327053e-07, "logits/chosen": -1.087096929550171, "logits/rejected": -0.8930917978286743, "logps/chosen": -472.75830078125, "logps/rejected": -390.8045959472656, "loss": 0.5629, "rewards/accuracies": 0.8125, "rewards/chosen": -0.932883083820343, "rewards/margins": 0.25906437635421753, "rewards/rejected": -1.191947340965271, "step": 306 }, { "epoch": 0.6427636744307773, "grad_norm": 4.840958808910903, "learning_rate": 1.699652605415828e-07, "logits/chosen": -0.9760967493057251, "logits/rejected": -0.9729673266410828, "logps/chosen": -405.1291198730469, "logps/rejected": -403.11041259765625, "loss": 0.5758, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7552483081817627, "rewards/margins": 0.4882379174232483, "rewards/rejected": -1.2434864044189453, "step": 307 }, { "epoch": 0.6448573671813661, "grad_norm": 4.249214667372905, "learning_rate": 1.6823300917064458e-07, "logits/chosen": -1.0708398818969727, "logits/rejected": -1.0699355602264404, "logps/chosen": -440.5609436035156, "logps/rejected": -367.2522277832031, "loss": 0.5629, "rewards/accuracies": 0.75, "rewards/chosen": -0.8135306239128113, "rewards/margins": 0.41567546129226685, "rewards/rejected": -1.2292060852050781, "step": 308 }, { "epoch": 0.646951059931955, "grad_norm": 4.550585369271294, "learning_rate": 1.6650514271527465e-07, "logits/chosen": -1.1219197511672974, "logits/rejected": -1.0999243259429932, "logps/chosen": -354.8285217285156, "logps/rejected": -337.6871337890625, "loss": 0.5901, "rewards/accuracies": 0.75, "rewards/chosen": -0.7251974940299988, "rewards/margins": 0.2991696894168854, "rewards/rejected": -1.0243672132492065, "step": 309 }, { "epoch": 0.6490447526825438, "grad_norm": 4.8210934669532115, "learning_rate": 1.647817538357072e-07, "logits/chosen": -1.0552057027816772, "logits/rejected": -1.0507606267929077, "logps/chosen": -321.98870849609375, "logps/rejected": -338.76214599609375, "loss": 0.6065, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8213024139404297, "rewards/margins": 0.5251730680465698, "rewards/rejected": -1.34647536277771, "step": 310 }, { "epoch": 0.6511384454331327, "grad_norm": 5.043210103924577, "learning_rate": 1.6306293495205755e-07, "logits/chosen": -0.9259848594665527, "logits/rejected": -1.0731109380722046, "logps/chosen": -358.4002380371094, "logps/rejected": -337.6957092285156, "loss": 0.5884, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8706588745117188, "rewards/margins": 0.3227222263813019, "rewards/rejected": -1.1933811902999878, "step": 311 }, { "epoch": 0.6532321381837215, "grad_norm": 4.932912067444989, "learning_rate": 1.6134877823936607e-07, "logits/chosen": -1.02803635597229, "logits/rejected": -1.0049376487731934, "logps/chosen": -366.85186767578125, "logps/rejected": -370.9542236328125, "loss": 0.5688, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8058578372001648, "rewards/margins": 0.3618282675743103, "rewards/rejected": -1.1676859855651855, "step": 312 }, { "epoch": 0.6553258309343104, "grad_norm": 4.553166501029724, "learning_rate": 1.5963937562265522e-07, "logits/chosen": -1.1713640689849854, "logits/rejected": -1.0340291261672974, "logps/chosen": -364.327880859375, "logps/rejected": -379.64862060546875, "loss": 0.6043, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8824548721313477, "rewards/margins": 0.16307507455348969, "rewards/rejected": -1.0455299615859985, "step": 313 }, { "epoch": 0.6574195236848992, "grad_norm": 4.221950833726903, "learning_rate": 1.5793481877199943e-07, "logits/chosen": -0.9396820664405823, "logits/rejected": -1.0585650205612183, "logps/chosen": -393.2031555175781, "logps/rejected": -356.1634216308594, "loss": 0.6016, "rewards/accuracies": 0.625, "rewards/chosen": -0.8781794309616089, "rewards/margins": 0.43085843324661255, "rewards/rejected": -1.3090378046035767, "step": 314 }, { "epoch": 0.6595132164354881, "grad_norm": 4.269236110796968, "learning_rate": 1.562351990976095e-07, "logits/chosen": -1.0203323364257812, "logits/rejected": -0.9932332038879395, "logps/chosen": -365.14306640625, "logps/rejected": -310.42840576171875, "loss": 0.5489, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7792743444442749, "rewards/margins": 0.4027741551399231, "rewards/rejected": -1.1820485591888428, "step": 315 }, { "epoch": 0.6616069091860769, "grad_norm": 5.10476967011241, "learning_rate": 1.5454060774493065e-07, "logits/chosen": -1.0745798349380493, "logits/rejected": -1.041635274887085, "logps/chosen": -266.32916259765625, "logps/rejected": -277.79998779296875, "loss": 0.5841, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7605333924293518, "rewards/margins": 0.285404235124588, "rewards/rejected": -1.0459375381469727, "step": 316 }, { "epoch": 0.6637006019366658, "grad_norm": 4.9839636087340695, "learning_rate": 1.5285113558975427e-07, "logits/chosen": -0.8860970735549927, "logits/rejected": -0.8327404260635376, "logps/chosen": -312.1811828613281, "logps/rejected": -374.24017333984375, "loss": 0.5967, "rewards/accuracies": 0.78125, "rewards/chosen": -0.832179069519043, "rewards/margins": 0.3623540997505188, "rewards/rejected": -1.194533109664917, "step": 317 }, { "epoch": 0.6657942946872546, "grad_norm": 4.440232748408542, "learning_rate": 1.5116687323334464e-07, "logits/chosen": -1.0527782440185547, "logits/rejected": -1.0963683128356934, "logps/chosen": -311.2167663574219, "logps/rejected": -320.79815673828125, "loss": 0.566, "rewards/accuracies": 0.875, "rewards/chosen": -0.8770865201950073, "rewards/margins": 0.645134449005127, "rewards/rejected": -1.5222209692001343, "step": 318 }, { "epoch": 0.6678879874378435, "grad_norm": 5.143631298396963, "learning_rate": 1.4948791099758052e-07, "logits/chosen": -0.9916408061981201, "logits/rejected": -0.9548324942588806, "logps/chosen": -319.84576416015625, "logps/rejected": -358.84210205078125, "loss": 0.5878, "rewards/accuracies": 0.5625, "rewards/chosen": -0.997856855392456, "rewards/margins": 0.13200606405735016, "rewards/rejected": -1.129862904548645, "step": 319 }, { "epoch": 0.6699816801884323, "grad_norm": 4.99726162986902, "learning_rate": 1.478143389201113e-07, "logits/chosen": -0.8910217881202698, "logits/rejected": -0.929844319820404, "logps/chosen": -334.52044677734375, "logps/rejected": -328.8065490722656, "loss": 0.6047, "rewards/accuracies": 0.75, "rewards/chosen": -0.736925482749939, "rewards/margins": 0.3704027533531189, "rewards/rejected": -1.1073282957077026, "step": 320 }, { "epoch": 0.6720753729390212, "grad_norm": 5.642279204037802, "learning_rate": 1.461462467495284e-07, "logits/chosen": -1.0580946207046509, "logits/rejected": -0.8903120756149292, "logps/chosen": -342.75177001953125, "logps/rejected": -378.611328125, "loss": 0.6439, "rewards/accuracies": 0.625, "rewards/chosen": -0.8806452751159668, "rewards/margins": 0.2598419487476349, "rewards/rejected": -1.1404871940612793, "step": 321 }, { "epoch": 0.6741690656896101, "grad_norm": 4.210716134739545, "learning_rate": 1.4448372394055246e-07, "logits/chosen": -1.0174405574798584, "logits/rejected": -0.9792740941047668, "logps/chosen": -350.93048095703125, "logps/rejected": -347.7519836425781, "loss": 0.5554, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8710691332817078, "rewards/margins": 0.42834046483039856, "rewards/rejected": -1.2994095087051392, "step": 322 }, { "epoch": 0.6762627584401989, "grad_norm": 4.726570675317681, "learning_rate": 1.428268596492364e-07, "logits/chosen": -1.0862951278686523, "logits/rejected": -1.0758510828018188, "logps/chosen": -464.050048828125, "logps/rejected": -467.544921875, "loss": 0.5861, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0785822868347168, "rewards/margins": 0.305502325296402, "rewards/rejected": -1.3840845823287964, "step": 323 }, { "epoch": 0.6783564511907878, "grad_norm": 5.040706620769479, "learning_rate": 1.4117574272818386e-07, "logits/chosen": -0.7669979929924011, "logits/rejected": -0.9290164113044739, "logps/chosen": -345.0854187011719, "logps/rejected": -398.5189208984375, "loss": 0.6054, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9167202115058899, "rewards/margins": 0.30732500553131104, "rewards/rejected": -1.2240452766418457, "step": 324 }, { "epoch": 0.6804501439413766, "grad_norm": 5.116329955412458, "learning_rate": 1.3953046172178413e-07, "logits/chosen": -1.0031001567840576, "logits/rejected": -1.0913097858428955, "logps/chosen": -389.1529541015625, "logps/rejected": -406.4329528808594, "loss": 0.6259, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9163541793823242, "rewards/margins": 0.3937912583351135, "rewards/rejected": -1.310145378112793, "step": 325 }, { "epoch": 0.6825438366919655, "grad_norm": 4.955624344058672, "learning_rate": 1.3789110486146468e-07, "logits/chosen": -1.1282416582107544, "logits/rejected": -1.0974756479263306, "logps/chosen": -355.1131286621094, "logps/rejected": -370.2046203613281, "loss": 0.5829, "rewards/accuracies": 0.75, "rewards/chosen": -0.9410170316696167, "rewards/margins": 0.4337049126625061, "rewards/rejected": -1.374721884727478, "step": 326 }, { "epoch": 0.6846375294425543, "grad_norm": 4.690201202745688, "learning_rate": 1.362577600609588e-07, "logits/chosen": -1.0833415985107422, "logits/rejected": -1.0749824047088623, "logps/chosen": -290.5574645996094, "logps/rejected": -311.0929260253906, "loss": 0.5464, "rewards/accuracies": 0.75, "rewards/chosen": -0.9207600951194763, "rewards/margins": 0.4717293083667755, "rewards/rejected": -1.3924893140792847, "step": 327 }, { "epoch": 0.6867312221931432, "grad_norm": 5.2509984699592955, "learning_rate": 1.3463051491159093e-07, "logits/chosen": -0.8964591026306152, "logits/rejected": -0.906419038772583, "logps/chosen": -379.51763916015625, "logps/rejected": -391.96429443359375, "loss": 0.6306, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0814236402511597, "rewards/margins": 0.2742454409599304, "rewards/rejected": -1.3556691408157349, "step": 328 }, { "epoch": 0.688824914943732, "grad_norm": 4.477637131717329, "learning_rate": 1.3300945667758012e-07, "logits/chosen": -1.007467269897461, "logits/rejected": -0.9721682071685791, "logps/chosen": -382.72344970703125, "logps/rejected": -380.7666931152344, "loss": 0.5589, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8336280584335327, "rewards/margins": 0.4594077169895172, "rewards/rejected": -1.2930357456207275, "step": 329 }, { "epoch": 0.6909186076943209, "grad_norm": 6.042412321647793, "learning_rate": 1.3139467229135998e-07, "logits/chosen": -1.10155189037323, "logits/rejected": -1.0159432888031006, "logps/chosen": -319.6428527832031, "logps/rejected": -358.0398864746094, "loss": 0.6493, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8517560958862305, "rewards/margins": 0.2770592272281647, "rewards/rejected": -1.1288154125213623, "step": 330 }, { "epoch": 0.6930123004449097, "grad_norm": 4.7576525638084055, "learning_rate": 1.2978624834891626e-07, "logits/chosen": -1.0450632572174072, "logits/rejected": -1.0797748565673828, "logps/chosen": -381.4608154296875, "logps/rejected": -390.8157958984375, "loss": 0.5379, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8275765776634216, "rewards/margins": 0.4613792896270752, "rewards/rejected": -1.2889559268951416, "step": 331 }, { "epoch": 0.6951059931954986, "grad_norm": 4.514857884793274, "learning_rate": 1.281842711051438e-07, "logits/chosen": -0.9356201887130737, "logits/rejected": -0.8875215649604797, "logps/chosen": -292.5213928222656, "logps/rejected": -364.8160705566406, "loss": 0.5462, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8070015907287598, "rewards/margins": 0.5474440455436707, "rewards/rejected": -1.3544456958770752, "step": 332 }, { "epoch": 0.6971996859460874, "grad_norm": 6.238681872348042, "learning_rate": 1.2658882646922033e-07, "logits/chosen": -0.9494622945785522, "logits/rejected": -0.9183204770088196, "logps/chosen": -343.158203125, "logps/rejected": -333.4990234375, "loss": 0.6212, "rewards/accuracies": 0.625, "rewards/chosen": -0.9169262647628784, "rewards/margins": 0.15336479246616364, "rewards/rejected": -1.0702910423278809, "step": 333 }, { "epoch": 0.6992933786966763, "grad_norm": 5.403137056030116, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -0.9445061683654785, "logits/rejected": -0.8896490931510925, "logps/chosen": -474.9695739746094, "logps/rejected": -362.90380859375, "loss": 0.5762, "rewards/accuracies": 0.75, "rewards/chosen": -0.8365269303321838, "rewards/margins": 0.46907490491867065, "rewards/rejected": -1.3056018352508545, "step": 334 }, { "epoch": 0.7013870714472651, "grad_norm": 5.235385059292377, "learning_rate": 1.2341787690142435e-07, "logits/chosen": -0.8802973628044128, "logits/rejected": -0.8504537343978882, "logps/chosen": -335.79473876953125, "logps/rejected": -305.86529541015625, "loss": 0.6276, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9935099482536316, "rewards/margins": 0.22254012525081635, "rewards/rejected": -1.216050148010254, "step": 335 }, { "epoch": 0.703480764197854, "grad_norm": 5.693730319056324, "learning_rate": 1.2184254201795363e-07, "logits/chosen": -1.14717435836792, "logits/rejected": -1.211289882659912, "logps/chosen": -344.61572265625, "logps/rejected": -366.61865234375, "loss": 0.5613, "rewards/accuracies": 0.625, "rewards/chosen": -0.9456732273101807, "rewards/margins": 0.3434966802597046, "rewards/rejected": -1.2891699075698853, "step": 336 }, { "epoch": 0.7055744569484428, "grad_norm": 4.629544110339346, "learning_rate": 1.202740798300168e-07, "logits/chosen": -1.1199169158935547, "logits/rejected": -1.1445202827453613, "logps/chosen": -423.0330505371094, "logps/rejected": -459.98529052734375, "loss": 0.5464, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9166252613067627, "rewards/margins": 0.5783544778823853, "rewards/rejected": -1.494979739189148, "step": 337 }, { "epoch": 0.7076681496990317, "grad_norm": 5.388911186978744, "learning_rate": 1.1871257444948096e-07, "logits/chosen": -1.166109561920166, "logits/rejected": -1.1599714756011963, "logps/chosen": -330.5888366699219, "logps/rejected": -364.29150390625, "loss": 0.6057, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9304347038269043, "rewards/margins": 0.3187927305698395, "rewards/rejected": -1.249227523803711, "step": 338 }, { "epoch": 0.7097618424496205, "grad_norm": 5.489520028714185, "learning_rate": 1.1715810961514072e-07, "logits/chosen": -0.9611755609512329, "logits/rejected": -1.0104998350143433, "logps/chosen": -372.9747314453125, "logps/rejected": -330.63720703125, "loss": 0.574, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8798213601112366, "rewards/margins": 0.3336760997772217, "rewards/rejected": -1.2134974002838135, "step": 339 }, { "epoch": 0.7118555352002094, "grad_norm": 5.589571549464936, "learning_rate": 1.1561076868822755e-07, "logits/chosen": -0.9107836484909058, "logits/rejected": -0.9165200591087341, "logps/chosen": -398.4732971191406, "logps/rejected": -393.54791259765625, "loss": 0.6335, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9415509104728699, "rewards/margins": 0.28252679109573364, "rewards/rejected": -1.2240777015686035, "step": 340 }, { "epoch": 0.7139492279507982, "grad_norm": 5.118512916173862, "learning_rate": 1.1407063464793965e-07, "logits/chosen": -1.1467581987380981, "logits/rejected": -0.9921854734420776, "logps/chosen": -358.225341796875, "logps/rejected": -334.1575927734375, "loss": 0.607, "rewards/accuracies": 0.75, "rewards/chosen": -0.976783037185669, "rewards/margins": 0.22393885254859924, "rewards/rejected": -1.2007219791412354, "step": 341 }, { "epoch": 0.7160429207013871, "grad_norm": 4.644926078108535, "learning_rate": 1.125377900869913e-07, "logits/chosen": -1.061423897743225, "logits/rejected": -1.046433448791504, "logps/chosen": -375.1291198730469, "logps/rejected": -327.4958801269531, "loss": 0.5683, "rewards/accuracies": 0.625, "rewards/chosen": -0.8810850381851196, "rewards/margins": 0.2843707501888275, "rewards/rejected": -1.1654558181762695, "step": 342 }, { "epoch": 0.7181366134519759, "grad_norm": 4.696833545273878, "learning_rate": 1.110123172071844e-07, "logits/chosen": -1.0328636169433594, "logits/rejected": -1.0211386680603027, "logps/chosen": -387.2093811035156, "logps/rejected": -362.0821838378906, "loss": 0.6118, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8726330399513245, "rewards/margins": 0.24517491459846497, "rewards/rejected": -1.1178081035614014, "step": 343 }, { "epoch": 0.7202303062025648, "grad_norm": 4.930293752560766, "learning_rate": 1.09494297815e-07, "logits/chosen": -1.2321884632110596, "logits/rejected": -1.1722993850708008, "logps/chosen": -390.91552734375, "logps/rejected": -369.99267578125, "loss": 0.5794, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0183380842208862, "rewards/margins": 0.31329479813575745, "rewards/rejected": -1.3316328525543213, "step": 344 }, { "epoch": 0.7223239989531536, "grad_norm": 4.630425364273001, "learning_rate": 1.0798381331721107e-07, "logits/chosen": -1.1355465650558472, "logits/rejected": -1.1947945356369019, "logps/chosen": -340.59912109375, "logps/rejected": -319.89996337890625, "loss": 0.575, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9519971609115601, "rewards/margins": 0.264494925737381, "rewards/rejected": -1.2164921760559082, "step": 345 }, { "epoch": 0.7244176917037425, "grad_norm": 5.428113189532964, "learning_rate": 1.0648094471651722e-07, "logits/chosen": -0.92652428150177, "logits/rejected": -1.0354893207550049, "logps/chosen": -355.3974914550781, "logps/rejected": -381.45892333984375, "loss": 0.6005, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8462040424346924, "rewards/margins": 0.5168175101280212, "rewards/rejected": -1.3630216121673584, "step": 346 }, { "epoch": 0.7265113844543313, "grad_norm": 5.292128110322091, "learning_rate": 1.0498577260720048e-07, "logits/chosen": -0.886002779006958, "logits/rejected": -0.8788304328918457, "logps/chosen": -296.1125183105469, "logps/rejected": -381.228759765625, "loss": 0.5772, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9059064388275146, "rewards/margins": 0.3659990429878235, "rewards/rejected": -1.2719054222106934, "step": 347 }, { "epoch": 0.7286050772049202, "grad_norm": 4.823722862308988, "learning_rate": 1.0349837717080347e-07, "logits/chosen": -1.0212054252624512, "logits/rejected": -1.0662332773208618, "logps/chosen": -361.3704528808594, "logps/rejected": -354.66290283203125, "loss": 0.5587, "rewards/accuracies": 0.6875, "rewards/chosen": -0.999699592590332, "rewards/margins": 0.272427499294281, "rewards/rejected": -1.2721270322799683, "step": 348 }, { "epoch": 0.730698769955509, "grad_norm": 5.252568914718968, "learning_rate": 1.0201883817182949e-07, "logits/chosen": -1.0344340801239014, "logits/rejected": -1.0427623987197876, "logps/chosen": -317.0735168457031, "logps/rejected": -296.1258239746094, "loss": 0.6002, "rewards/accuracies": 0.5, "rewards/chosen": -1.091943383216858, "rewards/margins": 0.13458918035030365, "rewards/rejected": -1.2265325784683228, "step": 349 }, { "epoch": 0.7327924627060979, "grad_norm": 5.422164808490072, "learning_rate": 1.0054723495346482e-07, "logits/chosen": -1.0686841011047363, "logits/rejected": -1.0204654932022095, "logps/chosen": -462.5711975097656, "logps/rejected": -425.97320556640625, "loss": 0.6226, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1374996900558472, "rewards/margins": 0.22476544976234436, "rewards/rejected": -1.3622651100158691, "step": 350 }, { "epoch": 0.7348861554566868, "grad_norm": 5.48415534251731, "learning_rate": 9.908364643332398e-08, "logits/chosen": -1.1773720979690552, "logits/rejected": -1.2301090955734253, "logps/chosen": -353.4706115722656, "logps/rejected": -412.4661865234375, "loss": 0.6083, "rewards/accuracies": 0.625, "rewards/chosen": -1.0794364213943481, "rewards/margins": 0.35158830881118774, "rewards/rejected": -1.4310247898101807, "step": 351 }, { "epoch": 0.7369798482072756, "grad_norm": 5.245710545036415, "learning_rate": 9.76281510992176e-08, "logits/chosen": -0.9535574316978455, "logits/rejected": -0.8419931530952454, "logps/chosen": -375.006103515625, "logps/rejected": -375.0938720703125, "loss": 0.6056, "rewards/accuracies": 0.75, "rewards/chosen": -1.0380980968475342, "rewards/margins": 0.34433096647262573, "rewards/rejected": -1.3824288845062256, "step": 352 }, { "epoch": 0.7390735409578645, "grad_norm": 4.970121418312639, "learning_rate": 9.618082700494318e-08, "logits/chosen": -0.8720242381095886, "logits/rejected": -0.8382505774497986, "logps/chosen": -297.3468017578125, "logps/rejected": -346.72149658203125, "loss": 0.632, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0575071573257446, "rewards/margins": 0.24765054881572723, "rewards/rejected": -1.3051576614379883, "step": 353 }, { "epoch": 0.7411672337084533, "grad_norm": 6.3001176780480375, "learning_rate": 9.474175176609956e-08, "logits/chosen": -1.0442111492156982, "logits/rejected": -1.0468512773513794, "logps/chosen": -364.1036376953125, "logps/rejected": -331.75244140625, "loss": 0.623, "rewards/accuracies": 0.625, "rewards/chosen": -0.8924232721328735, "rewards/margins": 0.35785213112831116, "rewards/rejected": -1.2502753734588623, "step": 354 }, { "epoch": 0.7432609264590422, "grad_norm": 5.472172481405132, "learning_rate": 9.331100255592436e-08, "logits/chosen": -1.112056851387024, "logits/rejected": -1.1578809022903442, "logps/chosen": -434.7376403808594, "logps/rejected": -378.6532897949219, "loss": 0.6189, "rewards/accuracies": 0.71875, "rewards/chosen": -1.093883991241455, "rewards/margins": 0.3683215379714966, "rewards/rejected": -1.462205410003662, "step": 355 }, { "epoch": 0.745354619209631, "grad_norm": 4.948122980325836, "learning_rate": 9.18886561011557e-08, "logits/chosen": -1.072540044784546, "logits/rejected": -1.1065843105316162, "logps/chosen": -517.6097412109375, "logps/rejected": -460.107421875, "loss": 0.5659, "rewards/accuracies": 0.6875, "rewards/chosen": -0.92432701587677, "rewards/margins": 0.5668249726295471, "rewards/rejected": -1.4911519289016724, "step": 356 }, { "epoch": 0.7474483119602199, "grad_norm": 5.379908919792136, "learning_rate": 9.047478867791731e-08, "logits/chosen": -1.0750786066055298, "logits/rejected": -1.0371627807617188, "logps/chosen": -398.7431640625, "logps/rejected": -410.0162658691406, "loss": 0.5469, "rewards/accuracies": 0.625, "rewards/chosen": -1.0447100400924683, "rewards/margins": 0.5210673809051514, "rewards/rejected": -1.5657775402069092, "step": 357 }, { "epoch": 0.7495420047108087, "grad_norm": 5.580761505247945, "learning_rate": 8.906947610762825e-08, "logits/chosen": -1.1895625591278076, "logits/rejected": -1.2580678462982178, "logps/chosen": -471.7377014160156, "logps/rejected": -425.9092102050781, "loss": 0.5983, "rewards/accuracies": 0.75, "rewards/chosen": -0.9444851875305176, "rewards/margins": 0.35002943873405457, "rewards/rejected": -1.294514775276184, "step": 358 }, { "epoch": 0.7516356974613976, "grad_norm": 4.846143362911699, "learning_rate": 8.76727937529367e-08, "logits/chosen": -1.1407859325408936, "logits/rejected": -1.0891239643096924, "logps/chosen": -391.1314697265625, "logps/rejected": -408.205810546875, "loss": 0.6261, "rewards/accuracies": 0.71875, "rewards/chosen": -0.864660382270813, "rewards/margins": 0.35379558801651, "rewards/rejected": -1.2184560298919678, "step": 359 }, { "epoch": 0.7537293902119864, "grad_norm": 5.6108900087192, "learning_rate": 8.628481651367875e-08, "logits/chosen": -1.1937055587768555, "logits/rejected": -1.2158100605010986, "logps/chosen": -385.84259033203125, "logps/rejected": -350.4278259277344, "loss": 0.6303, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8502019047737122, "rewards/margins": 0.3716661036014557, "rewards/rejected": -1.2218680381774902, "step": 360 }, { "epoch": 0.7558230829625753, "grad_norm": 5.457348350605904, "learning_rate": 8.490561882286135e-08, "logits/chosen": -1.1212856769561768, "logits/rejected": -0.9991218447685242, "logps/chosen": -351.9570617675781, "logps/rejected": -359.6358947753906, "loss": 0.6084, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8738343119621277, "rewards/margins": 0.4490945339202881, "rewards/rejected": -1.3229289054870605, "step": 361 }, { "epoch": 0.7579167757131641, "grad_norm": 5.506633644432408, "learning_rate": 8.353527464267104e-08, "logits/chosen": -0.9683207869529724, "logits/rejected": -1.0034483671188354, "logps/chosen": -384.5888671875, "logps/rejected": -379.39642333984375, "loss": 0.6422, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8334957957267761, "rewards/margins": 0.38540419936180115, "rewards/rejected": -1.2188999652862549, "step": 362 }, { "epoch": 0.760010468463753, "grad_norm": 5.6474649886106745, "learning_rate": 8.217385746050742e-08, "logits/chosen": -1.0939087867736816, "logits/rejected": -1.005664587020874, "logps/chosen": -289.66986083984375, "logps/rejected": -269.9555969238281, "loss": 0.59, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9255156517028809, "rewards/margins": 0.3094920217990875, "rewards/rejected": -1.2350077629089355, "step": 363 }, { "epoch": 0.7621041612143418, "grad_norm": 5.08222381427678, "learning_rate": 8.082144028504231e-08, "logits/chosen": -0.9446636438369751, "logits/rejected": -0.9510993361473083, "logps/chosen": -261.65966796875, "logps/rejected": -343.13397216796875, "loss": 0.5927, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7469099164009094, "rewards/margins": 0.47956645488739014, "rewards/rejected": -1.2264763116836548, "step": 364 }, { "epoch": 0.7641978539649307, "grad_norm": 6.003981074829619, "learning_rate": 7.947809564230445e-08, "logits/chosen": -1.0882418155670166, "logits/rejected": -1.1745942831039429, "logps/chosen": -357.9566650390625, "logps/rejected": -350.71923828125, "loss": 0.6307, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0244779586791992, "rewards/margins": 0.11149246245622635, "rewards/rejected": -1.1359704732894897, "step": 365 }, { "epoch": 0.7662915467155195, "grad_norm": 5.180707267994661, "learning_rate": 7.814389557179016e-08, "logits/chosen": -0.9484068751335144, "logits/rejected": -0.9714826941490173, "logps/chosen": -395.81884765625, "logps/rejected": -401.05609130859375, "loss": 0.6014, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8172423839569092, "rewards/margins": 0.46140384674072266, "rewards/rejected": -1.2786462306976318, "step": 366 }, { "epoch": 0.7683852394661084, "grad_norm": 4.969492925711831, "learning_rate": 7.681891162260015e-08, "logits/chosen": -0.9409385919570923, "logits/rejected": -0.9392369985580444, "logps/chosen": -363.1473693847656, "logps/rejected": -312.6502990722656, "loss": 0.5662, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9812503457069397, "rewards/margins": 0.1279146373271942, "rewards/rejected": -1.109164834022522, "step": 367 }, { "epoch": 0.7704789322166972, "grad_norm": 5.217255733816973, "learning_rate": 7.550321484960251e-08, "logits/chosen": -0.956232488155365, "logits/rejected": -0.9681949615478516, "logps/chosen": -365.8873291015625, "logps/rejected": -329.0688781738281, "loss": 0.5822, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0013463497161865, "rewards/margins": 0.259126216173172, "rewards/rejected": -1.2604725360870361, "step": 368 }, { "epoch": 0.7725726249672861, "grad_norm": 5.153897518052571, "learning_rate": 7.419687580962222e-08, "logits/chosen": -0.905741810798645, "logits/rejected": -1.028702974319458, "logps/chosen": -402.5704345703125, "logps/rejected": -325.1583557128906, "loss": 0.6184, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8294796347618103, "rewards/margins": 0.4220150113105774, "rewards/rejected": -1.2514946460723877, "step": 369 }, { "epoch": 0.7746663177178749, "grad_norm": 5.358092950600321, "learning_rate": 7.289996455765748e-08, "logits/chosen": -1.008590579032898, "logits/rejected": -1.0161044597625732, "logps/chosen": -390.44647216796875, "logps/rejected": -371.59912109375, "loss": 0.5801, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0201014280319214, "rewards/margins": 0.428865909576416, "rewards/rejected": -1.448967456817627, "step": 370 }, { "epoch": 0.7767600104684638, "grad_norm": 4.682124991160459, "learning_rate": 7.161255064312283e-08, "logits/chosen": -1.0318840742111206, "logits/rejected": -0.9978671073913574, "logps/chosen": -342.7257385253906, "logps/rejected": -361.2938232421875, "loss": 0.5714, "rewards/accuracies": 0.625, "rewards/chosen": -1.0108859539031982, "rewards/margins": 0.3244949281215668, "rewards/rejected": -1.3353807926177979, "step": 371 }, { "epoch": 0.7788537032190526, "grad_norm": 5.647206650698066, "learning_rate": 7.033470310611945e-08, "logits/chosen": -0.9945604205131531, "logits/rejected": -1.1038204431533813, "logps/chosen": -350.92669677734375, "logps/rejected": -395.6362609863281, "loss": 0.6068, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8008663654327393, "rewards/margins": 0.49505653977394104, "rewards/rejected": -1.2959227561950684, "step": 372 }, { "epoch": 0.7809473959696415, "grad_norm": 4.6780079300428214, "learning_rate": 6.906649047373245e-08, "logits/chosen": -0.7515249848365784, "logits/rejected": -0.8388275504112244, "logps/chosen": -290.88262939453125, "logps/rejected": -351.8504943847656, "loss": 0.568, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8086748719215393, "rewards/margins": 0.30446967482566833, "rewards/rejected": -1.1131445169448853, "step": 373 }, { "epoch": 0.7830410887202303, "grad_norm": 5.152995330858469, "learning_rate": 6.780798075635675e-08, "logits/chosen": -0.9450228810310364, "logits/rejected": -0.9634541273117065, "logps/chosen": -303.8283386230469, "logps/rejected": -341.2734375, "loss": 0.6134, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8979825973510742, "rewards/margins": 0.3980700373649597, "rewards/rejected": -1.2960525751113892, "step": 374 }, { "epoch": 0.7851347814708192, "grad_norm": 4.659803666118181, "learning_rate": 6.655924144404906e-08, "logits/chosen": -0.9799182415008545, "logits/rejected": -0.9487051963806152, "logps/chosen": -318.1759033203125, "logps/rejected": -341.6868896484375, "loss": 0.6016, "rewards/accuracies": 0.625, "rewards/chosen": -0.9127873182296753, "rewards/margins": 0.2682987451553345, "rewards/rejected": -1.1810860633850098, "step": 375 }, { "epoch": 0.787228474221408, "grad_norm": 4.426496558489463, "learning_rate": 6.532033950290885e-08, "logits/chosen": -0.9759746193885803, "logits/rejected": -1.1222018003463745, "logps/chosen": -336.4270324707031, "logps/rejected": -378.2254638671875, "loss": 0.5714, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9670044183731079, "rewards/margins": 0.5407288074493408, "rewards/rejected": -1.5077333450317383, "step": 376 }, { "epoch": 0.7893221669719969, "grad_norm": 5.072254374076571, "learning_rate": 6.409134137148736e-08, "logits/chosen": -1.168020486831665, "logits/rejected": -1.0352768898010254, "logps/chosen": -355.5277099609375, "logps/rejected": -334.4648132324219, "loss": 0.5843, "rewards/accuracies": 0.625, "rewards/chosen": -1.095816731452942, "rewards/margins": 0.3464164137840271, "rewards/rejected": -1.4422332048416138, "step": 377 }, { "epoch": 0.7914158597225857, "grad_norm": 5.352195029066714, "learning_rate": 6.28723129572247e-08, "logits/chosen": -0.9497919082641602, "logits/rejected": -0.9399784803390503, "logps/chosen": -370.9161071777344, "logps/rejected": -375.0029296875, "loss": 0.6092, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9709791541099548, "rewards/margins": 0.34797006845474243, "rewards/rejected": -1.3189491033554077, "step": 378 }, { "epoch": 0.7935095524731746, "grad_norm": 5.125689813718339, "learning_rate": 6.166331963291519e-08, "logits/chosen": -0.915034294128418, "logits/rejected": -0.8931662440299988, "logps/chosen": -301.0979919433594, "logps/rejected": -344.7012023925781, "loss": 0.5914, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8687721490859985, "rewards/margins": 0.2647196650505066, "rewards/rejected": -1.1334917545318604, "step": 379 }, { "epoch": 0.7956032452237635, "grad_norm": 4.869735640806175, "learning_rate": 6.046442623320145e-08, "logits/chosen": -1.1444369554519653, "logits/rejected": -1.128493309020996, "logps/chosen": -393.46380615234375, "logps/rejected": -322.0780944824219, "loss": 0.5873, "rewards/accuracies": 0.625, "rewards/chosen": -1.039123296737671, "rewards/margins": 0.1910269558429718, "rewards/rejected": -1.2301501035690308, "step": 380 }, { "epoch": 0.7976969379743523, "grad_norm": 4.621632560835905, "learning_rate": 5.9275697051098275e-08, "logits/chosen": -1.1916906833648682, "logits/rejected": -1.2293490171432495, "logps/chosen": -360.90032958984375, "logps/rejected": -409.75408935546875, "loss": 0.5622, "rewards/accuracies": 0.75, "rewards/chosen": -0.8390352129936218, "rewards/margins": 0.5701740980148315, "rewards/rejected": -1.4092092514038086, "step": 381 }, { "epoch": 0.7997906307249412, "grad_norm": 4.665505489647805, "learning_rate": 5.809719583454414e-08, "logits/chosen": -1.055574893951416, "logits/rejected": -1.0372941493988037, "logps/chosen": -348.5665588378906, "logps/rejected": -496.86822509765625, "loss": 0.5619, "rewards/accuracies": 0.75, "rewards/chosen": -0.8670852780342102, "rewards/margins": 0.6146461963653564, "rewards/rejected": -1.4817314147949219, "step": 382 }, { "epoch": 0.80188432347553, "grad_norm": 5.748991681323489, "learning_rate": 5.6928985782982524e-08, "logits/chosen": -1.1491328477859497, "logits/rejected": -1.162519931793213, "logps/chosen": -407.32989501953125, "logps/rejected": -420.8095703125, "loss": 0.6003, "rewards/accuracies": 0.625, "rewards/chosen": -0.8567126393318176, "rewards/margins": 0.5241937041282654, "rewards/rejected": -1.3809062242507935, "step": 383 }, { "epoch": 0.8039780162261189, "grad_norm": 5.416301322728307, "learning_rate": 5.57711295439732e-08, "logits/chosen": -1.1133326292037964, "logits/rejected": -1.1402982473373413, "logps/chosen": -384.0762939453125, "logps/rejected": -408.9583740234375, "loss": 0.607, "rewards/accuracies": 0.625, "rewards/chosen": -0.990085244178772, "rewards/margins": 0.23447242379188538, "rewards/rejected": -1.2245577573776245, "step": 384 }, { "epoch": 0.8060717089767077, "grad_norm": 4.830977765255834, "learning_rate": 5.4623689209832484e-08, "logits/chosen": -1.0206141471862793, "logits/rejected": -0.9997459650039673, "logps/chosen": -382.82916259765625, "logps/rejected": -379.95074462890625, "loss": 0.5782, "rewards/accuracies": 0.75, "rewards/chosen": -0.9502476453781128, "rewards/margins": 0.2668820023536682, "rewards/rejected": -1.2171294689178467, "step": 385 }, { "epoch": 0.8081654017272966, "grad_norm": 5.61816278421482, "learning_rate": 5.3486726314303175e-08, "logits/chosen": -0.9696506261825562, "logits/rejected": -0.8919385075569153, "logps/chosen": -391.99822998046875, "logps/rejected": -392.7474365234375, "loss": 0.5803, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9037584066390991, "rewards/margins": 0.19231368601322174, "rewards/rejected": -1.0960720777511597, "step": 386 }, { "epoch": 0.8102590944778854, "grad_norm": 5.032275398665892, "learning_rate": 5.2360301829254745e-08, "logits/chosen": -1.0510151386260986, "logits/rejected": -1.0829778909683228, "logps/chosen": -387.7805480957031, "logps/rejected": -320.8028869628906, "loss": 0.5565, "rewards/accuracies": 0.875, "rewards/chosen": -0.7343047857284546, "rewards/margins": 0.5666511654853821, "rewards/rejected": -1.3009560108184814, "step": 387 }, { "epoch": 0.8123527872284743, "grad_norm": 6.027313832646389, "learning_rate": 5.1244476161413806e-08, "logits/chosen": -1.21254563331604, "logits/rejected": -1.2364054918289185, "logps/chosen": -386.1059265136719, "logps/rejected": -398.5882568359375, "loss": 0.5957, "rewards/accuracies": 0.625, "rewards/chosen": -0.9164823889732361, "rewards/margins": 0.27108636498451233, "rewards/rejected": -1.1875687837600708, "step": 388 }, { "epoch": 0.814446479979063, "grad_norm": 4.957681908942393, "learning_rate": 5.013930914912476e-08, "logits/chosen": -0.9387479424476624, "logits/rejected": -0.7969966530799866, "logps/chosen": -385.599853515625, "logps/rejected": -377.96649169921875, "loss": 0.5589, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8022289276123047, "rewards/margins": 0.43812647461891174, "rewards/rejected": -1.2403552532196045, "step": 389 }, { "epoch": 0.816540172729652, "grad_norm": 4.796435392548042, "learning_rate": 4.904486005914027e-08, "logits/chosen": -1.0699548721313477, "logits/rejected": -1.085415005683899, "logps/chosen": -434.664306640625, "logps/rejected": -396.49981689453125, "loss": 0.6038, "rewards/accuracies": 0.6875, "rewards/chosen": -0.91024249792099, "rewards/margins": 0.3675413131713867, "rewards/rejected": -1.277783751487732, "step": 390 }, { "epoch": 0.8186338654802408, "grad_norm": 4.5232082570212295, "learning_rate": 4.796118758344353e-08, "logits/chosen": -0.8997459411621094, "logits/rejected": -0.9253852367401123, "logps/chosen": -372.18511962890625, "logps/rejected": -406.17431640625, "loss": 0.5759, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8996187448501587, "rewards/margins": 0.5443833470344543, "rewards/rejected": -1.4440019130706787, "step": 391 }, { "epoch": 0.8207275582308297, "grad_norm": 5.58974873793547, "learning_rate": 4.688834983610082e-08, "logits/chosen": -1.1056690216064453, "logits/rejected": -1.1191550493240356, "logps/chosen": -306.69293212890625, "logps/rejected": -299.0478210449219, "loss": 0.6227, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7949439287185669, "rewards/margins": 0.2265232354402542, "rewards/rejected": -1.0214672088623047, "step": 392 }, { "epoch": 0.8228212509814185, "grad_norm": 4.995643744303848, "learning_rate": 4.582640435014459e-08, "logits/chosen": -1.0539535284042358, "logits/rejected": -1.05762779712677, "logps/chosen": -318.402099609375, "logps/rejected": -329.4332580566406, "loss": 0.6108, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9555471539497375, "rewards/margins": 0.33469048142433167, "rewards/rejected": -1.2902376651763916, "step": 393 }, { "epoch": 0.8249149437320074, "grad_norm": 4.473768324528174, "learning_rate": 4.477540807448832e-08, "logits/chosen": -1.0575326681137085, "logits/rejected": -1.0112919807434082, "logps/chosen": -357.1743469238281, "logps/rejected": -366.81805419921875, "loss": 0.5759, "rewards/accuracies": 0.75, "rewards/chosen": -0.9400140047073364, "rewards/margins": 0.4440167546272278, "rewards/rejected": -1.384030818939209, "step": 394 }, { "epoch": 0.8270086364825961, "grad_norm": 4.8187043032955845, "learning_rate": 4.373541737087263e-08, "logits/chosen": -1.0284297466278076, "logits/rejected": -1.0268464088439941, "logps/chosen": -326.4049072265625, "logps/rejected": -381.6387939453125, "loss": 0.6127, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0302150249481201, "rewards/margins": 0.2750374674797058, "rewards/rejected": -1.3052525520324707, "step": 395 }, { "epoch": 0.829102329233185, "grad_norm": 4.470284644964193, "learning_rate": 4.270648801084295e-08, "logits/chosen": -1.0326392650604248, "logits/rejected": -1.1006556749343872, "logps/chosen": -326.6824951171875, "logps/rejected": -362.699462890625, "loss": 0.5885, "rewards/accuracies": 0.75, "rewards/chosen": -0.8322039842605591, "rewards/margins": 0.36433660984039307, "rewards/rejected": -1.1965405941009521, "step": 396 }, { "epoch": 0.8311960219837738, "grad_norm": 4.806046731431988, "learning_rate": 4.168867517275806e-08, "logits/chosen": -0.8822562098503113, "logits/rejected": -0.8652749061584473, "logps/chosen": -313.2331237792969, "logps/rejected": -319.55267333984375, "loss": 0.59, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7936717867851257, "rewards/margins": 0.4282276928424835, "rewards/rejected": -1.2218995094299316, "step": 397 }, { "epoch": 0.8332897147343628, "grad_norm": 4.970760877108581, "learning_rate": 4.0682033438831584e-08, "logits/chosen": -1.0456138849258423, "logits/rejected": -1.0084147453308105, "logps/chosen": -311.2091979980469, "logps/rejected": -366.0251770019531, "loss": 0.5683, "rewards/accuracies": 0.625, "rewards/chosen": -0.8852429389953613, "rewards/margins": 0.2783307433128357, "rewards/rejected": -1.1635735034942627, "step": 398 }, { "epoch": 0.8353834074849515, "grad_norm": 5.454456420711039, "learning_rate": 3.968661679220467e-08, "logits/chosen": -0.9650417566299438, "logits/rejected": -1.0301586389541626, "logps/chosen": -378.8026428222656, "logps/rejected": -379.7239990234375, "loss": 0.571, "rewards/accuracies": 0.75, "rewards/chosen": -0.8987617492675781, "rewards/margins": 0.46284791827201843, "rewards/rejected": -1.361609697341919, "step": 399 }, { "epoch": 0.8374771002355405, "grad_norm": 5.504291529178289, "learning_rate": 3.8702478614051345e-08, "logits/chosen": -0.9630382657051086, "logits/rejected": -1.0812740325927734, "logps/chosen": -292.8814697265625, "logps/rejected": -365.01947021484375, "loss": 0.609, "rewards/accuracies": 0.65625, "rewards/chosen": -1.015270471572876, "rewards/margins": 0.4502546787261963, "rewards/rejected": -1.4655249118804932, "step": 400 }, { "epoch": 0.8395707929861292, "grad_norm": 5.849177401846944, "learning_rate": 3.772967168071517e-08, "logits/chosen": -1.0673670768737793, "logits/rejected": -1.0560370683670044, "logps/chosen": -384.3044128417969, "logps/rejected": -316.4158630371094, "loss": 0.6284, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8781154155731201, "rewards/margins": 0.29097747802734375, "rewards/rejected": -1.1690927743911743, "step": 401 }, { "epoch": 0.8416644857367181, "grad_norm": 5.0719206576440286, "learning_rate": 3.676824816087978e-08, "logits/chosen": -0.8619110584259033, "logits/rejected": -0.8989958763122559, "logps/chosen": -436.34698486328125, "logps/rejected": -389.6102294921875, "loss": 0.5573, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7849858999252319, "rewards/margins": 0.4747275114059448, "rewards/rejected": -1.2597134113311768, "step": 402 }, { "epoch": 0.8437581784873069, "grad_norm": 4.434919966920402, "learning_rate": 3.581825961277074e-08, "logits/chosen": -1.129549264907837, "logits/rejected": -1.0329104661941528, "logps/chosen": -397.8834228515625, "logps/rejected": -449.1318664550781, "loss": 0.6002, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1101666688919067, "rewards/margins": 0.0822356566786766, "rewards/rejected": -1.1924023628234863, "step": 403 }, { "epoch": 0.8458518712378958, "grad_norm": 5.3072467215436845, "learning_rate": 3.487975698139084e-08, "logits/chosen": -1.056693196296692, "logits/rejected": -0.962243914604187, "logps/chosen": -405.84173583984375, "logps/rejected": -348.1914978027344, "loss": 0.642, "rewards/accuracies": 0.625, "rewards/chosen": -0.9228596687316895, "rewards/margins": 0.2819378674030304, "rewards/rejected": -1.2047975063323975, "step": 404 }, { "epoch": 0.8479455639884846, "grad_norm": 4.7363934455043015, "learning_rate": 3.3952790595787986e-08, "logits/chosen": -1.1622726917266846, "logits/rejected": -1.1050869226455688, "logps/chosen": -406.8652038574219, "logps/rejected": -395.9289245605469, "loss": 0.5825, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0214625597000122, "rewards/margins": 0.4575739800930023, "rewards/rejected": -1.479036569595337, "step": 405 }, { "epoch": 0.8500392567390735, "grad_norm": 4.747893114548071, "learning_rate": 3.303741016635614e-08, "logits/chosen": -1.044054388999939, "logits/rejected": -1.0940017700195312, "logps/chosen": -378.45306396484375, "logps/rejected": -397.84869384765625, "loss": 0.5681, "rewards/accuracies": 0.75, "rewards/chosen": -0.8469173312187195, "rewards/margins": 0.4933964014053345, "rewards/rejected": -1.3403139114379883, "step": 406 }, { "epoch": 0.8521329494896623, "grad_norm": 5.609201203022989, "learning_rate": 3.2133664782169944e-08, "logits/chosen": -1.1649484634399414, "logits/rejected": -1.1291017532348633, "logps/chosen": -363.727783203125, "logps/rejected": -347.2157287597656, "loss": 0.5609, "rewards/accuracies": 0.65625, "rewards/chosen": -0.973659873008728, "rewards/margins": 0.43522804975509644, "rewards/rejected": -1.4088878631591797, "step": 407 }, { "epoch": 0.8542266422402512, "grad_norm": 5.0332007764320705, "learning_rate": 3.12416029083514e-08, "logits/chosen": -1.0819900035858154, "logits/rejected": -1.0853097438812256, "logps/chosen": -319.9803771972656, "logps/rejected": -363.8924560546875, "loss": 0.6011, "rewards/accuracies": 0.625, "rewards/chosen": -0.9644487500190735, "rewards/margins": 0.18513096868991852, "rewards/rejected": -1.1495797634124756, "step": 408 }, { "epoch": 0.8563203349908401, "grad_norm": 4.681133689389535, "learning_rate": 3.036127238347164e-08, "logits/chosen": -1.0036630630493164, "logits/rejected": -1.092750072479248, "logps/chosen": -359.8594665527344, "logps/rejected": -393.84918212890625, "loss": 0.6194, "rewards/accuracies": 0.59375, "rewards/chosen": -0.9892240166664124, "rewards/margins": 0.318713515996933, "rewards/rejected": -1.3079373836517334, "step": 409 }, { "epoch": 0.8584140277414289, "grad_norm": 5.389903459300115, "learning_rate": 2.9492720416985e-08, "logits/chosen": -1.0511116981506348, "logits/rejected": -1.1079765558242798, "logps/chosen": -412.9842834472656, "logps/rejected": -414.5954895019531, "loss": 0.6234, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8149458765983582, "rewards/margins": 0.4206196665763855, "rewards/rejected": -1.2355656623840332, "step": 410 }, { "epoch": 0.8605077204920178, "grad_norm": 8.616145883009287, "learning_rate": 2.863599358669755e-08, "logits/chosen": -1.1962544918060303, "logits/rejected": -1.1816613674163818, "logps/chosen": -432.10772705078125, "logps/rejected": -431.0619812011719, "loss": 0.6394, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1603362560272217, "rewards/margins": 0.13714662194252014, "rewards/rejected": -1.2974828481674194, "step": 411 }, { "epoch": 0.8626014132426066, "grad_norm": 5.617880368978512, "learning_rate": 2.7791137836269158e-08, "logits/chosen": -0.8768041133880615, "logits/rejected": -0.8986093401908875, "logps/chosen": -334.2413330078125, "logps/rejected": -396.01080322265625, "loss": 0.5753, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9527013301849365, "rewards/margins": 0.25225597620010376, "rewards/rejected": -1.204957127571106, "step": 412 }, { "epoch": 0.8646951059931955, "grad_norm": 5.164364838192052, "learning_rate": 2.6958198472749717e-08, "logits/chosen": -1.0828931331634521, "logits/rejected": -1.1191623210906982, "logps/chosen": -327.3102722167969, "logps/rejected": -421.7712707519531, "loss": 0.599, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9811720252037048, "rewards/margins": 0.3252337574958801, "rewards/rejected": -1.306405782699585, "step": 413 }, { "epoch": 0.8667887987437843, "grad_norm": 4.7600577219959215, "learning_rate": 2.613722016414943e-08, "logits/chosen": -1.0034255981445312, "logits/rejected": -1.008399248123169, "logps/chosen": -306.27734375, "logps/rejected": -311.43829345703125, "loss": 0.5869, "rewards/accuracies": 0.53125, "rewards/chosen": -0.8177580237388611, "rewards/margins": 0.3242667615413666, "rewards/rejected": -1.1420248746871948, "step": 414 }, { "epoch": 0.8688824914943732, "grad_norm": 5.881812398701957, "learning_rate": 2.5328246937043525e-08, "logits/chosen": -1.090468406677246, "logits/rejected": -0.966783881187439, "logps/chosen": -294.0365295410156, "logps/rejected": -340.3030090332031, "loss": 0.6059, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8205872178077698, "rewards/margins": 0.4039844274520874, "rewards/rejected": -1.2245714664459229, "step": 415 }, { "epoch": 0.870976184244962, "grad_norm": 4.462776486007767, "learning_rate": 2.4531322174210973e-08, "logits/chosen": -1.0929807424545288, "logits/rejected": -1.0541623830795288, "logps/chosen": -376.1136169433594, "logps/rejected": -355.0390625, "loss": 0.5891, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7710814476013184, "rewards/margins": 0.5452392101287842, "rewards/rejected": -1.3163206577301025, "step": 416 }, { "epoch": 0.8730698769955509, "grad_norm": 5.364093630079675, "learning_rate": 2.3746488612308295e-08, "logits/chosen": -0.8915029168128967, "logits/rejected": -0.8739762902259827, "logps/chosen": -341.0782165527344, "logps/rejected": -373.2362976074219, "loss": 0.5922, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0400235652923584, "rewards/margins": 0.25072982907295227, "rewards/rejected": -1.2907533645629883, "step": 417 }, { "epoch": 0.8751635697461397, "grad_norm": 5.9198475058546745, "learning_rate": 2.297378833957761e-08, "logits/chosen": -1.068786382675171, "logits/rejected": -1.155449628829956, "logps/chosen": -398.5052490234375, "logps/rejected": -495.4448547363281, "loss": 0.5493, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0525850057601929, "rewards/margins": 0.5073192119598389, "rewards/rejected": -1.5599043369293213, "step": 418 }, { "epoch": 0.8772572624967286, "grad_norm": 6.119605278896458, "learning_rate": 2.2213262793589482e-08, "logits/chosen": -0.9477488398551941, "logits/rejected": -0.9867204427719116, "logps/chosen": -333.72406005859375, "logps/rejected": -393.95654296875, "loss": 0.5679, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9634656310081482, "rewards/margins": 0.18628139793872833, "rewards/rejected": -1.1497470140457153, "step": 419 }, { "epoch": 0.8793509552473174, "grad_norm": 4.46736802660283, "learning_rate": 2.1464952759020856e-08, "logits/chosen": -0.9734982848167419, "logits/rejected": -0.8660587668418884, "logps/chosen": -477.8684387207031, "logps/rejected": -453.4022521972656, "loss": 0.5436, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9317688345909119, "rewards/margins": 0.5228405594825745, "rewards/rejected": -1.4546092748641968, "step": 420 }, { "epoch": 0.8814446479979063, "grad_norm": 4.862852167544358, "learning_rate": 2.07288983654679e-08, "logits/chosen": -1.0473878383636475, "logits/rejected": -0.9858657717704773, "logps/chosen": -351.1941223144531, "logps/rejected": -351.5199279785156, "loss": 0.5741, "rewards/accuracies": 0.625, "rewards/chosen": -0.988927960395813, "rewards/margins": 0.2549222707748413, "rewards/rejected": -1.2438502311706543, "step": 421 }, { "epoch": 0.8835383407484951, "grad_norm": 5.958903981984076, "learning_rate": 2.0005139085293942e-08, "logits/chosen": -0.9384050965309143, "logits/rejected": -1.0206750631332397, "logps/chosen": -368.50982666015625, "logps/rejected": -300.23272705078125, "loss": 0.6521, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8948423266410828, "rewards/margins": 0.1242271363735199, "rewards/rejected": -1.0190695524215698, "step": 422 }, { "epoch": 0.885632033499084, "grad_norm": 4.728833200473606, "learning_rate": 1.9293713731512673e-08, "logits/chosen": -1.1004122495651245, "logits/rejected": -0.9806270599365234, "logps/chosen": -353.2975769042969, "logps/rejected": -388.24285888671875, "loss": 0.5801, "rewards/accuracies": 0.75, "rewards/chosen": -0.980697512626648, "rewards/margins": 0.3601706027984619, "rewards/rejected": -1.3408679962158203, "step": 423 }, { "epoch": 0.8877257262496728, "grad_norm": 5.054031739657731, "learning_rate": 1.8594660455706763e-08, "logits/chosen": -0.8540740013122559, "logits/rejected": -0.8462531566619873, "logps/chosen": -353.05694580078125, "logps/rejected": -322.58251953125, "loss": 0.5569, "rewards/accuracies": 0.65625, "rewards/chosen": -0.93426513671875, "rewards/margins": 0.39288660883903503, "rewards/rejected": -1.3271517753601074, "step": 424 }, { "epoch": 0.8898194190002617, "grad_norm": 5.206266337812995, "learning_rate": 1.7908016745981856e-08, "logits/chosen": -1.0436545610427856, "logits/rejected": -1.034414529800415, "logps/chosen": -469.0371398925781, "logps/rejected": -435.71771240234375, "loss": 0.5682, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8871874213218689, "rewards/margins": 0.43279707431793213, "rewards/rejected": -1.3199846744537354, "step": 425 }, { "epoch": 0.8919131117508505, "grad_norm": 5.1909412300843405, "learning_rate": 1.7233819424956247e-08, "logits/chosen": -1.031982660293579, "logits/rejected": -1.1009619235992432, "logps/chosen": -513.76171875, "logps/rejected": -422.9322509765625, "loss": 0.5787, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8494269251823425, "rewards/margins": 0.5463772416114807, "rewards/rejected": -1.3958041667938232, "step": 426 }, { "epoch": 0.8940068045014394, "grad_norm": 5.003735901632841, "learning_rate": 1.6572104647786245e-08, "logits/chosen": -1.0100113153457642, "logits/rejected": -0.9566894173622131, "logps/chosen": -315.8883972167969, "logps/rejected": -368.1168518066406, "loss": 0.596, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9657600522041321, "rewards/margins": 0.13975828886032104, "rewards/rejected": -1.1055183410644531, "step": 427 }, { "epoch": 0.8961004972520282, "grad_norm": 5.30361447765211, "learning_rate": 1.5922907900227017e-08, "logits/chosen": -1.0869308710098267, "logits/rejected": -1.1259006261825562, "logps/chosen": -368.03173828125, "logps/rejected": -374.6971435546875, "loss": 0.5664, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8075523376464844, "rewards/margins": 0.5772351622581482, "rewards/rejected": -1.3847875595092773, "step": 428 }, { "epoch": 0.8981941900026171, "grad_norm": 4.679101988343094, "learning_rate": 1.5286263996730026e-08, "logits/chosen": -1.0106351375579834, "logits/rejected": -0.9624335765838623, "logps/chosen": -367.8698425292969, "logps/rejected": -354.481689453125, "loss": 0.5701, "rewards/accuracies": 0.625, "rewards/chosen": -0.9276455640792847, "rewards/margins": 0.2800782322883606, "rewards/rejected": -1.20772385597229, "step": 429 }, { "epoch": 0.9002878827532059, "grad_norm": 5.535314376855842, "learning_rate": 1.4662207078575684e-08, "logits/chosen": -1.059343934059143, "logits/rejected": -1.0445940494537354, "logps/chosen": -338.7727966308594, "logps/rejected": -319.46270751953125, "loss": 0.6137, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9054205417633057, "rewards/margins": 0.11171019077301025, "rewards/rejected": -1.0171306133270264, "step": 430 }, { "epoch": 0.9023815755037948, "grad_norm": 4.753937204067147, "learning_rate": 1.40507706120426e-08, "logits/chosen": -1.0011266469955444, "logits/rejected": -1.0196805000305176, "logps/chosen": -375.32476806640625, "logps/rejected": -312.6873474121094, "loss": 0.5998, "rewards/accuracies": 0.5, "rewards/chosen": -0.9956127405166626, "rewards/margins": 0.16968894004821777, "rewards/rejected": -1.16530179977417, "step": 431 }, { "epoch": 0.9044752682543836, "grad_norm": 4.871203748235653, "learning_rate": 1.345198738661285e-08, "logits/chosen": -1.1051032543182373, "logits/rejected": -1.096787452697754, "logps/chosen": -433.490234375, "logps/rejected": -374.37127685546875, "loss": 0.5978, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7922812700271606, "rewards/margins": 0.6507806777954102, "rewards/rejected": -1.4430619478225708, "step": 432 }, { "epoch": 0.9065689610049725, "grad_norm": 4.775931616670958, "learning_rate": 1.2865889513213628e-08, "logits/chosen": -0.8161289691925049, "logits/rejected": -0.8606552481651306, "logps/chosen": -295.0851745605469, "logps/rejected": -348.60479736328125, "loss": 0.5604, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8485819101333618, "rewards/margins": 0.4231727421283722, "rewards/rejected": -1.2717547416687012, "step": 433 }, { "epoch": 0.9086626537555613, "grad_norm": 5.437842601435475, "learning_rate": 1.2292508422495157e-08, "logits/chosen": -1.1662708520889282, "logits/rejected": -1.0712344646453857, "logps/chosen": -546.4298706054688, "logps/rejected": -394.296875, "loss": 0.6305, "rewards/accuracies": 0.5, "rewards/chosen": -0.9335058331489563, "rewards/margins": 0.3330295979976654, "rewards/rejected": -1.2665355205535889, "step": 434 }, { "epoch": 0.9107563465061502, "grad_norm": 5.8733039375215, "learning_rate": 1.1731874863145142e-08, "logits/chosen": -1.070544958114624, "logits/rejected": -0.9656955599784851, "logps/chosen": -353.1357421875, "logps/rejected": -379.1081237792969, "loss": 0.6071, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1403696537017822, "rewards/margins": 0.20886562764644623, "rewards/rejected": -1.3492351770401, "step": 435 }, { "epoch": 0.912850039256739, "grad_norm": 5.07294970000016, "learning_rate": 1.118401890024001e-08, "logits/chosen": -0.9361265897750854, "logits/rejected": -1.0257855653762817, "logps/chosen": -329.8653259277344, "logps/rejected": -324.8919677734375, "loss": 0.5794, "rewards/accuracies": 0.625, "rewards/chosen": -0.9130148887634277, "rewards/margins": 0.4156607985496521, "rewards/rejected": -1.328675627708435, "step": 436 }, { "epoch": 0.9149437320073279, "grad_norm": 5.233286977848408, "learning_rate": 1.06489699136324e-08, "logits/chosen": -1.2846165895462036, "logits/rejected": -1.2109489440917969, "logps/chosen": -430.3236083984375, "logps/rejected": -355.3746032714844, "loss": 0.6074, "rewards/accuracies": 0.625, "rewards/chosen": -0.9691572189331055, "rewards/margins": 0.1984838843345642, "rewards/rejected": -1.1676411628723145, "step": 437 }, { "epoch": 0.9170374247579168, "grad_norm": 5.467812511461663, "learning_rate": 1.0126756596375685e-08, "logits/chosen": -1.0143589973449707, "logits/rejected": -1.0571720600128174, "logps/chosen": -364.5919189453125, "logps/rejected": -371.653564453125, "loss": 0.5861, "rewards/accuracies": 0.625, "rewards/chosen": -0.7239144444465637, "rewards/margins": 0.24616940319538116, "rewards/rejected": -0.9700838923454285, "step": 438 }, { "epoch": 0.9191311175085056, "grad_norm": 5.01920989804905, "learning_rate": 9.617406953185136e-09, "logits/chosen": -0.9698410630226135, "logits/rejected": -1.0264487266540527, "logps/chosen": -310.87841796875, "logps/rejected": -295.1427001953125, "loss": 0.5469, "rewards/accuracies": 0.625, "rewards/chosen": -0.8337924480438232, "rewards/margins": 0.2656683325767517, "rewards/rejected": -1.0994608402252197, "step": 439 }, { "epoch": 0.9212248102590945, "grad_norm": 4.673823728612535, "learning_rate": 9.12094829893642e-09, "logits/chosen": -1.077862024307251, "logits/rejected": -1.124623417854309, "logps/chosen": -374.5534973144531, "logps/rejected": -362.8243408203125, "loss": 0.5669, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9717316031455994, "rewards/margins": 0.26587387919425964, "rewards/rejected": -1.2376054525375366, "step": 440 }, { "epoch": 0.9233185030096833, "grad_norm": 6.205109405766369, "learning_rate": 8.637407257200496e-09, "logits/chosen": -0.8686705827713013, "logits/rejected": -0.9191367626190186, "logps/chosen": -269.9112548828125, "logps/rejected": -338.52227783203125, "loss": 0.5826, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8218514323234558, "rewards/margins": 0.40259477496147156, "rewards/rejected": -1.2244462966918945, "step": 441 }, { "epoch": 0.9254121957602722, "grad_norm": 5.195248397451198, "learning_rate": 8.166809758815895e-09, "logits/chosen": -1.0188101530075073, "logits/rejected": -0.936786949634552, "logps/chosen": -442.087646484375, "logps/rejected": -413.1994934082031, "loss": 0.6095, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0302540063858032, "rewards/margins": 0.5127990245819092, "rewards/rejected": -1.5430529117584229, "step": 442 }, { "epoch": 0.927505888510861, "grad_norm": 4.560326269338869, "learning_rate": 7.709181040498253e-09, "logits/chosen": -1.1566600799560547, "logits/rejected": -1.116146206855774, "logps/chosen": -409.8769226074219, "logps/rejected": -419.85302734375, "loss": 0.5639, "rewards/accuracies": 0.875, "rewards/chosen": -0.8342012166976929, "rewards/margins": 0.5906370878219604, "rewards/rejected": -1.4248384237289429, "step": 443 }, { "epoch": 0.9295995812614499, "grad_norm": 5.161701938851564, "learning_rate": 7.2645456434869965e-09, "logits/chosen": -0.8712857961654663, "logits/rejected": -0.8976655006408691, "logps/chosen": -294.1389465332031, "logps/rejected": -355.3047180175781, "loss": 0.6401, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0274994373321533, "rewards/margins": 0.11138283461332321, "rewards/rejected": -1.1388822793960571, "step": 444 }, { "epoch": 0.9316932740120387, "grad_norm": 5.193927404013284, "learning_rate": 6.832927412229017e-09, "logits/chosen": -1.0100668668746948, "logits/rejected": -0.9370667338371277, "logps/chosen": -372.62432861328125, "logps/rejected": -358.0182189941406, "loss": 0.5839, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8954147100448608, "rewards/margins": 0.5471333861351013, "rewards/rejected": -1.4425480365753174, "step": 445 }, { "epoch": 0.9337869667626276, "grad_norm": 5.071483891680644, "learning_rate": 6.414349493100129e-09, "logits/chosen": -0.8581067323684692, "logits/rejected": -0.938692569732666, "logps/chosen": -375.258056640625, "logps/rejected": -362.7691345214844, "loss": 0.5764, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8390679955482483, "rewards/margins": 0.41947251558303833, "rewards/rejected": -1.2585406303405762, "step": 446 }, { "epoch": 0.9358806595132164, "grad_norm": 4.940106873508351, "learning_rate": 6.0088343331638756e-09, "logits/chosen": -1.0209871530532837, "logits/rejected": -1.233198881149292, "logps/chosen": -429.74127197265625, "logps/rejected": -621.1878051757812, "loss": 0.5934, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0128525495529175, "rewards/margins": 0.38664138317108154, "rewards/rejected": -1.399493932723999, "step": 447 }, { "epoch": 0.9379743522638053, "grad_norm": 5.512737446251382, "learning_rate": 5.616403678967624e-09, "logits/chosen": -0.9423820376396179, "logits/rejected": -1.0280300378799438, "logps/chosen": -410.8369140625, "logps/rejected": -436.2171325683594, "loss": 0.5454, "rewards/accuracies": 0.71875, "rewards/chosen": -0.914763331413269, "rewards/margins": 0.4588293731212616, "rewards/rejected": -1.3735928535461426, "step": 448 }, { "epoch": 0.9400680450143941, "grad_norm": 5.287387664074463, "learning_rate": 5.2370785753763356e-09, "logits/chosen": -1.0338518619537354, "logits/rejected": -1.1173179149627686, "logps/chosen": -344.6166076660156, "logps/rejected": -393.3231201171875, "loss": 0.6261, "rewards/accuracies": 0.75, "rewards/chosen": -0.9901973009109497, "rewards/margins": 0.3103466331958771, "rewards/rejected": -1.300544023513794, "step": 449 }, { "epoch": 0.942161737764983, "grad_norm": 4.817463690488231, "learning_rate": 4.8708793644441086e-09, "logits/chosen": -0.8159360885620117, "logits/rejected": -0.8673432469367981, "logps/chosen": -246.53567504882812, "logps/rejected": -294.34918212890625, "loss": 0.5727, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7399657368659973, "rewards/margins": 0.37775641679763794, "rewards/rejected": -1.1177222728729248, "step": 450 }, { "epoch": 0.9442554305155718, "grad_norm": 4.81330823496849, "learning_rate": 4.517825684323323e-09, "logits/chosen": -0.9904710054397583, "logits/rejected": -1.0202052593231201, "logps/chosen": -369.5208740234375, "logps/rejected": -333.48370361328125, "loss": 0.6169, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9806219935417175, "rewards/margins": 0.373673677444458, "rewards/rejected": -1.3542956113815308, "step": 451 }, { "epoch": 0.9463491232661607, "grad_norm": 5.137535300805967, "learning_rate": 4.1779364682113794e-09, "logits/chosen": -0.983173131942749, "logits/rejected": -0.9742081761360168, "logps/chosen": -368.712646484375, "logps/rejected": -319.7051696777344, "loss": 0.6285, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9488739967346191, "rewards/margins": 0.31694379448890686, "rewards/rejected": -1.2658177614212036, "step": 452 }, { "epoch": 0.9484428160167495, "grad_norm": 5.456309813227564, "learning_rate": 3.851229943335393e-09, "logits/chosen": -0.9394152164459229, "logits/rejected": -0.8610240817070007, "logps/chosen": -359.17584228515625, "logps/rejected": -315.705810546875, "loss": 0.5749, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7318274974822998, "rewards/margins": 0.4379826784133911, "rewards/rejected": -1.169810175895691, "step": 453 }, { "epoch": 0.9505365087673384, "grad_norm": 4.898540304783387, "learning_rate": 3.5377236299748147e-09, "logits/chosen": -0.9331487417221069, "logits/rejected": -0.9384573698043823, "logps/chosen": -324.64373779296875, "logps/rejected": -297.3902282714844, "loss": 0.6033, "rewards/accuracies": 0.53125, "rewards/chosen": -0.9078226685523987, "rewards/margins": 0.18427318334579468, "rewards/rejected": -1.092095971107483, "step": 454 }, { "epoch": 0.9526302015179272, "grad_norm": 4.710865083186366, "learning_rate": 3.2374343405217884e-09, "logits/chosen": -1.0101709365844727, "logits/rejected": -1.068909764289856, "logps/chosen": -364.3077087402344, "logps/rejected": -378.56109619140625, "loss": 0.5817, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8725315928459167, "rewards/margins": 0.3408425450325012, "rewards/rejected": -1.213374137878418, "step": 455 }, { "epoch": 0.9547238942685161, "grad_norm": 4.352561065678752, "learning_rate": 2.9503781785795713e-09, "logits/chosen": -1.1207157373428345, "logits/rejected": -1.1230698823928833, "logps/chosen": -306.4914855957031, "logps/rejected": -303.0159912109375, "loss": 0.5729, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8190067410469055, "rewards/margins": 0.45388132333755493, "rewards/rejected": -1.2728880643844604, "step": 456 }, { "epoch": 0.9568175870191049, "grad_norm": 4.69592176698614, "learning_rate": 2.6765705380989432e-09, "logits/chosen": -1.099668025970459, "logits/rejected": -1.0729769468307495, "logps/chosen": -372.81939697265625, "logps/rejected": -384.171875, "loss": 0.5745, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9699320793151855, "rewards/margins": 0.29789042472839355, "rewards/rejected": -1.2678226232528687, "step": 457 }, { "epoch": 0.9589112797696938, "grad_norm": 4.577613525209299, "learning_rate": 2.416026102552732e-09, "logits/chosen": -1.0326286554336548, "logits/rejected": -0.9567270874977112, "logps/chosen": -468.9176025390625, "logps/rejected": -424.97821044921875, "loss": 0.598, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1172494888305664, "rewards/margins": 0.19202867150306702, "rewards/rejected": -1.309278130531311, "step": 458 }, { "epoch": 0.9610049725202826, "grad_norm": 5.9686439401089, "learning_rate": 2.168758844148272e-09, "logits/chosen": -1.012804388999939, "logits/rejected": -0.7858539819717407, "logps/chosen": -367.9031982421875, "logps/rejected": -374.16265869140625, "loss": 0.6311, "rewards/accuracies": 0.5, "rewards/chosen": -1.125437617301941, "rewards/margins": 0.09359197318553925, "rewards/rejected": -1.2190295457839966, "step": 459 }, { "epoch": 0.9630986652708715, "grad_norm": 4.930581615588632, "learning_rate": 1.9347820230782295e-09, "logits/chosen": -0.9225434064865112, "logits/rejected": -0.979276716709137, "logps/chosen": -371.576904296875, "logps/rejected": -356.37310791015625, "loss": 0.5379, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8668937683105469, "rewards/margins": 0.4253525137901306, "rewards/rejected": -1.2922462224960327, "step": 460 }, { "epoch": 0.9651923580214603, "grad_norm": 4.471295961382442, "learning_rate": 1.7141081868094209e-09, "logits/chosen": -1.1269583702087402, "logits/rejected": -1.0261245965957642, "logps/chosen": -480.5639343261719, "logps/rejected": -411.71087646484375, "loss": 0.5492, "rewards/accuracies": 0.59375, "rewards/chosen": -0.9406365752220154, "rewards/margins": 0.4158119559288025, "rewards/rejected": -1.3564486503601074, "step": 461 }, { "epoch": 0.9672860507720492, "grad_norm": 5.482890410875146, "learning_rate": 1.5067491694100153e-09, "logits/chosen": -0.9238781929016113, "logits/rejected": -0.9906557202339172, "logps/chosen": -375.0228576660156, "logps/rejected": -368.6358947753906, "loss": 0.6158, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9464489817619324, "rewards/margins": 0.3078148365020752, "rewards/rejected": -1.2542637586593628, "step": 462 }, { "epoch": 0.969379743522638, "grad_norm": 6.254939979465275, "learning_rate": 1.3127160909147672e-09, "logits/chosen": -0.8874152302742004, "logits/rejected": -0.8142355680465698, "logps/chosen": -443.54425048828125, "logps/rejected": -369.4252014160156, "loss": 0.6109, "rewards/accuracies": 0.59375, "rewards/chosen": -0.9176414608955383, "rewards/margins": 0.2700048089027405, "rewards/rejected": -1.1876461505889893, "step": 463 }, { "epoch": 0.9714734362732269, "grad_norm": 5.014812723719896, "learning_rate": 1.1320193567288527e-09, "logits/chosen": -1.177965760231018, "logits/rejected": -0.9275381565093994, "logps/chosen": -387.7260437011719, "logps/rejected": -320.2764587402344, "loss": 0.6008, "rewards/accuracies": 0.625, "rewards/chosen": -0.9729608297348022, "rewards/margins": 0.19940996170043945, "rewards/rejected": -1.1723709106445312, "step": 464 }, { "epoch": 0.9735671290238157, "grad_norm": 5.147283024255825, "learning_rate": 9.64668657069706e-10, "logits/chosen": -0.9534504413604736, "logits/rejected": -0.9172082543373108, "logps/chosen": -301.088623046875, "logps/rejected": -276.7142639160156, "loss": 0.5775, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8230394124984741, "rewards/margins": 0.2633233070373535, "rewards/rejected": -1.0863627195358276, "step": 465 }, { "epoch": 0.9756608217744046, "grad_norm": 4.884909064472299, "learning_rate": 8.106729664475176e-10, "logits/chosen": -0.8963732719421387, "logits/rejected": -0.9166386127471924, "logps/chosen": -285.1668701171875, "logps/rejected": -341.3136291503906, "loss": 0.5797, "rewards/accuracies": 0.65625, "rewards/chosen": -0.832474946975708, "rewards/margins": 0.322449266910553, "rewards/rejected": -1.1549241542816162, "step": 466 }, { "epoch": 0.9777545145249935, "grad_norm": 5.408956575431113, "learning_rate": 6.700405431837585e-10, "logits/chosen": -0.9801924228668213, "logits/rejected": -0.9249019622802734, "logps/chosen": -357.9735107421875, "logps/rejected": -407.9078369140625, "loss": 0.5459, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8716942667961121, "rewards/margins": 0.24341756105422974, "rewards/rejected": -1.1151118278503418, "step": 467 }, { "epoch": 0.9798482072755823, "grad_norm": 5.666218181438657, "learning_rate": 5.427789289685347e-10, "logits/chosen": -1.0714266300201416, "logits/rejected": -1.0156344175338745, "logps/chosen": -367.01177978515625, "logps/rejected": -375.07159423828125, "loss": 0.5728, "rewards/accuracies": 0.75, "rewards/chosen": -0.8229541778564453, "rewards/margins": 0.45294615626335144, "rewards/rejected": -1.27590012550354, "step": 468 }, { "epoch": 0.9819419000261712, "grad_norm": 4.89515188861137, "learning_rate": 4.288949484559934e-10, "logits/chosen": -1.0786808729171753, "logits/rejected": -1.0295497179031372, "logps/chosen": -363.5501708984375, "logps/rejected": -440.8451232910156, "loss": 0.5401, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8827618360519409, "rewards/margins": 0.4070644974708557, "rewards/rejected": -1.2898262739181519, "step": 469 }, { "epoch": 0.98403559277676, "grad_norm": 5.187141858011842, "learning_rate": 3.2839470889836627e-10, "logits/chosen": -1.117061972618103, "logits/rejected": -1.1202375888824463, "logps/chosen": -396.63446044921875, "logps/rejected": -403.7596435546875, "loss": 0.6187, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0092464685440063, "rewards/margins": 0.21749506890773773, "rewards/rejected": -1.2267415523529053, "step": 470 }, { "epoch": 0.9861292855273489, "grad_norm": 5.754826825720099, "learning_rate": 2.412835998185092e-10, "logits/chosen": -1.050107717514038, "logits/rejected": -1.044450283050537, "logps/chosen": -474.8680419921875, "logps/rejected": -416.51910400390625, "loss": 0.5501, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0314342975616455, "rewards/margins": 0.2724250257015228, "rewards/rejected": -1.3038592338562012, "step": 471 }, { "epoch": 0.9882229782779377, "grad_norm": 4.681972880187453, "learning_rate": 1.6756629272085544e-10, "logits/chosen": -0.9889604449272156, "logits/rejected": -0.9807059168815613, "logps/chosen": -360.0508728027344, "logps/rejected": -386.60345458984375, "loss": 0.6423, "rewards/accuracies": 0.75, "rewards/chosen": -0.8153883218765259, "rewards/margins": 0.25652480125427246, "rewards/rejected": -1.071913242340088, "step": 472 }, { "epoch": 0.9903166710285266, "grad_norm": 4.991403365051279, "learning_rate": 1.072467408408384e-10, "logits/chosen": -1.052950382232666, "logits/rejected": -0.9636208415031433, "logps/chosen": -340.3130187988281, "logps/rejected": -323.96282958984375, "loss": 0.5899, "rewards/accuracies": 0.65625, "rewards/chosen": -1.003516674041748, "rewards/margins": 0.36056941747665405, "rewards/rejected": -1.3640862703323364, "step": 473 }, { "epoch": 0.9924103637791154, "grad_norm": 4.917536152400757, "learning_rate": 6.032817893297793e-11, "logits/chosen": -1.1692841053009033, "logits/rejected": -1.092441201210022, "logps/chosen": -392.82257080078125, "logps/rejected": -401.7308349609375, "loss": 0.531, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7280994653701782, "rewards/margins": 0.6018933653831482, "rewards/rejected": -1.3299927711486816, "step": 474 }, { "epoch": 0.9945040565297043, "grad_norm": 4.833161418557504, "learning_rate": 2.6813123097352287e-11, "logits/chosen": -0.9915435314178467, "logits/rejected": -0.8906069397926331, "logps/chosen": -294.5395202636719, "logps/rejected": -288.666259765625, "loss": 0.6081, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8269048929214478, "rewards/margins": 0.23639872670173645, "rewards/rejected": -1.0633035898208618, "step": 475 }, { "epoch": 0.9965977492802931, "grad_norm": 5.189347878325577, "learning_rate": 6.7033706447061635e-12, "logits/chosen": -0.9558994770050049, "logits/rejected": -0.9206646680831909, "logps/chosen": -339.8170471191406, "logps/rejected": -366.52288818359375, "loss": 0.6146, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0402597188949585, "rewards/margins": 0.14135634899139404, "rewards/rejected": -1.1816160678863525, "step": 476 }, { "epoch": 0.998691442030882, "grad_norm": 4.61441819018956, "learning_rate": 0.0, "logits/chosen": -1.0164192914962769, "logits/rejected": -1.0064038038253784, "logps/chosen": -473.8236083984375, "logps/rejected": -429.7413635253906, "loss": 0.5619, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9510712027549744, "rewards/margins": 0.6312524676322937, "rewards/rejected": -1.5823237895965576, "step": 477 }, { "epoch": 0.998691442030882, "eval_logits/chosen": -1.0320225954055786, "eval_logits/rejected": -1.019870400428772, "eval_logps/chosen": -371.21112060546875, "eval_logps/rejected": -372.5772399902344, "eval_loss": 0.6026462316513062, "eval_rewards/accuracies": 0.656000018119812, "eval_rewards/chosen": -0.9395894408226013, "eval_rewards/margins": 0.3643726706504822, "eval_rewards/rejected": -1.3039621114730835, "eval_runtime": 224.9141, "eval_samples_per_second": 8.892, "eval_steps_per_second": 0.556, "step": 477 }, { "epoch": 0.998691442030882, "step": 477, "total_flos": 0.0, "train_loss": 0.6240661440405456, "train_runtime": 18073.4146, "train_samples_per_second": 3.383, "train_steps_per_second": 0.026 } ], "logging_steps": 1, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }